[CPU] Plugin migration on oneDNN (v1.6) (#3725)

Gorokhov Dmitriy 2021-01-26 16:31:10 +03:00 committed by GitHub
parent 0284cd69a8
commit d58b4c65c8
140 changed files with 8355 additions and 5511 deletions

View File

@ -120,7 +120,7 @@ target_include_directories(${TARGET_NAME}_obj PRIVATE "${CMAKE_CURRENT_SOURCE_DI
target_link_libraries(${TARGET_NAME}_obj PRIVATE ${TARGET_NAME}_reader_api)
if(ENABLE_MKL_DNN)
target_include_directories(${TARGET_NAME}_obj SYSTEM PRIVATE "${IE_MAIN_SOURCE_DIR}/thirdparty/mkl-dnn/src/cpu/xbyak")
target_include_directories(${TARGET_NAME}_obj SYSTEM PRIVATE "${IE_MAIN_SOURCE_DIR}/thirdparty/mkl-dnn/src/cpu/x64/xbyak")
endif()
set_ie_threading_interface_for(${TARGET_NAME}_obj)

View File

@ -1539,6 +1539,9 @@ void ConvertPrecision(CNNNetwork& net, Precision from, Precision to) {
case getPrecisionMask(Precision::U16, Precision::I32):
convertPrecisionForAll<Precision::U16, Precision::I32>(net);
break;
case getPrecisionMask(Precision::I16, Precision::I32):
convertPrecisionForAll<Precision::I16, Precision::I32>(net);
break;
default:
THROW_IE_EXCEPTION << "Precision conversion from " << from << " to " << to
<< " currently is not supported. You may expand precision"

View File

@ -12,6 +12,7 @@ if (WIN32)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DNOMINMAX")
endif()
## TODO
set(LAYERS
${CMAKE_CURRENT_SOURCE_DIR}/nodes/mkldnn_batchnorm_node.cpp
${CMAKE_CURRENT_SOURCE_DIR}/nodes/mkldnn_bin_conv_node.cpp
@ -34,8 +35,8 @@ set(LAYERS
${CMAKE_CURRENT_SOURCE_DIR}/nodes/mkldnn_reorder_node.cpp
${CMAKE_CURRENT_SOURCE_DIR}/nodes/mkldnn_reshape_node.cpp
${CMAKE_CURRENT_SOURCE_DIR}/nodes/mkldnn_rnn.cpp
${CMAKE_CURRENT_SOURCE_DIR}/nodes/mkldnn_roi_pooling_node.cpp
${CMAKE_CURRENT_SOURCE_DIR}/nodes/mkldnn_roi_align_node.cpp
${CMAKE_CURRENT_SOURCE_DIR}/nodes/mkldnn_roi_pooling_node.cpp
${CMAKE_CURRENT_SOURCE_DIR}/nodes/mkldnn_softmax_node.cpp
${CMAKE_CURRENT_SOURCE_DIR}/nodes/mkldnn_split_node.cpp
${CMAKE_CURRENT_SOURCE_DIR}/nodes/mkldnn_tensoriterator_node.cpp
@ -144,27 +145,13 @@ include_directories(
$<TARGET_PROPERTY:inference_engine_plugin_api,INTERFACE_INCLUDE_DIRECTORIES>
${CMAKE_CURRENT_SOURCE_DIR}/mkldnn
${CMAKE_CURRENT_SOURCE_DIR}
${CMAKE_BINARY_DIR}/include)
include_directories(SYSTEM
${IE_MAIN_SOURCE_DIR}/thirdparty/mkl-dnn/src/common
${IE_MAIN_SOURCE_DIR}/thirdparty/mkl-dnn/src/cpu
${IE_MAIN_SOURCE_DIR}/thirdparty/mkl-dnn/include)
${CMAKE_BINARY_DIR}/include
)
if (GEMM STREQUAL "MKL")
log_rpath_from_dir(MKL "${MKL}/lib")
endif()
if (THREADING STREQUAL "TBB")
set(MKLDNN_THR MKLDNN_THR_TBB)
elseif (THREADING STREQUAL "TBB_AUTO")
set(MKLDNN_THR MKLDNN_THR_TBB_AUTO)
elseif (THREADING STREQUAL "OMP")
set(MKLDNN_THR MKLDNN_THR_OMP)
else()
set(MKLDNN_THR MKLDNN_THR_SEQ)
endif()
# create plugin
ie_add_plugin(NAME ${TARGET_NAME}
@ -174,11 +161,12 @@ ie_add_plugin(NAME ${TARGET_NAME}
set_ie_threading_interface_for(${TARGET_NAME})
target_compile_definitions(${TARGET_NAME} PUBLIC -DMKLDNN_THR=${MKLDNN_THR})
target_link_libraries(${TARGET_NAME} PRIVATE mkldnn inference_engine inference_engine_legacy
inference_engine_transformations inference_engine_lp_transformations openvino::conditional_compilation)
target_include_directories(${TARGET_NAME} PRIVATE
$<TARGET_PROPERTY:mkldnn,INCLUDE_DIRECTORIES>)
# Cross compiled function
# TODO: The same for proposal, proposalONNX, topk
cross_compiled_file(${TARGET_NAME}
@ -201,18 +189,22 @@ ie_add_api_validator_post_build_step(TARGET ${TARGET_NAME})
# add test object library
add_library(${TARGET_NAME}_obj OBJECT ${SOURCES} ${HEADERS})
target_link_libraries(${TARGET_NAME}_obj PUBLIC mkldnn)
target_include_directories(${TARGET_NAME}_obj PRIVATE $<TARGET_PROPERTY:inference_engine_preproc_s,INTERFACE_INCLUDE_DIRECTORIES>
$<TARGET_PROPERTY:inference_engine_legacy,INTERFACE_INCLUDE_DIRECTORIES>
$<TARGET_PROPERTY:inference_engine_transformations,INTERFACE_INCLUDE_DIRECTORIES>
$<TARGET_PROPERTY:openvino::itt,INTERFACE_INCLUDE_DIRECTORIES>
$<TARGET_PROPERTY:openvino::conditional_compilation,INTERFACE_INCLUDE_DIRECTORIES>
$<TARGET_PROPERTY:inference_engine_lp_transformations,INTERFACE_INCLUDE_DIRECTORIES>)
$<TARGET_PROPERTY:inference_engine_lp_transformations,INTERFACE_INCLUDE_DIRECTORIES>
PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}
$<TARGET_PROPERTY:mkldnn,INCLUDE_DIRECTORIES>)
set_ie_threading_interface_for(${TARGET_NAME}_obj)
target_compile_definitions(${TARGET_NAME}_obj PUBLIC -DMKLDNN_THR=${MKLDNN_THR}
PRIVATE USE_STATIC_IE IMPLEMENT_INFERENCE_ENGINE_PLUGIN)
target_compile_definitions(${TARGET_NAME}_obj
PRIVATE USE_STATIC_IE IMPLEMENT_INFERENCE_ENGINE_PLUGIN
)
set_target_properties(${TARGET_NAME}_obj PROPERTIES EXCLUDE_FROM_ALL ON)

View File

@ -56,12 +56,6 @@ void BF16Transformer::convertToBFloat16(InferenceEngine::CNNNetwork &network) {
InputsDataMap inputs = network.getInputsInfo();
OutputsDataMap outputs = network.getOutputsInfo();
for (auto iter : sortedLayers) {
if (CaselessEq<std::string>()(iter->type, "convolution")) {
auto dims = iter->insData[0].lock()->getDims();
if ((dims.size() == 4 || dims.size() == 5) && (dims[1] == 1 || dims[1] == 3))
continue;
}
// check if the memory output node needs to be transformed
if (iter->type == "Memory" && iter->outData.size() == 0 &&
iter->insData[0].lock()->getPrecision() == Precision::FP32) {

View File

@ -10,12 +10,11 @@
#include "ie_plugin_config.hpp"
#include "ie_common.h"
#include "ie_parallel.hpp"
#include "ie_system_conf.h"
#include <cpp_interfaces/exception2status.hpp>
#include <cpp_interfaces/interface/ie_internal_plugin_config.hpp>
#include <ie_parallel.hpp>
#include <ie_system_conf.h>
namespace MKLDNNPlugin {

View File

@ -1,68 +0,0 @@
// Copyright (C) 2018-2020 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#pragma once
#include "desc_layer.h"
#include "desc_tensor.h"
#include "desc_tensor_comb.h"
#include "cpu_prim_layer.h"
#include "cpu_prim_tensor.h"
#include "mkldnn.hpp"
#include <memory>
#include <vector>
using namespace InferenceEngine;
namespace MKLDNNPlugin {
class CpuEngine;
using CpuEnginePtr = std::shared_ptr<CpuEngine>;
class CpuEngine : public details::no_copy {
public:
CpuEngine() : eng(mkldnn::engine(mkldnn::engine::kind::cpu, 0)) {}
void bindThreads();
void createDescription(DescTensorPtr tns, bool isWeights = false);
void createDescription(DescLayerPtr layer);
void setFlatFormat(DescTensorPtr tns);
void createPrimitive(DescTensorPtr tns);
void createPrimitive(DescLayerPtr tns);
void setData(const TBlob<float> &src, DescTensorPtr dst);
void getData(const DescTensorPtr src, TBlob<float> &dst);
void subtraction(DescTensorPtr dst, DescTensorPtr sub);
void subtraction(DescTensorPtr dst, std::vector<float> sub);
void score(std::vector<DescLayerPtr> layers);
void score(DescLayerPtr layer);
void process(std::vector<mkldnn::primitive> exec_queue);
mkldnn::engine eng; // TODO: Move me back to private section
private:
static inline mkldnn::memory::desc *get_desc(std::vector<DescTensorPtr> tensors, size_t indx = 0);
static inline mkldnn::memory::desc *get_desc(DescTensorPtr tns);
static inline mkldnn::memory *get_prim(std::vector<DescTensorPtr> tns, size_t indx = 0);
static inline mkldnn::memory *get_prim(DescTensorPtr tns);
void createPrimitiveCombined(DescTensorComb &tns, void *data);
};
} // namespace MKLDNNPlugin

View File

@ -1,50 +0,0 @@
// Copyright (C) 2018-2020 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#pragma once
#include "prim_layer.h"
#include "mkldnn.hpp"
#include <memory>
using namespace InferenceEngine;
using namespace mkldnn;
namespace MKLDNNPlugin {
class CpuPrimLayer : public PrimLayer {
friend class CpuEngine;
mkldnn::engine eng;
std::shared_ptr<mkldnn::primitive> prim;
public:
explicit CpuPrimLayer(engine eng) : eng(eng) {}
};
template<typename LYR>
class Layer : public CpuPrimLayer {
typename LYR::desc desc;
typename LYR::primitive_desc prim_desc;
public:
Layer(typename LYR::desc desc, engine eng) :
CpuPrimLayer(eng),
desc(desc),
prim_desc(desc, eng) {}
friend class CpuEngine;
};
class ReorderLayer : public CpuPrimLayer {
reorder::primitive_desc prim_desc;
public:
ReorderLayer(reorder::primitive_desc desc, engine eng) :
CpuPrimLayer(eng),
prim_desc(desc) {}
friend class CpuEngine;
};
} // namespace MKLDNNPlugin

View File

@ -1,34 +0,0 @@
// Copyright (C) 2018-2020 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#pragma once
#include "prim_tensor.h"
#include "mkldnn.hpp"
#include <memory>
namespace MKLDNNPlugin {
class CpuPrimTensor;
using CpuPrimTensorPtr = std::shared_ptr<CpuPrimTensor>;
class CpuPrimTensor : public PrimTensor {
public:
using Memory = std::shared_ptr<mkldnn::memory>;
using PrimitiveDesc = std::shared_ptr<mkldnn::memory::primitive_desc>;
explicit CpuPrimTensor(mkldnn::memory::desc desc) :
desc(desc) {}
mkldnn::memory getPrimitive() { return *(memory.get()); }
private:
Memory memory;
mkldnn::memory::desc desc;
friend class CpuEngine;
};
} // namespace MKLDNNPlugin

View File

@ -1,166 +0,0 @@
// Copyright (C) 2018-2020 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#pragma once
#include "mkldnn.hpp"
#include <string>
#include <mkldnn_types.h>
#include <mkldnn.h>
namespace mkldnn {
struct primitive_desc_iterator : public handle<mkldnn_primitive_desc_iterator_t> {
template <typename T>
primitive_desc_iterator(const T &adesc, const mkldnn::primitive_attr &aattr, const engine &aengine) {
mkldnn_primitive_desc_iterator_t result;
auto sts = mkldnn_primitive_desc_iterator_create_v2(
&result, &adesc.data, aattr.get(), aengine.get(), nullptr);
if (sts == mkldnn_status_t::mkldnn_success)
reset(result);
else if (sts == mkldnn_status_t::mkldnn_unimplemented)
reset(nullptr);
else
THROW_IE_EXCEPTION << "could not create a primitive descriptor iterator";
}
template <typename T, typename TF>
primitive_desc_iterator(const T &adesc, const mkldnn::primitive_attr &aattr,
const engine &aengine, const TF &hint_fwd_primitive_desc) {
mkldnn_primitive_desc_iterator_t result;
auto sts = mkldnn_primitive_desc_iterator_create_v2(&result,
&adesc.data,
aattr.get(),
aengine.get(),
hint_fwd_primitive_desc.get());
if (sts == mkldnn_status_t::mkldnn_success)
reset(result);
else if (sts == mkldnn_status_t::mkldnn_unimplemented)
reset(nullptr);
else
THROW_IE_EXCEPTION << "could not create a primitive descriptor iterator";
}
bool is_not_end() const {
return (handle::get() != nullptr);
}
memory::primitive_desc fetch() const {
memory::primitive_desc adesc;
mkldnn_primitive_desc_t cdesc = nullptr;
cdesc = mkldnn_primitive_desc_iterator_fetch(get());
adesc.reset(cdesc);
return adesc;
}
primitive_desc_iterator operator++(int) {
mkldnn_status_t status = mkldnn_primitive_desc_iterator_next(get());
if (status == mkldnn_status_t::mkldnn_iterator_ends)
reset(nullptr);
else if (status != mkldnn_status_t::mkldnn_success)
THROW_IE_EXCEPTION << "could not get next iteration";
return *this;
}
memory::primitive_desc src_primitive_desc(size_t index = 0) const {
memory::primitive_desc adesc;
memory::primitive_desc cdesc_elem;
mkldnn_primitive_desc_t cdesc = nullptr;
cdesc_elem.reset(mkldnn_primitive_desc_iterator_fetch(get()));
const_mkldnn_primitive_desc_t const_cdesc =
mkldnn_primitive_desc_query_pd(cdesc_elem.get(),
mkldnn::convert_to_c(src_pd), index);
error::wrap_c_api(mkldnn_primitive_desc_clone(&cdesc, const_cdesc),
"could not clone a src primititve descriptor");
adesc.reset(cdesc);
return adesc;
}
memory::primitive_desc dst_primitive_desc(size_t index = 0) const {
memory::primitive_desc adesc;
memory::primitive_desc cdesc_elem;
mkldnn_primitive_desc_t cdesc = nullptr;
cdesc_elem.reset(mkldnn_primitive_desc_iterator_fetch(get()));
const_mkldnn_primitive_desc_t const_cdesc =
mkldnn_primitive_desc_query_pd(cdesc_elem.get(),
mkldnn::convert_to_c(dst_pd), index);
error::wrap_c_api(mkldnn_primitive_desc_clone(&cdesc, const_cdesc),
"could not clone a dst primitive descriptor");
adesc.reset(cdesc);
return adesc;
}
memory::primitive_desc diff_src_primitive_desc(size_t index = 0) const {
memory::primitive_desc adesc;
memory::primitive_desc cdesc_elem;
mkldnn_primitive_desc_t cdesc = nullptr;
cdesc_elem.reset(mkldnn_primitive_desc_iterator_fetch(get()));
const_mkldnn_primitive_desc_t const_cdesc =
mkldnn_primitive_desc_query_pd(cdesc_elem.get(),
mkldnn::convert_to_c(diff_src_pd), index);
error::wrap_c_api(mkldnn_primitive_desc_clone(&cdesc, const_cdesc),
"could not clone a diff_src primititve descriptor");
adesc.reset(cdesc);
return adesc;
}
memory::primitive_desc weights_primitive_desc(size_t index = 0) const {
memory::primitive_desc adesc;
memory::primitive_desc cdesc_elem;
mkldnn_primitive_desc_t cdesc = nullptr;
cdesc_elem.reset(mkldnn_primitive_desc_iterator_fetch(get()));
const_mkldnn_primitive_desc_t const_cdesc =
mkldnn_primitive_desc_query_pd(cdesc_elem.get(),
mkldnn::convert_to_c(weights_pd), index);
error::wrap_c_api(mkldnn_primitive_desc_clone(&cdesc, const_cdesc),
"could not clone a weights primitive descriptor");
adesc.reset(cdesc);
return adesc;
}
memory::primitive_desc diff_dst_primitive_desc(size_t index = 0) const {
memory::primitive_desc adesc;
memory::primitive_desc cdesc_elem;
mkldnn_primitive_desc_t cdesc = nullptr;
cdesc_elem.reset(mkldnn_primitive_desc_iterator_fetch(get()));
const_mkldnn_primitive_desc_t const_cdesc =
mkldnn_primitive_desc_query_pd(cdesc_elem.get(),
mkldnn::convert_to_c(diff_dst_pd), index);
error::wrap_c_api(mkldnn_primitive_desc_clone(&cdesc, const_cdesc),
"could not clone a diff_dst primitive descriptor");
adesc.reset(cdesc);
return adesc;
}
std::string get_impl_info_str() const {
memory::primitive_desc cdesc_elem;
cdesc_elem.reset(mkldnn_primitive_desc_iterator_fetch(get()));
const char *info;
error::wrap_c_api(mkldnn_primitive_desc_query(cdesc_elem.get(),
mkldnn::convert_to_c(impl_info_str), 0, &info),
"could not query info string of primitive descriptor");
return std::string(info);
}
template <typename T>
void getPrimitiveDescriptor(T& pdesc) const {
mkldnn_primitive_desc_t cdesc = nullptr;
memory::primitive_desc cdescpd;
cdescpd.reset(mkldnn_primitive_desc_iterator_fetch(get()));
error::wrap_c_api(mkldnn_primitive_desc_clone(&cdesc, cdescpd.get()),
"could not clone a src primititve descriptor");
pdesc.reset(cdesc);
}
};
} // namespace mkldnn

View File

@ -0,0 +1,152 @@
// Copyright (C) 2018-2020 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include "ie_mkldnn.h"
#include <dnnl_debug.h>
#include <cpu/platform.hpp>
#include <cpu/x64/cpu_isa_traits.hpp>
#include <cassert>
#include <cstring>
namespace mkldnn {
namespace utils {
const char* fmt2str(memory::format_tag fmt) {
return dnnl_fmt_tag2str(static_cast<dnnl_format_tag_t>(fmt));
}
mkldnn::memory::format_tag str2fmt(const char *str) {
#define CASE(_fmt) do { \
if (!strcmp(#_fmt, str) \
|| !strcmp("mkldnn_" #_fmt, str)) \
return static_cast<dnnl::memory::format_tag>(dnnl_ ## _fmt); \
} while (0)
CASE(x);
CASE(nc);
CASE(ncw);
CASE(nwc);
CASE(nCw4c);
CASE(nCw8c);
CASE(nCw16c);
CASE(nchw);
CASE(nhwc);
CASE(chwn);
CASE(nChw4c);
CASE(nChw8c);
CASE(nChw16c);
CASE(oi);
CASE(io);
CASE(oiw);
CASE(wio);
CASE(OIw16i16o);
CASE(OIw16o16i);
CASE(Oiw16o);
CASE(Owi16o);
CASE(OIw8i16o2i);
CASE(OIw4i16o4i);
CASE(oihw);
CASE(ihwo);
CASE(hwio);
CASE(iohw);
CASE(dhwio);
CASE(OIhw8i8o);
CASE(OIhw16i16o);
CASE(OIhw8i16o2i);
CASE(OIdhw8i16o2i);
CASE(OIhw4i16o4i);
CASE(OIdhw4i16o4i);
CASE(OIhw8o16i2o);
CASE(IOhw8o16i2o);
CASE(OIhw8o8i);
CASE(OIhw8o32i);
CASE(OIhw16o32i);
CASE(OIhw16o16i);
CASE(IOhw16o16i);
CASE(Oihw16o);
CASE(Ohwi8o);
CASE(Ohwi16o);
CASE(goiw);
CASE(goihw);
CASE(hwigo);
CASE(giohw);
CASE(dhwigo);
CASE(goiw);
CASE(gOIw16i16o);
CASE(gOIw16o16i);
CASE(gOiw16o);
CASE(gOwi16o);
CASE(gOIw8i16o2i);
CASE(gOIw4i16o4i);
CASE(Goiw16g);
CASE(gOIhw8i8o);
CASE(gOIhw16i16o);
CASE(gOIhw8i16o2i);
CASE(gOIdhw8i16o2i);
CASE(gOIhw2i8o4i);
CASE(gOIhw4i16o4i);
CASE(gOIdhw4i16o4i);
CASE(gOIhw8o16i2o);
CASE(gIOhw8o16i2o);
CASE(gOIhw4o4i);
CASE(gOIhw8o8i);
CASE(gOIhw16o16i);
CASE(gIOhw16o16i);
CASE(gOihw16o);
CASE(gOhwi8o);
CASE(gOhwi16o);
CASE(Goihw8g);
CASE(Goihw16g);
CASE(Goidhw4g);
CASE(Goidhw8g);
CASE(Goidhw16g);
CASE(ncdhw);
CASE(ndhwc);
CASE(oidhw);
CASE(goidhw);
CASE(nCdhw4c);
CASE(nCdhw8c);
CASE(nCdhw16c);
CASE(OIdhw16i16o);
CASE(gOIdhw16i16o);
CASE(OIdhw16o16i);
CASE(gOIdhw16o16i);
CASE(Oidhw16o);
CASE(Odhwi16o);
CASE(gOidhw16o);
CASE(gOdhwi16o);
CASE(ntc);
CASE(tnc);
CASE(ldigo);
CASE(ldgoi);
CASE(ldgo);
#undef CASE
assert(!"unknown memory format");
return dnnl::memory::format_tag::undef;
}
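Editor's sketch (not from this patch): a round-trip check of the two helpers above; dnnl's debug strings match the tag names, so the mapping is symmetric for every tag in the CASE list.

void check_fmt_roundtrip() {
    mkldnn::memory::format_tag tag = str2fmt("nChw16c");
    assert(tag == mkldnn::memory::format_tag::nChw16c);
    assert(!strcmp(fmt2str(tag), "nChw16c"));   // <cassert>/<cstring> included above
}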
int get_cache_size(int level, bool per_core) {
if (per_core) {
return mkldnn::impl::cpu::platform::get_per_core_cache_size(level);
} else {
using namespace mkldnn::impl::cpu::x64;
if (cpu().getDataCacheLevels() == 0) {
// this function can return stub values in case of unknown CPU type
return mkldnn::impl::cpu::platform::get_per_core_cache_size(level);
}
if (level > 0 && (unsigned) level <= cpu().getDataCacheLevels()) {
unsigned l = level - 1;
return cpu().getDataCacheSize(l);
} else {
return 0;
}
}
DNNL_THROW_ERROR(dnnl_unimplemented, "get_cache_size has no mode per_core == false");
}
} // namespace utils
} // namespace mkldnn

View File

@ -0,0 +1,21 @@
// Copyright (C) 2018-2020 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#pragma once
#include "mkldnn.hpp"
namespace mkldnn {
using primitive_desc_iterator = mkldnn::primitive_desc;
namespace utils {
int get_cache_size(int level, bool per_core);
const char* fmt2str(memory::format_tag fmt);
mkldnn::memory::format_tag str2fmt(const char *str);
} // namespace utils
} // namespace mkldnn
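Editor's note (assumption, not from this patch): the alias above works because oneDNN v1.x dropped the standalone iterator type; primitive_desc itself iterates over implementations. A hedged sketch assuming the v1.x next_impl()/impl_info_str() API (needs <cstdio>):

inline void list_impls(mkldnn::primitive_desc &pd) {
    do {
        printf("%s\n", pd.impl_info_str().c_str());   // e.g. "jit:avx2"
    } while (pd.next_impl());                         // returns false when exhausted
}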

View File

@ -17,31 +17,36 @@ impl_desc_type MKLDNNPlugin::parse_impl_name(std::string impl_desc_name) {
#define SEARCH_WORD(_wrd) if (impl_desc_name.find(#_wrd) != std::string::npos) \
res = static_cast<impl_desc_type>(res | impl_desc_type::_wrd);
#define SEARCH_WORD_2(_wrd, _key) if (impl_desc_name.find(#_wrd) != std::string::npos) \
res = static_cast<impl_desc_type>(res | impl_desc_type::_key);
SEARCH_WORD(ref);
SEARCH_WORD(jit);
SEARCH_WORD(gemm);
SEARCH_WORD(blas);
SEARCH_WORD(sse42);
SEARCH_WORD_2(sse41, sse42);
SEARCH_WORD(avx2);
SEARCH_WORD(avx512);
SEARCH_WORD(any);
SEARCH_WORD(uni);
SEARCH_WORD(_1x1);
SEARCH_WORD(_dw);
SEARCH_WORD(reorder);
if ((res & impl_desc_type::avx2) != impl_desc_type::avx2 &&
(res & impl_desc_type::avx512) != impl_desc_type::avx512)
SEARCH_WORD(avx);
#undef SEARCH_WORD
#define SEARCH_WORD_2(_wrd, _key) if (impl_desc_name.find(#_wrd) != std::string::npos) \
res = static_cast<impl_desc_type>(res | impl_desc_type::_key);
if ((res & impl_desc_type::sse42) != impl_desc_type::sse42 &&
(res & impl_desc_type::avx) != impl_desc_type::avx &&
(res & impl_desc_type::avx2) != impl_desc_type::avx2 &&
(res & impl_desc_type::avx512) != impl_desc_type::avx512)
SEARCH_WORD(uni);
SEARCH_WORD_2(nchw, ref);
SEARCH_WORD_2(ncdhw, ref);
SEARCH_WORD_2(wino, winograd);
#undef SEARCH_WORD_2
#undef SEARCH_WORD
return res;
}
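Editor's sketch (not from this patch, assumes <cassert>): expected behavior of the parser above. impl_desc_type values are bit flags, so one implementation name can set several of them.

void parse_impl_name_example() {
    using namespace MKLDNNPlugin;
    impl_desc_type t = parse_impl_name("jit_avx2_1x1");
    assert(t & impl_desc_type::jit);    // "jit" substring found
    assert(t & impl_desc_type::avx2);   // "avx2" found; bare "avx" deliberately skipped
    assert(t & impl_desc_type::_1x1);   // "_1x1" found
}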

View File

@ -11,20 +11,10 @@ mkldnn::primitive_desc_iterator MKLDNNDescriptor::createPrimitiveDescriptorItera
}
MKLDNNDescriptor::operator bool() {
return desc.get() != nullptr;
return desc != nullptr;
}
size_t MKLDNNDescriptor::inputNumbers() const {
DescFwdImpl<mkldnn::roi_pooling_forward::desc> *roiPooling =
dynamic_cast<DescFwdImpl<mkldnn::roi_pooling_forward::desc> *>(desc.get());
if (roiPooling != nullptr) {
return roiPooling->getPtr()->c_api_inputs.size();
}
DescFwdImpl<mkldnn::deformable_convolution_forward::desc> *defConv =
dynamic_cast<DescFwdImpl<mkldnn::deformable_convolution_forward::desc> *>(desc.get());
if (defConv != nullptr) {
return defConv->getPtr()->c_api_inputs.size();
}
return 1;
}
@ -37,8 +27,7 @@ MKLDNNDescriptor::MKLDNNDescriptor(std::shared_ptr<mkldnn::batch_normalization_f
}
MKLDNNDescriptor::operator std::shared_ptr<mkldnn::batch_normalization_forward::desc>() {
DescFwdImpl<mkldnn::batch_normalization_forward::desc> *typeDesc =
dynamic_cast<DescFwdImpl<mkldnn::batch_normalization_forward::desc> *>(desc.get());
auto typeDesc = std::dynamic_pointer_cast<DescFwdImpl<mkldnn::batch_normalization_forward::desc>>(desc);
if (typeDesc == nullptr) {
THROW_IE_EXCEPTION << "Cannot cast descriptor!";
}
@ -50,8 +39,7 @@ MKLDNNDescriptor::MKLDNNDescriptor(std::shared_ptr<mkldnn::convolution_forward::
}
MKLDNNDescriptor::operator std::shared_ptr<mkldnn::convolution_forward::desc>() {
DescFwdImpl<mkldnn::convolution_forward::desc> *typeDesc =
dynamic_cast<DescFwdImpl<mkldnn::convolution_forward::desc> *>(desc.get());
auto typeDesc = std::dynamic_pointer_cast<DescFwdImpl<mkldnn::convolution_forward::desc>>(desc);
if (typeDesc == nullptr) {
THROW_IE_EXCEPTION << "Cannot cast descriptor!";
}
@ -66,9 +54,7 @@ MKLDNNDescriptor::MKLDNNDescriptor(std::shared_ptr<mkldnn::convolution_backward_
}
MKLDNNDescriptor::operator std::shared_ptr<mkldnn::convolution_backward_data::desc>() {
DescBwdImpl<mkldnn::convolution_backward_data::desc, mkldnn::convolution_forward::primitive_desc> *typeDesc =
dynamic_cast<DescBwdImpl<mkldnn::convolution_backward_data::desc,
mkldnn::convolution_forward::primitive_desc> *>(desc.get());
auto typeDesc = std::dynamic_pointer_cast<DescBwdImpl<mkldnn::convolution_backward_data::desc, mkldnn::convolution_forward::primitive_desc>>(desc);
if (typeDesc == nullptr) {
THROW_IE_EXCEPTION << "Cannot cast descriptor!";
}
@ -76,9 +62,7 @@ MKLDNNDescriptor::operator std::shared_ptr<mkldnn::convolution_backward_data::de
}
MKLDNNDescriptor::operator std::shared_ptr<mkldnn::convolution_forward::primitive_desc>() {
DescBwdImpl<mkldnn::convolution_backward_data::desc, mkldnn::convolution_forward::primitive_desc> *typeDesc =
dynamic_cast<DescBwdImpl<mkldnn::convolution_backward_data::desc,
mkldnn::convolution_forward::primitive_desc> *>(desc.get());
auto typeDesc = std::dynamic_pointer_cast<DescBwdImpl<mkldnn::convolution_backward_data::desc, mkldnn::convolution_forward::primitive_desc>>(desc);
if (typeDesc == nullptr) {
THROW_IE_EXCEPTION << "Cannot cast descriptor!";
}
@ -90,8 +74,7 @@ MKLDNNDescriptor::MKLDNNDescriptor(std::shared_ptr<mkldnn::inner_product_forward
}
MKLDNNDescriptor::operator std::shared_ptr<mkldnn::inner_product_forward::desc>() {
DescFwdImpl<mkldnn::inner_product_forward::desc> *typeDesc =
dynamic_cast<DescFwdImpl<mkldnn::inner_product_forward::desc> *>(desc.get());
auto typeDesc = std::dynamic_pointer_cast<DescFwdImpl<mkldnn::inner_product_forward::desc>>(desc);
if (typeDesc == nullptr) {
THROW_IE_EXCEPTION << "Cannot cast descriptor!";
}
@ -103,8 +86,7 @@ MKLDNNDescriptor::MKLDNNDescriptor(std::shared_ptr<mkldnn::lrn_forward::desc> de
}
MKLDNNDescriptor::operator std::shared_ptr<mkldnn::lrn_forward::desc>() {
DescFwdImpl<mkldnn::lrn_forward::desc> *typeDesc =
dynamic_cast<DescFwdImpl<mkldnn::lrn_forward::desc> *>(desc.get());
auto typeDesc = std::dynamic_pointer_cast<DescFwdImpl<mkldnn::lrn_forward::desc>>(desc);
if (typeDesc == nullptr) {
THROW_IE_EXCEPTION << "Cannot cast descriptor!";
}
@ -116,21 +98,7 @@ MKLDNNDescriptor::MKLDNNDescriptor(std::shared_ptr<mkldnn::pooling_forward::desc
}
MKLDNNDescriptor::operator std::shared_ptr<mkldnn::pooling_forward::desc>() {
DescFwdImpl<mkldnn::pooling_forward::desc> *typeDesc =
dynamic_cast<DescFwdImpl<mkldnn::pooling_forward::desc> *>(desc.get());
if (typeDesc == nullptr) {
THROW_IE_EXCEPTION << "Cannot cast descriptor!";
}
return typeDesc->getPtr();
}
MKLDNNDescriptor::MKLDNNDescriptor(std::shared_ptr<mkldnn::roi_pooling_forward::desc> desc) {
this->desc.reset(new DescFwdImpl<mkldnn::roi_pooling_forward::desc>(desc));
}
MKLDNNDescriptor::operator std::shared_ptr<mkldnn::roi_pooling_forward::desc>() {
DescFwdImpl<mkldnn::roi_pooling_forward::desc> *typeDesc =
dynamic_cast<DescFwdImpl<mkldnn::roi_pooling_forward::desc> *>(desc.get());
auto typeDesc = std::dynamic_pointer_cast<DescFwdImpl<mkldnn::pooling_forward::desc>>(desc);
if (typeDesc == nullptr) {
THROW_IE_EXCEPTION << "Cannot cast descriptor!";
}
@ -142,21 +110,55 @@ MKLDNNDescriptor::MKLDNNDescriptor(std::shared_ptr<mkldnn::softmax_forward::desc
}
MKLDNNDescriptor::operator std::shared_ptr<mkldnn::softmax_forward::desc>() {
DescFwdImpl<mkldnn::softmax_forward::desc> *typeDesc =
dynamic_cast<DescFwdImpl<mkldnn::softmax_forward::desc> *>(desc.get());
auto typeDesc = std::dynamic_pointer_cast<DescFwdImpl<mkldnn::softmax_forward::desc>>(desc);
if (typeDesc == nullptr) {
THROW_IE_EXCEPTION << "Cannot cast descriptor!";
}
return typeDesc->getPtr();
}
MKLDNNDescriptor::MKLDNNDescriptor(std::shared_ptr<mkldnn::rnn_forward::desc> desc) {
this->desc.reset(new DescFwdImpl<mkldnn::rnn_forward::desc>(desc));
MKLDNNDescriptor::MKLDNNDescriptor(std::shared_ptr<mkldnn::vanilla_rnn_forward::desc> desc) {
this->desc.reset(new DescFwdImpl<mkldnn::vanilla_rnn_forward::desc>(desc));
}
MKLDNNDescriptor::operator std::shared_ptr<mkldnn::rnn_forward::desc>() {
DescFwdImpl<mkldnn::rnn_forward::desc> *typeDesc =
dynamic_cast<DescFwdImpl<mkldnn::rnn_forward::desc> *>(desc.get());
MKLDNNDescriptor::operator std::shared_ptr<mkldnn::vanilla_rnn_forward::desc>() {
auto typeDesc = std::dynamic_pointer_cast<DescFwdImpl<mkldnn::vanilla_rnn_forward::desc>>(desc);
if (typeDesc == nullptr) {
THROW_IE_EXCEPTION << "Cannot cast descriptor!";
}
return typeDesc->getPtr();
}
MKLDNNDescriptor::MKLDNNDescriptor(std::shared_ptr<mkldnn::lstm_forward::desc> desc) {
this->desc.reset(new DescFwdImpl<mkldnn::lstm_forward::desc>(desc));
}
MKLDNNDescriptor::operator std::shared_ptr<mkldnn::lstm_forward::desc>() {
auto typeDesc = std::dynamic_pointer_cast<DescFwdImpl<mkldnn::lstm_forward::desc>>(desc);
if (typeDesc == nullptr) {
THROW_IE_EXCEPTION << "Cannot cast descriptor!";
}
return typeDesc->getPtr();
}
MKLDNNDescriptor::MKLDNNDescriptor(std::shared_ptr<mkldnn::gru_forward::desc> desc) {
this->desc.reset(new DescFwdImpl<mkldnn::gru_forward::desc>(desc));
}
MKLDNNDescriptor::operator std::shared_ptr<mkldnn::gru_forward::desc>() {
auto typeDesc = std::dynamic_pointer_cast<DescFwdImpl<mkldnn::gru_forward::desc>>(desc);
if (typeDesc == nullptr) {
THROW_IE_EXCEPTION << "Cannot cast descriptor!";
}
return typeDesc->getPtr();
}
MKLDNNDescriptor::MKLDNNDescriptor(std::shared_ptr<mkldnn::lbr_gru_forward::desc> desc) {
this->desc.reset(new DescFwdImpl<mkldnn::lbr_gru_forward::desc>(desc));
}
MKLDNNDescriptor::operator std::shared_ptr<mkldnn::lbr_gru_forward::desc>() {
auto typeDesc = std::dynamic_pointer_cast<DescFwdImpl<mkldnn::lbr_gru_forward::desc>>(desc);
if (typeDesc == nullptr) {
THROW_IE_EXCEPTION << "Cannot cast descriptor!";
}
@ -168,44 +170,7 @@ MKLDNNDescriptor::MKLDNNDescriptor(std::shared_ptr<mkldnn::eltwise_forward::desc
}
MKLDNNDescriptor::operator std::shared_ptr<mkldnn::eltwise_forward::desc>() {
DescFwdImpl<mkldnn::eltwise_forward::desc> *typeDesc =
dynamic_cast<DescFwdImpl<mkldnn::eltwise_forward::desc> *>(desc.get());
if (typeDesc == nullptr) {
THROW_IE_EXCEPTION << "Cannot cast descriptor!";
}
return typeDesc->getPtr();
}
MKLDNNDescriptor::MKLDNNDescriptor(std::shared_ptr<mkldnn::quantization_forward::desc> desc) {
this->desc.reset(new DescFwdImpl<mkldnn::quantization_forward::desc>(desc));
}
MKLDNNDescriptor::operator std::shared_ptr<mkldnn::quantization_forward::desc>() {
auto *typeDesc = dynamic_cast<DescFwdImpl<mkldnn::quantization_forward::desc> *>(desc.get());
if (typeDesc == nullptr) {
THROW_IE_EXCEPTION << "Cannot cast descriptor!";
}
return typeDesc->getPtr();
}
MKLDNNDescriptor::MKLDNNDescriptor(std::shared_ptr<mkldnn::binary_convolution_forward::desc> desc) {
this->desc.reset(new DescFwdImpl<mkldnn::binary_convolution_forward::desc>(desc));
}
MKLDNNDescriptor::operator std::shared_ptr<mkldnn::binary_convolution_forward::desc>() {
auto *typeDesc = dynamic_cast<DescFwdImpl<mkldnn::binary_convolution_forward::desc> *>(desc.get());
if (typeDesc == nullptr) {
THROW_IE_EXCEPTION << "Cannot cast descriptor!";
}
return typeDesc->getPtr();
}
MKLDNNDescriptor::MKLDNNDescriptor(std::shared_ptr<mkldnn::deformable_convolution_forward::desc> desc) {
this->desc.reset(new DescFwdImpl<mkldnn::deformable_convolution_forward::desc>(desc));
}
MKLDNNDescriptor::operator std::shared_ptr<mkldnn::deformable_convolution_forward::desc>() {
auto *typeDesc = dynamic_cast<DescFwdImpl<mkldnn::deformable_convolution_forward::desc> *>(desc.get());
auto typeDesc = std::dynamic_pointer_cast<DescFwdImpl<mkldnn::eltwise_forward::desc>>(desc);
if (typeDesc == nullptr) {
THROW_IE_EXCEPTION << "Cannot cast descriptor!";
}
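Editor's sketch (hypothetical names, not from this patch): the change repeated through this file, reduced to its core. dynamic_pointer_cast on the shared_ptr replaces dynamic_cast on desc.get(), so the extracted pointer keeps shared ownership.

#include <memory>
#include <stdexcept>

struct IDesc { virtual ~IDesc() = default; };

template <class T>
struct DescImpl : IDesc {
    explicit DescImpl(std::shared_ptr<T> d) : desc(std::move(d)) {}
    std::shared_ptr<T> desc;
};

template <class T>
std::shared_ptr<T> extract(const std::shared_ptr<IDesc> &holder) {
    auto typed = std::dynamic_pointer_cast<DescImpl<T>>(holder);
    if (!typed) throw std::runtime_error("Cannot cast descriptor!");
    return typed->desc;   // shared ownership is preserved
}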

View File

@ -6,8 +6,7 @@
#include <memory>
#include <string>
#include <mkldnn.hpp>
#include <mkldnn/desc_iterator.hpp>
#include "mkldnn/ie_mkldnn.h"
class MKLDNNDescriptor {
public:
@ -31,27 +30,24 @@ public:
explicit MKLDNNDescriptor(std::shared_ptr<mkldnn::pooling_forward::desc> desc);
operator std::shared_ptr<mkldnn::pooling_forward::desc>();
explicit MKLDNNDescriptor(std::shared_ptr<mkldnn::roi_pooling_forward::desc> desc);
operator std::shared_ptr<mkldnn::roi_pooling_forward::desc>();
explicit MKLDNNDescriptor(std::shared_ptr<mkldnn::softmax_forward::desc> desc);
operator std::shared_ptr<mkldnn::softmax_forward::desc>();
explicit MKLDNNDescriptor(std::shared_ptr<mkldnn::rnn_forward::desc> desc);
operator std::shared_ptr<mkldnn::rnn_forward::desc>();
explicit MKLDNNDescriptor(std::shared_ptr<mkldnn::vanilla_rnn_forward::desc> desc);
operator std::shared_ptr<mkldnn::vanilla_rnn_forward::desc>();
explicit MKLDNNDescriptor(std::shared_ptr<mkldnn::lstm_forward::desc> desc);
operator std::shared_ptr<mkldnn::lstm_forward::desc>();
explicit MKLDNNDescriptor(std::shared_ptr<mkldnn::gru_forward::desc> desc);
operator std::shared_ptr<mkldnn::gru_forward::desc>();
explicit MKLDNNDescriptor(std::shared_ptr<mkldnn::lbr_gru_forward::desc> desc);
operator std::shared_ptr<mkldnn::lbr_gru_forward::desc>();
explicit MKLDNNDescriptor(std::shared_ptr<mkldnn::eltwise_forward::desc> desc);
operator std::shared_ptr<mkldnn::eltwise_forward::desc>();
explicit MKLDNNDescriptor(std::shared_ptr<mkldnn::quantization_forward::desc> desc);
operator std::shared_ptr<mkldnn::quantization_forward::desc>();
explicit MKLDNNDescriptor(std::shared_ptr<mkldnn::binary_convolution_forward::desc> desc);
operator std::shared_ptr<mkldnn::binary_convolution_forward::desc>();
explicit MKLDNNDescriptor(std::shared_ptr<mkldnn::deformable_convolution_forward::desc> desc);
operator std::shared_ptr<mkldnn::deformable_convolution_forward::desc>();
mkldnn::primitive_desc_iterator createPrimitiveDescriptorIterator(const mkldnn::engine &engine,
const mkldnn::primitive_attr &attr = mkldnn::primitive_attr()) const;
@ -66,6 +62,7 @@ private:
virtual ~IDesc() {}
virtual mkldnn::primitive_desc_iterator createPrimitiveDescriptorIterator(const mkldnn::primitive_attr &attr,
const mkldnn::engine &engine) const = 0;
static constexpr bool allow_empty = true;
};
template <class T>
@ -76,7 +73,7 @@ private:
mkldnn::primitive_desc_iterator createPrimitiveDescriptorIterator(const mkldnn::primitive_attr &attr,
const mkldnn::engine &engine) const override {
return mkldnn::primitive_desc_iterator(*desc, attr, engine);
return mkldnn::primitive_desc_iterator(&desc->data, &attr, engine, nullptr, allow_empty);
}
std::shared_ptr<T>& getPtr() {
@ -95,7 +92,7 @@ private:
mkldnn::primitive_desc_iterator createPrimitiveDescriptorIterator(const mkldnn::primitive_attr &attr,
const mkldnn::engine &engine) const override {
return mkldnn::primitive_desc_iterator(*desc, attr, engine, *prim);
return mkldnn::primitive_desc_iterator(&desc->data, &attr, engine, prim.get()->get(), allow_empty);
}
std::shared_ptr<T>& getPtr() {

View File

@ -64,7 +64,8 @@ public:
}
operator mkldnn::memory::dims() const {
return dims;
// TODO: this converts on every call, which is inefficient
return mkldnn::memory::dims(dims.begin(), dims.end());
}
bool operator == (const MKLDNNDims& rhs) const {

View File

@ -270,7 +270,7 @@ bool MKLDNNEdge::nodeCanChangeDesc(const MKLDNNNodePtr &node) const {
/// If we have {any, any, any} -> {any} or {any} -> {any, any, any} or {any} -> {any}, it means that
/// the layer doesn't change the memory format
/// We don't support {any, any, nchw} -> {any}
InferenceEngine::TensorDesc MKLDNNEdge::getSpecifiedInputDesc(std::map<mkldnn::memory::format, size_t> formats, size_t enterCountUp, size_t enterCountDown) {
InferenceEngine::TensorDesc MKLDNNEdge::getSpecifiedInputDesc(std::map<memory::format_tag, size_t> formats, size_t enterCountUp, size_t enterCountDown) {
InferenceEngine::TensorDesc inDesc;
if (inputDesc.getLayout() != InferenceEngine::Layout::ANY) {
@ -312,8 +312,8 @@ InferenceEngine::TensorDesc MKLDNNEdge::getSpecifiedInputDesc(std::map<mkldnn::m
}
if (child->getSelectedPrimitiveDescriptor()->getConfig().inConfs.size() <= childIdx)
childIdx = 0;
memory::format childInDesc = MKLDNNMemoryDesc(child->getSelectedPrimitiveDescriptor()->getConfig().inConfs[childIdx].desc).getFormat();
if (childInDesc != memory::any && childInDesc != memory::format_undef) {
memory::format_tag childInDesc = MKLDNNMemoryDesc(child->getSelectedPrimitiveDescriptor()->getConfig().inConfs[childIdx].desc).getFormat();
if (childInDesc != memory::format_tag::any && childInDesc != memory::format_tag::undef) {
if (formats.find(childInDesc) == formats.end())
formats[childInDesc] = 1;
else
@ -325,7 +325,7 @@ InferenceEngine::TensorDesc MKLDNNEdge::getSpecifiedInputDesc(std::map<mkldnn::m
if (enterCountUp < 2) {
childInDesc = MKLDNNMemoryDesc(childEdge->getSpecifiedOutputDesc(formats, enterCountUp, ++enterCountDown)).getFormat();
if (childInDesc != memory::any && childInDesc != memory::format_undef) {
if (childInDesc != memory::format_tag::any && childInDesc != memory::format_tag::undef) {
if (formats.find(childInDesc) == formats.end())
formats[childInDesc] = 1;
else
@ -346,8 +346,8 @@ InferenceEngine::TensorDesc MKLDNNEdge::getSpecifiedInputDesc(std::map<mkldnn::m
if (parent->getSelectedPrimitiveDescriptor()->getConfig().outConfs.size() <= parentIdx) {
parentIdx = 0;
}
memory::format parentOutDesc = MKLDNNMemoryDesc(parent->getSelectedPrimitiveDescriptor()->getConfig().outConfs[parentIdx].desc).getFormat();
if (parentOutDesc != memory::any && parentOutDesc != memory::format_undef) {
memory::format_tag parentOutDesc = MKLDNNMemoryDesc(parent->getSelectedPrimitiveDescriptor()->getConfig().outConfs[parentIdx].desc).getFormat();
if (parentOutDesc != memory::format_tag::any && parentOutDesc != memory::format_tag::undef) {
if (formats.find(parentOutDesc) == formats.end())
formats[parentOutDesc] = 1;
else
@ -359,7 +359,7 @@ InferenceEngine::TensorDesc MKLDNNEdge::getSpecifiedInputDesc(std::map<mkldnn::m
if (enterCountUp < 2) {
parentOutDesc = MKLDNNMemoryDesc(parentEdge->getSpecifiedInputDesc(formats, ++enterCountUp, enterCountDown)).getFormat();
if (parentOutDesc != memory::any && parentOutDesc != memory::format_undef) {
if (parentOutDesc != memory::format_tag::any && parentOutDesc != memory::format_tag::undef) {
if (formats.find(parentOutDesc) == formats.end())
formats[parentOutDesc] = 1;
else
@ -370,7 +370,7 @@ InferenceEngine::TensorDesc MKLDNNEdge::getSpecifiedInputDesc(std::map<mkldnn::m
}
size_t maxFormatCount = 0;
memory::format desc = MKLDNNMemory::GetPlainFormat(getDims());
memory::format_tag desc = MKLDNNMemory::GetPlainFormat(getDims());
for (auto &it : formats) {
if (maxFormatCount < it.second && MKLDNNMemory::isConsistant(getDims(), it.first)) {
maxFormatCount = it.second;
@ -389,7 +389,7 @@ InferenceEngine::TensorDesc MKLDNNEdge::getSpecifiedInputDesc(std::map<mkldnn::m
return MKLDNNMemoryDesc(getDims(), inDataType, desc);
}
InferenceEngine::TensorDesc MKLDNNEdge::getSpecifiedOutputDesc(std::map<mkldnn::memory::format, size_t> formats, size_t enterCountUp, size_t enterCountDown) {
InferenceEngine::TensorDesc MKLDNNEdge::getSpecifiedOutputDesc(std::map<memory::format_tag, size_t> formats, size_t enterCountUp, size_t enterCountDown) {
InferenceEngine::TensorDesc outDesc;
if (outputDesc.getLayout() != InferenceEngine::Layout::ANY) {
@ -446,8 +446,8 @@ InferenceEngine::TensorDesc MKLDNNEdge::getSpecifiedOutputDesc(std::map<mkldnn::
if (parent->getSelectedPrimitiveDescriptor()->getConfig().outConfs.size() <= parentIdx) {
parentIdx = 0;
}
memory::format parentOutDesc = MKLDNNMemoryDesc(parent->getSelectedPrimitiveDescriptor()->getConfig().outConfs[parentIdx].desc).getFormat();
if (parentOutDesc != memory::any && parentOutDesc != memory::format_undef) {
memory::format_tag parentOutDesc = MKLDNNMemoryDesc(parent->getSelectedPrimitiveDescriptor()->getConfig().outConfs[parentIdx].desc).getFormat();
if (parentOutDesc != memory::format_tag::any && parentOutDesc != memory::format_tag::undef) {
if (formats.find(parentOutDesc) == formats.end())
formats[parentOutDesc] = 1;
else
@ -459,7 +459,7 @@ InferenceEngine::TensorDesc MKLDNNEdge::getSpecifiedOutputDesc(std::map<mkldnn::
if (enterCountDown < 2) {
parentOutDesc = MKLDNNMemoryDesc(parentEdge->getSpecifiedInputDesc(formats, ++enterCountUp, enterCountDown)).getFormat();
if (parentOutDesc != memory::any && parentOutDesc != memory::format_undef) {
if (parentOutDesc != memory::format_tag::any && parentOutDesc != memory::format_tag::undef) {
if (formats.find(parentOutDesc) == formats.end())
formats[parentOutDesc] = 1;
else
@ -480,8 +480,8 @@ InferenceEngine::TensorDesc MKLDNNEdge::getSpecifiedOutputDesc(std::map<mkldnn::
if (child->getSelectedPrimitiveDescriptor()->getConfig().inConfs.size() <= childIdx) {
childIdx = 0;
}
memory::format childInDesc = MKLDNNMemoryDesc(child->getSelectedPrimitiveDescriptor()->getConfig().inConfs[childIdx].desc).getFormat();
if (childInDesc != memory::any && childInDesc != memory::format_undef) {
memory::format_tag childInDesc = MKLDNNMemoryDesc(child->getSelectedPrimitiveDescriptor()->getConfig().inConfs[childIdx].desc).getFormat();
if (childInDesc != memory::format_tag::any && childInDesc != memory::format_tag::undef) {
if (formats.find(childInDesc) == formats.end())
formats[childInDesc] = 1;
else
@ -493,7 +493,7 @@ InferenceEngine::TensorDesc MKLDNNEdge::getSpecifiedOutputDesc(std::map<mkldnn::
if (enterCountDown < 2) {
childInDesc = MKLDNNMemoryDesc(childEdge->getSpecifiedOutputDesc(formats, enterCountUp, ++enterCountDown)).getFormat();
if (childInDesc != memory::any && childInDesc != memory::format_undef) {
if (childInDesc != memory::format_tag::any && childInDesc != memory::format_tag::undef) {
if (formats.find(childInDesc) == formats.end())
formats[childInDesc] = 1;
else
@ -504,7 +504,7 @@ InferenceEngine::TensorDesc MKLDNNEdge::getSpecifiedOutputDesc(std::map<mkldnn::
}
size_t maxFormatCount = 0;
memory::format format = MKLDNNMemory::GetPlainFormat(getDims());
memory::format_tag format = MKLDNNMemory::GetPlainFormat(getDims());
for (auto &it : formats) {
if (maxFormatCount < it.second && MKLDNNMemory::isConsistant(getDims(), it.first)) {
maxFormatCount = it.second;

View File

@ -5,10 +5,13 @@
#pragma once
#include <ie_blob.h>
#include <memory>
#include "mkldnn_memory.h"
#include "mkldnn_dims.h"
#include "mkldnn/ie_mkldnn.h"
#include <map>
#include <memory>
#include <vector>
namespace MKLDNNPlugin {
@ -76,8 +79,10 @@ private:
InferenceEngine::TensorDesc getInputDesc();
InferenceEngine::TensorDesc getOutputDesc();
InferenceEngine::TensorDesc getSpecifiedInputDesc(std::map<mkldnn::memory::format, size_t> formats, size_t enterCountUp = 1, size_t enterCountDown = 0);
InferenceEngine::TensorDesc getSpecifiedOutputDesc(std::map<mkldnn::memory::format, size_t> formats, size_t enterCountUp = 0, size_t enterCountDown = 1);
InferenceEngine::TensorDesc getSpecifiedInputDesc(std::map<mkldnn::memory::format_tag, size_t> formats,
size_t enterCountUp = 1, size_t enterCountDown = 0);
InferenceEngine::TensorDesc getSpecifiedOutputDesc(std::map<mkldnn::memory::format_tag, size_t> formats,
size_t enterCountUp = 0, size_t enterCountDown = 1);
InferenceEngine::TensorDesc inputDesc;
InferenceEngine::TensorDesc outputDesc;

View File

@ -77,17 +77,12 @@ MKLDNNExecNetwork::MKLDNNExecNetwork(const InferenceEngine::CNNNetwork &network,
}
OV_ITT_TASK_NEXT(taskChain, "createConstInputs");
auto createConstInputTo = [&](CNNLayerPtr layer, Blob::Ptr blob, std::string name) {
LayerParams attrs = {layer.get()->name + "_const_" + name, "Const", blob->getTensorDesc().getPrecision()};
auto createConstInputTo = [&](CNNLayerPtr layer, Blob::Ptr blob, const std::vector<size_t>& shape, const std::string& name) {
LayerParams attrs = {layer->name + "_const_" + name, "Const", blob->getTensorDesc().getPrecision()};
auto constLayer = std::make_shared<InferenceEngine::CNNLayer>(attrs);
constLayer->blobs["custom"] = blob;
std::vector<size_t> constDims(layer->insData[0].lock()->getDims().size(), 1);
if (constDims.size() > 1)
constDims[1] = blob.get()->size();
else
constDims[0] = blob.get()->size();
const TensorDesc& td = {blob->getTensorDesc().getPrecision(), constDims, TensorDesc::getLayoutByDims(constDims)};
const TensorDesc& td = {blob->getTensorDesc().getPrecision(), shape, TensorDesc::getLayoutByDims(shape)};
DataPtr newEdgeAfterLayer(new Data(constLayer->name, td));
newEdgeAfterLayer->setName(constLayer->name);
@ -107,16 +102,27 @@ MKLDNNExecNetwork::MKLDNNExecNetwork(const InferenceEngine::CNNNetwork &network,
layer->insData.push_back(newEdgeAfterLayer);
};
// The code block below transforms legacy layers to a form more compatible with opset1 in order to simplify future migration
// TODO: remove after plug-in is migrated on opset1
auto all_layers = details::CNNNetSortTopologically(_clonedNetwork);
for (auto &layer : all_layers) {
if (layer->type == "ScaleShift" && layer->insData.size() == 1) {
auto constDimsRank = layer->insData[0].lock()->getDims().size();
Blob::Ptr scalesBlob = layer->blobs["weights"];
if (scalesBlob != nullptr)
createConstInputTo(layer, scalesBlob, "weights");
if (scalesBlob != nullptr) {
std::vector<size_t> shape(constDimsRank, 1);
shape[shape.size() > 1 ? 1 : 0] = scalesBlob->size();
createConstInputTo(layer, scalesBlob, shape, "weights");
}
Blob::Ptr shiftBlob = layer->blobs["biases"];
if (shiftBlob != nullptr) {
createConstInputTo(layer, shiftBlob, "biases");
std::vector<size_t> shape(constDimsRank, 1);
shape[shape.size() > 1 ? 1 : 0] = shiftBlob->size();
createConstInputTo(layer, shiftBlob, shape, "biases");
} else if (scalesBlob != nullptr) {
Blob::Ptr biases = make_shared_blob<float>(scalesBlob->getTensorDesc());
if (biases == nullptr)
@ -126,12 +132,65 @@ MKLDNNExecNetwork::MKLDNNExecNetwork(const InferenceEngine::CNNNetwork &network,
for (size_t i = 0; i < biases->size(); i++)
biasesPtr[i] = 0;
createConstInputTo(layer, biases, "biases");
std::vector<size_t> shape(constDimsRank, 1);
shape[shape.size() > 1 ? 1 : 0] = biases->size();
createConstInputTo(layer, biases, shape, "biases");
}
} else if (layer->type == "PReLU" && layer->insData.size() == 1) {
Blob::Ptr scalesBlob = layer->blobs["weights"];
if (scalesBlob != nullptr)
createConstInputTo(layer, scalesBlob, "weights");
if (scalesBlob != nullptr) {
std::vector<size_t> shape(layer->insData[0].lock()->getDims().size(), 1);
shape[shape.size() > 1 ? 1 : 0] = scalesBlob->size();
createConstInputTo(layer, scalesBlob, shape, "weights");
}
} else if (layer->type == "DeformableConvolution") {
auto * defConvLayer = dynamic_cast<DeformableConvolutionLayer*>(layer.get());
if (defConvLayer == nullptr)
THROW_IE_EXCEPTION << "Cannot convert deformable convolution layer.";
Blob::Ptr weightsBlob = defConvLayer->blobs["weights"];
if (weightsBlob != nullptr) {
std::vector<size_t> shape;
if (defConvLayer->_group != 1) {
shape.push_back(defConvLayer->_group);
}
shape.push_back(defConvLayer->_out_depth);
shape.push_back(defConvLayer->input()->getDims()[1]);
for (int i = 1; i <= defConvLayer->_kernel.size(); i++) {
shape.push_back(defConvLayer->_kernel[defConvLayer->_kernel.size() - i]);
}
createConstInputTo(layer, weightsBlob, shape, "weights");
defConvLayer->blobs.clear();
defConvLayer->_weights = nullptr;
}
} else if (layer->type == "BinaryConvolution") {
auto * binConvLayer = dynamic_cast<BinaryConvolutionLayer*>(layer.get());
if (binConvLayer == nullptr)
THROW_IE_EXCEPTION << "Cannot convert binary convolution layer.";
Blob::Ptr weightsBlob = binConvLayer->blobs["weights"];
if (weightsBlob != nullptr) {
std::vector<size_t> shape;
if (binConvLayer->_group != 1) {
shape.push_back(binConvLayer->_group);
}
shape.push_back(binConvLayer->_out_depth);
shape.push_back(binConvLayer->input()->getDims()[1]);
for (int i = 1; i <= binConvLayer->_kernel.size(); i++) {
shape.push_back(binConvLayer->_kernel[binConvLayer->_kernel.size() - i]);
}
createConstInputTo(layer, weightsBlob, shape, "weights");
binConvLayer->blobs.clear();
binConvLayer->_weights = nullptr;
}
}
}
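Editor's sketch (hypothetical helper, not from this patch): the shape rule the branches above share, isolated. A flat blob of N values lands on the channel axis whenever the destination rank allows one.

static std::vector<size_t> makeConstShape(size_t rank, size_t blobSize) {
    std::vector<size_t> shape(rank, 1);
    shape[rank > 1 ? 1 : 0] = blobSize;   // e.g. rank 4, 64 values -> {1, 64, 1, 1}
    return shape;
}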

View File

@ -3,8 +3,10 @@
//
#include "mkldnn_extension_utils.h"
#include "utils/general_utils.h"
#include <limits>
#include <vector>
#include <numeric>
using namespace mkldnn;
using namespace MKLDNNPlugin;
@ -15,8 +17,6 @@ uint8_t MKLDNNExtensionUtils::sizeOfDataType(mkldnn::memory::data_type dataType)
return 4;
case mkldnn::memory::data_type::s32:
return 4;
case mkldnn::memory::data_type::s16:
return 2;
case mkldnn::memory::data_type::bf16:
return 2;
case mkldnn::memory::data_type::s8:
@ -25,9 +25,8 @@ uint8_t MKLDNNExtensionUtils::sizeOfDataType(mkldnn::memory::data_type dataType)
return 1;
case mkldnn::memory::data_type::bin:
return 1;
case mkldnn::memory::data_type::data_undef:
case mkldnn::memory::data_type::undef:
return 0;
default:
THROW_IE_EXCEPTION << "Unsupported data type.";
}
@ -36,21 +35,18 @@ uint8_t MKLDNNExtensionUtils::sizeOfDataType(mkldnn::memory::data_type dataType)
memory::data_type MKLDNNExtensionUtils::IEPrecisionToDataType(InferenceEngine::Precision prec) {
switch (prec) {
case InferenceEngine::Precision::FP32:
return memory::f32;
return memory::data_type::f32;
case InferenceEngine::Precision::I32:
return memory::s32;
case InferenceEngine::Precision::I16:
return memory::s16;
return memory::data_type::s32;
case InferenceEngine::Precision::BF16:
return memory::bf16;
return memory::data_type::bf16;
case InferenceEngine::Precision::I8:
return memory::s8;
return memory::data_type::s8;
case InferenceEngine::Precision::U8:
case InferenceEngine::Precision::BOOL:
return memory::u8;
return memory::data_type::u8;
case InferenceEngine::Precision::BIN:
return memory::bin;
return memory::data_type::bin;
default: {
THROW_IE_EXCEPTION << "The plugin does not support " << prec.name();
}
@ -59,21 +55,18 @@ memory::data_type MKLDNNExtensionUtils::IEPrecisionToDataType(InferenceEngine::P
InferenceEngine::Precision MKLDNNExtensionUtils::DataTypeToIEPrecision(memory::data_type dataType) {
switch (dataType) {
case memory::f32:
return InferenceEngine::Precision(InferenceEngine::Precision::FP32);
case memory::s32:
case memory::data_type::f32:
return InferenceEngine::Precision::FP32;
case memory::data_type::s32:
return InferenceEngine::Precision::I32;
case memory::s16:
return InferenceEngine::Precision::I16;
case memory::bf16:
case memory::data_type::bf16:
return InferenceEngine::Precision::BF16;
case memory::s8:
case memory::data_type::s8:
return InferenceEngine::Precision::I8;
case memory::u8:
case memory::data_type::u8:
return InferenceEngine::Precision::U8;
case memory::bin:
case memory::data_type::bin:
return InferenceEngine::Precision::BIN;
default: {
THROW_IE_EXCEPTION << "Unsupported data type.";
}
@ -125,15 +118,82 @@ bool MKLDNNExtensionUtils::initTensorsAreEqual(const InferenceEngine::TensorDesc
in1Block.getOffsetPadding() != uninitNum && in2Block.getOffsetPadding() != uninitNum);
}
PartialBlkDesc PartialBlkDesc::makePlain(const InferenceEngine::SizeVector &dims) {
PartialBlkDesc res;
res.outer_order.resize(dims.size());
std::iota(res.outer_order.begin(), res.outer_order.end(), 0);
return res;
}
PartialBlkDesc PartialBlkDesc::makeCBlocked(const InferenceEngine::SizeVector &dims, size_t block_size) {
PartialBlkDesc res;
res.outer_order.resize(dims.size());
std::iota(res.outer_order.begin(), res.outer_order.end(), 0);
res.inner_blk_size = {block_size};
res.inner_blk_idxes = {1};
return res;
}
PartialBlkDesc PartialBlkDesc::extractFrom(const InferenceEngine::TensorDesc &desc) {
if (desc.getLayout() == InferenceEngine::ANY)
THROW_IE_EXCEPTION << "Cannot extract partial blocked descriptor for `ANY` layout";
const auto &dims = desc.getDims();
const auto &blk = desc.getBlockingDesc();
const auto &blk_dims = blk.getBlockDims();
const auto &blk_order = blk.getOrder();
PartialBlkDesc res;
res.outer_order = {blk_order.begin(), blk_order.begin() + dims.size()};
res.inner_blk_idxes = {blk_order.begin() + dims.size(), blk_order.end()};
res.inner_blk_size = {blk_dims.begin() + dims.size(), blk_dims.end()};
return res;
}
bool PartialBlkDesc::isAutoExtendedWith(const InferenceEngine::SizeVector &dims) const {
auto tmp_dims = dims;
for (int i = 0; i < inner_blk_size.size(); i++) {
auto idx = inner_blk_idxes[i];
auto blk = inner_blk_size[i];
if (tmp_dims[idx] % blk == 0)
tmp_dims[idx] /= blk;
else
return true;
}
return false;
}
bool PartialBlkDesc::operator == (const PartialBlkDesc& it) const {
return std::tie(this->inner_blk_idxes,
this->inner_blk_size,
this->outer_order) ==
std::tie(it.inner_blk_idxes,
it.inner_blk_size,
it.outer_order);
}
// Lexicographical compare of content
bool PartialBlkDesc::operator < (const PartialBlkDesc& it) const {
return std::tie(this->inner_blk_idxes,
this->inner_blk_size,
this->outer_order) <
std::tie(it.inner_blk_idxes,
it.inner_blk_size,
it.outer_order);
}
std::string MKLDNNExtensionUtils::getReorderArgs(const InferenceEngine::TensorDesc &parentDesc, const InferenceEngine::TensorDesc &childDesc) {
std::string inArgs, outArgs;
if (parentDesc.getPrecision() != childDesc.getPrecision()) {
inArgs += (inArgs.empty() ? "" : "_") + std::string(parentDesc.getPrecision().name());
outArgs += (outArgs.empty() ? "" : "_") + std::string(childDesc.getPrecision().name());
}
if (MKLDNNMemoryDesc(parentDesc).getFormat() != MKLDNNMemoryDesc(childDesc).getFormat()) {
inArgs += (inArgs.empty() ? "" : "_") + MKLDNNMemory::formatToString(MKLDNNMemoryDesc(parentDesc).getFormat());
outArgs += (outArgs.empty() ? "" : "_") + MKLDNNMemory::formatToString(MKLDNNMemoryDesc(childDesc).getFormat());
auto fmt_tag_src = MKLDNNMemoryDesc(parentDesc).getFormat();
auto fmt_tag_dst = MKLDNNMemoryDesc(childDesc).getFormat();
if (fmt_tag_src != fmt_tag_dst || one_of(mkldnn::memory::format_tag::undef, fmt_tag_src, fmt_tag_dst)) {
inArgs += (inArgs.empty() ? "" : "_") + MKLDNNMemory::formatToString(fmt_tag_src);
outArgs += (outArgs.empty() ? "" : "_") + MKLDNNMemory::formatToString(fmt_tag_dst);
}
return inArgs + "_" + outArgs;
}

View File

@ -15,6 +15,61 @@
namespace MKLDNNPlugin {
/**
* Partial tensor descriptor
*
* Represents a class of layouts, e.g. Plain, TailC, CBlocked and others.
*
* Tensors are in the same layout family if they have the same PartialBlkDesc.
*
* Any tensor has the same PartialBlkDesc as its subview tensor.
*
* PartialBlkDesc plus dims allow reconstructing the real TensorDesc (dense representation).
*/
class PartialBlkDesc {
public:
/**
* Check if this partial blocking desc will lead to additional zero padding
* for a real tensor with the provided dims
*
* Example: dims [2, 3, 8, 8] with blocking by 16 on the second dim lead
* to effective dims [2, 16, 8, 8] with all values in [:, 3:16, :, :] zeroed.
*
* @param dims to check for zero auto padding
* @return true if the provided dims would use auto padding, otherwise false
*/
bool isAutoExtendedWith(const InferenceEngine::SizeVector &dims) const;
/**
* Construct PartialBlkDesc from provided TensorDesc
*
* PartialBlkDesc has less expressive power, so some information from the TensorDesc is dropped.
* Different TensorDesc objects may therefore map to equal PartialBlkDesc objects.
*
* @param desc to extract PartialBlkDesc layout-kind information from
* @return PartialBlkDesc object corresponding to the layout described in desc
*/
static PartialBlkDesc extractFrom(const InferenceEngine::TensorDesc &desc);
/** Construct plain PartialBlkDesc based on dims information */
static PartialBlkDesc makePlain(const InferenceEngine::SizeVector &dims);
/** Construct blocked Channel PartialBlkDesc based on dims information */
static PartialBlkDesc makeCBlocked(const InferenceEngine::SizeVector &dims, size_t block_size);
/** Comparison operators; allow using PartialBlkDesc as a std::map key */
bool operator == (const PartialBlkDesc& it) const;
bool operator < (const PartialBlkDesc& it) const;
private:
PartialBlkDesc() = default;
InferenceEngine::SizeVector outer_order;
InferenceEngine::SizeVector inner_blk_size;
InferenceEngine::SizeVector inner_blk_idxes;
};
class MKLDNNExtensionUtils {
public:
static uint8_t sizeOfDataType(mkldnn::memory::data_type dataType);

View File

@ -35,6 +35,7 @@
#include <ie_plugin_config.hpp>
#include "utils/blob_dump.h"
#include "utils/general_utils.h"
/*****************************************************
* Debug capability
@ -447,7 +448,7 @@ void MKLDNNGraph::InitOptimalPrimitiveDescriptors() {
void MKLDNNGraph::ExecuteConstantNodesOnly() {
OV_ITT_SCOPED_TASK(itt::domains::MKLDNN_LT, "MKLDNNGraph::ExecuteConstantNodesOnly");
mkldnn::stream stream = mkldnn::stream(stream::kind::eager);
mkldnn::stream stream(eng);
for (auto &graphNode : graphNodes) {
if (!graphNode->isConstant())
continue;
@ -683,13 +684,12 @@ void MKLDNNGraph::PushInputData(const std::string& name, const InferenceEngine::
void *inter_data_ptr = input->second->getChildEdgeAt(0)->getMemory().GetData();
if (ext_data_ptr != inter_data_ptr) {
auto l = in->getTensorDesc().getLayout();
if (l == CHW && input->second->getChildEdgeAt(0)->getDims().ndims() == 4)
l = NCHW;
auto ext_tdesc = MKLDNNMemoryDesc {in->getTensorDesc()};
input->second->getChildEdgeAt(0)->getMemory().SetData(
MKLDNNExtensionUtils::IEPrecisionToDataType(in->getTensorDesc().getPrecision()),
MKLDNNMemory::Convert(l), ext_data_ptr, in->byteSize(), false);
auto ext_mem = MKLDNNMemory(eng);
ext_mem.Create(ext_tdesc, ext_data_ptr, false);
input->second->getChildEdgeAt(0)->getMemory().SetData(ext_mem, 0, false);
}
// todo: make sure 'name' exists in this map...
@ -760,7 +760,8 @@ void MKLDNNGraph::Infer(int batch) {
THROW_IE_EXCEPTION << "Wrong state. Topology is not ready.";
}
mkldnn::stream stream = mkldnn::stream(stream::kind::eager);
mkldnn::stream stream(eng);
for (int i = 0; i < graphNodes.size(); i++) {
if (IsCancellationRequested()) {
ResetCancellationRequest();
@ -778,7 +779,6 @@ void MKLDNNGraph::Infer(int batch) {
OV_ITT_SCOPED_TASK(itt::domains::MKLDNNPlugin, graphNodes[i]->profiling.execute);
graphNodes[i]->execute(stream);
}
ENABLE_DUMP(do_after(DUMP_DIR, graphNodes[i]));
}
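Editor's sketch (placeholder names, not from this patch): the stream migration visible above, condensed. In v0.x a stream was created eagerly and primitives were submitted as a queue; in v1.x a stream binds to an engine and each primitive executes with an explicit argument map (assuming the mkldnn compatibility macros of oneDNN v1.x):

void run_once(mkldnn::engine &eng, mkldnn::primitive &prim,
              mkldnn::memory &src_mem, mkldnn::memory &dst_mem) {
    mkldnn::stream s(eng);
    prim.execute(s, {{MKLDNN_ARG_SRC, src_mem}, {MKLDNN_ARG_DST, dst_mem}});
    s.wait();
}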

View File

@ -30,7 +30,7 @@ public:
Ready = 1,
};
MKLDNNGraph(): status(NotReady), eng(mkldnn::engine(mkldnn::engine::kind::cpu, 0)), cancelation_requested(false) {}
MKLDNNGraph(mkldnn::engine eng = mkldnn::engine(mkldnn::engine::kind::cpu, 0)) : status(NotReady), eng(eng), cancelation_requested(false) {}
Status GetStatus() {
return status;

View File

@ -250,13 +250,15 @@ std::map<std::string, std::string> extract_node_metadata(const MKLDNNNodePtr &no
serialization_info[ExecGraphInfoSerialization::OUTPUT_PRECISIONS] = outputPrecisionsStr;
std::string outputLayoutsStr;
auto outLayouts = node->getSelectedPrimitiveDescriptor()->getOutputLayouts();
if (!outLayouts.empty()) {
outputLayoutsStr = mkldnn_fmt2str(static_cast<mkldnn_memory_format_t>(outLayouts[0]));
auto outDescs = node->getSelectedPrimitiveDescriptor()->getConfig().outConfs;
if (!outDescs.empty()) {
auto fmt0 = MKLDNNMemoryDesc(outDescs[0].desc).getFormat();
outputLayoutsStr = mkldnn::utils::fmt2str(fmt0);
bool isAllEqual = true;
for (size_t i = 1; i < outLayouts.size(); i++) {
if (outLayouts[i - 1] != outLayouts[i]) {
for (size_t i = 1; i < outDescs.size(); i++) {
if (MKLDNNMemoryDesc(outDescs[i - 1].desc).getFormat() != MKLDNNMemoryDesc(outDescs[i].desc).getFormat()) {
isAllEqual = false;
break;
}
@ -264,11 +266,13 @@ std::map<std::string, std::string> extract_node_metadata(const MKLDNNNodePtr &no
// If all output layouts are the same, we store the name only once
if (!isAllEqual) {
for (size_t i = 1; i < outLayouts.size(); i++)
outputLayoutsStr += "," + std::string(mkldnn_fmt2str(static_cast<mkldnn_memory_format_t>(outLayouts[i])));
for (size_t i = 1; i < outDescs.size(); i++) {
auto fmt = MKLDNNMemoryDesc(outDescs[i].desc).getFormat();
outputLayoutsStr += "," + std::string(mkldnn::utils::fmt2str(fmt));
}
}
} else {
outputLayoutsStr = mkldnn_fmt2str(mkldnn_format_undef);
outputLayoutsStr = mkldnn::utils::fmt2str(mkldnn::memory::format_tag::undef);
}
serialization_info[ExecGraphInfoSerialization::OUTPUT_LAYOUTS] = outputLayoutsStr;

View File

@ -18,8 +18,11 @@
#include "nodes/mkldnn_interpolate_node.h"
#include "nodes/mkldnn_input_node.h"
#include "mkldnn/ie_mkldnn.h"
#include <blob_factory.hpp>
#include <legacy/ie_layers_internal.hpp>
#include "utils/general_utils.h"
// WA for xbyak.h
#ifdef _WIN32
@ -30,7 +33,7 @@
# define _WINSOCK2API_
#endif
#endif
#include <cpu_isa_traits.hpp>
#include <cpu/x64/cpu_isa_traits.hpp>
#include <string>
#include <list>
@ -97,10 +100,8 @@ void MKLDNNGraphOptimizer::ApplyCommonGraphOptimizations(MKLDNNGraph &graph) {
FuseConvolutionAndDWConvolution(graph);
graph.RemoveDroppedNodes();
#if defined(COMPILED_CPU_MKLDNN_QUANTIZE_NODE)
FuseBinaryConvolutionAndQuantize(graph);
graph.RemoveDroppedNodes();
#endif
FuseBatchNormWithScale(graph);
graph.RemoveDroppedNodes();
@ -108,10 +109,8 @@ void MKLDNNGraphOptimizer::ApplyCommonGraphOptimizations(MKLDNNGraph &graph) {
RemoveIdentityOperator(graph);
graph.RemoveDroppedNodes();
#if defined(COMPILED_CPU_MKLDNN_ELTWISE_NODE)
FuseConvolutionSumAndConvolutionSumActivation(graph);
graph.RemoveDroppedNodes();
#endif
FuseConvolutionAndSimpleOperation(graph);
graph.RemoveDroppedNodes();
@ -140,7 +139,6 @@ void MKLDNNGraphOptimizer::ApplyImplSpecificGraphOptimizations(MKLDNNGraph &grap
RemoveIOScaleShifts(graph);
graph.RemoveDroppedNodes();
#if defined (COMPILED_CPU_MKLDNN_REORDER_NODE)
ChangeConvertToReorder(graph);
graph.RemoveDroppedNodes();
@ -149,7 +147,6 @@ void MKLDNNGraphOptimizer::ApplyImplSpecificGraphOptimizations(MKLDNNGraph &grap
DropConvertReorder(graph);
graph.RemoveDroppedNodes();
#endif
MergePermuteAndReorder(graph);
graph.RemoveDroppedNodes();
@ -277,54 +274,54 @@ void MKLDNNGraphOptimizer::FuseConvolutionAndZeroPoints(MKLDNNGraph &graph) {
return true;
};
auto initializeWeightsZeroPoints = [](MKLDNNNodePtr node, MKLDNNNodePtr parent0) {
auto* convNode = dynamic_cast<MKLDNNConvolutionNode*>(node.get());
if (convNode == nullptr)
THROW_IE_EXCEPTION << "Cannot get convolution node " << node->getName();
int OC = node->getChildEdgesAtPort(0)[0]->getDims()[1];
if (parent0->getType() == Eltwise) {
auto* eltwiseNode = dynamic_cast<MKLDNNEltwiseNode *>(parent0.get());
if (eltwiseNode->getOpType() != Subtract)
return false;
if (parent0->getParentEdges().size() != 2)
return false;
if (parent0->getParentEdgesAtPort(1)[0]->getParent()->getCnnLayer()->type == "Const") {
auto arg0 = parent0->getParentEdgesAtPort(1)[0]->getParent();
if (arg0->getCnnLayer()->outData[0]->getPrecision() != Precision::I8)
return false;
if (parent0->getParentEdgesAtPort(1)[0]->getDims()[0] != 1 &&
parent0->getParentEdgesAtPort(1)[0]->getDims()[0] != OC)
return false;
auto arg1 = parent0->getParentEdgesAtPort(0)[0]->getParent();
if (arg1->getCnnLayer()->outData[0]->getPrecision() != Precision::I8)
return false;
auto zeroPointsBlob = dynamic_cast<TBlob<int8_t>*>(arg0->getCnnLayer()->blobs["custom"].get());
if (zeroPointsBlob == nullptr)
THROW_IE_EXCEPTION << "Cannot cast to TBlob internal zero points blob";
auto zeroPointsData = zeroPointsBlob->buffer().as<int8_t*>();
if (zeroPointsData == nullptr)
THROW_IE_EXCEPTION << "zeroPointsBlob has not allocated buffer";
for (int j = 0; j < parent0->getParentEdgesAtPort(1)[0]->getDims()[0]; j++) {
convNode->weightsZeroPoints.push_back(static_cast<float>(zeroPointsData[j]));
}
} else {
return false;
}
} else {
return false;
}
return true;
};
// auto initializeWeightsZeroPoints = [](MKLDNNNodePtr node, MKLDNNNodePtr parent0) {
// auto* convNode = dynamic_cast<MKLDNNConvolutionNode*>(node.get());
// if (convNode == nullptr)
// THROW_IE_EXCEPTION << "Cannot get convolution node " << node->getName();
//
// int OC = node->getChildEdgesAtPort(0)[0]->getDims()[1];
//
// if (parent0->getType() == Eltwise) {
// auto* eltwiseNode = dynamic_cast<MKLDNNEltwiseNode *>(parent0.get());
// if (eltwiseNode->getOpType() != Subtract)
// return false;
//
// if (parent0->getParentEdges().size() != 2)
// return false;
//
// if (parent0->getParentEdgesAtPort(1)[0]->getParent()->getCnnLayer()->type == "Const") {
// auto arg0 = parent0->getParentEdgesAtPort(1)[0]->getParent();
// if (arg0->getCnnLayer()->outData[0]->getPrecision() != Precision::I8)
// return false;
//
// if (parent0->getParentEdgesAtPort(1)[0]->getDims()[0] != 1 &&
// parent0->getParentEdgesAtPort(1)[0]->getDims()[0] != OC)
// return false;
//
// auto arg1 = parent0->getParentEdgesAtPort(0)[0]->getParent();
// if (arg1->getCnnLayer()->outData[0]->getPrecision() != Precision::I8)
// return false;
//
// auto zeroPointsBlob = dynamic_cast<TBlob<int8_t>*>(arg0->getCnnLayer()->blobs["custom"].get());
// if (zeroPointsBlob == nullptr)
// THROW_IE_EXCEPTION << "Cannot cast to TBlob internal zero points blob";
//
// auto zeroPointsData = zeroPointsBlob->buffer().as<int8_t*>();
// if (zeroPointsData == nullptr)
// THROW_IE_EXCEPTION << "zeroPointsBlob has not allocated buffer";
//
// for (int j = 0; j < parent0->getParentEdgesAtPort(1)[0]->getDims()[0]; j++) {
// convNode->weightsZeroPoints.push_back(static_cast<float>(zeroPointsData[j]));
// }
// } else {
// return false;
// }
// } else {
// return false;
// }
//
// return true;
// };
auto initializeOutputCompensation = [](MKLDNNNodePtr node) {
auto* convNode = dynamic_cast<MKLDNNConvolutionNode*>(node.get());
@ -405,13 +402,14 @@ void MKLDNNGraphOptimizer::FuseConvolutionAndZeroPoints(MKLDNNGraph &graph) {
graph.DropNode(dataEltwise);
}
auto weightsEltwise = conv->getParentEdgesAtPort(1)[0]->getParent();
if (initializeWeightsZeroPoints(conv, weightsEltwise)) {
auto p_edge = weightsEltwise->getParentEdgesAtPort(1)[0];
removeEdge(graph, p_edge);
graph.DropNode(weightsEltwise);
}
// [TODO] Weights zero point is not supported on oneDNN side for the moment
// auto weightsEltwise = conv->getParentEdgesAtPort(1)[0]->getParent();
// if (initializeWeightsZeroPoints(conv, weightsEltwise)) {
// auto p_edge = weightsEltwise->getParentEdgesAtPort(1)[0];
// removeEdge(graph, p_edge);
//
// graph.DropNode(weightsEltwise);
// }
initializeOutputCompensation(conv);
}
@ -617,6 +615,12 @@ void MKLDNNGraphOptimizer::FuseConvolutionAndActivation(MKLDNNGraph &graph) {
auto& graphNodes = graph.GetNodes();
auto isFusingSupported = [&](MKLDNNNodePtr conv, MKLDNNNodePtr activation) {
auto* binConv = dynamic_cast<MKLDNNBinaryConvolutionNode *>(conv.get());
if (binConv) {
if (!binConv->canFuse(activation))
return false;
}
if (!activation->getCnnLayer())
return false;
@ -792,17 +796,23 @@ void MKLDNNGraphOptimizer::FuseConvolutionAndDepthwise(MKLDNNGraph &graph) {
return (isSutableConv || isSutableBinConv) && node->getChildEdges().size() == 1;
};
auto isSutableChildNode = [](MKLDNNNodePtr node) {
if (node->getType() != Eltwise)
auto isSutableChildNode = [](MKLDNNNodePtr parentNode, MKLDNNNodePtr childNode) {
if (childNode->getType() != Eltwise)
return false;
if (!node->getCnnLayer())
if (!childNode->getCnnLayer())
return false;
auto* eltwiseNode = dynamic_cast<MKLDNNEltwiseNode *>(node.get());
auto* binConv = dynamic_cast<MKLDNNBinaryConvolutionNode *>(parentNode.get());
if (binConv) {
if (!binConv->canFuse(childNode))
return false;
}
auto* eltwiseNode = dynamic_cast<MKLDNNEltwiseNode *>(childNode.get());
if (eltwiseNode == nullptr)
THROW_IE_EXCEPTION << "Cannot get eltwise node " << node->getName();
return ((eltwiseNode->getOpType() == MulAdd && node->getCnnLayer()->blobs.size() == 2) ||
THROW_IE_EXCEPTION << "Cannot get eltwise node " << childNode->getName();
return ((eltwiseNode->getOpType() == MulAdd && childNode->getCnnLayer()->blobs.size() == 2) ||
(eltwiseNode->getOpType() == Prelu));
};
@ -811,14 +821,14 @@ void MKLDNNGraphOptimizer::FuseConvolutionAndDepthwise(MKLDNNGraph &graph) {
if (!isSutableParentNode(conv)) continue;
auto depthwise0 = conv->getChildEdgeAt(0)->getChild();
if (!isSutableChildNode(depthwise0)) continue;
if (!isSutableChildNode(conv, depthwise0)) continue;
conv->fuseWith(depthwise0);
if (depthwise0->getChildEdges().size() == 1) {
auto depthwise1 = depthwise0->getChildEdgeAt(0)->getChild();
if (isSutableChildNode(depthwise1)) {
if (isSutableChildNode(conv, depthwise1)) {
conv->fuseWith(depthwise1);
auto parents = depthwise1->parentEdges;
@ -854,41 +864,29 @@ void MKLDNNGraphOptimizer::FuseConvolutionAndDWConvolution(MKLDNNGraph &graph) {
return node->getType() == Convolution;
};
auto isBinaryConvolutionNode = [](MKLDNNNodePtr node) {
return node->getType() == BinaryConvolution;
};
auto is1x1Convolution = [](ConvolutionLayer* layer) {
return layer->_kernel[X_AXIS] == 1 && layer->_kernel[Y_AXIS] == 1;
};
auto isSutableParentConvolution = [&](MKLDNNNodePtr node) {
if (isBinaryConvolutionNode(node)) {
auto *layer = dynamic_cast<BinaryConvolutionLayer *>(node->getCnnLayer().get());
if (layer == nullptr)
THROW_IE_EXCEPTION << "Cannot get convolution layer " << node->getName();
auto *layer = dynamic_cast<ConvolutionLayer *>(node->getCnnLayer().get());
if (layer == nullptr)
THROW_IE_EXCEPTION << "Cannot get convolution layer " << node->getName();
bool isSupportedParams = layer->_group == 1;
if (!isSupportedParams) return false;
} else {
auto *layer = dynamic_cast<ConvolutionLayer *>(node->getCnnLayer().get());
if (layer == nullptr)
THROW_IE_EXCEPTION << "Cannot get convolution layer " << node->getName();
auto* parentConvolutionNode = dynamic_cast<MKLDNNConvolutionNode*>(node.get());
if (parentConvolutionNode == nullptr)
THROW_IE_EXCEPTION << "Cannot get convolution node " << node->getName();
auto* parentConvolutionNode = dynamic_cast<MKLDNNConvolutionNode*>(node.get());
if (parentConvolutionNode == nullptr)
THROW_IE_EXCEPTION << "Cannot get convolution node " << node->getName();
if (!parentConvolutionNode->weightsZeroPoints.empty())
return false;
if (!parentConvolutionNode->weightsZeroPoints.empty())
return false;
bool isSupportedParams =
layer->_group == 1 &&
((is1x1Convolution(layer) && layer->_stride[X_AXIS] == 1 && layer->_stride[Y_AXIS] == 1) || !is1x1Convolution(layer)) &&
(layer->outData[0].get()->getPrecision() == Precision::FP32 || layer->outData[0].get()->getPrecision() == Precision::U8) &&
node->getChildEdgeAt(0)->getDims().ndims() == 4;
if (!isSupportedParams) return false;
}
// TODO [oneDNN]: is it still a valid constraint on the conv to fuse in?
bool isSupportedParams = layer->_group == 1 &&
is1x1Convolution(layer) && // TODO [oneDNN] : fusing is permitted only with 1x1 convolutions
everyone_is(1, layer->_stride[X_AXIS], layer->_stride[Y_AXIS]) &&
one_of(layer->outData[0].get()->getPrecision(), Precision::FP32, Precision::U8) &&
node->getChildEdgeAt(0)->getDims().ndims() == 4;
if (!isSupportedParams) return false;
return node->getChildEdges().size() == 1 && isConvolutionNode(node->getChildEdgeAt(0)->getChild());
};
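// everyone_is() and one_of() come from "utils/general_utils.h" (included at the
// top of this file); this commit drops the local one_of() duplicate from
// emitter.cpp further below. A sketch of the likely shape of these variadic
// helpers (everyone_is is inferred by analogy with the removed one_of):
//
//     template <typename T, typename P>
//     constexpr bool one_of(T val, P item) { return val == item; }
//     template <typename T, typename P, typename... Args>
//     constexpr bool one_of(T val, P item, Args... others) {
//         return val == item || one_of(val, others...);
//     }
//
//     template <typename T, typename P>
//     constexpr bool everyone_is(T val, P item) { return val == item; }
//     template <typename T, typename P, typename... Args>
//     constexpr bool everyone_is(T val, P item, Args... others) {
//         return val == item && everyone_is(val, others...);
//     }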
@ -898,28 +896,26 @@ void MKLDNNGraphOptimizer::FuseConvolutionAndDWConvolution(MKLDNNGraph &graph) {
if (childLayer == nullptr)
THROW_IE_EXCEPTION << "Cannot get convolution layer " << childNode->getName();
if (!isBinaryConvolutionNode(parentNode)) {
auto* parentLayer = dynamic_cast<ConvolutionLayer*>(parentNode->getCnnLayer().get());
if (parentLayer == nullptr)
THROW_IE_EXCEPTION << "Cannot get convolution layer " << parentNode->getName();
auto* parentLayer = dynamic_cast<ConvolutionLayer*>(parentNode->getCnnLayer().get());
if (parentLayer == nullptr)
THROW_IE_EXCEPTION << "Cannot get convolution layer " << parentNode->getName();
if (parentLayer->outData[0].get()->getPrecision() != childLayer->outData[0].get()->getPrecision())
return false;
if (parentLayer->outData[0].get()->getPrecision() != childLayer->outData[0].get()->getPrecision())
return false;
if (parentLayer->precision != childLayer->precision)
return false;
if (parentLayer->precision != childLayer->precision)
return false;
auto parentOutputPrecision = !parentNode->fusedWith.empty()
? parentNode->fusedWith[parentNode->fusedWith.size() - 1]->getCnnLayer()->outData[0].get()->getPrecision()
: parentNode->getCnnLayer()->outData[0].get()->getPrecision();
auto parentOutputPrecision = !parentNode->fusedWith.empty()
? parentNode->fusedWith[parentNode->fusedWith.size() - 1]->getCnnLayer()->outData[0].get()->getPrecision()
: parentNode->getCnnLayer()->outData[0].get()->getPrecision();
auto childOutputPrecision = !childNode->fusedWith.empty()
? childNode->fusedWith[childNode->fusedWith.size() - 1]->getCnnLayer()->outData[0].get()->getPrecision()
: childNode->getCnnLayer()->outData[0].get()->getPrecision();
auto childOutputPrecision = !childNode->fusedWith.empty()
? childNode->fusedWith[childNode->fusedWith.size() - 1]->getCnnLayer()->outData[0].get()->getPrecision()
: childNode->getCnnLayer()->outData[0].get()->getPrecision();
if (parentOutputPrecision != childOutputPrecision)
return false;
}
if (parentOutputPrecision != childOutputPrecision)
return false;
auto* childConvolutionNode = dynamic_cast<MKLDNNConvolutionNode*>(childNode.get());
if (childConvolutionNode == nullptr)
@ -928,50 +924,24 @@ void MKLDNNGraphOptimizer::FuseConvolutionAndDWConvolution(MKLDNNGraph &graph) {
if (!childConvolutionNode->inputZeroPoints.empty() || !childConvolutionNode->weightsZeroPoints.empty())
return false;
bool withBias = (childLayer->_biases != nullptr && childLayer->_biases->size() != 0) ||
childConvolutionNode->getBaseIntputsNumber() == 3;
auto allPads = getPaddings(*childLayer);
bool isSupportedParams = childLayer->_out_depth == childLayer->_group &&
childLayer->_out_depth != 1 &&
childLayer->_kernel[X_AXIS] == 3 && childLayer->_kernel[Y_AXIS] == 3 &&
allPads.begin[X_AXIS] == 1 && allPads.begin[Y_AXIS] == 1 &&
childLayer->_dilation[X_AXIS] == 1 && childLayer->_dilation[Y_AXIS] == 1 &&
withBias &&
everyone_is(3, childLayer->_kernel[X_AXIS], childLayer->_kernel[Y_AXIS]) &&
everyone_is(1, allPads.begin[X_AXIS], allPads.begin[Y_AXIS]) &&
everyone_is(1, allPads.end[X_AXIS], allPads.end[Y_AXIS]) &&
everyone_is(1, childLayer->_dilation[X_AXIS], childLayer->_dilation[Y_AXIS]) &&
childLayer->_stride[X_AXIS] == childLayer->_stride[Y_AXIS] &&
false && // TODO [oneDNN]: disabled while not ported
one_of(childLayer->_stride[X_AXIS], 1 /*, 2*/) && // TODO [oneDNN]: stride 2 should also be supported
childNode->getChildEdgeAt(0)->getDims().ndims() == 4;
return isSupportedParams;
};
auto isFusingWorthwhile = [&](MKLDNNNodePtr parentNode, MKLDNNNodePtr childNode) {
if (isBinaryConvolutionNode(parentNode)) {
return true;
}
auto* layer = dynamic_cast<ConvolutionLayer*>(childNode->getCnnLayer().get());
if (layer == nullptr)
THROW_IE_EXCEPTION << "Cannot get convolution layer " << childNode->getName();
auto inDims = childNode->inDims[0];
auto outDims = childNode->outDims[0];
int elemSize = MKLDNNExtensionUtils::sizeOfDataType(MKLDNNExtensionUtils::IEPrecisionToDataType(layer->precision));
int L3_cache_size = mkldnn_get_cache_size(3, false);
int dw_conv_input_size = inDims[0] * inDims[1] * inDims[2] * inDims[3] * elemSize;
int dw_conv_output_size = outDims[0] * outDims[1]* outDims[2] * outDims[3] * elemSize;
auto* parentConvolutionNode = dynamic_cast<MKLDNNConvolutionNode*>(parentNode.get());
if (parentConvolutionNode == nullptr)
THROW_IE_EXCEPTION << "Cannot get convolution node " << parentNode->getName();
bool isInt8 = parentConvolutionNode->canBeExecutedInInt8();
bool isAVX512NotSupported = !mkldnn::impl::cpu::mayiuse(impl::cpu::cpu_isa_t::avx512_common);
return isInt8 ? isAVX512NotSupported : (dw_conv_input_size + dw_conv_output_size > L3_cache_size / 2);
};
for (int i = 0; i < graphNodes.size(); i++) {
if (!isConvolutionNode(graphNodes[i]) && !isBinaryConvolutionNode(graphNodes[i])) continue;
if (!isConvolutionNode(graphNodes[i])) continue;
auto parentConvNode = graphNodes[i];
if (!isSutableParentConvolution(parentConvNode)) continue;
@ -979,8 +949,6 @@ void MKLDNNGraphOptimizer::FuseConvolutionAndDWConvolution(MKLDNNGraph &graph) {
auto childConvNode = parentConvNode->getChildEdgeAt(0)->getChild();
if (!isSutableChildConvolution(parentConvNode, childConvNode)) continue;
if (!isFusingWorthwhile(parentConvNode, childConvNode)) continue;
parentConvNode->fuseWith(childConvNode);
for (auto node : childConvNode->getFusedWith())
@ -991,7 +959,6 @@ void MKLDNNGraphOptimizer::FuseConvolutionAndDWConvolution(MKLDNNGraph &graph) {
}
}
#if defined(COMPILED_CPU_MKLDNN_QUANTIZE_NODE)
void MKLDNNGraphOptimizer::FuseConvolutionAndQuantize(MKLDNNGraph &graph) {
auto& graphNodes = graph.GetNodes();
@ -1117,18 +1084,16 @@ void MKLDNNGraphOptimizer::FuseBinaryConvolutionAndQuantize(MKLDNNGraph &graph)
return isSutableBinConv && node->getChildEdges().size() == 1;
};
auto isSutableChildNode = [](MKLDNNNodePtr node) {
if (!node->getCnnLayer())
auto isSutableChildNode = [](MKLDNNNodePtr parentNode, MKLDNNNodePtr childNode) {
if (childNode->getType() != Quantize)
return false;
if (node->getType() != Quantize)
auto* binConv = dynamic_cast<MKLDNNBinaryConvolutionNode *>(parentNode.get());
if (!binConv) {
return false;
}
auto* quantizeNode = dynamic_cast<MKLDNNQuantizeNode*>(node.get());
if (quantizeNode == nullptr)
THROW_IE_EXCEPTION << "Cannot get quantize layer " << node->getName();
return quantizeNode->isBinarization();
return binConv->canFuse(childNode);
};
for (int i = 0; i < graphNodes.size(); i++) {
@ -1136,7 +1101,7 @@ void MKLDNNGraphOptimizer::FuseBinaryConvolutionAndQuantize(MKLDNNGraph &graph)
if (!isSutableParentNode(parent)) continue;
auto child = parent->getChildEdgeAt(0)->getChild();
if (!isSutableChildNode(child)) continue;
if (!isSutableChildNode(parent, child)) continue;
parent->fuseWith(child);
@ -1205,7 +1170,6 @@ void MKLDNNGraphOptimizer::FusePoolingAndQuantize(MKLDNNGraph &graph) {
graph.DropNode(child);
}
}
#endif
/**
* Check if there is a data dependency between parent and child
@ -1273,7 +1237,6 @@ static bool is_data_dependency(const std::shared_ptr<MKLDNNNode> &parent,
* ***
*/
#if defined(COMPILED_CPU_MKLDNN_ELTWISE_NODE)
void MKLDNNGraphOptimizer::FuseConvolutionSumAndConvolutionSumActivation(MKLDNNGraph &graph) {
std::vector<MKLDNNNodePtr> &graphNodes = graph.GetNodes();
@ -1308,17 +1271,27 @@ void MKLDNNGraphOptimizer::FuseConvolutionSumAndConvolutionSumActivation(MKLDNNG
bool isSutableParent1 = parent1->getType() == Convolution || parent1->getType() == BinaryConvolution;
bool isSutableParent2 = parent2->getType() == Convolution || parent2->getType() == BinaryConvolution;
auto* parentNode1 = dynamic_cast<MKLDNNConvolutionNode *>(parent1.get());
if (parentNode1) {
if (!parentNode1->canBeExecutedInInt8()) {
isSutableParent1 = isSutableParent1 && parentNode1->getFusedWith().empty();
auto* binConvNode1 = dynamic_cast<MKLDNNBinaryConvolutionNode *>(parent1.get());
if (binConvNode1) {
isSutableParent1 = isSutableParent1 && binConvNode1->canFuse(graphNode);
}
auto* binConvNode2 = dynamic_cast<MKLDNNBinaryConvolutionNode *>(parent2.get());
if (binConvNode2) {
isSutableParent2 = isSutableParent2 && binConvNode2->canFuse(graphNode);
}
auto* convNode1 = dynamic_cast<MKLDNNConvolutionNode *>(parent1.get());
if (convNode1) {
if (!convNode1->canBeExecutedInInt8()) {
isSutableParent1 = isSutableParent1 && convNode1->getFusedWith().empty();
}
}
auto* parentNode2 = dynamic_cast<MKLDNNConvolutionNode *>(parent2.get());
if (parentNode2) {
if (!parentNode2->canBeExecutedInInt8()) {
isSutableParent2 = isSutableParent2 && parentNode2->getFusedWith().empty();
auto* convNode2 = dynamic_cast<MKLDNNConvolutionNode *>(parent2.get());
if (convNode2) {
if (!convNode2->canBeExecutedInInt8()) {
isSutableParent2 = isSutableParent2 && convNode2->getFusedWith().empty();
}
}
@ -1387,6 +1360,10 @@ void MKLDNNGraphOptimizer::FuseConvolutionSumAndConvolutionSumActivation(MKLDNNG
if (mergedConvNode != nullptr)
childPort = mergedConvNode->getParentEdges().size();
auto* mergedBinConvNode = dynamic_cast<MKLDNNBinaryConvolutionNode*>(mergedConv.get());
if (mergedBinConvNode != nullptr)
childPort = mergedBinConvNode->getParentEdges().size();
MKLDNNEdgePtr edgePtr(new MKLDNNEdge(peerNode, mergedConv, peer_port, childPort));
graph.GetEdges().push_back(edgePtr);
@ -1415,7 +1392,6 @@ void MKLDNNGraphOptimizer::FuseConvolutionSumAndConvolutionSumActivation(MKLDNNG
sum->remove();
}
}
#endif
void MKLDNNGraphOptimizer::FuseMVNAndSimpleOperation(MKLDNNGraph &graph) {
auto& graphNodes = graph.GetNodes();
@ -1759,7 +1735,6 @@ void MKLDNNGraphOptimizer::RemoveIdentityOperator(MKLDNNGraph &graph) {
}
}
#if defined (COMPILED_CPU_MKLDNN_REORDER_NODE)
void MKLDNNGraphOptimizer::DropDoubleReorders(MKLDNNGraph &graph) {
std::set<MKLDNNNodePtr> processed;
int graphNodesSize = graph.GetNodes().size();
@ -1897,7 +1872,6 @@ void MKLDNNGraphOptimizer::ChangeConvertToReorder(MKLDNNGraph& graph) {
graph.DropNode(convertCandidate);
}
}
#endif
void MKLDNNGraphOptimizer::RemoveIOScaleShifts(MKLDNNGraph &graph) {
for (MKLDNNNodePtr& node : graph.GetNodes()) {
@ -1913,7 +1887,6 @@ void MKLDNNGraphOptimizer::RemoveIOScaleShifts(MKLDNNGraph &graph) {
if (cur->getTensorDesc().getPrecision() != l->outData[0]->getTensorDesc().getPrecision()) {
if (node->name.find("_iScaleShift_") != std::string::npos) {
auto child = node->childEdges[0].lock()->getChild();
#if defined (COMPILED_CPU_MKLDNN_REORDER_NODE)
if (child->type == Reorder) {
MKLDNNReorderNode* rn = dynamic_cast<MKLDNNReorderNode*>(child.get());
if (rn != nullptr) {
@ -1921,16 +1894,11 @@ void MKLDNNGraphOptimizer::RemoveIOScaleShifts(MKLDNNGraph &graph) {
graph.DropNode(node);
}
} else {
#else
THROW_IE_EXCEPTION << "Strange case. No Reorder after iScaleShift";
#endif
#if defined (COMPILED_CPU_MKLDNN_REORDER_NODE)
}
#endif
} else if (node->name.find("_oScaleShift_") != std::string::npos) {
auto parent = node->parentEdges[0].lock()->getParent();
#if defined (COMPILED_CPU_MKLDNN_REORDER_NODE)
if (parent->type == Reorder) {
MKLDNNReorderNode* rn = dynamic_cast<MKLDNNReorderNode*>(parent.get());
if (rn != nullptr) {
@ -1938,12 +1906,8 @@ void MKLDNNGraphOptimizer::RemoveIOScaleShifts(MKLDNNGraph &graph) {
graph.DropNode(node);
}
} else {
#else
THROW_IE_EXCEPTION << "Strange case. No Reorder before oScaleShift";
#endif
#if defined (COMPILED_CPU_MKLDNN_REORDER_NODE)
}
#endif
}
}
}
@ -2054,9 +2018,9 @@ void MKLDNNGraphOptimizer::FuseClampAndQuantize(MKLDNNGraph &graph) {
std::vector<float> newCropLow(cropLowData.size());
std::vector<float> newCropHigh(cropHighData.size());
for (int i = 0; i < cropLowData.size(); i++)
newCropLow[i] = std::max(cropLowData[i], eltwiseNode->getBeta());
newCropLow[i] = std::max(cropLowData[i], eltwiseNode->getAlpha());
for (int i = 0; i < cropHighData.size(); i++)
newCropHigh[i] = std::min(cropHighData[i], eltwiseNode->getAlpha());
newCropHigh[i] = std::min(cropHighData[i], eltwiseNode->getBeta());
quantizeNode->setCropLow(newCropLow);
quantizeNode->setCropHigh(newCropHigh);
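// Worked example of the swap above, assuming the usual Clamp convention
// out = min(max(x, alpha), beta), i.e. alpha is the lower bound and beta the
// upper bound. Fusing Clamp(alpha = 0.f, beta = 6.f) into a quantize node with
// cropLow = -1.f and cropHigh = 10.f must tighten the range to
//
//     newCropLow  = std::max(-1.f, 0.f);  // == 0.f, max with alpha
//     newCropHigh = std::min(10.f, 6.f);  // == 6.f, min with beta
//
// whereas the old code mixed the bounds up and produced the inverted range [6, 0].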

View File

@ -27,26 +27,20 @@ private:
void FuseConvolutionAndDepthwise(MKLDNNGraph &graph);
void FuseConvolutionAndSimpleOperation(MKLDNNGraph &graph);
void FuseConvolutionAndDWConvolution(MKLDNNGraph &graph);
#if defined(COMPILED_CPU_MKLDNN_QUANTIZE_NODE)
void FuseConvolutionAndQuantize(MKLDNNGraph &graph);
void FuseBinaryConvolutionAndQuantize(MKLDNNGraph &graph);
void FusePoolingAndQuantize(MKLDNNGraph &graph);
#endif
void FuseBatchNormWithScale(MKLDNNGraph& graph);
#if defined(COMPILED_CPU_MKLDNN_ELTWISE_NODE)
void FuseConvolutionSumAndConvolutionSumActivation(MKLDNNGraph &graph);
#endif
void FuseMVNAndSimpleOperation(MKLDNNGraph &graph);
void FuseInterpolateAndSimpleOperation(MKLDNNGraph &graph);
void FuseNormalizeAndSimpleOperation(MKLDNNGraph &graph);
void RemoveIdentityOperator(MKLDNNGraph& graph);
void RemoveIOScaleShifts(MKLDNNGraph& graph);
#if defined (COMPILED_CPU_MKLDNN_REORDER_NODE)
void DropDoubleReorders(MKLDNNGraph& graph);
void DropConvertReorder(MKLDNNGraph& graph);
void ChangeConvertToReorder(MKLDNNGraph &graph);
#endif
void FuseConvolutionAndZeroPoints(MKLDNNGraph &graph);
void FuseBroadcastAndEltwise(MKLDNNGraph &graph);
void FuseEltwiseAndSimple(MKLDNNGraph &graph);

View File

@ -113,14 +113,14 @@ void MKLDNNPlugin::MKLDNNInferRequest::PushInputData() {
// these precisions are supported by mkldnn, so we push the blob directly
// BUT if a mean image exists, we convert the blob and send FP32
case InferenceEngine::Precision::U8:
case InferenceEngine::Precision::BOOL:
case InferenceEngine::Precision::I16: {
case InferenceEngine::Precision::BOOL: {
if (graph->hasMeanImageFor(input.first))
inPrec = InferenceEngine::Precision::FP32;
break;
}
// these precisions are unsupported by mkldnn, so we convert the blob and send I32
case InferenceEngine::Precision::U16:
case InferenceEngine::Precision::I16:
case InferenceEngine::Precision::I64:
case InferenceEngine::Precision::U64: {
inPrec = InferenceEngine::Precision::I32;
@ -143,9 +143,7 @@ void MKLDNNPlugin::MKLDNNInferRequest::PushStates() {
auto cur_state_mem = cur_node->getStore();
auto data_ptr = state->GetState()->cbuffer().as<void*>();
auto data_size = state->GetState()->byteSize();
auto elemSize = MKLDNNExtensionUtils::sizeOfDataType(cur_state_mem->GetDataType());
auto padSize = cur_state_mem->GetDescriptor().data.layout_desc.blocking.offset_padding;
auto cur_state_mem_buf = static_cast<uint8_t*>(cur_state_mem->GetData()) + padSize * elemSize;
auto cur_state_mem_buf = static_cast<uint8_t*>(cur_state_mem->GetPtr());
cpu_memcpy(cur_state_mem_buf, data_ptr, data_size);
}
@ -164,9 +162,7 @@ void MKLDNNPlugin::MKLDNNInferRequest::PullStates() {
auto cur_state_mem = cur_node->getStore();
auto data_ptr = state->GetState()->cbuffer().as<void*>();
auto data_size = state->GetState()->byteSize();
auto elemSize = MKLDNNExtensionUtils::sizeOfDataType(cur_state_mem->GetDataType());
auto padSize = cur_state_mem->GetDescriptor().data.layout_desc.blocking.offset_padding;
auto cur_state_mem_buf = static_cast<uint8_t*>(cur_state_mem->GetData()) + padSize * elemSize;
auto cur_state_mem_buf = static_cast<uint8_t*>(cur_state_mem->GetPtr());
cpu_memcpy(data_ptr, cur_state_mem_buf, data_size);
}

File diff suppressed because it is too large

View File

@ -4,40 +4,71 @@
#pragma once
#include <memory>
#include <vector>
#include "ie_layouts.h"
#include "mkldnn_dims.h"
#include <mkldnn.hpp>
#include <string>
#include <mkldnn_types.h>
#include <string>
#include <functional>
#include <memory>
#include <vector>
/**
* @file contains concept classes for working with memory/tensor/blob abstractions at the plugin level.
*
* MKLDNNMemoryDesc - the descriptor of a tensor representation in memory. Describes all the information
* required for proper allocation and handling of a tensor in some buffer. The real memory is not present,
* just the description. This object answers the question of how and where the data with logical index
* [x1, x2, .. xN] is placed in the real buffer. In the simplest case it describes a mapping between a
* "logical offset" and a "real offset".
*
* MKLDNNMemory is an abstraction of some real tensor which contains data. In short, it is a pair of
* a memory descriptor and a raw buffer handle that holds the data. In the case of system memory the raw
* buffer is simply a "void*" pointing to some system memory buffer.
*
*/
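/**
* A minimal usage sketch of the two classes (engine construction shown for
* illustration only):
*
*     mkldnn::engine eng(mkldnn::engine::kind::cpu, 0);
*     MKLDNNMemoryDesc desc({1, 3, 224, 224},
*                           mkldnn::memory::data_type::f32,
*                           mkldnn::memory::format_tag::nchw);
*     MKLDNNMemory mem(eng);
*     mem.Create(desc);                                 // allocates a buffer
*     auto* first = static_cast<float*>(mem.GetPtr());  // first element
*/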
namespace MKLDNNPlugin {
/**
* Represents the internal plugin abstraction of a tensor description
*
*/
class MKLDNNMemoryDesc {
public:
MKLDNNMemoryDesc(): desc({}, mkldnn::memory::data_type::f32, mkldnn::memory::format::format_undef) {}
/** Empty constructor - doesn't define any tensor representation */
MKLDNNMemoryDesc(): desc() {}
/** Construct a tensor desc with a plain layout format (like an ND C array) */
MKLDNNMemoryDesc(const mkldnn::memory::dims& dims, mkldnn::memory::data_type dataType);
/** Construct a tensor desc with the specified layout format tag. Any and Undef are not supported */
MKLDNNMemoryDesc(const mkldnn::memory::dims& dims, mkldnn::memory::data_type dataType, mkldnn::memory::format_tag format);
explicit MKLDNNMemoryDesc(const InferenceEngine::TensorDesc& tDesc);
explicit MKLDNNMemoryDesc(const mkldnn::memory::desc& desc): desc(desc) {}
MKLDNNMemoryDesc(mkldnn::memory::dims dims, mkldnn::memory::data_type dataType, mkldnn::memory::format format);
mkldnn::memory::format getFormat() const {
return static_cast<mkldnn::memory::format>(desc.data.format);
}
/**
* Try to deduce the original format tag used on creation
*
* @return the format tag if it could be deduced
*/
mkldnn::memory::format_tag getFormat() const;
mkldnn::memory::data_type getDataType() const {
return static_cast<mkldnn::memory::data_type>(desc.data.data_type);
}
size_t GetElementSize() const;
MKLDNNDims getDims() const {
return MKLDNNDims(desc.data.dims, desc.data.ndims);
}
bool blocksExtended() const;
operator bool() const {
return getFormat() != mkldnn::memory::format::any && getFormat() != mkldnn::memory::format::format_undef;
return getFormat() != mkldnn::memory::format_tag::any && getFormat() != mkldnn::memory::format_tag::undef;
}
bool operator == (const MKLDNNMemoryDesc& rhs) const;
@ -46,15 +77,19 @@ public:
operator mkldnn::memory::desc() const;
operator InferenceEngine::TensorDesc() const;
bool isPlainFormat() const;
bool isBlockedCFormat(size_t blk_size = UNREACHABLE_DIM) const;
bool isTailCFormat() const;
bool isSame(mkldnn::memory::format_tag fmt) const;
private:
static constexpr size_t UNREACHABLE_DIM = std::numeric_limits<size_t>::max();
mkldnn::memory::desc desc;
};
class MKLDNNMemory;
using MKLDNNMemoryPtr = std::shared_ptr<MKLDNNMemory>;
class MKLDNNMemory {
public:
explicit MKLDNNMemory(const mkldnn::engine& eng);
@ -68,13 +103,17 @@ public:
}
mkldnn::memory::desc GetDescriptor() const {
return prim->get_primitive_desc().desc();
return prim->get_desc();
}
mkldnn::memory::primitive_desc GetPrimitiveDescriptor() const {
return prim->get_primitive_desc();
const MKLDNNMemoryDesc GetDesc() const {
return MKLDNNMemoryDesc {prim->get_desc()};
}
/**
* Return the handle of the buffer. Real data may start at some other offset
* @return
*/
void* GetData() const {
void* data = prim->get_data_handle();
if (data == nullptr)
@ -82,6 +121,18 @@ public:
return data;
}
/**
* Return a raw pointer to the first element
* Like GetData(), but with the offset applied.
* @return
*/
void* GetPtr() const {
auto ptr = static_cast<uint8_t*>(GetData());
ptr += GetDescriptor().data.offset0 * GetDesc().GetElementSize();
return ptr;
}
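// Example of the offset arithmetic above: for f32 data (element size 4) and
// desc.data.offset0 == 8, GetPtr() returns GetData() advanced by 8 * 4 == 32 bytes.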
mkldnn::memory::data_type GetDataType() const {
return static_cast<mkldnn::memory::data_type>(GetDescriptor().data.data_type);
}
@ -89,41 +140,35 @@ public:
size_t GetSize() const;
size_t GetElementsCount() const;
mkldnn::memory::format GetFormat() const {
return static_cast<mkldnn::memory::format>(prim->get_primitive_desc().desc().data.format);
}
mkldnn::memory::dims GetDims() const {
auto data = GetDescriptor().data;
return std::vector<ptrdiff_t>(data.dims, data.dims + data.ndims);
return {std::begin(data.dims), std::begin(data.dims) + data.ndims};
}
void Create(mkldnn::memory::dims dims, mkldnn::memory::data_type data_type, mkldnn::memory::format format,
void Create(const mkldnn::memory::dims& dims, mkldnn::memory::data_type data_type, mkldnn::memory::format_tag format,
const void* data = nullptr);
void Create(const mkldnn::memory::desc& desc, const void* data = nullptr, bool pads_zeroing = true);
void SetData(mkldnn::memory::data_type dataType, mkldnn::memory::format format, const void* data, size_t size, bool ftz = true) const;
void SetData(const MKLDNNMemory& memory, bool ftz = true) const;
// Interprets the input data as being in a plain format
void SetData(mkldnn::memory::data_type dataType, mkldnn::memory::format_tag format, const void* data, size_t size, bool ftz = true) const;
void SetData(const MKLDNNMemory& memory, size_t size = 0, bool ftz = true) const;
void FillZero();
static bool IsPlainFormat(mkldnn::memory::format format);
static bool IsGroupedFormat(mkldnn::memory::format format);
static mkldnn::memory::format GetPlainFormat(mkldnn::memory::dims dims);
static InferenceEngine::Layout GetPlainLayout(mkldnn::memory::dims dims);
static bool isConsistant(mkldnn::memory::dims dims, mkldnn::memory::format format);
static mkldnn::memory::format Convert(const InferenceEngine::Layout layout);
static mkldnn::memory::format_tag GetPlainFormat(const mkldnn::memory::dims& dims);
static InferenceEngine::Layout GetPlainLayout(const mkldnn::memory::dims& dims);
static bool isConsistant(const mkldnn::memory::dims& dims, mkldnn::memory::format_tag format);
static mkldnn::memory::format_tag Convert(const InferenceEngine::Layout layout);
static InferenceEngine::Precision convertToIePrec(mkldnn::memory::data_type dataType);
static mkldnn::memory::data_type convertToDataType(const InferenceEngine::Precision &precision);
static std::string formatToString(mkldnn::memory::format fmt);
static void CreateBlockingDesc(mkldnn::memory::desc& desc);
static std::string formatToString(mkldnn::memory::format_tag fmt);
private:
std::shared_ptr<mkldnn::memory> prim;
mkldnn::engine eng;
};
using MKLDNNMemoryPtr = std::shared_ptr<MKLDNNMemory>;
} // namespace MKLDNNPlugin

View File

@ -27,7 +27,7 @@ namespace MKLDNNPlugin {
*
* Example:
*
* Mem
* Mem(offset)
* | |____| Box {4, 5}
* | |_____________| Box {2, 6}
* | |____| Box {3, 4}
@ -38,7 +38,7 @@ namespace MKLDNNPlugin {
*
* Boxes which have an ExecOrder-axis intersection should have no Mem-axis intersections.
* The goal is to define a minimal required memory blob to store all boxes with such
* constraints and specify all corresponfing position on Mem axis(through offset field).
* constraints and specify all corresponding position on Mem axis(through offset field).
*
* NOTE!
* Exec order is predefined.

View File

@ -27,17 +27,12 @@
#include <nodes/mkldnn_pooling_node.h>
#include <nodes/mkldnn_reorder_node.h>
#include <nodes/mkldnn_reshape_node.h>
#include <nodes/mkldnn_roi_pooling_node.h>
#include <nodes/mkldnn_softmax_node.h>
#include <nodes/mkldnn_tile_node.h>
#include <nodes/mkldnn_split_node.h>
#include <nodes/mkldnn_pad_node.h>
#include <nodes/mkldnn_permute_node.h>
#include <nodes/mkldnn_memory_node.hpp>
#include <nodes/mkldnn_rnn.h>
#include <nodes/mkldnn_quantize_node.h>
#include <nodes/mkldnn_bin_conv_node.h>
#include <nodes/mkldnn_def_conv_node.h>
#include <nodes/mkldnn_mvn_node.h>
#include <nodes/mkldnn_normalize_node.h>
#include <nodes/mkldnn_reduce_node.h>
@ -45,6 +40,7 @@
#include <nodes/mkldnn_scatter_update_node.h>
#include <nodes/mkldnn_interpolate_node.h>
#include <mkldnn_types.h>
#include <dnnl_types.h>
#include "mkldnn_extension_utils.h"
#include "nodes/common/cpu_memcpy.h"
@ -202,7 +198,7 @@ MKLDNNNode::MKLDNNNode(const InferenceEngine::CNNLayerPtr& layer, const mkldnn::
while (getline(stream, str, ',')) {
if (str.substr(0, 4) != "cpu:")
continue;
inputMemoryFormatsFilter.push_back(mkldnn_str2fmt(str.substr(4, str.size()).c_str()));
inputMemoryFormatsFilter.push_back(mkldnn::utils::str2fmt(str.substr(4, str.size()).c_str()));
}
}
@ -213,7 +209,7 @@ MKLDNNNode::MKLDNNNode(const InferenceEngine::CNNLayerPtr& layer, const mkldnn::
while (getline(stream, str, ',')) {
if (str.substr(0, 4) != "cpu:")
continue;
outputMemoryFormatsFilter.push_back(mkldnn_str2fmt(str.substr(4, str.size()).c_str()));
outputMemoryFormatsFilter.push_back(mkldnn::utils::str2fmt(str.substr(4, str.size()).c_str()));
}
}
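// Both filters parse a comma-separated list of "cpu:"-prefixed format names
// taken from the layer's memory-format hints (the exact attribute names are
// not shown here); e.g. the value "cpu:nChw16c,cpu:nchw" yields
// {format_tag::nChw16c, format_tag::nchw} via mkldnn::utils::str2fmt().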
}
@ -488,25 +484,25 @@ const std::vector<MKLDNNEdgePtr> MKLDNNNode::getChildEdgesAtPort(size_t idx) con
}
std::vector<memory::format> MKLDNNNode::getAvailableFormatsForDims(const MKLDNNDims &dims) const {
std::vector<memory::format_tag> MKLDNNNode::getAvailableFormatsForDims(const MKLDNNDims &dims) const {
if (dims.ndims() == 0)
return {memory::format::x};
return {memory::format_tag::x};
else if (dims.ndims() == 1)
return {memory::format::x};
return {memory::format_tag::x};
else if (dims.ndims() == 2)
return {memory::format::nc};
return {memory::format_tag::nc};
else if (dims.ndims() == 3)
return {memory::format::tnc, memory::format::ntc};
return {memory::format_tag::tnc, memory::format_tag::ntc};
else if (dims.ndims() == 4)
return {memory::format::nchw, memory::format::nChw8c, memory::format::nChw16c};
return {memory::format_tag::nchw, memory::format_tag::nChw8c, memory::format_tag::nChw16c};
else if (dims.ndims() == 5)
return {memory::format::ncdhw, memory::format::nCdhw8c, memory::format::nCdhw16c};
return {memory::format::any};
return {memory::format_tag::ncdhw, memory::format_tag::nCdhw8c, memory::format_tag::nCdhw16c};
return {memory::format_tag::any};
}
void MKLDNNNode::execute(mkldnn::stream strm) {
if (prim) {
strm.submit({*prim});
(*prim).execute(strm, primArgs);
}
}
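// primArgs is the std::unordered_map<int, mkldnn::memory> declared in the node
// header below; each node is expected to fill it once the primitive is created,
// conceptually (sketch; how the mkldnn::memory objects are obtained from the
// edges is an assumption):
//
//     primArgs[DNNL_ARG_SRC] = srcMem;  // mkldnn::memory for input 0
//     primArgs[DNNL_ARG_DST] = dstMem;  // mkldnn::memory for output 0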
@ -516,7 +512,8 @@ void MKLDNNNode::initSupportedPrimitiveDescriptors() {
for (auto& desc : descs) {
auto itpd = desc.createPrimitiveDescriptorIterator(engine);
while (itpd.is_not_end()) {
while (static_cast<bool>(itpd)) {
InferenceEngine::LayerConfig config;
config.dynBatchSupport = true;
for (size_t i = 0; i < descInputNumbers(desc); i++) {
@ -527,35 +524,35 @@ void MKLDNNNode::initSupportedPrimitiveDescriptors() {
config.inConfs.push_back(dataConfig);
}
std::vector<mkldnn::memory::format> outFormats;
for (size_t i = 0; i < descOutputNumbers(desc); i++) {
InferenceEngine::DataConfig dataConfig;
dataConfig.inPlace = canBeInPlace() ? 0 : -1;
dataConfig.constant = false;
dataConfig.desc = MKLDNNExtensionUtils::getUninitTensorDesc(getDstMemDesc(itpd, i));
config.outConfs.push_back(dataConfig);
auto primDesc = itpd.fetch();
auto dstPrimDesc = mkldnn_primitive_desc_query_pd(primDesc.get(), mkldnn::convert_to_c(dst_pd), 0);
if (dstPrimDesc) {
outFormats.emplace_back(static_cast<memory::format>(itpd.dst_primitive_desc().desc().data.format));
} else {
// This path is needed to correctly handle Deconvolution node
auto diffSrcPrimDesc = mkldnn_primitive_desc_query_pd(primDesc.get(), mkldnn::convert_to_c(diff_src_pd), 0);
if (diffSrcPrimDesc) {
outFormats.emplace_back(static_cast<memory::format>(itpd.diff_src_primitive_desc().desc().data.format));
}
}
}
impl_desc_type impl_type = parse_impl_name(itpd.get_impl_info_str());
impl_desc_type impl_type = parse_impl_name(itpd.impl_info_str());
supportedPrimitiveDescriptors.emplace_back(config, impl_type, outFormats);
itpd++;
supportedPrimitiveDescriptors.emplace_back(config, impl_type);
if (!itpd.next_impl())
break;
}
}
}
void MKLDNNNode::filterSupportedPrimitiveDescriptors() {
// Compare by partial layout descriptor (without particular stride values)
auto areCompatible = [](const TensorDesc& tdesc, mkldnn::memory::format_tag fmt) {
TensorDesc fmt_tdesc = MKLDNNMemoryDesc{
MKLDNNDims(tdesc.getDims()),
MKLDNNExtensionUtils::IEPrecisionToDataType(tdesc.getPrecision()),
fmt};
auto tmp_partial_tdesc = PartialBlkDesc::extractFrom(fmt_tdesc);
auto actual_partial_tdesc = PartialBlkDesc::extractFrom(tdesc);
return tmp_partial_tdesc == actual_partial_tdesc;
};
if (!inputMemoryFormatsFilter.empty() || !outputMemoryFormatsFilter.empty()) {
auto itpd = supportedPrimitiveDescriptors.begin();
while (itpd != supportedPrimitiveDescriptors.end()) {
@ -565,12 +562,12 @@ void MKLDNNNode::filterSupportedPrimitiveDescriptors() {
bool isSuitableDesc = true;
for (int i = 0; i < inputMemoryFormatsFilter.size(); i++) {
if (inputMemoryFormatsFilter[i] != MKLDNNMemoryDesc(config.inConfs[i].desc).getFormat())
isSuitableDesc = false;
const bool matched = areCompatible(config.inConfs[i].desc, inputMemoryFormatsFilter[i]);
isSuitableDesc &= matched;
}
for (int i = 0; i < outputMemoryFormatsFilter.size(); i++) {
if (outputMemoryFormatsFilter[i] != MKLDNNMemoryDesc(config.outConfs[i].desc).getFormat())
isSuitableDesc = false;
const bool matched = areCompatible(config.outConfs[i].desc, outputMemoryFormatsFilter[i]);
isSuitableDesc &= matched;
}
if (!isSuitableDesc) {
itpd = supportedPrimitiveDescriptors.erase(itpd);
@ -600,20 +597,20 @@ void MKLDNNNode::initDescriptor(const InferenceEngine::LayerConfig &config) {
size_t selected_count = 0;
for (size_t j = 0; j < descs.size(); j++) {
const auto &desc = descs[j];
std::shared_ptr<primitive_desc_iterator> itpd;
primitive_desc_iterator itpd;
if (attr == nullptr) {
itpd = std::make_shared<primitive_desc_iterator>(desc.createPrimitiveDescriptorIterator(engine));
itpd = desc.createPrimitiveDescriptorIterator(engine);
} else {
itpd = std::make_shared<primitive_desc_iterator>(desc.createPrimitiveDescriptorIterator(engine, *(attr.get())));
itpd = desc.createPrimitiveDescriptorIterator(engine, *(attr.get()));
}
while (itpd->is_not_end()) {
while (static_cast<bool>(itpd)) {
InferenceEngine::LayerConfig cfg;
cfg.dynBatchSupport = true;
for (size_t i = 0; i < descInputNumbers(desc); i++) {
InferenceEngine::DataConfig dataConfig;
dataConfig.inPlace = canBeInPlace() ? 0 : -1;
dataConfig.constant = false;
dataConfig.desc = getSrcMemDesc(*itpd, i);
dataConfig.desc = getSrcMemDesc(itpd, i);
cfg.inConfs.push_back(dataConfig);
}
@ -621,10 +618,10 @@ void MKLDNNNode::initDescriptor(const InferenceEngine::LayerConfig &config) {
InferenceEngine::DataConfig dataConfig;
dataConfig.inPlace = -1;
dataConfig.constant = false;
dataConfig.desc = getDstMemDesc(*itpd, i);
dataConfig.desc = getDstMemDesc(itpd, i);
cfg.outConfs.push_back(dataConfig);
}
impl_desc_type impl_type = parse_impl_name(itpd->get_impl_info_str().c_str());
impl_desc_type impl_type = parse_impl_name(itpd.impl_info_str());
if (selected_count == selectedPrimitiveDescriptorIndex) {
if (impl_type != selectedPD->getImplementationType()) {
THROW_IE_EXCEPTION << "Cannot get the original layer configuration!";
@ -637,7 +634,8 @@ void MKLDNNNode::initDescriptor(const InferenceEngine::LayerConfig &config) {
}
}
selected_count++;
(*itpd)++;
if (!itpd.next_impl())
break;
}
}
@ -747,16 +745,9 @@ void MKLDNNNode::prepareMemory(const PrimitiveDescInfo *selected_pd, mkldnn::pri
auto create = [&] () {
auto newDesc = MKLDNNMemoryDesc(internalBlob->getTensorDesc());
auto newFormat = newDesc.getFormat();
if (newFormat == mkldnn::memory::ncdhw) {
newFormat = mkldnn::memory::goihw;
}
if (newFormat == mkldnn::memory::nchw) {
newFormat = mkldnn::memory::oihw;
}
MKLDNNMemory memory{ engine };
memory.Create(MKLDNNMemoryDesc(newDesc.getDims(), newDesc.getDataType(), newFormat), internalBlob->buffer());
memory.Create(newDesc, internalBlob->buffer());
MKLDNNMemoryPtr _ptr = MKLDNNMemoryPtr(new MKLDNNMemory(engine));
_ptr->Create(intDescs[i]);
@ -1045,7 +1036,7 @@ bool MKLDNNNode::isInitConfig(const InferenceEngine::LayerConfig& config) const
}
MKLDNNMemoryDesc MKLDNNNode::getSrcMemDesc(mkldnn::primitive_desc_iterator &primitive_desc_it, size_t idx) {
InferenceEngine::TensorDesc desc = MKLDNNMemoryDesc(primitive_desc_it.src_primitive_desc(idx).desc());
InferenceEngine::TensorDesc desc = MKLDNNMemoryDesc(primitive_desc_it.src_desc(idx));
if (desc.getLayout() == InferenceEngine::Layout::ANY)
return MKLDNNMemoryDesc(InferenceEngine::TensorDesc(desc.getPrecision(),
getParentEdgeAt(idx)->getDims().ToSizeVector(),
@ -1057,7 +1048,7 @@ MKLDNNMemoryDesc MKLDNNNode::getSrcMemDesc(mkldnn::primitive_desc_iterator &prim
}
MKLDNNMemoryDesc MKLDNNNode::getDstMemDesc(mkldnn::primitive_desc_iterator &primitive_desc_it, size_t idx) {
InferenceEngine::TensorDesc desc = MKLDNNMemoryDesc(primitive_desc_it.dst_primitive_desc(idx).desc());
InferenceEngine::TensorDesc desc = MKLDNNMemoryDesc(primitive_desc_it.dst_desc(idx));
if (desc.getLayout() == InferenceEngine::Layout::ANY)
return MKLDNNMemoryDesc(InferenceEngine::TensorDesc(desc.getPrecision(),
getChildEdgeAt(idx)->getDims().ToSizeVector(),
@ -1091,8 +1082,25 @@ int MKLDNNNode::getMaxBatch() {
void MKLDNNNode::setDynamicBatchLim(int lim) {
dynBatchLim = lim;
if (prim) {
prim.setBatchLimit(batchToProcess(), getParentEdges().size(), getChildEdges().size());
auto setDynamicBatch = [this](int argType, int newBatch) {
auto param = primArgs.find(argType);
if (param != primArgs.end()) {
auto oldMem = param->second;
mkldnn::memory::desc newMemDesc(oldMem.get_desc());
newMemDesc.data.dims[0] = newBatch;
newMemDesc.data.padded_dims[0] = newBatch;
mkldnn::memory newMem(newMemDesc, oldMem.get_engine(), oldMem.get_data_handle());
primArgs.at(argType) = newMem;
}
};
if (!primArgs.empty()) {
int newBatch = batchToProcess();
setDynamicBatch(DNNL_ARG_SRC, newBatch);
setDynamicBatch(DNNL_ARG_DST, newBatch);
setDynamicBatch(DNNL_ARG_DIFF_SRC, newBatch);
setDynamicBatch(DNNL_ARG_DIFF_DST, newBatch);
}
}
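// The lambda above works because mkldnn::memory is a cheap wrapper: a new
// memory object is created over the *same* data handle, only with the
// outermost (batch) dimension of the descriptor patched, so nothing is copied
// and the full-batch buffer keeps backing the primitive. dims[0] and
// padded_dims[0] must be updated together, otherwise the descriptor would
// describe a tensor padded along the batch axis.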

View File

@ -207,16 +207,16 @@ static std::string NameFromType(Type type) {
class PrimitiveDescInfo {
public:
PrimitiveDescInfo(const InferenceEngine::LayerConfig conf, impl_desc_type type): config(conf) {
PrimitiveDescInfo(const InferenceEngine::LayerConfig& conf, impl_desc_type type): config(conf) {
implementationType = type;
}
PrimitiveDescInfo(const InferenceEngine::LayerConfig conf, impl_desc_type type, std::vector<mkldnn::memory::format> outFmts): config(conf) {
PrimitiveDescInfo(const InferenceEngine::LayerConfig& conf, impl_desc_type type, const std::vector<mkldnn::memory::format_tag>& outFmts): config(conf) {
implementationType = type;
outputLayouts = outFmts;
}
PrimitiveDescInfo(const InferenceEngine::LayerConfig conf, impl_desc_type type, mkldnn::memory::format outFmt): config(conf) {
PrimitiveDescInfo(const InferenceEngine::LayerConfig& conf, impl_desc_type type, mkldnn::memory::format_tag outFmt): config(conf) {
implementationType = type;
setOutputLayouts(outFmt);
@ -238,7 +238,7 @@ public:
return implementationType;
}
const std::vector<mkldnn::memory::format>& getOutputLayouts() const {
const std::vector<mkldnn::memory::format_tag>& getOutputLayouts() const {
return outputLayouts;
}
@ -246,7 +246,7 @@ public:
implementationType = type;
}
void setOutputLayouts(mkldnn::memory::format outFmt) {
void setOutputLayouts(mkldnn::memory::format_tag outFmt) {
outputLayouts.clear();
for (int i = 0; i < config.outConfs.size(); i++) {
@ -257,7 +257,7 @@ public:
private:
InferenceEngine::LayerConfig config;
impl_desc_type implementationType;
std::vector<mkldnn::memory::format> outputLayouts;
std::vector<mkldnn::memory::format_tag> outputLayouts;
};
class MKLDNNNode : public InferenceEngine::details::no_copy {
@ -458,7 +458,7 @@ public:
for (const auto& desc : descs) {
auto itpd = desc.createPrimitiveDescriptorIterator(engine, attr);
while (itpd.is_not_end()) {
while (static_cast<bool>(itpd)) {
std::vector<InferenceEngine::TensorDesc> srcDescs;
for (size_t i = 0; i < descInputNumbers(desc); i++)
srcDescs.push_back(getSrcMemDesc(itpd, i));
@ -467,17 +467,17 @@ public:
for (size_t i = 0; i < descOutputNumbers(desc); i++)
dstDescs.push_back(getDstMemDesc(itpd, i));
impl_desc_type impl_type = parse_impl_name(itpd.get_impl_info_str());
impl_desc_type impl_type = parse_impl_name(itpd.impl_info_str());
if (impl_type == selected_pd->getImplementationType() &&
descsEqual(srcDescs, selected_pd->getConfig().inConfs) &&
descsEqual(dstDescs, selected_pd->getConfig().outConfs)) {
prepareMemory(selected_pd, itpd);
PD prim_desc = createPd<PD, D, FPD>(desc);
itpd.getPrimitiveDescriptor(prim_desc);
return prim_desc;
return {itpd.get()};
}
itpd++;
if (!itpd.next_impl())
break;
}
}
@ -553,8 +553,8 @@ protected:
std::vector <MKLDNNNodePtr> fusedWith;
std::vector <MKLDNNNodePtr> mergedWith;
std::vector <impl_desc_type> implPriorities;
std::vector <mkldnn_memory_format_t> inputMemoryFormatsFilter;
std::vector <mkldnn_memory_format_t> outputMemoryFormatsFilter;
std::vector <mkldnn::memory::format_tag> inputMemoryFormatsFilter;
std::vector <mkldnn::memory::format_tag> outputMemoryFormatsFilter;
std::string originalLayers; // contains names of the original layers separated by comma
@ -573,6 +573,7 @@ protected:
std::vector<InferenceEngine::Blob::Ptr> internalBlobs;
std::vector<MKLDNNMemoryPtr> internalBlobMemory;
std::vector<PrimitiveDescInfo> supportedPrimitiveDescriptors;
std::unordered_map<int, mkldnn::memory> primArgs;
MKLDNNPrimitive prim;
std::vector<MKLDNNDescriptor> descs;
@ -590,7 +591,7 @@ protected:
virtual const std::vector<impl_desc_type>& getPrimitivesPriority();
std::vector<mkldnn::memory::format> getAvailableFormatsForDims(const MKLDNNDims& dims) const;
virtual std::vector<mkldnn::memory::format_tag> getAvailableFormatsForDims(const MKLDNNDims& dims) const;
int batchToProcess();
InferenceEngine::Blob::Ptr createInternalBlob(InferenceEngine::SizeVector dims, bool weights, bool is_grouped = false);
@ -679,15 +680,4 @@ static struct REG_MKLDNN_CONCAT3(Registrar4, __prim, __LINE__) {
} \
} REG_MKLDNN_CONCAT3(_reg_, __prim, __LINE__);
template <typename T, typename U>
inline T div_up(const T a, const U b) {
assert(b);
return (a + b - 1) / b;
}
template <typename T, typename U>
inline T rnd_up(const T a, const U b) {
return div_up(a, b) * b;
}
} // namespace MKLDNNPlugin

View File

@ -120,6 +120,7 @@ static void Transformation(CNNNetwork& clonedNetwork, const Config& conf) {
std::vector<std::pair<ngraph::element::Type, ngraph::element::Type>> convert_precision_list{
{ngraph::element::i64, ngraph::element::i32},
{ngraph::element::u64, ngraph::element::i32},
{ngraph::element::i16, ngraph::element::i32},
{ngraph::element::u16, ngraph::element::i32},
{ngraph::element::u32, ngraph::element::i32},
{ngraph::element::f16, ngraph::element::f32},
@ -325,6 +326,7 @@ Engine::LoadExeNetworkImpl(const InferenceEngine::CNNNetwork &network, const std
NetPass::ConvertPrecision(implNetworkWrapper, Precision::FP16, Precision::FP32);
NetPass::ConvertPrecision(implNetworkWrapper, Precision::BOOL, Precision::U8);
NetPass::ConvertPrecision(implNetworkWrapper, Precision::U16, Precision::I32);
NetPass::ConvertPrecision(implNetworkWrapper, Precision::I16, Precision::I32);
}
}
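// With these entries both paths (the ngraph conversion list above and the
// legacy NetPass) now widen 16-bit signed integers to 32 bit before the
// network reaches the plugin; the per-element promotion is lossless:
//
//     int16_t x = -1234;
//     int32_t y = static_cast<int32_t>(x);  // still -1234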

View File

@ -4,9 +4,6 @@
#include <mkldnn_types.h>
#include "mkldnn_primitive.h"
#include "../../thirdparty/mkl-dnn/src/common/primitive_desc.hpp"
#include "../../thirdparty/mkl-dnn/src/common/memory_pd.hpp"
#include "../../thirdparty/mkl-dnn/src/cpu/cpu_concat.hpp"
using namespace MKLDNNPlugin;
@ -20,65 +17,11 @@ mkldnn::primitive MKLDNNPrimitive::operator*() {
return *prim;
}
void MKLDNNPrimitive::reset(mkldnn::primitive* prim) {
this->prim.reset(prim);
void MKLDNNPrimitive::reset(mkldnn::primitive* primitive) {
prim.reset(primitive);
}
MKLDNNPrimitive &MKLDNNPrimitive::operator=(const std::shared_ptr<mkldnn::primitive>& prim) {
this->prim = prim;
MKLDNNPrimitive &MKLDNNPrimitive::operator=(const std::shared_ptr<mkldnn::primitive>& primitive) {
prim = primitive;
return *this;
}
void MKLDNNPrimitive::setBatchLimit(int batch, size_t inputNum, size_t outputNum) {
bool success = true;
auto * primDesc = prim->get_primitive_desc();
auto * concatPrimDesc = dynamic_cast<const mkldnn::impl::cpu::cpu_concat_pd_t *>(primDesc);
for (int i = 0; success && i < primDesc->n_inputs() && i < inputNum; i++) {
// Depthwise layers contains weights as input
if (primDesc->input_pd()->desc()->ndims != primDesc->input_pd(i)->desc()->ndims)
break;
auto * memDesc = const_cast<mkldnn_memory_desc_t *>(primDesc->input_pd(i)->desc());
if (originInputBatches.size() <= i)
originInputBatches.push_back(memDesc->dims[0]);
if (batch > originInputBatches[i])
success = false;
memDesc->dims[0] = batch;
memDesc->layout_desc.blocking.padding_dims[0] = batch;
if (concatPrimDesc != nullptr) {
memDesc = const_cast<mkldnn_memory_desc_t *>(concatPrimDesc->src_image_pd(i)->desc());
memDesc->dims[0] = batch;
memDesc->layout_desc.blocking.padding_dims[0] = batch;
}
}
for (int i = 0; success && i < primDesc->n_outputs() && i < outputNum; i++) {
if (primDesc->output_pd()->desc()->ndims != primDesc->output_pd(i)->desc()->ndims)
break;
auto * memDesc = const_cast<mkldnn_memory_desc_t *>(primDesc->output_pd(i)->desc());
if (i < inputNum && memDesc == primDesc->input_pd(i)->desc())
continue;
if (originOutputBatches.size() <= i)
originOutputBatches.push_back(memDesc->dims[0]);
if (batch > originOutputBatches[i])
success = false;
memDesc->dims[0] = batch;
memDesc->layout_desc.blocking.padding_dims[0] = batch;
}
if (success)
return;
for (int i = 0; i < primDesc->n_inputs() && i < originInputBatches.size(); i++) {
auto * memDesc = const_cast<mkldnn_memory_desc_t *>(primDesc->input_pd(i)->desc());
memDesc->dims[0] = originInputBatches[i];
memDesc->layout_desc.blocking.padding_dims[0] = originInputBatches[i];
}
for (int i = 0; i < primDesc->n_outputs() && i < originOutputBatches.size(); i++) {
auto * memDesc = const_cast<mkldnn_memory_desc_t *>(primDesc->output_pd(i)->desc());
memDesc->dims[0] = originOutputBatches[i];
memDesc->layout_desc.blocking.padding_dims[0] = originOutputBatches[i];
}
THROW_IE_EXCEPTION << "Dynamic batch cannot be changed!";
}

View File

@ -17,16 +17,13 @@ class MKLDNNPrimitive {
public:
MKLDNNPrimitive();
operator bool();
MKLDNNPrimitive& operator=(const std::shared_ptr<mkldnn::primitive>& prim);
MKLDNNPrimitive& operator=(const std::shared_ptr<mkldnn::primitive>& primitive);
mkldnn::primitive operator*();
void reset(mkldnn::primitive* prim);
void setBatchLimit(int batch, size_t inputNum, size_t outputNum);
void reset(mkldnn::primitive* primitive);
private:
std::shared_ptr<mkldnn::primitive> prim;
std::vector<int> originInputBatches;
std::vector<int> originOutputBatches;
};
} // namespace MKLDNNPlugin

View File

@ -3,6 +3,9 @@
//
#include "emitter.h"
#include "utils/general_utils.h"
#include <vector>
using namespace mkldnn::impl::cpu;
@ -11,28 +14,19 @@ using namespace Xbyak;
namespace MKLDNNPlugin {
template <typename T, typename P>
constexpr bool one_of(T val, P item) { return val == item; }
template <typename T, typename P, typename... Args>
constexpr bool one_of(T val, P item, Args... item_others) {
return val == item || one_of(val, item_others...);
}
size_t jit_emitter::get_max_vecs_count() const {
return one_of(host_isa_, cpu::avx512_common, cpu::avx512_core) ? 32 : 16;
return one_of(host_isa_, cpu::x64::avx512_common, cpu::x64::avx512_core) ? 32 : 16;
}
size_t jit_emitter::get_vec_length() const {
return one_of(host_isa_, cpu::avx512_common, cpu::avx512_core) ? 64 :
one_of(host_isa_, cpu::avx2) ? 32 : 16;
return one_of(host_isa_, cpu::x64::avx512_common, cpu::x64::avx512_core) ? 64 :
one_of(host_isa_, cpu::x64::avx2) ? 32 : 16;
}
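// The numbers above follow the x86-64 SIMD register file: SSE4.1/AVX2 expose
// 16 vector registers while AVX-512 exposes 32 (hence get_max_vecs_count), and
// the register widths are 16 bytes (XMM), 32 bytes (YMM) and 64 bytes (ZMM),
// hence the 64 : 32 : 16 split in get_vec_length().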
void jit_emitter::push_vec(const Xbyak::Address &addr, size_t vec_idx) const {
if (host_isa_ == cpu::sse42) {
if (host_isa_ == cpu::x64::sse41) {
h->uni_vmovups(addr, Xmm(vec_idx));
} else if (host_isa_ == cpu::avx2) {
} else if (host_isa_ == cpu::x64::avx2) {
h->uni_vmovups(addr, Ymm(vec_idx));
} else {
h->uni_vmovups(addr, Zmm(vec_idx));
@ -40,9 +34,9 @@ void jit_emitter::push_vec(const Xbyak::Address &addr, size_t vec_idx) const {
}
void jit_emitter::pop_vec(size_t vec_idx, const Xbyak::Address &addr) const {
if (host_isa_ == cpu::sse42) {
if (host_isa_ == cpu::x64::sse41) {
h->uni_vmovups(Xmm(vec_idx), addr);
} else if (host_isa_ == cpu::avx2) {
} else if (host_isa_ == cpu::x64::avx2) {
h->uni_vmovups(Ymm(vec_idx), addr);
} else {
h->uni_vmovups(Zmm(vec_idx), addr);
@ -69,8 +63,8 @@ void jit_emitter::emitter_preamble(const std::vector<size_t> &in_vec_idxs, const
for (auto idx : pool_vec_idxs)
aux_vec_idxs.push_back(idx);
// For sse42 mask register has to be Xmm(0)
if (host_isa_ == cpu::sse42 && aux_vecs_count() > 0) {
// For sse41 mask register has to be Xmm(0)
if (host_isa_ == cpu::x64::sse41 && aux_vecs_count() > 0) {
size_t idx = 0;
assert(std::find(in_vec_idxs.begin(), in_vec_idxs.end(), idx) == in_vec_idxs.end());
if (std::find(aux_vec_idxs.begin(), aux_vec_idxs.end(), idx) == aux_vec_idxs.end()) {

View File

@ -5,15 +5,16 @@
#pragma once
#include <ie_common.h>
#include "jit_generator.hpp"
#include <cpu/x64/jit_generator.hpp>
#include "mkldnn_node.h"
#include <set>
namespace MKLDNNPlugin {
class jit_emitter {
public:
jit_emitter(mkldnn::impl::cpu::jit_generator* host, mkldnn::impl::cpu::cpu_isa_t host_isa, const MKLDNNNode* node,
jit_emitter(mkldnn::impl::cpu::x64::jit_generator* host, mkldnn::impl::cpu::x64::cpu_isa_t host_isa, const MKLDNNNode* node,
InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32)
: h(host), host_isa_(host_isa), n(node), exec_prc_(exec_prc) {
k_mask = Xbyak::Opmask(1); // FIXME: in general case we need preserve k_mask state as well
@ -33,8 +34,8 @@ protected:
size_t get_vec_length() const;
const MKLDNNNode* n;
mkldnn::impl::cpu::jit_generator* h;
mkldnn::impl::cpu::cpu_isa_t host_isa_;
mkldnn::impl::cpu::x64::jit_generator* h;
mkldnn::impl::cpu::x64::cpu_isa_t host_isa_;
InferenceEngine::Precision exec_prc_;
Xbyak::Opmask k_mask;
@ -63,12 +64,12 @@ protected:
Xbyak::Label l_table;
enum {
_cmp_eq_oq = mkldnn::impl::cpu::jit_generator::_cmp_eq_oq,
_cmp_neq_uq = mkldnn::impl::cpu::jit_generator::_cmp_neq_uq,
_cmp_lt_os = mkldnn::impl::cpu::jit_generator::_cmp_lt_os,
_cmp_le_os = mkldnn::impl::cpu::jit_generator::_cmp_le_os,
_cmp_ge_os = mkldnn::impl::cpu::jit_generator::_cmp_nlt_us,
_cmp_gt_os = mkldnn::impl::cpu::jit_generator::_cmp_nle_us,
_cmp_eq_oq = mkldnn::impl::cpu::x64::jit_generator::_cmp_eq_oq,
_cmp_neq_uq = mkldnn::impl::cpu::x64::jit_generator::_cmp_neq_uq,
_cmp_lt_os = mkldnn::impl::cpu::x64::jit_generator::_cmp_lt_os,
_cmp_le_os = mkldnn::impl::cpu::x64::jit_generator::_cmp_le_os,
_cmp_ge_os = mkldnn::impl::cpu::x64::jit_generator::_cmp_nlt_us,
_cmp_gt_os = mkldnn::impl::cpu::x64::jit_generator::_cmp_nle_us,
};
virtual void emit_impl(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs,

View File

@ -2,18 +2,23 @@
// SPDX-License-Identifier: Apache-2.0
//
#include <vector>
#include <ie_parallel.hpp>
#include <mkldnn_extension_utils.h>
#include "jit_generator.hpp"
#include "jit_uni_eltwise.hpp"
#include "utils/bfloat16.hpp"
#include "softmax.h"
#include <ie_parallel.hpp>
#include <cpu/x64/jit_generator.hpp>
#include <cpu/x64/jit_uni_eltwise_injector.hpp>
#include <mkldnn.hpp> // TODO: just to replace mkldnn->dnnl via macros
#include "utils/bfloat16.hpp"
#include <algorithm>
#include <cassert>
#include <vector>
using namespace InferenceEngine;
using namespace MKLDNNPlugin;
using namespace mkldnn;
using namespace mkldnn::impl::cpu;
using namespace mkldnn::impl::cpu::x64;
using namespace mkldnn::impl::utils;
#define GET_OFF(field) offsetof(jit_args_softmax, field)
@ -39,14 +44,23 @@ struct jit_uni_softmax_kernel {
jit_uni_softmax_kernel() : ker_(nullptr) {}
virtual ~jit_uni_softmax_kernel() {}
virtual void create_ker() = 0;
};
template <cpu_isa_t isa>
struct jit_uni_softmax_kernel_f32 : public jit_uni_softmax_kernel, public jit_generator {
DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_uni_softmax_kernel_f32)
jit_uni_softmax_kernel_f32(jit_softmax_config_params jcp) : jit_uni_softmax_kernel(), jit_generator() {
exp_injector.reset(new jit_uni_eltwise_injector_f32<isa>(this, alg_kind::eltwise_exp, 0.f, 0.f));
jit_uni_softmax_kernel_f32(jit_softmax_config_params jcp) : jcp_(jcp), jit_uni_softmax_kernel(), jit_generator() {}
void create_ker() override {
jit_generator::create_kernel();
ker_ = (decltype(ker_))jit_ker();
}
void generate() override {
exp_injector.reset(new jit_uni_eltwise_injector_f32<isa>(this, mkldnn::impl::alg_kind::eltwise_exp, 0.f, 0.f, 1.0f));
if (!mayiuse(avx512_core_bf16) && mayiuse(avx512_core))
emu_vcvtneps2bf16.reset(new jit_emu_vcvtneps2bf16(this, isa, nullptr));
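The constructor no longer builds the kernel: oneDNN v1.6 splits the lifecycle into generate(), which only emits instructions, and create_kernel(), which finalizes the code buffer, with jit_ker() returning the entry point that the plugin's create_ker() wrapper caches in ker_. A toy illustration of that shape (hypothetical classes, not the oneDNN API verbatim):

#include <cstdio>

// Toy sketch (hypothetical classes) of the two-phase lifecycle the hunk
// migrates to: generate() emits code, create_kernel() runs it and finalizes
// the buffer, jit_ker() hands back the entry point cached in ker_.
using kernel_fn = void (*)();

struct toy_generator {
    virtual void generate() = 0;                   // emit instructions
    void create_kernel() { generate(); ready_ = true; }
    kernel_fn jit_ker() const { return ready_ ? &body : nullptr; }
    virtual ~toy_generator() = default;
private:
    static void body() { std::puts("kernel body runs"); }
    bool ready_ = false;
};

struct toy_softmax_kernel : toy_generator {
    void generate() override { /* max / exp / div loops would be emitted here */ }
};

int main() {
    toy_softmax_kernel k;
    k.create_kernel();            // replaces getCode() at constructor time
    if (kernel_fn ker = k.jit_ker()) ker();
}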
@ -69,23 +83,23 @@ struct jit_uni_softmax_kernel_f32 : public jit_uni_softmax_kernel, public jit_ge
mov(aux_reg_work_amount, reg_work_amount);
mov(aux_reg_src, reg_src);
load_vector(vmm_max, ptr[aux_reg_src], jcp.src_dt);
load_vector(vmm_max, ptr[aux_reg_src], jcp_.src_dt);
L(max_loop_label); {
cmp(aux_reg_work_amount, 0);
jle(max_loop_end_label, T_NEAR);
load_vector(vmm_val, ptr[aux_reg_src], jcp.src_dt);
load_vector(vmm_val, ptr[aux_reg_src], jcp_.src_dt);
if (isa == cpu::sse42) {
if (isa == x64::sse41) {
uni_vmovups(vmm_mask, vmm_val);
uni_vcmpgtps(vmm_mask, vmm_mask, vmm_max);
} else if (isa == cpu::avx2) {
} else if (isa == x64::avx2) {
uni_vcmpgtps(vmm_mask, vmm_val, vmm_max);
} else {
vcmpps(k_mask, vmm_val, vmm_max, _cmp_nle_us);
}
if (isa == cpu::avx512_common) {
if (isa == x64::avx512_common) {
vptestmd(k_mask, vmm_mask, vmm_mask);
vblendmps(vmm_max | k_mask, vmm_max, vmm_val);
} else {
@ -108,13 +122,13 @@ struct jit_uni_softmax_kernel_f32 : public jit_uni_softmax_kernel, public jit_ge
cmp(aux_reg_work_amount, 0);
jle(exp_loop_end_label, T_NEAR);
load_vector(vmm_val, ptr[aux_reg_src], jcp.src_dt);
load_vector(vmm_val, ptr[aux_reg_src], jcp_.src_dt);
uni_vsubps(vmm_val, vmm_val, vmm_max);
exp_injector->compute_vector_range(vmm_val.getIdx(), vmm_val.getIdx() + 1);
uni_vaddps(vmm_exp_sum, vmm_exp_sum, vmm_val);
store_vector(ptr[aux_reg_dst], vmm_val, jcp.dst_dt);
store_vector(ptr[aux_reg_dst], vmm_val, jcp_.dst_dt);
add(aux_reg_src, reg_src_stride);
add(aux_reg_dst, reg_dst_stride);
@ -131,11 +145,11 @@ struct jit_uni_softmax_kernel_f32 : public jit_uni_softmax_kernel, public jit_ge
cmp(aux_reg_work_amount, 0);
jle(div_loop_end_label, T_NEAR);
load_vector(vmm_val, ptr[aux_reg_dst], jcp.dst_dt);
load_vector(vmm_val, ptr[aux_reg_dst], jcp_.dst_dt);
uni_vdivps(vmm_val, vmm_val, vmm_exp_sum);
store_vector(ptr[aux_reg_dst], vmm_val, jcp.dst_dt);
store_vector(ptr[aux_reg_dst], vmm_val, jcp_.dst_dt);
add(aux_reg_dst, reg_dst_stride);
sub(aux_reg_work_amount, 1);
@ -151,13 +165,10 @@ struct jit_uni_softmax_kernel_f32 : public jit_uni_softmax_kernel, public jit_ge
emu_vcvtneps2bf16->emit_table();
exp_injector->prepare_table();
ker_ = (decltype(ker_))this->getCode();
}
private:
using Vmm = typename conditional3<isa == cpu::sse42, Xbyak::Xmm, isa == cpu::avx2,
Xbyak::Ymm, Xbyak::Zmm>::type;
using Vmm = typename conditional3<isa == x64::sse41, Xbyak::Xmm, isa == x64::avx2, Xbyak::Ymm, Xbyak::Zmm>::type;
size_t vlen = cpu_isa_traits<isa>::vlen;
Xbyak::Reg64 reg_src = r8;
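For reference, the three labelled loops above (max, exp-and-accumulate, divide) compute a numerically stable softmax; a scalar equivalent:

#include <algorithm>
#include <cmath>
#include <cstdio>

// Scalar reference for the three JIT loops above: subtract the running max
// for stability, exponentiate and accumulate, then normalize.
void softmax_ref(const float* src, float* dst, size_t n) {
    float mx = *std::max_element(src, src + n);   // max loop
    float sum = 0.f;
    for (size_t i = 0; i < n; ++i) {              // exp loop
        dst[i] = std::exp(src[i] - mx);
        sum += dst[i];
    }
    for (size_t i = 0; i < n; ++i)                // div loop
        dst[i] /= sum;
}

int main() {
    float in[4] = {1.f, 2.f, 3.f, 4.f}, out[4];
    softmax_ref(in, out, 4);
    std::printf("%f %f %f %f\n", out[0], out[1], out[2], out[3]);
}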
@ -181,6 +192,8 @@ private:
std::shared_ptr<jit_uni_eltwise_injector_f32<isa>> exp_injector;
jit_softmax_config_params jcp_;
inline void load_vector(Vmm vmm_src, const Xbyak::Address &op, Precision src_dt) {
switch (src_dt) {
case Precision::FP32:
@ -227,16 +240,18 @@ SoftmaxGeneric::SoftmaxGeneric(Precision inpPrc, Precision outPrc)
jcp.src_dt = inpPrc;
jcp.dst_dt = outPrc;
if (mayiuse(cpu::avx512_common)) {
softmax_kernel.reset(new jit_uni_softmax_kernel_f32<cpu::avx512_common>(jcp));
if (mayiuse(x64::avx512_common)) {
softmax_kernel.reset(new jit_uni_softmax_kernel_f32<x64::avx512_common>(jcp));
block_size = 16;
} else if (mayiuse(cpu::avx2)) {
softmax_kernel.reset(new jit_uni_softmax_kernel_f32<cpu::avx2>(jcp));
} else if (mayiuse(x64::avx2)) {
softmax_kernel.reset(new jit_uni_softmax_kernel_f32<x64::avx2>(jcp));
block_size = 8;
} else if (mayiuse(cpu::sse42)) {
softmax_kernel.reset(new jit_uni_softmax_kernel_f32<cpu::sse42>(jcp));
} else if (mayiuse(x64::sse41)) {
softmax_kernel.reset(new jit_uni_softmax_kernel_f32<x64::sse41>(jcp));
block_size = 4;
}
if (softmax_kernel)
softmax_kernel->create_ker();
}
template<typename in_data_t, typename out_data_t>
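The dispatch above probes the widest usable ISA first and records block_size as the number of fp32 lanes in one vector register (zmm = 16, ymm = 8, xmm = 4). A hedged sketch of the same shape, using the GCC/Clang builtin probe in place of oneDNN's mayiuse():

#include <cstddef>
#include <cstdio>

// Sketch of the runtime dispatch above; __builtin_cpu_supports is a
// GCC/Clang builtin standing in for oneDNN's mayiuse(). block_size is the
// fp32 lane count of the widest usable vector register.
size_t pick_block_size() {
    if (__builtin_cpu_supports("avx512f")) return 16;  // zmm: 512 / 32
    if (__builtin_cpu_supports("avx2"))    return 8;   // ymm: 256 / 32
    if (__builtin_cpu_supports("sse4.1"))  return 4;   // xmm: 128 / 32
    return 1;                                          // scalar fallback
}

int main() { std::printf("block_size = %zu\n", pick_block_size()); }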


@ -5,12 +5,15 @@
#include "common/emitter.h"
#include "jit_eltwise_emitters.hpp"
#include "mkldnn_eltwise_node.h"
#include "jit_uni_eltwise.hpp"
#include <cpu/x64/jit_uni_eltwise.hpp>
#include "legacy/ie_layers.h"
using namespace InferenceEngine;
using namespace mkldnn::impl::utils;
using namespace mkldnn::impl::cpu;
using namespace mkldnn::impl;
using namespace mkldnn::impl::cpu::x64;
using namespace Xbyak;
namespace MKLDNNPlugin {
@ -23,25 +26,25 @@ size_t jit_add_emitter::get_inputs_num() { return 2; }
void jit_add_emitter::emit_impl(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs,
const std::vector<size_t> &pool_vec_idxs, const std::vector<size_t> &pool_gpr_idxs) {
if (host_isa_ == cpu::sse42) {
emit_isa<cpu::sse42>(in_vec_idxs, out_vec_idxs);
} else if (host_isa_ == cpu::avx2) {
emit_isa<cpu::avx2>(in_vec_idxs, out_vec_idxs);
} else if (host_isa_ == cpu::avx512_common) {
emit_isa<cpu::avx512_common>(in_vec_idxs, out_vec_idxs);
if (host_isa_ == cpu::x64::sse41) {
emit_isa<cpu::x64::sse41>(in_vec_idxs, out_vec_idxs);
} else if (host_isa_ == cpu::x64::avx2) {
emit_isa<cpu::x64::avx2>(in_vec_idxs, out_vec_idxs);
} else if (host_isa_ == cpu::x64::avx512_common) {
emit_isa<cpu::x64::avx512_common>(in_vec_idxs, out_vec_idxs);
} else {
assert(!"unsupported isa");
}
}
template <mkldnn::impl::cpu::cpu_isa_t isa>
template <mkldnn::impl::cpu::x64::cpu_isa_t isa>
void jit_add_emitter::emit_isa(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const {
using Vmm = typename conditional3<isa == cpu::sse42, Xmm, isa == cpu::avx2, Ymm, Zmm>::type;
using Vmm = typename conditional3<isa == cpu::x64::sse41, Xmm, isa == cpu::x64::avx2, Ymm, Zmm>::type;
Vmm vmm_src0 = Vmm(in_vec_idxs[0]);
Vmm vmm_src1 = Vmm(in_vec_idxs[1]);
Vmm vmm_dst = Vmm(out_vec_idxs[0]);
if (isa == cpu::sse42) {
if (isa == cpu::x64::sse41) {
h->uni_vmovups(vmm_dst, vmm_src0);
h->uni_vaddps(vmm_dst, vmm_dst, vmm_src1);
} else {
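Two patterns recur through the emitter hunks above and below: conditional3 picks the register type from the ISA parameter at compile time (Xmm for sse41, Ymm for avx2, Zmm otherwise), and the sse41 branches copy src0 into dst before operating, since non-VEX SSE instructions overwrite their first operand. A compile-time sketch of the type selection (stand-in types; oneDNN's real conditional3 lives in its utils header):

#include <type_traits>

// Equivalent shape of oneDNN's conditional3: a three-way compile-time
// type selection keyed on two booleans.
template <bool C1, typename T1, bool C2, typename T2, typename T3>
using conditional3_t =
    typename std::conditional<C1, T1,
        typename std::conditional<C2, T2, T3>::type>::type;

struct Xmm {}; struct Ymm {}; struct Zmm {};   // stand-ins for Xbyak types
enum class isa { sse41, avx2, avx512_common };

template <isa I>
using Vmm = conditional3_t<I == isa::sse41, Xmm, I == isa::avx2, Ymm, Zmm>;

static_assert(std::is_same<Vmm<isa::sse41>, Xmm>::value, "xmm on sse41");
static_assert(std::is_same<Vmm<isa::avx2>, Ymm>::value, "ymm on avx2");
static_assert(std::is_same<Vmm<isa::avx512_common>, Zmm>::value, "zmm otherwise");

int main() {}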
@ -57,27 +60,27 @@ size_t jit_mul_add_emitter::get_inputs_num() { return 3; }
void jit_mul_add_emitter::emit_impl(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs,
const std::vector<size_t> &pool_vec_idxs, const std::vector<size_t> &pool_gpr_idxs) {
if (host_isa_ == cpu::sse42) {
emit_isa<cpu::sse42>(in_vec_idxs, out_vec_idxs);
} else if (host_isa_ == cpu::avx2) {
emit_isa<cpu::avx2>(in_vec_idxs, out_vec_idxs);
} else if (host_isa_ == cpu::avx512_common) {
emit_isa<cpu::avx512_common>(in_vec_idxs, out_vec_idxs);
if (host_isa_ == cpu::x64::sse41) {
emit_isa<cpu::x64::sse41>(in_vec_idxs, out_vec_idxs);
} else if (host_isa_ == cpu::x64::avx2) {
emit_isa<cpu::x64::avx2>(in_vec_idxs, out_vec_idxs);
} else if (host_isa_ == cpu::x64::avx512_common) {
emit_isa<cpu::x64::avx512_common>(in_vec_idxs, out_vec_idxs);
} else {
assert(!"unsupported isa");
}
}
template <mkldnn::impl::cpu::cpu_isa_t isa>
template <mkldnn::impl::cpu::x64::cpu_isa_t isa>
void jit_mul_add_emitter::emit_isa(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const {
using Vmm = typename conditional3<isa == cpu::sse42, Xmm, isa == cpu::avx2, Ymm, Zmm>::type;
using Vmm = typename conditional3<isa == cpu::x64::sse41, Xmm, isa == cpu::x64::avx2, Ymm, Zmm>::type;
Vmm vmm_src0 = Vmm(in_vec_idxs[0]);
Vmm vmm_src1 = Vmm(in_vec_idxs[1]);
Vmm vmm_src2 = Vmm(in_vec_idxs[2]);
Vmm vmm_aux0 = Vmm(aux_vec_idxs[0]);
Vmm vmm_dst = Vmm(out_vec_idxs[0]);
if (isa == cpu::sse42) {
if (isa == cpu::x64::sse41) {
h->uni_vmovups(vmm_dst, vmm_src0);
h->mulps(vmm_dst, vmm_src1);
h->addps(vmm_dst, vmm_src2);
@ -116,25 +119,25 @@ size_t jit_subtract_emitter::get_inputs_num() { return 2; }
void jit_subtract_emitter::emit_impl(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs,
const std::vector<size_t> &pool_vec_idxs, const std::vector<size_t> &pool_gpr_idxs) {
if (host_isa_ == cpu::sse42) {
emit_isa<cpu::sse42>(in_vec_idxs, out_vec_idxs);
} else if (host_isa_ == cpu::avx2) {
emit_isa<cpu::avx2>(in_vec_idxs, out_vec_idxs);
} else if (host_isa_ == cpu::avx512_common) {
emit_isa<cpu::avx512_common>(in_vec_idxs, out_vec_idxs);
if (host_isa_ == cpu::x64::sse41) {
emit_isa<cpu::x64::sse41>(in_vec_idxs, out_vec_idxs);
} else if (host_isa_ == cpu::x64::avx2) {
emit_isa<cpu::x64::avx2>(in_vec_idxs, out_vec_idxs);
} else if (host_isa_ == cpu::x64::avx512_common) {
emit_isa<cpu::x64::avx512_common>(in_vec_idxs, out_vec_idxs);
} else {
assert(!"unsupported isa");
}
}
template <mkldnn::impl::cpu::cpu_isa_t isa>
template <mkldnn::impl::cpu::x64::cpu_isa_t isa>
void jit_subtract_emitter::emit_isa(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const {
using Vmm = typename conditional3<isa == cpu::sse42, Xmm, isa == cpu::avx2, Ymm, Zmm>::type;
using Vmm = typename conditional3<isa == cpu::x64::sse41, Xmm, isa == cpu::x64::avx2, Ymm, Zmm>::type;
Vmm vmm_src0 = Vmm(in_vec_idxs[0]);
Vmm vmm_src1 = Vmm(in_vec_idxs[1]);
Vmm vmm_dst = Vmm(out_vec_idxs[0]);
if (isa == cpu::sse42) {
if (isa == cpu::x64::sse41) {
h->uni_vmovups(vmm_dst, vmm_src0);
h->uni_vsubps(vmm_dst, vmm_dst, vmm_src1);
} else {
@ -151,25 +154,25 @@ size_t jit_multiply_emitter::get_inputs_num() { return 2; }
void jit_multiply_emitter::emit_impl(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs,
const std::vector<size_t> &pool_vec_idxs, const std::vector<size_t> &pool_gpr_idxs) {
if (host_isa_ == cpu::sse42) {
emit_isa<cpu::sse42>(in_vec_idxs, out_vec_idxs);
} else if (host_isa_ == cpu::avx2) {
emit_isa<cpu::avx2>(in_vec_idxs, out_vec_idxs);
} else if (host_isa_ == cpu::avx512_common) {
emit_isa<cpu::avx512_common>(in_vec_idxs, out_vec_idxs);
if (host_isa_ == cpu::x64::sse41) {
emit_isa<cpu::x64::sse41>(in_vec_idxs, out_vec_idxs);
} else if (host_isa_ == cpu::x64::avx2) {
emit_isa<cpu::x64::avx2>(in_vec_idxs, out_vec_idxs);
} else if (host_isa_ == cpu::x64::avx512_common) {
emit_isa<cpu::x64::avx512_common>(in_vec_idxs, out_vec_idxs);
} else {
assert(!"unsupported isa");
}
}
template <mkldnn::impl::cpu::cpu_isa_t isa>
template <mkldnn::impl::cpu::x64::cpu_isa_t isa>
void jit_multiply_emitter::emit_isa(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const {
using Vmm = typename conditional3<isa == cpu::sse42, Xmm, isa == cpu::avx2, Ymm, Zmm>::type;
using Vmm = typename conditional3<isa == cpu::x64::sse41, Xmm, isa == cpu::x64::avx2, Ymm, Zmm>::type;
Vmm vmm_src0 = Vmm(in_vec_idxs[0]);
Vmm vmm_src1 = Vmm(in_vec_idxs[1]);
Vmm vmm_dst = Vmm(out_vec_idxs[0]);
if (isa == cpu::sse42) {
if (isa == cpu::x64::sse41) {
h->uni_vmovups(vmm_dst, vmm_src0);
h->uni_vmulps(vmm_dst, vmm_dst, vmm_src1);
} else {
@ -186,20 +189,20 @@ size_t jit_divide_emitter::get_inputs_num() { return 2; }
void jit_divide_emitter::emit_impl(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs,
const std::vector<size_t> &pool_vec_idxs, const std::vector<size_t> &pool_gpr_idxs) {
if (host_isa_ == cpu::sse42) {
emit_isa<cpu::sse42>(in_vec_idxs, out_vec_idxs);
} else if (host_isa_ == cpu::avx2) {
emit_isa<cpu::avx2>(in_vec_idxs, out_vec_idxs);
} else if (host_isa_ == cpu::avx512_common) {
emit_isa<cpu::avx512_common>(in_vec_idxs, out_vec_idxs);
if (host_isa_ == cpu::x64::sse41) {
emit_isa<cpu::x64::sse41>(in_vec_idxs, out_vec_idxs);
} else if (host_isa_ == cpu::x64::avx2) {
emit_isa<cpu::x64::avx2>(in_vec_idxs, out_vec_idxs);
} else if (host_isa_ == cpu::x64::avx512_common) {
emit_isa<cpu::x64::avx512_common>(in_vec_idxs, out_vec_idxs);
} else {
assert(!"unsupported isa");
}
}
template <mkldnn::impl::cpu::cpu_isa_t isa>
template <mkldnn::impl::cpu::x64::cpu_isa_t isa>
void jit_divide_emitter::emit_isa(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const {
using Vmm = typename conditional3<isa == cpu::sse42, Xmm, isa == cpu::avx2, Ymm, Zmm>::type;
using Vmm = typename conditional3<isa == cpu::x64::sse41, Xmm, isa == cpu::x64::avx2, Ymm, Zmm>::type;
Vmm vmm_src0 = Vmm(in_vec_idxs[0]);
Vmm vmm_src1 = Vmm(in_vec_idxs[1]);
Vmm vmm_dst = Vmm(out_vec_idxs[0]);
@ -226,7 +229,7 @@ void jit_divide_emitter::emit_isa(const std::vector<size_t> &in_vec_idxs, const
}
};
if (isa == cpu::sse42) {
if (isa == cpu::x64::sse41) {
h->uni_vmovups(vmm_dst, vmm_src0);
uni_vdiv(vmm_dst, vmm_dst, vmm_src1);
} else {
@ -250,26 +253,26 @@ size_t jit_floor_mod_emitter::get_inputs_num() { return 2; }
void jit_floor_mod_emitter::emit_impl(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs,
const std::vector<size_t> &pool_vec_idxs, const std::vector<size_t> &pool_gpr_idxs) {
if (host_isa_ == cpu::sse42) {
emit_isa<cpu::sse42>(in_vec_idxs, out_vec_idxs);
} else if (host_isa_ == cpu::avx2) {
emit_isa<cpu::avx2>(in_vec_idxs, out_vec_idxs);
} else if (host_isa_ == cpu::avx512_common) {
emit_isa<cpu::avx512_common>(in_vec_idxs, out_vec_idxs);
if (host_isa_ == cpu::x64::sse41) {
emit_isa<cpu::x64::sse41>(in_vec_idxs, out_vec_idxs);
} else if (host_isa_ == cpu::x64::avx2) {
emit_isa<cpu::x64::avx2>(in_vec_idxs, out_vec_idxs);
} else if (host_isa_ == cpu::x64::avx512_common) {
emit_isa<cpu::x64::avx512_common>(in_vec_idxs, out_vec_idxs);
} else {
assert(!"unsupported isa");
}
}
template <mkldnn::impl::cpu::cpu_isa_t isa>
template <mkldnn::impl::cpu::x64::cpu_isa_t isa>
void jit_floor_mod_emitter::emit_isa(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const {
using Vmm = typename conditional3<isa == cpu::sse42, Xmm, isa == cpu::avx2, Ymm, Zmm>::type;
using Vmm = typename conditional3<isa == cpu::x64::sse41, Xmm, isa == cpu::x64::avx2, Ymm, Zmm>::type;
Vmm vmm_src0 = Vmm(in_vec_idxs[0]);
Vmm vmm_src1 = Vmm(in_vec_idxs[1]);
Vmm vmm_dst = Vmm(out_vec_idxs[0]);
Vmm vmm_aux0 = Vmm(aux_vec_idxs[0]);
if (isa == cpu::sse42) {
if (isa == cpu::x64::sse41) {
if (vmm_dst.getIdx() != vmm_src0.getIdx())
h->uni_vmovups(vmm_dst, vmm_src0);
h->uni_vmovups(vmm_aux0, vmm_src0);
@ -299,26 +302,26 @@ size_t jit_mod_emitter::get_inputs_num() { return 2; }
void jit_mod_emitter::emit_impl(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs,
const std::vector<size_t> &pool_vec_idxs, const std::vector<size_t> &pool_gpr_idxs) {
if (host_isa_ == cpu::sse42) {
emit_isa<cpu::sse42>(in_vec_idxs, out_vec_idxs);
} else if (host_isa_ == cpu::avx2) {
emit_isa<cpu::avx2>(in_vec_idxs, out_vec_idxs);
} else if (host_isa_ == cpu::avx512_common) {
emit_isa<cpu::avx512_common>(in_vec_idxs, out_vec_idxs);
if (host_isa_ == cpu::x64::sse41) {
emit_isa<cpu::x64::sse41>(in_vec_idxs, out_vec_idxs);
} else if (host_isa_ == cpu::x64::avx2) {
emit_isa<cpu::x64::avx2>(in_vec_idxs, out_vec_idxs);
} else if (host_isa_ == cpu::x64::avx512_common) {
emit_isa<cpu::x64::avx512_common>(in_vec_idxs, out_vec_idxs);
} else {
assert(!"unsupported isa");
}
}
template <mkldnn::impl::cpu::cpu_isa_t isa>
template <mkldnn::impl::cpu::x64::cpu_isa_t isa>
void jit_mod_emitter::emit_isa(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const {
using Vmm = typename conditional3<isa == cpu::sse42, Xmm, isa == cpu::avx2, Ymm, Zmm>::type;
using Vmm = typename conditional3<isa == cpu::x64::sse41, Xmm, isa == cpu::x64::avx2, Ymm, Zmm>::type;
Vmm vmm_src0 = Vmm(in_vec_idxs[0]);
Vmm vmm_src1 = Vmm(in_vec_idxs[1]);
Vmm vmm_dst = Vmm(out_vec_idxs[0]);
Vmm vmm_aux0 = Vmm(aux_vec_idxs[0]);
if (isa == cpu::sse42) {
if (isa == cpu::x64::sse41) {
if (vmm_dst.getIdx() != vmm_src0.getIdx())
h->uni_vmovups(vmm_dst, vmm_src0);
h->uni_vmovups(vmm_aux0, vmm_src0);
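For the two modulo emitters above, a scalar reference under the usual op semantics (an assumption here, stated plainly: FloorMod follows the divisor's sign via floor division, Mod truncates toward zero):

#include <cmath>
#include <cstdio>

// Scalar reference (assumed semantics) for the two modulo emitters above:
// floor_mod uses floor division, mod truncates toward zero.
float floor_mod_ref(float a, float b) { return a - std::floor(a / b) * b; }
float mod_ref(float a, float b)       { return a - std::trunc(a / b) * b; }

int main() {
    std::printf("%f %f\n", floor_mod_ref(-7.f, 3.f), mod_ref(-7.f, 3.f));  // 2, -1
}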
@ -348,20 +351,20 @@ size_t jit_maximum_emitter::get_inputs_num() { return 2; }
void jit_maximum_emitter::emit_impl(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs,
const std::vector<size_t> &pool_vec_idxs, const std::vector<size_t> &pool_gpr_idxs) {
if (host_isa_ == cpu::sse42) {
emit_isa<cpu::sse42>(in_vec_idxs, out_vec_idxs);
} else if (host_isa_ == cpu::avx2) {
emit_isa<cpu::avx2>(in_vec_idxs, out_vec_idxs);
} else if (host_isa_ == cpu::avx512_common) {
emit_isa<cpu::avx512_common>(in_vec_idxs, out_vec_idxs);
if (host_isa_ == cpu::x64::sse41) {
emit_isa<cpu::x64::sse41>(in_vec_idxs, out_vec_idxs);
} else if (host_isa_ == cpu::x64::avx2) {
emit_isa<cpu::x64::avx2>(in_vec_idxs, out_vec_idxs);
} else if (host_isa_ == cpu::x64::avx512_common) {
emit_isa<cpu::x64::avx512_common>(in_vec_idxs, out_vec_idxs);
} else {
assert(!"unsupported isa");
}
}
template <mkldnn::impl::cpu::cpu_isa_t isa>
template <mkldnn::impl::cpu::x64::cpu_isa_t isa>
void jit_maximum_emitter::emit_isa(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const {
using Vmm = typename conditional3<isa == cpu::sse42, Xmm, isa == cpu::avx2, Ymm, Zmm>::type;
using Vmm = typename conditional3<isa == cpu::x64::sse41, Xmm, isa == cpu::x64::avx2, Ymm, Zmm>::type;
Vmm vmm_src0 = Vmm(in_vec_idxs[0]);
Vmm vmm_src1 = Vmm(in_vec_idxs[1]);
Vmm vmm_dst = Vmm(out_vec_idxs[0]);
@ -374,7 +377,7 @@ void jit_maximum_emitter::emit_isa(const std::vector<size_t> &in_vec_idxs, const
}
};
if (isa == cpu::sse42) {
if (isa == cpu::x64::sse41) {
if (vmm_src0.getIdx() != vmm_dst.getIdx())
h->uni_vmovups(vmm_dst, vmm_src0);
uni_vmax(vmm_dst, vmm_dst, vmm_src1);
@ -395,20 +398,20 @@ size_t jit_minimum_emitter::get_inputs_num() { return 2; }
void jit_minimum_emitter::emit_impl(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs,
const std::vector<size_t> &pool_vec_idxs, const std::vector<size_t> &pool_gpr_idxs) {
if (host_isa_ == cpu::sse42) {
emit_isa<cpu::sse42>(in_vec_idxs, out_vec_idxs);
} else if (host_isa_ == cpu::avx2) {
emit_isa<cpu::avx2>(in_vec_idxs, out_vec_idxs);
} else if (host_isa_ == cpu::avx512_common) {
emit_isa<cpu::avx512_common>(in_vec_idxs, out_vec_idxs);
if (host_isa_ == cpu::x64::sse41) {
emit_isa<cpu::x64::sse41>(in_vec_idxs, out_vec_idxs);
} else if (host_isa_ == cpu::x64::avx2) {
emit_isa<cpu::x64::avx2>(in_vec_idxs, out_vec_idxs);
} else if (host_isa_ == cpu::x64::avx512_common) {
emit_isa<cpu::x64::avx512_common>(in_vec_idxs, out_vec_idxs);
} else {
assert(!"unsupported isa");
}
}
template <mkldnn::impl::cpu::cpu_isa_t isa>
template <mkldnn::impl::cpu::x64::cpu_isa_t isa>
void jit_minimum_emitter::emit_isa(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const {
using Vmm = typename conditional3<isa == cpu::sse42, Xmm, isa == cpu::avx2, Ymm, Zmm>::type;
using Vmm = typename conditional3<isa == cpu::x64::sse41, Xmm, isa == cpu::x64::avx2, Ymm, Zmm>::type;
Vmm vmm_src0 = Vmm(in_vec_idxs[0]);
Vmm vmm_src1 = Vmm(in_vec_idxs[1]);
Vmm vmm_dst = Vmm(out_vec_idxs[0]);
@ -421,7 +424,7 @@ void jit_minimum_emitter::emit_isa(const std::vector<size_t> &in_vec_idxs, const
}
};
if (isa == cpu::sse42) {
if (isa == cpu::x64::sse41) {
if (vmm_src0.getIdx() != vmm_dst.getIdx())
h->uni_vmovups(vmm_dst, vmm_src0);
uni_vmin(vmm_dst, vmm_dst, vmm_src1);
@ -442,25 +445,25 @@ size_t jit_squared_difference_emitter::get_inputs_num() { return 2; }
void jit_squared_difference_emitter::emit_impl(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs,
const std::vector<size_t> &pool_vec_idxs, const std::vector<size_t> &pool_gpr_idxs) {
if (host_isa_ == cpu::sse42) {
emit_isa<cpu::sse42>(in_vec_idxs, out_vec_idxs);
} else if (host_isa_ == cpu::avx2) {
emit_isa<cpu::avx2>(in_vec_idxs, out_vec_idxs);
} else if (host_isa_ == cpu::avx512_common) {
emit_isa<cpu::avx512_common>(in_vec_idxs, out_vec_idxs);
if (host_isa_ == cpu::x64::sse41) {
emit_isa<cpu::x64::sse41>(in_vec_idxs, out_vec_idxs);
} else if (host_isa_ == cpu::x64::avx2) {
emit_isa<cpu::x64::avx2>(in_vec_idxs, out_vec_idxs);
} else if (host_isa_ == cpu::x64::avx512_common) {
emit_isa<cpu::x64::avx512_common>(in_vec_idxs, out_vec_idxs);
} else {
assert(!"unsupported isa");
}
}
template <mkldnn::impl::cpu::cpu_isa_t isa>
template <mkldnn::impl::cpu::x64::cpu_isa_t isa>
void jit_squared_difference_emitter::emit_isa(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const {
using Vmm = typename conditional3<isa == cpu::sse42, Xmm, isa == cpu::avx2, Ymm, Zmm>::type;
using Vmm = typename conditional3<isa == cpu::x64::sse41, Xmm, isa == cpu::x64::avx2, Ymm, Zmm>::type;
Vmm vmm_src0 = Vmm(in_vec_idxs[0]);
Vmm vmm_src1 = Vmm(in_vec_idxs[1]);
Vmm vmm_dst = Vmm(out_vec_idxs[0]);
if (isa == cpu::sse42) {
if (isa == cpu::x64::sse41) {
if (vmm_src0.getIdx() != vmm_dst.getIdx())
h->uni_vmovups(vmm_dst, vmm_src0);
h->uni_vsubps(vmm_dst, vmm_dst, vmm_src1);
@ -480,20 +483,20 @@ size_t jit_power_dynamic_emitter::get_inputs_num() { return 2; }
void jit_power_dynamic_emitter::emit_impl(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs,
const std::vector<size_t> &pool_vec_idxs, const std::vector<size_t> &pool_gpr_idxs) {
if (host_isa_ == cpu::sse42) {
emit_isa<cpu::sse42>(in_vec_idxs, out_vec_idxs);
} else if (host_isa_ == cpu::avx2) {
emit_isa<cpu::avx2>(in_vec_idxs, out_vec_idxs);
} else if (host_isa_ == cpu::avx512_common) {
emit_isa<cpu::avx512_common>(in_vec_idxs, out_vec_idxs);
if (host_isa_ == cpu::x64::sse41) {
emit_isa<cpu::x64::sse41>(in_vec_idxs, out_vec_idxs);
} else if (host_isa_ == cpu::x64::avx2) {
emit_isa<cpu::x64::avx2>(in_vec_idxs, out_vec_idxs);
} else if (host_isa_ == cpu::x64::avx512_common) {
emit_isa<cpu::x64::avx512_common>(in_vec_idxs, out_vec_idxs);
} else {
assert(!"unsupported isa");
}
}
template <mkldnn::impl::cpu::cpu_isa_t isa>
template <mkldnn::impl::cpu::x64::cpu_isa_t isa>
void jit_power_dynamic_emitter::emit_isa(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const {
using Vmm = typename conditional3<isa == cpu::sse42, Xmm, isa == cpu::avx2, Ymm, Zmm>::type;
using Vmm = typename conditional3<isa == cpu::x64::sse41, Xmm, isa == cpu::x64::avx2, Ymm, Zmm>::type;
Vmm vmm_src0 = Vmm(in_vec_idxs[0]);
Vmm vmm_src1 = Vmm(in_vec_idxs[1]);
Vmm vmm_dst = Vmm(out_vec_idxs[0]);
@ -512,7 +515,7 @@ void jit_power_dynamic_emitter::emit_isa(const std::vector<size_t> &in_vec_idxs,
// caller obligation to save k-regs as callee may use them
size_t n_k_regs_to_save = 8;
if (isa == cpu::avx512_common || isa == cpu::avx512_core) {
if (isa == cpu::x64::avx512_common || isa == cpu::x64::avx512_core) {
h->sub(h->rsp, n_k_regs_to_save * k_mask_size);
for (size_t i = 0; i < n_k_regs_to_save; ++i) {
if (mayiuse(avx512_core))
@ -561,7 +564,7 @@ void jit_power_dynamic_emitter::emit_isa(const std::vector<size_t> &in_vec_idxs,
h->add(h->rsp, (get_max_vecs_count() + 2) * get_vec_length());
// restore k registers
if (isa == cpu::avx512_common || isa == cpu::avx512_core) {
if (isa == cpu::x64::avx512_common || isa == cpu::x64::avx512_core) {
for (int i = n_k_regs_to_save - 1; i >= 0; --i) {
if (mayiuse(avx512_core))
h->kmovq(Opmask(i), h->ptr[h->rsp + i * k_mask_size]);
@ -588,33 +591,33 @@ size_t jit_equal_emitter::get_inputs_num() { return 2; }
void jit_equal_emitter::emit_impl(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs,
const std::vector<size_t> &pool_vec_idxs, const std::vector<size_t> &pool_gpr_idxs) {
if (host_isa_ == cpu::sse42) {
emit_isa<cpu::sse42>(in_vec_idxs, out_vec_idxs);
} else if (host_isa_ == cpu::avx2) {
emit_isa<cpu::avx2>(in_vec_idxs, out_vec_idxs);
} else if (host_isa_ == cpu::avx512_common) {
emit_isa<cpu::avx512_common>(in_vec_idxs, out_vec_idxs);
if (host_isa_ == cpu::x64::sse41) {
emit_isa<cpu::x64::sse41>(in_vec_idxs, out_vec_idxs);
} else if (host_isa_ == cpu::x64::avx2) {
emit_isa<cpu::x64::avx2>(in_vec_idxs, out_vec_idxs);
} else if (host_isa_ == cpu::x64::avx512_common) {
emit_isa<cpu::x64::avx512_common>(in_vec_idxs, out_vec_idxs);
} else {
assert(!"unsupported isa");
}
}
template <mkldnn::impl::cpu::cpu_isa_t isa>
template <mkldnn::impl::cpu::x64::cpu_isa_t isa>
void jit_equal_emitter::emit_isa(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const {
using Vmm = typename conditional3<isa == cpu::sse42, Xmm, isa == cpu::avx2, Ymm, Zmm>::type;
using Vmm = typename conditional3<isa == cpu::x64::sse41, Xmm, isa == cpu::x64::avx2, Ymm, Zmm>::type;
Vmm vmm_src0 = Vmm(in_vec_idxs[0]);
Vmm vmm_src1 = Vmm(in_vec_idxs[1]);
Vmm vmm_dst = Vmm(out_vec_idxs[0]);
Vmm vmm_aux0 = Vmm(aux_vec_idxs[0]);
Vmm vmm_aux1 = Vmm(aux_vec_idxs[1]);
if (isa == cpu::sse42) {
if (isa == cpu::x64::sse41) {
h->movups(vmm_aux0, vmm_src0);
h->cmpps(vmm_aux0, vmm_src1, _cmp_eq_oq);
h->movups(vmm_aux1, table_val("one"));
h->pxor(vmm_dst, vmm_dst);
h->blendvps(vmm_dst, vmm_aux1);
} else if (isa == cpu::avx2) {
} else if (isa == cpu::x64::avx2) {
h->vcmpeqps(vmm_aux0, vmm_src0, vmm_src1);
h->uni_vmovups(vmm_dst, table_val("zero"));
h->vblendvps(vmm_dst, vmm_dst, table_val("one"), vmm_aux0);
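All comparison emitters in this file share one shape: compare lanewise, then blend the table constants "one"/"zero" into the destination under the resulting mask. Only the mask mechanics differ per ISA: implicit-XMM0 blendvps on sse41, vblendvps on avx2, opmask registers on avx512. A scalar reference for the equal case:

#include <cstdio>

// Scalar reference for the equal emitter above: per lane, the JIT
// materializes 1.0f where src0 == src1 and 0.0f elsewhere.
float equal_ref(float a, float b) { return a == b ? 1.0f : 0.0f; }

int main() { std::printf("%f %f\n", equal_ref(1.f, 1.f), equal_ref(1.f, 2.f)); }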
@ -644,33 +647,33 @@ size_t jit_not_equal_emitter::get_inputs_num() { return 2; }
void jit_not_equal_emitter::emit_impl(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs,
const std::vector<size_t> &pool_vec_idxs, const std::vector<size_t> &pool_gpr_idxs) {
if (host_isa_ == cpu::sse42) {
emit_isa<cpu::sse42>(in_vec_idxs, out_vec_idxs);
} else if (host_isa_ == cpu::avx2) {
emit_isa<cpu::avx2>(in_vec_idxs, out_vec_idxs);
} else if (host_isa_ == cpu::avx512_common) {
emit_isa<cpu::avx512_common>(in_vec_idxs, out_vec_idxs);
if (host_isa_ == cpu::x64::sse41) {
emit_isa<cpu::x64::sse41>(in_vec_idxs, out_vec_idxs);
} else if (host_isa_ == cpu::x64::avx2) {
emit_isa<cpu::x64::avx2>(in_vec_idxs, out_vec_idxs);
} else if (host_isa_ == cpu::x64::avx512_common) {
emit_isa<cpu::x64::avx512_common>(in_vec_idxs, out_vec_idxs);
} else {
assert(!"unsupported isa");
}
}
template <mkldnn::impl::cpu::cpu_isa_t isa>
template <mkldnn::impl::cpu::x64::cpu_isa_t isa>
void jit_not_equal_emitter::emit_isa(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const {
using Vmm = typename conditional3<isa == cpu::sse42, Xmm, isa == cpu::avx2, Ymm, Zmm>::type;
using Vmm = typename conditional3<isa == cpu::x64::sse41, Xmm, isa == cpu::x64::avx2, Ymm, Zmm>::type;
Vmm vmm_src0 = Vmm(in_vec_idxs[0]);
Vmm vmm_src1 = Vmm(in_vec_idxs[1]);
Vmm vmm_dst = Vmm(out_vec_idxs[0]);
Vmm vmm_aux0 = Vmm(aux_vec_idxs[0]);
Vmm vmm_aux1 = Vmm(aux_vec_idxs[1]);
if (isa == cpu::sse42) {
if (isa == cpu::x64::sse41) {
h->movups(vmm_aux0, vmm_src0);
h->cmpps(vmm_aux0, vmm_src1, _cmp_eq_oq);
h->movups(vmm_dst, table_val("one"));
h->pxor(vmm_aux1, vmm_aux1);
h->blendvps(vmm_dst, vmm_aux1);
} else if (isa == cpu::avx2) {
} else if (isa == cpu::x64::avx2) {
h->vcmpeqps(vmm_aux0, vmm_src0, vmm_src1);
h->uni_vmovups(vmm_dst, table_val("one"));
h->vblendvps(vmm_dst, vmm_dst, table_val("zero"), vmm_aux0);
@ -700,33 +703,33 @@ size_t jit_greater_emitter::get_inputs_num() { return 2; }
void jit_greater_emitter::emit_impl(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs,
const std::vector<size_t> &pool_vec_idxs, const std::vector<size_t> &pool_gpr_idxs) {
if (host_isa_ == cpu::sse42) {
emit_isa<cpu::sse42>(in_vec_idxs, out_vec_idxs);
} else if (host_isa_ == cpu::avx2) {
emit_isa<cpu::avx2>(in_vec_idxs, out_vec_idxs);
} else if (host_isa_ == cpu::avx512_common) {
emit_isa<cpu::avx512_common>(in_vec_idxs, out_vec_idxs);
if (host_isa_ == cpu::x64::sse41) {
emit_isa<cpu::x64::sse41>(in_vec_idxs, out_vec_idxs);
} else if (host_isa_ == cpu::x64::avx2) {
emit_isa<cpu::x64::avx2>(in_vec_idxs, out_vec_idxs);
} else if (host_isa_ == cpu::x64::avx512_common) {
emit_isa<cpu::x64::avx512_common>(in_vec_idxs, out_vec_idxs);
} else {
assert(!"unsupported isa");
}
}
template <mkldnn::impl::cpu::cpu_isa_t isa>
template <mkldnn::impl::cpu::x64::cpu_isa_t isa>
void jit_greater_emitter::emit_isa(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const {
using Vmm = typename conditional3<isa == cpu::sse42, Xmm, isa == cpu::avx2, Ymm, Zmm>::type;
using Vmm = typename conditional3<isa == cpu::x64::sse41, Xmm, isa == cpu::x64::avx2, Ymm, Zmm>::type;
Vmm vmm_src0 = Vmm(in_vec_idxs[0]);
Vmm vmm_src1 = Vmm(in_vec_idxs[1]);
Vmm vmm_dst = Vmm(out_vec_idxs[0]);
Vmm vmm_aux0 = Vmm(aux_vec_idxs[0]);
Vmm vmm_aux1 = Vmm(aux_vec_idxs[1]);
if (isa == cpu::sse42) {
if (isa == cpu::x64::sse41) {
h->movups(vmm_aux0, vmm_src0);
h->cmpps(vmm_aux0, vmm_src1, _cmp_gt_os);
h->movups(vmm_aux1, table_val("one"));
h->pxor(vmm_dst, vmm_dst);
h->blendvps(vmm_dst, vmm_aux1);
} else if (isa == cpu::avx2) {
} else if (isa == cpu::x64::avx2) {
h->vcmpgtps(vmm_aux0, vmm_src0, vmm_src1);
h->uni_vmovups(vmm_dst, table_val("zero"));
h->vblendvps(vmm_dst, vmm_dst, table_val("one"), vmm_aux0);
@ -756,33 +759,33 @@ size_t jit_greater_equal_emitter::get_inputs_num() { return 2; }
void jit_greater_equal_emitter::emit_impl(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs,
const std::vector<size_t> &pool_vec_idxs, const std::vector<size_t> &pool_gpr_idxs) {
if (host_isa_ == cpu::sse42) {
emit_isa<cpu::sse42>(in_vec_idxs, out_vec_idxs);
} else if (host_isa_ == cpu::avx2) {
emit_isa<cpu::avx2>(in_vec_idxs, out_vec_idxs);
} else if (host_isa_ == cpu::avx512_common) {
emit_isa<cpu::avx512_common>(in_vec_idxs, out_vec_idxs);
if (host_isa_ == cpu::x64::sse41) {
emit_isa<cpu::x64::sse41>(in_vec_idxs, out_vec_idxs);
} else if (host_isa_ == cpu::x64::avx2) {
emit_isa<cpu::x64::avx2>(in_vec_idxs, out_vec_idxs);
} else if (host_isa_ == cpu::x64::avx512_common) {
emit_isa<cpu::x64::avx512_common>(in_vec_idxs, out_vec_idxs);
} else {
assert(!"unsupported isa");
}
}
template <mkldnn::impl::cpu::cpu_isa_t isa>
template <mkldnn::impl::cpu::x64::cpu_isa_t isa>
void jit_greater_equal_emitter::emit_isa(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const {
using Vmm = typename conditional3<isa == cpu::sse42, Xmm, isa == cpu::avx2, Ymm, Zmm>::type;
using Vmm = typename conditional3<isa == cpu::x64::sse41, Xmm, isa == cpu::x64::avx2, Ymm, Zmm>::type;
Vmm vmm_src0 = Vmm(in_vec_idxs[0]);
Vmm vmm_src1 = Vmm(in_vec_idxs[1]);
Vmm vmm_dst = Vmm(out_vec_idxs[0]);
Vmm vmm_aux0 = Vmm(aux_vec_idxs[0]);
Vmm vmm_aux1 = Vmm(aux_vec_idxs[1]);
if (isa == cpu::sse42) {
if (isa == cpu::x64::sse41) {
h->movups(vmm_aux0, vmm_src0);
h->cmpps(vmm_aux0, vmm_src1, _cmp_ge_os);
h->movups(vmm_aux1, table_val("one"));
h->pxor(vmm_dst, vmm_dst);
h->blendvps(vmm_dst, vmm_aux1);
} else if (isa == cpu::avx2) {
} else if (isa == cpu::x64::avx2) {
h->vcmpgeps(vmm_aux0, vmm_src0, vmm_src1);
h->uni_vmovups(vmm_dst, table_val("zero"));
h->vblendvps(vmm_dst, vmm_dst, table_val("one"), vmm_aux0);
@ -812,33 +815,33 @@ size_t jit_less_emitter::get_inputs_num() { return 2; }
void jit_less_emitter::emit_impl(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs,
const std::vector<size_t> &pool_vec_idxs, const std::vector<size_t> &pool_gpr_idxs) {
if (host_isa_ == cpu::sse42) {
emit_isa<cpu::sse42>(in_vec_idxs, out_vec_idxs);
} else if (host_isa_ == cpu::avx2) {
emit_isa<cpu::avx2>(in_vec_idxs, out_vec_idxs);
} else if (host_isa_ == cpu::avx512_common) {
emit_isa<cpu::avx512_common>(in_vec_idxs, out_vec_idxs);
if (host_isa_ == cpu::x64::sse41) {
emit_isa<cpu::x64::sse41>(in_vec_idxs, out_vec_idxs);
} else if (host_isa_ == cpu::x64::avx2) {
emit_isa<cpu::x64::avx2>(in_vec_idxs, out_vec_idxs);
} else if (host_isa_ == cpu::x64::avx512_common) {
emit_isa<cpu::x64::avx512_common>(in_vec_idxs, out_vec_idxs);
} else {
assert(!"unsupported isa");
}
}
template <mkldnn::impl::cpu::cpu_isa_t isa>
template <mkldnn::impl::cpu::x64::cpu_isa_t isa>
void jit_less_emitter::emit_isa(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const {
using Vmm = typename conditional3<isa == cpu::sse42, Xmm, isa == cpu::avx2, Ymm, Zmm>::type;
using Vmm = typename conditional3<isa == cpu::x64::sse41, Xmm, isa == cpu::x64::avx2, Ymm, Zmm>::type;
Vmm vmm_src0 = Vmm(in_vec_idxs[0]);
Vmm vmm_src1 = Vmm(in_vec_idxs[1]);
Vmm vmm_dst = Vmm(out_vec_idxs[0]);
Vmm vmm_aux0 = Vmm(aux_vec_idxs[0]);
Vmm vmm_aux1 = Vmm(aux_vec_idxs[1]);
if (isa == cpu::sse42) {
if (isa == cpu::x64::sse41) {
h->movups(vmm_aux0, vmm_src0);
h->cmpps(vmm_aux0, vmm_src1, _cmp_lt_os);
h->movups(vmm_aux1, table_val("one"));
h->pxor(vmm_dst, vmm_dst);
h->blendvps(vmm_dst, vmm_aux1);
} else if (isa == cpu::avx2) {
} else if (isa == cpu::x64::avx2) {
h->vcmpltps(vmm_aux0, vmm_src0, vmm_src1);
h->uni_vmovups(vmm_dst, table_val("zero"));
h->vblendvps(vmm_dst, vmm_dst, table_val("one"), vmm_aux0);
@ -868,20 +871,20 @@ size_t jit_less_equal_emitter::get_inputs_num() { return 2; }
void jit_less_equal_emitter::emit_impl(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs,
const std::vector<size_t> &pool_vec_idxs, const std::vector<size_t> &pool_gpr_idxs) {
if (host_isa_ == cpu::sse42) {
emit_isa<cpu::sse42>(in_vec_idxs, out_vec_idxs);
} else if (host_isa_ == cpu::avx2) {
emit_isa<cpu::avx2>(in_vec_idxs, out_vec_idxs);
} else if (host_isa_ == cpu::avx512_common) {
emit_isa<cpu::avx512_common>(in_vec_idxs, out_vec_idxs);
if (host_isa_ == cpu::x64::sse41) {
emit_isa<cpu::x64::sse41>(in_vec_idxs, out_vec_idxs);
} else if (host_isa_ == cpu::x64::avx2) {
emit_isa<cpu::x64::avx2>(in_vec_idxs, out_vec_idxs);
} else if (host_isa_ == cpu::x64::avx512_common) {
emit_isa<cpu::x64::avx512_common>(in_vec_idxs, out_vec_idxs);
} else {
assert(!"unsupported isa");
}
}
template <mkldnn::impl::cpu::cpu_isa_t isa>
template <mkldnn::impl::cpu::x64::cpu_isa_t isa>
void jit_less_equal_emitter::emit_isa(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const {
using Vmm = typename conditional3<isa == cpu::sse42, Xmm, isa == cpu::avx2, Ymm, Zmm>::type;
using Vmm = typename conditional3<isa == cpu::x64::sse41, Xmm, isa == cpu::x64::avx2, Ymm, Zmm>::type;
Vmm vmm_src0 = Vmm(in_vec_idxs[0]);
Vmm vmm_src1 = Vmm(in_vec_idxs[1]);
Vmm vmm_dst = Vmm(out_vec_idxs[0]);
@ -889,13 +892,13 @@ void jit_less_equal_emitter::emit_isa(const std::vector<size_t> &in_vec_idxs, co
Vmm vmm_aux0 = Vmm(aux_vec_idxs[0]);
Vmm vmm_aux1 = Vmm(aux_vec_idxs[1]);
if (isa == cpu::sse42) {
if (isa == cpu::x64::sse41) {
h->movups(vmm_aux0, vmm_src0);
h->cmpps(vmm_aux0, vmm_src1, _cmp_le_os);
h->movups(vmm_aux1, table_val("one"));
h->pxor(vmm_dst, vmm_dst);
h->blendvps(vmm_dst, vmm_aux1);
} else if (isa == cpu::avx2) {
} else if (isa == cpu::x64::avx2) {
h->vcmpleps(vmm_aux0, vmm_src0, vmm_src1);
h->uni_vmovups(vmm_dst, table_val("zero"));
h->vblendvps(vmm_dst, vmm_dst, table_val("one"), vmm_aux0);
@ -925,20 +928,20 @@ size_t jit_logical_and_emitter::get_inputs_num() { return 2; }
void jit_logical_and_emitter::emit_impl(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs,
const std::vector<size_t> &pool_vec_idxs, const std::vector<size_t> &pool_gpr_idxs) {
if (host_isa_ == cpu::sse42) {
emit_isa<cpu::sse42>(in_vec_idxs, out_vec_idxs);
} else if (host_isa_ == cpu::avx2) {
emit_isa<cpu::avx2>(in_vec_idxs, out_vec_idxs);
} else if (host_isa_ == cpu::avx512_common) {
emit_isa<cpu::avx512_common>(in_vec_idxs, out_vec_idxs);
if (host_isa_ == cpu::x64::sse41) {
emit_isa<cpu::x64::sse41>(in_vec_idxs, out_vec_idxs);
} else if (host_isa_ == cpu::x64::avx2) {
emit_isa<cpu::x64::avx2>(in_vec_idxs, out_vec_idxs);
} else if (host_isa_ == cpu::x64::avx512_common) {
emit_isa<cpu::x64::avx512_common>(in_vec_idxs, out_vec_idxs);
} else {
assert(!"unsupported isa");
}
}
template <mkldnn::impl::cpu::cpu_isa_t isa>
template <mkldnn::impl::cpu::x64::cpu_isa_t isa>
void jit_logical_and_emitter::emit_isa(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const {
using Vmm = typename conditional3<isa == cpu::sse42, Xmm, isa == cpu::avx2, Ymm, Zmm>::type;
using Vmm = typename conditional3<isa == cpu::x64::sse41, Xmm, isa == cpu::x64::avx2, Ymm, Zmm>::type;
Vmm vmm_src0 = Vmm(in_vec_idxs[0]);
Vmm vmm_src1 = Vmm(in_vec_idxs[1]);
Vmm vmm_dst = Vmm(out_vec_idxs[0]);
@ -946,7 +949,7 @@ void jit_logical_and_emitter::emit_isa(const std::vector<size_t> &in_vec_idxs, c
Vmm vmm_aux1 = Vmm(aux_vec_idxs[1]);
Vmm vmm_aux2 = Vmm(aux_vec_idxs[2]);
if (isa == cpu::sse42) {
if (isa == cpu::x64::sse41) {
h->pxor(vmm_aux0, vmm_aux0);
h->cmpps(vmm_aux0, vmm_src0, _cmp_eq_oq);
h->movups(vmm_dst, table_val("one"));
@ -960,7 +963,7 @@ void jit_logical_and_emitter::emit_isa(const std::vector<size_t> &in_vec_idxs, c
h->blendvps(vmm_aux2, vmm_aux1);
h->uni_vandps(vmm_dst, vmm_dst, vmm_aux2);
} else if (isa == cpu::avx2) {
} else if (isa == cpu::x64::avx2) {
h->vcmpeqps(vmm_aux0, vmm_src0, table_val("zero"));
h->uni_vmovups(vmm_dst, table_val("one"));
h->vblendvps(vmm_dst, vmm_dst, table_val("zero"), vmm_aux0);
@ -1002,20 +1005,20 @@ size_t jit_logical_or_emitter::get_inputs_num() { return 2; }
void jit_logical_or_emitter::emit_impl(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs,
const std::vector<size_t> &pool_vec_idxs, const std::vector<size_t> &pool_gpr_idxs) {
if (host_isa_ == cpu::sse42) {
emit_isa<cpu::sse42>(in_vec_idxs, out_vec_idxs);
} else if (host_isa_ == cpu::avx2) {
emit_isa<cpu::avx2>(in_vec_idxs, out_vec_idxs);
} else if (host_isa_ == cpu::avx512_common) {
emit_isa<cpu::avx512_common>(in_vec_idxs, out_vec_idxs);
if (host_isa_ == cpu::x64::sse41) {
emit_isa<cpu::x64::sse41>(in_vec_idxs, out_vec_idxs);
} else if (host_isa_ == cpu::x64::avx2) {
emit_isa<cpu::x64::avx2>(in_vec_idxs, out_vec_idxs);
} else if (host_isa_ == cpu::x64::avx512_common) {
emit_isa<cpu::x64::avx512_common>(in_vec_idxs, out_vec_idxs);
} else {
assert(!"unsupported isa");
}
}
template <mkldnn::impl::cpu::cpu_isa_t isa>
template <mkldnn::impl::cpu::x64::cpu_isa_t isa>
void jit_logical_or_emitter::emit_isa(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const {
using Vmm = typename conditional3<isa == cpu::sse42, Xmm, isa == cpu::avx2, Ymm, Zmm>::type;
using Vmm = typename conditional3<isa == cpu::x64::sse41, Xmm, isa == cpu::x64::avx2, Ymm, Zmm>::type;
Vmm vmm_src0 = Vmm(in_vec_idxs[0]);
Vmm vmm_src1 = Vmm(in_vec_idxs[1]);
Vmm vmm_dst = Vmm(out_vec_idxs[0]);
@ -1023,7 +1026,7 @@ void jit_logical_or_emitter::emit_isa(const std::vector<size_t> &in_vec_idxs, co
Vmm vmm_aux1 = Vmm(aux_vec_idxs[1]);
Vmm vmm_aux2 = Vmm(aux_vec_idxs[2]);
if (isa == cpu::sse42) {
if (isa == cpu::x64::sse41) {
h->pxor(vmm_aux0, vmm_aux0);
h->cmpps(vmm_aux0, vmm_src0, _cmp_eq_oq);
h->movups(vmm_dst, table_val("one"));
@ -1037,7 +1040,7 @@ void jit_logical_or_emitter::emit_isa(const std::vector<size_t> &in_vec_idxs, co
h->blendvps(vmm_aux2, vmm_aux1);
h->uni_vorps(vmm_dst, vmm_dst, vmm_aux2);
} else if (isa == cpu::avx2) {
} else if (isa == cpu::x64::avx2) {
h->vcmpeqps(vmm_aux0, vmm_src0, table_val("zero"));
h->uni_vmovups(vmm_dst, table_val("one"));
h->vblendvps(vmm_dst, vmm_dst, table_val("zero"), vmm_aux0);
@ -1078,20 +1081,20 @@ size_t jit_logical_xor_emitter::get_inputs_num() { return 2; }
void jit_logical_xor_emitter::emit_impl(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs,
const std::vector<size_t> &pool_vec_idxs, const std::vector<size_t> &pool_gpr_idxs) {
if (host_isa_ == cpu::sse42) {
emit_isa<cpu::sse42>(in_vec_idxs, out_vec_idxs);
} else if (host_isa_ == cpu::avx2) {
emit_isa<cpu::avx2>(in_vec_idxs, out_vec_idxs);
} else if (host_isa_ == cpu::avx512_common) {
emit_isa<cpu::avx512_common>(in_vec_idxs, out_vec_idxs);
if (host_isa_ == cpu::x64::sse41) {
emit_isa<cpu::x64::sse41>(in_vec_idxs, out_vec_idxs);
} else if (host_isa_ == cpu::x64::avx2) {
emit_isa<cpu::x64::avx2>(in_vec_idxs, out_vec_idxs);
} else if (host_isa_ == cpu::x64::avx512_common) {
emit_isa<cpu::x64::avx512_common>(in_vec_idxs, out_vec_idxs);
} else {
assert(!"unsupported isa");
}
}
template <mkldnn::impl::cpu::cpu_isa_t isa>
template <mkldnn::impl::cpu::x64::cpu_isa_t isa>
void jit_logical_xor_emitter::emit_isa(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const {
using Vmm = typename conditional3<isa == cpu::sse42, Xmm, isa == cpu::avx2, Ymm, Zmm>::type;
using Vmm = typename conditional3<isa == cpu::x64::sse41, Xmm, isa == cpu::x64::avx2, Ymm, Zmm>::type;
Vmm vmm_src0 = Vmm(in_vec_idxs[0]);
Vmm vmm_src1 = Vmm(in_vec_idxs[1]);
Vmm vmm_dst = Vmm(out_vec_idxs[0]);
@ -1099,7 +1102,7 @@ void jit_logical_xor_emitter::emit_isa(const std::vector<size_t> &in_vec_idxs, c
Vmm vmm_aux1 = Vmm(aux_vec_idxs[1]);
Vmm vmm_aux2 = Vmm(aux_vec_idxs[2]);
if (isa == cpu::sse42) {
if (isa == cpu::x64::sse41) {
h->pxor(vmm_aux0, vmm_aux0);
h->cmpps(vmm_aux0, vmm_src0, _cmp_eq_oq);
h->movups(vmm_dst, table_val("one"));
@ -1113,7 +1116,7 @@ void jit_logical_xor_emitter::emit_isa(const std::vector<size_t> &in_vec_idxs, c
h->blendvps(vmm_aux2, vmm_aux1);
h->uni_vxorps(vmm_dst, vmm_dst, vmm_aux2);
} else if (isa == cpu::avx2) {
} else if (isa == cpu::x64::avx2) {
h->vcmpeqps(vmm_aux0, vmm_src0, table_val("zero"));
h->uni_vmovups(vmm_dst, table_val("one"));
h->vblendvps(vmm_dst, vmm_dst, table_val("zero"), vmm_aux0);
@ -1154,32 +1157,32 @@ size_t jit_logical_not_emitter::get_inputs_num() { return 1; }
void jit_logical_not_emitter::emit_impl(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs,
const std::vector<size_t> &pool_vec_idxs, const std::vector<size_t> &pool_gpr_idxs) {
if (host_isa_ == cpu::sse42) {
emit_isa<cpu::sse42>(in_vec_idxs, out_vec_idxs);
} else if (host_isa_ == cpu::avx2) {
emit_isa<cpu::avx2>(in_vec_idxs, out_vec_idxs);
} else if (host_isa_ == cpu::avx512_common) {
emit_isa<cpu::avx512_common>(in_vec_idxs, out_vec_idxs);
if (host_isa_ == cpu::x64::sse41) {
emit_isa<cpu::x64::sse41>(in_vec_idxs, out_vec_idxs);
} else if (host_isa_ == cpu::x64::avx2) {
emit_isa<cpu::x64::avx2>(in_vec_idxs, out_vec_idxs);
} else if (host_isa_ == cpu::x64::avx512_common) {
emit_isa<cpu::x64::avx512_common>(in_vec_idxs, out_vec_idxs);
} else {
assert(!"unsupported isa");
}
}
template <mkldnn::impl::cpu::cpu_isa_t isa>
template <mkldnn::impl::cpu::x64::cpu_isa_t isa>
void jit_logical_not_emitter::emit_isa(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const {
using Vmm = typename conditional3<isa == cpu::sse42, Xmm, isa == cpu::avx2, Ymm, Zmm>::type;
using Vmm = typename conditional3<isa == cpu::x64::sse41, Xmm, isa == cpu::x64::avx2, Ymm, Zmm>::type;
Vmm vmm_src0 = Vmm(in_vec_idxs[0]);
Vmm vmm_dst = Vmm(out_vec_idxs[0]);
Vmm vmm_aux0 = Vmm(aux_vec_idxs[0]);
Vmm vmm_aux1 = Vmm(aux_vec_idxs[1]);
if (isa == cpu::sse42) {
if (isa == cpu::x64::sse41) {
h->pxor(vmm_aux0, vmm_aux0);
h->cmpps(vmm_aux0, vmm_src0, _cmp_eq_oq);
h->movups(vmm_aux1, table_val("one"));
h->pxor(vmm_dst, vmm_dst);
h->blendvps(vmm_dst, vmm_aux1);
} else if (isa == cpu::avx2) {
} else if (isa == cpu::x64::avx2) {
h->vcmpeqps(vmm_aux0, vmm_src0, table_val("zero"));
h->uni_vmovups(vmm_dst, table_val("zero"));
h->vblendvps(vmm_dst, vmm_dst, table_val("one"), vmm_aux0);
@ -1209,20 +1212,20 @@ size_t jit_power_static_emitter::get_inputs_num() { return 1; }
void jit_power_static_emitter::emit_impl(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs,
const std::vector<size_t> &pool_vec_idxs, const std::vector<size_t> &pool_gpr_idxs) {
if (host_isa_ == cpu::sse42) {
emit_isa<cpu::sse42>(in_vec_idxs, out_vec_idxs);
} else if (host_isa_ == cpu::avx2) {
emit_isa<cpu::avx2>(in_vec_idxs, out_vec_idxs);
} else if (host_isa_ == cpu::avx512_common) {
emit_isa<cpu::avx512_common>(in_vec_idxs, out_vec_idxs);
if (host_isa_ == cpu::x64::sse41) {
emit_isa<cpu::x64::sse41>(in_vec_idxs, out_vec_idxs);
} else if (host_isa_ == cpu::x64::avx2) {
emit_isa<cpu::x64::avx2>(in_vec_idxs, out_vec_idxs);
} else if (host_isa_ == cpu::x64::avx512_common) {
emit_isa<cpu::x64::avx512_common>(in_vec_idxs, out_vec_idxs);
} else {
assert(!"unsupported isa");
}
}
template <mkldnn::impl::cpu::cpu_isa_t isa>
template <mkldnn::impl::cpu::x64::cpu_isa_t isa>
void jit_power_static_emitter::emit_isa(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const {
using Vmm = typename conditional3<isa == cpu::sse42, Xmm, isa == cpu::avx2, Ymm, Zmm>::type;
using Vmm = typename conditional3<isa == cpu::x64::sse41, Xmm, isa == cpu::x64::avx2, Ymm, Zmm>::type;
Vmm vmm_src0 = Vmm(in_vec_idxs[0]);
Vmm vmm_dst = Vmm(out_vec_idxs[0]);
Vmm vmm_aux0 = Vmm(aux_vec_idxs[0]);
@ -1238,7 +1241,7 @@ void jit_power_static_emitter::emit_isa(const std::vector<size_t> &in_vec_idxs,
Xmm xmm0 = Xmm(0), xmm1 = Xmm(1);
if (scale != 1.f || shift != 0.f) {
if (isa == cpu::sse42) {
if (isa == cpu::x64::sse41) {
h->uni_vmovups(vmm_aux0, table_val("scale"));
h->uni_vmulps(vmm_aux0, vmm_aux0, vmm_src0);
h->uni_vmovups(vmm_dst, table_val("shift"));
@ -1264,7 +1267,7 @@ void jit_power_static_emitter::emit_isa(const std::vector<size_t> &in_vec_idxs,
if (power < 0.f) {
h->uni_vmovups(vmm_aux0, table_val("one"));
if (isa == cpu::sse42) {
if (isa == cpu::x64::sse41) {
h->uni_vdivps(vmm_aux0, vmm_aux0, vmm_dst);
h->uni_vmovups(vmm_dst, vmm_aux0);
} else {
@ -1280,7 +1283,7 @@ void jit_power_static_emitter::emit_isa(const std::vector<size_t> &in_vec_idxs,
if (power < 0.f) {
h->uni_vmovups(vmm_aux0, table_val("one"));
if (isa == cpu::sse42) {
if (isa == cpu::x64::sse41) {
h->uni_vdivps(vmm_aux0, vmm_aux0, vmm_dst);
h->uni_vmovups(vmm_dst, vmm_aux0);
} else {
@ -1302,7 +1305,7 @@ void jit_power_static_emitter::emit_isa(const std::vector<size_t> &in_vec_idxs,
// caller obligation to save k-regs as callee may use them
size_t n_k_regs_to_save = 8;
if (isa == cpu::avx512_common || isa == cpu::avx512_core) {
if (isa == cpu::x64::avx512_common || isa == cpu::x64::avx512_core) {
h->sub(h->rsp, n_k_regs_to_save * k_mask_size);
for (size_t i = 0; i < n_k_regs_to_save; ++i) {
if (mayiuse(avx512_core))
@ -1351,7 +1354,7 @@ void jit_power_static_emitter::emit_isa(const std::vector<size_t> &in_vec_idxs,
h->add(h->rsp, (get_max_vecs_count() + 2) * get_vec_length());
// restore k registers
if (isa == cpu::avx512_common || isa == cpu::avx512_core) {
if (isa == cpu::x64::avx512_common || isa == cpu::x64::avx512_core) {
for (int i = n_k_regs_to_save - 1; i >= 0; --i) {
if (mayiuse(avx512_core))
h->kmovq(Opmask(i), h->ptr[h->rsp + i * k_mask_size]);
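Taken together, the power-static hunks compute (scale * x + shift) raised to power, with the reciprocal taken at the end for negative powers; the k-register spill brackets the external pow call, per the comment about caller obligations. A scalar reference (parameter names follow the "scale"/"shift" table entries):

#include <cmath>
#include <cstdio>

// Scalar reference for the static-power emitter above:
// out = (scale*x + shift)^power, computing the positive power first and
// taking the reciprocal for negative powers, as the hunks show.
float power_static_ref(float x, float scale, float shift, float power) {
    float v = scale * x + shift;
    float p = std::pow(v, std::fabs(power));
    return power < 0.f ? 1.f / p : p;
}

int main() {
    std::printf("%f\n", power_static_ref(2.f, 1.f, 0.f, -2.f));  // 0.25
}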
@ -1397,27 +1400,27 @@ size_t jit_prelu_emitter::get_inputs_num() { return 2; }
void jit_prelu_emitter::emit_impl(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs,
const std::vector<size_t> &pool_vec_idxs, const std::vector<size_t> &pool_gpr_idxs) {
if (host_isa_ == cpu::sse42) {
emit_isa<cpu::sse42>(in_vec_idxs, out_vec_idxs);
} else if (host_isa_ == cpu::avx2) {
emit_isa<cpu::avx2>(in_vec_idxs, out_vec_idxs);
} else if (host_isa_ == cpu::avx512_common) {
emit_isa<cpu::avx512_common>(in_vec_idxs, out_vec_idxs);
if (host_isa_ == cpu::x64::sse41) {
emit_isa<cpu::x64::sse41>(in_vec_idxs, out_vec_idxs);
} else if (host_isa_ == cpu::x64::avx2) {
emit_isa<cpu::x64::avx2>(in_vec_idxs, out_vec_idxs);
} else if (host_isa_ == cpu::x64::avx512_common) {
emit_isa<cpu::x64::avx512_common>(in_vec_idxs, out_vec_idxs);
} else {
assert(!"unsupported isa");
}
}
template <mkldnn::impl::cpu::cpu_isa_t isa>
template <mkldnn::impl::cpu::x64::cpu_isa_t isa>
void jit_prelu_emitter::emit_isa(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const {
using Vmm = typename conditional3<isa == cpu::sse42, Xmm, isa == cpu::avx2, Ymm, Zmm>::type;
using Vmm = typename conditional3<isa == cpu::x64::sse41, Xmm, isa == cpu::x64::avx2, Ymm, Zmm>::type;
Vmm vmm_src0 = Vmm(in_vec_idxs[0]);
Vmm vmm_src1 = Vmm(in_vec_idxs[1]);
Vmm vmm_dst = Vmm(out_vec_idxs[0]);
Vmm vmm_aux0 = Vmm(aux_vec_idxs[0]);
Vmm vmm_aux1 = Vmm(aux_vec_idxs[1]);
if (isa == cpu::sse42) {
if (isa == cpu::x64::sse41) {
h->pxor(vmm_aux0, vmm_aux0);
h->cmpps(vmm_aux0, vmm_src0, _cmp_gt_os);
h->movups(vmm_aux1, vmm_src1);
@ -1425,12 +1428,12 @@ void jit_prelu_emitter::emit_isa(const std::vector<size_t> &in_vec_idxs, const s
if (vmm_src0.getIdx() != vmm_dst.getIdx())
h->movups(vmm_dst, vmm_src0);
h->blendvps(vmm_dst, vmm_aux1);
} else if (isa == cpu::avx2) {
} else if (isa == cpu::x64::avx2) {
h->vmulps(vmm_aux0, vmm_src0, vmm_src1);
h->vxorps(vmm_aux1, vmm_aux1, vmm_aux1);
h->vcmpgtps(vmm_aux1, vmm_src0, vmm_aux1);
h->vblendvps(vmm_dst, vmm_aux0, vmm_src0, vmm_aux1);
} else if (isa == cpu::avx512_common) {
} else if (isa == cpu::x64::avx512_common) {
h->vxorpd(vmm_aux0, vmm_aux0, vmm_aux0);
if (vmm_src0.getIdx() != vmm_dst.getIdx())
h->vmovups(vmm_dst, vmm_src0);
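Finally, the PReLU emitter blends on the sign of the input: positive lanes pass through, negative lanes are scaled by the per-element slope held in src1. A scalar reference:

#include <cstdio>

// Scalar reference for the PReLU emitter above: pass positive inputs
// through, scale negative ones by the per-element slope.
float prelu_ref(float x, float slope) { return x > 0.f ? x : slope * x; }

int main() { std::printf("%f %f\n", prelu_ref(3.f, 0.25f), prelu_ref(-4.f, 0.25f)); }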


@ -5,14 +5,14 @@
#pragma once
#include "common/emitter.h"
#include "jit_generator.hpp"
#include <cpu/x64/jit_generator.hpp>
#include "mkldnn_node.h"
namespace MKLDNNPlugin {
class jit_add_emitter : public jit_emitter {
public:
jit_add_emitter(mkldnn::impl::cpu::jit_generator *host, mkldnn::impl::cpu::cpu_isa_t host_isa, const MKLDNNNode* node,
jit_add_emitter(mkldnn::impl::cpu::x64::jit_generator *host, mkldnn::impl::cpu::x64::cpu_isa_t host_isa, const MKLDNNNode* node,
InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32);
size_t get_inputs_num() override;
@ -21,13 +21,13 @@ private:
void emit_impl(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs,
const std::vector<size_t> &pool_vec_idxs, const std::vector<size_t> &pool_gpr_idxs) override;
template <mkldnn::impl::cpu::cpu_isa_t isa>
template <mkldnn::impl::cpu::x64::cpu_isa_t isa>
void emit_isa(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const;
};
class jit_mul_add_emitter : public jit_emitter {
public:
jit_mul_add_emitter(mkldnn::impl::cpu::jit_generator *host, mkldnn::impl::cpu::cpu_isa_t host_isa, const MKLDNNNode* node,
jit_mul_add_emitter(mkldnn::impl::cpu::x64::jit_generator *host, mkldnn::impl::cpu::x64::cpu_isa_t host_isa, const MKLDNNNode* node,
InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32);
size_t get_inputs_num() override;
@ -36,7 +36,7 @@ private:
void emit_impl(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs,
const std::vector<size_t> &pool_vec_idxs, const std::vector<size_t> &pool_gpr_idxs) override;
template <mkldnn::impl::cpu::cpu_isa_t isa>
template <mkldnn::impl::cpu::x64::cpu_isa_t isa>
void emit_isa(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const;
size_t aux_vecs_count() const override;
@ -45,7 +45,7 @@ private:
class jit_subtract_emitter : public jit_emitter {
public:
jit_subtract_emitter(mkldnn::impl::cpu::jit_generator *host, mkldnn::impl::cpu::cpu_isa_t host_isa, const MKLDNNNode* node,
jit_subtract_emitter(mkldnn::impl::cpu::x64::jit_generator *host, mkldnn::impl::cpu::x64::cpu_isa_t host_isa, const MKLDNNNode* node,
InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32);
size_t get_inputs_num() override;
@ -54,14 +54,14 @@ private:
void emit_impl(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs,
const std::vector<size_t> &pool_vec_idxs, const std::vector<size_t> &pool_gpr_idxs) override;
template <mkldnn::impl::cpu::cpu_isa_t isa>
template <mkldnn::impl::cpu::x64::cpu_isa_t isa>
void emit_isa(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const;
};
class jit_multiply_emitter : public jit_emitter {
public:
jit_multiply_emitter(mkldnn::impl::cpu::jit_generator *host, mkldnn::impl::cpu::cpu_isa_t host_isa, const MKLDNNNode* node,
jit_multiply_emitter(mkldnn::impl::cpu::x64::jit_generator *host, mkldnn::impl::cpu::x64::cpu_isa_t host_isa, const MKLDNNNode* node,
InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32);
size_t get_inputs_num() override;
@ -70,14 +70,14 @@ private:
void emit_impl(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs,
const std::vector<size_t> &pool_vec_idxs, const std::vector<size_t> &pool_gpr_idxs) override;
template <mkldnn::impl::cpu::cpu_isa_t isa>
template <mkldnn::impl::cpu::x64::cpu_isa_t isa>
void emit_isa(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const;
};
class jit_divide_emitter : public jit_emitter {
public:
jit_divide_emitter(mkldnn::impl::cpu::jit_generator *host, mkldnn::impl::cpu::cpu_isa_t host_isa, const MKLDNNNode* node,
jit_divide_emitter(mkldnn::impl::cpu::x64::jit_generator *host, mkldnn::impl::cpu::x64::cpu_isa_t host_isa, const MKLDNNNode* node,
InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32);
size_t get_inputs_num() override;
@ -87,7 +87,7 @@ private:
void emit_impl(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs,
const std::vector<size_t> &pool_vec_idxs, const std::vector<size_t> &pool_gpr_idxs) override;
template <mkldnn::impl::cpu::cpu_isa_t isa>
template <mkldnn::impl::cpu::x64::cpu_isa_t isa>
void emit_isa(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const;
size_t aux_vecs_count() const override;
};
@ -95,7 +95,7 @@ private:
class jit_floor_mod_emitter : public jit_emitter {
public:
jit_floor_mod_emitter(mkldnn::impl::cpu::jit_generator *host, mkldnn::impl::cpu::cpu_isa_t host_isa, const MKLDNNNode* node,
jit_floor_mod_emitter(mkldnn::impl::cpu::x64::jit_generator *host, mkldnn::impl::cpu::x64::cpu_isa_t host_isa, const MKLDNNNode* node,
InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32);
size_t get_inputs_num() override;
@ -104,7 +104,7 @@ private:
void emit_impl(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs,
const std::vector<size_t> &pool_vec_idxs, const std::vector<size_t> &pool_gpr_idxs) override;
template <mkldnn::impl::cpu::cpu_isa_t isa>
template <mkldnn::impl::cpu::x64::cpu_isa_t isa>
void emit_isa(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const;
size_t aux_vecs_count() const override;
};
@ -112,7 +112,7 @@ private:
class jit_mod_emitter : public jit_emitter {
public:
jit_mod_emitter(mkldnn::impl::cpu::jit_generator *host, mkldnn::impl::cpu::cpu_isa_t host_isa, const MKLDNNNode* node,
jit_mod_emitter(mkldnn::impl::cpu::x64::jit_generator *host, mkldnn::impl::cpu::x64::cpu_isa_t host_isa, const MKLDNNNode* node,
InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32);
size_t get_inputs_num() override;
@ -121,7 +121,7 @@ private:
void emit_impl(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs,
const std::vector<size_t> &pool_vec_idxs, const std::vector<size_t> &pool_gpr_idxs) override;
template <mkldnn::impl::cpu::cpu_isa_t isa>
template <mkldnn::impl::cpu::x64::cpu_isa_t isa>
void emit_isa(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const;
size_t aux_vecs_count() const override;
};
@ -129,7 +129,7 @@ private:
class jit_maximum_emitter : public jit_emitter {
public:
jit_maximum_emitter(mkldnn::impl::cpu::jit_generator *host, mkldnn::impl::cpu::cpu_isa_t host_isa, const MKLDNNNode* node,
jit_maximum_emitter(mkldnn::impl::cpu::x64::jit_generator *host, mkldnn::impl::cpu::x64::cpu_isa_t host_isa, const MKLDNNNode* node,
InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32);
size_t get_inputs_num() override;
@ -139,14 +139,14 @@ private:
void emit_impl(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs,
const std::vector<size_t> &pool_vec_idxs, const std::vector<size_t> &pool_gpr_idxs) override;
template <mkldnn::impl::cpu::cpu_isa_t isa>
template <mkldnn::impl::cpu::x64::cpu_isa_t isa>
void emit_isa(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const;
};
class jit_minimum_emitter : public jit_emitter {
public:
jit_minimum_emitter(mkldnn::impl::cpu::jit_generator *host, mkldnn::impl::cpu::cpu_isa_t host_isa, const MKLDNNNode* node,
jit_minimum_emitter(mkldnn::impl::cpu::x64::jit_generator *host, mkldnn::impl::cpu::x64::cpu_isa_t host_isa, const MKLDNNNode* node,
InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32);
size_t get_inputs_num() override;
@ -156,14 +156,14 @@ private:
void emit_impl(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs,
const std::vector<size_t> &pool_vec_idxs, const std::vector<size_t> &pool_gpr_idxs) override;
template <mkldnn::impl::cpu::cpu_isa_t isa>
template <mkldnn::impl::cpu::x64::cpu_isa_t isa>
void emit_isa(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const;
};
class jit_squared_difference_emitter : public jit_emitter {
public:
jit_squared_difference_emitter(mkldnn::impl::cpu::jit_generator *host, mkldnn::impl::cpu::cpu_isa_t host_isa, const MKLDNNNode* node,
jit_squared_difference_emitter(mkldnn::impl::cpu::x64::jit_generator *host, mkldnn::impl::cpu::x64::cpu_isa_t host_isa, const MKLDNNNode* node,
InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32);
size_t get_inputs_num() override;
@ -172,14 +172,14 @@ private:
void emit_impl(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs,
const std::vector<size_t> &pool_vec_idxs, const std::vector<size_t> &pool_gpr_idxs) override;
template <mkldnn::impl::cpu::cpu_isa_t isa>
template <mkldnn::impl::cpu::x64::cpu_isa_t isa>
void emit_isa(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const;
};
class jit_power_dynamic_emitter : public jit_emitter {
public:
jit_power_dynamic_emitter(mkldnn::impl::cpu::jit_generator *host, mkldnn::impl::cpu::cpu_isa_t host_isa, const MKLDNNNode* node,
jit_power_dynamic_emitter(mkldnn::impl::cpu::x64::jit_generator *host, mkldnn::impl::cpu::x64::cpu_isa_t host_isa, const MKLDNNNode* node,
InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32);
size_t get_inputs_num() override;
@ -188,14 +188,14 @@ private:
void emit_impl(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs,
const std::vector<size_t> &pool_vec_idxs, const std::vector<size_t> &pool_gpr_idxs) override;
template <mkldnn::impl::cpu::cpu_isa_t isa>
template <mkldnn::impl::cpu::x64::cpu_isa_t isa>
void emit_isa(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const;
};
class jit_equal_emitter : public jit_emitter {
public:
jit_equal_emitter(mkldnn::impl::cpu::jit_generator *host, mkldnn::impl::cpu::cpu_isa_t host_isa, const MKLDNNNode* node,
jit_equal_emitter(mkldnn::impl::cpu::x64::jit_generator *host, mkldnn::impl::cpu::x64::cpu_isa_t host_isa, const MKLDNNNode* node,
InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32);
size_t get_inputs_num() override;
@ -204,7 +204,7 @@ private:
void emit_impl(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs,
const std::vector<size_t> &pool_vec_idxs, const std::vector<size_t> &pool_gpr_idxs) override;
template <mkldnn::impl::cpu::cpu_isa_t isa>
template <mkldnn::impl::cpu::x64::cpu_isa_t isa>
void emit_isa(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const;
void register_table_entries() override;
@ -214,7 +214,7 @@ private:
class jit_not_equal_emitter : public jit_emitter {
public:
jit_not_equal_emitter(mkldnn::impl::cpu::jit_generator *host, mkldnn::impl::cpu::cpu_isa_t host_isa, const MKLDNNNode* node,
jit_not_equal_emitter(mkldnn::impl::cpu::x64::jit_generator *host, mkldnn::impl::cpu::x64::cpu_isa_t host_isa, const MKLDNNNode* node,
InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32);
size_t get_inputs_num() override;
@ -223,7 +223,7 @@ private:
void emit_impl(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs,
const std::vector<size_t> &pool_vec_idxs, const std::vector<size_t> &pool_gpr_idxs) override;
template <mkldnn::impl::cpu::cpu_isa_t isa>
template <mkldnn::impl::cpu::x64::cpu_isa_t isa>
void emit_isa(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const;
void register_table_entries() override;
@ -233,7 +233,7 @@ private:
class jit_greater_emitter : public jit_emitter {
public:
jit_greater_emitter(mkldnn::impl::cpu::jit_generator *host, mkldnn::impl::cpu::cpu_isa_t host_isa, const MKLDNNNode* node,
jit_greater_emitter(mkldnn::impl::cpu::x64::jit_generator *host, mkldnn::impl::cpu::x64::cpu_isa_t host_isa, const MKLDNNNode* node,
InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32);
size_t get_inputs_num() override;
@ -242,7 +242,7 @@ private:
void emit_impl(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs,
const std::vector<size_t> &pool_vec_idxs, const std::vector<size_t> &pool_gpr_idxs) override;
template <mkldnn::impl::cpu::cpu_isa_t isa>
template <mkldnn::impl::cpu::x64::cpu_isa_t isa>
void emit_isa(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const;
void register_table_entries() override;
@ -252,7 +252,7 @@ private:
class jit_greater_equal_emitter : public jit_emitter {
public:
jit_greater_equal_emitter(mkldnn::impl::cpu::jit_generator *host, mkldnn::impl::cpu::cpu_isa_t host_isa, const MKLDNNNode* node,
jit_greater_equal_emitter(mkldnn::impl::cpu::x64::jit_generator *host, mkldnn::impl::cpu::x64::cpu_isa_t host_isa, const MKLDNNNode* node,
InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32);
size_t get_inputs_num() override;
@ -261,7 +261,7 @@ private:
void emit_impl(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs,
const std::vector<size_t> &pool_vec_idxs, const std::vector<size_t> &pool_gpr_idxs) override;
template <mkldnn::impl::cpu::cpu_isa_t isa>
template <mkldnn::impl::cpu::x64::cpu_isa_t isa>
void emit_isa(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const;
void register_table_entries() override;
@ -271,7 +271,7 @@ private:
class jit_less_emitter : public jit_emitter {
public:
jit_less_emitter(mkldnn::impl::cpu::jit_generator *host, mkldnn::impl::cpu::cpu_isa_t host_isa, const MKLDNNNode* node,
jit_less_emitter(mkldnn::impl::cpu::x64::jit_generator *host, mkldnn::impl::cpu::x64::cpu_isa_t host_isa, const MKLDNNNode* node,
InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32);
size_t get_inputs_num() override;
@ -280,7 +280,7 @@ private:
void emit_impl(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs,
const std::vector<size_t> &pool_vec_idxs, const std::vector<size_t> &pool_gpr_idxs) override;
template <mkldnn::impl::cpu::cpu_isa_t isa>
template <mkldnn::impl::cpu::x64::cpu_isa_t isa>
void emit_isa(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const;
void register_table_entries() override;
@ -290,7 +290,7 @@ private:
class jit_less_equal_emitter : public jit_emitter {
public:
jit_less_equal_emitter(mkldnn::impl::cpu::jit_generator *host, mkldnn::impl::cpu::cpu_isa_t host_isa, const MKLDNNNode* node,
jit_less_equal_emitter(mkldnn::impl::cpu::x64::jit_generator *host, mkldnn::impl::cpu::x64::cpu_isa_t host_isa, const MKLDNNNode* node,
InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32);
size_t get_inputs_num() override;
@ -299,7 +299,7 @@ private:
void emit_impl(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs,
const std::vector<size_t> &pool_vec_idxs, const std::vector<size_t> &pool_gpr_idxs) override;
template <mkldnn::impl::cpu::cpu_isa_t isa>
template <mkldnn::impl::cpu::x64::cpu_isa_t isa>
void emit_isa(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const;
void register_table_entries() override;
@ -309,7 +309,7 @@ private:
class jit_logical_and_emitter : public jit_emitter {
public:
jit_logical_and_emitter(mkldnn::impl::cpu::jit_generator *host, mkldnn::impl::cpu::cpu_isa_t host_isa, const MKLDNNNode* node,
jit_logical_and_emitter(mkldnn::impl::cpu::x64::jit_generator *host, mkldnn::impl::cpu::x64::cpu_isa_t host_isa, const MKLDNNNode* node,
InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32);
size_t get_inputs_num() override;
@ -318,7 +318,7 @@ private:
void emit_impl(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs,
const std::vector<size_t> &pool_vec_idxs, const std::vector<size_t> &pool_gpr_idxs) override;
template <mkldnn::impl::cpu::cpu_isa_t isa>
template <mkldnn::impl::cpu::x64::cpu_isa_t isa>
void emit_isa(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const;
void register_table_entries() override;
@ -328,7 +328,7 @@ private:
class jit_logical_or_emitter : public jit_emitter {
public:
jit_logical_or_emitter(mkldnn::impl::cpu::jit_generator *host, mkldnn::impl::cpu::cpu_isa_t host_isa, const MKLDNNNode* node,
jit_logical_or_emitter(mkldnn::impl::cpu::x64::jit_generator *host, mkldnn::impl::cpu::x64::cpu_isa_t host_isa, const MKLDNNNode* node,
InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32);
size_t get_inputs_num() override;
@ -337,7 +337,7 @@ private:
void emit_impl(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs,
const std::vector<size_t> &pool_vec_idxs, const std::vector<size_t> &pool_gpr_idxs) override;
template <mkldnn::impl::cpu::cpu_isa_t isa>
template <mkldnn::impl::cpu::x64::cpu_isa_t isa>
void emit_isa(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const;
void register_table_entries() override;
@ -347,7 +347,7 @@ private:
class jit_logical_xor_emitter : public jit_emitter {
public:
jit_logical_xor_emitter(mkldnn::impl::cpu::jit_generator *host, mkldnn::impl::cpu::cpu_isa_t host_isa, const MKLDNNNode* node,
jit_logical_xor_emitter(mkldnn::impl::cpu::x64::jit_generator *host, mkldnn::impl::cpu::x64::cpu_isa_t host_isa, const MKLDNNNode* node,
InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32);
size_t get_inputs_num() override;
@ -356,7 +356,7 @@ private:
void emit_impl(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs,
const std::vector<size_t> &pool_vec_idxs, const std::vector<size_t> &pool_gpr_idxs) override;
template <mkldnn::impl::cpu::cpu_isa_t isa>
template <mkldnn::impl::cpu::x64::cpu_isa_t isa>
void emit_isa(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const;
void register_table_entries() override;
@ -365,7 +365,7 @@ private:
class jit_logical_not_emitter : public jit_emitter {
public:
jit_logical_not_emitter(mkldnn::impl::cpu::jit_generator *host, mkldnn::impl::cpu::cpu_isa_t host_isa, const MKLDNNNode* node,
jit_logical_not_emitter(mkldnn::impl::cpu::x64::jit_generator *host, mkldnn::impl::cpu::x64::cpu_isa_t host_isa, const MKLDNNNode* node,
InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32);
size_t get_inputs_num() override;
@ -374,7 +374,7 @@ private:
void emit_impl(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs,
const std::vector<size_t> &pool_vec_idxs, const std::vector<size_t> &pool_gpr_idxs) override;
template <mkldnn::impl::cpu::cpu_isa_t isa>
template <mkldnn::impl::cpu::x64::cpu_isa_t isa>
void emit_isa(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const;
void register_table_entries() override;
@ -383,7 +383,7 @@ private:
class jit_power_static_emitter : public jit_emitter {
public:
jit_power_static_emitter(mkldnn::impl::cpu::jit_generator *host, mkldnn::impl::cpu::cpu_isa_t host_isa, const MKLDNNNode* node,
jit_power_static_emitter(mkldnn::impl::cpu::x64::jit_generator *host, mkldnn::impl::cpu::x64::cpu_isa_t host_isa, const MKLDNNNode* node,
InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32);
size_t get_inputs_num() override;
@ -392,7 +392,7 @@ private:
void emit_impl(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs,
const std::vector<size_t> &pool_vec_idxs, const std::vector<size_t> &pool_gpr_idxs) override;
template <mkldnn::impl::cpu::cpu_isa_t isa>
template <mkldnn::impl::cpu::x64::cpu_isa_t isa>
void emit_isa(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const;
void register_table_entries() override;
@ -401,7 +401,7 @@ private:
class jit_prelu_emitter : public jit_emitter {
public:
jit_prelu_emitter(mkldnn::impl::cpu::jit_generator *host, mkldnn::impl::cpu::cpu_isa_t host_isa, const MKLDNNNode* node,
jit_prelu_emitter(mkldnn::impl::cpu::x64::jit_generator *host, mkldnn::impl::cpu::x64::cpu_isa_t host_isa, const MKLDNNNode* node,
InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32);
size_t get_inputs_num() override;
@ -410,7 +410,7 @@ private:
void emit_impl(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs,
const std::vector<size_t> &pool_vec_idxs, const std::vector<size_t> &pool_gpr_idxs) override;
template <mkldnn::impl::cpu::cpu_isa_t isa>
template <mkldnn::impl::cpu::x64::cpu_isa_t isa>
void emit_isa(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs) const;
size_t aux_vecs_count() const override;
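
Every hunk in this header is the same mechanical change: in oneDNN v1.6 the x86 JIT infrastructure moved from mkldnn::impl::cpu into mkldnn::impl::cpu::x64, so the jit_generator host pointer and the cpu_isa_t template and constructor parameters all gain one namespace level. A minimal sketch of the resulting declaration shape, using a hypothetical jit_example_emitter rather than any class from this diff:

#include <cpu/x64/jit_generator.hpp>

class jit_example_emitter : public jit_emitter {
public:
    jit_example_emitter(mkldnn::impl::cpu::x64::jit_generator *host,
                        mkldnn::impl::cpu::x64::cpu_isa_t host_isa,
                        const MKLDNNNode *node,
                        InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32);

private:
    // The per-ISA body stays a template; only the namespace of cpu_isa_t changed.
    template <mkldnn::impl::cpu::x64::cpu_isa_t isa>
    void emit_isa(const std::vector<size_t> &in_vec_idxs,
                  const std::vector<size_t> &out_vec_idxs) const;
};
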

View File

@ -8,7 +8,8 @@
#include "legacy/ie_layers.h"
using namespace mkldnn::impl::utils;
using namespace mkldnn::impl::cpu;
using namespace mkldnn::impl;
using namespace mkldnn::impl::cpu::x64;
using namespace Xbyak;
namespace MKLDNNPlugin {
@ -19,15 +20,15 @@ jit_mkldnn_emitter::jit_mkldnn_emitter(jit_generator *host, cpu_isa_t host_isa,
auto alg = static_cast<mkldnn_alg_kind_t>(eltwiseNode.getAlgorithm());
if (host_isa_ == cpu::sse42) {
eltwise_injector_sse42 = std::make_shared<jit_uni_eltwise_injector_f32<cpu::sse42>>(
host, alg, eltwiseNode.getAlpha(), eltwiseNode.getBeta());
} else if (host_isa_ == cpu::avx2) {
eltwise_injector_avx2 = std::make_shared<jit_uni_eltwise_injector_f32<cpu::avx2>>(
host, alg, eltwiseNode.getAlpha(), eltwiseNode.getBeta());
} else if (host_isa_ == cpu::avx512_common) {
eltwise_injector_avx512_common = std::make_shared<jit_uni_eltwise_injector_f32<cpu::avx512_common>>(
host, alg, eltwiseNode.getAlpha(), eltwiseNode.getBeta());
if (host_isa_ == cpu::x64::sse41) {
eltwise_injector_sse42 = std::make_shared<jit_uni_eltwise_injector_f32<cpu::x64::sse41>>(
host, alg, eltwiseNode.getAlpha(), eltwiseNode.getBeta(), 1);
} else if (host_isa_ == cpu::x64::avx2) {
eltwise_injector_avx2 = std::make_shared<jit_uni_eltwise_injector_f32<cpu::x64::avx2>>(
host, alg, eltwiseNode.getAlpha(), eltwiseNode.getBeta(), 1);
} else if (host_isa_ == cpu::x64::avx512_common) {
eltwise_injector_avx512_common = std::make_shared<jit_uni_eltwise_injector_f32<cpu::x64::avx512_common>>(
host, alg, eltwiseNode.getAlpha(), eltwiseNode.getBeta(), 1);
} else {
assert(!"unsupported isa");
}
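
Two API changes meet in this constructor: the SSE ISA enum value is renamed from sse42 to sse41, and jit_uni_eltwise_injector_f32 takes an extra trailing constructor argument (the literal 1 above; reading the v1.6 headers this appears to be the output scale, but treat that as an assumption). A hedged sketch that folds the three branches into one helper:

template <mkldnn::impl::cpu::x64::cpu_isa_t isa>
static std::shared_ptr<mkldnn::impl::cpu::x64::jit_uni_eltwise_injector_f32<isa>>
make_injector(mkldnn::impl::cpu::x64::jit_generator *host,
              mkldnn_alg_kind_t alg, float alpha, float beta) {
    // Trailing argument is the fifth parameter added by the v1.6 injector API.
    return std::make_shared<mkldnn::impl::cpu::x64::jit_uni_eltwise_injector_f32<isa>>(
            host, alg, alpha, beta, 1.f);
}
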
@ -37,15 +38,15 @@ size_t jit_mkldnn_emitter::get_inputs_num() { return 1; }
void jit_mkldnn_emitter::emit(const std::vector<size_t> &in_vec_idxs, const std::vector<size_t> &out_vec_idxs,
const std::vector<size_t> &pool_vec_idxs, const std::vector<size_t> &pool_gpr_idxs) {
if (host_isa_ == cpu::sse42) {
if (host_isa_ == cpu::x64::sse41) {
if (out_vec_idxs[0] != in_vec_idxs[0])
h->uni_vmovups(Xmm(out_vec_idxs[0]), Xmm(in_vec_idxs[0]));
eltwise_injector_sse42->compute_vector(out_vec_idxs[0]);
} else if (host_isa_ == cpu::avx2) {
} else if (host_isa_ == cpu::x64::avx2) {
if (out_vec_idxs[0] != in_vec_idxs[0])
h->uni_vmovups(Ymm(out_vec_idxs[0]), Ymm(in_vec_idxs[0]));
eltwise_injector_avx2->compute_vector(out_vec_idxs[0]);
} else if (host_isa_ == cpu::avx512_common) {
} else if (host_isa_ == cpu::x64::avx512_common) {
if (out_vec_idxs[0] != in_vec_idxs[0])
h->uni_vmovups(Zmm(out_vec_idxs[0]), Zmm(in_vec_idxs[0]));
eltwise_injector_avx512_common->compute_vector(out_vec_idxs[0]);
@ -55,11 +56,11 @@ void jit_mkldnn_emitter::emit(const std::vector<size_t> &in_vec_idxs, const std:
}
void jit_mkldnn_emitter::emit_table() {
if (host_isa_ == cpu::sse42) {
if (host_isa_ == cpu::x64::sse41) {
eltwise_injector_sse42->prepare_table();
} else if (host_isa_ == cpu::avx2) {
} else if (host_isa_ == cpu::x64::avx2) {
eltwise_injector_avx2->prepare_table();
} else if (host_isa_ == cpu::avx512_common) {
} else if (host_isa_ == cpu::x64::avx512_common) {
eltwise_injector_avx512_common->prepare_table();
} else {
assert(!"unsupported isa");

View File

@ -5,15 +5,16 @@
#pragma once
#include "common/emitter.h"
#include "jit_generator.hpp"
#include <cpu/x64/jit_generator.hpp>
#include "mkldnn_node.h"
#include "jit_uni_eltwise.hpp"
#include <cpu/x64/jit_uni_eltwise_injector.hpp>
namespace MKLDNNPlugin {
class jit_mkldnn_emitter : public jit_emitter {
public:
jit_mkldnn_emitter(mkldnn::impl::cpu::jit_generator *host, mkldnn::impl::cpu::cpu_isa_t host_isa, const MKLDNNNode* node,
jit_mkldnn_emitter(mkldnn::impl::cpu::x64::jit_generator *host, mkldnn::impl::cpu::x64::cpu_isa_t host_isa, const MKLDNNNode* node,
InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32);
size_t get_inputs_num() override;
@ -24,9 +25,9 @@ public:
void emit_table() override;
private:
std::shared_ptr<mkldnn::impl::cpu::jit_uni_eltwise_injector_f32<mkldnn::impl::cpu::sse42>> eltwise_injector_sse42;
std::shared_ptr<mkldnn::impl::cpu::jit_uni_eltwise_injector_f32<mkldnn::impl::cpu::avx2>> eltwise_injector_avx2;
std::shared_ptr<mkldnn::impl::cpu::jit_uni_eltwise_injector_f32<mkldnn::impl::cpu::avx512_common>> eltwise_injector_avx512_common;
std::shared_ptr<mkldnn::impl::cpu::x64::jit_uni_eltwise_injector_f32<mkldnn::impl::cpu::x64::sse41>> eltwise_injector_sse42;
std::shared_ptr<mkldnn::impl::cpu::x64::jit_uni_eltwise_injector_f32<mkldnn::impl::cpu::x64::avx2>> eltwise_injector_avx2;
std::shared_ptr<mkldnn::impl::cpu::x64::jit_uni_eltwise_injector_f32<mkldnn::impl::cpu::x64::avx512_common>> eltwise_injector_avx512_common;
};
} // namespace MKLDNNPlugin

View File

@ -14,16 +14,16 @@ MKLDNNBatchNormalizationNode::MKLDNNBatchNormalizationNode(const InferenceEngine
const mkldnn::engine& eng, MKLDNNWeightsSharing::Ptr &cache)
: MKLDNNNode(layer, eng, cache) {
internalBlobDesc.emplace_back([&](primitive_desc_iterator &primitive_desc_it, size_t idx) -> MKLDNNMemoryDesc {
return GetVarianceDesc(primitive_desc_it.fetch());
return GetVarianceDesc(primitive_desc_it);
});
internalBlobDesc.emplace_back([&](primitive_desc_iterator &primitive_desc_it, size_t idx) -> MKLDNNMemoryDesc {
return GetMeanDesc(primitive_desc_it.fetch());
return GetMeanDesc(primitive_desc_it);
});
internalBlobDesc.emplace_back([&](primitive_desc_iterator &primitive_desc_it, size_t idx) -> MKLDNNMemoryDesc {
if (!fusedWithScale())
return MKLDNNMemoryDesc();
return GetScaleShiftWeightsDesc(primitive_desc_it.fetch());
return GetScaleShiftWeightsDesc(primitive_desc_it);
});
}
@ -105,57 +105,29 @@ void MKLDNNBatchNormalizationNode::getSupportedDescriptors() {
}
}
MKLDNNMemoryDesc MKLDNNBatchNormalizationNode::GetVarianceDesc(const memory::primitive_desc &primitive_desc) const {
memory::primitive_desc aprimitive_desc;
mkldnn_primitive_desc_t bndesc = nullptr;
static MKLDNNMemoryDesc get_bn_mdesc_by_index(const mkldnn::primitive_desc_iterator &primitive_desc, int idx) {
mkldnn_batch_normalization_desc_t *p;
error::wrap_c_api(mkldnn_primitive_desc_query(
primitive_desc.get(), mkldnn::convert_to_c(batch_normalization_d), 0, &p),
primitive_desc.get(), mkldnn::convert_to_c(mkldnn::query::batch_normalization_d), 0, &p),
"could not get a batch-normalization descriptor");
const_mkldnn_primitive_desc_t const_bndesc =
(p->flags & use_global_stats) ?
mkldnn_primitive_desc_query_pd(primitive_desc.get(),
mkldnn::convert_to_c(src_pd), 2) :
mkldnn_primitive_desc_query_pd(primitive_desc.get(),
mkldnn::convert_to_c(dst_pd), 2);
error::wrap_c_api(mkldnn_primitive_desc_clone(&bndesc,
const_bndesc),
"could not clone a variance primitive descriptor");
aprimitive_desc.reset(bndesc);
return MKLDNNMemoryDesc(aprimitive_desc.desc());
auto bndesc =
(p->flags & mkldnn::convert_to_c(mkldnn::normalization_flags::use_global_stats)) ?
primitive_desc.src_desc(idx) : primitive_desc.dst_desc(idx);
return MKLDNNMemoryDesc {bndesc};
}
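
The helper above leans on a v1.x convenience: a primitive descriptor exposes its memory descriptors directly via src_desc(idx) and dst_desc(idx), so the old query-pd, clone, wrap sequence collapses into one expression. The idea in isolation (a sketch, names hypothetical):

static mkldnn::memory::desc stat_md(const mkldnn::primitive_desc &pd,
                                    int idx, bool use_global_stats) {
    // With global stats the mean/variance come in as extra sources,
    // otherwise they are produced as extra destinations.
    return use_global_stats ? pd.src_desc(idx) : pd.dst_desc(idx);
}
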
MKLDNNMemoryDesc MKLDNNBatchNormalizationNode::GetMeanDesc(const memory::primitive_desc &primitive_desc) const {
memory::primitive_desc aprimitive_desc;
mkldnn_primitive_desc_t bndesc = nullptr;
mkldnn_batch_normalization_desc_t *p;
error::wrap_c_api(mkldnn_primitive_desc_query(
primitive_desc.get(), mkldnn::convert_to_c(batch_normalization_d), 0, &p),
"could not get a batch-normalization descriptor");
const_mkldnn_primitive_desc_t const_bndesc =
(p->flags & use_global_stats) ?
mkldnn_primitive_desc_query_pd(primitive_desc.get(),
mkldnn::convert_to_c(src_pd), 1) :
mkldnn_primitive_desc_query_pd(primitive_desc.get(),
mkldnn::convert_to_c(dst_pd), 1);
error::wrap_c_api(mkldnn_primitive_desc_clone(&bndesc,
const_bndesc),
"could not clone a mean primitive descriptor");
aprimitive_desc.reset(bndesc);
return MKLDNNMemoryDesc(aprimitive_desc.desc());
MKLDNNMemoryDesc MKLDNNBatchNormalizationNode::GetVarianceDesc(const mkldnn::primitive_desc &primitive_desc) const {
// TODO: rewrite using stat_desc
return get_bn_mdesc_by_index(primitive_desc, 2);
}
MKLDNNMemoryDesc MKLDNNBatchNormalizationNode::GetScaleShiftWeightsDesc(const memory::primitive_desc &primitive_desc) const {
memory::primitive_desc adesc;
mkldnn_primitive_desc_t bndesc = nullptr;
const_mkldnn_primitive_desc_t const_bndesc =
mkldnn_primitive_desc_query_pd(primitive_desc.get(),
mkldnn::convert_to_c(weights_pd), 0);
error::wrap_c_api(mkldnn_primitive_desc_clone(&bndesc,
const_bndesc),
"could not clone a weights primitive descriptor");
adesc.reset(bndesc);
return MKLDNNMemoryDesc(adesc.desc());
MKLDNNMemoryDesc MKLDNNBatchNormalizationNode::GetMeanDesc(const mkldnn::primitive_desc &primitive_desc) const {
return get_bn_mdesc_by_index(primitive_desc, 1);
}
MKLDNNMemoryDesc MKLDNNBatchNormalizationNode::GetScaleShiftWeightsDesc(const mkldnn::primitive_desc &primitive_desc) const {
return MKLDNNMemoryDesc(primitive_desc.weights_desc(0));
}
bool MKLDNNBatchNormalizationNode::created() const {
@ -166,23 +138,28 @@ void MKLDNNBatchNormalizationNode::createPrimitive() {
if (prim)
return;
if (fusedWithScale()) {
auto prim_desc = createPrimitiveDescriptor<batch_normalization_forward::primitive_desc,
batch_normalization_forward::desc>();
prim.reset(new batch_normalization_forward(prim_desc,
getParentEdgeAt(0)->getMemory().GetPrimitive(),
(const primitive::at) internalBlobMemory[1]->GetPrimitive(),
(const primitive::at) internalBlobMemory[0]->GetPrimitive(),
(const primitive::at) internalBlobMemory[2]->GetPrimitive(),
getChildEdgeAt(0)->getMemory().GetPrimitive()));
} else {
auto prim_desc = createPrimitiveDescriptor<batch_normalization_forward::primitive_desc,
batch_normalization_forward::desc>();
prim.reset(new batch_normalization_forward(prim_desc,
getParentEdgeAt(0)->getMemory().GetPrimitive(),
(const primitive::at) internalBlobMemory[1]->GetPrimitive(),
(const primitive::at) internalBlobMemory[0]->GetPrimitive(),
getChildEdgeAt(0)->getMemory().GetPrimitive()));
auto prim_desc = createPrimitiveDescriptor<batch_normalization_forward::primitive_desc,
batch_normalization_forward::desc>();
prim.reset(new batch_normalization_forward(prim_desc));
auto src = getParentEdgesAtPort(0)[0]->getMemoryPtr()->GetPrimitive();
auto dst = getChildEdgesAtPort(0)[0]->getMemoryPtr()->GetPrimitive();
const auto &mean = internalBlobMemory[1]->GetPrimitive();
const auto &var = internalBlobMemory[0]->GetPrimitive();
if (convert_to_c(flag) & dnnl_use_scaleshift) {
const auto &sclshft = internalBlobMemory[2]->GetPrimitive();
primArgs = {{DNNL_ARG_SRC, src},
{DNNL_ARG_MEAN, mean},
{DNNL_ARG_VARIANCE, var},
{DNNL_ARG_SCALE_SHIFT, sclshft},
{DNNL_ARG_DST, dst}};
} else {
primArgs = {{DNNL_ARG_SRC, src},
{DNNL_ARG_MEAN, mean},
{DNNL_ARG_VARIANCE, var},
{DNNL_ARG_DST, dst}};
}
}
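
createPrimitive now follows the oneDNN v1.x execution model: the primitive is constructed from the primitive_desc alone, and memory objects are bound per call through an argument map keyed by DNNL_ARG_* indices. A condensed sketch of that model, assuming src, mean, var and dst are existing mkldnn::memory objects and strm is an mkldnn::stream:

mkldnn::batch_normalization_forward bn(prim_desc);
std::unordered_map<int, mkldnn::memory> args = {
    {DNNL_ARG_SRC, src}, {DNNL_ARG_MEAN, mean},
    {DNNL_ARG_VARIANCE, var}, {DNNL_ARG_DST, dst}};
bn.execute(strm, args);  // memory is bound at execution time, not construction
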
@ -194,15 +171,16 @@ void MKLDNNBatchNormalizationNode::createDescriptor(const std::vector<InferenceE
MKLDNNDims dims = inDesc.getDims();
dims.push_back(1); // H
dims.push_back(1); // W
auto format = memory::nchw;
auto format = memory::format_tag::nchw;
inDesc = MKLDNNMemoryDesc(dims, inDesc.getDataType(), format);
}
unsigned flag = mkldnn_use_global_stats;
flag = normalization_flags::use_global_stats;
if (fusedWithScale())
flag |= mkldnn_use_scaleshift;
flag |= normalization_flags::use_scale_shift;
MKLDNNDescriptor desc(std::shared_ptr<batch_normalization_forward::desc>(
new batch_normalization_forward::desc(prop_kind::forward_scoring, inDesc, eps,
new mkldnn::batch_normalization_forward::desc(prop_kind::forward_scoring, inDesc, eps,
flag)));
descs.push_back(desc);
}
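
createDescriptor shows another v1.x change: normalization flags are now the typed enum class mkldnn::normalization_flags (the library defines the bitwise operators) instead of raw unsigned masks. A minimal sketch, with in_md and eps standing in for the values computed above:

auto flags = mkldnn::normalization_flags::use_global_stats
           | mkldnn::normalization_flags::use_scale_shift;
mkldnn::batch_normalization_forward::desc bn_desc(
        mkldnn::prop_kind::forward_scoring, in_md, eps, flags);
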
@ -237,7 +215,7 @@ void MKLDNNBatchNormalizationNode::initSupportedPrimitiveDescriptors() {
// BN primitive doesn't support strides
for (auto& desc : descs) {
primitive_desc_iterator itpd = desc.createPrimitiveDescriptorIterator(getEngine());
while (itpd.is_not_end()) {
while (static_cast<bool>(itpd)) {
InferenceEngine::LayerConfig config;
config.dynBatchSupport = true;
for (size_t i = 0; i < desc.inputNumbers(); i++) {
@ -248,27 +226,25 @@ void MKLDNNBatchNormalizationNode::initSupportedPrimitiveDescriptors() {
config.inConfs.push_back(dataConfig);
}
std::vector<memory::format> outFormats;
for (size_t i = 0; i < desc.outputNumbers(); i++) {
InferenceEngine::DataConfig dataConfig;
dataConfig.inPlace = canBeInPlace() ? 0 : -1;
dataConfig.constant = false;
dataConfig.desc = getDstMemDesc(itpd, i);
config.outConfs.push_back(dataConfig);
outFormats.emplace_back(static_cast<memory::format>(itpd.dst_primitive_desc().desc().data.format));
}
impl_desc_type impl_type = parse_impl_name(itpd.get_impl_info_str());
impl_desc_type impl_type = parse_impl_name(itpd.impl_info_str());
supportedPrimitiveDescriptors.emplace_back(config, impl_type, outFormats);
itpd++;
supportedPrimitiveDescriptors.emplace_back(config, impl_type);
if (!itpd.next_impl())
break;
}
}
}
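
The loop above adopts the v1.x iterator protocol: a primitive_desc_iterator converts to bool while an implementation is available and advances with next_impl(), replacing the old is_not_end() and operator++ pair. Sketched on its own, in the plugin's helper vocabulary:

auto itpd = desc.createPrimitiveDescriptorIterator(getEngine());
while (static_cast<bool>(itpd)) {
    impl_desc_type impl_type = parse_impl_name(itpd.impl_info_str());
    // ... build and record the LayerConfig for this implementation ...
    if (!itpd.next_impl())
        break;  // no further implementations for this descriptor
}
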
MKLDNNMemoryDesc MKLDNNBatchNormalizationNode::getSrcMemDesc(mkldnn::primitive_desc_iterator &primitive_desc_it,
size_t idx) {
TensorDesc desc = MKLDNNMemoryDesc(primitive_desc_it.src_primitive_desc(idx).desc());
TensorDesc desc = MKLDNNMemoryDesc(primitive_desc_it.src_desc(idx));
if (getParentEdgeAt(0)->getDims().ndims() == 2 && desc.getLayout() == InferenceEngine::Layout::NCHW) {
desc.reshape(getParentEdgeAt(idx)->getDims().ToSizeVector(), InferenceEngine::Layout::NC);
@ -286,7 +262,7 @@ MKLDNNMemoryDesc MKLDNNBatchNormalizationNode::getSrcMemDesc(mkldnn::primitive_d
MKLDNNMemoryDesc MKLDNNBatchNormalizationNode::getDstMemDesc(mkldnn::primitive_desc_iterator &primitive_desc_it,
size_t idx) {
TensorDesc desc = MKLDNNMemoryDesc(primitive_desc_it.dst_primitive_desc(idx).desc());
TensorDesc desc = MKLDNNMemoryDesc(primitive_desc_it.dst_desc(idx));
if (getParentEdgeAt(0)->getDims().ndims() == 2 && desc.getLayout() == InferenceEngine::Layout::NCHW) {
desc.reshape(getParentEdgeAt(idx)->getDims().ToSizeVector(), InferenceEngine::Layout::NC);

View File

@ -25,16 +25,19 @@ public:
const std::vector<InferenceEngine::TensorDesc>& outputDesc) override;
void createPrimitive() override;
bool created() const override;
bool fusedWithScale() const {return fusedWith.size() == 1 && fusedWith[0]->getType() == Eltwise
&& fusedWith[0]->getCnnLayer()->type == "ScaleShift";}
MKLDNNMemoryDesc getSrcMemDesc(mkldnn::primitive_desc_iterator &primitive_desc_it, size_t idx) override;
MKLDNNMemoryDesc getDstMemDesc(mkldnn::primitive_desc_iterator &primitive_desc_it, size_t idx) override;
private:
mkldnn::normalization_flags flag = mkldnn::normalization_flags::none;
float eps = 0.0f;
MKLDNNMemoryDesc GetVarianceDesc(const mkldnn::memory::primitive_desc& primitive_desc) const;
MKLDNNMemoryDesc GetMeanDesc(const mkldnn::memory::primitive_desc& primitive_desc) const;
MKLDNNMemoryDesc GetScaleShiftWeightsDesc(const mkldnn::memory::primitive_desc& primitive_desc) const;
MKLDNNMemoryDesc GetVarianceDesc(const mkldnn::primitive_desc& primitive_desc) const;
MKLDNNMemoryDesc GetMeanDesc(const mkldnn::primitive_desc& primitive_desc) const;
MKLDNNMemoryDesc GetScaleShiftWeightsDesc(const mkldnn::primitive_desc& primitive_desc) const;
};
} // namespace MKLDNNPlugin

View File

@ -12,48 +12,106 @@
namespace MKLDNNPlugin {
struct jit_bin_conv_params {
int mb;
int ngroups;
int ic, oc, ic_padded;
int ih, iw, oh, ow;
int l_pad, t_pad, b_pad;
int kh, kw;
int stride_h, stride_w;
int dilate_h, dilate_w;
bool with_sum;
bool with_dw_conv;
bool with_binarization;
float pad_value;
bool exclude_pad;
int nb_ic, ic_block;
int nb_oc, oc_block;
int nb_oc_blocking;
int ur_w, ur_w_tail;
int typesize_in, typesize_out;
mkldnn::memory::data_type dst_dt;
};
struct jit_dw_conv_params {
int kh;
};
struct jit_bin_conv_call_args {
const void *src;
const void *dst;
const void *filt;
size_t kh_padding;
size_t kw_padding;
size_t oc_work;
size_t t_overflow;
size_t b_overflow;
size_t oc_off;
};
struct jit_uni_bin_conv_kernel {
void (*ker_)(const jit_bin_conv_call_args *);
void operator()(const jit_bin_conv_call_args *args) {
assert(ker_);
ker_(args);
}
explicit jit_uni_bin_conv_kernel(jit_bin_conv_params jcp, jit_dw_conv_params jcp_dw_conv, const mkldnn_primitive_attr &attr) :
ker_(nullptr), jcp_(jcp), jcp_dw_conv_(jcp_dw_conv), attr_(attr) {}
virtual ~jit_uni_bin_conv_kernel() {}
virtual void create_ker() = 0;
jit_bin_conv_params jcp_;
jit_dw_conv_params jcp_dw_conv_;
const mkldnn_primitive_attr &attr_;
};
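
The notable addition in this wrapper is the pure virtual create_ker(): mirroring oneDNN v1.6, code generation is a separate step, so a derived kernel is constructed first and only becomes callable once create_ker() has filled ker_. A hedged usage sketch (jit_avx2_bin_conv_kernel and the pointer values are hypothetical):

std::shared_ptr<jit_uni_bin_conv_kernel> k =
        std::make_shared<jit_avx2_bin_conv_kernel>(jcp, jcp_dw_conv, attr);
k->create_ker();                 // generate the JIT code before first use

jit_bin_conv_call_args args{};
args.src = src_ptr; args.filt = wei_ptr; args.dst = dst_ptr;
(*k)(&args);                     // dispatches through the generated ker_
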
class MKLDNNBinaryConvolutionNode : public MKLDNNNode {
public:
MKLDNNBinaryConvolutionNode(const InferenceEngine::CNNLayerPtr& layer, const mkldnn::engine& eng, MKLDNNWeightsSharing::Ptr &cache);
~MKLDNNBinaryConvolutionNode() override = default;
void getSupportedDescriptors() override;
void createDescriptor(const std::vector<InferenceEngine::TensorDesc>& inputDesc,
const std::vector<InferenceEngine::TensorDesc>& outputDesc) override;
void initDescriptor(const InferenceEngine::LayerConfig& config) override;
void createPrimitive() override;
void initSupportedPrimitiveDescriptors() override;
void execute(mkldnn::stream strm) override;
bool created() const override;
bool canBeInPlace() const override {
return false;
}
void setPostOps(mkldnn::primitive_attr &attr, bool initWeights);
void setPostOps(mkldnn::primitive_attr &attr);
bool canFuse(const MKLDNNNodePtr& node) const;
private:
bool withSum = false;
bool withBinarization = false;
bool withDWConv = false;
bool isDW = false;
bool isMerged = false;
bool isGrouped = false;
size_t group = 1;
float pad_value = 0.f;
std::vector<ptrdiff_t> stride;
std::vector<ptrdiff_t> dilation;
std::vector<ptrdiff_t> paddingL;
std::vector<ptrdiff_t> paddingR;
InferenceEngine::SizeVector weightDims;
InferenceEngine::SizeVector biasesDims;
ptrdiff_t dw_conv_oc = 0;
ptrdiff_t dw_conv_ih = 0;
ptrdiff_t dw_conv_iw = 0;
std::vector<ptrdiff_t> dw_conv_kernel;
std::vector<ptrdiff_t> dw_conv_strides;
mkldnn::memory::data_type dw_conv_in_dt = mkldnn::memory::data_type::data_undef;
std::vector<MKLDNNMemoryPtr> PostOpsIntBlobMemory;
jit_bin_conv_params jcp = {};
jit_dw_conv_params jcp_dw_conv = {};
std::shared_ptr<jit_uni_bin_conv_kernel> bin_conv_kernel = nullptr;
int baseInputsNumber;
mkldnn::primitive_attr attr;
float pad_value = 0.f;
impl_desc_type implType = impl_desc_type::ref;
void executeOptimized(const uint8_t* src, const uint8_t* weights, uint8_t* dst,
const std::vector<size_t>& s_str, const std::vector<size_t>& w_str, const std::vector<size_t>& d_str);
void executeReference(const uint8_t* src, const uint8_t* weights, uint8_t* dst,
const std::vector<size_t>& s_str, const std::vector<size_t>& w_str, const std::vector<size_t>& d_str);
};
} // namespace MKLDNNPlugin

View File

@ -101,10 +101,10 @@ void MKLDNNConcatNode::initSupportedPrimitiveDescriptors() {
InferenceEngine::DataConfig dataConfig;
dataConfig.inPlace = -1;
dataConfig.constant = false;
auto fmt = (inputPrecision == Precision::U8 || inputPrecision == Precision::I8) ? parentEdge->getDims().ndims() == 2 ? memory::format::nc :
parentEdge->getDims().ndims() == 4 ? memory::format::nhwc :
memory::format::ndhwc
: memory::format::any;
auto fmt = (inputPrecision == Precision::U8 || inputPrecision == Precision::I8) ? parentEdge->getDims().ndims() == 2 ? memory::format_tag::nc :
parentEdge->getDims().ndims() == 4 ? memory::format_tag::nhwc :
memory::format_tag::ndhwc
: memory::format_tag::any;
dataConfig.desc = MKLDNNExtensionUtils::getUninitTensorDesc(MKLDNNMemoryDesc(parentEdge->getDims(), inputDataType, fmt));
config.inConfs.push_back(dataConfig);
@ -116,9 +116,9 @@ void MKLDNNConcatNode::initSupportedPrimitiveDescriptors() {
config.outConfs[0].inPlace = -1;
config.outConfs[0].constant = false;
if ((!isMixedPrecision && outputPrecision != Precision::U8 && outputPrecision != Precision::I8) || axis != 1) {
auto fmt = (inputPrecision == Precision::U8 || inputPrecision == Precision::I8) ? dims.ndims() == 2 ? memory::format::nc :
dims.ndims() == 4 ? memory::format::nhwc :
memory::format::ndhwc
auto fmt = (inputPrecision == Precision::U8 || inputPrecision == Precision::I8) ? dims.ndims() == 2 ? memory::format_tag::nc :
dims.ndims() == 4 ? memory::format_tag::nhwc :
memory::format_tag::ndhwc
: MKLDNNMemory::GetPlainFormat(dims);
config.outConfs[0].desc = MKLDNNExtensionUtils::getUninitTensorDesc(MKLDNNMemoryDesc(dims, outputDataType, fmt));
@ -128,25 +128,25 @@ void MKLDNNConcatNode::initSupportedPrimitiveDescriptors() {
if (dims.ndims() == 4) {
if (dims[1] % 8 == 0) {
config.outConfs[0].desc = MKLDNNExtensionUtils::getUninitTensorDesc(
MKLDNNMemoryDesc(dims, outputDataType, mkldnn::memory::nChw8c));
supportedPrimitiveDescriptors.emplace_back(config, impl_desc_type::ref, mkldnn::memory::nChw8c);
MKLDNNMemoryDesc(dims, outputDataType, memory::format_tag::nChw8c));
supportedPrimitiveDescriptors.emplace_back(config, impl_desc_type::ref, memory::format_tag::nChw8c);
if (dims[1] % 16 == 0) {
config.outConfs[0].desc = MKLDNNExtensionUtils::getUninitTensorDesc(
MKLDNNMemoryDesc(dims, outputDataType, mkldnn::memory::nChw16c));
supportedPrimitiveDescriptors.emplace_back(config, impl_desc_type::ref, mkldnn::memory::nChw16c);
MKLDNNMemoryDesc(dims, outputDataType, mkldnn::memory::format_tag::nChw16c));
supportedPrimitiveDescriptors.emplace_back(config, impl_desc_type::ref, mkldnn::memory::format_tag::nChw16c);
}
}
} else if (dims.ndims() == 5) {
if (dims[1] % 8 == 0) {
config.outConfs[0].desc = MKLDNNExtensionUtils::getUninitTensorDesc(
MKLDNNMemoryDesc(dims, outputDataType, mkldnn::memory::nCdhw8c));
supportedPrimitiveDescriptors.emplace_back(config, impl_desc_type::ref, mkldnn::memory::nCdhw8c);
MKLDNNMemoryDesc(dims, outputDataType, mkldnn::memory::format_tag::nCdhw8c));
supportedPrimitiveDescriptors.emplace_back(config, impl_desc_type::ref, mkldnn::memory::format_tag::nCdhw8c);
if (dims[1] % 16 == 0) {
config.outConfs[0].desc = MKLDNNExtensionUtils::getUninitTensorDesc(
MKLDNNMemoryDesc(dims, outputDataType, mkldnn::memory::nCdhw16c));
supportedPrimitiveDescriptors.emplace_back(config, impl_desc_type::ref, mkldnn::memory::nCdhw16c);
MKLDNNMemoryDesc(dims, outputDataType, mkldnn::memory::format_tag::nCdhw16c));
supportedPrimitiveDescriptors.emplace_back(config, impl_desc_type::ref, mkldnn::memory::format_tag::nCdhw16c);
}
}
}
@ -197,7 +197,7 @@ void MKLDNNConcatNode::initSupportedPrimitiveDescriptors() {
{blkDims, order, offset, offsets, strides});
}
supportedPrimitiveDescriptors.emplace_back(config, impl_desc_type::ref, mkldnn::memory::nhwc);
supportedPrimitiveDescriptors.emplace_back(config, impl_desc_type::ref, mkldnn::memory::format_tag::nhwc);
return;
} else if (numOfDim == 5) {
@ -231,7 +231,7 @@ void MKLDNNConcatNode::initSupportedPrimitiveDescriptors() {
{blkDims, order, offset, offsets, strides});
}
supportedPrimitiveDescriptors.emplace_back(config, impl_desc_type::ref, mkldnn::memory::ndhwc);
supportedPrimitiveDescriptors.emplace_back(config, impl_desc_type::ref, mkldnn::memory::format_tag::ndhwc);
return;
}
@ -303,8 +303,8 @@ void MKLDNNConcatNode::initSupportedPrimitiveDescriptors() {
{blkDims, order, offset, offsets, strides});
}
if (canInplace) {
auto dstFormat = numOfDim == 4lu ? sizeS == 8lu ? mkldnn::memory::nChw8c : mkldnn::memory::nChw16c
: sizeS == 8lu ? mkldnn::memory::nCdhw8c : mkldnn::memory::nCdhw16c;
auto dstFormat = numOfDim == 4lu ? sizeS == 8lu ? mkldnn::memory::format_tag::nChw8c : mkldnn::memory::format_tag::nChw16c
: sizeS == 8lu ? mkldnn::memory::format_tag::nCdhw8c : mkldnn::memory::format_tag::nCdhw16c;
supportedPrimitiveDescriptors.emplace_back(config, impl_desc_type::unknown, dstFormat);
}
}
@ -312,9 +312,6 @@ void MKLDNNConcatNode::initSupportedPrimitiveDescriptors() {
}
void MKLDNNConcatNode::selectOptimalPrimitiveDescriptor() {
auto inputDataType = MKLDNNExtensionUtils::IEPrecisionToDataType(inputPrecision);
auto outputDataType = MKLDNNExtensionUtils::IEPrecisionToDataType(outputPrecision);
bool hasUnknown = false;
std::vector<size_t> canSelectPrimitive;
for (size_t i = 0; i < supportedPrimitiveDescriptors.size(); i++) {
@ -382,48 +379,45 @@ void MKLDNNConcatNode::selectOptimalPrimitiveDescriptor() {
canOptimize = false;
}
std::map<mkldnn::memory::format, size_t> formatFrequency;
std::map<PartialBlkDesc, size_t> formatFrequency;
for (size_t i = 0; i < getParentEdges().size(); i++) {
auto parentEdge = getParentEdgeAt(i);
auto parent = parentEdge->getParent();
if (parent->getSelectedPrimitiveDescriptor() == nullptr)
auto parent_pdesc = parent->getSelectedPrimitiveDescriptor();
if (parent_pdesc == nullptr)
continue;
int outputIndex = parentEdge->getOutputNum();
if (outputIndex < 0)
const auto &parent_config = parent_pdesc->getConfig();
int outputIndex = parentEdge->getInputNum();
if (outputIndex < 0 || outputIndex >= parent_config.outConfs.size())
THROW_IE_EXCEPTION << "Cannot find index of output node";
if (outputIndex >= parent->getSelectedPrimitiveDescriptor()->getConfig().outConfs.size())
outputIndex = 0;
auto outDesc = MKLDNNMemoryDesc(parent->getSelectedPrimitiveDescriptor()->getConfig().outConfs[outputIndex].desc);
if (!outDesc)
const auto &port_desc = parent_config.outConfs[outputIndex].desc;
if (port_desc.getLayout() == Layout::ANY)
continue;
if (formatFrequency.find(outDesc.getFormat()) != formatFrequency.end())
formatFrequency[outDesc.getFormat()] += 1;
else
formatFrequency[outDesc.getFormat()] = 1;
auto partial_format_desc = PartialBlkDesc::extractFrom(port_desc);
formatFrequency[partial_format_desc] += 1;
}
for (size_t i = 0; i < getChildEdges().size(); i++) {
auto childEdge = getChildEdgeAt(i);
auto child = childEdge->getChild();
if (child->getSelectedPrimitiveDescriptor() == nullptr)
const auto *prim_desc = child->getSelectedPrimitiveDescriptor();
if (prim_desc == nullptr)
continue;
const auto &config = prim_desc->getConfig();
int inputIndex = childEdge->getOutputNum();
if (inputIndex < 0)
if (inputIndex < 0 || inputIndex >= config.inConfs.size())
THROW_IE_EXCEPTION << "Cannot find index of output node";
if (inputIndex >= child->getSelectedPrimitiveDescriptor()->getConfig().inConfs.size())
inputIndex = 0;
auto outDesc = MKLDNNMemoryDesc(child->getSelectedPrimitiveDescriptor()->getConfig().inConfs[inputIndex].desc);
if (!outDesc)
const auto &port_desc = config.inConfs[inputIndex].desc;
if (port_desc.getLayout() == Layout::ANY)
continue;
if (formatFrequency.find(outDesc.getFormat()) != formatFrequency.end())
formatFrequency[outDesc.getFormat()] += 1;
else
formatFrequency[outDesc.getFormat()] = 1;
auto partial_format_desc = PartialBlkDesc::extractFrom(port_desc);
formatFrequency[partial_format_desc] += 1;
}
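
selectOptimalPrimitiveDescriptor now votes on PartialBlkDesc values instead of concrete mkldnn memory formats: every neighbour whose selected config exposes a real layout contributes one vote, and std::map's value-initialising operator[] removes the explicit find/insert dance of the old code. The counting idea in isolation (neighbour_descs is a hypothetical container of port descriptors):

std::map<PartialBlkDesc, size_t> freq;
for (const InferenceEngine::TensorDesc &d : neighbour_descs) {
    if (d.getLayout() == InferenceEngine::Layout::ANY)
        continue;                           // undefined layouts do not vote
    freq[PartialBlkDesc::extractFrom(d)] += 1;  // operator[] zero-initialises
}
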
size_t maxCount = 0;
mkldnn::memory::format convertTo = MKLDNNMemory::GetPlainFormat(getChildEdgeAt(0)->getDims());
auto convertTo = PartialBlkDesc::makePlain(getChildEdgeAt(0)->getDims().ToSizeVector());
for (auto &it : formatFrequency) {
if (it.second > maxCount) {
maxCount = it.second;
@ -431,15 +425,15 @@ void MKLDNNConcatNode::selectOptimalPrimitiveDescriptor() {
}
}
if (canOptimize && MKLDNNMemoryDesc(getChildEdgeAt(0)->getDims(), outputDataType, convertTo).blocksExtended())
convertTo = MKLDNNMemory::GetPlainFormat(getChildEdgeAt(0)->getDims());
if (canOptimize && convertTo.isAutoExtendedWith(getChildEdgeAt(0)->getDims().ToSizeVector()))
convertTo = PartialBlkDesc::makePlain(getChildEdgeAt(0)->getDims().ToSizeVector());
for (size_t i = 0; canOptimize && i < getParentEdges().size(); i++) {
if (MKLDNNMemoryDesc(getParentEdgeAt(i)->getDims(), inputDataType, convertTo).blocksExtended())
convertTo = MKLDNNMemory::GetPlainFormat(getChildEdgeAt(0)->getDims());
if (convertTo.isAutoExtendedWith(getParentEdgeAt(i)->getDims().ToSizeVector()))
convertTo = PartialBlkDesc::makePlain(getChildEdgeAt(0)->getDims().ToSizeVector());
}
for (auto supportedPdIndex : canSelectPrimitive) {
if (MKLDNNMemoryDesc(supportedPrimitiveDescriptors[supportedPdIndex].getConfig().inConfs[0].desc).getFormat() == convertTo) {
if (PartialBlkDesc::extractFrom(supportedPrimitiveDescriptors[supportedPdIndex].getConfig().inConfs[0].desc) == convertTo) {
selectPrimitiveDescriptorByIndex(static_cast<int>(supportedPdIndex));
return;
}
@ -449,10 +443,10 @@ void MKLDNNConcatNode::selectOptimalPrimitiveDescriptor() {
auto &primDescInfo = supportedPrimitiveDescriptors[i];
if (primDescInfo.getImplementationType() == impl_desc_type::unknown)
continue;
if (convertTo == MKLDNNMemoryDesc(supportedPrimitiveDescriptors[i].getConfig().outConfs[0].desc).getFormat()) {
if (convertTo == PartialBlkDesc::extractFrom(supportedPrimitiveDescriptors[i].getConfig().outConfs[0].desc)) {
size_t num = 0;
for (num = 0; num < getParentEdges().size(); num++) {
if (MKLDNNMemoryDesc(getParentEdgeAt(num)->getDims(), inputDataType, convertTo).blocksExtended())
if (convertTo.isAutoExtendedWith(getParentEdgeAt(num)->getDims().ToSizeVector()))
break;
}
if (num == getParentEdges().size()) {
@ -482,8 +476,7 @@ void MKLDNNConcatNode::createPrimitive() {
if (getSelectedPrimitiveDescriptor() == nullptr)
THROW_IE_EXCEPTION << "Preferable primitive descriptor is not set.";
std::vector<memory::primitive_desc> srcs_pd;
std::vector<primitive::at> srcs_p;
std::vector<memory::desc> srcs_d;
for (size_t i = 0; i < getParentEdges().size(); i++) {
auto& srcMemPtr = getParentEdgeAt(i)->getMemoryPtr();
@ -499,20 +492,18 @@ void MKLDNNConcatNode::createPrimitive() {
desc.data.dims[j] = dims[j];
}
srcs_pd.emplace_back(desc, srcMemPtr->GetPrimitiveDescriptor().get_engine());
srcs_p.emplace_back(srcMemPtr->GetPrimitive());
srcs_d.emplace_back(desc);
}
auto desc = getChildEdgeAt(0)->getMemory().GetDescriptor();
auto dims = getChildEdgeAt(0)->getDims();
for (size_t i = 0; i < dims.ndims(); i++) {
desc.data.dims[i] = dims[i];
desc.data.layout_desc.blocking.padding_dims[i] = dims[i];
desc.data.padded_dims[i] = dims[i];
}
auto primitive_desc = concat::primitive_desc(desc, static_cast<int>(axis), srcs_pd);
prim.reset(new concat(primitive_desc, srcs_p, getChildEdgeAt(0)->getMemory().GetPrimitive()));
auto primitive_desc = concat::primitive_desc(desc, static_cast<int>(axis), srcs_d, getEngine());
prim.reset(new concat(primitive_desc));
}
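
Concat construction follows the same v1.x pattern: the primitive_desc is built from plain memory::desc objects plus an engine (the old vector of memory::primitive_desc is gone), and the primitive itself carries no memory arguments. A minimal sketch with hypothetical descriptors:

std::vector<mkldnn::memory::desc> srcs_d = {src0_md, src1_md};
auto cpd = mkldnn::concat::primitive_desc(dst_md, /*concat axis*/ 1, srcs_d, eng);
mkldnn::concat cprim(cpd);  // sources are bound later, at execute() time
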
size_t MKLDNNConcatNode::inverseOrder(const SizeVector& order, size_t axis) {
@ -617,14 +608,13 @@ void MKLDNNConcatNode::execute(mkldnn::stream strm) {
const MKLDNNMemory& dst_memory = getChildEdgeAt(0)->getMemory();
const mkldnn::memory::data_type data_type = dst_memory.GetDataType();
const size_t num_src = getParentEdges().size();
const bool isInt8 = (data_type == mkldnn_s8 || data_type == mkldnn_u8);
if (isInt8) {
uint8_t* dst_ptr = reinterpret_cast<uint8_t*>(dst_memory.GetData());
const size_t num_src = getParentEdges().size();
std::vector<size_t> channels;
size_t channels_size = 0;
std::vector<const uint8_t*> src_ptrs;
@ -649,7 +639,11 @@ void MKLDNNConcatNode::execute(mkldnn::stream strm) {
}
});
} else {
MKLDNNNode::execute(strm);
std::unordered_map<int, memory> mem_args {{DNNL_ARG_DST, dst_memory.GetPrimitive()}};
for (int i = 0; i < num_src; i++)
mem_args[DNNL_ARG_MULTIPLE_SRC + i] = getParentEdgeAt(i)->getMemory().GetPrimitive();
(*prim).execute(strm, mem_args);
}
}
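
The non-int8 path no longer delegates to MKLDNNNode::execute; it binds each source at DNNL_ARG_MULTIPLE_SRC + i and the output at DNNL_ARG_DST, then runs the stored primitive. The binding pattern on its own (dst_mem and src_mems are hypothetical):

std::unordered_map<int, mkldnn::memory> args{{DNNL_ARG_DST, dst_mem}};
for (int i = 0; i < num_src; ++i)
    args[DNNL_ARG_MULTIPLE_SRC + i] = src_mems[i];
concat_prim.execute(strm, args);
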

View File

@ -5,7 +5,6 @@
#include "mkldnn_conv_node.h"
#include "mkldnn_reorder_node.h"
#include "mkldnn_input_node.h"
#include "desc_iterator.hpp"
#include "mkldnn_eltwise_node.h"
#include "mkldnn_quantize_node.h"
#include "mkldnn_pooling_node.h"
@ -16,6 +15,7 @@
#include <mkldnn_types.h>
#include <mkldnn_extension_utils.h>
#include <legacy/ie_layers_internal.hpp>
#include <utils/general_utils.h>
using namespace mkldnn;
using namespace MKLDNNPlugin;
@ -23,15 +23,15 @@ using namespace InferenceEngine;
MKLDNNConvolutionNode::MKLDNNConvolutionNode(const InferenceEngine::CNNLayerPtr& layer, const mkldnn::engine& eng, MKLDNNWeightsSharing::Ptr &cache)
: MKLDNNNode(layer, eng, cache), withBiases(false), withSum(false), withDWConv(false), isDW(false), isMerged(false),
isGrouped(false), dw_conv_oc(0), dw_conv_ih(0), dw_conv_iw(0), dw_conv_in_dt(memory::data_type::data_undef),
isGrouped(false), dw_conv_oc(0), dw_conv_ih(0), dw_conv_iw(0), dw_conv_in_dt(memory::data_type::undef),
groupNum(1lu), baseInputsNumber(1), eltwisePrecision(Precision::FP32) {
internalBlobDesc.emplace_back([&](primitive_desc_iterator &primitive_desc_it, size_t idx) -> MKLDNNMemoryDesc {
return MKLDNNMemoryDesc(primitive_desc_it.weights_primitive_desc(0).desc());
return MKLDNNMemoryDesc(primitive_desc_it.weights_desc(0));
});
internalBlobDesc.emplace_back([&](primitive_desc_iterator &primitive_desc_it, size_t idx) -> MKLDNNMemoryDesc {
if (!withBiases)
return MKLDNNMemoryDesc();
return MKLDNNMemoryDesc(primitive_desc_it.weights_primitive_desc(1).desc());
return MKLDNNMemoryDesc(primitive_desc_it.weights_desc(1));
});
auto ws = layer->blobs.find("w-scale");
@ -74,13 +74,13 @@ bool MKLDNNConvolutionNode::canBeExecutedInInt8() {
if (baseInputsNumber > 1) {
auto inputDataType = precisionToDataType(getCnnLayer()->insData[0].lock()->getPrecision());
if (!inputZeroPoints.empty())
inputDataType = memory::u8;
inputDataType = memory::data_type::u8;
auto weightsDataType = precisionToDataType(Precision::FP32);
if (baseInputsNumber > 1) {
weightsDataType = precisionToDataType(getCnnLayer()->insData[1].lock()->getPrecision());
if (!weightsZeroPoints.empty())
weightsDataType = memory::s8;
weightsDataType = memory::data_type::s8;
}
return (inputDataType == mkldnn_s8 || inputDataType == mkldnn_u8) && weightsDataType == mkldnn_s8;
@ -125,7 +125,7 @@ void MKLDNNConvolutionNode::getSupportedDescriptors() {
auto inputDataType = precisionToDataType(getCnnLayer()->insData[0].lock()->getPrecision());
if (!inputZeroPoints.empty())
inputDataType = memory::u8;
inputDataType = memory::data_type::u8;
auto outputDataType = precisionToDataType(getCnnLayer()->outData[0]->getPrecision());
eltwisePrecision = MKLDNNExtensionUtils::DataTypeToIEPrecision(outputDataType);
@ -140,14 +140,14 @@ void MKLDNNConvolutionNode::getSupportedDescriptors() {
// We need to make sure that convolution output and second input of fused Eltwise operation
// have equal precision sizes since they use the same physical memory. In case precisions are different we upscale to FP32.
if (outputDataType != memory::f32 && outputDataType != memory::bf16 && withSum) {
if (outputDataType != memory::data_type::f32 && outputDataType != memory::data_type::bf16 && withSum) {
for (int i = 0; i < fusedWith.size(); i++) {
auto *eltwiseNode = dynamic_cast<MKLDNNEltwiseNode *>(fusedWith[i].get());
if (eltwiseNode && eltwiseNode->isSum()) {
eltwisePrecision = fusedEltwisePrecision(eltwiseNode, i);
if (MKLDNNExtensionUtils::DataTypeToIEPrecision(outputDataType).size() != eltwisePrecision.size()) {
eltwisePrecision = Precision::FP32;
outputDataType = memory::f32;
outputDataType = memory::data_type::f32;
}
break;
}
@ -260,7 +260,7 @@ void MKLDNNConvolutionNode::getSupportedDescriptors() {
dw_conv_in_dt = precisionToDataType(fusedWith[i - 1].get()->getCnnLayer()->outData[0]->getPrecision());
}
} else {
dw_conv_in_dt = memory::f32;
dw_conv_in_dt = memory::data_type::f32;
}
for (int j = 0; j < paddingR.size(); j++) {
@ -279,15 +279,15 @@ void MKLDNNConvolutionNode::getSupportedDescriptors() {
MKLDNNMemoryDesc in_candidate, out_candidate;
if (canBeExecutedInInt8()) {
in_candidate = MKLDNNMemoryDesc(getParentEdgeAt(0)->getDims(), inputDataType,
getParentEdgeAt(0)->getDims().ndims() == 5 ? memory::ndhwc : memory::nhwc);
getParentEdgeAt(0)->getDims().ndims() == 5 ? memory::format_tag::ndhwc : memory::format_tag::nhwc);
out_candidate = MKLDNNMemoryDesc(getChildEdgeAt(0)->getDims(), outputDataType,
getParentEdgeAt(0)->getDims().ndims() == 5 ? memory::ndhwc : memory::nhwc);
getParentEdgeAt(0)->getDims().ndims() == 5 ? memory::format_tag::ndhwc : memory::format_tag::nhwc);
createDescriptor({in_candidate}, {out_candidate});
} else {
inputDataType = (convLayer->input()->getPrecision() == Precision::BF16
&& !(isGrouped && getParentEdgeAt(0)->getDims().ndims() == 5)) ? memory::bf16 : memory::f32;
&& !(isGrouped && getParentEdgeAt(0)->getDims().ndims() == 5)) ? memory::data_type::bf16 : memory::data_type::f32;
outputDataType = (convLayer->outData[0]->getPrecision() == Precision::BF16
&& !(isGrouped && getParentEdgeAt(0)->getDims().ndims() == 5)) ? memory::bf16 : memory::f32;
&& !(isGrouped && getParentEdgeAt(0)->getDims().ndims() == 5)) ? memory::data_type::bf16 : memory::data_type::f32;
eltwisePrecision = Precision::FP32;
for (int i = 0; i < fusedWith.size(); i++) {
auto *eltwiseNode = dynamic_cast<MKLDNNEltwiseNode *>(fusedWith[i].get());
@ -300,61 +300,69 @@ void MKLDNNConvolutionNode::getSupportedDescriptors() {
// for input of inplace tensor precision) to FP32. This will add reorder for that in-place tensor
// before the fused convolution. This behaviour might be more correct regarding the expected markup
// of the graph but performance of first and second approaches might be different. Need to verify
outputDataType = eltwisePrecision == Precision::BF16 ? memory::bf16 : memory::f32;
outputDataType = eltwisePrecision == Precision::BF16 ? memory::data_type::bf16 : memory::data_type::f32;
}
}
// correction for cases of FP32 input - we do not have FP32 convolution supported BF16 output
if (inputDataType == memory::f32
&& (outputDataType == memory::bf16 || eltwisePrecision == Precision::BF16)) {
outputDataType = memory::f32;
if (inputDataType == memory::data_type::f32
&& (outputDataType == memory::data_type::bf16 || eltwisePrecision == Precision::BF16)) {
outputDataType = memory::data_type::f32;
eltwisePrecision = Precision::FP32;
}
Layout layout = convLayer->input()->getLayout();
if (layout == NCHW || layout == NHWC) {
if (IC == 3 || IC == 1) {
in_candidate = MKLDNNMemoryDesc(getParentEdgeAt(0)->getDims(), inputDataType,
layout == NCHW ? memory::nchw : memory::nhwc);
out_candidate = MKLDNNMemoryDesc(getChildEdgeAt(0)->getDims(), outputDataType, memory::nChw16c);
if (IC == 1 && groupOC == 1) {
in_candidate = MKLDNNMemoryDesc(getParentEdgeAt(0)->getDims(), inputDataType, memory::format_tag::nchw);
out_candidate = MKLDNNMemoryDesc(getChildEdgeAt(0)->getDims(), outputDataType, memory::format_tag::nchw);
createDescriptor({in_candidate}, {out_candidate});
out_candidate = MKLDNNMemoryDesc(getChildEdgeAt(0)->getDims(), outputDataType, memory::nChw8c);
} else if (IC == 3 || IC == 1) {
in_candidate = MKLDNNMemoryDesc(getParentEdgeAt(0)->getDims(), inputDataType,
layout == NCHW ? memory::format_tag::nchw : memory::format_tag::nhwc);
out_candidate = MKLDNNMemoryDesc(getChildEdgeAt(0)->getDims(), outputDataType, memory::format_tag::nChw16c);
createDescriptor({in_candidate}, {out_candidate});
out_candidate = MKLDNNMemoryDesc(getChildEdgeAt(0)->getDims(), outputDataType, memory::format_tag::nChw8c);
createDescriptor({in_candidate}, {out_candidate});
} else {
in_candidate = MKLDNNMemoryDesc(getParentEdgeAt(0)->getDims(), inputDataType, memory::nChw16c);
out_candidate = MKLDNNMemoryDesc(getChildEdgeAt(0)->getDims(), outputDataType, memory::nChw16c);
in_candidate = MKLDNNMemoryDesc(getParentEdgeAt(0)->getDims(), inputDataType, memory::format_tag::nChw16c);
out_candidate = MKLDNNMemoryDesc(getChildEdgeAt(0)->getDims(), outputDataType, memory::format_tag::nChw16c);
createDescriptor({in_candidate}, {out_candidate});
in_candidate = MKLDNNMemoryDesc(getParentEdgeAt(0)->getDims(), inputDataType, memory::nChw8c);
out_candidate = MKLDNNMemoryDesc(getChildEdgeAt(0)->getDims(), outputDataType, memory::nChw8c);
in_candidate = MKLDNNMemoryDesc(getParentEdgeAt(0)->getDims(), inputDataType, memory::format_tag::nChw8c);
out_candidate = MKLDNNMemoryDesc(getChildEdgeAt(0)->getDims(), outputDataType, memory::format_tag::nChw8c);
createDescriptor({in_candidate}, {out_candidate});
}
in_candidate = MKLDNNMemoryDesc(getParentEdgeAt(0)->getDims(), inputDataType,
layout == NCHW ? memory::nchw : memory::nhwc);
layout == NCHW ? memory::format_tag::nchw : memory::format_tag::nhwc);
out_candidate = MKLDNNMemoryDesc(getChildEdgeAt(0)->getDims(), outputDataType,
layout == NCHW ? memory::nchw : memory::nhwc);
layout == NCHW ? memory::format_tag::nchw : memory::format_tag::nhwc);
createDescriptor({in_candidate}, {out_candidate});
} else if (layout == NCDHW || layout == NDHWC) {
in_candidate = MKLDNNMemoryDesc(getParentEdgeAt(0)->getDims(), inputDataType,
layout == NCDHW ? memory::ncdhw : memory::ndhwc);
if (IC == 3 || IC == 1) {
out_candidate = MKLDNNMemoryDesc(getChildEdgeAt(0)->getDims(), outputDataType, memory::nCdhw16c);
if (IC == 1 && groupOC == 1) {
in_candidate = MKLDNNMemoryDesc(getParentEdgeAt(0)->getDims(), inputDataType, memory::format_tag::ncdhw);
out_candidate = MKLDNNMemoryDesc(getChildEdgeAt(0)->getDims(), outputDataType, memory::format_tag::ncdhw);
createDescriptor({in_candidate}, {out_candidate});
out_candidate = MKLDNNMemoryDesc(getChildEdgeAt(0)->getDims(), outputDataType, memory::nCdhw8c);
} else if (IC == 3 || IC == 1) {
in_candidate = MKLDNNMemoryDesc(getParentEdgeAt(0)->getDims(), inputDataType,
layout == NCDHW ? memory::format_tag::ncdhw : memory::format_tag::ndhwc);
out_candidate = MKLDNNMemoryDesc(getChildEdgeAt(0)->getDims(), outputDataType, memory::format_tag::nCdhw16c);
createDescriptor({in_candidate}, {out_candidate});
out_candidate = MKLDNNMemoryDesc(getChildEdgeAt(0)->getDims(), outputDataType, memory::format_tag::nCdhw8c);
createDescriptor({in_candidate}, {out_candidate});
} else {
in_candidate = MKLDNNMemoryDesc(getParentEdgeAt(0)->getDims(), inputDataType, memory::nCdhw16c);
out_candidate = MKLDNNMemoryDesc(getChildEdgeAt(0)->getDims(), outputDataType, memory::nCdhw16c);
in_candidate = MKLDNNMemoryDesc(getParentEdgeAt(0)->getDims(), inputDataType, memory::format_tag::nCdhw16c);
out_candidate = MKLDNNMemoryDesc(getChildEdgeAt(0)->getDims(), outputDataType, memory::format_tag::nCdhw16c);
createDescriptor({in_candidate}, {out_candidate});
in_candidate = MKLDNNMemoryDesc(getParentEdgeAt(0)->getDims(), inputDataType, memory::nCdhw8c);
out_candidate = MKLDNNMemoryDesc(getChildEdgeAt(0)->getDims(), outputDataType, memory::nCdhw8c);
in_candidate = MKLDNNMemoryDesc(getParentEdgeAt(0)->getDims(), inputDataType, memory::format_tag::nCdhw8c);
out_candidate = MKLDNNMemoryDesc(getChildEdgeAt(0)->getDims(), outputDataType, memory::format_tag::nCdhw8c);
createDescriptor({in_candidate}, {out_candidate});
}
in_candidate = MKLDNNMemoryDesc(getParentEdgeAt(0)->getDims(), inputDataType,
layout == NCDHW ? memory::ncdhw : memory::ndhwc);
layout == NCDHW ? memory::format_tag::ncdhw : memory::format_tag::ndhwc);
out_candidate = MKLDNNMemoryDesc(getChildEdgeAt(0)->getDims(), outputDataType,
layout == NCDHW ? memory::ncdhw : memory::ndhwc);
layout == NCDHW ? memory::format_tag::ncdhw : memory::format_tag::ndhwc);
createDescriptor({in_candidate}, {out_candidate});
}
}
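
Throughout getSupportedDescriptors the bare memory::format enum becomes memory::format_tag, and a dedicated plain-layout branch is added for the IC == 1 && groupOC == 1 case. The tag choice itself reduces to a small decision; a sketch with ndims standing in for the input rank:

using tag = mkldnn::memory::format_tag;
const bool is5D = (ndims == 5);
tag plain     = is5D ? tag::ncdhw    : tag::nchw;
tag blocked8  = is5D ? tag::nCdhw8c  : tag::nChw8c;
tag blocked16 = is5D ? tag::nCdhw16c : tag::nChw16c;
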
@ -370,7 +378,7 @@ void MKLDNNConvolutionNode::setPostOps(mkldnn::primitive_attr &attr, bool initWe
auto* eltwiseNode = dynamic_cast<MKLDNNEltwiseNode *>(node.get());
if (eltwiseNode && eltwiseNode->isSum()) {
ops.append_sum(1.0, mkldnn::memory::convert_to_c(precisionToDataType(eltwisePrecision)));
ops.append_sum(1.0, precisionToDataType(eltwisePrecision));
continue;
}
@ -396,43 +404,46 @@ void MKLDNNConvolutionNode::setPostOps(mkldnn::primitive_attr &attr, bool initWe
PostOpsIntBlobMemory.push_back(MKLDNNMemoryPtr(new MKLDNNMemory(getEngine())));
MKLDNNDims dwWeightsDims({dw_conv_oc, (ptrdiff_t)1, (ptrdiff_t)1, dw_conv_kernel[Y_AXIS], dw_conv_kernel[X_AXIS]});
PostOpsIntBlobMemory[blob_idx]->Create(dwWeightsDims, weightsPrc, memory::format::Goihw8g);
PostOpsIntBlobMemory[blob_idx]->Create(dwWeightsDims, weightsPrc, memory::format_tag::Goihw8g);
PostOpsIntBlobMemory[blob_idx]->FillZero();
Blob::Ptr weights = convLayer->blobs.find("weights")->second;
Blob::Ptr biases = convLayer->blobs.find("biases")->second;
PostOpsIntBlobMemory[blob_idx]->SetData(weightsPrc, memory::goihw, weights->buffer(),
PostOpsIntBlobMemory[blob_idx]->SetData(weightsPrc, memory::format_tag::goihw, weights->buffer(),
dwWeightsDims.size() * MKLDNNExtensionUtils::sizeOfDataType(weightsPrc));
PostOpsIntBlobMemory.push_back(MKLDNNMemoryPtr(new MKLDNNMemory(getEngine())));
MKLDNNDims dwBiasesDims({dw_conv_oc});
PostOpsIntBlobMemory[blob_idx + 1]->Create(dwBiasesDims, biasPrc, memory::format::x);
PostOpsIntBlobMemory[blob_idx + 1]->Create(dwBiasesDims, biasPrc, memory::format_tag::x);
PostOpsIntBlobMemory[blob_idx + 1]->FillZero();
PostOpsIntBlobMemory[blob_idx + 1]->SetData(biasPrc, memory::x, biases->buffer(),
PostOpsIntBlobMemory[blob_idx + 1]->SetData(biasPrc, memory::format_tag::x, biases->buffer(),
dwBiasesDims.size() * MKLDNNExtensionUtils::sizeOfDataType(biasPrc));
ops.append_dw_conv(dw_conv_ih, dw_conv_iw, dw_conv_kernel[Y_AXIS], dw_conv_kernel[X_AXIS],
dw_conv_strides[Y_AXIS], dw_conv_strides[X_AXIS],
mkldnn::memory::convert_to_c(dw_conv_in_dt),
(const float *) PostOpsIntBlobMemory[blob_idx]->GetData(),
(const float *) PostOpsIntBlobMemory[blob_idx + 1]->GetData());
// TODO: rewrite to use append_dw_k3s2p1
// ops.append_dw_conv(dw_conv_ih, dw_conv_iw, dw_conv_kernel[Y_AXIS], dw_conv_kernel[X_AXIS],
// dw_conv_strides[Y_AXIS], dw_conv_strides[X_AXIS],
// mkldnn::memory::convert_to_c(dw_conv_in_dt),
// (const float *) PostOpsIntBlobMemory[blob_idx]->GetData(),
// (const float *) PostOpsIntBlobMemory[blob_idx + 1]->GetData());
blob_idx += 2;
} else {
ops.append_dw_conv(dw_conv_ih, dw_conv_iw, dw_conv_kernel[Y_AXIS], dw_conv_kernel[X_AXIS],
dw_conv_strides[Y_AXIS], dw_conv_strides[X_AXIS],
mkldnn::memory::convert_to_c(dw_conv_in_dt),
static_cast<float *>(getParentEdgeAt(
baseInputsNumber + 0)->getMemory().GetData()),
static_cast<float *>(getParentEdgeAt(
baseInputsNumber + 1)->getMemory().GetData()));
// TODO: rewrite to use append_dw_k3s2p1
// ops.append_dw_conv(dw_conv_ih, dw_conv_iw, dw_conv_kernel[Y_AXIS], dw_conv_kernel[X_AXIS],
// dw_conv_strides[Y_AXIS], dw_conv_strides[X_AXIS],
// mkldnn::memory::convert_to_c(dw_conv_in_dt),
// static_cast<float *>(getParentEdgeAt(
// baseInputsNumber + 0)->getMemory().GetData()),
// static_cast<float *>(getParentEdgeAt(
// baseInputsNumber + 1)->getMemory().GetData()));
}
} else {
ops.append_dw_conv(dw_conv_ih, dw_conv_iw, dw_conv_kernel[Y_AXIS], dw_conv_kernel[X_AXIS],
dw_conv_strides[Y_AXIS], dw_conv_strides[X_AXIS],
mkldnn::memory::convert_to_c(dw_conv_in_dt),
nullptr,
nullptr);
// TODO: rewrite to use append_dw_k3s2p1
// ops.append_dw_conv(dw_conv_ih, dw_conv_iw, dw_conv_kernel[Y_AXIS], dw_conv_kernel[X_AXIS],
// dw_conv_strides[Y_AXIS], dw_conv_strides[X_AXIS],
// mkldnn::memory::convert_to_c(dw_conv_in_dt),
// nullptr,
// nullptr);
}
if (convolutionNode->wScale != nullptr) {
@ -458,24 +469,26 @@ void MKLDNNConvolutionNode::setPostOps(mkldnn::primitive_attr &attr, bool initWe
MKLDNNDims oScaleDims({static_cast<ptrdiff_t>(rnd_up(biasesDims[0], 16))});
PostOpsIntBlobMemory.push_back(MKLDNNMemoryPtr(new MKLDNNMemory(getEngine())));
PostOpsIntBlobMemory[blob_idx]->Create(oScaleDims, memory::data_type::f32, memory::format::x);
PostOpsIntBlobMemory[blob_idx]->Create(oScaleDims, memory::data_type::f32, memory::format_tag::x);
PostOpsIntBlobMemory[blob_idx]->FillZero();
PostOpsIntBlobMemory[blob_idx]->SetData(memory::data_type::f32, memory::x, &oScaleDataVector[0],
PostOpsIntBlobMemory[blob_idx]->SetData(memory::data_type::f32, memory::format_tag::x, &oScaleDataVector[0],
oScaleDataVector.size() * MKLDNNExtensionUtils::sizeOfDataType(memory::data_type::f32));
PostOpsIntBlobMemory.push_back(MKLDNNMemoryPtr(new MKLDNNMemory(getEngine())));
PostOpsIntBlobMemory[blob_idx + 1]->Create(oScaleDims, memory::data_type::f32, memory::format::x);
PostOpsIntBlobMemory[blob_idx + 1]->Create(oScaleDims, memory::data_type::f32, memory::format_tag::x);
PostOpsIntBlobMemory[blob_idx + 1]->FillZero();
PostOpsIntBlobMemory[blob_idx + 1]->SetData(memory::data_type::f32, memory::x, &oShiftDataVector[0],
PostOpsIntBlobMemory[blob_idx + 1]->SetData(memory::data_type::f32, memory::format_tag::x, &oShiftDataVector[0],
oShiftDataVector.size() * MKLDNNExtensionUtils::sizeOfDataType(memory::data_type::f32));
ops.append_depthwise(depthwise_scale_shift,
ops.append_depthwise(mkldnn::algorithm::depthwise_scale_shift,
(const float *)PostOpsIntBlobMemory[blob_idx]->GetData(),
(const float *)PostOpsIntBlobMemory[blob_idx + 1]->GetData());
blob_idx += 2;
}
THROW_IE_EXCEPTION << "append_dw_conv is not ported";
continue;
}
@ -499,7 +512,7 @@ void MKLDNNConvolutionNode::initSupportedPrimitiveDescriptors() {
if (containJitImpl && isPossibleToSkipInitConfig(desc))
continue;
auto itpd = desc.createPrimitiveDescriptorIterator(getEngine(), attr);
while (itpd.is_not_end()) {
while (static_cast<bool>(itpd)) {
InferenceEngine::LayerConfig config;
config.dynBatchSupport = true;
for (size_t i = 0; i < descInputNumbers(desc); i++) {
@ -522,14 +535,13 @@ void MKLDNNConvolutionNode::initSupportedPrimitiveDescriptors() {
InferenceEngine::DataConfig dataConfig;
dataConfig.inPlace = -1;
dataConfig.constant = false;
dataConfig.desc = MKLDNNMemoryDesc(dwWeightsDims, weightsPrc, memory::format::Goihw8g);
dataConfig.desc = MKLDNNMemoryDesc(dwWeightsDims, weightsPrc, memory::format_tag::Goihw8g);
config.inConfs.push_back(dataConfig);
dataConfig.desc = MKLDNNMemoryDesc(dwBiasesDims, biasPrc, memory::format::x);
dataConfig.desc = MKLDNNMemoryDesc(dwBiasesDims, biasPrc, memory::format_tag::x);
config.inConfs.push_back(dataConfig);
}
std::vector<memory::format> outFormats;
for (size_t i = 0; i < descOutputNumbers(desc); i++) {
InferenceEngine::DataConfig dataConfig;
if (withSum) {
@ -547,15 +559,14 @@ void MKLDNNConvolutionNode::initSupportedPrimitiveDescriptors() {
dataConfig.desc.setPrecision(eltwisePrecision);
config.inConfs.push_back(dataConfig);
}
outFormats.emplace_back(static_cast<memory::format>(itpd.dst_primitive_desc().desc().data.format));
}
impl_desc_type impl_type = parse_impl_name(itpd.get_impl_info_str());
impl_desc_type impl_type = parse_impl_name(itpd.impl_info_str());
if (impl_type & jit)
containJitImpl = true;
supportedPrimitiveDescriptors.emplace_back(config, impl_type, outFormats);
itpd++;
supportedPrimitiveDescriptors.emplace_back(config, impl_type);
if (!itpd.next_impl())
break;
}
}
}
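The two loops above capture the v1.6 iteration idiom: the primitive descriptor iterator is now tested by boolean conversion instead of is_not_end(), advanced with next_impl() instead of operator++, and queried via impl_info_str(). A minimal sketch of the loop shape, assuming the plugin's MKLDNNDescriptor wrapper and parse_impl_name() shown above:

// Sketch only: v1.6 implementation-enumeration loop (Desc stands in for MKLDNNDescriptor).
template <typename Desc, typename Engine, typename Attr>
void forEachImplementation(Desc& desc, const Engine& engine, const Attr& attr) {
    auto itpd = desc.createPrimitiveDescriptorIterator(engine, attr);
    while (static_cast<bool>(itpd)) {                                 // v0.x: itpd.is_not_end()
        impl_desc_type impl = parse_impl_name(itpd.impl_info_str());  // v0.x: get_impl_info_str()
        // ... assemble a LayerConfig for this implementation ...
        if (!itpd.next_impl())                                        // v0.x: itpd++
            break;
    }
}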
@ -573,18 +584,14 @@ void MKLDNNConvolutionNode::createPrimitive() {
auto prim_desc = createPrimitiveDescriptor<convolution_forward::primitive_desc,
convolution_forward::desc>(attr);
if (withBiases) {
prim.reset(new convolution_forward(prim_desc,
getParentEdgeAt(0)->getMemory().GetPrimitive(),
getWeights(),
getBias(),
getChildEdgeAt(0)->getMemory().GetPrimitive()));
} else {
prim.reset(new convolution_forward(prim_desc,
getParentEdgeAt(0)->getMemory().GetPrimitive(),
getWeights(),
getChildEdgeAt(0)->getMemory().GetPrimitive()));
}
prim.reset(new convolution_forward(prim_desc));
auto src = getParentEdgesAtPort(0)[0]->getMemoryPtr()->GetPrimitive();
auto dst = getChildEdgesAtPort(0)[0]->getMemoryPtr()->GetPrimitive();
if (withBiases)
primArgs = {{DNNL_ARG_SRC, src}, {DNNL_ARG_WEIGHTS, getWeights()}, {DNNL_ARG_BIAS, getBias()}, {DNNL_ARG_DST, dst}};
else
primArgs = {{DNNL_ARG_SRC, src}, {DNNL_ARG_WEIGHTS, getWeights()}, {DNNL_ARG_DST, dst}};
}
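The createPrimitive rework reflects the core v1.x execution model: a primitive is built from its primitive_desc alone, and memory objects are supplied at execute() time through a map keyed by DNNL_ARG_* tags. A minimal sketch of the execute side that consumes a primArgs map like the one built above:

// Sketch: executing a v1.x primitive with an argument map.
#include <mkldnn.hpp>
#include <unordered_map>

void run(mkldnn::primitive& prim, mkldnn::stream& strm,
         const std::unordered_map<int, mkldnn::memory>& primArgs) {
    prim.execute(strm, primArgs);  // v0.x bound memory at construction and used strm.submit()
    strm.wait();
}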
bool MKLDNNConvolutionNode::created() const {
@ -602,16 +609,16 @@ void MKLDNNConvolutionNode::createDescriptor(const std::vector<InferenceEngine::
}
if (inDesc.getPrecision() == Precision::U8 || inDesc.getPrecision() == Precision::I8) {
wdt = memory::s8;
bdt = baseInputsNumber == 3 ? precisionToDataType(getCnnLayer()->insData[2].lock()->getPrecision()) : memory::s32;
wdt = memory::data_type::s8;
bdt = baseInputsNumber == 3 ? precisionToDataType(getCnnLayer()->insData[2].lock()->getPrecision()) : memory::data_type::s32;
}
if (baseInputsNumber == 1) {
Blob::Ptr weights = this->getCnnLayer()->blobs.find("weights")->second;
if (weights->getTensorDesc().getPrecision() == Precision::I8) {
wdt = memory::s8;
bdt = memory::s32;
wdt = memory::data_type::s8;
bdt = memory::data_type::s32;
Precision outPrec;
if (getCnnLayer()->outData[0]->getPrecision() == Precision::FP32) {
@ -636,7 +643,7 @@ void MKLDNNConvolutionNode::createDescriptor(const std::vector<InferenceEngine::
MKLDNNDims blocked_weightDims(weightDims);
MKLDNNDims blocked_biasesDims(biasesDims);
MKLDNNMemoryDesc wgh_candidate{blocked_weightDims, wdt, memory::any};
MKLDNNMemoryDesc wgh_candidate{blocked_weightDims, wdt, memory::format_tag::any};
std::vector<algorithm> algorithms;
// We cannot map wino_format on tensor descriptor for now
@ -649,15 +656,21 @@ void MKLDNNConvolutionNode::createDescriptor(const std::vector<InferenceEngine::
try {
std::shared_ptr<mkldnn::convolution_forward::desc> conv_desc;
if (withBiases) {
MKLDNNMemoryDesc bias_candidate{blocked_biasesDims, bdt, memory::any};
MKLDNNMemoryDesc bias_candidate{blocked_biasesDims, bdt, memory::format_tag::any};
conv_desc.reset(new convolution_forward::desc(prop_kind::forward_scoring, alg,
in_candidate, wgh_candidate, bias_candidate, out_candidate,
stride, dilation, paddingL, paddingR, padding_kind::zero));
mkldnn::memory::dims(stride.begin(), stride.end()),
mkldnn::memory::dims(dilation.begin(), dilation.end()),
mkldnn::memory::dims(paddingL.begin(), paddingL.end()),
mkldnn::memory::dims(paddingR.begin(), paddingR.end())));
} else {
conv_desc.reset(new convolution_forward::desc(prop_kind::forward_scoring, alg,
in_candidate, wgh_candidate, out_candidate, stride, dilation,
paddingL, paddingR, padding_kind::zero));
in_candidate, wgh_candidate, out_candidate,
mkldnn::memory::dims(stride.begin(), stride.end()),
mkldnn::memory::dims(dilation.begin(), dilation.end()),
mkldnn::memory::dims(paddingL.begin(), paddingL.end()),
mkldnn::memory::dims(paddingR.begin(), paddingR.end())));
}
descs.emplace_back(conv_desc);
@ -674,8 +687,9 @@ void MKLDNNConvolutionNode::addZeroPoints(mkldnn::primitive_attr& attr) const {
if (!weightsZeroPoints.empty())
attr.set_weights_zero_points(1 << 1 /*through C dim*/, weightsZeroPoints);
if (!outputCompensation.empty())
if (!outputCompensation.empty()) {
attr.set_output_compensations(1 << 1 /*through C dim*/, outputCompensation);
}
}
void MKLDNNConvolutionNode::addScaleToPrimitiveAttr(mkldnn::primitive_attr attr) const {
@ -695,7 +709,6 @@ void MKLDNNConvolutionNode::addScaleToPrimitiveAttr(mkldnn::primitive_attr attr)
}
}
attr.set_int_output_round_mode(mkldnn::round_nearest);
attr.set_output_scales(1 << 1 /*through C dim*/, oScaleDataVector);
}
}
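The deleted set_int_output_round_mode() call has no v1.x counterpart: oneDNN now always rounds to nearest-even, so only the output-scales attribute survives. A minimal sketch of the remaining attribute setup, assuming a per-channel scale vector as above:

// Sketch: per-channel output scales in v1.x (mask 1 << 1 selects the C dimension).
#include <mkldnn.hpp>
#include <vector>

mkldnn::primitive_attr makeOutputScaleAttr(const std::vector<float>& oScaleDataVector) {
    mkldnn::primitive_attr attr;
    attr.set_output_scales(1 << 1 /*through C dim*/, oScaleDataVector);
    return attr;
}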
@ -741,7 +754,7 @@ void MKLDNNConvolutionNode::initDescriptor(const InferenceEngine::LayerConfig& c
if (containJitImpl && isPossibleToSkipInitConfig(desc))
continue;
auto itpd = desc.createPrimitiveDescriptorIterator(getEngine(), attr);
while (itpd.is_not_end()) {
while (static_cast<bool>(itpd)) {
InferenceEngine::LayerConfig cfg;
cfg.dynBatchSupport = true;
for (size_t j = 0; j < descInputNumbers(desc); j++) {
@ -762,10 +775,10 @@ void MKLDNNConvolutionNode::initDescriptor(const InferenceEngine::LayerConfig& c
InferenceEngine::DataConfig dataConfig;
dataConfig.inPlace = -1;
dataConfig.constant = false;
dataConfig.desc = MKLDNNMemoryDesc(dwWeightsDims, weightsPrc, memory::format::Goihw8g);
dataConfig.desc = MKLDNNMemoryDesc(dwWeightsDims, weightsPrc, memory::format_tag::Goihw8g);
cfg.inConfs.push_back(dataConfig);
dataConfig.desc = MKLDNNMemoryDesc(dwBiasesDims, biasPrc, memory::format::x);
dataConfig.desc = MKLDNNMemoryDesc(dwBiasesDims, biasPrc, memory::format_tag::x);
cfg.inConfs.push_back(dataConfig);
}
@ -783,7 +796,7 @@ void MKLDNNConvolutionNode::initDescriptor(const InferenceEngine::LayerConfig& c
cfg.outConfs.push_back(dataConfig);
}
impl_desc_type impl_type = parse_impl_name(itpd.get_impl_info_str());
impl_desc_type impl_type = parse_impl_name(itpd.impl_info_str());
if (impl_type & jit)
containJitImpl = true;
@ -799,7 +812,8 @@ void MKLDNNConvolutionNode::initDescriptor(const InferenceEngine::LayerConfig& c
}
}
selected_count++;
itpd++;
if (!itpd.next_impl())
break;
}
}
selectedPD->getConfig() = rightConfig;
@ -820,14 +834,12 @@ void MKLDNNConvolutionNode::filterSupportedDescriptors() {
while (itd != descs.end()) {
bool isSuitableDesc = true;
if (!inputMemoryFormatsFilter.empty()) {
auto src_fmt = std::shared_ptr<mkldnn::convolution_forward::desc>(*itd)->data.src_desc.format;
if (src_fmt != inputMemoryFormatsFilter[0])
isSuitableDesc = false;
MKLDNNMemoryDesc src_tdesc(std::shared_ptr<mkldnn::convolution_forward::desc>(*itd)->data.src_desc);
isSuitableDesc &= src_tdesc.isSame(inputMemoryFormatsFilter[0]);
}
if (!outputMemoryFormatsFilter.empty()) {
auto dst_fmt = std::shared_ptr<mkldnn::convolution_forward::desc>(*itd)->data.dst_desc.format;
if (dst_fmt != outputMemoryFormatsFilter[0])
isSuitableDesc = false;
MKLDNNMemoryDesc dst_tdesc(std::shared_ptr<mkldnn::convolution_forward::desc>(*itd)->data.dst_desc);
isSuitableDesc &= dst_tdesc.isSame(outputMemoryFormatsFilter[0]);
}
if (!isSuitableDesc) {
itd = descs.erase(itd);
@ -861,21 +873,21 @@ bool MKLDNNConvolutionNode::isPossibleToSkipInitConfig(MKLDNNDescriptor &desc) {
isPossibleJitPlanar = false;
std::shared_ptr<mkldnn::convolution_forward::desc> convDesc(desc);
auto srcMemFmt = convDesc->data.src_desc.format;
auto dstMemFmt = convDesc->data.dst_desc.format;
auto srcMemDesc = MKLDNNMemoryDesc {convDesc->data.src_desc};
auto dstMemDesc = MKLDNNMemoryDesc {convDesc->data.dst_desc};
auto srcDataType = convDesc->data.src_desc.data_type;
auto dstDataType = convDesc->data.dst_desc.data_type;
bool isPlanarFloatConv = (srcMemFmt == memory::nchw || srcMemFmt == memory::ncdhw)
&& (dstMemFmt == memory::nchw || dstMemFmt == memory::ncdhw)
&& srcDataType == memory::f32
&& dstDataType == memory::f32;
bool isPlanarFloatConv = srcMemDesc.isPlainFormat()
&& dstMemDesc.isPlainFormat()
&& srcDataType == memory::data_type::f32
&& dstDataType == memory::data_type::f32;
return !isPossibleJitPlanar && isPlanarFloatConv;
}
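Both hunks above work around the removal of the flat format enum from v1.x memory descriptors: instead of comparing src_desc.format values, the code wraps the C descriptor in MKLDNNMemoryDesc and asks isSame()/isPlainFormat(). A sketch of what such a plain-format test reduces to at the C-API level (field names as used in the Crop hunk below; the helper itself is illustrative):

// Sketch (illustrative helper): "plain" means blocked format kind with no inner blocks.
#include <mkldnn_types.h>

bool isPlainLayout(const mkldnn_memory_desc_t& md) {
    return md.format_kind == dnnl_blocked &&
           md.format_desc.blocking.inner_nblks == 0;  // plain strides, no nChw8c-style blocking
}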
MKLDNNMemoryDesc MKLDNNConvolutionNode::getSrcMemDesc(mkldnn::primitive_desc_iterator &primitive_desc_it, size_t idx) {
InferenceEngine::TensorDesc desc = idx > 0 ? MKLDNNMemoryDesc(primitive_desc_it.weights_primitive_desc(idx - 1).desc())
: MKLDNNMemoryDesc(primitive_desc_it.src_primitive_desc(idx).desc());
InferenceEngine::TensorDesc desc = idx > 0 ? MKLDNNMemoryDesc(primitive_desc_it.weights_desc(idx - 1))
: MKLDNNMemoryDesc(primitive_desc_it.src_desc(idx));
if (desc.getLayout() == InferenceEngine::Layout::ANY) {
return MKLDNNMemoryDesc(InferenceEngine::TensorDesc(desc.getPrecision(),

View File

@ -32,6 +32,7 @@ public:
bool canBeInPlace() const override {
return false;
}
void setPostOps(mkldnn::primitive_attr &attr, bool initWeights);
size_t descInputNumbers(MKLDNNDescriptor desc) override {

View File

@ -10,6 +10,7 @@
#include <mkldnn_extension_utils.h>
#include "ie_parallel.hpp"
#include "common/cpu_memcpy.h"
#include "utils/general_utils.h"
using namespace mkldnn;
using namespace MKLDNNPlugin;
@ -32,7 +33,7 @@ void MKLDNNCropNode::getSupportedDescriptors() {
MKLDNNDims childDims = getChildEdgeAt(0)->getDims();
offsets.resize(static_cast<size_t>(childDims.ndims())); // plus one dim for batch
dims.resize(static_cast<size_t>(childDims.ndims())); // plus one dim for batch
for (int i = 0; i < childDims.ndims(); i++)
dims[i] = childDims[i];
@ -70,11 +71,11 @@ void MKLDNNCropNode::initSupportedPrimitiveDescriptors() {
THROW_IE_EXCEPTION << "Crop supports only 2d, 4d and 5d blobs.";
}
memory::format fmt = memory::format::format_undef;
memory::format_tag fmt = memory::format_tag::undef;
switch (inDims.ndims()) {
case 2: fmt = memory::format::nc; break;
case 4: fmt = memory::format::nchw; break;
case 5: fmt = memory::format::ncdhw; break;
case 2: fmt = memory::format_tag::nc; break;
case 4: fmt = memory::format_tag::nchw; break;
case 5: fmt = memory::format_tag::ncdhw; break;
}
InferenceEngine::LayerConfig config;
@ -93,12 +94,12 @@ void MKLDNNCropNode::initSupportedPrimitiveDescriptors() {
supportedPrimitiveDescriptors.emplace_back(config, impl_desc_type::unknown, fmt);
if ((inDims.ndims() == 4 || inDims.ndims() == 5) && channelAxis >= 0 && dims[channelAxis] % 8 == 0) {
fmt = inDims.ndims() == 5 ? memory::format::nCdhw8c : memory::format::nChw8c;
fmt = inDims.ndims() == 5 ? memory::format_tag::nCdhw8c : memory::format_tag::nChw8c;
config.inConfs[0].desc = MKLDNNMemoryDesc(getParentEdgeAt(0)->getDims(), inputDataType, fmt);
config.outConfs[0].desc = MKLDNNMemoryDesc(getChildEdgeAt(0)->getDims(), outputDataType, fmt);
supportedPrimitiveDescriptors.emplace_back(config, impl_desc_type::unknown, fmt);
if (dims[channelAxis] % 16 == 0) {
fmt = inDims.ndims() == 5 ? memory::format::nCdhw16c : memory::format::nChw16c;
fmt = inDims.ndims() == 5 ? memory::format_tag::nCdhw16c : memory::format_tag::nChw16c;
config.inConfs[0].desc = MKLDNNMemoryDesc(getParentEdgeAt(0)->getDims(), inputDataType, fmt);
config.outConfs[0].desc = MKLDNNMemoryDesc(getChildEdgeAt(0)->getDims(), outputDataType, fmt);
supportedPrimitiveDescriptors.emplace_back(config, impl_desc_type::unknown, fmt);
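The v0.x memory::format values used by Crop map one-to-one onto v1.x memory::format_tag, as the rank switch at the top of this hunk shows; the same mapping in compact helper form:

// Sketch: rank -> plain format_tag mapping (same table as the switch above).
#include <mkldnn.hpp>

mkldnn::memory::format_tag plainTagFor(int ndims) {
    using tag = mkldnn::memory::format_tag;
    switch (ndims) {
        case 2: return tag::nc;
        case 4: return tag::nchw;
        case 5: return tag::ncdhw;
        default: return tag::undef;  // v0.x: memory::format::format_undef
    }
}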
@ -121,14 +122,19 @@ void MKLDNNCropNode::execute(mkldnn::stream strm) {
auto& parentMem = getParentEdgeAt(0)->getMemory();
int m_block_size = 1;
if (!MKLDNNMemory::IsPlainFormat(parentMem.GetFormat())) {
m_block_size = parentMem.GetDescriptor().data.layout_desc.blocking.block_dims[1];
if (!parentMem.GetDesc().isPlainFormat()) {
const auto &desc = parentMem.GetDescriptor().data;
const auto &blk = desc.format_desc.blocking;
IE_ASSERT(desc.format_kind == dnnl_blocked &&
blk.inner_nblks == 1 &&
blk.inner_idxs[0] == 1);
m_block_size = blk.inner_blks[0];
}
const int m_inner_dim = dims[dims.size() - 1] * m_block_size;
const memory &dst_d = getChildEdgeAt(0)->getMemory().GetPrimitive();
const auto &dst_mem = getChildEdgeAt(0)->getMemory();
const int dst_ndims = dst_d.get_primitive_desc().desc().data.ndims;
const int dst_ndims = dst_mem.GetDesc().getDims().ndims();
// TODO: rewrite for the general case: any tensor rank,
// without relying on the N,C,D,H,W letters
@ -154,12 +160,10 @@ void MKLDNNCropNode::execute(mkldnn::stream strm) {
const int IH = (src_ndims > 2) ? src_dims[src_dims.size() - 2] : 1;
const int IW = (src_ndims > 3) ? src_dims[src_dims.size() - 1] : 1;
const uint8_t itemSize = MKLDNNExtensionUtils::sizeOfDataType(mkldnn::memory::data_type(parentMem.GetDataType()));
const size_t itemSize = parentMem.GetDesc().GetElementSize();
const auto *src_data = reinterpret_cast<const uint8_t *>(parentMem.GetData()) +
itemSize * parentMem.GetDescriptor().data.layout_desc.blocking.offset_padding;
auto *dst_data = reinterpret_cast<uint8_t*>(getChildEdgeAt(0)->getMemory().GetData()) +
itemSize * getChildEdgeAt(0)->getMemory().GetDescriptor().data.layout_desc.blocking.offset_padding;
const auto *src_data = reinterpret_cast<const uint8_t*>(parentMem.GetPtr());
auto *dst_data = reinterpret_cast<uint8_t*>(getChildEdgeAt(0)->getMemory().GetPtr());
if (OD == 1 && OH == 1 && OW == 1 && ID == 1 && IH == 1 && IW == 1) {
parallel_for(ON, [&](int n) {

View File

@ -3,7 +3,6 @@
//
#include "mkldnn_deconv_node.h"
#include "desc_iterator.hpp"
#include <legacy/ie_layers.h>
#include <mkldnn.hpp>
#include <string>
@ -12,6 +11,7 @@
#include <mkldnn_extension_utils.h>
#include <legacy/ie_layers_internal.hpp>
#include "ie_parallel.hpp"
#include "utils/general_utils.h"
using namespace mkldnn;
using namespace MKLDNNPlugin;
@ -20,7 +20,7 @@ using namespace InferenceEngine;
MKLDNNDeconvolutionNode::MKLDNNDeconvolutionNode(const InferenceEngine::CNNLayerPtr& layer,
const mkldnn::engine& eng, MKLDNNWeightsSharing::Ptr &cache) : MKLDNNNode(layer, eng, cache) {
internalBlobDesc.emplace_back([&](primitive_desc_iterator &primitive_desc_it, size_t idx) -> MKLDNNMemoryDesc {
return MKLDNNMemoryDesc(primitive_desc_it.weights_primitive_desc(0).desc());
return MKLDNNMemoryDesc(primitive_desc_it.weights_desc(0));
});
}
@ -127,25 +127,26 @@ void MKLDNNDeconvolutionNode::getSupportedDescriptors() {
void MKLDNNDeconvolutionNode::setBiasAsPostOp(const InferenceEngine::Blob::Ptr& biases) {
mkldnn::post_ops ops;
MKLDNNDims depthwiseDims({static_cast<ptrdiff_t>(rnd_up(biases->size(), 16))});
auto depthwiseSize = static_cast<ptrdiff_t>(rnd_up(biases->size(), 16));
PostOpsIntBlobMemory.push_back(MKLDNNMemoryPtr(new MKLDNNMemory(getEngine())));
PostOpsIntBlobMemory[0]->Create(depthwiseDims, memory::data_type::f32, memory::format::x);
PostOpsIntBlobMemory[0]->Create({depthwiseSize}, memory::data_type::f32, memory::format_tag::x);
PostOpsIntBlobMemory[0]->FillZero();
std::vector<float> weights(biases->size());
for (int i = 0; i < biases->size(); i++) {
weights[i] = 1;
}
PostOpsIntBlobMemory[0]->SetData(memory::data_type::f32, memory::x, &weights[0],
biases->size() * MKLDNNExtensionUtils::sizeOfDataType(memory::data_type::f32));
std::vector<float> weights(depthwiseSize, 1.0f);
std::fill(weights.begin() + biases->size(), weights.end(), 0.0f);
PostOpsIntBlobMemory[0]->SetData(memory::data_type::f32, memory::format_tag::x, weights.data(),
weights.size() * MKLDNNExtensionUtils::sizeOfDataType(memory::data_type::f32));
PostOpsIntBlobMemory.push_back(MKLDNNMemoryPtr(new MKLDNNMemory(getEngine())));
PostOpsIntBlobMemory[1]->Create(depthwiseDims, memory::data_type::f32, memory::format::x);
PostOpsIntBlobMemory[1]->Create({depthwiseSize}, memory::data_type::f32, memory::format_tag::x);
PostOpsIntBlobMemory[1]->FillZero();
PostOpsIntBlobMemory[1]->SetData(memory::data_type::f32, memory::x, biases->buffer(),
biases->size() * MKLDNNExtensionUtils::sizeOfDataType(memory::data_type::f32));
auto biases_ptr = biases->buffer().as<float*>();
std::vector<float> bias(depthwiseSize, 0.0f);
std::copy(biases_ptr, biases_ptr + biases->size(), bias.begin());
PostOpsIntBlobMemory[1]->SetData(memory::data_type::f32, memory::format_tag::x, bias.data(),
bias.size() * MKLDNNExtensionUtils::sizeOfDataType(memory::data_type::f32));
ops.append_depthwise(depthwise_scale_shift,
ops.append_depthwise(algorithm::depthwise_scale_shift,
(const float *) PostOpsIntBlobMemory[0]->GetData(),
(const float *) PostOpsIntBlobMemory[1]->GetData());
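setBiasAsPostOp() emulates a bias through a depthwise scale-shift post-op: the scale buffer is all ones over the real channels, the shift buffer carries the bias values, and both are zero-padded to a multiple of 16 because the injector reads rounded-up channel blocks. The buffer preparation alone, as a self-contained sketch (rnd_up written out inline):

// Sketch: scale/shift buffers for the depthwise bias emulation above.
#include <algorithm>
#include <cstddef>
#include <vector>

void makeBiasScaleShift(const float* bias, size_t channels,
                        std::vector<float>& scales, std::vector<float>& shifts) {
    const size_t padded = (channels + 15) / 16 * 16;           // rnd_up(channels, 16)
    scales.assign(padded, 1.0f);                               // identity scale...
    std::fill(scales.begin() + channels, scales.end(), 0.0f);  // ...zeroed in the pad tail
    shifts.assign(padded, 0.0f);
    std::copy(bias, bias + channels, shifts.begin());          // shift == bias
}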
@ -166,14 +167,12 @@ void MKLDNNDeconvolutionNode::filterSupportedDescriptors() {
while (itd != descs.end()) {
bool isSuitableDesc = true;
if (!inputMemoryFormatsFilter.empty()) {
auto src_fmt = std::shared_ptr<mkldnn::convolution_backward_data::desc>(*itd)->data.src_desc.format;
if (src_fmt != inputMemoryFormatsFilter[0])
isSuitableDesc = false;
auto src_tdesc = MKLDNNMemoryDesc(std::shared_ptr<mkldnn::convolution_backward_data::desc>(*itd)->data.diff_src_desc);
isSuitableDesc &= src_tdesc.isSame(inputMemoryFormatsFilter[0]);
}
if (!outputMemoryFormatsFilter.empty()) {
auto dst_fmt = std::shared_ptr<mkldnn::convolution_backward_data::desc>(*itd)->data.dst_desc.format;
if (dst_fmt != outputMemoryFormatsFilter[0])
isSuitableDesc = false;
auto dst_tdesc = MKLDNNMemoryDesc(std::shared_ptr<mkldnn::convolution_backward_data::desc>(*itd)->data.diff_dst_desc);
isSuitableDesc &= dst_tdesc.isSame(outputMemoryFormatsFilter[0]);
}
if (!isSuitableDesc) {
itd = descs.erase(itd);
@ -184,12 +183,6 @@ void MKLDNNDeconvolutionNode::filterSupportedDescriptors() {
}
}
void MKLDNNDeconvolutionNode::execute(mkldnn::stream strm) {
if (prim) {
strm.submit({*prim});
}
}
bool MKLDNNDeconvolutionNode::created() const {
return getType() == Deconvolution;
}
@ -201,10 +194,11 @@ void MKLDNNDeconvolutionNode::createPrimitive() {
auto prim_desc = createPrimitiveDescriptor<convolution_backward_data::primitive_desc,
convolution_backward_data::desc, convolution_forward::primitive_desc>(attr);
prim.reset(new convolution_backward_data(prim_desc,
getParentEdgeAt(0)->getMemory().GetPrimitive(),
getWeights(),
getChildEdgeAt(0)->getMemory().GetPrimitive()));
prim.reset(new convolution_backward_data(prim_desc));
auto src = getParentEdgesAtPort(0)[0]->getMemoryPtr()->GetPrimitive();
auto dst = getChildEdgesAtPort(0)[0]->getMemoryPtr()->GetPrimitive();
primArgs = {{DNNL_ARG_DIFF_DST, src}, {DNNL_ARG_WEIGHTS, getWeights()}, {DNNL_ARG_DIFF_SRC, dst}};
}
void MKLDNNDeconvolutionNode::createDescriptor(const std::vector<InferenceEngine::TensorDesc> &inputDesc,
@ -216,31 +210,41 @@ void MKLDNNDeconvolutionNode::createDescriptor(const std::vector<InferenceEngine
if ((withGroups && !isDW) && (in_candidate.blocksExtended() || out_candidate.blocksExtended()))
return;
MKLDNNMemoryDesc wgh_candidate{weightsDims, in_candidate.getDataType(), memory::any};
MKLDNNMemoryDesc wgh_candidate{weightsDims, in_candidate.getDataType(), memory::format_tag::any};
for (auto alg : {algorithm::convolution_winograd, algorithm::convolution_direct}) {
try {
std::shared_ptr<mkldnn::convolution_forward::desc> conv_desc;
conv_desc.reset(new convolution_forward::desc(prop_kind::forward_inference, alg,
out_candidate, wgh_candidate, in_candidate, stride, dilation,
paddingL, paddingR, padding_kind::zero));
auto convert = [] (const std::vector<ptrdiff_t>& orig_dims) {
return memory::dims(orig_dims.begin(), orig_dims.end());
};
std::shared_ptr<mkldnn::convolution_backward_data::desc> deconv_desc;
deconv_desc.reset(new convolution_backward_data::desc(alg, out_candidate, wgh_candidate,
in_candidate, stride, dilation, paddingL, paddingR,
padding_kind::zero));
descs_fwd.push_back(conv_desc);
descs_bwd.push_back(deconv_desc);
std::shared_ptr<mkldnn::convolution_forward::desc> conv_desc;
conv_desc.reset(new convolution_forward::desc(prop_kind::forward_inference, alg,
out_candidate, wgh_candidate, in_candidate,
convert(stride),
convert(dilation),
convert(paddingL),
convert(paddingR)));
descs.emplace_back(deconv_desc,
std::shared_ptr<convolution_forward::primitive_desc>(
new convolution_forward::primitive_desc(*conv_desc, getEngine())));
} catch(...) {}
std::shared_ptr<mkldnn::convolution_backward_data::desc> deconv_desc;
deconv_desc.reset(new convolution_backward_data::desc(alg, out_candidate, wgh_candidate,
in_candidate,
convert(stride),
convert(dilation),
convert(paddingL),
convert(paddingR)));
descs_fwd.push_back(conv_desc);
descs_bwd.push_back(deconv_desc);
auto fwd_conv_pd = std::make_shared<convolution_forward::primitive_desc>(*conv_desc, getEngine(), true);
if (fwd_conv_pd->get(true) == nullptr)
continue;
descs.emplace_back(deconv_desc, fwd_conv_pd);
}
}
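Instead of the old catch-all try/catch probing, the rewritten loop relies on the v1.6 allow_empty constructor flag: an unimplementable descriptor yields a primitive_desc with a null C handle rather than an exception, which get(true) exposes. A minimal sketch of that probe, assuming a prepared forward descriptor:

// Sketch: probing for an implementation via allow_empty instead of catch(...).
#include <mkldnn.hpp>
#include <memory>

std::shared_ptr<mkldnn::convolution_forward::primitive_desc>
tryMakePrimitiveDesc(const mkldnn::convolution_forward::desc& d, const mkldnn::engine& eng) {
    auto pd = std::make_shared<mkldnn::convolution_forward::primitive_desc>(
            d, eng, /*allow_empty=*/true);
    return pd->get(true) == nullptr ? nullptr : pd;  // null handle => no implementation found
}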
MKLDNNMemoryDesc MKLDNNDeconvolutionNode::getSrcMemDesc(mkldnn::primitive_desc_iterator &primitive_desc_it, size_t idx) {
InferenceEngine::TensorDesc desc = idx > 0 ? MKLDNNMemoryDesc(primitive_desc_it.weights_primitive_desc(idx - 1).desc())
: MKLDNNMemoryDesc(primitive_desc_it.diff_dst_primitive_desc(idx).desc());
InferenceEngine::TensorDesc desc = idx > 0 ? MKLDNNMemoryDesc(primitive_desc_it.weights_desc(idx - 1))
: MKLDNNMemoryDesc(primitive_desc_it.diff_dst_desc(idx));
if (desc.getLayout() == InferenceEngine::Layout::ANY) {
return MKLDNNMemoryDesc(InferenceEngine::TensorDesc(desc.getPrecision(),
@ -268,7 +272,7 @@ MKLDNNMemoryDesc MKLDNNDeconvolutionNode::getSrcMemDesc(mkldnn::primitive_desc_i
}
MKLDNNMemoryDesc MKLDNNDeconvolutionNode::getDstMemDesc(mkldnn::primitive_desc_iterator &primitive_desc_it, size_t idx) {
InferenceEngine::TensorDesc desc = MKLDNNMemoryDesc(primitive_desc_it.diff_src_primitive_desc(idx).desc());
InferenceEngine::TensorDesc desc = MKLDNNMemoryDesc(primitive_desc_it.diff_src_desc(idx));
if (desc.getLayout() == InferenceEngine::Layout::ANY)
return MKLDNNMemoryDesc(InferenceEngine::TensorDesc(desc.getPrecision(),
getChildEdgeAt(idx)->getDims().ToSizeVector(),

View File

@ -23,7 +23,6 @@ public:
void createPrimitive() override;
void filterSupportedPrimitiveDescriptors() override;
void filterSupportedDescriptors();
void execute(mkldnn::stream strm) override;
bool created() const override;
bool canBeInPlace() const override {
return false;

View File

@ -12,17 +12,66 @@
namespace MKLDNNPlugin {
struct jit_def_conv_params {
int ndims;
int mb;
int dg;
int ngroups, ic, oc, oc_padded;
int id, ih, iw, od, oh, ow;
int f_pad, l_pad, t_pad;
int back_pad, r_pad, b_pad;
int kd, kh, kw;
int stride_d, stride_h, stride_w;
int dilate_d, dilate_h, dilate_w;
bool with_bias;
bool with_sum;
int nthr;
int nb_ic, ic_block;
int nb_oc, oc_block;
int nb_ic_blocking, nb_oc_blocking;
int ur_w;
int ur_w_tail;
int typesize_in;
int typesize_off;
int typesize_bia;
int typesize_out;
};
struct jit_def_conv_call_args {
const void *src;
const void *off;
const void *filt;
const void *bias;
const void *dst;
const void *buf;
size_t oh_pos;
};
struct jit_uni_def_conv_kernel {
void (*ker_)(const jit_def_conv_call_args *);
void operator()(const jit_def_conv_call_args *args) {
assert(ker_);
ker_(args);
}
explicit jit_uni_def_conv_kernel(jit_def_conv_params jcp) : ker_(nullptr), jcp_(jcp) {}
virtual ~jit_uni_def_conv_kernel() {}
virtual void create_ker() = 0;
jit_def_conv_params jcp_;
};
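The struct above is a plain functor around generated code: a derived jit class implements create_ker() to compile and fill ker_, after which operator() forwards a jit_def_conv_call_args pointer to the entry point. A usage sketch (the concrete kernel type is a stand-in; pointers come from the caller):

// Sketch: driving the kernel functor above; create_ker() must already have run.
template <typename Kernel>
void runOutputRow(Kernel& kernel, const void* src, const void* off, const void* filt,
                  const void* bias, void* dst, size_t oh_pos) {
    jit_def_conv_call_args args{};
    args.src = src; args.off = off; args.filt = filt;
    args.bias = bias; args.dst = dst;
    args.oh_pos = oh_pos;
    kernel(&args);  // invokes ker_(args) inside operator()
}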
class MKLDNNDeformableConvolutionNode : public MKLDNNNode {
public:
MKLDNNDeformableConvolutionNode(const InferenceEngine::CNNLayerPtr& layer, const mkldnn::engine& eng, MKLDNNWeightsSharing::Ptr &cache);
~MKLDNNDeformableConvolutionNode() override = default;
void getSupportedDescriptors() override;
void createDescriptor(const std::vector<InferenceEngine::TensorDesc>& inputDesc,
const std::vector<InferenceEngine::TensorDesc>& outputDesc) override;
void initDescriptor(const InferenceEngine::LayerConfig& config) override;
void createPrimitive() override;
void initSupportedPrimitiveDescriptors() override;
void execute(mkldnn::stream strm) override;
bool created() const override;
bool canBeInPlace() const override {
return false;
@ -31,18 +80,23 @@ public:
InferenceEngine::Precision getRuntimePrecision() const override;
private:
bool withBiases = false;
bool isDW = false;
bool isMerged = false;
bool isGrouped = false;
std::vector<ptrdiff_t> stride;
std::vector<ptrdiff_t> dilation;
std::vector<ptrdiff_t> paddingL;
std::vector<ptrdiff_t> paddingR;
InferenceEngine::SizeVector weightDims;
InferenceEngine::SizeVector biasesDims;
size_t group = 1;
std::vector<ptrdiff_t> stride = {};
std::vector<ptrdiff_t> dilation = {};
std::vector<ptrdiff_t> paddingL = {};
int deformable_group = 1;
jit_def_conv_params jcp = {};
std::shared_ptr<jit_uni_def_conv_kernel> def_conv_kernel = nullptr;
void executeReference(const float* src, const float* offsets, const float* weights, float* dst,
const std::vector<size_t>& src_strides, const std::vector<size_t>& off_strides,
const std::vector<size_t>& wei_strides, const std::vector<size_t>& dst_strides);
void executeOptimized(const float* src, const float* offsets, const float* weights, float* dst,
const std::vector<size_t>& src_strides, const std::vector<size_t>& off_strides,
const std::vector<size_t>& dst_strides);
};
} // namespace MKLDNNPlugin

View File

@ -3,32 +3,35 @@
//
#include "mkldnn_eltwise_node.h"
#include <legacy/ie_layers.h>
#include <ie_parallel.hpp>
#include <mkldnn_types.h>
#include "utils/bfloat16.hpp"
#include <cpu/x64/jit_uni_quantization_injector.hpp>
#include <cpu/ref_eltwise.hpp>
#include "mkldnn_extension_utils.h"
#include "mkldnn_quantize_node.h"
#include "mkldnn_pooling_node.h"
#include "common/emitter.h"
#include "jit_eltwise_emitters.hpp"
#include "jit_mkldnn_emitters.hpp"
#include <mkldnn_selective_build.h>
#include <string>
#include <vector>
#include <memory>
#include <algorithm>
#include <cmath>
#include <mkldnn_types.h>
#include <mkldnn_extension_utils.h>
#include "utils/bfloat16.hpp"
#include "ie_parallel.hpp"
#include "mkldnn_quantize_node.h"
#include <map>
#include "jit_uni_eltwise.hpp"
#include "jit_uni_quantization.hpp"
#include "common/emitter.h"
#include "jit_eltwise_emitters.hpp"
#include "jit_mkldnn_emitters.hpp"
#include "ref_eltwise.hpp"
#include "mkldnn_pooling_node.h"
#include <mkldnn_selective_build.h>
using namespace MKLDNNPlugin;
using namespace InferenceEngine;
using namespace mkldnn::impl::utils;
using namespace mkldnn::impl::cpu;
using namespace mkldnn::impl::cpu::x64;
using namespace Xbyak;
#define GET_OFF(field) offsetof(jit_eltwise_call_args, field)
@ -44,9 +47,9 @@ struct SupportedPrecisions {
struct EltwiseEmitterContext {
std::shared_ptr<jit_emitter> emitter;
mkldnn::impl::cpu::jit_generator *host;
mkldnn::impl::cpu::cpu_isa_t host_isa;
const MKLDNNNode * node;
jit_generator *host;
cpu_isa_t host_isa;
const MKLDNNNode *node;
InferenceEngine::Precision exec_prc;
};
@ -60,10 +63,17 @@ struct EltwiseEmitter {
} // namespace
template <cpu_isa_t isa>
struct jit_uni_eltwise_generic : public jit_uni_eltwise_kernel, public jit_generator {
struct jit_uni_eltwise_generic : public MKLDNNPlugin::jit_uni_eltwise_kernel, public jit_generator {
DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_uni_eltwise_generic)
explicit jit_uni_eltwise_generic(jit_eltwise_params jep, MKLDNNEltwiseNode& eltwiseNode) : jit_uni_eltwise_kernel(jep, eltwiseNode), jit_generator() {
explicit jit_uni_eltwise_generic(jit_eltwise_params jep, MKLDNNEltwiseNode& eltwiseNode) : jit_uni_eltwise_kernel(jep, eltwiseNode), jit_generator() {}
void create_ker() override {
jit_generator::create_kernel();
ker_ = (decltype(ker_))jit_ker();
}
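This is the new two-phase kernel lifecycle in oneDNN 1.6's x64 jit_generator: generate() only describes the code, and a later create_ker() call runs create_kernel() to materialize the buffer, then caches the entry point from jit_ker(). The minimal shape, as a sketch:

// Sketch: two-phase kernel creation (v0.x generated code in the constructor
// and fetched it with getCode()).
struct my_jit_kernel : public mkldnn::impl::cpu::x64::jit_generator {
    DECLARE_CPU_JIT_AUX_FUNCTIONS(my_jit_kernel)

    void (*ker_)(const void*) = nullptr;

    void create_ker() {                      // phase 2: compile, then cache entry point
        jit_generator::create_kernel();
        ker_ = (decltype(ker_))jit_ker();
    }

    void generate() override {               // phase 1: emit instructions only
        preamble();
        // ... kernel body ...
        postamble();
    }
};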
void generate() override {
Precision exec_prc = Precision::UNSPECIFIED;
std::set<Precision> supported_precision_intersection = get_supported_precisions(eltwiseNode);
@ -108,13 +118,15 @@ struct jit_uni_eltwise_generic : public jit_uni_eltwise_kernel, public jit_gener
quantizeNode->appendPostOps(post_ops);
quantization_injectors.push_back(std::make_shared<jit_uni_quantization_injector_f32<isa>>(
this, post_ops.get()->entry_[post_ops.get()->len_ - 1], vmm_d_weights, vmm_d_bias, reg_d_weights, reg_d_bias));
this, post_ops.get()->entry_[post_ops.len() - 1], vmm_d_weights, vmm_d_bias, reg_d_weights, reg_d_bias));
}
}
if (!mayiuse(avx512_core_bf16) && mayiuse(avx512_core))
emu_vcvtneps2bf16.reset(new jit_emu_vcvtneps2bf16(this, isa, nullptr));
const auto &jep = jep_;
this->preamble();
for (int i = 0; i < jep.inputs_number; i++)
@ -130,7 +142,7 @@ struct jit_uni_eltwise_generic : public jit_uni_eltwise_kernel, public jit_gener
Xbyak::Label tail_loop_label;
Xbyak::Label tail_loop_end_label;
if (isa == avx512_common)
if (isa == x64::avx512_common)
vpxord(vmm_zero, vmm_zero, vmm_zero);
for (int i = 0; i < jep.inputs_number; i++) {
@ -287,12 +299,10 @@ struct jit_uni_eltwise_generic : public jit_uni_eltwise_kernel, public jit_gener
for (int i = 0; i < post_op_emitters.size(); i++) {
post_op_emitters[i]->emit_table();
}
ker_ = (decltype(ker_)) this->getCode();
}
private:
using Vmm = typename conditional3<isa == cpu::sse42, Xmm, isa == cpu::avx2, Ymm, Zmm>::type;
using Vmm = typename conditional3<isa == x64::sse41, Xmm, isa == x64::avx2, Ymm, Zmm>::type;
Reg64 get_src_reg(int idx) {
return Reg64(r8.getIdx() + idx);
@ -501,7 +511,7 @@ private:
} else {
auto quantizeNode = dynamic_cast<MKLDNNQuantizeNode*>(eltwiseNode.getFusedWith()[i].get());
bool do_dequantization = quantizeNode->getAlgorithm() == mkldnn::quantization_quantize_dequantize;
bool do_dequantization = quantizeNode->getOpType() == QuantizeOpType::FakeQuantization;
bool do_rounding = do_dequantization || jep_.dst_prc == Precision::FP32 || i != eltwiseNode.getFusedWith().size() - 1;
int s_idx = vmm_dst.getIdx();
@ -558,6 +568,8 @@ private:
uni_vcvtdq2ps(vmm_src, vmm_src);
break;
case Precision::I32:
if (src_prc == Precision::FP32 || src_prc == Precision::BF16)
uni_vcvtps2dq(vmm_src, vmm_src);
break;
default:
assert(!"unknown dst_prc");
@ -601,6 +613,8 @@ private:
uni_vcvtdq2ps(xmm_src, xmm_src);
break;
case Precision::I32:
if (src_prc == Precision::FP32 || src_prc == Precision::BF16)
uni_vcvtps2dq(xmm_src, xmm_src);
break;
default:
assert(!"unknown dst_prc");
@ -617,6 +631,8 @@ private:
uni_vcvtps2dq(vmm_dst, vmm_dst);
break;
case Precision::I32:
if (dst_prc == Precision::FP32 || dst_prc == Precision::BF16)
uni_vcvtdq2ps(vmm_dst, vmm_dst);
break;
default:
assert(!"unknown src_prc");
@ -635,7 +651,7 @@ private:
vmovdqu16(op, ymm_dst);
break;
case Precision::I16:
if (isa == avx512_common) {
if (isa == x64::avx512_common) {
vmaxps(vmm_dst, vmm_zero, vmm_dst);
vpmovusdw(op, vmm_dst);
} else {
@ -643,36 +659,36 @@ private:
}
break;
case Precision::U16:
if (isa == avx512_common) {
if (isa == x64::avx512_common) {
vpmovsdw(op, vmm_dst);
} else {
uni_vpackssdw(vmm_dst, vmm_dst, vmm_dst);
}
break;
case Precision::I8:
if (isa == avx512_common) {
if (isa == x64::avx512_common) {
vmaxps(vmm_dst, vmm_zero, vmm_dst);
vpmovsdb(op, vmm_dst);
} else {
uni_vpackssdw(vmm_dst, vmm_dst, vmm_dst);
if (isa != cpu::sse42)
if (isa != x64::sse41)
vpermq(ymm_dst, ymm_dst, 0x08);
uni_vpacksswb(vmm_dst, vmm_dst, vmm_dst);
if (isa != cpu::sse42)
if (isa != x64::sse41)
vmovq(op, xmm_dst);
else
movd(op, xmm_dst);
}
break;
case Precision::U8:
if (isa == avx512_common) {
if (isa == x64::avx512_common) {
vpmovusdb(op, vmm_dst);
} else {
uni_vpackusdw(vmm_dst, vmm_dst, vmm_dst);
if (isa != cpu::sse42)
if (isa != x64::sse41)
vpermq(ymm_dst, ymm_dst, 0x08);
uni_vpackuswb(vmm_dst, vmm_dst, vmm_dst);
if (isa != cpu::sse42)
if (isa != x64::sse41)
vmovq(op, xmm_dst);
else
movd(op, xmm_dst);
@ -690,6 +706,8 @@ private:
uni_vcvtps2dq(xmm_dst, xmm_dst);
break;
case Precision::I32:
if (dst_prc == Precision::FP32 || dst_prc == Precision::BF16)
uni_vcvtdq2ps(xmm_dst, xmm_dst);
break;
default:
assert(!"unknown src_prc");
@ -742,91 +760,91 @@ MKLDNNEltwiseNode::initializers = {
alpha = activationLayer->GetParamAsFloat("negative_slope", 0.0f);
beta = 0.0f;
opType = Relu;
algorithm = mkldnn::eltwise_relu;
algorithm = mkldnn::algorithm::eltwise_relu;
}},
{"gelu", [](GenericLayer* activationLayer, EltwiseOpType& opType, mkldnn::algorithm& algorithm, float& alpha, float& beta) {
alpha = 0.0f;
beta = 0.0f;
opType = Gelu;
algorithm = mkldnn::eltwise_gelu;
algorithm = mkldnn::algorithm::eltwise_gelu;
}},
{"elu", [](GenericLayer* activationLayer, EltwiseOpType& opType, mkldnn::algorithm& algorithm, float& alpha, float& beta) {
alpha = activationLayer->GetParamAsFloat("alpha", 1.0f);
beta = 0.0f;
opType = Elu;
algorithm = mkldnn::eltwise_elu;
algorithm = mkldnn::algorithm::eltwise_elu;
}},
{"tanh", [](GenericLayer* activationLayer, EltwiseOpType& opType, mkldnn::algorithm& algorithm, float& alpha, float& beta) {
alpha = 0.0f;
beta = 0.0f;
opType = Tanh;
algorithm = mkldnn::eltwise_tanh;
algorithm = mkldnn::algorithm::eltwise_tanh;
}},
{"sigmoid", [](GenericLayer* activationLayer, EltwiseOpType& opType, mkldnn::algorithm& algorithm, float& alpha, float& beta) {
alpha = 0.0f;
beta = 0.0f;
opType = Logistic;
algorithm = mkldnn::eltwise_logistic;
algorithm = mkldnn::algorithm::eltwise_logistic;
}},
{"logistic", [](GenericLayer* activationLayer, EltwiseOpType& opType, mkldnn::algorithm& algorithm, float& alpha, float& beta) {
alpha = 0.0f;
beta = 0.0f;
opType = Logistic;
algorithm = mkldnn::eltwise_logistic;
algorithm = mkldnn::algorithm::eltwise_logistic;
}},
{"square", [](GenericLayer* activationLayer, EltwiseOpType& opType, mkldnn::algorithm& algorithm, float& alpha, float& beta) {
alpha = 0.0f;
beta = 0.0f;
opType = Square;
algorithm = mkldnn::eltwise_square;
algorithm = mkldnn::algorithm::eltwise_square;
}},
{"abs", [](GenericLayer* activationLayer, EltwiseOpType& opType, mkldnn::algorithm& algorithm, float& alpha, float& beta) {
alpha = 0.0f;
beta = 0.0f;
opType = Abs;
algorithm = mkldnn::eltwise_abs;
algorithm = mkldnn::algorithm::eltwise_abs;
}},
{"sqrt", [](GenericLayer* activationLayer, EltwiseOpType& opType, mkldnn::algorithm& algorithm, float& alpha, float& beta) {
alpha = 0.0f;
beta = 0.0f;
opType = Sqrt;
algorithm = mkldnn::eltwise_sqrt;
algorithm = mkldnn::algorithm::eltwise_sqrt;
}},
{"linear", [](GenericLayer* activationLayer, EltwiseOpType& opType, mkldnn::algorithm& algorithm, float& alpha, float& beta) {
alpha = activationLayer->GetParamAsFloat("alpha", 1.0f);
beta = activationLayer->GetParamAsFloat("beta", 0.0f);
opType = Linear;
algorithm = mkldnn::eltwise_linear;
algorithm = mkldnn::algorithm::eltwise_linear;
}},
{"bounded_relu", [](GenericLayer* activationLayer, EltwiseOpType& opType, mkldnn::algorithm& algorithm, float& alpha, float& beta) {
alpha = activationLayer->GetParamAsFloat("alpha", 0.0f);
beta = 0.0f;
opType = BoundedRelu;
algorithm = mkldnn::eltwise_bounded_relu;
algorithm = mkldnn::algorithm::eltwise_bounded_relu;
}},
{"soft_relu", [](GenericLayer* activationLayer, EltwiseOpType& opType, mkldnn::algorithm& algorithm, float& alpha, float& beta) {
alpha = 0.0f;
beta = 0.0f;
opType = SoftRelu;
algorithm = mkldnn::eltwise_soft_relu;
algorithm = mkldnn::algorithm::eltwise_soft_relu;
}},
{"relu6", [](GenericLayer* activationLayer, EltwiseOpType& opType, mkldnn::algorithm& algorithm, float& alpha, float& beta) {
alpha = activationLayer->GetParamAsFloat("n", 6.0f);
beta = 0.0f;
opType = Relu6;
algorithm = mkldnn::eltwise_bounded_relu;
algorithm = mkldnn::algorithm::eltwise_bounded_relu;
}},
{"clamp", [](GenericLayer* activationLayer, EltwiseOpType& opType, mkldnn::algorithm& algorithm, float& alpha, float& beta) {
alpha = activationLayer->GetParamAsFloat("max", 1.0f);
beta = activationLayer->GetParamAsFloat("min", 0.0f);
alpha = activationLayer->GetParamAsFloat("min", 1.0f);
beta = activationLayer->GetParamAsFloat("max", 0.0f);
opType = Clamp;
algorithm = mkldnn::eltwise_clamp;
algorithm = mkldnn::algorithm::eltwise_clip;
}},
{"exp", [](GenericLayer* activationLayer, EltwiseOpType& opType, mkldnn::algorithm& algorithm, float& alpha, float& beta) {
alpha = 0.0f;
beta = 0.0f;
opType = Exp;
algorithm = mkldnn::eltwise_exp;
algorithm = mkldnn::algorithm::eltwise_exp;
}},
{"not", [](GenericLayer* activationLayer, EltwiseOpType& opType, mkldnn::algorithm& algorithm, float& alpha, float& beta) {
alpha = 0.0f;
@ -837,25 +855,25 @@ MKLDNNEltwiseNode::initializers = {
alpha = activationLayer->GetParamAsFloat("alpha", 1.0f);
beta = 0.0f;
opType = Swish;
algorithm = mkldnn::eltwise_swish;
algorithm = mkldnn::algorithm::eltwise_swish;
}},
{"hswish", [](GenericLayer* activationLayer, EltwiseOpType& opType, mkldnn::algorithm& algorithm, float& alpha, float& beta) {
alpha = 0.0f;
beta = 0.0f;
opType = Hswish;
algorithm = mkldnn::eltwise_hswish;
algorithm = mkldnn::algorithm::eltwise_hswish;
}},
{"mish", [](GenericLayer* activationLayer, EltwiseOpType& opType, mkldnn::algorithm& algorithm, float& alpha, float& beta) {
alpha = 0.0f;
beta = 0.0f;
opType = Mish;
algorithm = mkldnn::eltwise_mish;
algorithm = mkldnn::algorithm::eltwise_mish;
}},
{"hsigmoid", [](GenericLayer* activationLayer, EltwiseOpType& opType, mkldnn::algorithm& algorithm, float& alpha, float& beta) {
alpha = 0.0f;
beta = 0.0f;
opType = Hsigmoid;
algorithm = mkldnn::eltwise_hsigmoid;
algorithm = mkldnn::algorithm::eltwise_hsigmoid;
}},
{"round", [](GenericLayer* activationLayer, EltwiseOpType& opType, mkldnn::algorithm& algorithm, float& alpha, float& beta) {
alpha = 0.0f;
@ -863,9 +881,9 @@ MKLDNNEltwiseNode::initializers = {
opType = Round;
std::string mode = activationLayer->GetParamAsString("mode", "half_to_even");
if (mode == "half_to_even")
algorithm = mkldnn::eltwise_round_half_to_even;
algorithm = mkldnn::algorithm::eltwise_round_half_to_even;
else if (mode == "half_away_from_zero")
algorithm = mkldnn::eltwise_round_half_away_from_zero;
algorithm = mkldnn::algorithm::eltwise_round_half_away_from_zero;
else
THROW_IE_EXCEPTION << "Round layer with name " << activationLayer->name << " doesn't support mode " << mode;
}},
@ -916,13 +934,13 @@ void MKLDNNEltwiseNode::init() {
} else if (comparator(layerType, "scaleshift")) {
if (getCnnLayer().get()->blobs.size() == 2) {
eltwiseOp = MulAdd;
eltwiseAlgorithm = mkldnn::depthwise_scale_shift;
eltwiseAlgorithm = mkldnn::algorithm::depthwise_scale_shift;
} else {
eltwiseOp = Multiply;
}
} else if (comparator(layerType, "prelu")) {
eltwiseOp = Prelu;
eltwiseAlgorithm = mkldnn::depthwise_prelu;
eltwiseAlgorithm = mkldnn::algorithm::depthwise_prelu;
} else if (comparator(layerType, "activation") && initializers.find(getCnnLayer().get()->GetParamAsString("type")) != initializers.end()) {
initializers[getCnnLayer().get()->GetParamAsString("type")](getCnnLayer().get(), eltwiseOp, eltwiseAlgorithm, alpha, beta);
} else if (comparator(layerType, "relu") ||
@ -999,7 +1017,7 @@ void MKLDNNEltwiseNode::initSupportedPrimitiveDescriptors() {
if (!supportedPrimitiveDescriptors.empty())
return;
canUseOptimizedImpl = mayiuse(cpu::sse42);
canUseOptimizedImpl = mayiuse(x64::sse41);
size_t expectedInputsNum = getOpInputsNum();
for (auto& postOp : fusedWith) {
@ -1087,35 +1105,39 @@ void MKLDNNEltwiseNode::initSupportedPrimitiveDescriptors() {
auto initDesc = [&] (LayoutType lt) -> PrimitiveDescInfo {
auto createMemoryDesc = [lt](MKLDNNEdgePtr edge, Precision prc, size_t offset) -> TensorDesc {
if (lt == ChannelsFirst) {
std::vector<size_t> blocks = edge->getDims().ToSizeVector();
std::vector<size_t> order;
order.push_back(0);
for (size_t j = 2; j < blocks.size(); j++)
order.push_back(j);
if (blocks.size() > 1)
auto dims = edge->getDims().ToSizeVector();
auto ndims = dims.size();
std::vector<size_t> order(ndims);
std::iota(order.begin(), order.end(), 0);
if (ndims > 1) {
order.erase(order.begin() + 1);
order.push_back(1);
}
return MKLDNNMemoryDesc(TensorDesc(prc, edge->getDims().ToSizeVector(), {blocks, order, offset}));
std::vector<size_t> blocks(ndims);
for (size_t i = 0; i < order.size(); i++) {
blocks[i] = dims[order[i]];
}
return TensorDesc(prc, edge->getDims().ToSizeVector(), {blocks, order, offset});
} else if (lt == Blocked && edge->getDims()[1] != 1) {
size_t blockSize = mayiuse(cpu::avx512_common) ? 16 : 8;
size_t blockSize = mayiuse(x64::avx512_common) ? 16 : 8;
std::vector<size_t> blocks = edge->getDims().ToSizeVector();
std::vector<size_t> order(blocks.size());
for (size_t j = 0; j < order.size(); j++)
order[j] = j;
std::iota(order.begin(), order.end(), 0);
blocks[1] = div_up(blocks[1], blockSize);
blocks.push_back(blockSize);
order.push_back(1);
return MKLDNNMemoryDesc(TensorDesc(prc, edge->getDims().ToSizeVector(), {blocks, order, offset}));
return TensorDesc(prc, edge->getDims().ToSizeVector(), {blocks, order, offset});
} else {
std::vector<size_t> blocks = edge->getDims().ToSizeVector();
std::vector<size_t> order(blocks.size());
for (size_t j = 0; j < order.size(); j++)
order[j] = j;
std::iota(order.begin(), order.end(), 0);
return MKLDNNMemoryDesc(TensorDesc(prc, edge->getDims().ToSizeVector(), {blocks, order, offset}));
return TensorDesc(prc, edge->getDims().ToSizeVector(), {blocks, order, offset});
}
};
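The lambda above hand-builds InferenceEngine BlockingDesc triples {blocks, order, offset}: ChannelsFirst rotates dim 1 to the back of the order (NCHW to NHWC-style traversal), while Blocked splits the channel dim into div_up(C, blockSize) outer blocks plus a trailing inner block of size blockSize. The blocked arithmetic in isolation, as a sketch:

// Sketch: blocks/order arithmetic behind the Blocked branch above.
#include <numeric>
#include <vector>

void makeBlockedDesc(const std::vector<size_t>& dims, size_t blockSize,
                     std::vector<size_t>& blocks, std::vector<size_t>& order) {
    order.resize(dims.size());
    std::iota(order.begin(), order.end(), 0);             // natural order 0,1,2,...
    blocks = dims;
    blocks[1] = (blocks[1] + blockSize - 1) / blockSize;  // div_up: outer channel blocks
    blocks.push_back(blockSize);                          // trailing inner channel block
    order.push_back(1);                                   // which again indexes dim 1 (C)
}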
@ -1143,17 +1165,17 @@ void MKLDNNEltwiseNode::initSupportedPrimitiveDescriptors() {
config.outConfs.push_back(dataConfig);
impl_desc_type impl_type;
if (mayiuse(cpu::avx512_common)) {
if (mayiuse(x64::avx512_common)) {
impl_type = impl_desc_type::jit_avx512;
} else if (mayiuse(cpu::avx2)) {
} else if (mayiuse(x64::avx2)) {
impl_type = impl_desc_type::jit_avx2;
} else if (mayiuse(cpu::sse42)) {
} else if (mayiuse(x64::sse41)) {
impl_type = impl_desc_type::jit_sse42;
} else {
impl_type = impl_desc_type::ref;
}
return {config, impl_type, MKLDNNMemoryDesc(config.outConfs[0].desc).getFormat()};
return {config, impl_type};
};
bool isChannelsFirstApplicable = one_of(getChildEdgeAt(0)->getDims().ndims(), 1, 2, 4, 5);
@ -1243,10 +1265,10 @@ void MKLDNNEltwiseNode::createPrimitive() {
start_offset_in.resize(inputNum);
for (size_t i = 0; i < inputNum; i++) {
start_offset_in[i] = getParentEdgeAt(i)->getMemory().GetDescriptor().data.layout_desc.blocking.offset_padding *
start_offset_in[i] = getParentEdgeAt(i)->getMemory().GetDescriptor().data.offset0 *
MKLDNNExtensionUtils::sizeOfDataType(mkldnn::memory::data_type(getParentEdgeAt(i)->getMemory().GetDescriptor().data.data_type));
}
start_offset_out = getChildEdgeAt(0)->getMemory().GetDescriptor().data.layout_desc.blocking.offset_padding *
start_offset_out = getChildEdgeAt(0)->getMemory().GetDescriptor().data.offset0 *
MKLDNNExtensionUtils::sizeOfDataType(mkldnn::memory::data_type(getChildEdgeAt(0)->getMemory().GetDescriptor().data.data_type));
};
@ -1388,13 +1410,16 @@ void MKLDNNEltwiseNode::createPrimitive() {
jep.oc_size = oc_size;
if (mayiuse(cpu::avx512_common)) {
eltwise_kernel.reset(new jit_uni_eltwise_generic<cpu::avx512_common>(jep, *this));
} else if (mayiuse(cpu::avx2)) {
eltwise_kernel.reset(new jit_uni_eltwise_generic<cpu::avx2>(jep, *this));
} else if (mayiuse(cpu::sse42)) {
eltwise_kernel.reset(new jit_uni_eltwise_generic<cpu::sse42>(jep, *this));
if (mayiuse(x64::avx512_common)) {
eltwise_kernel.reset(new jit_uni_eltwise_generic<x64::avx512_common>(jep, *this));
} else if (mayiuse(x64::avx2)) {
eltwise_kernel.reset(new jit_uni_eltwise_generic<x64::avx2>(jep, *this));
} else if (mayiuse(x64::sse41)) {
eltwise_kernel.reset(new jit_uni_eltwise_generic<x64::sse41>(jep, *this));
}
if (eltwise_kernel)
eltwise_kernel->create_ker();
}
void MKLDNNEltwiseNode::selectOptimalPrimitiveDescriptor() {
@ -1448,6 +1473,26 @@ void MKLDNNEltwiseNode::selectOptimalPrimitiveDescriptor() {
selectPrimitiveDescriptorByIndex(0);
}
void MKLDNNEltwiseNode::initOptimalPrimitiveDescriptor() {
auto selected_pd = getSelectedPrimitiveDescriptor();
if (selected_pd == nullptr)
THROW_IE_EXCEPTION << "Preferable primitive descriptor is not set.";
auto config = selected_pd->getConfig();
if (!isInitConfig(config)) {
for (size_t i = 0; i < config.inConfs.size(); i++) {
config.inConfs[i].desc = getConfiguredInputDesc(config, i);
}
for (size_t i = 0; i < config.outConfs.size(); i++) {
config.outConfs[i].desc = getConfiguredOutputDesc(config, i);
}
initDescriptor(config);
} else {
initDescriptor(config);
}
}
void MKLDNNEltwiseNode::offset_out_calc(std::vector<size_t>& offset, std::vector<size_t>& dims) {
int k = 1;
for (int i = offset.size() - 1; i >= 0; i--) {
@ -1541,8 +1586,8 @@ void MKLDNNEltwiseNode::executeReference(const std::vector<const uint8_t *>& src
size_t inputNum = src_ptrs.size();
std::shared_ptr<ref_eltwise_scalar_fwd_t> ref_eltwise_injector = nullptr;
if (eltwiseAlgorithm != mkldnn::algorithm_undef) {
ref_eltwise_injector = std::make_shared<ref_eltwise_scalar_fwd_t>(static_cast<mkldnn_alg_kind_t>(eltwiseAlgorithm), alpha, beta);
if (eltwiseAlgorithm != mkldnn::algorithm::undef) {
ref_eltwise_injector = std::make_shared<ref_eltwise_scalar_fwd_t>(static_cast<mkldnn_alg_kind_t>(eltwiseAlgorithm), alpha, beta, 1.f);
}
parallel_nt(0, [&](const int ithr, const int nthr) {
@ -1664,29 +1709,29 @@ bool MKLDNNEltwiseNode::canBeInPlace() const {
void MKLDNNEltwiseNode::appendPostOps(mkldnn::post_ops& ops) {
switch (getAlgorithm()) {
case mkldnn::eltwise_relu:
case mkldnn::eltwise_tanh:
case mkldnn::eltwise_elu:
case mkldnn::eltwise_square:
case mkldnn::eltwise_abs:
case mkldnn::eltwise_sqrt:
case mkldnn::eltwise_linear:
case mkldnn::eltwise_bounded_relu:
case mkldnn::eltwise_soft_relu:
case mkldnn::eltwise_logistic:
case mkldnn::eltwise_exp:
case mkldnn::eltwise_gelu:
case mkldnn::eltwise_clamp:
case mkldnn::eltwise_swish:
case mkldnn::eltwise_hswish:
case mkldnn::eltwise_mish:
case mkldnn::eltwise_hsigmoid:
case mkldnn::eltwise_round_half_to_even:
case mkldnn::eltwise_round_half_away_from_zero:
case mkldnn::algorithm::eltwise_relu:
case mkldnn::algorithm::eltwise_tanh:
case mkldnn::algorithm::eltwise_elu:
case mkldnn::algorithm::eltwise_square:
case mkldnn::algorithm::eltwise_abs:
case mkldnn::algorithm::eltwise_sqrt:
case mkldnn::algorithm::eltwise_linear:
case mkldnn::algorithm::eltwise_bounded_relu:
case mkldnn::algorithm::eltwise_soft_relu:
case mkldnn::algorithm::eltwise_logistic:
case mkldnn::algorithm::eltwise_exp:
case mkldnn::algorithm::eltwise_gelu:
case mkldnn::algorithm::eltwise_clip:
case mkldnn::algorithm::eltwise_swish:
case mkldnn::algorithm::eltwise_hswish:
case mkldnn::algorithm::eltwise_mish:
case mkldnn::algorithm::eltwise_hsigmoid:
case mkldnn::algorithm::eltwise_round_half_to_even:
case mkldnn::algorithm::eltwise_round_half_away_from_zero:
ops.append_eltwise(1.0, getAlgorithm(), getAlpha(), getBeta());
break;
case mkldnn::depthwise_scale_shift:
case mkldnn::depthwise_prelu:
case mkldnn::algorithm::depthwise_scale_shift:
case mkldnn::algorithm::depthwise_prelu:
if (scales.empty() && shifts.empty()) {
size_t bufferSize = static_cast<size_t>(outDims[0][outDims[0].size() > 1 ? 1 : 0]);
size_t bufferSizeAligned = rnd_up(bufferSize, 16);
@ -1742,7 +1787,7 @@ bool MKLDNNEltwiseNode::canFuse(const MKLDNNNodePtr& node) const {
return true;
};
if (!mayiuse(cpu::sse42))
if (!mayiuse(x64::sse41))
return false;
if (!isSuitableNode(this)) {

View File

@ -99,6 +99,8 @@ struct jit_uni_eltwise_kernel {
explicit jit_uni_eltwise_kernel(jit_eltwise_params jep, MKLDNNEltwiseNode& node) : ker_(nullptr), jep_(jep), eltwiseNode(node) {}
virtual ~jit_uni_eltwise_kernel() {}
virtual void create_ker() = 0;
jit_eltwise_params jep_;
MKLDNNEltwiseNode& eltwiseNode;
};
@ -111,6 +113,7 @@ public:
void getSupportedDescriptors() override;
void initSupportedPrimitiveDescriptors() override;
void selectOptimalPrimitiveDescriptor() override;
void initOptimalPrimitiveDescriptor() override;
void createPrimitive() override;
void execute(mkldnn::stream strm) override;
bool created() const override;
@ -136,7 +139,7 @@ private:
void init() override;
EltwiseOpType eltwiseOp = Add;
mkldnn::algorithm eltwiseAlgorithm = mkldnn::algorithm_undef;
mkldnn::algorithm eltwiseAlgorithm = mkldnn::algorithm::undef;
std::shared_ptr<jit_uni_eltwise_kernel> eltwise_kernel = nullptr;
jit_eltwise_params jep = {};

View File

@ -5,12 +5,13 @@
#include "mkldnn_fullyconnected_node.h"
#include "mkldnn_eltwise_node.h"
#include "mkldnn_quantize_node.h"
#include "desc_iterator.hpp"
#include <legacy/ie_layers.h>
#include <string>
#include <vector>
#include <mkldnn_extension_utils.h>
#include <mkldnn.hpp>
#include "utils/general_utils.h"
using namespace mkldnn;
using namespace MKLDNNPlugin;
@ -19,40 +20,33 @@ using namespace InferenceEngine;
MKLDNNFullyConnectedNode::MKLDNNFullyConnectedNode(const InferenceEngine::CNNLayerPtr& layer, const mkldnn::engine& eng, MKLDNNWeightsSharing::Ptr &cache)
: MKLDNNNode(layer, eng, cache), withBiases(false), baseInputsNumber(0) {
internalBlobDesc.emplace_back([&](primitive_desc_iterator &primitive_desc_it, size_t idx) -> MKLDNNMemoryDesc {
return MKLDNNMemoryDesc(primitive_desc_it.weights_primitive_desc(0).desc());
return MKLDNNMemoryDesc(primitive_desc_it.weights_desc(0));
});
internalBlobDesc.emplace_back([&](primitive_desc_iterator &primitive_desc_it, size_t idx) -> MKLDNNMemoryDesc {
if (internalBlobs.size() <= 1)
return MKLDNNMemoryDesc();
return MKLDNNMemoryDesc(primitive_desc_it.weights_primitive_desc(1).desc());
return MKLDNNMemoryDesc(primitive_desc_it.weights_desc(1));
});
auto ws = layer->blobs.find("w-scale");
if (ws != layer->blobs.end()) {
wScale = ws->second;
}
if (getCnnLayer()->type == "FullyConnected" || getCnnLayer()->type == "InnerProduct") {
baseInputsNumber = getCnnLayer().get()->insData.size();
}
}
// Trying to find oi-scale
if (getCnnLayer()->type == "FullyConnected" && getCnnLayer()->precision == Precision::I8) {
if (baseInputsNumber != 1) {
THROW_IE_EXCEPTION << "Unsupported number of inputs for quantized FullyConnected " << getCnnLayer()->name;
}
auto ois = layer->blobs.find("oi-scale");
if ((getCnnLayer()->outData[0]->getPrecision() == Precision::I8 || getCnnLayer()->outData[0]->getPrecision() == Precision::U8)
&& ois == layer->blobs.end()) {
THROW_IE_EXCEPTION << "Internal error of graph quantization - mismatch of intermediate scales and next layer type for fully connected "
<< getCnnLayer()->name;
}
if (ois != layer->blobs.end()) {
// If we can find an oi-scale, then the next layer has to be an INT8.
oScale = ois->second;
}
}
std::vector<memory::format_tag> MKLDNNFullyConnectedNode::getAvailableFormatsForDims(const MKLDNNDims &dims) const {
if (dims.ndims() == 0)
return {memory::format_tag::x};
else if (dims.ndims() == 1)
return {memory::format_tag::x};
else if (dims.ndims() == 2)
return {memory::format_tag::nc};
else if (dims.ndims() == 3)
return {memory::format_tag::tnc};
else if (dims.ndims() == 4)
return {memory::format_tag::nChw8c, memory::format_tag::nChw16c, memory::format_tag::nhwc, memory::format_tag::nchw};
else if (dims.ndims() == 5)
return {memory::format_tag::nCdhw8c, memory::format_tag::nCdhw16c, memory::format_tag::ndhwc, memory::format_tag::ncdhw};
return {memory::format_tag::any};
}
void MKLDNNFullyConnectedNode::getSupportedDescriptors() {
@ -64,8 +58,8 @@ void MKLDNNFullyConnectedNode::getSupportedDescriptors() {
precision = getCnnLayer()->outData[0]->getPrecision();
auto outputDataType = MKLDNNExtensionUtils::IEPrecisionToDataType(precision);
if (inputDataType == memory::f32) {
outputDataType = memory::f32;
if (inputDataType == memory::data_type::f32) {
outputDataType = memory::data_type::f32;
}
if (baseInputsNumber > 1) {
@ -77,10 +71,10 @@ void MKLDNNFullyConnectedNode::getSupportedDescriptors() {
}
auto weightsDataType = MKLDNNExtensionUtils::IEPrecisionToDataType(getCnnLayer()->insData[1].lock()->getPrecision());
// TODO(amalyse) in which cases do we have non-i8 weights and have to override the precisions?
if (((inputDataType != memory::u8 && inputDataType != memory::s8) || weightsDataType != memory::s8) && inputDataType != memory::bf16) {
inputDataType = memory::f32;
outputDataType = memory::f32;
if ((!one_of(inputDataType , memory::data_type::u8, memory::data_type::s8) || weightsDataType != memory::data_type::s8) &&
inputDataType != memory::data_type::bf16) {
inputDataType = memory::data_type::f32;
outputDataType = memory::data_type::f32;
}
}
@ -99,66 +93,36 @@ void MKLDNNFullyConnectedNode::getSupportedDescriptors() {
if (getChildEdges().empty())
THROW_IE_EXCEPTION << "Incorrect number of output edges for layer " << getName();
MKLDNNDims inDims(fcLayer->input()->getDims());
MKLDNNDims outDims(fcLayer->outData[0]->getDims());
MKLDNNDims inDims = getParentEdgeAt(0)->getDims();
MKLDNNDims outDims = getChildEdgeAt(0)->getDims();
if (inDims.ndims() == 2) {
weightsDims = {fcLayer->_out_num, static_cast<size_t>(inDims[1])};
} else if (inDims.ndims() == 3) {
weightsDims = {static_cast<size_t>(outDims[2]), static_cast<size_t>(inDims[2])};
} else if (inDims.ndims() == 4) {
weightsDims = {fcLayer->_out_num, static_cast<size_t>(inDims[1]), static_cast<size_t>(inDims[2]),
static_cast<size_t>(inDims[3])};
} else if (inDims.ndims() == 5) {
weightsDims = {fcLayer->_out_num, static_cast<size_t>(inDims[1]), static_cast<size_t>(inDims[2]),
static_cast<size_t>(inDims[3]), static_cast<size_t>(inDims[4])};
} else {
THROW_IE_EXCEPTION << "Unsupported source format for FC layer. Expected 5, 4 or 2, got: "
if (!one_of(inDims.ndims(), 2, 3, 4, 5)) {
THROW_IE_EXCEPTION << "Unsupported source format for FC layer. Expected 5, 4, 3 or 2, got: "
<< inDims.ndims() << " dims.";
}
if (inDims.ndims() == 3) {
weightsDims = InferenceEngine::SizeVector({static_cast<size_t>(outDims[2]), static_cast<size_t>(inDims[2])});
} else {
weightsDims.push_back(outDims[1]);
for (int i = 1; i < inDims.ndims(); i++)
weightsDims.push_back(inDims[i]);
}
biasesDims.push_back(weightsDims[0]);
if (baseInputsNumber == 1) {
internalBlobs.push_back(createInternalBlob(weightsDims, true));
}
withBiases = (fcLayer->_biases != nullptr && fcLayer->_biases->size() != 0) || baseInputsNumber == 3;
if (inDims.ndims() == 3) {
biasesDims.push_back(static_cast<int>(outDims[2]));
} else {
biasesDims.push_back(static_cast<int>(fcLayer->_out_num));
}
if (withBiases && baseInputsNumber == 1) {
internalBlobs.push_back(createInternalBlob(biasesDims, false));
}
if (this->getCnnLayer()->blobs.find("weights") != this->getCnnLayer()->blobs.end()) {
Blob::Ptr weights = this->getCnnLayer()->blobs.find("weights")->second;
if (weights->getTensorDesc().getPrecision() == Precision::I8) {
// The weights blob has incorrect dims, so we have to fix it
TensorDesc wdesc = internalBlobs[0]->getTensorDesc();
wdesc.setPrecision(Precision::I8);
InferenceEngine::TBlob<int8_t>::Ptr reshapedInt8Weights =
InferenceEngine::TBlob<int8_t>::Ptr(
new InferenceEngine::TBlob<int8_t>(wdesc, static_cast<int8_t *>(weights->buffer()),
weights->byteSize()));
internalBlobs[0] = reshapedInt8Weights;
if (withBiases) {
Blob::Ptr biases = this->getCnnLayer()->blobs.find("biases")->second;
TensorDesc bdesc = internalBlobs[1]->getTensorDesc();
bdesc.setPrecision(Precision::I32);
InferenceEngine::TBlob<int32_t>::Ptr reshapedInt32Biases =
InferenceEngine::TBlob<int32_t>::Ptr(
new InferenceEngine::TBlob<int32_t>(bdesc, static_cast<int32_t *>(biases->buffer()),
biases->byteSize()));
internalBlobs[1] = reshapedInt32Biases;
}
}
}
for (auto format : getAvailableFormatsForDims(getParentEdgeAt(0)->getDims())) {
for (auto format : getAvailableFormatsForDims(inDims)) {
MKLDNNMemoryDesc in_candidate(inDims, inputDataType, format);
MKLDNNMemoryDesc out_candidate(getChildEdgeAt(0)->getDims(), outputDataType, memory::any);
MKLDNNMemoryDesc out_candidate(outDims, outputDataType, memory::format_tag::any);
createDescriptor({in_candidate}, {out_candidate});
}
@ -173,17 +137,36 @@ void MKLDNNFullyConnectedNode::createPrimitive() {
prim_desc = std::make_shared<inner_product_forward::primitive_desc>(
createPrimitiveDescriptor<inner_product_forward::primitive_desc, inner_product_forward::desc>(*attr));
if (withBiases) {
prim.reset(new inner_product_forward(*prim_desc,
getParentEdgeAt(0)->getMemory().GetPrimitive(),
getWeights(),
getBias(),
getChildEdgeAt(0)->getMemory().GetPrimitive()));
} else {
prim.reset(new inner_product_forward(*prim_desc,
getParentEdgeAt(0)->getMemory().GetPrimitive(),
getWeights(),
getChildEdgeAt(0)->getMemory().GetPrimitive()));
prim.reset(new inner_product_forward(*prim_desc));
auto src = getParentEdgesAtPort(0)[0]->getMemoryPtr()->GetPrimitive();
auto dst = getChildEdgesAtPort(0)[0]->getMemoryPtr()->GetPrimitive();
if (withBiases)
primArgs = {{DNNL_ARG_SRC, src}, {DNNL_ARG_WEIGHTS, getWeights()}, {DNNL_ARG_BIAS, getBias()}, {DNNL_ARG_DST, dst}};
else
primArgs = {{DNNL_ARG_SRC, src}, {DNNL_ARG_WEIGHTS, getWeights()}, {DNNL_ARG_DST, dst}};
}
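// Execution-model sketch under v1.6 (assuming memory objects src, weights,
// dst and a stream strm already exist): primitives are now constructed from
// the primitive_desc alone and receive their memories at execution time
// through a map keyed by DNNL_ARG_* tags.
//   mkldnn::inner_product_forward ip(*prim_desc);
//   std::unordered_map<int, mkldnn::memory> args = {
//       {DNNL_ARG_SRC, src}, {DNNL_ARG_WEIGHTS, weights}, {DNNL_ARG_DST, dst}};
//   ip.execute(strm, args);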
void MKLDNNFullyConnectedNode::execute(mkldnn::stream strm) {
if (prim) {
auto reshapeMemory = [this](int argType) {
auto param = primArgs.find(argType);
if (param != primArgs.end()) {
auto oldMem = param->second;
auto dims = oldMem.get_desc().dims();
if (dims.size() == 3) {
MKLDNNDims normalizedDims({static_cast<ptrdiff_t>(dims[0] * dims[1]), static_cast<ptrdiff_t>(dims[2])});
mkldnn::memory::desc newMemDesc(oldMem.get_desc().reshape(normalizedDims));
mkldnn::memory newMem(newMemDesc, oldMem.get_engine(), oldMem.get_data_handle());
primArgs.at(argType) = newMem;
}
}
};
reshapeMemory(DNNL_ARG_SRC);
reshapeMemory(DNNL_ARG_DST);
(*prim).execute(strm, primArgs);
}
}
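// Sketch of the zero-copy 3D -> 2D flattening used by reshapeMemory above,
// assuming a 3D mkldnn::memory `oldMem`: reshape() yields a new descriptor
// over the same underlying buffer, so no data moves.
//   auto d = oldMem.get_desc().dims();                         // {B, T, C}
//   auto flat = oldMem.get_desc().reshape({d[0] * d[1], d[2]}); // {B*T, C}
//   mkldnn::memory flatMem(flat, oldMem.get_engine(), oldMem.get_data_handle());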
@ -206,35 +189,35 @@ void MKLDNNFullyConnectedNode::setPostOps(mkldnn::primitive_attr &attr, bool ini
MKLDNNDims depthwiseDims({static_cast<ptrdiff_t>(rnd_up(ndims == 3 ? getChildEdgeAt(0)->getDims()[2] : getChildEdgeAt(0)->getDims()[1], 16))});
PostOpsIntBlobMemory.push_back(MKLDNNMemoryPtr(new MKLDNNMemory(getEngine())));
PostOpsIntBlobMemory[blob_idx]->Create(depthwiseDims, memory::data_type::f32, memory::format::x);
PostOpsIntBlobMemory[blob_idx]->Create(depthwiseDims, memory::data_type::f32, memory::format_tag::x);
PostOpsIntBlobMemory[blob_idx]->FillZero();
// In case ndims == 3, the graph optimizer allows fusing only if all weight values are the same
if (depthwiseLayer->blobs["weights"]->size() == 1 || ndims == 3) {
float broadcastValue = static_cast<float *>(depthwiseLayer->_weights->buffer())[0];
for (int i = 0; i < PostOpsIntBlobMemory[blob_idx]->GetPrimitiveDescriptor().desc().data.dims[0]; i++) {
for (int i = 0; i < PostOpsIntBlobMemory[blob_idx]->GetDesc().getDims()[0]; i++) {
static_cast<float *>(PostOpsIntBlobMemory[blob_idx]->GetData())[i] = broadcastValue;
}
} else {
PostOpsIntBlobMemory[blob_idx]->SetData(memory::data_type::f32, memory::x,
PostOpsIntBlobMemory[blob_idx]->SetData(memory::data_type::f32, memory::format_tag::x,
depthwiseLayer->_weights->buffer(),
depthwiseLayer->_weights->size() *
MKLDNNExtensionUtils::sizeOfDataType(memory::data_type::f32));
}
if (eltwiseNode->getAlgorithm() == depthwise_scale_shift) {
if (eltwiseNode->getAlgorithm() == algorithm::depthwise_scale_shift) {
PostOpsIntBlobMemory.push_back(MKLDNNMemoryPtr(new MKLDNNMemory(getEngine())));
PostOpsIntBlobMemory[blob_idx + 1]->Create(depthwiseDims, memory::data_type::f32, memory::format::x);
PostOpsIntBlobMemory[blob_idx + 1]->Create(depthwiseDims, memory::data_type::f32, memory::format_tag::x);
PostOpsIntBlobMemory[blob_idx + 1]->FillZero();
// In case ndims == 3, the graph optimizer allows fusing only if all bias values are the same
if (depthwiseLayer->blobs["biases"]->size() == 1 || ndims == 3) {
float broadcastValue = static_cast<float *>(depthwiseLayer->_biases->buffer())[0];
for (int i = 0; i < PostOpsIntBlobMemory[blob_idx + 1]->GetPrimitiveDescriptor().desc().data.dims[0]; i++) {
for (int i = 0; i < PostOpsIntBlobMemory[blob_idx + 1]->GetDesc().getDims()[0]; i++) {
static_cast<float *>(PostOpsIntBlobMemory[blob_idx + 1]->GetData())[i] = broadcastValue;
}
} else {
PostOpsIntBlobMemory[blob_idx + 1]->SetData(memory::data_type::f32, memory::x,
PostOpsIntBlobMemory[blob_idx + 1]->SetData(memory::data_type::f32, memory::format_tag::x,
depthwiseLayer->_biases->buffer(),
depthwiseLayer->_biases->size() *
MKLDNNExtensionUtils::sizeOfDataType(memory::data_type::f32));
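// Summary of the mechanical API deltas this post-ops hunk relies on:
//   memory::format::x                                  -> memory::format_tag::x
//   attr.post_ops_.len_ (field)                        -> attr.post_ops_.len() (accessor)
//   mem->GetPrimitiveDescriptor().desc().data.dims[0]  -> mem->GetDesc().getDims()[0]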
@ -273,31 +256,6 @@ bool MKLDNNFullyConnectedNode::created() const {
return getType() == FullyConnected;
}
memory::format MKLDNNFullyConnectedNode::weightsFormatForSrcFormat(memory::format sourceFormat) {
switch (sourceFormat) {
case memory::format::x:
return memory::format::x;
case memory::format::nc:
case memory::format::tnc:
case memory::format::ntc:
return memory::format::oi;
case memory::format::nchw:
return memory::format::oihw;
case memory::format::ncdhw:
return memory::format::oidhw;
case memory::format::nChw8c:
return memory::format::oIhw8i;
case memory::format::nCdhw8c:
return memory::format::oIdhw8i;
case memory::format::nChw16c:
return memory::format::oIhw16i;
case memory::format::nCdhw16c:
return memory::format::oIdhw16i;
default:
THROW_IE_EXCEPTION << "Unsupported source format for node " << getName();
}
}
const std::vector<impl_desc_type>& MKLDNNFullyConnectedNode::getPrimitivesPriority() {
std::vector<impl_desc_type> priorities = {
impl_desc_type::unknown,
@ -335,25 +293,6 @@ const std::vector<impl_desc_type>& MKLDNNFullyConnectedNode::getPrimitivesPriori
std::shared_ptr<mkldnn::primitive_attr> MKLDNNFullyConnectedNode::initPrimitiveAttr() {
auto attr = std::make_shared<mkldnn::primitive_attr>(mkldnn::primitive_attr());
if (wScale != nullptr) {
float* wScaleData = static_cast<float*>(wScale->buffer());
std::vector<float> oScaleDataVector;
if (getCnnLayer()->precision == Precision::I8 && getCnnLayer()->outData[0]->getPrecision() != Precision::FP32) {
float *oScaleData = static_cast<float *>(oScale->buffer());
for (size_t c = 0; c < wScale->size(); c++) {
oScaleDataVector.push_back(wScaleData[c] / oScaleData[c]);
}
} else {
for (size_t c = 0; c < wScale->size(); c++) {
oScaleDataVector.push_back(wScaleData[c]);
}
}
attr->set_int_output_round_mode(mkldnn::round_nearest);
attr->set_output_scales(1 << 1 /*through C dim*/, oScaleDataVector);
}
setPostOps(*attr, true);
@ -363,47 +302,31 @@ std::shared_ptr<mkldnn::primitive_attr> MKLDNNFullyConnectedNode::initPrimitiveA
void MKLDNNFullyConnectedNode::createDescriptor(const std::vector<InferenceEngine::TensorDesc> &inputDesc,
const std::vector<InferenceEngine::TensorDesc> &outputDesc) {
TensorDesc inDesc = inputDesc[0], outDesc = outputDesc[0];
mkldnn::memory::data_type wdt = MKLDNNExtensionUtils::IEPrecisionToDataType(inDesc.getPrecision());
mkldnn::memory::data_type bdt = MKLDNNExtensionUtils::IEPrecisionToDataType(inDesc.getPrecision());
if (inDesc.getPrecision() == Precision::BF16) {
bdt = mkldnn::memory::data_type::f32;
} else if (inDesc.getPrecision() == Precision::U8 || inDesc.getPrecision() == Precision::I8) {
wdt = memory::data_type::s8;
bdt = baseInputsNumber == 3 ? MKLDNNExtensionUtils::IEPrecisionToDataType(getCnnLayer()->insData[2].lock()->getPrecision()) : memory::data_type::f32;
}
if (inDesc.getPrecision() == Precision::U8 || inDesc.getPrecision() == Precision::I8) {
wdt = memory::s8;
bdt = baseInputsNumber == 3 ? MKLDNNExtensionUtils::IEPrecisionToDataType(getCnnLayer()->insData[2].lock()->getPrecision()) : memory::f32;
}
if (this->getCnnLayer()->blobs.find("weights") != this->getCnnLayer()->blobs.end()) {
Blob::Ptr weights = this->getCnnLayer()->blobs.find("weights")->second;
if (weights->getTensorDesc().getPrecision() == Precision::I8) {
wdt = memory::s8;
bdt = memory::s32;
Precision outPrec;
if (getCnnLayer()->outData[0]->getPrecision() == Precision::FP32) {
outPrec = Precision::FP32;
} else {
// define the output precision according to the normalizer
// TODO(amalyshe) do we need a separate flow for the last node in an int8 chain or not?
outPrec = outDesc.getPrecision();
}
inDesc = TensorDesc(inDesc.getPrecision(), inputDesc[0].getDims(), inputDesc[0].getBlockingDesc());
outDesc = TensorDesc(outPrec, outputDesc[0].getDims(), Layout::NC/*, outputDesc[0].getBlockingDesc()*/);
}
if (inDesc.getDims().size() == 3) {
auto inDims = inDesc.getDims();
auto outDims = outDesc.getDims();
InferenceEngine::SizeVector normalizedInDims = {inDims[0] * inDims[1], inDims[2]};
InferenceEngine::SizeVector normalizedOutDims = {outDims[0] * outDims[1], outDims[2]};
inDesc = InferenceEngine::TensorDesc(inDesc.getPrecision(), normalizedInDims, TensorDesc::getLayoutByDims(normalizedInDims));
outDesc = InferenceEngine::TensorDesc(outDesc.getPrecision(), normalizedOutDims, TensorDesc::getLayoutByDims(normalizedOutDims));
}
MKLDNNMemoryDesc in_candidate(inDesc);
MKLDNNMemoryDesc out_candidate(outDesc);
memory::format weights_fmt = weightsFormatForSrcFormat(in_candidate.getFormat());
MKLDNNMemoryDesc wgh_candidate(MKLDNNDims(weightsDims), wdt, weights_fmt);
MKLDNNMemoryDesc wgh_candidate(MKLDNNDims(weightsDims), wdt, mkldnn::memory::format_tag::any);
if (withBiases) {
MKLDNNMemoryDesc bias_candidate(MKLDNNDims(biasesDims), bdt, memory::any);
MKLDNNMemoryDesc bias_candidate(MKLDNNDims(biasesDims), bdt, memory::format_tag::any);
MKLDNNDescriptor desc(std::shared_ptr<inner_product_forward::desc>(
new inner_product_forward::desc(prop_kind::forward_scoring, in_candidate, wgh_candidate,
bias_candidate, out_candidate)));
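// Using memory::format_tag::any for the weights (and bias) descriptors
// lets the selected inner_product implementation choose its preferred
// weights layout; the graph inserts a reorder from the original layout if
// they differ. This replaces the removed weightsFormatForSrcFormat()
// lookup table.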
@ -417,17 +340,39 @@ void MKLDNNFullyConnectedNode::createDescriptor(const std::vector<InferenceEngin
}
MKLDNNMemoryDesc MKLDNNFullyConnectedNode::getSrcMemDesc(mkldnn::primitive_desc_iterator &primitive_desc_it, size_t idx) {
InferenceEngine::TensorDesc desc = idx > 0 ? MKLDNNMemoryDesc(primitive_desc_it.weights_primitive_desc(idx - 1).desc())
: MKLDNNMemoryDesc(primitive_desc_it.src_primitive_desc(idx).desc());
InferenceEngine::TensorDesc desc = idx > 0 ? MKLDNNMemoryDesc(primitive_desc_it.weights_desc(idx - 1))
: MKLDNNMemoryDesc(primitive_desc_it.src_desc(idx));
if (desc.getLayout() == InferenceEngine::Layout::ANY)
if (desc.getLayout() == InferenceEngine::Layout::ANY) {
return MKLDNNMemoryDesc(InferenceEngine::TensorDesc(desc.getPrecision(),
getParentEdgeAt(idx)->getDims().ToSizeVector(),
desc.getLayout()));
else
} else if (getParentEdgeAt(idx)->getDims().ndims() == 3) {
return MKLDNNMemoryDesc(InferenceEngine::TensorDesc(desc.getPrecision(),
getParentEdgeAt(idx)->getDims().ToSizeVector(),
TensorDesc::getLayoutByDims(getParentEdgeAt(idx)->getDims().ToSizeVector())));
} else {
return MKLDNNMemoryDesc(InferenceEngine::TensorDesc(desc.getPrecision(),
getParentEdgeAt(idx)->getDims().ToSizeVector(),
desc.getBlockingDesc()));
}
}
MKLDNNMemoryDesc MKLDNNFullyConnectedNode::getDstMemDesc(mkldnn::primitive_desc_iterator &primitive_desc_it, size_t idx) {
InferenceEngine::TensorDesc desc = MKLDNNMemoryDesc(primitive_desc_it.dst_desc(idx));
if (desc.getLayout() == InferenceEngine::Layout::ANY) {
return MKLDNNMemoryDesc(InferenceEngine::TensorDesc(desc.getPrecision(),
getChildEdgeAt(idx)->getDims().ToSizeVector(),
desc.getLayout()));
} else if (getChildEdgeAt(idx)->getDims().ndims() == 3) {
return MKLDNNMemoryDesc(InferenceEngine::TensorDesc(desc.getPrecision(),
getChildEdgeAt(idx)->getDims().ToSizeVector(),
TensorDesc::getLayoutByDims(getChildEdgeAt(idx)->getDims().ToSizeVector())));
} else {
return MKLDNNMemoryDesc(InferenceEngine::TensorDesc(desc.getPrecision(),
getChildEdgeAt(idx)->getDims().ToSizeVector(),
desc.getBlockingDesc()));
}
}
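// For 3D edges the primitive was created on 2D-flattened shapes, so the
// descriptor queried from it cannot be used verbatim; both helpers above
// rebuild a plain layout for the original 3D dims via getLayoutByDims().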
const mkldnn::memory& MKLDNNFullyConnectedNode::getWeights() const {

View File

@ -17,9 +17,12 @@ public:
MKLDNNFullyConnectedNode(const InferenceEngine::CNNLayerPtr& layer, const mkldnn::engine& eng, MKLDNNWeightsSharing::Ptr &cache);
~MKLDNNFullyConnectedNode() override = default;
std::vector<mkldnn::memory::format_tag> getAvailableFormatsForDims(const MKLDNNDims &dims) const override;
void getSupportedDescriptors() override;
void createPrimitive() override;
void execute(mkldnn::stream strm) override;
bool created() const override;
bool canBeInPlace() const override {
return false;
}
@ -33,6 +36,7 @@ public:
}
MKLDNNMemoryDesc getSrcMemDesc(mkldnn::primitive_desc_iterator &primitive_desc_it, size_t idx) override;
MKLDNNMemoryDesc getDstMemDesc(mkldnn::primitive_desc_iterator &primitive_desc_it, size_t idx) override;
const mkldnn::memory& getWeights() const;
const mkldnn::memory& getBias() const;
@ -45,13 +49,10 @@ protected:
private:
InferenceEngine::SizeVector weightsDims;
InferenceEngine::SizeVector biasesDims;
mkldnn::memory::format weightsFormatForSrcFormat(mkldnn::memory::format sourceFormat);
std::vector<MKLDNNMemoryPtr> PostOpsIntBlobMemory;
void setPostOps(mkldnn::primitive_attr &attr, bool initWeights);
InferenceEngine::Blob::Ptr wScale, oScale;
bool withBiases;
int baseInputsNumber;
};

View File

@ -200,7 +200,7 @@ inline void process_gemm(char transa, char transb, int M, int N, int K, float al
inline void process_gemm(char transa, char transb, int M, int N, int K, float alpha, const uint16_t *A, int lda,
const uint16_t *B, int ldb, float beta, float *C, int ldc) {
mkldnn_gemm_bf16bf16f32(transa, transb, M, N, K, alpha, A, lda, B, ldb, beta, C, ldc);
dnnl_gemm_bf16bf16f32(transa, transb, M, N, K, alpha, A, lda, B, ldb, beta, C, ldc);
}
inline void process_gemm(char transa, char transb, int M, int N, int K, float alpha, const uint8_t *A, int lda,
@ -231,13 +231,11 @@ void MKLDNNGemmNode::process_data() {
auto& srcMemory0 = getParentEdgeAt(0)->getMemory();
auto& srcMemory1 = getParentEdgeAt(1)->getMemory();
auto& dstMemory0 = getChildEdgeAt(0)->getMemory();
const T0 *src0_ptr = reinterpret_cast<const T0*>(srcMemory0.GetData()) +
srcMemory0.GetDescriptor().data.layout_desc.blocking.offset_padding;
const T1 *src1_ptr = reinterpret_cast<const T1*>(srcMemory1.GetData()) +
srcMemory1.GetDescriptor().data.layout_desc.blocking.offset_padding;
float *dst_ptr = reinterpret_cast<float*>(getChildEdgeAt(0)->getMemory().GetData()) +
getChildEdgeAt(0)->getMemory().GetDescriptor().data.layout_desc.blocking.offset_padding;
const T0 *src0_ptr = reinterpret_cast<const T0*>(srcMemory0.GetPtr());
const T1 *src1_ptr = reinterpret_cast<const T1*>(srcMemory1.GetPtr());
float *dst_ptr = reinterpret_cast<float*>(dstMemory0.GetPtr());
int MB1 = outDims.ndims() == 4 ? batchToProcess() : 1;
int MB2 = outDims.ndims() == 3 ? batchToProcess() : outDims.ndims() > 3 ? outDims[outDims.ndims() - 3] : 1;
@ -255,8 +253,7 @@ void MKLDNNGemmNode::process_data() {
const float *src2_ptr;
if (isThreeInputs) {
auto& srcMemory2 = getParentEdgeAt(2)->getMemory();
src2_ptr = reinterpret_cast<const float *>(srcMemory2.GetData()) +
srcMemory2.GetDescriptor().data.layout_desc.blocking.offset_padding;
src2_ptr = reinterpret_cast<const float *>(srcMemory2.GetPtr());
} else {
src2_ptr = dst_ptr;
}
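// Pointer-helper sketch (plugin-side), assuming an MKLDNNMemory `mem` and
// an element size `esz`: GetPtr() folds in the padding offset that v0.x
// code had to add by hand.
//   v0.x: auto p = static_cast<uint8_t*>(mem.GetData())
//             + mem.GetDescriptor().data.layout_desc.blocking.offset_padding * esz;
//   v1.6: auto p = static_cast<uint8_t*>(mem.GetPtr());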

View File

@ -55,12 +55,7 @@ void MKLDNNGenericNode::initSupportedPrimitiveDescriptors() {
}
for (auto& config : configs) {
std::vector<memory::format> outFormats;
for (auto& outConfig : config.outConfs) {
outFormats.push_back(MKLDNNMemory::Convert(outConfig.desc.getLayout()));
}
supportedPrimitiveDescriptors.emplace_back(config, impl_desc_type::unknown, outFormats);
supportedPrimitiveDescriptors.emplace_back(config, impl_desc_type::unknown);
}
}
if (impls.empty()) {

View File

@ -47,33 +47,30 @@ void MKLDNNInputNode::initSupportedPrimitiveDescriptors() {
InferenceEngine::LayerConfig config;
config.dynBatchSupport = true;
memory::format outFormat = mkldnn::memory::format_undef;
if (getType() == Input || getType() == MemoryInput) {
precision = getCnnLayer()->outData[0]->getPrecision();
if (precision == InferenceEngine::Precision::U16 || isMeanImage) {
precision = InferenceEngine::Precision::FP32;
}
auto outputDataType = MKLDNNExtensionUtils::IEPrecisionToDataType(precision);
InferenceEngine::DataConfig dataConfig;
dataConfig.inPlace = -1;
dataConfig.constant = false;
outFormat = MKLDNNMemory::Convert(getCnnLayer()->outData[0]->getLayout());
dataConfig.desc = MKLDNNMemoryDesc(getChildEdgeAt(0)->getDims(), outputDataType, outFormat);
auto mem_tdesc = MKLDNNMemoryDesc(getCnnLayer()->outData[0]->getTensorDesc());
dataConfig.desc = mem_tdesc;
config.outConfs.push_back(dataConfig);
} else if (getType() == Output) {
precision = getCnnLayer()->insData[0].lock()->getPrecision();
if (precision == InferenceEngine::Precision::U16) precision = InferenceEngine::Precision::FP32;
auto inputDataType = MKLDNNExtensionUtils::IEPrecisionToDataType(precision);
InferenceEngine::DataConfig dataConfig;
dataConfig.inPlace = -1;
dataConfig.constant = false;
outFormat = MKLDNNMemory::Convert(getCnnLayer()->insData[0].lock()->getLayout());
dataConfig.desc = MKLDNNMemoryDesc(getParentEdgeAt(0)->getDims(), inputDataType, outFormat);
auto mem_tdesc = MKLDNNMemoryDesc(getCnnLayer()->insData[0].lock()->getTensorDesc());
dataConfig.desc = mem_tdesc;
config.inConfs.push_back(dataConfig);
}
supportedPrimitiveDescriptors.emplace_back(config, impl_desc_type::unknown, outFormat);
supportedPrimitiveDescriptors.emplace_back(config, impl_desc_type::unknown);
}
void MKLDNNInputNode::createPrimitive() {
@ -145,17 +142,28 @@ void MKLDNNInputNode::execute(mkldnn::stream strm) {
return;
auto dstBlob = getChildEdgeAt(0)->getBlob();
if (constBlob->size() != dstBlob->size()) {
THROW_IE_EXCEPTION << "Incorrect blob sizes for node " << getName();
}
if (constBlob->getTensorDesc() == dstBlob->getTensorDesc()
|| isCompatibleTensors(constBlob->getTensorDesc(), dstBlob->getTensorDesc())) {
const int8_t *srcData = constBlob->cbuffer().as<int8_t *>();
int8_t *dstData = dstBlob->buffer();
cpu_memcpy_s(dstData, dstBlob->byteSize(), srcData, constBlob->byteSize());
} else if (constBlob->getTensorDesc().getPrecision() == InferenceEngine::Precision::BIN ||
dstBlob->getTensorDesc().getPrecision() == InferenceEngine::Precision::BIN) {
size_t dstSize = dstBlob->size() / 8;
if (constBlob->size() != dstSize) {
THROW_IE_EXCEPTION << "Incorrect blob sizes for node " << getName();
}
const int8_t *srcData = constBlob->cbuffer().as<int8_t *>();
int8_t *dstData = dstBlob->buffer();
cpu_memcpy_s(dstData, dstSize, srcData, constBlob->byteSize());
} else {
if (constBlob->size() != dstBlob->size()) {
THROW_IE_EXCEPTION << "Incorrect blob sizes for node " << getName();
}
switch (precision.size()) {
case 1: {
const int8_t *srcData = constBlob->cbuffer().as<int8_t *>();

View File

@ -3,7 +3,7 @@
//
#include "mkldnn_interpolate_node.h"
#include "desc_iterator.hpp"
#include "mkldnn_quantize_node.h"
#include <legacy/ie_layers.h>
#include "mkldnn_eltwise_node.h"
@ -16,10 +16,11 @@
#include "ie_parallel.hpp"
#include <algorithm>
#include "jit_generator.hpp"
#include "jit_uni_eltwise.hpp"
#include "jit_uni_depthwise.hpp"
#include "jit_uni_quantization.hpp"
#include <cpu/x64/jit_generator.hpp>
#include <cpu/x64/jit_uni_eltwise.hpp>
#include <cpu/x64/jit_uni_depthwise_injector.hpp>
#include <cpu/x64/jit_uni_quantization_injector.hpp>
#include <cpu/x64/jit_uni_eltwise_injector.hpp>
#include "common/cpu_memcpy.h"
#include "utils/bfloat16.hpp"
@ -28,6 +29,7 @@ using namespace MKLDNNPlugin;
using namespace InferenceEngine;
using namespace mkldnn::impl;
using namespace mkldnn::impl::cpu;
using namespace mkldnn::impl::cpu::x64;
using namespace mkldnn::impl::utils;
using namespace Xbyak;
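// ISA naming after the migration: the CPU traits moved under
// mkldnn::impl::cpu::x64, and the sse42 enumerator became sse41 (the
// checks genuinely require SSE4.1), e.g.:
//   v0.x: mayiuse(cpu::sse42)   ->   v1.6: mayiuse(cpu::x64::sse41)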
@ -39,16 +41,24 @@ struct jit_uni_interpolate_kernel_f32 : public jit_uni_interpolate_kernel, publi
DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_uni_interpolate_kernel_f32)
explicit jit_uni_interpolate_kernel_f32(jit_interpolate_config_params jcp, const mkldnn_primitive_attr &attr)
: jit_uni_interpolate_kernel(jcp, attr), jit_generator() {
: jit_uni_interpolate_kernel(jcp, attr), jit_generator() {}
void create_ker() override {
jit_generator::create_kernel();
ker_ = (decltype(ker_))jit_ker();
}
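// In oneDNN v1.6 jit_generator splits kernel description from compilation:
// generate() only emits the instruction stream, create_kernel() assembles
// it, and jit_ker() exposes the entry point. This replaces the old
// `ker_ = (decltype(ker_)) this->getCode();` tail at the end of generate().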
void generate() override {
const auto &p = attr_.post_ops_;
for (int i = 0; i < p.len_; i++) {
for (int i = 0; i < p.len(); i++) {
auto &post_op = p.entry_[i];
if (post_op.is_eltwise()) {
eltwise_injectors.push_back(std::make_shared<jit_uni_eltwise_injector_f32<isa>>(
this,
post_op.eltwise.alg,
post_op.eltwise.alpha,
post_op.eltwise.beta));
post_op.eltwise.beta,
1));
} else if (post_op.is_depthwise()) {
depthwise_injectors.push_back(std::make_shared<jit_uni_depthwise_injector_f32<isa>>(
this,
@ -64,9 +74,9 @@ struct jit_uni_interpolate_kernel_f32 : public jit_uni_interpolate_kernel, publi
this->preamble();
if (attr_.post_ops_.len_ != 0)
if (attr_.post_ops_.len() != 0)
mov(reg_oc_off, ptr[reg_params + GET_OFF(oc_off)]);
if (isa == cpu::avx512_common)
if (isa == cpu::x64::avx512_common)
uni_vpxor(vmm_zero, vmm_zero, vmm_zero);
switch (jcp_.mode) {
@ -145,12 +155,10 @@ struct jit_uni_interpolate_kernel_f32 : public jit_uni_interpolate_kernel, publi
if ((jcp_.mode == InterpolateMode::cubic) && (jcp_.layout == InterpolateLayoutType::planar)) {
prepare_cubic_planar_table();
}
ker_ = (decltype(ker_)) this->getCode();
}
private:
using Vmm = typename conditional3<isa == cpu::sse42, Xbyak::Xmm, isa == cpu::avx2,
using Vmm = typename conditional3<isa == cpu::x64::sse41, Xbyak::Xmm, isa == cpu::x64::avx2,
Xbyak::Ymm, Xbyak::Zmm>::type;
const int vlen = cpu_isa_traits<isa>::vlen;
@ -282,7 +290,7 @@ private:
uni_vmovdqu(vmm_index, ptr[reg_index]);
uni_vpcmpeqd(vmm_mask, vmm_mask, vmm_mask);
vgatherdps(vmm_val, ptr[reg_src_h + vmm_index], vmm_mask);
if (attr_.post_ops_.len_ != 0)
if (attr_.post_ops_.len() != 0)
apply_post_ops(jcp_.dst_dt, 1);
store_vector(ptr[reg_dst], vmm_val, jcp_.dst_dt);
@ -305,7 +313,7 @@ private:
add(reg_src_aux, reg_index_offset);
load_scalar(xmm_val, ptr[reg_src_aux], jcp_.src_dt);
if (attr_.post_ops_.len_ != 0)
if (attr_.post_ops_.len() != 0)
apply_post_ops(jcp_.dst_dt, 1);
store_scalar(ptr[reg_dst], xmm_val, jcp_.dst_dt);
@ -328,7 +336,7 @@ private:
void nn_blk() {
int step = vlen / sizeof(float);
if (isa == cpu::sse42)
if (isa == cpu::x64::sse41)
step *= 2;
Xbyak::Label nn_loop_label;
@ -343,15 +351,15 @@ private:
add(reg_src_aux, reg_index_offset);
load_vector(vmm_val, ptr[reg_src_aux], jcp_.src_dt);
if (attr_.post_ops_.len_ != 0)
if (attr_.post_ops_.len() != 0)
apply_post_ops(jcp_.dst_dt, 0);
store_vector(ptr[reg_dst], vmm_val, jcp_.dst_dt);
if (isa == cpu::sse42) {
if (isa == cpu::x64::sse41) {
int sse42_offset = 4;
add(reg_src_aux, sse42_offset * jcp_.src_data_size);
load_vector(vmm_val, ptr[reg_src_aux], jcp_.src_dt);
if (attr_.post_ops_.len_ != 0) {
if (attr_.post_ops_.len() != 0) {
add(reg_oc_off, sse42_offset * sizeof(float));
apply_post_ops(jcp_.dst_dt, 0);
sub(reg_oc_off, sse42_offset * sizeof(float));
@ -398,7 +406,7 @@ private:
add(reg_src_aux, reg_index_offset);
mov(reg_work_amount, ptr[reg_params + GET_OFF(work_amount)]);
if (attr_.post_ops_.len_ != 0)
if (attr_.post_ops_.len() != 0)
mov(reg_oc_off, ptr[reg_params + GET_OFF(oc_off)]);
L(nn_loop_label);
@ -407,7 +415,7 @@ private:
jl(nn_loop_end_label, T_NEAR);
load_vector(vmm_val, ptr[reg_src_aux], jcp_.src_dt);
if (attr_.post_ops_.len_ != 0)
if (attr_.post_ops_.len() != 0)
apply_post_ops(jcp_.dst_dt, 0);
store_vector(ptr[reg_dst], vmm_val, jcp_.dst_dt);
@ -427,7 +435,7 @@ private:
jl(nn_tail_loop_end_label, T_NEAR);
load_scalar(xmm_val, ptr[reg_src_aux], jcp_.src_dt);
if (attr_.post_ops_.len_ != 0)
if (attr_.post_ops_.len() != 0)
apply_post_ops(jcp_.dst_dt, 0);
store_scalar(ptr[reg_dst], xmm_val, jcp_.dst_dt);
@ -468,7 +476,7 @@ private:
mov(reg_work_amount, ptr[reg_params + GET_OFF(work_amount)]);
int step = vlen / sizeof(float);
int blk = (isa == cpu::sse42) ? (2 * step) : step;
int blk = (isa == cpu::x64::sse41) ? (2 * step) : step;
Xbyak::Label main_loop_label;
Xbyak::Label main_loop_end_label;
@ -493,13 +501,13 @@ private:
linear_onnx_worker();
if (attr_.post_ops_.len_ != 0) {
if (attr_.post_ops_.len() != 0) {
apply_post_ops(jcp_.dst_dt, false); // vmm_val is vmm_valTR
add(reg_oc_off, step * sizeof(float));
}
store_vector(ptr[reg_dst], vmm_valTR, jcp_.dst_dt);
if ((isa == cpu::sse42) && (jcp_.layout == InterpolateLayoutType::block)) {
if ((isa == cpu::x64::sse41) && (jcp_.layout == InterpolateLayoutType::block)) {
int sse42_offset = 4; // vmm is xmm here
load_vector(vmm_valTL, ptr[reg_src + sse42_offset * jcp_.src_data_size], jcp_.src_dt);
load_vector(vmm_valTR, ptr[reg_src_aux + sse42_offset * jcp_.src_data_size], jcp_.src_dt);
@ -508,7 +516,7 @@ private:
linear_onnx_worker();
if (attr_.post_ops_.len_ != 0) {
if (attr_.post_ops_.len() != 0) {
apply_post_ops(jcp_.dst_dt, false);
add(reg_oc_off, step * sizeof(float));
}
@ -552,7 +560,7 @@ private:
linear_onnx_worker();
if (attr_.post_ops_.len_ != 0) {
if (attr_.post_ops_.len() != 0) {
apply_post_ops(jcp_.dst_dt, false); // vmm_val is vmm_valTR
add(reg_oc_off, step * sizeof(float));
}
@ -583,7 +591,7 @@ private:
linear_onnx_worker();
if (attr_.post_ops_.len_ != 0) {
if (attr_.post_ops_.len() != 0) {
apply_post_ops(jcp_.dst_dt, false); // vmm_val is vmm_valTR
add(reg_oc_off, step * sizeof(float));
}
@ -638,14 +646,14 @@ private:
vgatherdps(vmm_valBR, ptr[reg_src + vmm_index], vmm_mask);
// reg_src_aux points to the weights
load_vector(vmm_weightL, ptr[reg_src_aux], memory::f32);
load_vector(vmm_weightR, ptr[reg_src_aux + weight_stride], memory::f32);
load_vector(vmm_weightT, ptr[reg_src_aux + 2 * weight_stride], memory::f32);
load_vector(vmm_weightB, ptr[reg_src_aux + 3 * weight_stride], memory::f32);
load_vector(vmm_weightL, ptr[reg_src_aux], memory::data_type::f32);
load_vector(vmm_weightR, ptr[reg_src_aux + weight_stride], memory::data_type::f32);
load_vector(vmm_weightT, ptr[reg_src_aux + 2 * weight_stride], memory::data_type::f32);
load_vector(vmm_weightB, ptr[reg_src_aux + 3 * weight_stride], memory::data_type::f32);
linear_onnx_worker();
if (attr_.post_ops_.len_ != 0) {
if (attr_.post_ops_.len() != 0) {
apply_post_ops(jcp_.dst_dt, true); // vmm_val is vmm_valTR, broadcast is true
}
store_vector(ptr[reg_dst], vmm_valTR, jcp_.dst_dt);
@ -686,14 +694,14 @@ private:
add(reg_src_aux1, reg_index_offset);
load_scalar(xmm_valBR, ptr[reg_src_aux1], jcp_.src_dt);
load_scalar(xmm_weightL, ptr[reg_src_aux], memory::f32);
load_scalar(xmm_weightR, ptr[reg_src_aux + weight_stride], memory::f32);
load_scalar(xmm_weightT, ptr[reg_src_aux + 2 * weight_stride], memory::f32);
load_scalar(xmm_weightB, ptr[reg_src_aux + 3 * weight_stride], memory::f32);
load_scalar(xmm_weightL, ptr[reg_src_aux], memory::data_type::f32);
load_scalar(xmm_weightR, ptr[reg_src_aux + weight_stride], memory::data_type::f32);
load_scalar(xmm_weightT, ptr[reg_src_aux + 2 * weight_stride], memory::data_type::f32);
load_scalar(xmm_weightB, ptr[reg_src_aux + 3 * weight_stride], memory::data_type::f32);
linear_onnx_worker();
if (attr_.post_ops_.len_ != 0) {
if (attr_.post_ops_.len() != 0) {
apply_post_ops(jcp_.dst_dt, true); // process on vmm_val; vmm_val is vmm_valTR, and broadcast is true
}
store_scalar(ptr[reg_dst], xmm_valTR, jcp_.dst_dt);
@ -740,7 +748,7 @@ private:
uni_vbroadcastss(vmm_weightY3, ptr[reg_src_aux1 + 3 * sizeof(float)]);
int step = vlen / sizeof(float);
int blk = (isa == cpu::sse42) ? (2 * step) : step;
int blk = (isa == cpu::x64::sse41) ? (2 * step) : step;
Xbyak::Label main_loop_label;
Xbyak::Label main_loop_end_label;
@ -760,13 +768,13 @@ private:
cubic_c_gathered_matrix(false);
if (attr_.post_ops_.len_ != 0) {
if (attr_.post_ops_.len() != 0) {
apply_post_ops(jcp_.dst_dt, false); // vmm_val is the default dst value for post_ops and store
add(reg_oc_off, step * sizeof(float));
}
store_vector(ptr[reg_dst], vmm_val, jcp_.dst_dt);
if ((isa == cpu::sse42) && (jcp_.layout == InterpolateLayoutType::block)) {
if ((isa == cpu::x64::sse41) && (jcp_.layout == InterpolateLayoutType::block)) {
int sse42_offset = 4; // vmm is xmm here
add(reg_src, sse42_offset * jcp_.src_data_size);
add(reg_dst, sse42_offset * jcp_.dst_data_size);
@ -775,7 +783,7 @@ private:
cubic_c_gathered_matrix(false);
if (attr_.post_ops_.len_ != 0) {
if (attr_.post_ops_.len() != 0) {
apply_post_ops(jcp_.dst_dt, false);
add(reg_oc_off, step * sizeof(float)); // second step for one blk
}
@ -814,7 +822,7 @@ private:
cubic_c_gathered_matrix(true);
if (attr_.post_ops_.len_ != 0) {
if (attr_.post_ops_.len() != 0) {
apply_post_ops(jcp_.dst_dt, false); // vmm_val is the default dst value
add(reg_oc_off, step * sizeof(float));
}
@ -974,7 +982,7 @@ private:
vgatherdps(vmm_weightY, ptr[reg_weight_y + 3 * sizeof(float) + (vmm_tbl_y * grid_len)], vmm_mask);
cubic_planar_line(false);
if (attr_.post_ops_.len_ != 0) {
if (attr_.post_ops_.len() != 0) {
apply_post_ops(jcp_.dst_dt, true); // oc_off is broadcast and always the same value for this channel
}
store_vector(ptr[reg_dst], vmm_val, jcp_.dst_dt);
@ -997,15 +1005,15 @@ private:
// get idx for input
movss(Xmm(vmm_tbl_y.getIdx()), ptr[reg_tbl_y]);
gather_i32_indices(vmm_index_in_y, reg_index_y, 0, vmm_tbl_y, 1, memory::s32, true);
gather_i32_indices(vmm_index_in_y, reg_index_y, 0, vmm_tbl_y, 1, memory::data_type::s32, true);
movss(Xmm(vmm_val.getIdx()), ptr[reg_tbl_x]);
gather_i32_indices(vmm_index_in_x, reg_index, 0, vmm_val, 1, memory::s32, true);
gather_i32_indices(vmm_index_in_x, reg_index, 0, vmm_val, 1, memory::data_type::s32, true);
// gather weightX by input idx, used in y0-y3
gather_i32_indices(vmm_weightX0, reg_weight_x, 0, vmm_val, grid_len, memory::f32, true);
gather_i32_indices(vmm_weightX1, reg_weight_x, sizeof(float), vmm_val, grid_len, memory::f32, true);
gather_i32_indices(vmm_weightX2, reg_weight_x, 2 * sizeof(float), vmm_val, grid_len, memory::f32, true);
gather_i32_indices(vmm_weightX3, reg_weight_x, 3 * sizeof(float), vmm_val, grid_len, memory::f32, true);
gather_i32_indices(vmm_weightX0, reg_weight_x, 0, vmm_val, grid_len, memory::data_type::f32, true);
gather_i32_indices(vmm_weightX1, reg_weight_x, sizeof(float), vmm_val, grid_len, memory::data_type::f32, true);
gather_i32_indices(vmm_weightX2, reg_weight_x, 2 * sizeof(float), vmm_val, grid_len, memory::data_type::f32, true);
gather_i32_indices(vmm_weightX3, reg_weight_x, 3 * sizeof(float), vmm_val, grid_len, memory::data_type::f32, true);
// vmm_val is now free and reused for dst_value
uni_vpxor(vmm_val, vmm_val, vmm_val);
@ -1015,7 +1023,7 @@ private:
vpminsd(vmm_index_y_itr, vmm_index_y_itr, cubic_planar_table_val(1));
vpmaxsd(vmm_index_y_itr, vmm_index_y_itr, vmm_zero);
gather_i32_indices(vmm_weightY, reg_weight_y, 0, vmm_tbl_y, grid_len, memory::f32, true);
gather_i32_indices(vmm_weightY, reg_weight_y, 0, vmm_tbl_y, grid_len, memory::data_type::f32, true);
cubic_planar_line(true);
// y1
@ -1023,7 +1031,7 @@ private:
vpminsd(vmm_index_y_itr, vmm_index_in_y, cubic_planar_table_val(1));
vpmaxsd(vmm_index_y_itr, vmm_index_y_itr, vmm_zero);
// weight y1: shift weight_size
gather_i32_indices(vmm_weightY, reg_weight_y, sizeof(float), vmm_tbl_y, grid_len, memory::f32, true);
gather_i32_indices(vmm_weightY, reg_weight_y, sizeof(float), vmm_tbl_y, grid_len, memory::data_type::f32, true);
cubic_planar_line(true);
// y2
@ -1032,7 +1040,7 @@ private:
vpminsd(vmm_index_y_itr, vmm_index_y_itr, cubic_planar_table_val(1));
vpmaxsd(vmm_index_y_itr, vmm_index_y_itr, vmm_zero);
// weight y2
gather_i32_indices(vmm_weightY, reg_weight_y, 2 * sizeof(float), vmm_tbl_y, grid_len, memory::f32, true);
gather_i32_indices(vmm_weightY, reg_weight_y, 2 * sizeof(float), vmm_tbl_y, grid_len, memory::data_type::f32, true);
cubic_planar_line(true);
// y3
@ -1042,10 +1050,10 @@ private:
vpminsd(vmm_index_y_itr, vmm_index_y_itr, cubic_planar_table_val(1));
vpmaxsd(vmm_index_y_itr, vmm_index_y_itr, vmm_zero);
// weight y3
gather_i32_indices(vmm_weightY, reg_weight_y, 3 * sizeof(float), vmm_tbl_y, grid_len, memory::f32, true);
gather_i32_indices(vmm_weightY, reg_weight_y, 3 * sizeof(float), vmm_tbl_y, grid_len, memory::data_type::f32, true);
cubic_planar_line(true);
if (attr_.post_ops_.len_ != 0) {
if (attr_.post_ops_.len() != 0) {
apply_post_ops(jcp_.dst_dt, true); // oc_off is broadcast and always the same value for this channel
}
store_scalar(ptr[reg_dst], Xmm(vmm_val.getIdx()), jcp_.dst_dt);
@ -1093,7 +1101,7 @@ private:
vpaddd(vmm_mask, vmm_mask, vmm_one); // (IW - 1) + 1 = IW
uni_vpmulld(vmm_mask, vmm_mask, vmm_index_y_itr);
uni_vpaddd(vmm_index_x_itr, vmm_index_x_itr, vmm_mask);
gather_i32_indices(vmm_src, reg_src, 0, vmm_index_x_itr, jcp_.src_data_size, memory::f32, is_scalar);
gather_i32_indices(vmm_src, reg_src, 0, vmm_index_x_itr, jcp_.src_data_size, memory::data_type::f32, is_scalar);
if (itr == 0) {
uni_vfmadd231ps(vmm_dstX, vmm_src, vmm_weightX0);
@ -1134,19 +1142,19 @@ private:
inline void gather_i32_indices(Vmm vmm_src, const Xbyak::Reg64 &base, int offset, Vmm vmm_indices, int scale,
memory::data_type src_dt, bool is_scalar) {
Xbyak::Address table_idx = ptr[base + offset + vmm_indices * scale];
if ((isa == cpu::avx512_common) && !is_scalar) {
if ((isa == cpu::x64::avx512_common) && !is_scalar) {
// low 16 bits of the int become the mask
kmovw(k_mask, cubic_planar_table_val(3));
if (src_dt == memory::f32) {
if (src_dt == memory::data_type::f32) {
vgatherdps(vmm_src | k_mask, table_idx); // dword index, packed single data
} else if (src_dt == memory::s32) {
} else if (src_dt == memory::data_type::s32) {
vpgatherdd(vmm_src | k_mask, table_idx); // dword index, dword data
}
} else if ((isa == cpu::avx2) && !is_scalar) {
} else if ((isa == cpu::x64::avx2) && !is_scalar) {
uni_vpcmpeqd(vmm_mask, vmm_mask, vmm_mask);
if (src_dt == memory::f32) {
if (src_dt == memory::data_type::f32) {
vgatherdps(vmm_src, table_idx, vmm_mask);
} else if (src_dt == memory::s32) {
} else if (src_dt == memory::data_type::s32) {
vpgatherdd(vmm_src, table_idx, vmm_mask);
}
} else {
@ -1177,17 +1185,17 @@ private:
inline void load_vector(Vmm vmm_src, const Xbyak::Address &op, memory::data_type src_dt) {
switch (src_dt) {
case memory::f32:
case memory::s32:
case memory::data_type::f32:
case memory::data_type::s32:
uni_vmovups(vmm_src, op);
break;
case memory::s8:
case memory::data_type::s8:
uni_vpmovsxbd(vmm_src, op);
break;
case memory::u8:
case memory::data_type::u8:
uni_vpmovzxbd(vmm_src, op);
break;
case memory::bf16:
case memory::data_type::bf16:
uni_vpmovzxwd(vmm_src, op);
uni_vpslld(vmm_src, vmm_src, 16);
break;
@ -1195,23 +1203,23 @@ private:
assert(!"unknown dst_dt");
}
if (src_dt != memory::f32 && src_dt != data_type::bf16)
if (src_dt != memory::data_type::f32 && src_dt != memory::data_type::bf16)
uni_vcvtdq2ps(vmm_src, vmm_src);
}
inline void load_xmm(Xmm xmm_src, const Xbyak::Address &op, memory::data_type src_dt) {
switch (src_dt) {
case memory::f32:
case memory::s32:
case memory::data_type::f32:
case memory::data_type::s32:
uni_vmovups(xmm_src, op);
break;
case memory::s8:
case memory::data_type::s8:
uni_vpmovsxbd(xmm_src, op);
break;
case memory::u8:
case memory::data_type::u8:
uni_vpmovzxbd(xmm_src, op);
break;
case memory::bf16:
case memory::data_type::bf16:
uni_vpmovzxwd(xmm_src, op);
uni_vpslld(xmm_src, xmm_src, 16);
break;
@ -1219,25 +1227,25 @@ private:
assert(!"unknown dst_dt");
}
if (src_dt != memory::f32 && src_dt != data_type::bf16)
if (src_dt != memory::data_type::f32 && src_dt != memory::data_type::bf16)
uni_vcvtdq2ps(xmm_src, xmm_src);
}
inline void load_scalar(Xmm xmm_src, const Xbyak::Address &op, memory::data_type src_dt) {
switch (src_dt) {
case memory::f32:
case memory::s32:
case memory::data_type::f32:
case memory::data_type::s32:
movss(xmm_src, op);
break;
case memory::s8:
case memory::data_type::s8:
movsx(reg_tmp_32, op);
movq(xmm_src, reg_tmp_64);
break;
case memory::u8:
case memory::data_type::u8:
movzx(reg_tmp_32, op);
movq(xmm_src, reg_tmp_64);
break;
case memory::bf16:
case memory::data_type::bf16:
pinsrw(xmm_src, op, 0x0);
uni_vpslld(xmm_src, xmm_src, 16);
break;
@ -1254,38 +1262,38 @@ private:
Ymm ymm_dst = Ymm(vmm_dst.getIdx());
Xmm xmm_dst = Xmm(vmm_dst.getIdx());
if (dst_dt == memory::f32) {
if (dst_dt == memory::data_type::f32) {
uni_vmovups(op, vmm_dst);
} else if (dst_dt == memory::u8) {
} else if (dst_dt == memory::data_type::u8) {
uni_vcvtps2dq(vmm_dst, vmm_dst);
if (isa == cpu::avx512_common) {
if (isa == cpu::x64::avx512_common) {
vpmaxsd(vmm_dst, vmm_dst, vmm_zero);
vpmovusdb(op, vmm_dst);
} else {
uni_vpackusdw(vmm_dst, vmm_dst, vmm_dst);
if (isa != cpu::sse42)
if (isa != cpu::x64::sse41)
vpermq(ymm_dst, ymm_dst, 0x08);
uni_vpackuswb(vmm_dst, vmm_dst, vmm_dst);
if (isa != cpu::sse42)
if (isa != cpu::x64::sse41)
vmovq(op, xmm_dst);
else
movd(op, xmm_dst);
}
} else if (dst_dt == memory::s8) {
} else if (dst_dt == memory::data_type::s8) {
uni_vcvtps2dq(vmm_dst, vmm_dst);
if (isa == cpu::avx512_common) {
if (isa == cpu::x64::avx512_common) {
vpmovsdb(op, vmm_dst);
} else {
uni_vpackssdw(vmm_dst, vmm_dst, vmm_dst);
if (isa != cpu::sse42)
if (isa != cpu::x64::sse41)
vpermq(ymm_dst, ymm_dst, 0x08);
uni_vpacksswb(vmm_dst, vmm_dst, vmm_dst);
if (isa != cpu::sse42)
if (isa != cpu::x64::sse41)
vmovq(op, xmm_dst);
else
movd(op, xmm_dst);
}
} else if (dst_dt == memory::bf16) {
} else if (dst_dt == memory::data_type::bf16) {
if (mayiuse(avx512_core_bf16))
vcvtneps2bf16(ymm_dst, vmm_dst);
else
@ -1295,26 +1303,26 @@ private:
}
inline void store_xmm(const Xbyak::Address &op, Xmm xmm_dst, memory::data_type dst_dt) {
if (dst_dt != memory::f32 && dst_dt != memory::bf16) {
if (dst_dt != memory::data_type::f32 && dst_dt != memory::data_type::bf16) {
uni_vcvtps2dq(xmm_dst, xmm_dst);
}
switch (dst_dt) {
case memory::f32:
case memory::s32:
case memory::data_type::f32:
case memory::data_type::s32:
uni_vmovups(op, xmm_dst);
break;
case memory::s8:
case memory::data_type::s8:
uni_vpackssdw(xmm_dst, xmm_dst, xmm_dst);
uni_vpacksswb(xmm_dst, xmm_dst, xmm_dst);
movd(op, xmm_dst);
break;
case memory::u8:
case memory::data_type::u8:
uni_vpackusdw(xmm_dst, xmm_dst, xmm_dst);
uni_vpackuswb(xmm_dst, xmm_dst, xmm_dst);
movd(op, xmm_dst);
break;
case memory::bf16:
case memory::data_type::bf16:
pshuflw(xmm_dst, xmm_dst, 0x0d); // 01 01 01 01 --> 01 01 11 00 imm=0b00001101
pshufhw(xmm_dst, xmm_dst, 0x0d); // 01 01 11 00 --> 11 00 11 00
pshufd(xmm_dst, xmm_dst, 0x08); // 11 00 11 00 --> 11 11 00 00 imm=0b00001000
@ -1331,23 +1339,23 @@ private:
}
switch (dst_dt) {
case memory::f32:
case memory::s32:
case memory::data_type::f32:
case memory::data_type::s32:
movss(op, xmm_dst);
break;
case memory::s8:
case memory::data_type::s8:
uni_vpackssdw(xmm_dst, xmm_dst, xmm_dst);
uni_vpacksswb(xmm_dst, xmm_dst, xmm_dst);
movq(reg_tmp_64, xmm_dst);
mov(op, reg_tmp_8);
break;
case memory::u8:
case memory::data_type::u8:
uni_vpackusdw(xmm_dst, xmm_dst, xmm_dst);
uni_vpackuswb(xmm_dst, xmm_dst, xmm_dst);
movq(reg_tmp_64, xmm_dst);
mov(op, reg_tmp_8);
break;
case memory::bf16:
case memory::data_type::bf16:
uni_vpsrld(xmm_dst, xmm_dst, 16);
pextrw(op, xmm_dst, 0x0);
break;
@ -1362,7 +1370,7 @@ private:
int eltwise_inj_idx = 0;
int depthwise_inj_idx = 0;
int quantization_inj_idx = 0;
for (int i = 0; i < p.len_; i++) {
for (int i = 0; i < p.len(); i++) {
auto& post_op = p.entry_[i];
if (post_op.is_eltwise()) {
eltwise_injectors[eltwise_inj_idx]->compute_vector_range(vmm_val.getIdx(), vmm_val.getIdx() + 1);
@ -1377,7 +1385,7 @@ private:
depthwise_inj_idx++;
} else if (post_op.is_quantization()) {
bool do_dequantization = post_op.quantization.alg == alg_kind::quantization_quantize_dequantize;
bool do_rounding = do_dequantization || dst_dt == memory::f32 || i != p.len_ - 1;
bool do_rounding = do_dequantization || dst_dt == memory::data_type::f32 || i != p.len() - 1;
int s_idx = vmm_val.getIdx();
@ -1636,49 +1644,56 @@ void MKLDNNInterpolateNode::initSupportedPrimitiveDescriptors() {
auto scalesType = MKLDNNExtensionUtils::IEPrecisionToDataType(Precision::FP32);
auto axesType = MKLDNNExtensionUtils::IEPrecisionToDataType(Precision::I32);
auto pushDesc = [&](memory::format dataFormat, impl_desc_type implDetail) {
auto pushDesc = [&](memory::format_tag dataFormat, impl_desc_type implDetail) {
config.inConfs[DATA_ID].desc = MKLDNNMemoryDesc(getParentEdgeAt(DATA_ID)->getDims(), inputDataType, dataFormat);
config.inConfs[TARGET_SHAPE_ID].desc = MKLDNNMemoryDesc(getParentEdgeAt(TARGET_SHAPE_ID)->getDims(), targetShapeType, memory::x);
config.inConfs[SCALES_ID].desc = MKLDNNMemoryDesc(getParentEdgeAt(SCALES_ID)->getDims(), scalesType, memory::x);
config.inConfs[TARGET_SHAPE_ID].desc = MKLDNNMemoryDesc(getParentEdgeAt(TARGET_SHAPE_ID)->getDims(), targetShapeType, memory::format_tag::x);
config.inConfs[SCALES_ID].desc = MKLDNNMemoryDesc(getParentEdgeAt(SCALES_ID)->getDims(), scalesType, memory::format_tag::x);
if (isAxesSpecified)
config.inConfs[AXES_ID].desc = MKLDNNMemoryDesc(getParentEdgeAt(AXES_ID)->getDims(), axesType, memory::x);
config.inConfs[AXES_ID].desc = MKLDNNMemoryDesc(getParentEdgeAt(AXES_ID)->getDims(), axesType, memory::format_tag::x);
config.outConfs[0].desc = MKLDNNMemoryDesc(getChildEdgeAt(0)->getDims(), outputDataType, dataFormat);
supportedPrimitiveDescriptors.push_back({config, implDetail, dataFormat});
};
auto channels = getParentEdgeAt(DATA_ID)->getDims().ndims() > 1 ? getParentEdgeAt(DATA_ID)->getDims()[1] : 1;
if (mode != InterpolateMode::linear) {
// blocked and by_channel JIT kernels on machines with SSE4.1 or above
if (mayiuse(cpu::sse42)) {
if (mayiuse(cpu::x64::sse41)) {
if (getParentEdgeAt(DATA_ID)->getDims().ndims() == 4) {
if (mayiuse(cpu::avx512_common)) {
pushDesc(memory::nhwc, jit_avx512);
pushDesc(memory::nChw16c, jit_avx512);
} else if (mayiuse(cpu::avx2)) {
pushDesc(memory::nhwc, jit_avx2);
pushDesc(memory::nChw8c, jit_avx2);
if (mayiuse(cpu::x64::avx512_common)) {
pushDesc(memory::format_tag::nhwc, jit_avx512);
if (channels != 1)
pushDesc(memory::format_tag::nChw16c, jit_avx512);
} else if (mayiuse(cpu::x64::avx2)) {
pushDesc(memory::format_tag::nhwc, jit_avx2);
if (channels != 1)
pushDesc(memory::format_tag::nChw8c, jit_avx2);
} else {
pushDesc(memory::nhwc, jit_sse42);
pushDesc(memory::nChw8c, jit_sse42);
pushDesc(memory::format_tag::nhwc, jit_sse42);
if (channels != 1)
pushDesc(memory::format_tag::nChw8c, jit_sse42);
}
} else if (getParentEdgeAt(DATA_ID)->getDims().ndims() == 5 && mode == InterpolateMode::nearest) {
if (mayiuse(cpu::avx512_common)) {
pushDesc(memory::ndhwc, jit_avx512);
pushDesc(memory::nCdhw16c, jit_avx512);
} else if (mayiuse(cpu::avx2)) {
pushDesc(memory::ndhwc, jit_avx2);
pushDesc(memory::nCdhw8c, jit_avx2);
if (mayiuse(cpu::x64::avx512_common)) {
pushDesc(memory::format_tag::ndhwc, jit_avx512);
if (channels != 1)
pushDesc(memory::format_tag::nCdhw16c, jit_avx512);
} else if (mayiuse(cpu::x64::avx2)) {
pushDesc(memory::format_tag::ndhwc, jit_avx2);
if (channels != 1)
pushDesc(memory::format_tag::nCdhw8c, jit_avx2);
} else {
pushDesc(memory::ndhwc, jit_sse42);
pushDesc(memory::nCdhw8c, jit_sse42);
pushDesc(memory::format_tag::ndhwc, jit_sse42);
if (channels != 1)
pushDesc(memory::format_tag::nCdhw8c, jit_sse42);
}
}
}
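// The channels != 1 guards above skip the blocked nChw[8|16]c and
// nCdhw[8|16]c variants for single-channel tensors, where blocking would
// only pad the channel dimension and the nhwc/ndhwc descriptor already
// covers the case.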
// planar layout: 1. reference implementation on machines without SSE4.1 (in that case canFuse() returns false); 2. JIT kernel (gather) for f32 on AVX2, with fusing
if (!mayiuse(cpu::sse42))
if (!mayiuse(cpu::x64::sse41))
pushDesc(MKLDNNMemory::GetPlainFormat(getParentEdgeAt(DATA_ID)->getDims()), ref);
if (mayiuse(cpu::avx2) && inputPrec == Precision::FP32) {
if (mayiuse(cpu::x64::avx2) && inputPrec == Precision::FP32) {
pushDesc(MKLDNNMemory::GetPlainFormat(getParentEdgeAt(DATA_ID)->getDims()), jit_avx2);
}
} else {
@ -1708,7 +1723,6 @@ void MKLDNNInterpolateNode::createPrimitive() {
THROW_IE_EXCEPTION << "Interpolate layer with name '" << getName() << "' did not set preferable primitive descriptor";
auto selectedPD = getSelectedPrimitiveDescriptor();
Layout selected_layout = selectedPD->getConfig().inConfs[0].desc.getLayout();
auto jcp = jit_interpolate_config_params();
jcp.mode = mode;
jcp.src_dt = MKLDNNExtensionUtils::IEPrecisionToDataType(selectedPD->getConfig().inConfs[0].desc.getPrecision());
@ -1722,29 +1736,33 @@ void MKLDNNInterpolateNode::createPrimitive() {
jcp.IW = srcDimPad[dimSize - 1];
jcp.IH = srcDimPad[dimSize - 2];
if (MKLDNNMemory::GetPlainLayout(getChildEdgeAt(0)->getDims()) == selected_layout) {
if (getChildEdgeAt(0)->getMemory().GetDesc().isPlainFormat()) {
jcp.layout = InterpolateLayoutType::planar;
} else if ((selected_layout == NHWC) || (selected_layout == NDHWC)) {
jcp.layout = InterpolateLayoutType::by_channel;
} else {
} else if (getChildEdgeAt(0)->getMemory().GetDesc().isBlockedCFormat()) {
jcp.layout = InterpolateLayoutType::block;
} else {
jcp.layout = InterpolateLayoutType::by_channel;
}
configured_for_layout = jcp.layout;
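// The effective layout is now resolved once here from the selected memory
// descriptor (isPlainFormat()/isBlockedCFormat()) and cached in
// configured_for_layout, so execute() no longer re-derives it from the
// InferenceEngine layout enum on every call.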
if (mode == InterpolateMode::nearest || mode == InterpolateMode::linear_onnx || mode == InterpolateMode::cubic) {
if (jcp.layout != InterpolateLayoutType::planar) {
if (mayiuse(cpu::avx512_common)) {
interpolateKernel.reset(new jit_uni_interpolate_kernel_f32<cpu::avx512_common>(jcp, *attr.get()));
} else if (mayiuse(cpu::avx2)) {
interpolateKernel.reset(new jit_uni_interpolate_kernel_f32<cpu::avx2>(jcp, *attr.get()));
} else if (mayiuse(cpu::sse42)) {
interpolateKernel.reset(new jit_uni_interpolate_kernel_f32<cpu::sse42>(jcp, *attr.get()));
if (mayiuse(cpu::x64::avx512_common)) {
interpolateKernel.reset(new jit_uni_interpolate_kernel_f32<cpu::x64::avx512_common>(jcp, *attr.get()));
} else if (mayiuse(cpu::x64::avx2)) {
interpolateKernel.reset(new jit_uni_interpolate_kernel_f32<cpu::x64::avx2>(jcp, *attr.get()));
} else if (mayiuse(cpu::x64::sse41)) {
interpolateKernel.reset(new jit_uni_interpolate_kernel_f32<cpu::x64::sse41>(jcp, *attr.get()));
}
} else {
// gather ISA (for the planar JIT kernel) on AVX2 with fp32
if (mayiuse(cpu::avx2) && inputPrec == Precision::FP32) {
interpolateKernel.reset(new jit_uni_interpolate_kernel_f32<cpu::avx2>(jcp, *attr.get()));
if (mayiuse(cpu::x64::avx2) && inputPrec == Precision::FP32) {
interpolateKernel.reset(new jit_uni_interpolate_kernel_f32<cpu::x64::avx2>(jcp, *attr.get()));
}
}
if (interpolateKernel)
interpolateKernel->create_ker();
}
// build indices table
@ -2133,10 +2151,8 @@ void MKLDNNInterpolateNode::execute(mkldnn::stream strm) {
auto &dstMemPtr = getChildEdgeAt(0)->getMemoryPtr();
auto &srcMemPtr = getParentEdgeAt(DATA_ID)->getMemoryPtr();
uint8_t *dst_data = reinterpret_cast<uint8_t*>(dstMemPtr->GetData()) +
dstMemPtr->GetDescriptor().data.layout_desc.blocking.offset_padding * dstDataSize;
uint8_t *src_data_origin = reinterpret_cast<uint8_t*>(srcMemPtr->GetData()) +
srcMemPtr->GetDescriptor().data.layout_desc.blocking.offset_padding * srcDataSize;
uint8_t *dst_data = reinterpret_cast<uint8_t*>(dstMemPtr->GetPtr());
uint8_t *src_data_origin = reinterpret_cast<uint8_t*>(srcMemPtr->GetPtr());
size_t dimSize = srcDim.size();
SizeVector srcDimPad = getPaddedInputShape();
@ -2145,16 +2161,6 @@ void MKLDNNInterpolateNode::execute(mkldnn::stream strm) {
auto srcDimPad5d = to5Dim(srcDimPad);
auto dstDim5d = to5Dim(dstDim);
InterpolateLayoutType layout;
Layout selected_layout = getParentEdgeAt(DATA_ID)->getDesc().getLayout();
if (MKLDNNMemory::GetPlainLayout(getChildEdgeAt(0)->getDims()) == selected_layout) {
layout = InterpolateLayoutType::planar;
} else if ((selected_layout == NHWC) || (selected_layout == NDHWC)) {
layout = InterpolateLayoutType::by_channel;
} else {
layout = InterpolateLayoutType::block;
}
uint8_t *src_data = nullptr;
std::vector<uint8_t> srcPadded;
if (hasPad) {
@ -2167,7 +2173,7 @@ void MKLDNNInterpolateNode::execute(mkldnn::stream strm) {
SizeVector inShapeBlock = getBlockND(srcDim5d);
SizeVector inShapePadBlock = getBlockND(srcDimPad5d);
if (layout == InterpolateLayoutType::planar) {
if (configured_for_layout == InterpolateLayoutType::planar) {
srcPadded.resize(inShapePadBlock[0] * srcDataSize, 0);
uint8_t *src_data_pad = static_cast<uint8_t *>(&srcPadded[0]);
parallel_for4d(srcDim5d[0], srcDim5d[1], srcDim5d[2], srcDim5d[3], [&](int n, int c, int d, int h) {
@ -2177,7 +2183,7 @@ void MKLDNNInterpolateNode::execute(mkldnn::stream strm) {
cpu_memcpy(srcPad, src, srcDim5d[4] * srcDataSize);
});
src_data = src_data_pad;
} else if (layout == InterpolateLayoutType::by_channel) {
} else if (configured_for_layout == InterpolateLayoutType::by_channel) {
srcPadded.resize(inShapePadBlock[0] * srcDataSize, 0);
uint8_t *src_data_pad = static_cast<uint8_t *>(&srcPadded[0]);
parallel_for4d(srcDim5d[0], srcDim5d[2], srcDim5d[3], srcDim5d[4], [&](int n, int d, int h, int w) {
@ -2188,8 +2194,8 @@ void MKLDNNInterpolateNode::execute(mkldnn::stream strm) {
cpu_memcpy(srcPad, src, srcDim5d[1] * srcDataSize);
});
src_data = src_data_pad;
} else if (layout == InterpolateLayoutType::block) {
size_t blkSize = mayiuse(cpu::avx512_common) ? 16 : 8;
} else if (configured_for_layout == InterpolateLayoutType::block) {
size_t blkSize = mayiuse(cpu::x64::avx512_common) ? 16 : 8;
size_t CB = div_up(srcDimPad5d[1], blkSize);
size_t eltsTotal = srcDimPad5d[0] * CB * srcDimPad5d[2] * srcDimPad5d[3] * srcDimPad5d[4] * blkSize;
srcPadded.resize(eltsTotal * srcDataSize, 0x0);
@ -2227,7 +2233,7 @@ void MKLDNNInterpolateNode::execute(mkldnn::stream strm) {
switch (mode) {
case InterpolateMode::nearest: {
if (interpolateKernel) {
if (layout == InterpolateLayoutType::planar) {
if (configured_for_layout == InterpolateLayoutType::planar) {
NNPlanar(src_data, dst_data, N, C, ID, IH, IW, OD, OH, OW);
} else {
NNCGathered(src_data, dst_data, N, C, ID, IH, IW, OD, OH, OW);
@ -2239,7 +2245,7 @@ void MKLDNNInterpolateNode::execute(mkldnn::stream strm) {
}
case InterpolateMode::linear_onnx: {
if (interpolateKernel) {
if (layout == InterpolateLayoutType::planar) {
if (configured_for_layout == InterpolateLayoutType::planar) {
linearOnnxPlanar(src_data, dst_data, N, C, IH, IW, OH, OW);
} else {
linearOnnxCGathered(src_data, dst_data, N, C, IH, IW, OH, OW);
@ -2251,7 +2257,7 @@ void MKLDNNInterpolateNode::execute(mkldnn::stream strm) {
}
case InterpolateMode::cubic: {
if (interpolateKernel) {
if (layout == InterpolateLayoutType::planar) {
if (configured_for_layout == InterpolateLayoutType::planar) {
cubicPlanar(src_data, dst_data, N, C, IH, IW, OH, OW);
} else {
cubicCGathered(src_data, dst_data, N, C, IH, IW, OH, OW);
@ -2284,8 +2290,7 @@ void MKLDNNInterpolateNode::NNCGathered(const uint8_t *in_ptr_, uint8_t *out_ptr
int *index_h = static_cast<int*>(&indexTable[OD]);
int *index_w = static_cast<int*>(&indexTable[OD + OH]);
Layout layout = getParentEdgeAt(0)->getDesc().getLayout();
bool is_nhwc = (layout == NHWC || layout == NDHWC) ? true : false;
bool is_nhwc = (configured_for_layout == by_channel);
for (int b = 0; b < B; b++) {
if (is_nhwc) {
@ -2308,7 +2313,7 @@ void MKLDNNInterpolateNode::NNCGathered(const uint8_t *in_ptr_, uint8_t *out_ptr
(*interpolateKernel)(&arg);
});
} else { // for blk
int blk_size = mayiuse(cpu::avx512_common) ? 16 : 8;
int blk_size = mayiuse(cpu::x64::avx512_common) ? 16 : 8;
int CB = div_up(C, blk_size);
const uint8_t *in_ptr = in_ptr_ + (IW * IH * ID * CB * blk_size * b) * srcDataSize;
uint8_t *out_ptr = out_ptr_ + (OW * OH * OD * CB * blk_size * b) * dstDataSize;
@ -2414,10 +2419,9 @@ void MKLDNNInterpolateNode::linearOnnxCGathered(const uint8_t *in_ptr_, uint8_t
float *weightTop = reinterpret_cast<float*>(&indexTable[scratchLen + 2 * OW]);
float *weightBottom = reinterpret_cast<float*>(&indexTable[scratchLen + 2 * OW + OH]);
Layout layout = getParentEdgeAt(0)->getDesc().getLayout();
bool isByChannel = (layout == NHWC) ? true : false;
bool isByChannel = (configured_for_layout == by_channel);
int blkSize = mayiuse(cpu::avx512_common) ? 16 : 8;
int blkSize = mayiuse(cpu::x64::avx512_common) ? 16 : 8;
int CB = div_up(C, blkSize);
int CSize = isByChannel ? C : blkSize * CB;
int CGatherLen = isByChannel ? C : blkSize;
@ -2600,14 +2604,11 @@ void MKLDNNInterpolateNode::cubicCGathered(const uint8_t *in_ptr_, uint8_t *out_
int *yOrigin = static_cast<int*>(&indexTable[(CUBIC_GRID_LEN + idxNum) * OW]);
float *yFactor = reinterpret_cast<float*>(&indexTable[(CUBIC_GRID_LEN + idxNum) * OW + OH]);
Layout layout = getParentEdgeAt(0)->getDesc().getLayout();
bool isByChannel = (layout == NHWC) ? true : false;
int blkSize = mayiuse(cpu::avx512_common) ? 16 : 8;
int blkSize = mayiuse(cpu::x64::avx512_common) ? 16 : 8;
int CB = div_up(C, blkSize);
int CSize = isByChannel ? C : blkSize * CB;
int CGatherLen = isByChannel ? C : blkSize;
int workAmount = isByChannel ? C : CB;
int CSize = configured_for_layout == InterpolateLayoutType::by_channel ? C : blkSize * CB;
int CGatherLen = configured_for_layout == InterpolateLayoutType::by_channel ? C : blkSize;
int workAmount = configured_for_layout == InterpolateLayoutType::by_channel ? C : CB;
parallel_for3d(B, OH, OW, [&](size_t b, size_t h, size_t w) {
uint8_t *out_ptr_nhw = out_ptr_ + (OH * OW * CSize * b + OW * CGatherLen * h + CGatherLen * w) * dstDataSize;
@ -2848,7 +2849,7 @@ bool MKLDNNInterpolateNode::canFuse(const MKLDNNNodePtr& node) const {
return false;
};
if (!mayiuse(cpu::sse42) || mode == InterpolateMode::linear) {
if (!mayiuse(cpu::x64::sse41) || mode == InterpolateMode::linear) {
return false;
}

View File

@ -75,6 +75,8 @@ struct jit_uni_interpolate_kernel {
explicit jit_uni_interpolate_kernel(jit_interpolate_config_params jcp, const mkldnn_primitive_attr &attr) : ker_(nullptr), jcp_(jcp), attr_(attr) {}
virtual ~jit_uni_interpolate_kernel() {}
virtual void create_ker() = 0;
jit_interpolate_config_params jcp_;
const mkldnn_primitive_attr &attr_;
};
@ -163,6 +165,8 @@ private:
InferenceEngine::Precision inputPrec, outputPrec;
size_t srcDataSize, dstDataSize;
InterpolateLayoutType configured_for_layout;
std::vector<int> indexTable;
std::shared_ptr<jit_uni_interpolate_kernel> interpolateKernel;

View File

@ -3,7 +3,7 @@
//
#include "mkldnn_lrn_node.h"
#include "desc_iterator.hpp"
#include <legacy/ie_layers.h>
#include <string>
#include <mkldnn_extension_utils.h>
@ -52,8 +52,11 @@ void MKLDNNLrnNode::createPrimitive() {
auto prim_desc = createPrimitiveDescriptor<lrn_forward::primitive_desc, lrn_forward::desc>();
prim.reset(new lrn_forward(prim_desc, getParentEdgeAt(0)->getMemory().GetPrimitive(),
getChildEdgeAt(0)->getMemory().GetPrimitive()));
prim.reset(new lrn_forward(prim_desc));
auto src = getParentEdgesAtPort(0)[0]->getMemoryPtr()->GetPrimitive();
auto dst = getChildEdgesAtPort(0)[0]->getMemoryPtr()->GetPrimitive();
primArgs = {{DNNL_ARG_SRC, src}, {DNNL_ARG_DST, dst}};
}
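
// In oneDNN v1.x a primitive is built from its descriptor alone, and memory
// objects are passed at execution time through an argument map instead of
// being bound in the constructor as in v0.x. Minimal standalone sketch of the
// new execution model (not the node's exact code):
#include <mkldnn.hpp>

void run_lrn(const mkldnn::lrn_forward::primitive_desc &pd, mkldnn::stream &strm,
             const mkldnn::memory &src, const mkldnn::memory &dst) {
    mkldnn::lrn_forward prim(pd);                    // no memories at construction
    std::unordered_map<int, mkldnn::memory> args = {
        {DNNL_ARG_SRC, src},                         // bound per execution
        {DNNL_ARG_DST, dst}};
    prim.execute(strm, args);
}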
bool MKLDNNLrnNode::created() const {
@ -86,7 +89,7 @@ void MKLDNNLrnNode::initOptimalPrimitiveDescriptor() {
void MKLDNNLrnNode::createDescriptor(const std::vector<InferenceEngine::TensorDesc> &inputDesc,
const std::vector<InferenceEngine::TensorDesc> &outputDesc) {
algorithm alg = (isAcrossMaps) ? lrn_across_channels : lrn_within_channel;
algorithm alg = (isAcrossMaps) ? algorithm::lrn_across_channels : algorithm::lrn_within_channel;
MKLDNNMemoryDesc in_candidate(inputDesc[0]);
MKLDNNDescriptor desc(std::shared_ptr<lrn_forward::desc>(
new lrn_forward::desc(prop_kind::forward_scoring, alg, in_candidate, size, alpha, beta, k)));

View File

@ -39,7 +39,7 @@ void MKLDNNMemoryOutputNode::initSupportedPrimitiveDescriptors() {
config.inConfs[0].inPlace = -1;
config.inConfs[0].constant = false;
config.inConfs[0].desc = MKLDNNMemoryDesc(getParentEdgeAt(0)->getDims(), inputDataType, MKLDNNMemory::GetPlainFormat(getParentEdgeAt(0)->getDims()));
supportedPrimitiveDescriptors.emplace_back(config, impl_desc_type::unknown, memory::format::any);
supportedPrimitiveDescriptors.emplace_back(config, impl_desc_type::unknown, memory::format_tag::any);
}
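
// memory::format was split in oneDNN v1.x: memory::format_tag names a layout
// tag, while blocked layouts are described structurally in the descriptor.
// format_tag::any still lets the primitive pick the optimal layout, e.g.:
mkldnn::memory::desc md({1, 64, 56, 56},               // NCHW logical dims
                        mkldnn::memory::data_type::f32,
                        mkldnn::memory::format_tag::any);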
void MKLDNNMemoryOutputNode::execute(mkldnn::stream strm) {
@ -76,14 +76,8 @@ void MKLDNNMemoryInputNode::createPrimitive() {
*/
inline
static void simple_copy(MKLDNNMemory& dst, const MKLDNNMemory& src) {
auto getDataWithOff = [] (const MKLDNNMemory& mem) {
auto elemSize = MKLDNNExtensionUtils::sizeOfDataType(mem.GetDataType());
return static_cast<uint8_t*>(mem.GetData()) +
mem.GetDescriptor().data.layout_desc.blocking.offset_padding * elemSize;
};
auto srcPtr = getDataWithOff(src);
auto dstPtr = getDataWithOff(dst);
auto srcPtr = static_cast<uint8_t*>(src.GetPtr());
auto dstPtr = static_cast<uint8_t*>(dst.GetPtr());
auto srcSizeInByte = src.GetSize();
auto dstSizeInByte = dst.GetSize();
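
// GetPtr() folds the padded-offset arithmetic that simple_copy() previously
// did by hand into the memory wrapper itself. A sketch of what such a helper
// computes in oneDNN v1.x terms (an assumption about MKLDNNMemory internals,
// using the offset0 field of dnnl_memory_desc_t):
static uint8_t* get_ptr(const mkldnn::memory &mem) {
    auto md = mem.get_desc().data;      // dnnl_memory_desc_t
    auto elemSize = MKLDNNExtensionUtils::sizeOfDataType(
        static_cast<mkldnn::memory::data_type>(md.data_type));
    return static_cast<uint8_t*>(mem.get_data_handle()) + md.offset0 * elemSize;
}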

View File

@ -3,7 +3,7 @@
//
#include "mkldnn_mvn_node.h"
#include "desc_iterator.hpp"
#include "mkldnn_quantize_node.h"
#include <legacy/ie_layers.h>
#include "mkldnn_eltwise_node.h"
@ -17,16 +17,17 @@
#include "ie_parallel.hpp"
#include <algorithm>
#include "jit_generator.hpp"
#include "jit_uni_eltwise.hpp"
#include "jit_uni_depthwise.hpp"
#include "jit_uni_quantization.hpp"
#include <cpu/x64/jit_generator.hpp>
#include <cpu/x64/jit_uni_eltwise.hpp>
#include <cpu/x64/jit_uni_depthwise_injector.hpp>
#include <cpu/x64/jit_uni_quantization_injector.hpp>
#include <cpu/x64/jit_uni_eltwise_injector.hpp>
using namespace mkldnn;
using namespace MKLDNNPlugin;
using namespace InferenceEngine;
using namespace mkldnn::impl;
using namespace mkldnn::impl::cpu;
using namespace mkldnn::impl::cpu::x64;
using namespace mkldnn::impl::utils;
using namespace Xbyak;
@ -38,7 +39,7 @@ static inline bool isFloatCompatible(Precision prc) {
}
static inline bool isFloatCompatible(memory::data_type type) {
return memory::f32 == type || memory::bf16 == type;
return memory::data_type::f32 == type || memory::data_type::bf16 == type;
}
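
// memory::f32, memory::s8, etc. became scoped enumerators in v1.x
// (memory::data_type::f32), so comparison and switch sites throughout the
// file change mechanically:
bool is_float_compatible(mkldnn::memory::data_type t) {
    using dt = mkldnn::memory::data_type;  // scoped enum in oneDNN v1.x
    return t == dt::f32 || t == dt::bf16;
}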
// normalize_variance = false : src->mean
@ -47,7 +48,14 @@ template <cpu_isa_t isa>
struct jit_uni_mvn_mean_variance_kernel_f32 : public jit_uni_mvn_mean_variance_kernel, public jit_generator {
DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_uni_mvn_mean_kernel_f32)
explicit jit_uni_mvn_mean_variance_kernel_f32(jit_mvn_config_params jcp) : jit_uni_mvn_mean_variance_kernel(jcp), jit_generator() {
explicit jit_uni_mvn_mean_variance_kernel_f32(jit_mvn_config_params jcp) : jit_uni_mvn_mean_variance_kernel(jcp), jit_generator() {}
void create_ker() override {
jit_generator::create_kernel();
ker_ = (decltype(ker_))jit_ker();
}
void generate() override {
this->preamble();
mov(reg_src, ptr[reg_params + GET_OFF(src)]);
if (jcp_.normalize_variance) {
@ -59,7 +67,7 @@ struct jit_uni_mvn_mean_variance_kernel_f32 : public jit_uni_mvn_mean_variance_k
mov(reg_work_amount, ptr[reg_params + GET_OFF(work_amount)]);
mov(reg_stride, ptr[reg_params + GET_OFF(src_stride)]);
int repeats = (!jcp_.planar_layout && !jcp_.across_channels && isa == cpu::sse42) ? 2 : 1; // block size is also 8 on cpu::sse42
int repeats = (!jcp_.planar_layout && !jcp_.across_channels && isa == cpu::x64::sse41) ? 2 : 1; // block size is also 8 on cpu::x64::sse41
for (int i = 0; i < repeats; i++) {
int offset_sse42 = i * 4;
if (i > 0) {
@ -120,9 +128,9 @@ struct jit_uni_mvn_mean_variance_kernel_f32 : public jit_uni_mvn_mean_variance_k
if (jcp_.planar_layout) {
Vmm vmm_dst = jcp_.normalize_variance ? vmm_variance : vmm_sum;
// hsum+store
if (isa == cpu::sse42) {
if (isa == cpu::x64::sse41) {
hsum_store(vmm_dst);
} else if (isa == cpu::avx2) {
} else if (isa == cpu::x64::avx2) {
Xbyak::Ymm ymm_sum = Xbyak::Ymm(vmm_dst.getIdx());
vextractf128(xmm_aux1, ymm_sum, 0);
vextractf128(xmm_aux2, ymm_sum, 1);
@ -162,11 +170,10 @@ struct jit_uni_mvn_mean_variance_kernel_f32 : public jit_uni_mvn_mean_variance_k
}
this->postamble();
ker_ = (decltype(ker_)) this->getCode();
}
private:
using Vmm = typename conditional3<isa == cpu::sse42, Xbyak::Xmm, isa == cpu::avx2,
using Vmm = typename conditional3<isa == cpu::x64::sse41, Xbyak::Xmm, isa == cpu::x64::avx2,
Xbyak::Ymm, Xbyak::Zmm>::type;
Xbyak::Reg64 reg_src = r8;
@ -199,17 +206,17 @@ private:
inline void load_vector(Vmm vmm_src, const Xbyak::Address &op, memory::data_type src_dt) {
switch (src_dt) {
case memory::f32:
case memory::s32:
case memory::data_type::f32:
case memory::data_type::s32:
uni_vmovups(vmm_src, op);
break;
case memory::s8:
case memory::data_type::s8:
uni_vpmovsxbd(vmm_src, op);
break;
case memory::u8:
case memory::data_type::u8:
uni_vpmovzxbd(vmm_src, op);
break;
case memory::bf16:
case memory::data_type::bf16:
uni_vpmovzxwd(vmm_src, op);
uni_vpslld(vmm_src, vmm_src, 16);
break;
@ -224,13 +231,20 @@ template <cpu_isa_t isa>
struct jit_uni_mvn_kernel_f32 : public jit_uni_mvn_kernel, public jit_generator {
DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_uni_mvn_kernel_f32)
explicit jit_uni_mvn_kernel_f32(jit_mvn_config_params jcp, const mkldnn_primitive_attr &attr) : jit_uni_mvn_kernel(jcp, attr), jit_generator() {
explicit jit_uni_mvn_kernel_f32(jit_mvn_config_params jcp, const mkldnn_primitive_attr &attr) : jit_uni_mvn_kernel(jcp, attr), jit_generator() {}
void create_ker() override {
jit_generator::create_kernel();
ker_ = (decltype(ker_))jit_ker();
}
void generate() override {
const auto &p = attr_.post_ops_;
for (int i = 0; i < p.len_; i++) {
for (int i = 0; i < p.len(); i++) {
auto &post_op = p.entry_[i];
if (post_op.is_eltwise()) {
eltwise_injectors.push_back(std::make_shared<jit_uni_eltwise_injector_f32<isa>>(
this, post_op.eltwise.alg, post_op.eltwise.alpha, post_op.eltwise.beta));
this, post_op.eltwise.alg, post_op.eltwise.alpha, post_op.eltwise.beta, post_op.eltwise.scale));
} else if (post_op.is_depthwise()) {
depthwise_injectors.push_back(std::make_shared<jit_uni_depthwise_injector_f32<isa>>(
this, post_op.depthwise.alg));
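
// Two API shifts are visible here: post_ops_.len_ became the len() accessor,
// and eltwise post-op entries gained a scale field that the injector
// constructor now takes. Construction sketch using the entry layout above:
for (int i = 0; i < p.len(); i++) {                    // len_ field -> len() method
    const auto &e = p.entry_[i];
    if (e.is_eltwise())
        eltwise_injectors.push_back(std::make_shared<jit_uni_eltwise_injector_f32<isa>>(
            this, e.eltwise.alg, e.eltwise.alpha, e.eltwise.beta,
            e.eltwise.scale));                         // scale is new in v1.6
}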
@ -252,13 +266,13 @@ struct jit_uni_mvn_kernel_f32 : public jit_uni_mvn_kernel, public jit_generator
mov(reg_work_amount, ptr[reg_params + GET_OFF(work_amount)]);
mov(reg_src_stride, ptr[reg_params + GET_OFF(src_stride)]);
mov(reg_dst_stride, ptr[reg_params + GET_OFF(dst_stride)]);
if (attr_.post_ops_.len_ != 0)
if (attr_.post_ops_.len() != 0)
mov(reg_oc_off, ptr[reg_params + GET_OFF(oc_off)]);
if (isa == avx512_common)
uni_vpxor(vmm_zero, vmm_zero, vmm_zero);
int repeats = (!jcp_.planar_layout && !jcp_.across_channels && isa == cpu::sse42) ? 2 : 1; // block size is also 8 on cpu::sse42
int repeats = (!jcp_.planar_layout && !jcp_.across_channels && isa == cpu::x64::sse41) ? 2 : 1; // block size is also 8 on cpu::x64::sse41
for (int i = 0; i < repeats; i++) {
int offset_sse42 = i * 4;
if (i > 0) {
@ -270,7 +284,7 @@ struct jit_uni_mvn_kernel_f32 : public jit_uni_mvn_kernel, public jit_generator
add(reg_dst, offset_sse42 * jcp_.dst_data_size);
add(reg_mean, offset_sse42 * sizeof(float));
add(reg_variance_inv, offset_sse42 * sizeof(float));
if (attr_.post_ops_.len_ != 0)
if (attr_.post_ops_.len() != 0)
add(reg_oc_off, offset_sse42 * sizeof(float));
}
@ -319,12 +333,10 @@ struct jit_uni_mvn_kernel_f32 : public jit_uni_mvn_kernel, public jit_generator
for (auto& inj : eltwise_injectors)
inj->prepare_table();
ker_ = (decltype(ker_)) this->getCode();
}
private:
using Vmm = typename conditional3<isa == cpu::sse42, Xbyak::Xmm, isa == cpu::avx2,
using Vmm = typename conditional3<isa == cpu::x64::sse41, Xbyak::Xmm, isa == cpu::x64::avx2,
Xbyak::Ymm, Xbyak::Zmm>::type;
const int vlen = cpu_isa_traits<isa>::vlen;
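
// The vector register type is picked at compile time from the ISA (Xmm for
// sse41, Ymm for avx2, Zmm for avx512_common), and vlen follows it: 16, 32,
// or 64 bytes. An equivalent std::conditional spelling of the conditional3
// helper, for reference (requires <type_traits>):
template <cpu_isa_t isa>
using Vmm = typename std::conditional<isa == cpu::x64::sse41, Xbyak::Xmm,
            typename std::conditional<isa == cpu::x64::avx2, Xbyak::Ymm,
                                      Xbyak::Zmm>::type>::type;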
@ -360,17 +372,17 @@ private:
inline void load_vector(Vmm vmm_src, const Xbyak::Address &op, memory::data_type src_dt) {
switch (src_dt) {
case memory::f32:
case memory::s32:
case memory::data_type::f32:
case memory::data_type::s32:
uni_vmovups(vmm_src, op);
break;
case memory::s8:
case memory::data_type::s8:
uni_vpmovsxbd(vmm_src, op);
break;
case memory::u8:
case memory::data_type::u8:
uni_vpmovzxbd(vmm_src, op);
break;
case memory::bf16:
case memory::data_type::bf16:
uni_vpmovzxwd(vmm_src, op);
uni_vpslld(vmm_src, vmm_src, 16);
break;
@ -386,39 +398,39 @@ private:
Ymm ymm_dst = Ymm(vmm_dst.getIdx());
Xmm xmm_dst = Xmm(vmm_dst.getIdx());
if (dst_dt == memory::f32) {
if (dst_dt == memory::data_type::f32) {
uni_vmovups(op, vmm_dst);
} else if (dst_dt == memory::bf16) {
} else if (dst_dt == memory::data_type::bf16) {
if (mayiuse(avx512_core_bf16))
vcvtneps2bf16(ymm_dst, vmm_dst);
else
emu_vcvtneps2bf16->emit({static_cast<size_t>(vmm_dst.getIdx())}, {static_cast<size_t>(ymm_dst.getIdx())});
vmovdqu16(op, ymm_dst);
} else if (dst_dt == memory::u8) {
} else if (dst_dt == memory::data_type::u8) {
uni_vcvtps2dq(vmm_dst, vmm_dst);
if (isa == cpu::avx512_common) {
if (isa == cpu::x64::avx512_common) {
vpmaxsd(vmm_dst, vmm_dst, vmm_zero);
vpmovusdb(op, vmm_dst);
} else {
uni_vpackusdw(vmm_dst, vmm_dst, vmm_dst);
if (isa != cpu::sse42)
if (isa != cpu::x64::sse41)
vpermq(ymm_dst, ymm_dst, 0x08);
uni_vpackuswb(vmm_dst, vmm_dst, vmm_dst);
if (isa != cpu::sse42)
if (isa != cpu::x64::sse41)
vmovq(op, xmm_dst);
else
movd(op, xmm_dst);
}
} else if (dst_dt == memory::s8) {
} else if (dst_dt == memory::data_type::s8) {
uni_vcvtps2dq(vmm_dst, vmm_dst);
if (isa == cpu::avx512_common) {
if (isa == cpu::x64::avx512_common) {
vpmovsdb(op, vmm_dst);
} else {
uni_vpackssdw(vmm_dst, vmm_dst, vmm_dst);
if (isa != cpu::sse42)
if (isa != cpu::x64::sse41)
vpermq(ymm_dst, ymm_dst, 0x08);
uni_vpacksswb(vmm_dst, vmm_dst, vmm_dst);
if (isa != cpu::sse42)
if (isa != cpu::x64::sse41)
vmovq(op, xmm_dst);
else
movd(op, xmm_dst);
@ -431,7 +443,7 @@ private:
int eltwise_inj_idx = 0;
int depthwise_inj_idx = 0;
int quantization_inj_idx = 0;
for (int i = 0; i < p.len_; i++) {
for (int i = 0; i < p.len(); i++) {
auto& post_op = p.entry_[i];
if (post_op.is_eltwise()) {
eltwise_injectors[eltwise_inj_idx]->compute_vector_range(vmm_val.getIdx(), vmm_val.getIdx() + 1);
@ -445,7 +457,7 @@ private:
depthwise_inj_idx++;
} else if (post_op.is_quantization()) {
bool do_dequantization = post_op.quantization.alg == alg_kind::quantization_quantize_dequantize;
bool do_rounding = do_dequantization || isFloatCompatible(dst_dt) || i != p.len_ - 1;
bool do_rounding = do_dequantization || isFloatCompatible(dst_dt) || i != p.len() - 1;
int s_idx = vmm_val.getIdx();
quantization_injectors[quantization_inj_idx]->init_crop_ptrs(reg_oc_off);
@ -539,18 +551,18 @@ void MKLDNNMVNNode::initSupportedPrimitiveDescriptors() {
config.inConfs[0].inPlace = -1;
config.outConfs[0].inPlace = canBeInplace ? 0 : -1;
auto pushDesc = [&](memory::format format, impl_desc_type impl_type) {
auto pushDesc = [&](memory::format_tag format, impl_desc_type impl_type) {
config.inConfs[0].desc = MKLDNNMemoryDesc(getParentEdgeAt(0)->getDims(), inputDataType, format);
config.outConfs[0].desc = MKLDNNMemoryDesc(getParentEdgeAt(0)->getDims(), outputDataType, format);
supportedPrimitiveDescriptors.push_back({config, impl_type, format});
};
impl_desc_type impl_type;
if (mayiuse(cpu::avx512_common)) {
if (mayiuse(cpu::x64::avx512_common)) {
impl_type = impl_desc_type::jit_avx512;
} else if (mayiuse(cpu::avx2)) {
} else if (mayiuse(cpu::x64::avx2)) {
impl_type = impl_desc_type::jit_avx2;
} else if (mayiuse(cpu::sse42)) {
} else if (mayiuse(cpu::x64::sse41)) {
impl_type = impl_desc_type::jit_sse42;
} else {
impl_type = impl_desc_type::ref;
@ -558,24 +570,24 @@ void MKLDNNMVNNode::initSupportedPrimitiveDescriptors() {
if (across_channels == 0 && normalize_variance == 1) {
if (getParentEdgeAt(0)->getDims().ndims() == 4) {
pushDesc(memory::nhwc, impl_type);
pushDesc(memory::format_tag::nhwc, impl_type);
} else if (getParentEdgeAt(0)->getDims().ndims() == 5) {
pushDesc(memory::ndhwc, impl_type);
pushDesc(memory::format_tag::ndhwc, impl_type);
}
}
if (isFloatCompatible(inputPrecision) && isFloatCompatible(outputPrecision)) {
if (impl_desc_type::jit_avx512 == impl_type) {
if (getParentEdgeAt(0)->getDims().ndims() == 4) {
pushDesc(memory::nChw16c, impl_type);
pushDesc(memory::format_tag::nChw16c, impl_type);
} else if (getParentEdgeAt(0)->getDims().ndims() == 5) {
pushDesc(memory::nCdhw16c, impl_type);
pushDesc(memory::format_tag::nCdhw16c, impl_type);
}
} else if (impl_desc_type::jit_avx2 == impl_type || impl_desc_type::jit_sse42 == impl_type) {
if (getParentEdgeAt(0)->getDims().ndims() == 4) {
pushDesc(memory::nChw8c, impl_type);
pushDesc(memory::format_tag::nChw8c, impl_type);
} else if (getParentEdgeAt(0)->getDims().ndims() == 5) {
pushDesc(memory::nCdhw8c, impl_type);
pushDesc(memory::format_tag::nCdhw8c, impl_type);
}
}
@ -607,34 +619,42 @@ void MKLDNNMVNNode::createPrimitive() {
jcp.normalize_variance = normalize_variance;
jcp.across_channels = across_channels;
if (mayiuse(cpu::avx512_common)) {
mvn_kernel.reset(new jit_uni_mvn_kernel_f32<cpu::avx512_common>(jcp, *attr.get()));
if (mayiuse(cpu::x64::avx512_common)) {
mvn_kernel.reset(new jit_uni_mvn_kernel_f32<cpu::x64::avx512_common>(jcp, *attr.get()));
jcp.normalize_variance = false;
mvn_mean_kernel.reset(new jit_uni_mvn_mean_variance_kernel_f32<cpu::avx512_common>(jcp));
mvn_mean_kernel.reset(new jit_uni_mvn_mean_variance_kernel_f32<cpu::x64::avx512_common>(jcp));
if (normalize_variance) {
jcp.normalize_variance = true;
mvn_variance_kernel.reset(new jit_uni_mvn_mean_variance_kernel_f32<cpu::avx512_common>(jcp));
mvn_variance_kernel.reset(new jit_uni_mvn_mean_variance_kernel_f32<cpu::x64::avx512_common>(jcp));
}
} else if (mayiuse(cpu::avx2)) {
mvn_kernel.reset(new jit_uni_mvn_kernel_f32<cpu::avx2>(jcp, *attr.get()));
} else if (mayiuse(cpu::x64::avx2)) {
mvn_kernel.reset(new jit_uni_mvn_kernel_f32<cpu::x64::avx2>(jcp, *attr.get()));
jcp.normalize_variance = false;
mvn_mean_kernel.reset(new jit_uni_mvn_mean_variance_kernel_f32<cpu::avx2>(jcp));
mvn_mean_kernel.reset(new jit_uni_mvn_mean_variance_kernel_f32<cpu::x64::avx2>(jcp));
if (normalize_variance) {
jcp.normalize_variance = true;
mvn_variance_kernel.reset(new jit_uni_mvn_mean_variance_kernel_f32<cpu::avx2>(jcp));
mvn_variance_kernel.reset(new jit_uni_mvn_mean_variance_kernel_f32<cpu::x64::avx2>(jcp));
}
} else if (mayiuse(cpu::sse42)) {
mvn_kernel.reset(new jit_uni_mvn_kernel_f32<cpu::sse42>(jcp, *attr.get()));
} else if (mayiuse(cpu::x64::sse41)) {
mvn_kernel.reset(new jit_uni_mvn_kernel_f32<cpu::x64::sse41>(jcp, *attr.get()));
jcp.normalize_variance = false;
mvn_mean_kernel.reset(new jit_uni_mvn_mean_variance_kernel_f32<cpu::sse42>(jcp));
mvn_mean_kernel.reset(new jit_uni_mvn_mean_variance_kernel_f32<cpu::x64::sse41>(jcp));
if (normalize_variance) {
jcp.normalize_variance = true;
mvn_variance_kernel.reset(new jit_uni_mvn_mean_variance_kernel_f32<cpu::sse42>(jcp));
mvn_variance_kernel.reset(new jit_uni_mvn_mean_variance_kernel_f32<cpu::x64::sse41>(jcp));
}
}
if (mvn_kernel)
mvn_kernel->create_ker();
if (mvn_mean_kernel)
mvn_mean_kernel->create_ker();
if (mvn_variance_kernel)
mvn_variance_kernel->create_ker();
}
void MKLDNNMVNNode::setPostOps(mkldnn::primitive_attr &attr, bool initWeights) {
@ -782,11 +802,11 @@ std::tuple<size_t, size_t, size_t, size_t, size_t> MKLDNNMVNNode::get5dShapes(co
template <typename in_data_t, typename out_data_t>
void MKLDNNMVNNode::mvn_pln(const in_data_t* src_data, out_data_t* dst_data, const SizeVector& dims) {
size_t blk_size = 1; // blk size in vmm
if (mayiuse(cpu::avx512_common)) {
if (mayiuse(cpu::x64::avx512_common)) {
blk_size = 16;
} else if (mayiuse(cpu::avx2)) {
} else if (mayiuse(cpu::x64::avx2)) {
blk_size = 8;
} else if (mayiuse(cpu::sse42)) {
} else if (mayiuse(cpu::x64::sse41)) {
blk_size = 4;
}
@ -1005,10 +1025,10 @@ template <typename in_data_t, typename out_data_t>
void MKLDNNMVNNode::mvn_blk(const in_data_t* src_data, out_data_t* dst_data, const SizeVector& dims) {
size_t blk_size = 1; // channel blk for memory layout
size_t ele_in_vmm = 4;
if (mayiuse(cpu::avx512_common)) {
if (mayiuse(cpu::x64::avx512_common)) {
blk_size = 16;
ele_in_vmm = 16;
} else if (mayiuse(cpu::avx2)) {
} else if (mayiuse(cpu::x64::avx2)) {
blk_size = 8;
ele_in_vmm = 8;
} else {
@ -1036,7 +1056,7 @@ void MKLDNNMVNNode::mvn_blk(const in_data_t* src_data, out_data_t* dst_data, con
size_t C3 = C2 * CB;
size_t C5 = C * D * H * W;
size_t threads_num = mkldnn_get_max_threads();
size_t threads_num = parallel_get_num_threads();
size_t aux_buffer_size = across_channels ? blk_size : rnd_up(C, blk_size);
std::vector<float> mean_buffer(aux_buffer_size * threads_num);
std::vector<float> variance_buffer(aux_buffer_size * threads_num);
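
// mkldnn_get_max_threads()/mkldnn_get_thread_num() are replaced by oneDNN's
// parallel_get_num_threads()/parallel_get_thread_num() helpers; the
// per-thread scratch pattern itself is unchanged. Sketch, assuming
// ie_parallel's parallel_for and an accumulate step per work item:
size_t nthr = parallel_get_num_threads();
std::vector<float> scratch(aux_buffer_size * nthr, 0.f);

parallel_for(work_amount, [&](size_t idx) {
    // each worker accumulates into its own slice, indexed by its thread id
    float *buf = &scratch[aux_buffer_size * parallel_get_thread_num()];
    // ... accumulate partial sums for this work item into buf ...
});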
@ -1053,7 +1073,7 @@ void MKLDNNMVNNode::mvn_blk(const in_data_t* src_data, out_data_t* dst_data, con
float mean_internal = 0.0f;
if ((min_cb == blk_size) && mvn_mean_kernel) {
auto mean_buffer_ptr = &mean_buffer[blk_size * mkldnn_get_thread_num()];
auto mean_buffer_ptr = &mean_buffer[blk_size * parallel_get_thread_num()];
for (int i = 0; i < blk_size; i++)
mean_buffer_ptr[i] = 0.f;
@ -1089,7 +1109,7 @@ void MKLDNNMVNNode::mvn_blk(const in_data_t* src_data, out_data_t* dst_data, con
float variance_internal = 0.0f;
if ((blk_size == min_cb) && mvn_variance_kernel) {
auto variance_buffer_ptr = &variance_buffer[blk_size * mkldnn_get_thread_num()];
auto variance_buffer_ptr = &variance_buffer[blk_size * parallel_get_thread_num()];
for (int i = 0; i < blk_size; i++)
variance_buffer_ptr[i] = 0.f;
@ -1321,7 +1341,7 @@ void MKLDNNMVNNode::mvn_blk(const in_data_t* src_data, out_data_t* dst_data, con
float dst_value = (src_data[ch + w * src_stride] - mean_buffer_ptr[c]) * variance_buffer_ptr[c];
if (!fusedWith.empty()) {
const auto &p = (*attr.get()).post_ops_;
for (int i = 0; i < p.len_; i++) {
for (int i = 0; i < p.len(); i++) {
auto &post_op = p.entry_[i];
if (post_op.is_eltwise()) {
// only eltwise_relu supported
@ -1335,7 +1355,7 @@ void MKLDNNMVNNode::mvn_blk(const in_data_t* src_data, out_data_t* dst_data, con
bool do_dequantization = post_op.quantization.alg ==
alg_kind::quantization_quantize_dequantize;
bool do_rounding = do_dequantization || isFloatCompatible(output_prec) ||
i != p.len_ - 1;
i != p.len() - 1;
auto quant = post_op.quantization;
float crl = quant.crop_low_data->shifts_[quant.crop_low_data->count_ == 1 ? 0 : cb * blk_size + c];

View File

@ -48,6 +48,8 @@ struct jit_uni_mvn_mean_variance_kernel {
explicit jit_uni_mvn_mean_variance_kernel(jit_mvn_config_params jcp) : ker_(nullptr), jcp_(jcp) {}
virtual ~jit_uni_mvn_mean_variance_kernel() {}
virtual void create_ker() = 0;
jit_mvn_config_params jcp_;
};
@ -62,6 +64,8 @@ struct jit_uni_mvn_kernel {
explicit jit_uni_mvn_kernel(jit_mvn_config_params jcp, const mkldnn_primitive_attr &attr) : ker_(nullptr), jcp_(jcp), attr_(attr) {}
virtual ~jit_uni_mvn_kernel() {}
virtual void create_ker() = 0;
jit_mvn_config_params jcp_;
const mkldnn_primitive_attr &attr_;
};

View File

@ -2,39 +2,48 @@
// SPDX-License-Identifier: Apache-2.0
//
#include "mkldnn_normalize_node.h"
#include <legacy/ie_layers_internal.hpp>
#include <ie_parallel.hpp>
#include "mkldnn_quantize_node.h"
#include "mkldnn_eltwise_node.h"
#include <mkldnn_extension_utils.h>
#include "utils/bfloat16.hpp"
#include <legacy/ie_layers_internal.hpp>
#include "ie_parallel.hpp"
#include "jit_uni_eltwise.hpp"
#include "jit_uni_depthwise.hpp"
#include "jit_uni_quantization.hpp"
#include "mkldnn_extension_utils.h"
#include <cpu/x64/jit_uni_eltwise_injector.hpp>
#include <cpu/x64/jit_uni_depthwise_injector.hpp>
#include <cpu/x64/jit_uni_quantization_injector.hpp>
#include "bf16transformer.h"
#include "common/cpu_memcpy.h"
#include "mkldnn_normalize_node.h"
#include <mkldnn_selective_build.h>
using namespace mkldnn;
using namespace MKLDNNPlugin;
using namespace InferenceEngine;
using namespace mkldnn::impl;
using namespace mkldnn::impl::cpu;
using namespace mkldnn::impl::cpu::x64;
using namespace mkldnn::impl::utils;
using namespace Xbyak;
#define GET_OFF(field) offsetof(jit_normalize_call_args, field)
static inline bool isFloatCompatible(memory::data_type type) {
return memory::f32 == type || memory::bf16 == type;
return memory::data_type::f32 == type || memory::data_type::bf16 == type;
}
template <cpu_isa_t isa>
struct jit_uni_normalize_modulo_kernel_f32 : public jit_uni_normalize_modulo_kernel, public jit_generator {
DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_uni_normalize_modulo_kernel_f32)
jit_uni_normalize_modulo_kernel_f32(jit_normalize_config_params jcp) : jit_uni_normalize_modulo_kernel(jcp), jit_generator() {
jit_uni_normalize_modulo_kernel_f32(jit_normalize_config_params jcp) : jit_uni_normalize_modulo_kernel(jcp), jit_generator() {}
void create_ker() override {
jit_generator::create_kernel();
ker_ = (decltype(ker_))jit_ker();
}
void generate() override {
this->preamble();
mov(reg_src, ptr[reg_params + GET_OFF(src)]);
mov(reg_modulo, ptr[reg_params + GET_OFF(modulo)]);
@ -52,7 +61,7 @@ struct jit_uni_normalize_modulo_kernel_f32 : public jit_uni_normalize_modulo_ker
load_vector(vmm_val, ptr[reg_src], jcp_.src_dt);
uni_vfmadd231ps(vmm_sqr_sum, vmm_val, vmm_val);
if (isa == cpu::sse42 && jcp_.is_blk) {
if (isa == cpu::x64::sse41 && jcp_.is_blk) {
int sse42_offset = 4;
load_vector(vmm_val, ptr[reg_src + sse42_offset * jcp_.src_data_size], jcp_.src_dt);
uni_vfmadd231ps(vmm_sqr_sum, vmm_val, vmm_val);
@ -69,9 +78,9 @@ struct jit_uni_normalize_modulo_kernel_f32 : public jit_uni_normalize_modulo_ker
uni_vmovups(ptr[reg_modulo], vmm_sqr_sum);
} else {
// hsum+store
if (isa == cpu::sse42) {
if (isa == cpu::x64::sse41) {
hsum_store(vmm_sqr_sum);
} else if (isa == cpu::avx2) {
} else if (isa == cpu::x64::avx2) {
Xbyak::Ymm ymm_sqr_sum = Xbyak::Ymm(vmm_sqr_sum.getIdx());
vextractf128(xmm_aux1, ymm_sqr_sum, 0);
vextractf128(xmm_aux2, ymm_sqr_sum, 1);
@ -91,11 +100,10 @@ struct jit_uni_normalize_modulo_kernel_f32 : public jit_uni_normalize_modulo_ker
}
this->postamble();
ker_ = (decltype(ker_)) this->getCode();
}
private:
using Vmm = typename conditional3<isa == cpu::sse42, Xbyak::Xmm, isa == cpu::avx2,
using Vmm = typename conditional3<isa == cpu::x64::sse41, Xbyak::Xmm, isa == cpu::x64::avx2,
Xbyak::Ymm, Xbyak::Zmm>::type;
size_t vlen = cpu_isa_traits<isa>::vlen;
@ -121,18 +129,18 @@ private:
inline void load_vector(Vmm vmm_src, const Xbyak::Address &op, memory::data_type src_dt) {
switch (src_dt) {
case memory::f32:
case memory::s32:
case memory::data_type::f32:
case memory::data_type::s32:
uni_vmovups(vmm_src, op);
break;
case memory::bf16:
case memory::data_type::bf16:
uni_vpmovzxwd(vmm_src, op);
uni_vpslld(vmm_src, vmm_src, 16);
break;
case memory::s8:
case memory::data_type::s8:
uni_vpmovsxbd(vmm_src, op);
break;
case memory::u8:
case memory::data_type::u8:
uni_vpmovzxbd(vmm_src, op);
break;
default:
@ -149,13 +157,20 @@ struct jit_uni_normalize_kernel_f32 : public jit_uni_normalize_kernel, public ji
DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_uni_normalize_kernel_f32)
explicit jit_uni_normalize_kernel_f32(jit_normalize_config_params jcp, const mkldnn_primitive_attr &attr)
: jit_uni_normalize_kernel(jcp, attr), jit_generator() {
: jit_uni_normalize_kernel(jcp, attr), jit_generator() {}
void create_ker() override {
jit_generator::create_kernel();
ker_ = (decltype(ker_))jit_ker();
}
void generate() override {
const auto &p = attr_.post_ops_;
for (int i = 0; i < p.len_; i++) {
for (int i = 0; i < p.len(); i++) {
auto &post_op = p.entry_[i];
if (post_op.is_eltwise()) {
eltwise_injectors.push_back(std::make_shared<jit_uni_eltwise_injector_f32<isa>>(
this, post_op.eltwise.alg, post_op.eltwise.alpha, post_op.eltwise.beta));
this, post_op.eltwise.alg, post_op.eltwise.alpha, post_op.eltwise.beta, post_op.eltwise.scale));
} else if (post_op.is_depthwise()) {
depthwise_injectors.push_back(std::make_shared<jit_uni_depthwise_injector_f32<isa>>(
this, post_op.depthwise.alg));
@ -176,7 +191,7 @@ struct jit_uni_normalize_kernel_f32 : public jit_uni_normalize_kernel, public ji
mov(reg_weights, ptr[reg_params + GET_OFF(weights)]);
mov(reg_fused_factor, ptr[reg_params + GET_OFF(fused_factor)]);
mov(reg_work_amount, ptr[reg_params + GET_OFF(work_amount)]);
if (attr_.post_ops_.len_ != 0)
if (attr_.post_ops_.len() != 0)
mov(reg_oc_off, ptr[reg_params + GET_OFF(oc_off)]);
if (isa == avx512_common)
uni_vpxor(vmm_zero, vmm_zero, vmm_zero);
@ -195,12 +210,10 @@ struct jit_uni_normalize_kernel_f32 : public jit_uni_normalize_kernel, public ji
emu_vcvtneps2bf16->emit_table();
for (auto& inj : eltwise_injectors)
inj->prepare_table();
ker_ = (decltype(ker_)) this->getCode();
}
private:
using Vmm = typename conditional3<isa == cpu::sse42, Xbyak::Xmm, isa == cpu::avx2,
using Vmm = typename conditional3<isa == cpu::x64::sse41, Xbyak::Xmm, isa == cpu::x64::avx2,
Xbyak::Ymm, Xbyak::Zmm>::type;
size_t vlen = cpu_isa_traits<isa>::vlen;
@ -255,7 +268,7 @@ private:
Xbyak::Label tail_loop_label;
Xbyak::Label tail_loop_end_label;
int step = jcp_.src_dt == memory::bf16 ? 16 : (vlen / sizeof(float));
int step = jcp_.src_dt == memory::data_type::bf16 ? 16 : (vlen / sizeof(float));
L(main_loop_label);
{
cmp(reg_work_amount, step);
@ -276,7 +289,7 @@ private:
add(reg_modulo, vlen);
}
}
if (attr_.post_ops_.len_ != 0) {
if (attr_.post_ops_.len() != 0) {
apply_post_ops(jcp_.dst_dt, 1);
}
store_vector(ptr[reg_dst], vmm_val, jcp_.dst_dt);
@ -300,17 +313,17 @@ private:
uni_vmulps(xmm_val, xmm_val, xmm_fused_factor);
} else {
if (jcp_.channel_shared) {
load_scalar(xmm_fused_factor, ptr[reg_fused_factor], memory::f32);
load_scalar(xmm_fused_factor, ptr[reg_fused_factor], memory::data_type::f32);
uni_vmulps(xmm_val, xmm_val, xmm_fused_factor);
add(reg_fused_factor, step * sizeof(float));
} else {
load_scalar(xmm_modulo, ptr[reg_modulo], memory::f32);
load_scalar(xmm_modulo, ptr[reg_modulo], memory::data_type::f32);
uni_vmulps(xmm_val, xmm_val, xmm_modulo);
uni_vmulps(xmm_val, xmm_val, xmm_scale);
add(reg_modulo, step * sizeof(float));
}
}
if (attr_.post_ops_.len_ != 0) {
if (attr_.post_ops_.len() != 0) {
apply_post_ops(jcp_.dst_dt, 1); // vector and broadcast
}
store_scalar(ptr[reg_dst], xmm_val, jcp_.dst_dt);
@ -338,7 +351,7 @@ private:
Xbyak::Label tail_loop_label;
Xbyak::Label tail_loop_end_label;
int step = jcp_.src_dt == memory::bf16 ? 16 : (vlen / sizeof(float));
int step = jcp_.src_dt == memory::data_type::bf16 ? 16 : (vlen / sizeof(float));
L(main_loop_label);
{
cmp(reg_work_amount, step);
@ -359,7 +372,7 @@ private:
add(reg_weights, vlen);
}
}
if (attr_.post_ops_.len_ != 0) {
if (attr_.post_ops_.len() != 0) {
apply_post_ops(jcp_.dst_dt, 0);
add(reg_oc_off, vlen); // out channel offset of fused ops weights in byte
}
@ -384,17 +397,17 @@ private:
uni_vmulps(xmm_val, xmm_val, xmm_fused_factor);
} else {
if (jcp_.across_spatial) {
load_scalar(xmm_fused_factor, ptr[reg_fused_factor], memory::f32);
load_scalar(xmm_fused_factor, ptr[reg_fused_factor], memory::data_type::f32);
uni_vmulps(xmm_val, xmm_val, xmm_fused_factor);
add(reg_fused_factor, step * sizeof(float));
} else {
load_scalar(xmm_scale, ptr[reg_weights], memory::f32);
load_scalar(xmm_scale, ptr[reg_weights], memory::data_type::f32);
uni_vmulps(xmm_val, xmm_val, xmm_scale);
uni_vmulps(xmm_val, xmm_val, xmm_modulo);
add(reg_weights, step * sizeof(float));
}
}
if (attr_.post_ops_.len_ != 0) {
if (attr_.post_ops_.len() != 0) {
apply_post_ops(jcp_.dst_dt, 0);
add(reg_oc_off, step * sizeof(float));
}
@ -413,15 +426,15 @@ private:
inline void normalize_blk() {
size_t blk_size = 0;
size_t simd_w = 0;
if (isa == cpu::avx512_common) {
if (isa == cpu::x64::avx512_common) {
blk_size = simd_w = 16;
} else if (isa == cpu::avx2) {
} else if (isa == cpu::x64::avx2) {
blk_size = simd_w = 8;
} else {
blk_size = 8;
simd_w = 4;
}
bool is_sse42 = (isa == cpu::sse42);
bool is_sse42 = (isa == cpu::x64::sse41);
if (jcp_.across_spatial) {
if (jcp_.channel_shared) {
@ -444,7 +457,7 @@ private:
load_vector(vmm_val, ptr[reg_src], jcp_.src_dt);
uni_vmulps(vmm_val, vmm_val, vmm_fused_factor);
if (attr_.post_ops_.len_ != 0) {
if (attr_.post_ops_.len() != 0) {
apply_post_ops(jcp_.dst_dt, 0);
}
store_vector(ptr[reg_dst], vmm_val, jcp_.dst_dt);
@ -457,7 +470,7 @@ private:
} else {
uni_vmulps(vmm_val, vmm_val, vmm_fused_factor2); // ld once
}
if (attr_.post_ops_.len_ != 0) {
if (attr_.post_ops_.len() != 0) {
add(reg_oc_off, sse42_offset * sizeof(float));
apply_post_ops(jcp_.dst_dt, 0);
sub(reg_oc_off, sse42_offset * sizeof(float));
@ -497,7 +510,7 @@ private:
uni_vmulps(vmm_val, vmm_val, vmm_modulo);
add(reg_weights, vlen);
}
if (attr_.post_ops_.len_ != 0) {
if (attr_.post_ops_.len() != 0) {
apply_post_ops(jcp_.dst_dt, 0);
add(reg_oc_off, vlen); // vlen depends on the ISA
}
@ -514,7 +527,7 @@ private:
uni_vmulps(vmm_val, vmm_val, vmm_modulo); // bc once
add(reg_weights, vlen); // 4 * sizeof(float)
}
if (attr_.post_ops_.len_ != 0) {
if (attr_.post_ops_.len() != 0) {
apply_post_ops(jcp_.dst_dt, 0);
add(reg_oc_off, vlen); // vlen depends on the ISA
}
@ -532,18 +545,18 @@ private:
inline void load_vector(Vmm vmm_src, const Xbyak::Address &op, memory::data_type src_dt) {
switch (src_dt) {
case memory::f32:
case memory::s32:
case memory::data_type::f32:
case memory::data_type::s32:
uni_vmovups(vmm_src, op);
break;
case memory::bf16:
case memory::data_type::bf16:
uni_vpmovzxwd(vmm_src, op);
uni_vpslld(vmm_src, vmm_src, 16);
break;
case memory::s8:
case memory::data_type::s8:
uni_vpmovsxbd(vmm_src, op);
break;
case memory::u8:
case memory::data_type::u8:
uni_vpmovzxbd(vmm_src, op);
break;
default:
@ -555,19 +568,19 @@ private:
inline void load_scalar(Xmm xmm_src, const Xbyak::Address &op, memory::data_type src_dt) {
switch (src_dt) {
case memory::f32:
case memory::s32:
case memory::data_type::f32:
case memory::data_type::s32:
movss(xmm_src, op);
break;
case memory::bf16:
case memory::data_type::bf16:
pinsrw(xmm_src, op, 0x0);
uni_vpslld(xmm_src, xmm_src, 16);
break;
case memory::s8:
case memory::data_type::s8:
movsx(reg_tmp_32, op);
movq(xmm_src, reg_tmp_64);
break;
case memory::u8:
case memory::data_type::u8:
movzx(reg_tmp_32, op);
movq(xmm_src, reg_tmp_64);
break;
@ -584,39 +597,39 @@ private:
Ymm ymm_dst = Ymm(vmm_dst.getIdx());
Xmm xmm_dst = Xmm(vmm_dst.getIdx());
if (dst_dt == memory::f32) {
if (dst_dt == memory::data_type::f32) {
uni_vmovups(op, vmm_dst);
} else if (dst_dt == memory::bf16) {
} else if (dst_dt == memory::data_type::bf16) {
if (mayiuse(avx512_core_bf16))
vcvtneps2bf16(ymm_dst, vmm_dst);
else
emu_vcvtneps2bf16->emit({static_cast<size_t>(vmm_dst.getIdx())}, {static_cast<size_t>(ymm_dst.getIdx())});
vmovdqu16(op, ymm_dst);
} else if (dst_dt == memory::u8) {
} else if (dst_dt == memory::data_type::u8) {
uni_vcvtps2dq(vmm_dst, vmm_dst);
if (isa == cpu::avx512_common) {
if (isa == cpu::x64::avx512_common) {
vpmaxsd(vmm_dst, vmm_dst, vmm_zero);
vpmovusdb(op, vmm_dst);
} else {
uni_vpackusdw(vmm_dst, vmm_dst, vmm_dst);
if (isa != cpu::sse42)
if (isa != cpu::x64::sse41)
vpermq(ymm_dst, ymm_dst, 0x08);
uni_vpackuswb(vmm_dst, vmm_dst, vmm_dst);
if (isa != cpu::sse42)
if (isa != cpu::x64::sse41)
vmovq(op, xmm_dst);
else
movd(op, xmm_dst);
}
} else if (dst_dt == memory::s8) {
} else if (dst_dt == memory::data_type::s8) {
uni_vcvtps2dq(vmm_dst, vmm_dst);
if (isa == cpu::avx512_common) {
if (isa == cpu::x64::avx512_common) {
vpmovsdb(op, vmm_dst);
} else {
uni_vpackssdw(vmm_dst, vmm_dst, vmm_dst);
if (isa != cpu::sse42)
if (isa != cpu::x64::sse41)
vpermq(ymm_dst, ymm_dst, 0x08);
uni_vpacksswb(vmm_dst, vmm_dst, vmm_dst);
if (isa != cpu::sse42)
if (isa != cpu::x64::sse41)
vmovq(op, xmm_dst);
else
movd(op, xmm_dst);
@ -630,21 +643,21 @@ private:
}
switch (dst_dt) {
case memory::f32:
case memory::s32:
case memory::data_type::f32:
case memory::data_type::s32:
movss(op, xmm_dst);
break;
case memory::bf16:
case memory::data_type::bf16:
uni_vpsrld(xmm_dst, xmm_dst, 16);
pextrw(op, xmm_dst, 0x0);
break;
case memory::s8:
case memory::data_type::s8:
uni_vpackssdw(xmm_dst, xmm_dst, xmm_dst);
uni_vpacksswb(xmm_dst, xmm_dst, xmm_dst);
movq(reg_tmp_64, xmm_dst);
mov(op, reg_tmp_8);
break;
case memory::u8:
case memory::data_type::u8:
uni_vpackusdw(xmm_dst, xmm_dst, xmm_dst);
uni_vpackuswb(xmm_dst, xmm_dst, xmm_dst);
movq(reg_tmp_64, xmm_dst);
@ -662,7 +675,7 @@ private:
int eltwise_inj_idx = 0;
int depthwise_inj_idx = 0;
int quantization_inj_idx = 0;
for (int i = 0; i < p.len_; i++) {
for (int i = 0; i < p.len(); i++) {
auto& post_op = p.entry_[i];
if (post_op.is_eltwise()) {
if (eltwise_injectors.size() <= eltwise_inj_idx
@ -686,7 +699,7 @@ private:
|| quantization_injectors[quantization_inj_idx] == nullptr)
assert(!"Invalid quantization injectors.");
bool do_dequantization = post_op.quantization.alg == alg_kind::quantization_quantize_dequantize;
bool do_rounding = do_dequantization || isFloatCompatible(dst_dt) || i != p.len_ - 1;
bool do_rounding = do_dequantization || isFloatCompatible(dst_dt) || i != p.len() - 1;
int s_idx = vmm_val.getIdx();
@ -835,7 +848,7 @@ void MKLDNNNormalizeNode::initSupportedPrimitiveDescriptors() {
config.inConfs[0].inPlace = -1;
config.outConfs[0].inPlace = canBeInplace ? 0 : -1;
auto pushDesc = [&](memory::format format) {
auto pushDesc = [&](memory::format_tag format) {
config.inConfs[0].desc = MKLDNNMemoryDesc(getParentEdgeAt(0)->getDims(), inputDataType, format);
config.outConfs[0].desc = MKLDNNMemoryDesc(getParentEdgeAt(0)->getDims(), outputDataType, format);
supportedPrimitiveDescriptors.push_back({config, impl_desc_type::unknown, format});
@ -843,12 +856,12 @@ void MKLDNNNormalizeNode::initSupportedPrimitiveDescriptors() {
// only the plain layout is supported w/o sse4.1
if (getParentEdgeAt(0)->getDims().ndims() == 4) {
if (mayiuse(cpu::sse42)) {
pushDesc(memory::nhwc);
if (mayiuse(cpu::avx512_common)) {
pushDesc(memory::nChw16c);
if (mayiuse(cpu::x64::sse41)) {
pushDesc(memory::format_tag::nhwc);
if (mayiuse(cpu::x64::avx512_common)) {
pushDesc(memory::format_tag::nChw16c);
} else {
pushDesc(memory::nChw8c);
pushDesc(memory::format_tag::nChw8c);
}
}
}
@ -890,15 +903,20 @@ void MKLDNNNormalizeNode::createPrimitive() {
THROW_IE_EXCEPTION << "Preferable primitive descriptor is not set.";
auto selectedPD = getSelectedPrimitiveDescriptor();
Layout selected_layout = selectedPD->getConfig().inConfs[0].desc.getLayout();
auto jcp = jit_normalize_config_params();
jcp.src_dt = MKLDNNExtensionUtils::IEPrecisionToDataType(selectedPD->getConfig().inConfs[0].desc.getPrecision());
jcp.dst_dt = MKLDNNExtensionUtils::IEPrecisionToDataType(selectedPD->getConfig().outConfs[0].desc.getPrecision());
jcp.src_data_size = MKLDNNExtensionUtils::sizeOfDataType(jcp.src_dt);
jcp.dst_data_size = MKLDNNExtensionUtils::sizeOfDataType(jcp.dst_dt);
jcp.is_nchw = selected_layout == MKLDNNMemory::GetPlainLayout(getChildEdgeAt(0)->getDims());
jcp.is_nhwc = selected_layout == Layout::NHWC;
jcp.is_blk = selected_layout == Layout::BLOCKED;
jcp.is_nchw = jcp.is_nhwc = jcp.is_blk = false;
if (getParentEdgeAt(0)->getMemory().GetDesc().isPlainFormat()) {
jcp.is_nchw = true;
} else if (getParentEdgeAt(0)->getMemory().GetDesc().isBlockedCFormat()) {
jcp.is_blk = true;
} else {
jcp.is_nhwc = true;
}
jcp.across_spatial = across_spatial;
jcp.channel_shared = channel_shared;
auto dims = getParentEdgeAt(0)->getDesc().getDims();
@ -908,25 +926,30 @@ void MKLDNNNormalizeNode::createPrimitive() {
jcp.h = (dims_size > 2) ? dims[2] : 1lu;
jcp.w = (dims_size > 3) ? dims[3] : 1lu;
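
// Layout classification now asks the memory descriptor directly
// (isPlainFormat/isBlockedCFormat) instead of comparing
// InferenceEngine::Layout enums. A sketch of what such predicates inspect
// (an assumption about their implementation, using real dnnl_memory_desc_t
// fields):
bool is_plain(const mkldnn::memory::desc &md) {
    return md.data.format_kind == dnnl_blocked &&
           md.data.format_desc.blocking.inner_nblks == 0;   // no inner blocks
}
bool is_blocked_c(const mkldnn::memory::desc &md) {
    const auto &b = md.data.format_desc.blocking;
    return b.inner_nblks == 1 && b.inner_idxs[0] == 1;      // channels blocked, e.g. nChw8c/16c
}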
if (mayiuse(cpu::avx512_common)) {
normalize_modulo_kernel.reset(new jit_uni_normalize_modulo_kernel_f32<cpu::avx512_common>(jcp));
normalize_kernel.reset(new jit_uni_normalize_kernel_f32<cpu::avx512_common>(jcp, *attr.get()));
} else if (mayiuse(cpu::avx2)) {
normalize_modulo_kernel.reset(new jit_uni_normalize_modulo_kernel_f32<cpu::avx2>(jcp));
normalize_kernel.reset(new jit_uni_normalize_kernel_f32<cpu::avx2>(jcp, *attr.get()));
} else if (mayiuse(cpu::sse42)) {
normalize_modulo_kernel.reset(new jit_uni_normalize_modulo_kernel_f32<cpu::sse42>(jcp));
normalize_kernel.reset(new jit_uni_normalize_kernel_f32<cpu::sse42>(jcp, *attr.get()));
if (mayiuse(cpu::x64::avx512_common)) {
normalize_modulo_kernel.reset(new jit_uni_normalize_modulo_kernel_f32<cpu::x64::avx512_common>(jcp));
normalize_kernel.reset(new jit_uni_normalize_kernel_f32<cpu::x64::avx512_common>(jcp, *attr.get()));
} else if (mayiuse(cpu::x64::avx2)) {
normalize_modulo_kernel.reset(new jit_uni_normalize_modulo_kernel_f32<cpu::x64::avx2>(jcp));
normalize_kernel.reset(new jit_uni_normalize_kernel_f32<cpu::x64::avx2>(jcp, *attr.get()));
} else if (mayiuse(cpu::x64::sse41)) {
normalize_modulo_kernel.reset(new jit_uni_normalize_modulo_kernel_f32<cpu::x64::sse41>(jcp));
normalize_kernel.reset(new jit_uni_normalize_kernel_f32<cpu::x64::sse41>(jcp, *attr.get()));
}
if (normalize_kernel)
normalize_kernel->create_ker();
if (normalize_modulo_kernel)
normalize_modulo_kernel->create_ker();
const auto &p = (*attr.get()).post_ops_;
for (int i = 0; i < p.len_; i++) {
for (int i = 0; i < p.len(); i++) {
auto &post_op = p.entry_[i];
if (post_op.is_eltwise()) {
eltwise_injectors_ref.push_back(std::make_shared<ref_eltwise_scalar_fwd_t>(
post_op.eltwise.alg, post_op.eltwise.alpha, post_op.eltwise.beta));
eltwise_injectors_ref.push_back(std::make_shared<cpu::ref_eltwise_scalar_fwd_t>(
post_op.eltwise.alg, post_op.eltwise.alpha, post_op.eltwise.beta, post_op.eltwise.scale));
} else if (post_op.is_depthwise()) {
depthwise_injectors_ref.push_back(std::make_shared<ref_depthwise_scalar_fwd_t>(
depthwise_injectors_ref.push_back(std::make_shared<cpu::ref_depthwise_scalar_fwd_t>(
post_op.depthwise.alg));
}
}
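
// The reference (non-JIT) post-op path keeps the same shape; only len_ became
// len() and the scalar functors moved under cpu::. Application sketch for a
// single value, assuming the injector vector built above:
float apply_eltwise_post_ops(float v, const mkldnn::impl::post_ops_t &p,
        const std::vector<std::shared_ptr<cpu::ref_eltwise_scalar_fwd_t>> &inj) {
    int idx = 0;
    for (int i = 0; i < p.len(); i++)
        if (p.entry_[i].is_eltwise())
            v = inj[idx++]->compute_scalar(v);   // scalar reference implementation
    return v;
}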
@ -958,12 +981,8 @@ struct MKLDNNNormalizeNode::NormalizeExecute {
void MKLDNNNormalizeNode::execute(mkldnn::stream strm) {
auto &srcMemPtr = getParentEdgeAt(0)->getMemoryPtr();
auto &dstMemPtr = getChildEdgeAt(0)->getMemoryPtr();
const uint8_t *src_ptr = reinterpret_cast<const uint8_t*>(srcMemPtr->GetData()) +
srcMemPtr->GetDescriptor().data.layout_desc.blocking.offset_padding *
MKLDNNExtensionUtils::sizeOfDataType(mkldnn::memory::data_type(srcMemPtr->GetDescriptor().data.data_type));
uint8_t *dst_ptr = reinterpret_cast<uint8_t*>(dstMemPtr->GetData()) +
dstMemPtr->GetDescriptor().data.layout_desc.blocking.offset_padding *
MKLDNNExtensionUtils::sizeOfDataType(mkldnn::memory::data_type(dstMemPtr->GetDescriptor().data.data_type));
const uint8_t *src_ptr = reinterpret_cast<const uint8_t*>(srcMemPtr->GetPtr());
uint8_t *dst_ptr = reinterpret_cast<uint8_t*>(dstMemPtr->GetPtr());
auto dims = getParentEdgeAt(0)->getDesc().getDims();
@ -990,11 +1009,11 @@ void MKLDNNNormalizeNode::execute(mkldnn::stream strm) {
template <typename in_data_t, typename out_data_t>
void MKLDNNNormalizeNode::normalize_nchw(const in_data_t* src_data, out_data_t* dst_data, const InferenceEngine::SizeVector& dims) {
size_t blk_size = 1; // elt in vmm
if (mayiuse(cpu::avx512_common)) {
if (mayiuse(cpu::x64::avx512_common)) {
blk_size = 16;
} else if (mayiuse(cpu::avx2)) {
} else if (mayiuse(cpu::x64::avx2)) {
blk_size = 8;
} else if (mayiuse(cpu::sse42)) {
} else if (mayiuse(cpu::x64::sse41)) {
blk_size = 4;
}
@ -1186,11 +1205,11 @@ void MKLDNNNormalizeNode::normalize_nchw_ref(const in_data_t* src_data, out_data
template <typename in_data_t, typename out_data_t>
void MKLDNNNormalizeNode::normalize_nhwc(const in_data_t* src_data, out_data_t* dst_data, const InferenceEngine::SizeVector& dims) {
size_t blk_size = 1; // elt in vmm
if (mayiuse(cpu::avx512_common)) {
if (mayiuse(cpu::x64::avx512_common)) {
blk_size = 16;
} else if (mayiuse(cpu::avx2)) {
} else if (mayiuse(cpu::x64::avx2)) {
blk_size = 8;
} else if (mayiuse(cpu::sse42)) {
} else if (mayiuse(cpu::x64::sse41)) {
blk_size = 4;
}
@ -1307,11 +1326,11 @@ void MKLDNNNormalizeNode::normalize_nhwc(const in_data_t* src_data, out_data_t*
template <typename in_data_t, typename out_data_t>
void MKLDNNNormalizeNode::normalize_blk(const in_data_t* src_data, out_data_t* dst_data, const InferenceEngine::SizeVector& dims) {
size_t blk_size = 1; // channel blk for memory layout
if (mayiuse(cpu::avx512_common)) {
if (mayiuse(cpu::x64::avx512_common)) {
blk_size = 16;
} else if (mayiuse(cpu::avx2)) {
} else if (mayiuse(cpu::x64::avx2)) {
blk_size = 8;
} else if (mayiuse(cpu::sse42)) {
} else if (mayiuse(cpu::x64::sse41)) {
blk_size = 8;
}
@ -1439,20 +1458,18 @@ void MKLDNNNormalizeNode::normalize_blk(const in_data_t* src_data, out_data_t* d
template <typename in_data_t, typename out_data_t>
void MKLDNNNormalizeNode::normalize_function(const in_data_t* src_data, out_data_t* dst_data, const InferenceEngine::SizeVector& dims) {
auto selectedPD = getSelectedPrimitiveDescriptor();
Layout selected_layout = selectedPD->getConfig().inConfs[0].desc.getLayout();
if (mayiuse(cpu::sse42) && normalize_modulo_kernel && normalize_kernel) {
if (selected_layout == MKLDNNMemory::GetPlainLayout(getChildEdgeAt(0)->getDims())) {
if (mayiuse(cpu::x64::sse41) && normalize_modulo_kernel && normalize_kernel) {
if (jcp.is_nchw) {
normalize_nchw(src_data, dst_data, dims);
} else if (selected_layout == Layout::NHWC) {
} else if (jcp.is_nhwc) {
normalize_nhwc(src_data, dst_data, dims);
} else if (selected_layout == Layout::BLOCKED) {
} else if (jcp.is_blk) {
normalize_blk(src_data, dst_data, dims);
} else {
THROW_IE_EXCEPTION << "The selected layout is not supported.";
}
} else {
if (selected_layout == MKLDNNMemory::GetPlainLayout(getChildEdgeAt(0)->getDims())) {
if (jcp.is_nchw) {
normalize_nchw_ref(src_data, dst_data, dims);
} else {
THROW_IE_EXCEPTION << "Only support plain layout on machine w/o sse42.";
@ -1464,7 +1481,7 @@ inline void MKLDNNNormalizeNode::apply_post_ops_scalar(float &dst_value, int ind
const auto &p = (*attr.get()).post_ops_;
int eltwise_inj_idx = 0;
int depthwise_inj_idx = 0;
for (int i = 0; i < p.len_; i++) {
for (int i = 0; i < p.len(); i++) {
auto &post_op = p.entry_[i];
if (post_op.is_eltwise()) {
dst_value = eltwise_injectors_ref[eltwise_inj_idx]->compute_scalar(dst_value);
@ -1476,7 +1493,7 @@ inline void MKLDNNNormalizeNode::apply_post_ops_scalar(float &dst_value, int ind
depthwise_inj_idx++;
} else if (post_op.is_quantization()) {
bool do_dequantization = post_op.quantization.alg == alg_kind::quantization_quantize_dequantize;
bool do_rounding = do_dequantization || output_prec == Precision::FP32 || i != p.len_ - 1;
bool do_rounding = do_dequantization || output_prec == Precision::FP32 || i != p.len() - 1;
auto quant = post_op.quantization;

View File

@ -4,8 +4,12 @@
#pragma once
#include "ref_eltwise.hpp"
#include "ref_depthwise.hpp"
#include <mkldnn_node.h>
#include <mkldnn.hpp>
#include <cassert>
#include <cpu/ref_eltwise.hpp>
#include <cpu/ref_depthwise_injector.hpp>
using namespace InferenceEngine;
@ -47,6 +51,8 @@ struct jit_uni_normalize_modulo_kernel {
jit_uni_normalize_modulo_kernel(jit_normalize_config_params jcp) : ker_(nullptr), jcp_(jcp) {}
virtual ~jit_uni_normalize_modulo_kernel() {}
virtual void create_ker() = 0;
jit_normalize_config_params jcp_;
};
@ -61,6 +67,8 @@ struct jit_uni_normalize_kernel {
explicit jit_uni_normalize_kernel(jit_normalize_config_params jcp, const mkldnn_primitive_attr &attr) : ker_(nullptr), jcp_(jcp), attr_(attr) {}
virtual ~jit_uni_normalize_kernel() {}
virtual void create_ker() = 0;
jit_normalize_config_params jcp_;
const mkldnn_primitive_attr &attr_;
};
@ -118,6 +126,8 @@ private:
std::vector<std::shared_ptr<mkldnn::impl::cpu::ref_eltwise_scalar_fwd_t>> eltwise_injectors_ref;
std::vector<std::shared_ptr<mkldnn::impl::cpu::ref_depthwise_scalar_fwd_t>> depthwise_injectors_ref;
jit_normalize_config_params jcp = {};
};
} // namespace MKLDNNPlugin

View File

@ -87,16 +87,16 @@ void MKLDNNPadNode::initSupportedPrimitiveDescriptors() {
config.outConfs[0].inPlace = -1;
config.outConfs[0].constant = false;
auto pushSupportedPrimitiveDescriptor = [&](memory::format memoryFormat) {
auto pushSupportedPrimitiveDescriptor = [&](memory::format_tag memoryFormat) {
config.inConfs[0].desc = MKLDNNMemoryDesc(getParentEdgeAt(0)->getDims(), dataType, memoryFormat);
config.outConfs[0].desc = MKLDNNMemoryDesc(getChildEdgeAt(0)->getDims(), dataType, memoryFormat);
supportedPrimitiveDescriptors.push_back({config, impl_desc_type::ref, memoryFormat});
};
if (numOfDims == 4)
pushSupportedPrimitiveDescriptor(mkldnn::memory::nhwc);
pushSupportedPrimitiveDescriptor(mkldnn::memory::format_tag::nhwc);
else if (numOfDims == 5)
pushSupportedPrimitiveDescriptor(mkldnn::memory::ndhwc);
pushSupportedPrimitiveDescriptor(mkldnn::memory::format_tag::ndhwc);
pushSupportedPrimitiveDescriptor(MKLDNNMemory::GetPlainFormat(getParentEdgeAt(0)->getDims()));
@ -107,14 +107,14 @@ void MKLDNNPadNode::initSupportedPrimitiveDescriptors() {
if (numOfDims == 4) {
if (srcDims[1] % 8 == 0 && canUseBlocked(8))
pushSupportedPrimitiveDescriptor(mkldnn::memory::nChw8c);
pushSupportedPrimitiveDescriptor(mkldnn::memory::format_tag::nChw8c);
if (srcDims[1] % 16 == 0 && canUseBlocked(16))
pushSupportedPrimitiveDescriptor(mkldnn::memory::nChw16c);
pushSupportedPrimitiveDescriptor(mkldnn::memory::format_tag::nChw16c);
} else if (numOfDims == 5) {
if (srcDims[1] % 8 == 0 && canUseBlocked(8))
pushSupportedPrimitiveDescriptor(mkldnn::memory::nCdhw8c);
pushSupportedPrimitiveDescriptor(mkldnn::memory::format_tag::nCdhw8c);
if (srcDims[1] % 16 == 0 && canUseBlocked(16))
pushSupportedPrimitiveDescriptor(mkldnn::memory::nCdhw16c);
pushSupportedPrimitiveDescriptor(mkldnn::memory::format_tag::nCdhw16c);
}
}
@ -136,8 +136,7 @@ void MKLDNNPadNode::createPrimitive() {
params.srcStrides = getParentEdgeAt(0)->getBlob()->getTensorDesc().getBlockingDesc().getStrides();
params.dstStrides = getChildEdgeAt(0)->getBlob()->getTensorDesc().getBlockingDesc().getStrides();
auto layout = this->getSelectedPrimitiveDescriptor()->getConfig().inConfs[0].desc.getLayout();
if (layout == BLOCKED) {
if (getParentEdgeAt(0)->getMemory().GetDesc().isBlockedCFormat()) {
padsBegin[1] /= params.srcDims[params.srcDims.size() - 1];
padsEnd[1] /= params.srcDims[params.srcDims.size() - 1];
padsBegin.push_back(0);
@ -259,8 +258,8 @@ void MKLDNNPadNode::padConstant() {
template<typename T>
void MKLDNNPadNode::padConstantCommon() {
T* srcData = reinterpret_cast<T*>(getDataPtr(this->getParentEdgeAt(0)->getMemory()));
T* dstData = reinterpret_cast<T*>(getDataPtr(this->getChildEdgeAt(0)->getMemory()));
T* srcData = reinterpret_cast<T*>(this->getParentEdgeAt(0)->getMemoryPtr()->GetPtr());
T* dstData = reinterpret_cast<T*>(this->getChildEdgeAt(0)->getMemoryPtr()->GetPtr());
T value = static_cast<T>(padValue);
parallel_nt(0, [&](const int ithr, const int nthr) {
@ -301,8 +300,8 @@ void MKLDNNPadNode::padConstantCommon() {
}
void MKLDNNPadNode::padConstantZero() {
uint8_t* srcData = getDataPtr(this->getParentEdgeAt(0)->getMemory());
uint8_t* dstData = getDataPtr(this->getChildEdgeAt(0)->getMemory());
uint8_t* srcData = reinterpret_cast<uint8_t*>(this->getParentEdgeAt(0)->getMemoryPtr()->GetPtr());
uint8_t* dstData = reinterpret_cast<uint8_t*>(this->getChildEdgeAt(0)->getMemoryPtr()->GetPtr());
parallel_nt(0, [&](const int ithr, const int nthr) {
size_t start = 0, end = 0;
@ -342,8 +341,8 @@ void MKLDNNPadNode::padConstantZero() {
}
void MKLDNNPadNode::padEdge() {
uint8_t* srcData = getDataPtr(this->getParentEdgeAt(0)->getMemory());
uint8_t* dstData = getDataPtr(this->getChildEdgeAt(0)->getMemory());
uint8_t* srcData = reinterpret_cast<uint8_t*>(this->getParentEdgeAt(0)->getMemoryPtr()->GetPtr());
uint8_t* dstData = reinterpret_cast<uint8_t*>(this->getChildEdgeAt(0)->getMemoryPtr()->GetPtr());
parallel_nt(0, [&](const int ithr, const int nthr) {
size_t start = 0, end = 0;
@ -379,8 +378,8 @@ void MKLDNNPadNode::padEdge() {
}
void MKLDNNPadNode::padReflectOrSymmetric(const bool isSymmetric) {
uint8_t* srcData = getDataPtr(this->getParentEdgeAt(0)->getMemory());
uint8_t* dstData = getDataPtr(this->getChildEdgeAt(0)->getMemory());
uint8_t* srcData = reinterpret_cast<uint8_t*>(this->getParentEdgeAt(0)->getMemoryPtr()->GetPtr());
uint8_t* dstData = reinterpret_cast<uint8_t*>(this->getChildEdgeAt(0)->getMemoryPtr()->GetPtr());
size_t shift = isSymmetric ? 1 : 0;
@ -425,11 +424,6 @@ inline void MKLDNNPadNode::getDstIdx(const InferenceEngine::SizeVector& indexes,
dstIdx *= (padMode == CONSTANT && padValue != 0) ? 1 : params.sizeData;
}
inline uint8_t* MKLDNNPadNode::getDataPtr(const MKLDNNMemory& memoryPtr) const {
return reinterpret_cast<uint8_t*>(memoryPtr.GetData()) + memoryPtr.GetDescriptor().data.layout_desc.blocking.offset_padding *
MKLDNNExtensionUtils::sizeOfDataType(mkldnn::memory::data_type(memoryPtr.GetDescriptor().data.data_type));
}
bool MKLDNNPadNode::created() const {
return getType() == Pad;
}

View File

@ -36,7 +36,6 @@ private:
void padReflectOrSymmetric(const bool isSymmetric = false);
inline void getDstIdx(const InferenceEngine::SizeVector& indexes, size_t& dstIdx) const;
inline uint8_t* getDataPtr(const MKLDNNMemory& memoryPtr) const;
PadMode padMode = CONSTANT;
float padValue = 0.f;

View File

@ -8,23 +8,31 @@
#include <mkldnn_types.h>
#include <mkldnn_extension_utils.h>
#include "ie_parallel.hpp"
#include "jit_generator.hpp"
#include <cpu/x64/jit_generator.hpp>
#include <algorithm>
using namespace mkldnn;
using namespace MKLDNNPlugin;
using namespace InferenceEngine;
using namespace mkldnn::impl;
using namespace mkldnn::impl::cpu;
using namespace mkldnn::impl::cpu::x64;
using namespace mkldnn::impl::utils;
#define GET_OFF(field) offsetof(jit_args_permute, field)
template <cpu::cpu_isa_t isa>
template <cpu::x64::cpu_isa_t isa>
struct jit_uni_permute_kernel_f32 : public jit_uni_permute_kernel, public jit_generator {
DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_uni_permute_kernel_f32)
explicit jit_uni_permute_kernel_f32(jit_permute_conf_t jpp) : jit_uni_permute_kernel(jpp), jit_generator() {
explicit jit_uni_permute_kernel_f32(jit_permute_conf_t jpp) : jit_uni_permute_kernel(jpp), jit_generator() {}
void create_ker() override {
jit_generator::create_kernel();
ker_ = (decltype(ker_))jit_ker();
}
void generate() override {
this->preamble();
mov(reg_src, ptr[reg_params + GET_OFF(src)]);
@ -33,8 +41,6 @@ struct jit_uni_permute_kernel_f32 : public jit_uni_permute_kernel, public jit_ge
loop(jpp.n);
this->postamble();
ker_ = (decltype(ker_))this->getCode();
}
void load(const Xbyak::Xmm &xmm, const Xbyak::Address &addr) {
@ -115,7 +121,7 @@ struct jit_uni_permute_kernel_f32 : public jit_uni_permute_kernel, public jit_ge
}
private:
using Vmm = typename conditional3<isa == cpu::sse42, Xbyak::Xmm, isa == cpu::avx2, Xbyak::Ymm, Xbyak::Zmm>::type;
using Vmm = typename conditional3<isa == cpu::x64::sse41, Xbyak::Xmm, isa == cpu::x64::avx2, Xbyak::Ymm, Xbyak::Zmm>::type;
uint32_t vlen = cpu_isa_traits<isa>::vlen;
Xbyak::Reg64 reg_src = r8;
@ -174,52 +180,52 @@ void MKLDNNPermuteNode::initSupportedPrimitiveDescriptors() {
config.outConfs[0].inPlace = -1;
config.outConfs[0].constant = false;
if (getParentEdgeAt(0)->getDims().ndims() == 4) {
config.inConfs[0].desc = MKLDNNMemoryDesc(getParentEdgeAt(0)->getDims(), inputDataType, memory::nchw);
config.outConfs[0].desc = MKLDNNMemoryDesc(getChildEdgeAt(0)->getDims(), outputDataType, memory::nchw);
supportedPrimitiveDescriptors.push_back({config, impl_desc_type::unknown, memory::nchw});
config.inConfs[0].desc = MKLDNNMemoryDesc(getParentEdgeAt(0)->getDims(), inputDataType, memory::format_tag::nchw);
config.outConfs[0].desc = MKLDNNMemoryDesc(getChildEdgeAt(0)->getDims(), outputDataType, memory::format_tag::nchw);
supportedPrimitiveDescriptors.push_back({config, impl_desc_type::unknown, memory::format_tag::nchw});
auto srcDims = getParentEdgeAt(0)->getDims();
if (srcDims[1] % 8 == 0) {
config.inConfs[0].desc = MKLDNNMemoryDesc(getParentEdgeAt(0)->getDims(), inputDataType, memory::nChw8c);
supportedPrimitiveDescriptors.push_back({config, impl_desc_type::unknown, memory::nChw8c});
config.inConfs[0].desc = MKLDNNMemoryDesc(getParentEdgeAt(0)->getDims(), inputDataType, memory::format_tag::nChw8c);
supportedPrimitiveDescriptors.push_back({config, impl_desc_type::unknown, memory::format_tag::nChw8c});
}
if (srcDims[1] % 16 == 0) {
config.inConfs[0].desc = MKLDNNMemoryDesc(getParentEdgeAt(0)->getDims(), inputDataType, memory::nChw16c);
supportedPrimitiveDescriptors.push_back({config, impl_desc_type::unknown, memory::nChw16c});
config.inConfs[0].desc = MKLDNNMemoryDesc(getParentEdgeAt(0)->getDims(), inputDataType, memory::format_tag::nChw16c);
supportedPrimitiveDescriptors.push_back({config, impl_desc_type::unknown, memory::format_tag::nChw16c});
}
if (prec == Precision::I8 || prec == Precision::U8) {
config.inConfs[0].desc = MKLDNNMemoryDesc(getParentEdgeAt(0)->getDims(), inputDataType, memory::nhwc);
config.outConfs[0].desc = MKLDNNMemoryDesc(getChildEdgeAt(0)->getDims(), outputDataType, memory::nhwc);
supportedPrimitiveDescriptors.push_back({config, impl_desc_type::unknown, memory::nhwc});
config.inConfs[0].desc = MKLDNNMemoryDesc(getParentEdgeAt(0)->getDims(), inputDataType, memory::format_tag::nhwc);
config.outConfs[0].desc = MKLDNNMemoryDesc(getChildEdgeAt(0)->getDims(), outputDataType, memory::format_tag::nhwc);
supportedPrimitiveDescriptors.push_back({config, impl_desc_type::unknown, memory::format_tag::nhwc});
}
} else if (getParentEdgeAt(0)->getDims().ndims() == 5) {
config.inConfs[0].desc = MKLDNNMemoryDesc(getParentEdgeAt(0)->getDims(), inputDataType, memory::ncdhw);
config.outConfs[0].desc = MKLDNNMemoryDesc(getChildEdgeAt(0)->getDims(), outputDataType, memory::ncdhw);
supportedPrimitiveDescriptors.push_back({config, impl_desc_type::unknown, memory::ncdhw});
config.inConfs[0].desc = MKLDNNMemoryDesc(getParentEdgeAt(0)->getDims(), inputDataType, memory::format_tag::ncdhw);
config.outConfs[0].desc = MKLDNNMemoryDesc(getChildEdgeAt(0)->getDims(), outputDataType, memory::format_tag::ncdhw);
supportedPrimitiveDescriptors.push_back({config, impl_desc_type::unknown, memory::format_tag::ncdhw});
auto srcDims = getParentEdgeAt(0)->getDims();
if (srcDims[1] % 8 == 0) {
config.inConfs[0].desc = MKLDNNMemoryDesc(getParentEdgeAt(0)->getDims(), inputDataType, memory::nCdhw8c);
supportedPrimitiveDescriptors.push_back({config, impl_desc_type::unknown, memory::nCdhw8c});
config.inConfs[0].desc = MKLDNNMemoryDesc(getParentEdgeAt(0)->getDims(), inputDataType, memory::format_tag::nCdhw8c);
supportedPrimitiveDescriptors.push_back({config, impl_desc_type::unknown, memory::format_tag::nCdhw8c});
}
if (srcDims[1] % 16 == 0) {
config.inConfs[0].desc = MKLDNNMemoryDesc(getParentEdgeAt(0)->getDims(), inputDataType, memory::nCdhw16c);
supportedPrimitiveDescriptors.push_back({config, impl_desc_type::unknown, memory::nCdhw16c});
config.inConfs[0].desc = MKLDNNMemoryDesc(getParentEdgeAt(0)->getDims(), inputDataType, memory::format_tag::nCdhw16c);
supportedPrimitiveDescriptors.push_back({config, impl_desc_type::unknown, memory::format_tag::nCdhw16c});
}
if (prec == Precision::I8 || prec == Precision::U8) {
config.inConfs[0].desc = MKLDNNMemoryDesc(getParentEdgeAt(0)->getDims(), inputDataType, memory::ndhwc);
config.outConfs[0].desc = MKLDNNMemoryDesc(getChildEdgeAt(0)->getDims(), outputDataType, memory::ndhwc);
supportedPrimitiveDescriptors.push_back({config, impl_desc_type::unknown, memory::ndhwc});
config.inConfs[0].desc = MKLDNNMemoryDesc(getParentEdgeAt(0)->getDims(), inputDataType, memory::format_tag::ndhwc);
config.outConfs[0].desc = MKLDNNMemoryDesc(getChildEdgeAt(0)->getDims(), outputDataType, memory::format_tag::ndhwc);
supportedPrimitiveDescriptors.push_back({config, impl_desc_type::unknown, memory::format_tag::ndhwc});
}
} else {
config.inConfs[0].desc = MKLDNNMemoryDesc(getParentEdgeAt(0)->getDims(), inputDataType, memory::any);
config.outConfs[0].desc = MKLDNNMemoryDesc(getChildEdgeAt(0)->getDims(), outputDataType,
MKLDNNMemory::GetPlainFormat(getChildEdgeAt(0)->getDims()));
supportedPrimitiveDescriptors.push_back({config, impl_desc_type::unknown, MKLDNNMemory::GetPlainFormat(getChildEdgeAt(0)->getDims())});
// general plain case
config.inConfs[0].desc = MKLDNNMemoryDesc(getParentEdgeAt(0)->getDims(), inputDataType);
config.outConfs[0].desc = MKLDNNMemoryDesc(getChildEdgeAt(0)->getDims(), outputDataType);
supportedPrimitiveDescriptors.push_back({config, impl_desc_type::unknown});
}
}
@ -332,7 +338,7 @@ void MKLDNNPermuteNode::createPrimitive() {
}
}
int max_threads = mkldnn_get_max_threads();
int max_threads = dnnl_get_max_threads();
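// The mkldnn_* C API entry points were renamed to dnnl_* along with the library;
// only the prefix changes, the semantics are the same.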
const int n_max = 3; // max count dims for parallel
int n = 0;
int work_amount = sorted_dst_dims[0];
@ -351,24 +357,27 @@ void MKLDNNPermuteNode::createPrimitive() {
jpp.ndims = sorted_order.size();
jpp.data_size = MKLDNNExtensionUtils::sizeOfDataType(data_type);
if (mayiuse(cpu::avx512_common)) {
permute_kernel.reset(new jit_uni_permute_kernel_f32<cpu::avx512_common>(jpp));
} else if (mayiuse(cpu::avx2)) {
permute_kernel.reset(new jit_uni_permute_kernel_f32<cpu::avx2>(jpp));
} else if (mayiuse(cpu::sse42)) {
permute_kernel.reset(new jit_uni_permute_kernel_f32<cpu::sse42>(jpp));
if (mayiuse(cpu::x64::avx512_common)) {
permute_kernel.reset(new jit_uni_permute_kernel_f32<cpu::x64::avx512_common>(jpp));
} else if (mayiuse(cpu::x64::avx2)) {
permute_kernel.reset(new jit_uni_permute_kernel_f32<cpu::x64::avx2>(jpp));
} else if (mayiuse(cpu::x64::sse41)) {
permute_kernel.reset(new jit_uni_permute_kernel_f32<cpu::x64::sse41>(jpp));
}
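// Two ISA-related renames in oneDNN v1.x show up here: x86 JIT code moved into the
// cpu::x64 namespace, and sse42 became sse41 (the implementations require only SSE4.1).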
if (permute_kernel)
permute_kernel->create_ker();
}
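// Note for readers following the migration: oneDNN v1.x JIT kernels are built in two
// phases. The constructor only stores parameters; create_ker() triggers code emission.
// A minimal sketch of the pattern used by the kernels in this patch (type and member
// names are illustrative, see the reduce kernels below for the real implementations):
//
//     struct jit_uni_example_kernel_f32 : jit_uni_example_kernel, public jit_generator {
//         explicit jit_uni_example_kernel_f32(jit_example_params jep)
//             : jit_uni_example_kernel(jep), jit_generator() {}  // no code emitted yet
//         void create_ker() override {
//             jit_generator::create_kernel();           // runs generate() and finalizes the code
//             ker_ = (decltype(ker_))jit_ker();         // fetch the generated entry point
//         }
//         void generate() override { /* emit instructions, prepare constant tables */ }
//     };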
static void permute_to_0231(int MB, MKLDNNMemoryPtr& srcMemPtr, MKLDNNMemoryPtr& dstMemPtr) {
auto src_data = reinterpret_cast<const float *>(srcMemPtr->GetData());
auto dst_data = reinterpret_cast<float *>(dstMemPtr->GetData());
src_data += srcMemPtr->GetDescriptor().data.layout_desc.blocking.offset_padding;
dst_data += dstMemPtr->GetDescriptor().data.layout_desc.blocking.offset_padding;
auto src_data = reinterpret_cast<const float *>(srcMemPtr->GetPtr());
auto dst_data = reinterpret_cast<float *>(dstMemPtr->GetPtr());
// Supports only NCHW to NHWC
int block_size = 1;
if (!MKLDNNMemory::IsPlainFormat(srcMemPtr->GetFormat())) {
block_size = srcMemPtr->GetDescriptor().data.layout_desc.blocking.block_dims[1];
if (!srcMemPtr->GetDesc().isPlainFormat()) {
const auto &blk_desc = srcMemPtr->GetDescriptor().data.format_desc.blocking;
auto found = std::find(blk_desc.inner_idxs, blk_desc.inner_idxs + blk_desc.inner_nblks, 1);
auto pos = std::distance(blk_desc.inner_idxs, found);  // index of the channel blocking level
block_size = blk_desc.inner_blks[pos];
}
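// For reference: in the oneDNN v1.x memory descriptor, format_desc.blocking describes
// blocked layouts via inner_nblks blocking levels, where inner_idxs[i] is the logical
// dimension being blocked and inner_blks[i] its block size. E.g. for nChw8c:
// inner_nblks == 1, inner_idxs == {1}, inner_blks == {8}, so block_size becomes 8 here.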
const int C = srcMemPtr->GetDims()[1];
@ -394,13 +403,14 @@ static void permute_to_0231(int MB, MKLDNNMemoryPtr& srcMemPtr, MKLDNNMemoryPtr&
}
static void permute_to_0213(int MB, MKLDNNMemoryPtr& srcMemPtr, MKLDNNMemoryPtr& dstMemPtr) {
auto src_data = reinterpret_cast<const float *>(srcMemPtr->GetData());
auto dst_data = reinterpret_cast<float *>(dstMemPtr->GetData());
src_data += srcMemPtr->GetDescriptor().data.layout_desc.blocking.offset_padding;
dst_data += dstMemPtr->GetDescriptor().data.layout_desc.blocking.offset_padding;
auto src_data = reinterpret_cast<const float *>(srcMemPtr->GetPtr());
auto dst_data = reinterpret_cast<float *>(dstMemPtr->GetPtr());
int block_size = 1;
if (!MKLDNNMemory::IsPlainFormat(srcMemPtr->GetFormat())) {
block_size = srcMemPtr->GetDescriptor().data.layout_desc.blocking.block_dims[1];
if (!srcMemPtr->GetDesc().isPlainFormat()) {
const auto &blk_desc = srcMemPtr->GetDescriptor().data.format_desc.blocking;
auto found = std::find(blk_desc.inner_idxs, blk_desc.inner_idxs + blk_desc.inner_nblks, 1);
auto pos = std::distance(blk_desc.inner_idxs, found);  // index of the channel blocking level
block_size = blk_desc.inner_blks[pos];
}
const int C = srcMemPtr->GetDims()[1];
@ -419,10 +429,8 @@ static void permute_to_0213(int MB, MKLDNNMemoryPtr& srcMemPtr, MKLDNNMemoryPtr&
}
static void permute_to_0312(int MB, MKLDNNMemoryPtr& srcMemPtr, MKLDNNMemoryPtr& dstMemPtr) {
auto src_data = reinterpret_cast<const float *>(srcMemPtr->GetData());
auto dst_data = reinterpret_cast<float *>(dstMemPtr->GetData());
src_data += srcMemPtr->GetDescriptor().data.layout_desc.blocking.offset_padding;
dst_data += dstMemPtr->GetDescriptor().data.layout_desc.blocking.offset_padding;
auto src_data = reinterpret_cast<const float *>(srcMemPtr->GetPtr());
auto dst_data = reinterpret_cast<float *>(dstMemPtr->GetPtr());
const int C = srcMemPtr->GetDims()[1];
const int H = srcMemPtr->GetDims()[2];
@ -439,10 +447,8 @@ static void permute_to_0312(int MB, MKLDNNMemoryPtr& srcMemPtr, MKLDNNMemoryPtr&
template <size_t scale_H = 0, size_t scale_W = 0>
static void permute_to_014253(int MB, MKLDNNMemoryPtr& srcMemPtr, MKLDNNMemoryPtr& dstMemPtr) {
auto src_data = reinterpret_cast<const float *>(srcMemPtr->GetData());
auto dst_data = reinterpret_cast<float *>(dstMemPtr->GetData());
src_data += srcMemPtr->GetDescriptor().data.layout_desc.blocking.offset_padding;
dst_data += dstMemPtr->GetDescriptor().data.layout_desc.blocking.offset_padding;
auto src_data = reinterpret_cast<const float *>(srcMemPtr->GetPtr());
auto dst_data = reinterpret_cast<float *>(dstMemPtr->GetPtr());
const int C = srcMemPtr->GetDims()[1];
const int CH = scale_H > 0 ? static_cast<int>(scale_H) : srcMemPtr->GetDims()[2];
@ -477,10 +483,8 @@ static void permute_to_014253(int MB, MKLDNNMemoryPtr& srcMemPtr, MKLDNNMemoryPt
}
static void permute_to_3012(int MB, MKLDNNMemoryPtr& srcMemPtr, MKLDNNMemoryPtr& dstMemPtr) {
auto src_data = reinterpret_cast<const float *>(srcMemPtr->GetData());
auto dst_data = reinterpret_cast<float *>(dstMemPtr->GetData());
src_data += srcMemPtr->GetDescriptor().data.layout_desc.blocking.offset_padding;
dst_data += dstMemPtr->GetDescriptor().data.layout_desc.blocking.offset_padding;
auto src_data = reinterpret_cast<const float *>(srcMemPtr->GetPtr());
auto dst_data = reinterpret_cast<float *>(dstMemPtr->GetPtr());
const int C = srcMemPtr->GetDims()[1];
const int H = srcMemPtr->GetDims()[2];
@ -507,10 +511,8 @@ static void permute_to_3012(int MB, MKLDNNMemoryPtr& srcMemPtr, MKLDNNMemoryPtr&
}
static void permute_to_021(int MB, MKLDNNMemoryPtr& srcMemPtr, MKLDNNMemoryPtr& dstMemPtr) {
auto src_data = reinterpret_cast<const float *>(srcMemPtr->GetData());
auto dst_data = reinterpret_cast<float *>(dstMemPtr->GetData());
src_data += srcMemPtr->GetDescriptor().data.layout_desc.blocking.offset_padding;
dst_data += dstMemPtr->GetDescriptor().data.layout_desc.blocking.offset_padding;
auto src_data = reinterpret_cast<const float *>(srcMemPtr->GetPtr());
auto dst_data = reinterpret_cast<float *>(dstMemPtr->GetPtr());
const int C = srcMemPtr->GetDims()[1];
const int S = srcMemPtr->GetDims()[2];
@ -533,10 +535,8 @@ static void permute_to_021(int MB, MKLDNNMemoryPtr& srcMemPtr, MKLDNNMemoryPtr&
}
static void permute_to_034152(int MB, MKLDNNMemoryPtr& srcMemPtr, MKLDNNMemoryPtr& dstMemPtr) {
auto src_data = reinterpret_cast<const float *>(srcMemPtr->GetData());
auto dst_data = reinterpret_cast<float *>(dstMemPtr->GetData());
src_data += srcMemPtr->GetDescriptor().data.layout_desc.blocking.offset_padding;
dst_data += dstMemPtr->GetDescriptor().data.layout_desc.blocking.offset_padding;
auto src_data = reinterpret_cast<const float *>(srcMemPtr->GetPtr());
auto dst_data = reinterpret_cast<float *>(dstMemPtr->GetPtr());
const int DIM1 = srcMemPtr->GetDims()[1];
const int DIM2 = srcMemPtr->GetDims()[2];
@ -571,13 +571,14 @@ static void permute_to_034152(int MB, MKLDNNMemoryPtr& srcMemPtr, MKLDNNMemoryPt
}
static void permute_to_0132(int MB, MKLDNNMemoryPtr& srcMemPtr, MKLDNNMemoryPtr& dstMemPtr) {
auto src_data = reinterpret_cast<const float *>(srcMemPtr->GetData());
auto dst_data = reinterpret_cast<float *>(dstMemPtr->GetData());
src_data += srcMemPtr->GetDescriptor().data.layout_desc.blocking.offset_padding;
dst_data += dstMemPtr->GetDescriptor().data.layout_desc.blocking.offset_padding;
auto src_data = reinterpret_cast<const float *>(srcMemPtr->GetPtr());
auto dst_data = reinterpret_cast<float *>(dstMemPtr->GetPtr());
int src_block_size = 1;
if (!MKLDNNMemory::IsPlainFormat(srcMemPtr->GetFormat())) {
src_block_size = srcMemPtr->GetDescriptor().data.layout_desc.blocking.block_dims[1];
if (!srcMemPtr->GetDesc().isPlainFormat()) {
const auto &blk_desc = srcMemPtr->GetDescriptor().data.format_desc.blocking;
auto found = std::find(blk_desc.inner_idxs, blk_desc.inner_idxs + blk_desc.inner_nblks, 1);
auto pos = std::distance(blk_desc.inner_idxs, found);  // index of the channel blocking level
src_block_size = blk_desc.inner_blks[pos];
}
const int C = srcMemPtr->GetDims()[1];
@ -596,10 +597,8 @@ static void permute_to_0132(int MB, MKLDNNMemoryPtr& srcMemPtr, MKLDNNMemoryPtr&
}
static void permute_to_03142(int MB, MKLDNNMemoryPtr& srcMemPtr, MKLDNNMemoryPtr& dstMemPtr) {
auto src_data = reinterpret_cast<const float *>(srcMemPtr->GetData());
auto dst_data = reinterpret_cast<float *>(dstMemPtr->GetData());
src_data += srcMemPtr->GetDescriptor().data.layout_desc.blocking.offset_padding;
dst_data += dstMemPtr->GetDescriptor().data.layout_desc.blocking.offset_padding;
auto src_data = reinterpret_cast<const float *>(srcMemPtr->GetPtr());
auto dst_data = reinterpret_cast<float *>(dstMemPtr->GetPtr());
const int DIM1 = srcMemPtr->GetDims()[1];
const int DIM2 = srcMemPtr->GetDims()[2];
@ -630,10 +629,8 @@ static void permute_to_03142(int MB, MKLDNNMemoryPtr& srcMemPtr, MKLDNNMemoryPtr
}
static void permute_to_1203(int MB, MKLDNNMemoryPtr& srcMemPtr, MKLDNNMemoryPtr& dstMemPtr) {
auto src_data = reinterpret_cast<const float *>(srcMemPtr->GetData());
auto dst_data = reinterpret_cast<float *>(dstMemPtr->GetData());
src_data += srcMemPtr->GetDescriptor().data.layout_desc.blocking.offset_padding;
dst_data += dstMemPtr->GetDescriptor().data.layout_desc.blocking.offset_padding;
auto src_data = reinterpret_cast<const float *>(srcMemPtr->GetPtr());
auto dst_data = reinterpret_cast<float *>(dstMemPtr->GetPtr());
const int C = srcMemPtr->GetDims()[1];
const int H = srcMemPtr->GetDims()[2];
@ -649,10 +646,8 @@ static void permute_to_1203(int MB, MKLDNNMemoryPtr& srcMemPtr, MKLDNNMemoryPtr&
}
static void permute_to_02134(int MB, MKLDNNMemoryPtr& srcMemPtr, MKLDNNMemoryPtr& dstMemPtr) {
auto src_data = reinterpret_cast<const float *>(srcMemPtr->GetData());
auto dst_data = reinterpret_cast<float *>(dstMemPtr->GetData());
src_data += srcMemPtr->GetDescriptor().data.layout_desc.blocking.offset_padding;
dst_data += dstMemPtr->GetDescriptor().data.layout_desc.blocking.offset_padding;
auto src_data = reinterpret_cast<const float *>(srcMemPtr->GetPtr());
auto dst_data = reinterpret_cast<float *>(dstMemPtr->GetPtr());
const int DIM1 = srcMemPtr->GetDims()[1];
const int DIM2 = srcMemPtr->GetDims()[2];
@ -678,10 +673,8 @@ static void permute_to_02134(int MB, MKLDNNMemoryPtr& srcMemPtr, MKLDNNMemoryPtr
}
static void permute_to_02431(int MB, MKLDNNMemoryPtr& srcMemPtr, MKLDNNMemoryPtr& dstMemPtr) {
auto src_data = reinterpret_cast<const float *>(srcMemPtr->GetData());
auto dst_data = reinterpret_cast<float *>(dstMemPtr->GetData());
src_data += srcMemPtr->GetDescriptor().data.layout_desc.blocking.offset_padding;
dst_data += dstMemPtr->GetDescriptor().data.layout_desc.blocking.offset_padding;
auto src_data = reinterpret_cast<const float *>(srcMemPtr->GetPtr());
auto dst_data = reinterpret_cast<float *>(dstMemPtr->GetPtr());
const int DIM1 = srcMemPtr->GetDims()[1];
const int DIM2 = srcMemPtr->GetDims()[2];
@ -707,10 +700,8 @@ static void permute_to_02431(int MB, MKLDNNMemoryPtr& srcMemPtr, MKLDNNMemoryPtr
}
static void permute_to_04231(int MB, MKLDNNMemoryPtr& srcMemPtr, MKLDNNMemoryPtr& dstMemPtr) {
auto src_data = reinterpret_cast<const float *>(srcMemPtr->GetData());
auto dst_data = reinterpret_cast<float *>(dstMemPtr->GetData());
src_data += srcMemPtr->GetDescriptor().data.layout_desc.blocking.offset_padding;
dst_data += dstMemPtr->GetDescriptor().data.layout_desc.blocking.offset_padding;
auto src_data = reinterpret_cast<const float *>(srcMemPtr->GetPtr());
auto dst_data = reinterpret_cast<float *>(dstMemPtr->GetPtr());
const int DIM1 = srcMemPtr->GetDims()[1];
const int DIM2 = srcMemPtr->GetDims()[2];
@ -736,10 +727,8 @@ static void permute_to_04231(int MB, MKLDNNMemoryPtr& srcMemPtr, MKLDNNMemoryPtr
}
static void permute_to_102(int MB, MKLDNNMemoryPtr& srcMemPtr, MKLDNNMemoryPtr& dstMemPtr) {
auto src_data = reinterpret_cast<const float *>(srcMemPtr->GetData());
auto dst_data = reinterpret_cast<float *>(dstMemPtr->GetData());
src_data += srcMemPtr->GetDescriptor().data.layout_desc.blocking.offset_padding;
dst_data += dstMemPtr->GetDescriptor().data.layout_desc.blocking.offset_padding;
auto src_data = reinterpret_cast<const float *>(srcMemPtr->GetPtr());
auto dst_data = reinterpret_cast<float *>(dstMemPtr->GetPtr());
const int C = srcMemPtr->GetDims()[1];
const int S = srcMemPtr->GetDims()[2];
@ -762,10 +751,8 @@ static void permute_to_102(int MB, MKLDNNMemoryPtr& srcMemPtr, MKLDNNMemoryPtr&
}
static void permute_to_02341(int MB, MKLDNNMemoryPtr& srcMemPtr, MKLDNNMemoryPtr& dstMemPtr) {
auto src_data = reinterpret_cast<const float *>(srcMemPtr->GetData());
auto dst_data = reinterpret_cast<float *>(dstMemPtr->GetData());
src_data += srcMemPtr->GetDescriptor().data.layout_desc.blocking.offset_padding;
dst_data += dstMemPtr->GetDescriptor().data.layout_desc.blocking.offset_padding;
auto src_data = reinterpret_cast<const float *>(srcMemPtr->GetPtr());
auto dst_data = reinterpret_cast<float *>(dstMemPtr->GetPtr());
const int DIM1 = srcMemPtr->GetDims()[1];
const int DIM2 = srcMemPtr->GetDims()[2];
@ -791,10 +778,8 @@ static void permute_to_02341(int MB, MKLDNNMemoryPtr& srcMemPtr, MKLDNNMemoryPtr
}
static void permute_to_04123(int MB, MKLDNNMemoryPtr& srcMemPtr, MKLDNNMemoryPtr& dstMemPtr) {
auto src_data = reinterpret_cast<const float *>(srcMemPtr->GetData());
auto dst_data = reinterpret_cast<float *>(dstMemPtr->GetData());
src_data += srcMemPtr->GetDescriptor().data.layout_desc.blocking.offset_padding;
dst_data += dstMemPtr->GetDescriptor().data.layout_desc.blocking.offset_padding;
auto src_data = reinterpret_cast<const float *>(srcMemPtr->GetPtr());
auto dst_data = reinterpret_cast<float *>(dstMemPtr->GetPtr());
const int DIM1 = srcMemPtr->GetDims()[1];
const int DIM2 = srcMemPtr->GetDims()[2];
@ -824,52 +809,52 @@ const std::multimap<InferenceEngine::SizeVector, MKLDNNPermuteNode::PermuteImpl>
return true;
})}, // NCHW -> NHWC case
{{0, 1, 4, 2, 5, 3}, MKLDNNPermuteNode::PermuteImpl(permute_to_014253<2, 2>, [](int MB, MKLDNNMemoryPtr& srcMemPtr, MKLDNNMemoryPtr& dstMemPtr) {
return MKLDNNMemory::IsPlainFormat(srcMemPtr->GetFormat()) && srcMemPtr->GetDims()[2] == 2 && srcMemPtr->GetDims()[3] == 2;
return srcMemPtr->GetDesc().isPlainFormat() && srcMemPtr->GetDims()[2] == 2 && srcMemPtr->GetDims()[3] == 2;
})}, // Dense upsample convolution case (scale = 2)
{{0, 1, 4, 2, 5, 3}, MKLDNNPermuteNode::PermuteImpl(permute_to_014253<0, 0>, [](int MB, MKLDNNMemoryPtr& srcMemPtr, MKLDNNMemoryPtr& dstMemPtr) {
return MKLDNNMemory::IsPlainFormat(srcMemPtr->GetFormat());
return srcMemPtr->GetDesc().isPlainFormat();
})}, // Dense upsample convolution case (generic)
{{3, 0, 1, 2}, MKLDNNPermuteNode::PermuteImpl(permute_to_3012, [](int MB, MKLDNNMemoryPtr& srcMemPtr, MKLDNNMemoryPtr& dstMemPtr) {
return MKLDNNMemory::IsPlainFormat(srcMemPtr->GetFormat()) && MB == srcMemPtr->GetDims()[0];
return srcMemPtr->GetDesc().isPlainFormat() && MB == srcMemPtr->GetDims()[0];
})}, // LPR case
{{0, 2, 1, 3}, MKLDNNPermuteNode::PermuteImpl(permute_to_0213, [](int MB, MKLDNNMemoryPtr& srcMemPtr, MKLDNNMemoryPtr& dstMemPtr) {
return MKLDNNMemory::IsPlainFormat(srcMemPtr->GetFormat());
return srcMemPtr->GetDesc().isPlainFormat();
})}, // shufflenet
{{0, 2, 1}, MKLDNNPermuteNode::PermuteImpl(permute_to_021, [](int MB, MKLDNNMemoryPtr& srcMemPtr, MKLDNNMemoryPtr& dstMemPtr) {
return MKLDNNMemory::IsPlainFormat(srcMemPtr->GetFormat());
return srcMemPtr->GetDesc().isPlainFormat();
})}, // self attention block
{{0, 3, 4, 1, 5, 2}, MKLDNNPermuteNode::PermuteImpl(permute_to_034152, [](int MB, MKLDNNMemoryPtr& srcMemPtr, MKLDNNMemoryPtr& dstMemPtr) {
return MKLDNNMemory::IsPlainFormat(srcMemPtr->GetFormat());
return srcMemPtr->GetDesc().isPlainFormat();
})}, // learning-to-see-in-the-dark-sony
{{0, 1, 3, 2}, MKLDNNPermuteNode::PermuteImpl(permute_to_0132, [](int MB, MKLDNNMemoryPtr& srcMemPtr, MKLDNNMemoryPtr& dstMemPtr) {
return true;
})},
{{0, 3, 1, 4, 2}, MKLDNNPermuteNode::PermuteImpl(permute_to_03142, [](int MB, MKLDNNMemoryPtr& srcMemPtr, MKLDNNMemoryPtr& dstMemPtr) {
return MKLDNNMemory::IsPlainFormat(srcMemPtr->GetFormat());
return srcMemPtr->GetDesc().isPlainFormat();
})},
{{1, 2, 0, 3}, MKLDNNPermuteNode::PermuteImpl(permute_to_1203, [](int MB, MKLDNNMemoryPtr& srcMemPtr, MKLDNNMemoryPtr& dstMemPtr) {
return MKLDNNMemory::IsPlainFormat(srcMemPtr->GetFormat()) && MB == srcMemPtr->GetDims()[0];
return srcMemPtr->GetDesc().isPlainFormat() && MB == srcMemPtr->GetDims()[0];
})},
{{0, 2, 1, 3, 4}, MKLDNNPermuteNode::PermuteImpl(permute_to_02134, [](int MB, MKLDNNMemoryPtr& srcMemPtr, MKLDNNMemoryPtr& dstMemPtr) {
return MKLDNNMemory::IsPlainFormat(srcMemPtr->GetFormat());
return srcMemPtr->GetDesc().isPlainFormat();
})},
{{0, 2, 4, 3, 1}, MKLDNNPermuteNode::PermuteImpl(permute_to_02431, [](int MB, MKLDNNMemoryPtr& srcMemPtr, MKLDNNMemoryPtr& dstMemPtr) {
return MKLDNNMemory::IsPlainFormat(srcMemPtr->GetFormat());
return srcMemPtr->GetDesc().isPlainFormat();
})},
{{0, 4, 2, 3, 1}, MKLDNNPermuteNode::PermuteImpl(permute_to_04231, [](int MB, MKLDNNMemoryPtr& srcMemPtr, MKLDNNMemoryPtr& dstMemPtr) {
return MKLDNNMemory::IsPlainFormat(srcMemPtr->GetFormat());
return srcMemPtr->GetDesc().isPlainFormat();
})},
{{0, 3, 1, 2}, MKLDNNPermuteNode::PermuteImpl(permute_to_0312, [](int MB, MKLDNNMemoryPtr& srcMemPtr, MKLDNNMemoryPtr& dstMemPtr) {
return MKLDNNMemory::IsPlainFormat(srcMemPtr->GetFormat());
return srcMemPtr->GetDesc().isPlainFormat();
})},
{{1, 0, 2}, MKLDNNPermuteNode::PermuteImpl(permute_to_102, [](int MB, MKLDNNMemoryPtr& srcMemPtr, MKLDNNMemoryPtr& dstMemPtr) {
return MKLDNNMemory::IsPlainFormat(srcMemPtr->GetFormat()) && MB == srcMemPtr->GetDims()[0];
return srcMemPtr->GetDesc().isPlainFormat() && MB == srcMemPtr->GetDims()[0];
})},
{{0, 2, 3, 4, 1}, MKLDNNPermuteNode::PermuteImpl(permute_to_02341, [](int MB, MKLDNNMemoryPtr& srcMemPtr, MKLDNNMemoryPtr& dstMemPtr) {
return MKLDNNMemory::IsPlainFormat(srcMemPtr->GetFormat());
return srcMemPtr->GetDesc().isPlainFormat();
})},
{{0, 4, 1, 2, 3}, MKLDNNPermuteNode::PermuteImpl(permute_to_04123, [](int MB, MKLDNNMemoryPtr& srcMemPtr, MKLDNNMemoryPtr& dstMemPtr) {
return MKLDNNMemory::IsPlainFormat(srcMemPtr->GetFormat());
return srcMemPtr->GetDesc().isPlainFormat();
})},
};
@ -887,14 +872,11 @@ void MKLDNNPermuteNode::execute(mkldnn::stream strm) {
}
if (permute_kernel) {
auto src_data = reinterpret_cast<const char *>(srcMemPtr->GetData());
auto dst_data = reinterpret_cast<char *>(dstMemPtr->GetData());
auto src_data = reinterpret_cast<const char *>(srcMemPtr->GetPtr());
auto dst_data = reinterpret_cast<char *>(dstMemPtr->GetPtr());
const auto &jpp = (*permute_kernel).jpp;
src_data += srcMemPtr->GetDescriptor().data.layout_desc.blocking.offset_padding * jpp.data_size;
dst_data += dstMemPtr->GetDescriptor().data.layout_desc.blocking.offset_padding * jpp.data_size;
SizeVector dst_dims = jpp.dst_block_dims;
SizeVector dst_strides = jpp.dst_strides;
SizeVector src_strides = jpp.src_strides;

View File

@ -37,6 +37,8 @@ struct jit_uni_permute_kernel {
jit_permute_conf_t jpp;
virtual void create_ker() = 0;
explicit jit_uni_permute_kernel(jit_permute_conf_t jpp) : ker_(nullptr), jpp(jpp) {}
virtual ~jit_uni_permute_kernel() {}
};

View File

@ -3,7 +3,7 @@
//
#include "mkldnn_pooling_node.h"
#include "desc_iterator.hpp"
#include "mkldnn_quantize_node.h"
#include "mkldnn_conv_node.h"
#include "mkldnn_concat_node.h"
@ -14,6 +14,7 @@
#include <mkldnn_types.h>
#include <mkldnn_extension_utils.h>
#include <legacy/ie_layers_internal.hpp>
#include <utils/general_utils.h>
using namespace mkldnn;
using namespace MKLDNNPlugin;
@ -23,6 +24,22 @@ MKLDNNPoolingNode::MKLDNNPoolingNode(const InferenceEngine::CNNLayerPtr& layer,
MKLDNNWeightsSharing::Ptr &cache)
: MKLDNNNode(layer, eng, cache) {}
std::vector<memory::format_tag> MKLDNNPoolingNode::getAvailableFormatsForDims(const MKLDNNDims &dims) const {
if (dims.ndims() == 0)
return {memory::format_tag::x};
else if (dims.ndims() == 1)
return {memory::format_tag::x};
else if (dims.ndims() == 2)
return {memory::format_tag::nc};
else if (dims.ndims() == 3)
return {memory::format_tag::tnc, memory::format_tag::ntc};
else if (dims.ndims() == 4)
return {memory::format_tag::nChw8c, memory::format_tag::nChw16c, memory::format_tag::nhwc, memory::format_tag::nchw};
else if (dims.ndims() == 5)
return {memory::format_tag::nCdhw8c, memory::format_tag::nCdhw16c, memory::format_tag::ndhwc, memory::format_tag::ncdhw};
return {memory::format_tag::any};
}
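// The blocked variants match the vector widths of the JIT kernels: 8-channel blocks
// (nChw8c/nCdhw8c) for the SSE4.1/AVX2 paths and 16-channel blocks for AVX-512.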
void MKLDNNPoolingNode::getSupportedDescriptors() {
if (!descs.empty())
return;
@ -88,18 +105,18 @@ void MKLDNNPoolingNode::getSupportedDescriptors() {
}
if (inputPrecision == Precision::I8 || inputPrecision == Precision::U8) {
// i8 layers support only the ndhwc and nhwc layouts
MKLDNNMemoryDesc in_candidate{parentDims, inputDataType, parentDims.ndims() == 5 ? memory::format::ndhwc : memory::format::nhwc};
MKLDNNMemoryDesc out_candidate{childDims, outputDataType, parentDims.ndims() == 5 ? memory::format::ndhwc : memory::format::nhwc};
MKLDNNMemoryDesc in_candidate{parentDims, inputDataType, parentDims.ndims() == 5 ? memory::format_tag::ndhwc : memory::format_tag::nhwc};
MKLDNNMemoryDesc out_candidate{childDims, outputDataType, parentDims.ndims() == 5 ? memory::format_tag::ndhwc : memory::format_tag::nhwc};
createDescriptor({ in_candidate }, { out_candidate });
} else if ((parentDims.ndims() == 4 || parentDims.ndims() == 5) && parentDims[1] == 1) {
// Workaround: force the planar layout since it provides better performance when C == 1
MKLDNNMemoryDesc in_candidate{parentDims, inputDataType, parentDims.ndims() == 5 ? memory::format::ncdhw : memory::format::nchw};
MKLDNNMemoryDesc out_candidate{childDims, outputDataType, parentDims.ndims() == 5 ? memory::format::ncdhw : memory::format::nchw};
MKLDNNMemoryDesc in_candidate{parentDims, inputDataType, parentDims.ndims() == 5 ? memory::format_tag::ncdhw : memory::format_tag::nchw};
MKLDNNMemoryDesc out_candidate{childDims, outputDataType, parentDims.ndims() == 5 ? memory::format_tag::ncdhw : memory::format_tag::nchw};
createDescriptor({ in_candidate }, { out_candidate });
} else {
if (inputDataType != memory::bf16) {
inputDataType = memory::f32;
outputDataType = memory::f32;
if (inputDataType != memory::data_type::bf16) {
inputDataType = memory::data_type::f32;
outputDataType = memory::data_type::f32;
}
// The 'any' format tag is not supported here, so create a descriptor for each available format
for (auto format : getAvailableFormatsForDims(parentDims)) {
@ -119,8 +136,11 @@ void MKLDNNPoolingNode::createPrimitive() {
auto prim_desc = createPrimitiveDescriptor<pooling_forward::primitive_desc, pooling_forward::desc>(attr);
prim.reset(new pooling_forward(prim_desc, getParentEdgeAt(0)->getMemory().GetPrimitive(),
getChildEdgeAt(0)->getMemory().GetPrimitive()));
prim.reset(new pooling_forward(prim_desc));
auto src = getParentEdgesAtPort(0)[0]->getMemoryPtr()->GetPrimitive();
auto dst = getChildEdgesAtPort(0)[0]->getMemoryPtr()->GetPrimitive();
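// oneDNN v1.x primitives are stateless with respect to memory: instead of binding the
// memory objects at construction time, they are supplied at execution time through an
// argument map keyed by DNNL_ARG_* (consumed later as prim->execute(strm, primArgs)).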
primArgs = {{DNNL_ARG_SRC, src}, {DNNL_ARG_DST, dst}};
}
bool MKLDNNPoolingNode::created() const {
@ -149,23 +169,28 @@ void MKLDNNPoolingNode::createDescriptor(const std::vector<InferenceEngine::Tens
}
}
if (!exclude_pad && (not_zero_l || not_zero_r))
alg = pooling_avg_include_padding;
alg = algorithm::pooling_avg_include_padding;
else
alg = pooling_avg_exclude_padding;
alg = algorithm::pooling_avg_exclude_padding;
} else if (type == PoolingLayer::PoolType::MAX) {
alg = pooling_max;
alg = algorithm::pooling_max;
} else {
// TODO: Handle the rest of the possible types: STOCH, ROI, SPACIAL_PYRAMID
THROW_IE_EXCEPTION << "Unsupported pooling type";
}
auto convert = [] (std::vector<ptrdiff_t> orig_dims) {
return memory::dims(orig_dims.begin(), orig_dims.end());
};
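// memory::dims is std::vector<dnnl_dim_t> (int64_t) in oneDNN v1.x, so the IE-side
// ptrdiff_t vectors for stride/kernel/padding are converted element-wise before use.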
std::shared_ptr<pooling_forward::desc> desc_ptr(
new pooling_forward::desc(prop_kind::forward_scoring, alg,
in_candidate, out_candidate,
stride, kernel, effective_pad_begin, effective_pad_end,
mkldnn::padding_kind::zero));
convert(stride),
convert(kernel),
convert(effective_pad_begin),
convert(effective_pad_end)));
if (alg == pooling_avg_include_padding) {
if (alg == algorithm::pooling_avg_include_padding) {
// In case of AVG pooling that includes padding, the norm coefficient should be
// calculated taking the original pads into account, so we need to restore the
// original values for the end paddings.
@ -190,7 +215,7 @@ void MKLDNNPoolingNode::initSupportedPrimitiveDescriptors() {
for (auto& desc : descs) {
auto itpd = desc.createPrimitiveDescriptorIterator(getEngine(), attr);
while (itpd.is_not_end()) {
while (static_cast<bool>(itpd)) {
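// The oneDNN v1.x iterator protocol: the iterator converts to bool while an
// implementation is available and advances via next_impl(), replacing the old
// is_not_end()/operator++ pair (see the matching change at the bottom of the loop).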
InferenceEngine::LayerConfig config;
config.dynBatchSupport = true;
for (size_t i = 0; i < descInputNumbers(desc); i++) {
@ -201,30 +226,18 @@ void MKLDNNPoolingNode::initSupportedPrimitiveDescriptors() {
config.inConfs.push_back(dataConfig);
}
std::vector<mkldnn::memory::format> outFormats;
for (size_t i = 0; i < descOutputNumbers(desc); i++) {
InferenceEngine::DataConfig dataConfig;
dataConfig.inPlace = canBeInPlace() ? 0 : -1;
dataConfig.constant = false;
dataConfig.desc = MKLDNNExtensionUtils::getUninitTensorDesc(getDstMemDesc(itpd, i));
config.outConfs.push_back(dataConfig);
auto primDesc = itpd.fetch();
auto dstPrimDesc = mkldnn_primitive_desc_query_pd(primDesc.get(), mkldnn::convert_to_c(dst_pd), 0);
if (dstPrimDesc) {
outFormats.emplace_back(static_cast<memory::format>(itpd.dst_primitive_desc().desc().data.format));
} else {
// This path is needed to correctly handle Deconvolution node
auto diffSrcPrimDesc = mkldnn_primitive_desc_query_pd(primDesc.get(), mkldnn::convert_to_c(diff_src_pd), 0);
if (diffSrcPrimDesc) {
outFormats.emplace_back(static_cast<memory::format>(itpd.diff_src_primitive_desc().desc().data.format));
}
}
}
impl_desc_type impl_type = parse_impl_name(itpd.get_impl_info_str());
impl_desc_type impl_type = parse_impl_name(itpd.impl_info_str());
supportedPrimitiveDescriptors.emplace_back(config, impl_type, outFormats);
itpd++;
supportedPrimitiveDescriptors.emplace_back(config, impl_type);
if (!itpd.next_impl())
break;
}
}
}
@ -249,18 +262,18 @@ void MKLDNNPoolingNode::initDescriptor(const InferenceEngine::LayerConfig &confi
size_t selected_count = 0;
for (size_t j = 0; j < descs.size(); j++) {
const auto &desc = descs[j];
std::shared_ptr<primitive_desc_iterator> itpd;
primitive_desc_iterator itpd;
itpd = std::make_shared<primitive_desc_iterator>(desc.createPrimitiveDescriptorIterator(getEngine(), attr));
itpd = desc.createPrimitiveDescriptorIterator(getEngine(), attr);
while (itpd->is_not_end()) {
while (itpd) {
InferenceEngine::LayerConfig cfg;
cfg.dynBatchSupport = true;
for (size_t i = 0; i < descInputNumbers(desc); i++) {
InferenceEngine::DataConfig dataConfig;
dataConfig.inPlace = canBeInPlace() ? 0 : -1;
dataConfig.constant = false;
dataConfig.desc = getSrcMemDesc(*itpd, i);
dataConfig.desc = getSrcMemDesc(itpd, i);
cfg.inConfs.push_back(dataConfig);
}
@ -268,10 +281,10 @@ void MKLDNNPoolingNode::initDescriptor(const InferenceEngine::LayerConfig &confi
InferenceEngine::DataConfig dataConfig;
dataConfig.inPlace = -1;
dataConfig.constant = false;
dataConfig.desc = getDstMemDesc(*itpd, i);
dataConfig.desc = getDstMemDesc(itpd, i);
cfg.outConfs.push_back(dataConfig);
}
impl_desc_type impl_type = parse_impl_name(itpd->get_impl_info_str().c_str());
impl_desc_type impl_type = parse_impl_name(itpd.impl_info_str());
if (selected_count == selectedPrimitiveDescriptorIndex) {
if (impl_type != selectedPD->getImplementationType()) {
THROW_IE_EXCEPTION << "Cannot get the original layer configuration!";
@ -284,7 +297,8 @@ void MKLDNNPoolingNode::initDescriptor(const InferenceEngine::LayerConfig &confi
}
}
selected_count++;
(*itpd)++;
if (!itpd.next_impl())
break;
}
}

View File

@ -19,6 +19,7 @@ public:
void createDescriptor(const std::vector<InferenceEngine::TensorDesc>& inputDesc,
const std::vector<InferenceEngine::TensorDesc>& outputDesc) override;
std::vector<mkldnn::memory::format_tag> getAvailableFormatsForDims(const MKLDNNDims &dims) const override;
void getSupportedDescriptors() override;
void initSupportedPrimitiveDescriptors() override;
void initDescriptor(const InferenceEngine::LayerConfig &config) override;

View File

@ -6,14 +6,68 @@
#include <ie_common.h>
#include <mkldnn_node.h>
#include <common/primitive_attr.hpp>
#include <string>
#include <memory>
#include <vector>
#include <utility>
#include <primitive_attr.hpp>
namespace MKLDNNPlugin {
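// A rough guide to the modes as used by this node: FakeQuantization emulates the
// quantize/dequantize pair while keeping a float output, Quantization produces a real
// integer output, and Binarization packs the results of threshold comparisons into bits.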
enum QuantizeOpType {
FakeQuantization,
Quantization,
Binarization,
};
struct jit_quantize_params {
int c;
InferenceEngine::Precision src_prc;
InferenceEngine::Precision wei_prc;
InferenceEngine::Precision dst_prc;
InferenceEngine::Layout src_layout;
QuantizeOpType op_type;
};
struct jit_quantize_call_args {
const uint8_t* from;
const uint8_t* to;
const float* thresholds;
const float* output_mask;
const float* crop_low;
const float* crop_high;
const float* input_scale;
const float* input_shift;
const float* output_scale;
const float* output_shift;
size_t src_step;
size_t dst_step;
size_t block_size;
size_t work_amount;
};
struct jit_uni_quantize_kernel {
void (*ker_)(const jit_quantize_call_args *);
void operator()(const jit_quantize_call_args *args) {
assert(ker_);
ker_(args);
}
explicit jit_uni_quantize_kernel(jit_quantize_params jqp) : ker_(nullptr), jqp_(jqp) {}
virtual ~jit_uni_quantize_kernel() {}
virtual void create_ker() = 0;
jit_quantize_params jqp_;
};
class MKLDNNQuantizeNode : public MKLDNNNode {
public:
MKLDNNQuantizeNode(InferenceEngine::CNNLayerPtr layer, const mkldnn::engine& eng, MKLDNNWeightsSharing::Ptr &cache);
@ -27,8 +81,8 @@ public:
size_t getAxis() const { return axis; }
bool isBinarization() const { return quantizeAlgorithm == mkldnn::algorithm::binarization_depthwise; }
mkldnn::algorithm getAlgorithm() const { return quantizeAlgorithm; }
bool isBinarization() const { return quantizeOpType == QuantizeOpType::Binarization; }
QuantizeOpType getOpType() const { return quantizeOpType; }
const float* getBinarizationTresholdsPtr() const { return &binarizationThresholds[0]; }
const float* getBinarizationOutputMaskPtr() const { return reinterpret_cast<const float*>(&binarizationOutputMask[0]); }
@ -61,7 +115,10 @@ public:
private:
void init() override;
std::vector<mkldnn::memory::format> getDataFormats() const;
std::vector<mkldnn::memory::format_tag> getDataFormats() const;
void executeReference();
void executeBinarization();
void executeQuantization();
int levels = -1;
@ -94,7 +151,11 @@ private:
InferenceEngine::Precision inputPrecision = InferenceEngine::Precision::FP32;
InferenceEngine::Precision outputPrecision = InferenceEngine::Precision::FP32;
mkldnn::algorithm quantizeAlgorithm = mkldnn::algorithm::algorithm_undef;
QuantizeOpType quantizeOpType = FakeQuantization;
jit_quantize_params jqp = {};
std::shared_ptr<jit_uni_quantize_kernel> quantize_kernel = nullptr;
};
} // namespace MKLDNNPlugin

View File

@ -3,7 +3,7 @@
//
#include "mkldnn_reduce_node.h"
#include "desc_iterator.hpp"
#include "mkldnn_quantize_node.h"
#include <legacy/ie_layers.h>
#include <mkldnn.hpp>
@ -16,16 +16,17 @@
#include "ie_parallel.hpp"
#include <algorithm>
#include "jit_generator.hpp"
#include "jit_uni_eltwise.hpp"
#include "jit_uni_depthwise.hpp"
#include "jit_uni_quantization.hpp"
#include <cpu/x64/jit_generator.hpp>
#include <cpu/x64/jit_uni_eltwise.hpp>
#include <cpu/x64/jit_uni_depthwise_injector.hpp>
#include <cpu/x64/jit_uni_quantization_injector.hpp>
#include <cpu/x64/jit_uni_eltwise_injector.hpp>
using namespace mkldnn;
using namespace MKLDNNPlugin;
using namespace InferenceEngine;
using namespace mkldnn::impl;
using namespace mkldnn::impl::cpu;
using namespace mkldnn::impl::cpu::x64;
using namespace mkldnn::impl::utils;
using namespace Xbyak;
@ -67,7 +68,7 @@ using namespace Xbyak;
// some utility functions
static inline bool isFloatCompatible(memory::data_type type) {
return memory::f32 == type || memory::bf16 == type;
return memory::data_type::f32 == type || memory::data_type::bf16 == type;
}
template <cpu_isa_t isa>
@ -75,8 +76,15 @@ struct jit_uni_reduce_kernel_f32 : public jit_uni_reduce_kernel, public jit_gene
DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_uni_reduce_kernel_f32)
explicit jit_uni_reduce_kernel_f32(jit_reduce_config_params jcp)
: jit_uni_reduce_kernel(jcp), jit_generator() {
exp_injector.reset(new jit_uni_eltwise_injector_f32<isa>(this, alg_kind::eltwise_exp, 0.f, 0.f));
: jit_uni_reduce_kernel(jcp), jit_generator() {}
void create_ker() override {
jit_generator::create_kernel();
ker_ = (decltype(ker_))jit_ker();
}
void generate() override {
exp_injector.reset(new jit_uni_eltwise_injector_f32<isa>(this, alg_kind::eltwise_exp, 0.f, 0.f, 1));
if (!mayiuse(avx512_core_bf16) && mayiuse(avx512_core))
emu_vcvtneps2bf16.reset(new jit_emu_vcvtneps2bf16(this, isa, nullptr));
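// On AVX-512 targets without native bf16 support (no VCVTNEPS2BF16 instruction),
// the fp32 -> bf16 down-conversion is emulated in software by jit_emu_vcvtneps2bf16.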
@ -94,10 +102,10 @@ struct jit_uni_reduce_kernel_f32 : public jit_uni_reduce_kernel, public jit_gene
mov(reg_table, l_table);
}
if (isa == cpu::avx512_common || jcp_.reduce_mode == Reduce::And || jcp_.reduce_mode == Reduce::Or)
if (isa == cpu::x64::avx512_common || jcp_.reduce_mode == Reduce::And || jcp_.reduce_mode == Reduce::Or)
uni_vpxor(vmm_zero, vmm_zero, vmm_zero);
if ((isa == cpu::avx512_common && jcp_.reduce_mode == Reduce::And) || jcp_.reduce_mode == Reduce::Or) {
if ((isa == cpu::x64::avx512_common && jcp_.reduce_mode == Reduce::And) || jcp_.reduce_mode == Reduce::Or) {
uni_vmovups(vmm_aux, table_val(0));
}
@ -115,12 +123,10 @@ struct jit_uni_reduce_kernel_f32 : public jit_uni_reduce_kernel, public jit_gene
} else if (jcp_.reduce_mode == Reduce::LogSumExp) {
exp_injector->prepare_table();
}
ker_ = (decltype(ker_)) this->getCode();
}
private:
using Vmm = typename conditional3<isa == cpu::sse42, Xbyak::Xmm, isa == cpu::avx2,
using Vmm = typename conditional3<isa == cpu::x64::sse41, Xbyak::Xmm, isa == cpu::x64::avx2,
Xbyak::Ymm, Xbyak::Zmm>::type;
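// The vector register type follows the ISA: Xmm (128-bit) for SSE4.1, Ymm (256-bit)
// for AVX2, and Zmm (512-bit) otherwise; vlen below is the matching width in bytes.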
size_t vlen = cpu_isa_traits<isa>::vlen;
@ -250,7 +256,7 @@ private:
load_vector(vmm_src, ptr[reg_src], jcp_.src_dt);
reduce_kernel(vmm_src, vmm_dst);
if (isa == cpu::sse42) {
if (isa == cpu::x64::sse41) {
load_vector(vmm_src, ptr[reg_src + 4 * jcp_.src_data_size], jcp_.src_dt);
reduce_kernel(vmm_src, vmm_dst_aux);
}
@ -424,7 +430,7 @@ private:
load_vector(vmm_src, ptr[reg_src], jcp_.src_dt);
reduce_kernel(vmm_src, vmm_dst);
if (isa == cpu::sse42) {
if (isa == cpu::x64::sse41) {
load_vector(vmm_src, ptr[reg_src + 4 * jcp_.src_data_size], jcp_.src_dt);
reduce_kernel(vmm_src, vmm_dst);
}
@ -530,7 +536,7 @@ private:
inline void load_dst_vector() {
load_vector(vmm_dst, ptr[reg_dst], jcp_.dst_dt);
if (isa == cpu::sse42)
if (isa == cpu::x64::sse41)
load_vector(vmm_dst_aux, ptr[reg_dst + 4 * jcp_.dst_data_size], jcp_.dst_dt);
}
@ -538,30 +544,30 @@ private:
if (jcp_.reduce_mode == Reduce::Or && isa != avx512_common) {
vcmpneqps(vmm_dst, vmm_dst, vmm_zero);
uni_vandps(vmm_dst, vmm_dst, vmm_aux);
if (isa == cpu::sse42) {
if (isa == cpu::x64::sse41) {
vcmpneqps(vmm_dst_aux, vmm_dst_aux, vmm_zero);
uni_vandps(vmm_dst_aux, vmm_dst_aux, vmm_aux);
}
}
store_vector(ptr[reg_dst], vmm_dst, jcp_.dst_dt);
if (isa == cpu::sse42)
if (isa == cpu::x64::sse41)
store_vector(ptr[reg_dst + 4 * jcp_.dst_data_size], vmm_dst_aux, jcp_.dst_dt);
}
inline void load_vector(Vmm vmm_src, const Xbyak::Address &op, memory::data_type src_dt) {
switch (src_dt) {
case memory::f32:
case memory::s32:
case memory::data_type::f32:
case memory::data_type::s32:
uni_vmovups(vmm_src, op);
break;
case memory::bf16:
case memory::data_type::bf16:
uni_vpmovzxwd(vmm_src, op);
uni_vpslld(vmm_src, vmm_src, 16);
break;
case memory::s8:
case memory::data_type::s8:
uni_vpmovsxbd(vmm_src, op);
break;
case memory::u8:
case memory::data_type::u8:
uni_vpmovzxbd(vmm_src, op);
break;
default:
@ -574,19 +580,19 @@ private:
inline void load_scalar(Xmm xmm_src, const Xbyak::Address &op, memory::data_type src_dt) {
switch (src_dt) {
case memory::f32:
case memory::s32:
case memory::data_type::f32:
case memory::data_type::s32:
movss(xmm_src, op);
break;
case memory::bf16:
case memory::data_type::bf16:
pinsrw(xmm_src, op, 0x0);
uni_vpslld(xmm_src, xmm_src, 16);
break;
case memory::s8:
case memory::data_type::s8:
movsx(reg_tmp_32, op);
movq(xmm_src, reg_tmp_64);
break;
case memory::u8:
case memory::data_type::u8:
movzx(reg_tmp_32, op);
movq(xmm_src, reg_tmp_64);
break;
@ -608,41 +614,41 @@ private:
}
switch (dst_dt) {
case memory::f32:
case memory::s32:
case memory::data_type::f32:
case memory::data_type::s32:
uni_vmovups(op, vmm_dst);
break;
case memory::bf16:
case memory::data_type::bf16:
if (mayiuse(avx512_core_bf16))
vcvtneps2bf16(ymm_dst, vmm_dst);
else
emu_vcvtneps2bf16->emit({static_cast<size_t>(vmm_dst.getIdx())}, {static_cast<size_t>(ymm_dst.getIdx())});
vmovdqu16(op, ymm_dst);
break;
case memory::s8:
case memory::data_type::s8:
if (isa == avx512_common) {
vmaxps(vmm_dst, vmm_zero, vmm_dst);
vpmovsdb(op, vmm_dst);
} else {
uni_vpackssdw(vmm_dst, vmm_dst, vmm_dst);
if (isa != cpu::sse42)
if (isa != cpu::x64::sse41)
vpermq(ymm_dst, ymm_dst, 0x08);
uni_vpacksswb(vmm_dst, vmm_dst, vmm_dst);
if (isa != cpu::sse42)
if (isa != cpu::x64::sse41)
vmovq(op, xmm_dst);
else
movd(op, xmm_dst);
}
break;
case memory::u8:
case memory::data_type::u8:
if (isa == avx512_common) {
vpmovusdb(op, vmm_dst);
} else {
uni_vpackusdw(vmm_dst, vmm_dst, vmm_dst);
if (isa != cpu::sse42)
if (isa != cpu::x64::sse41)
vpermq(ymm_dst, ymm_dst, 0x08);
uni_vpackuswb(vmm_dst, vmm_dst, vmm_dst);
if (isa != cpu::sse42)
if (isa != cpu::x64::sse41)
vmovq(op, xmm_dst);
else
movd(op, xmm_dst);
@ -659,21 +665,21 @@ private:
}
switch (dst_dt) {
case memory::f32:
case memory::s32:
case memory::data_type::f32:
case memory::data_type::s32:
movss(op, xmm_dst);
break;
case memory::bf16:
case memory::data_type::bf16:
uni_vpsrld(xmm_dst, xmm_dst, 16);
pextrw(op, xmm_dst, 0x0);
break;
case memory::s8:
case memory::data_type::s8:
uni_vpackssdw(xmm_dst, xmm_dst, xmm_dst);
uni_vpacksswb(xmm_dst, xmm_dst, xmm_dst);
movq(reg_tmp_64, xmm_dst);
mov(op, reg_tmp_8);
break;
case memory::u8:
case memory::data_type::u8:
uni_vpackusdw(xmm_dst, xmm_dst, xmm_dst);
uni_vpackuswb(xmm_dst, xmm_dst, xmm_dst);
movq(reg_tmp_64, xmm_dst);
@ -685,9 +691,9 @@ private:
}
inline void load_embedded_horiz_reduce_store(Vmm vmm_dst, memory::data_type dst_dt) {
if (isa == cpu::sse42) {
if (isa == cpu::x64::sse41) {
load_embedded_horiz_store(vmm_dst, dst_dt);
} else if (isa == cpu::avx2) {
} else if (isa == cpu::x64::avx2) {
Xbyak::Ymm ymm_dst = Xbyak::Ymm(vmm_dst.getIdx());
vextractf128(xmm_aux1, ymm_dst, 0);
vextractf128(xmm_aux2, ymm_dst, 1);
@ -712,20 +718,20 @@ private:
movhlps(xmm_aux3, xmm_dst); // aux3:f(3,4),f(4,4),4,4
horiz_ps(xmm_dst, xmm_aux3); // dst:f(1,2,3,4),...
switch (dst_dt) {
case memory::f32:
case memory::bf16:
case memory::data_type::f32:
case memory::data_type::bf16:
load_scalar(xmm_aux3, ptr[reg_dst], dst_dt);
horiz_ps(xmm_dst, xmm_aux3);
store_scalar(ptr[reg_dst], xmm_dst, dst_dt);
break;
case memory::s32:
case memory::data_type::s32:
movss(xmm_aux3, ptr[reg_dst]);
uni_vcvtdq2ps(xmm_aux3, xmm_aux3);
horiz_ps(xmm_dst, xmm_aux3);
uni_vcvtps2dq(xmm_dst, xmm_dst);
movss(ptr[reg_dst], xmm_dst);
break;
case memory::u8:
case memory::data_type::u8:
vpbroadcastb(xmm_aux3, ptr[reg_dst]);
uni_vpmovzxbd(xmm_aux3, xmm_aux3);
uni_vcvtdq2ps(xmm_aux3, xmm_aux3);
@ -735,7 +741,7 @@ private:
uni_vpackuswb(xmm_dst, xmm_dst, xmm_dst);
pextrb(ptr[reg_dst], xmm_dst, 0);
break;
case memory::s8:
case memory::data_type::s8:
vpbroadcastb(xmm_aux3, ptr[reg_dst]);
uni_vpmovsxbd(xmm_aux3, xmm_aux3);
uni_vcvtdq2ps(xmm_aux3, xmm_aux3);
@ -814,8 +820,15 @@ struct jit_uni_reduce_post_kernel_f32 : public jit_uni_reduce_post_kernel, publi
DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_uni_reduce_post_kernel_f32)
explicit jit_uni_reduce_post_kernel_f32(jit_reduce_config_params jcp)
: jit_uni_reduce_post_kernel(jcp), jit_generator() {
log_injector.reset(new jit_uni_eltwise_injector_f32<isa>(this, alg_kind::eltwise_log, 0.f, 0.f));
: jit_uni_reduce_post_kernel(jcp), jit_generator() {}
void create_ker() override {
jit_generator::create_kernel();
ker_ = (decltype(ker_))jit_ker();
}
void generate() override {
log_injector.reset(new jit_uni_eltwise_injector_f32<isa>(this, alg_kind::eltwise_log, 0.f, 0.f, 1.f));
if (!mayiuse(avx512_core_bf16) && mayiuse(avx512_core))
emu_vcvtneps2bf16.reset(new jit_emu_vcvtneps2bf16(this, isa, nullptr));
@ -828,7 +841,7 @@ struct jit_uni_reduce_post_kernel_f32 : public jit_uni_reduce_post_kernel, publi
if (!jcp_.planar_layout)
mov(reg_reduce_c, ptr[reg_params + GET_OFF(reduce_c)]);
if (isa == cpu::avx512_common)
if (isa == cpu::x64::avx512_common)
uni_vpxor(vmm_zero, vmm_zero, vmm_zero);
reduce_post_main();
@ -843,12 +856,10 @@ struct jit_uni_reduce_post_kernel_f32 : public jit_uni_reduce_post_kernel, publi
if (jcp_.reduce_mode == Reduce::LogSum || jcp_.reduce_mode == Reduce::LogSumExp) {
log_injector->prepare_table();
}
ker_ = (decltype(ker_)) this->getCode();
}
private:
using Vmm = typename conditional3<isa == cpu::sse42, Xbyak::Xmm, isa == cpu::avx2,
using Vmm = typename conditional3<isa == cpu::x64::sse41, Xbyak::Xmm, isa == cpu::x64::avx2,
Xbyak::Ymm, Xbyak::Zmm>::type;
size_t vlen = cpu_isa_traits<isa>::vlen;
@ -902,12 +913,12 @@ private:
// load
load_vector(vmm_dst, ptr[reg_dst], jcp_.dst_dt);
if (isa == cpu::sse42)
if (isa == cpu::x64::sse41)
load_vector(vmm_dst_aux, ptr[reg_dst + 4 * jcp_.dst_data_size], jcp_.dst_dt);
// reduce and store
horiz_reduce_store(vmm_dst, jcp_.dst_dt);
if (isa == cpu::sse42)
if (isa == cpu::x64::sse41)
load_embedded_horiz_reduce_store(vmm_dst_aux, jcp_.dst_dt);
add(reg_dst, step * jcp_.dst_data_size);
@ -941,17 +952,17 @@ private:
// load
load_vector(vmm_dst, ptr[reg_dst], jcp_.dst_dt);
if (isa == cpu::sse42)
if (isa == cpu::x64::sse41)
load_vector(vmm_dst_aux, ptr[reg_dst + 4 * jcp_.dst_data_size], jcp_.dst_dt);
// reduce
reduce_map_kernel(vmm_dst);
if (isa == cpu::sse42)
if (isa == cpu::x64::sse41)
reduce_map_kernel(vmm_dst_aux);
// store
store_vector(ptr[reg_dst], vmm_dst, jcp_.dst_dt);
if (isa == cpu::sse42)
if (isa == cpu::x64::sse41)
store_vector(ptr[reg_dst + 4 * jcp_.dst_data_size], vmm_dst_aux, jcp_.dst_dt);
add(reg_dst, step * jcp_.dst_data_size);
@ -1019,18 +1030,18 @@ private:
inline void load_vector(Vmm vmm_src, const Xbyak::Address &op, memory::data_type src_dt) {
switch (src_dt) {
case memory::f32:
case memory::s32:
case memory::data_type::f32:
case memory::data_type::s32:
uni_vmovups(vmm_src, op);
break;
case memory::bf16:
case memory::data_type::bf16:
uni_vpmovzxwd(vmm_src, op);
uni_vpslld(vmm_src, vmm_src, 16);
break;
case memory::s8:
case memory::data_type::s8:
uni_vpmovsxbd(vmm_src, op);
break;
case memory::u8:
case memory::data_type::u8:
uni_vpmovzxbd(vmm_src, op);
break;
default:
@ -1043,19 +1054,19 @@ private:
inline void load_scalar(Xmm xmm_src, const Xbyak::Address &op, memory::data_type src_dt) {
switch (src_dt) {
case memory::f32:
case memory::s32:
case memory::data_type::f32:
case memory::data_type::s32:
movss(xmm_src, op);
break;
case memory::bf16:
case memory::data_type::bf16:
pinsrw(xmm_src, op, 0x0);
uni_vpslld(xmm_src, xmm_src, 16);
break;
case memory::s8:
case memory::data_type::s8:
movsx(reg_tmp_32, op);
movq(xmm_src, reg_tmp_64);
break;
case memory::u8:
case memory::data_type::u8:
movzx(reg_tmp_32, op);
movq(xmm_src, reg_tmp_64);
break;
@ -1077,41 +1088,41 @@ private:
}
switch (dst_dt) {
case memory::f32:
case memory::s32:
case memory::data_type::f32:
case memory::data_type::s32:
uni_vmovups(op, vmm_dst);
break;
case memory::bf16:
case memory::data_type::bf16:
if (mayiuse(avx512_core_bf16))
vcvtneps2bf16(ymm_dst, vmm_dst);
else
emu_vcvtneps2bf16->emit({static_cast<size_t>(vmm_dst.getIdx())}, {static_cast<size_t>(ymm_dst.getIdx())});
vmovdqu16(op, ymm_dst);
break;
case memory::s8:
case memory::data_type::s8:
if (isa == avx512_common) {
vmaxps(vmm_dst, vmm_zero, vmm_dst);
vpmovsdb(op, vmm_dst);
} else {
uni_vpackssdw(vmm_dst, vmm_dst, vmm_dst);
if (isa != cpu::sse42)
if (isa != cpu::x64::sse41)
vpermq(ymm_dst, ymm_dst, 0x08);
uni_vpacksswb(vmm_dst, vmm_dst, vmm_dst);
if (isa != cpu::sse42)
if (isa != cpu::x64::sse41)
vmovq(op, xmm_dst);
else
movd(op, xmm_dst);
}
break;
case memory::u8:
case memory::data_type::u8:
if (isa == avx512_common) {
vpmovusdb(op, vmm_dst);
} else {
uni_vpackusdw(vmm_dst, vmm_dst, vmm_dst);
if (isa != cpu::sse42)
if (isa != cpu::x64::sse41)
vpermq(ymm_dst, ymm_dst, 0x08);
uni_vpackuswb(vmm_dst, vmm_dst, vmm_dst);
if (isa != cpu::sse42)
if (isa != cpu::x64::sse41)
vmovq(op, xmm_dst);
else
movd(op, xmm_dst);
@ -1128,21 +1139,21 @@ private:
}
switch (dst_dt) {
case memory::f32:
case memory::s32:
case memory::data_type::f32:
case memory::data_type::s32:
movss(op, xmm_dst);
break;
case memory::bf16:
case memory::data_type::bf16:
uni_vpsrld(xmm_dst, xmm_dst, 16);
pextrw(op, xmm_dst, 0x0);
break;
case memory::s8:
case memory::data_type::s8:
uni_vpackssdw(xmm_dst, xmm_dst, xmm_dst);
uni_vpacksswb(xmm_dst, xmm_dst, xmm_dst);
movq(reg_tmp_64, xmm_dst);
mov(op, reg_tmp_8);
break;
case memory::u8:
case memory::data_type::u8:
uni_vpackusdw(xmm_dst, xmm_dst, xmm_dst);
uni_vpackuswb(xmm_dst, xmm_dst, xmm_dst);
movq(reg_tmp_64, xmm_dst);
@ -1154,9 +1165,9 @@ private:
}
inline void horiz_reduce_store(Vmm vmm_dst, memory::data_type dst_dt) {
if (isa == cpu::sse42) {
if (isa == cpu::x64::sse41) {
horize_store(vmm_dst, dst_dt);
} else if (isa == cpu::avx2) {
} else if (isa == cpu::x64::avx2) {
Xbyak::Ymm ymm_dst = Xbyak::Ymm(vmm_dst.getIdx());
vextractf128(xmm_aux1, ymm_dst, 0);
vextractf128(xmm_aux2, ymm_dst, 1);
@ -1181,24 +1192,24 @@ private:
movhlps(xmm_aux3, xmm_dst); // aux3:f(3,4),f(4,4),4,4
horiz_ps(xmm_dst, xmm_aux3); // dst:f(1,2,3,4),...
switch (dst_dt) {
case memory::f32:
case memory::data_type::f32:
movss(ptr[reg_dst], xmm_dst);
break;
case memory::bf16:
case memory::data_type::bf16:
uni_vpsrld(xmm_dst, xmm_dst, 16);
pextrw(ptr[reg_dst], xmm_dst, 0x0);
break;
case memory::s32:
case memory::data_type::s32:
uni_vcvtps2dq(xmm_dst, xmm_dst);
movss(ptr[reg_dst], xmm_dst);
break;
case memory::u8:
case memory::data_type::u8:
uni_vcvtps2dq(xmm_dst, xmm_dst);
uni_vpackusdw(xmm_dst, xmm_dst, xmm_dst);
uni_vpackuswb(xmm_dst, xmm_dst, xmm_dst);
pextrb(ptr[reg_dst], xmm_dst, 0);
break;
case memory::s8:
case memory::data_type::s8:
uni_vcvtps2dq(xmm_dst, xmm_dst);
uni_vpackssdw(xmm_dst, xmm_dst, xmm_dst);
uni_vpacksswb(xmm_dst, xmm_dst, xmm_dst);
@ -1210,9 +1221,9 @@ private:
}
inline void load_embedded_horiz_reduce_store(Vmm vmm_dst, memory::data_type dst_dt) {
if (isa == cpu::sse42) {
if (isa == cpu::x64::sse41) {
load_embedded_horiz_store(vmm_dst, dst_dt);
} else if (isa == cpu::avx2) {
} else if (isa == cpu::x64::avx2) {
Xbyak::Ymm ymm_dst = Xbyak::Ymm(vmm_dst.getIdx());
vextractf128(xmm_aux1, ymm_dst, 0);
vextractf128(xmm_aux2, ymm_dst, 1);
@ -1237,20 +1248,20 @@ private:
movhlps(xmm_aux3, xmm_dst); // aux3:f(3,4),f(4,4),4,4
horiz_ps(xmm_dst, xmm_aux3); // dst:f(1,2,3,4),...
switch (dst_dt) {
case memory::f32:
case memory::bf16:
case memory::data_type::f32:
case memory::data_type::bf16:
load_scalar(xmm_aux3, ptr[reg_dst], dst_dt);
horiz_ps(xmm_dst, xmm_aux3);
store_scalar(ptr[reg_dst], xmm_dst, dst_dt);
break;
case memory::s32:
case memory::data_type::s32:
movss(xmm_aux3, ptr[reg_dst]);
uni_vcvtdq2ps(xmm_aux3, xmm_aux3);
horiz_ps(xmm_dst, xmm_aux3);
uni_vcvtps2dq(xmm_dst, xmm_dst);
movss(ptr[reg_dst], xmm_dst);
break;
case memory::u8:
case memory::data_type::u8:
vpbroadcastb(xmm_aux3, ptr[reg_dst]);
uni_vpmovzxbd(xmm_aux3, xmm_aux3);
uni_vcvtdq2ps(xmm_aux3, xmm_aux3);
@ -1260,7 +1271,7 @@ private:
uni_vpackuswb(xmm_dst, xmm_dst, xmm_dst);
pextrb(ptr[reg_dst], xmm_dst, 0);
break;
case memory::s8:
case memory::data_type::s8:
vpbroadcastb(xmm_aux3, ptr[reg_dst]);
uni_vpmovsxbd(xmm_aux3, xmm_aux3);
uni_vcvtdq2ps(xmm_aux3, xmm_aux3);
@ -1369,7 +1380,7 @@ void MKLDNNReduceNode::initSupportedPrimitiveDescriptors() {
Precision inputPrecision = getCnnLayer()->insData[REDUCE_DATA].lock()->getPrecision();
Precision outputPrecision = getCnnLayer()->outData[0]->getPrecision();
jit_mode = (mayiuse(cpu::sse42)) && getParentEdgeAt(REDUCE_DATA)->getDims().ndims() <= 5 &&
jit_mode = (mayiuse(cpu::x64::sse41)) && getParentEdgeAt(REDUCE_DATA)->getDims().ndims() <= 5 &&
std::find(std::begin(supportedPrecisions), std::end(supportedPrecisions), inputPrecision) != std::end(supportedPrecisions) &&
std::find(std::begin(supportedPrecisions), std::end(supportedPrecisions), outputPrecision) != std::end(supportedPrecisions);
@ -1405,19 +1416,19 @@ void MKLDNNReduceNode::initSupportedPrimitiveDescriptors() {
config.inConfs[REDUCE_INDEXES].inPlace = -1;
config.outConfs[0].inPlace = -1;
auto pushDesc = [&](memory::format inFormat, memory::format outFormat, memory::data_type inDataType,
auto pushDesc = [&](memory::format_tag inFormat, memory::format_tag outFormat, memory::data_type inDataType,
memory::data_type outDataType, impl_desc_type impl_type) {
config.inConfs[REDUCE_DATA].desc = MKLDNNMemoryDesc(getParentEdgeAt(REDUCE_DATA)->getDims(), inDataType, inFormat);
config.inConfs[REDUCE_INDEXES].desc = MKLDNNMemoryDesc(getParentEdgeAt(REDUCE_INDEXES)->getDims(), memory::s32, memory::x);
config.inConfs[REDUCE_INDEXES].desc = MKLDNNMemoryDesc(getParentEdgeAt(REDUCE_INDEXES)->getDims(), memory::data_type::s32, memory::format_tag::x);
config.outConfs[0].desc = MKLDNNMemoryDesc(getChildEdgeAt(0)->getDims(), outDataType, outFormat);
supportedPrimitiveDescriptors.push_back({config, impl_type, outFormat});
};
if (jit_mode) {
impl_desc_type impl_type = impl_desc_type::jit_sse42;
if (mayiuse(cpu::avx512_common)) {
if (mayiuse(cpu::x64::avx512_common)) {
impl_type = impl_desc_type::jit_avx512;
} else if (mayiuse(cpu::avx2)) {
} else if (mayiuse(cpu::x64::avx2)) {
impl_type = impl_desc_type::jit_avx2;
}
@ -1425,22 +1436,23 @@ void MKLDNNReduceNode::initSupportedPrimitiveDescriptors() {
MKLDNNMemory::GetPlainFormat(memory::dims(getChildEdgeAt(0)->getDims().ndims())), inputDataType, outputDataType, impl_type);
if (keep_dims) {
if (getParentEdgeAt(REDUCE_DATA)->getDims().ndims() == 4 && getParentEdgeAt(REDUCE_DATA)->getDims().ToSizeVector()[1] > 1) {
if (mayiuse(cpu::avx512_common)) {
pushDesc(memory::nChw16c, memory::nChw16c, inputDataType, outputDataType, impl_type);
} else if (mayiuse(cpu::avx2) || mayiuse(cpu::sse42)) {
pushDesc(memory::nChw8c, memory::nChw8c, inputDataType, outputDataType, impl_type);
if (mayiuse(cpu::x64::avx512_common)) {
pushDesc(memory::format_tag::nChw16c, memory::format_tag::nChw16c, inputDataType, outputDataType, impl_type);
} else if (mayiuse(cpu::x64::avx2) || mayiuse(cpu::x64::sse41)) {
pushDesc(memory::format_tag::nChw8c, memory::format_tag::nChw8c, inputDataType, outputDataType, impl_type);
}
} else if (getParentEdgeAt(REDUCE_DATA)->getDims().ndims() == 5 && getParentEdgeAt(REDUCE_DATA)->getDims().ToSizeVector()[1] > 1) {
if (mayiuse(cpu::avx512_common)) {
pushDesc(memory::nCdhw16c, memory::nCdhw16c, inputDataType, outputDataType, impl_type);
} else if (mayiuse(cpu::avx2) || mayiuse(cpu::sse42)) {
pushDesc(memory::nCdhw8c, memory::nCdhw8c, inputDataType, outputDataType, impl_type);
if (mayiuse(cpu::x64::avx512_common)) {
pushDesc(memory::format_tag::nCdhw16c, memory::format_tag::nCdhw16c, inputDataType, outputDataType, impl_type);
} else if (mayiuse(cpu::x64::avx2) || mayiuse(cpu::x64::sse41)) {
pushDesc(memory::format_tag::nCdhw8c, memory::format_tag::nCdhw8c, inputDataType, outputDataType, impl_type);
}
}
}
} else {
pushDesc(MKLDNNMemory::GetPlainFormat(memory::dims(getParentEdgeAt(REDUCE_DATA)->getDims().ndims())),
MKLDNNMemory::GetPlainFormat(memory::dims(getChildEdgeAt(0)->getDims().ndims())), memory::f32, memory::f32, impl_desc_type::ref);
MKLDNNMemory::GetPlainFormat(memory::dims(getChildEdgeAt(0)->getDims().ndims())),
memory::data_type::f32, memory::data_type::f32, impl_desc_type::ref);
}
}
@ -1456,8 +1468,7 @@ void MKLDNNReduceNode::createPrimitive() {
THROW_IE_EXCEPTION << "Reduce layer with name " << getName() << "didn't set preferable primitive descriptor.";
auto selectedPD = getSelectedPrimitiveDescriptor();
Layout selected_layout = selectedPD->getConfig().inConfs[REDUCE_DATA].desc.getLayout();
planar_layout = MKLDNNMemory::GetPlainLayout(getParentEdgeAt(REDUCE_DATA)->getDims()) == selected_layout;
planar_layout = getParentEdgeAt(REDUCE_DATA)->getMemory().GetDesc().isPlainFormat();
auto jcp = jit_reduce_config_params();
jcp.src_dt = MKLDNNExtensionUtils::IEPrecisionToDataType(selectedPD->getConfig().inConfs[REDUCE_DATA].desc.getPrecision());
@ -1467,20 +1478,26 @@ void MKLDNNReduceNode::createPrimitive() {
jcp.planar_layout = planar_layout;
jcp.reduce_mode = reduceMode;
if (mayiuse(cpu::avx512_common)) {
reduce_kernel.reset(new jit_uni_reduce_kernel_f32<cpu::avx512_common>(jcp));
reduce_post_kernel.reset(new jit_uni_reduce_post_kernel_f32<cpu::avx512_common>(jcp));
if (mayiuse(cpu::x64::avx512_common)) {
reduce_kernel.reset(new jit_uni_reduce_kernel_f32<cpu::x64::avx512_common>(jcp));
reduce_post_kernel.reset(new jit_uni_reduce_post_kernel_f32<cpu::x64::avx512_common>(jcp));
blk_size = 16;
} else if (mayiuse(cpu::avx2)) {
reduce_kernel.reset(new jit_uni_reduce_kernel_f32<cpu::avx2>(jcp));
reduce_post_kernel.reset(new jit_uni_reduce_post_kernel_f32<cpu::avx2>(jcp));
} else if (mayiuse(cpu::x64::avx2)) {
reduce_kernel.reset(new jit_uni_reduce_kernel_f32<cpu::x64::avx2>(jcp));
reduce_post_kernel.reset(new jit_uni_reduce_post_kernel_f32<cpu::x64::avx2>(jcp));
blk_size = 8;
} else if (mayiuse(cpu::sse42)) {
reduce_kernel.reset(new jit_uni_reduce_kernel_f32<cpu::sse42>(jcp));
reduce_post_kernel.reset(new jit_uni_reduce_post_kernel_f32<cpu::sse42>(jcp));
} else if (mayiuse(cpu::x64::sse41)) {
reduce_kernel.reset(new jit_uni_reduce_kernel_f32<cpu::x64::sse41>(jcp));
reduce_post_kernel.reset(new jit_uni_reduce_post_kernel_f32<cpu::x64::sse41>(jcp));
blk_size = 8;
}
if (reduce_kernel)
reduce_kernel->create_ker();
if (reduce_post_kernel)
reduce_post_kernel->create_ker();
jit_mode = jit_mode && reduce_kernel;
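// If no ISA matched (CPU below SSE4.1), no kernel is created and execution falls
// back to the reference path (see the jit_mode check in execute()).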
}
@ -1521,12 +1538,8 @@ void MKLDNNReduceNode::execute(mkldnn::stream strm) {
ReduceW = IW != OW && OW == 1;
}
const uint8_t *src_data = reinterpret_cast<const uint8_t *>(srcMemPtr->GetData()) +
srcMemPtr->GetDescriptor().data.layout_desc.blocking.offset_padding *
MKLDNNExtensionUtils::sizeOfDataType(mkldnn::memory::data_type(srcMemPtr->GetDescriptor().data.data_type));
uint8_t *dst_data = reinterpret_cast<uint8_t *>(dstMemPtr->GetData()) +
dstMemPtr->GetDescriptor().data.layout_desc.blocking.offset_padding *
MKLDNNExtensionUtils::sizeOfDataType(mkldnn::memory::data_type(dstMemPtr->GetDescriptor().data.data_type));
const uint8_t *src_data = reinterpret_cast<const uint8_t *>(srcMemPtr->GetPtr());
uint8_t *dst_data = reinterpret_cast<uint8_t *>(dstMemPtr->GetPtr());
if (jit_mode) {
reduce_type(src_data, dst_data, dst_size);
} else {

View File

@ -56,6 +56,8 @@ struct jit_uni_reduce_kernel {
explicit jit_uni_reduce_kernel(jit_reduce_config_params jcp) : ker_(nullptr), jcp_(jcp) {}
virtual ~jit_uni_reduce_kernel() {}
virtual void create_ker() = 0;
jit_reduce_config_params jcp_;
};
@ -67,6 +69,8 @@ struct jit_uni_reduce_post_kernel {
ker_(args);
}
virtual void create_ker() = 0;
explicit jit_uni_reduce_post_kernel(jit_reduce_config_params jcp) : ker_(nullptr), jcp_(jcp) {}
virtual ~jit_uni_reduce_post_kernel() {}

View File

@ -58,8 +58,8 @@ void MKLDNNReorderNode::initSupportedPrimitiveDescriptors() {
config.inConfs[0].desc = parent->getSelectedPrimitiveDescriptor()->getConfig().outConfs[0].desc;
config.outConfs[0].desc = child->getSelectedPrimitiveDescriptor()->getConfig().inConfs[0].desc;
} else {
config.inConfs[0].desc = MKLDNNMemoryDesc(getParentEdgeAt(0)->getDims(), inputDataType, memory::format::any);
config.outConfs[0].desc = MKLDNNMemoryDesc(getChildEdgeAt(0)->getDims(), outputDataType, memory::format::any);
config.inConfs[0].desc = MKLDNNMemoryDesc(getParentEdgeAt(0)->getDims(), inputDataType, memory::format_tag::any);
config.outConfs[0].desc = MKLDNNMemoryDesc(getChildEdgeAt(0)->getDims(), outputDataType, memory::format_tag::any);
}
supportedPrimitiveDescriptors.emplace_back(config, impl_desc_type::reorder, MKLDNNMemory::Convert(config.outConfs[0].desc.getLayout()));
@ -103,65 +103,53 @@ void MKLDNNReorderNode::createReorderPrimitive(const mkldnn::memory::desc &srcDe
mask = 1 << oc_dim_id;
attr.set_output_scales(mask, scales);
attr.set_int_output_round_mode(round_nearest);
}
auto createReorder = [&]() {
auto createReorder = [&]() -> bool {
// No autoblocking. Reorder can be applied as is
reorder::primitive_desc pd = reorder::primitive_desc(src_blocked->GetPrimitiveDescriptor(), dst_blocked->GetPrimitiveDescriptor(), attr);
reorder::primitive_desc pd = mkldnn::reorder::primitive_desc(src_blocked->GetPrimitive(), dst_blocked->GetPrimitive(), attr, true);
const char *info;
mkldnn_primitive_desc_query(pd.get(), mkldnn::convert_to_c(impl_info_str), 0, &info);
supportedPrimitiveDescriptors[0].setImplementationType(parse_impl_name(std::string(info)));
supportedPrimitiveDescriptors[0].setOutputLayouts(static_cast<memory::format>(dstDesc.data.format));
if (!pd)
return false;
prim.reset(new mkldnn::reorder(pd, src_blocked->GetPrimitive(), dst_blocked->GetPrimitive()));
auto info = pd.impl_info_str();
supportedPrimitiveDescriptors[0].setImplementationType(parse_impl_name(info));
prim.reset(new mkldnn::reorder(pd));
return true;
};
try {
createReorder();
} catch (...) {
auto success = createReorder();
if (!success) {
// TODO: We should keep shape consistency for const and expected shape for node.
//       If a reshape operation is required, it should be explicitly injected into the graph.
//
// There is a limitation in how IE represents weights for grouped convolutions: IE doesn't
// split the group dimension into a separate shape dimension. IE uses OIHW, but mkldnn expects
// GOIHW. So we perform an implicit reshape to the dst shape.
//
// MKLDNN doesn't support direct reorders from planar data formats to grouped weights formats.
// The code block below tries to detect such cases and reinterpret planar data formats (e.g. nchw)
// as grouped weights planar formats (e.g. goihw), since they share the same physical memory
// layout (see the sketch after this function).
if (MKLDNNMemory::GetPlainFormat(src_blocked->GetDims()) == src_blocked->GetFormat() &&
if (src_blocked->GetDesc().isPlainFormat() &&
src_blocked->GetDims().size() + 1 == dst_blocked->GetDims().size()) {
try {
mkldnn::memory::dims newDims = dst_blocked->GetDims();
mkldnn::memory::format newFormat;
if (MKLDNNMemory::IsGroupedFormat(dst_blocked->GetFormat())) {
newFormat = src_blocked->GetDims().size() == 4 ? memory::goihw :
src_blocked->GetDims().size() == 5 ? memory::goidhw :
src_blocked->GetFormat();
} else {
newFormat = src_blocked->GetDims().size() == 4 ? memory::ncdhw :
src_blocked->GetFormat();
}
const auto newDims = dst_blocked->GetDims();
const auto newFormat = MKLDNNMemory::GetPlainFormat(newDims);
auto newDesc = mkldnn::memory::desc(newDims, src_blocked->GetDataType(), newFormat);
src_blocked->Create(newDesc, srcPtr, false);
createReorder();
} catch (...) {
THROW_IE_EXCEPTION << "Cannot create reorder primitive: unsupported reorder case";
}
// MKLDNN doesn't support direct reorders between planar data formats when they have different
// ranks but the same number of elements. The code block below detects these cases and
// substitutes the src dims with the dst ones.
} else if (MKLDNNMemory::GetPlainFormat(src_blocked->GetDims()) == src_blocked->GetFormat() &&
MKLDNNMemory::GetPlainFormat(dst_blocked->GetDims()) == dst_blocked->GetFormat() &&
src_blocked->GetElementsCount() == dst_blocked->GetElementsCount()) {
try {
auto newDesc = mkldnn::memory::desc(dst_blocked->GetDims(), src_blocked->GetDataType(), dst_blocked->GetFormat());
src_blocked->Create(newDesc, srcPtr, false);
createReorder();
} catch (...) {
THROW_IE_EXCEPTION << "Cannot create reorder primitive: unsupported reorder case";
}
} else {
THROW_IE_EXCEPTION << "Cannot create reorder primitive: unsupported reorder case";
success = createReorder();
}
}
if (!success) {
THROW_IE_EXCEPTION << "Cannot create reorder primitive: unsupported reorder case";
}
auto src = getParentEdgesAtPort(0)[0]->getMemoryPtr()->GetPrimitive();
auto dst = getChildEdgesAtPort(0)[0]->getMemoryPtr()->GetPrimitive();
primArgs = {{DNNL_ARG_SRC, src}, {DNNL_ARG_DST, dst}};
}
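The grouped-weights fallback above relies on planar formats with equal element counts sharing one physical layout. A minimal sketch of that reinterpretation, with hypothetical dims for illustration only:

    // Hypothetical sketch: a planar 4D OIHW weights descriptor and a 5D grouped
    // GOIHW descriptor with G == 1 describe the same dense memory, so the same
    // data handle can back either view and the reorder can be retried with the
    // grouped descriptor.
    mkldnn::memory::desc planar({64, 3, 3, 3},
                                mkldnn::memory::data_type::f32,
                                mkldnn::memory::format_tag::oihw);
    mkldnn::memory::desc grouped({1, 64, 3, 3, 3},
                                 mkldnn::memory::data_type::f32,
                                 mkldnn::memory::format_tag::goihw);
    // planar.get_size() == grouped.get_size(): same bytes, new logical dims.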
const std::vector<impl_desc_type>& MKLDNNReorderNode::getPrimitivesPriority() {
@ -194,10 +182,10 @@ void MKLDNNReorderNode::setDynamicBatchLim(int lim) {
void *dst_data_hdl = dstMemPtr->GetPrimitive().get_data_handle();
src_d.data.dims[0] = batchToProcess();
src_d.data.layout_desc.blocking.padding_dims[0] = batchToProcess();
src_d.data.padded_dims[0] = batchToProcess();
dst_d.data.dims[0] = batchToProcess();
dst_d.data.layout_desc.blocking.padding_dims[0] = batchToProcess();
dst_d.data.padded_dims[0] = batchToProcess();
createReorderPrimitive(src_d, src_data_hdl, dst_d, dst_data_hdl);
}
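For reference, the memory-descriptor field renames applied here (and throughout this commit) when moving to oneDNN v1.x:

    // v0.x                                           v1.x
    // md.data.layout_desc.blocking.padding_dims   -> md.data.padded_dims
    // md.data.layout_desc.blocking.offset_padding -> md.data.offset0
    // MKLDNNMemory::GetPtr() now folds the offset0 adjustment into the returned
    // pointer, which is why the manual base-pointer arithmetic could be dropped.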

View File

@ -36,22 +36,19 @@ void MKLDNNReshapeNode::initSupportedPrimitiveDescriptors() {
if (inputDataType != outputDataType)
inputDataType = outputDataType;
auto& outDims = getChildEdgeAt(0)->getDims();
memory::format outFormat = MKLDNNMemory::GetPlainFormat(outDims);
InferenceEngine::LayerConfig config;
config.dynBatchSupport = true;
config.inConfs.resize(getParentEdges().size());
for (size_t i = 0; i <getParentEdges().size(); i++) {
config.inConfs[i].inPlace = -1;
config.inConfs[i].constant = false;
config.inConfs[i].desc = MKLDNNMemoryDesc(getParentEdgeAt(i)->getDims(), inputDataType,
MKLDNNMemory::GetPlainFormat(getParentEdgeAt(i)->getDims()));
config.inConfs[i].desc = MKLDNNMemoryDesc(getParentEdgeAt(i)->getDims(), inputDataType);
}
config.outConfs.resize(1);
config.outConfs[0].inPlace = 0;
config.outConfs[0].constant = false;
config.outConfs[0].desc = MKLDNNMemoryDesc(getChildEdgeAt(0)->getDims(), outputDataType, outFormat);
supportedPrimitiveDescriptors.emplace_back(config, impl_desc_type::unknown, outFormat);
config.outConfs[0].desc = MKLDNNMemoryDesc(getChildEdgeAt(0)->getDims(), outputDataType);
supportedPrimitiveDescriptors.emplace_back(config, impl_desc_type::unknown);
}
void MKLDNNReshapeNode::createPrimitive() {

View File

@ -4,7 +4,8 @@
#include "mkldnn_rnn.h"
#include "mkldnn_extension_utils.h"
#include "desc_iterator.hpp"
#include "utils/general_utils.h"
#include <string>
#include <utility>
@ -14,38 +15,55 @@ using namespace InferenceEngine;
namespace MKLDNNPlugin {
template <typename T, typename P>
inline bool one_of(T val, P item) { return val == item; }
template <typename T, typename P, typename... Args>
inline bool one_of(T val, P item, Args... item_others) {
return val == item || one_of(val, item_others...);
}
using _RNN = RNNSequenceLayer; // alias
static rnn_direction ie2mkl(_RNN::Direction &direction) {
return direction == _RNN::FWD ? unidirectional_left2right
: direction == _RNN::BWD ? unidirectional_right2left
: direction == _RNN::BDR ? bidirectional_concat
: unidirectional;
return direction == _RNN::FWD ? rnn_direction::unidirectional_left2right
: direction == _RNN::BWD ? rnn_direction::unidirectional_right2left
: direction == _RNN::BDR ? rnn_direction::bidirectional_concat
: rnn_direction::unidirectional;
}
static algorithm ie2mkl(std::string act_type) {
return act_type == "sigmoid" ? eltwise_logistic
: act_type == "tanh" ? eltwise_tanh
: act_type == "relu" ? eltwise_relu
: algorithm_undef;
return act_type == "sigmoid" ? algorithm::eltwise_logistic
: act_type == "tanh" ? algorithm::eltwise_tanh
: act_type == "relu" ? algorithm::eltwise_relu
: algorithm::undef;
}
static algorithm ie2mkl(RNNCellBase::CellType cell_type) {
switch (cell_type) {
case RNNCellBase::LSTM: return vanilla_lstm;
case RNNCellBase::GRU: return vanilla_gru;
case RNNCellBase::GRU_LBR: return gru_linear_before_reset;
case RNNCellBase::RNN: return vanilla_rnn;
case RNNCellBase::RNN: return algorithm::vanilla_rnn;
case RNNCellBase::LSTM: return algorithm::vanilla_lstm;
case RNNCellBase::GRU: return algorithm::vanilla_gru;
case RNNCellBase::GRU_LBR: return algorithm::lbr_gru;
default:
THROW_IE_EXCEPTION << "Unsoupported cell type";
return algorithm_undef;
THROW_IE_EXCEPTION << "Unsupported cell type";
return algorithm::undef;
}
}
size_t gatesCount(algorithm alg) {
switch (alg) {
case algorithm::vanilla_rnn: return 1;
case algorithm::vanilla_gru:
case algorithm::lbr_gru: return 3;
case algorithm::vanilla_lstm: return 4;
default:
THROW_IE_EXCEPTION << "Unsupported cell type";
return 0;
}
}
size_t statesCount(algorithm alg) {
switch (alg) {
case algorithm::vanilla_rnn:
case algorithm::vanilla_gru:
case algorithm::lbr_gru: return 1;
case algorithm::vanilla_lstm: return 2;
default:
THROW_IE_EXCEPTION << "Unsupported cell type";
return 0;
}
}
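These helpers replace the dropped rnn_cell::desc::get_gates_count()/get_state_count() queries. Illustrative values, matching oneDNN's cell definitions:

    // gatesCount(algorithm::vanilla_lstm) == 4   (i, f, c~, o gates)
    // statesCount(algorithm::vanilla_lstm) == 2  (hidden h and cell c states)
    // gatesCount(algorithm::lbr_gru) == 3, and Gb = G + 1 below accounts for the
    // extra bias row that linear-before-reset GRU carries.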
@ -72,12 +90,14 @@ void MKLDNNRNN::fillCellDesc() {
if (!cellLayer)
THROW_IE_EXCEPTION << "No original layer for RNNCell.";
algorithm cell_type = ie2mkl(cellLayer->cellType);
algorithm cell_act = ie2mkl(cellLayer->activations[0]); // Works only for RNN with one gate
cell_type = ie2mkl(cellLayer->cellType);
cell_act = ie2mkl(cellLayer->activations[0]); // Works only for RNN with one gate
cell_desc = {cell_type, cell_act};
if (cellLayer->clip != 0.0f)
cell_desc.set_clipping(cellLayer->clip);
if (cellLayer->clip != 0.0f) {
// TODO [oneDNN]: No longer supported
THROW_IE_EXCEPTION << "Clipping is not supported for RNN primitive";
// cell_desc.set_clipping(cellLayer->clip);
}
auto &ins = cellLayer->insData;
auto &outs = cellLayer->outData;
@ -94,17 +114,17 @@ void MKLDNNRNN::fillCellDesc() {
if (in_data_dims.ndims() != 2 || in_h_state_dims.ndims() != 2)
THROW_IE_EXCEPTION << "Incorrect shape of input/output ports for layer " << getName();
G = cell_desc.get_gates_count();
S = cell_desc.get_state_count();
G = gatesCount(cell_type);
S = statesCount(cell_type);
T = 1;
N = in_data_dims[0];
DC = in_data_dims[1];
SC = in_h_state_dims[1];
Gb = (cell_type != gru_linear_before_reset) ? G : G + 1;
Gb = (cell_type != mkldnn::algorithm::lbr_gru) ? G : G + 1;
// Expected shapes
MKLDNNDims D_shape {N, DC}, S_shape {N, SC};
MKLDNNDims D_shape {N, DC}, S_shape {N, SC}, S_4D_shape {L, D, N, SC};
if (in_data_dims != D_shape
|| in_h_state_dims != S_shape
@ -135,33 +155,34 @@ void MKLDNNRNN::fillCellDesc() {
THROW_IE_EXCEPTION << "RNN Layer. Biases size is not correct. Expected size:" << G*SC;
// Shapes and Attributes are correct. Can start internal stuff initialization.
in_state_d = {{L, D, S, N, SC}, memory::f32, memory::ldsnc};
out_state_d = {{L, D, S, N, SC}, memory::f32, memory::ldsnc};
in_data_d = {{T, N, DC}, memory::f32, memory::tnc};
out_data_d = {{T, N, SC}, memory::f32, memory::tnc};
w_data_d = {{L, D, DC, G, SC}, memory::f32, memory::ldigo};
w_state_d = {{L, D, SC, G, SC}, memory::f32, memory::ldigo};
if (bias)
w_bias_d = {{L, D, Gb, SC}, memory::f32, memory::ldgo};
std::vector<TensorDesc> in_candidate, out_candidate;
std::vector<memory::format> outputFormats;
in_candidate.emplace_back(MKLDNNMemoryDesc {D_shape, memory::f32, memory::nc});
in_candidate.emplace_back(MKLDNNMemoryDesc {S_shape, memory::f32, memory::nc});
out_candidate.emplace_back(MKLDNNMemoryDesc {S_shape, memory::f32, memory::nc});
outputFormats.emplace_back(memory::nc);
if (S == 2) {
in_candidate.emplace_back(MKLDNNMemoryDesc {S_shape, memory::f32, memory::nc});
out_candidate.emplace_back(MKLDNNMemoryDesc {S_shape, memory::f32, memory::nc});
outputFormats.emplace_back(memory::nc);
for (size_t i = 0; i < S; i++) {
in_states_d.emplace_back(S_4D_shape, memory::data_type::f32, memory::format_tag::ldnc);
out_states_d.emplace_back(S_4D_shape, memory::data_type::f32, memory::format_tag::ldnc);
}
createDescriptor(in_candidate, out_candidate, outputFormats);
in_data_d = {{T, N, DC}, memory::data_type::f32, memory::format_tag::tnc};
out_data_d = {{T, N, SC}, memory::data_type::f32, memory::format_tag::tnc};
w_data_d = {{L, D, DC, G, SC}, memory::data_type::f32, memory::format_tag::ldigo};
w_state_d = {{L, D, SC, G, SC}, memory::data_type::f32, memory::format_tag::ldigo};
if (bias)
w_bias_d = {{L, D, Gb, SC}, memory::data_type::f32, memory::format_tag::ldgo};
std::vector<TensorDesc> in_candidate, out_candidate;
std::vector<memory::format_tag> outputFormats;
in_candidate.emplace_back(MKLDNNMemoryDesc {D_shape, memory::data_type::f32, memory::format_tag::nc});
in_candidate.emplace_back(MKLDNNMemoryDesc {S_shape, memory::data_type::f32, memory::format_tag::nc});
out_candidate.emplace_back(MKLDNNMemoryDesc {S_shape, memory::data_type::f32, memory::format_tag::nc});
outputFormats.emplace_back(memory::format_tag::nc);
if (S == 2) {
in_candidate.emplace_back(MKLDNNMemoryDesc {S_shape, memory::data_type::f32, memory::format_tag::nc});
out_candidate.emplace_back(MKLDNNMemoryDesc {S_shape, memory::data_type::f32, memory::format_tag::nc});
outputFormats.emplace_back(memory::format_tag::nc);
}
createDescriptor(in_candidate, out_candidate);
}
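The state descriptors change shape here because oneDNN v1.x drops the fused 5D ldsnc state tensor: each state becomes its own 4D ldnc memory. Roughly:

    // v0.x: one fused state tensor        v1.x: one 4D tensor per state
    //   {L, D, S, N, SC} / ldsnc     ->     S x {L, D, N, SC} / ldnc
    // For LSTM (S == 2) this yields separate hidden and cell memories, passed
    // later via DNNL_ARG_SRC_ITER / DNNL_ARG_SRC_ITER_C.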
void MKLDNNRNN::fillSeqDesc() {
@ -174,15 +195,16 @@ void MKLDNNRNN::fillSeqDesc() {
if (!one_of(rnnLayer->cellType, _RNN::LSTM, _RNN::GRU, _RNN::GRU_LBR, _RNN::RNN))
THROW_IE_EXCEPTION << "RNN layer supports only LSTM/GRU/RNN cell";
algorithm cell_type = ie2mkl(rnnLayer->cellType);
algorithm cell_act = algorithm_undef;
cell_type = ie2mkl(rnnLayer->cellType);
cell_act = algorithm::undef;
if (!rnnLayer->activations.empty())
cell_act = ie2mkl(rnnLayer->activations[0]); // Works only for RNN with one gate
cell_desc = {cell_type, cell_act};
if (rnnLayer->clip != 0.0f)
cell_desc.set_clipping(rnnLayer->clip);
// TODO [oneDNN]: No longer supported
if (rnnLayer->clip != 0.0f) {
THROW_IE_EXCEPTION << "Clipping is not supported for RNN primitive";
// cell_desc.set_clipping(rnnLayer->clip);
}
if (!one_of(rnnLayer->axis, 0, 1))
THROW_IE_EXCEPTION << "RNN layer supports only sequence axis 0 or 1";
@ -211,34 +233,33 @@ void MKLDNNRNN::fillSeqDesc() {
std::swap(out_data_dims[0], out_data_dims[1]);
}
G = cell_desc.get_gates_count();
S = cell_desc.get_state_count();
G = gatesCount(cell_type);
S = statesCount(cell_type);
T = in_data_dims[0];
N = in_data_dims[1];
DC = in_data_dims[2];
SC = out_data_dims[2];
Gb = (cell_type != gru_linear_before_reset) ? G : G + 1;
Gb = (cell_type != mkldnn::algorithm::lbr_gru) ? G : G + 1;
MKLDNNDims ID_shape {T, N, DC}, OD_shape {T, N, SC}, S_shape {N, SC};
MKLDNNDims ID_shape {T, N, DC}, OD_shape {T, N, SC}, S_shape {N, SC}, S_4D_shape {L, D, N, SC};
if (out_data_dims != OD_shape)
THROW_IE_EXCEPTION << "Incorrect shape of input/output ports for layer " << getName();
if (ins.size() > 1) {
for (int i = 1; i < ins.size(); i++)
if (getParentEdgeAt(i)->getDims() != S_shape)
THROW_IE_EXCEPTION << "Incorrect shape of state ports for layer " << getName();
in_states_d.resize(S);
out_states_d.resize(S);
in_state_d = {{L, D, S, N, SC}, memory::f32, memory::ldsnc};
for (int i = 1; i < ins.size(); i++) {
if (getParentEdgeAt(i)->getDims() != S_shape)
THROW_IE_EXCEPTION << "Incorrect shape of state ports for layer " << getName();
in_states_d[i - 1] = {S_4D_shape, memory::data_type::f32, memory::format_tag::ldnc};
}
if (outs.size() > 1) {
for (int i = 1; i < outs.size(); i++)
if (getChildEdgeAt(i)->getDims() != S_shape)
THROW_IE_EXCEPTION << "Incorrect shape of state ports for layer " << getName();
out_state_d = {{L, D, S, N, SC}, memory::f32, memory::ldsnc};
for (int i = 1; i < outs.size(); i++) {
if (getChildEdgeAt(i)->getDims() != S_shape)
THROW_IE_EXCEPTION << "Incorrect shape of state ports for layer " << getName();
out_states_d[i - 1] = {S_4D_shape, memory::data_type::f32, memory::format_tag::ldnc};
}
auto blobs = rnnLayer->blobs;
@ -252,60 +273,98 @@ void MKLDNNRNN::fillSeqDesc() {
if (weights->size() != G*SC*(SC+DC))
THROW_IE_EXCEPTION << "RNN Layer. Weights size is not correct. Expected size:" << G*SC*(SC+DC);
w_data_d = {{L, D, DC, G, SC}, memory::f32, memory::ldigo};
w_state_d = {{L, D, SC, G, SC}, memory::f32, memory::ldigo};
w_data_d = {{L, D, DC, G, SC}, memory::data_type::f32, memory::format_tag::ldigo};
w_state_d = {{L, D, SC, G, SC}, memory::data_type::f32, memory::format_tag::ldigo};
if (bias && bias->size() != Gb*SC)
THROW_IE_EXCEPTION << "RNN Layer. Biases size is not correct. Expected size:" << G*SC;
if (bias)
w_bias_d = {{L, D, Gb, SC}, memory::f32, memory::ldgo};
w_bias_d = {{L, D, Gb, SC}, memory::data_type::f32, memory::format_tag::ldgo};
// Try to create descriptor and corresponding configuration
in_data_d = {in_data_dims, memory::f32, memory::tnc};
out_data_d = {out_data_dims, memory::f32, memory::tnc};
in_data_d = {in_data_dims, memory::data_type::f32, memory::format_tag::tnc};
out_data_d = {out_data_dims, memory::data_type::f32, memory::format_tag::tnc};
std::vector<TensorDesc> in_candidate;
if (nativeOrder)
in_candidate.push_back(in_data_d);
else
in_candidate.push_back(MKLDNNMemoryDesc{{N, T, DC}, memory::f32, memory::ntc});
in_candidate.push_back(MKLDNNMemoryDesc{{N, T, DC}, memory::data_type::f32, memory::format_tag::ntc});
for (int i = 1; i < ins.size(); i++)
in_candidate.emplace_back(MKLDNNMemoryDesc {S_shape, memory::f32, memory::nc});
in_candidate.emplace_back(MKLDNNMemoryDesc {S_shape, memory::data_type::f32, memory::format_tag::nc});
std::vector<TensorDesc> out_candidate;
std::vector<memory::format> outputFormats;
if (nativeOrder) {
out_candidate.push_back(out_data_d);
outputFormats.push_back(out_data_d.getFormat());
} else {
out_candidate.push_back(MKLDNNMemoryDesc{{N, T, SC}, memory::f32, memory::ntc});
outputFormats.push_back(memory::ntc);
out_candidate.push_back(MKLDNNMemoryDesc{{N, T, SC}, memory::data_type::f32, memory::format_tag::ntc});
}
for (int i = 1; i < outs.size(); i++) {
out_candidate.emplace_back(MKLDNNMemoryDesc{S_shape, memory::f32, memory::nc});
outputFormats.push_back(memory::nc);
out_candidate.emplace_back(MKLDNNMemoryDesc{S_shape, memory::data_type::f32, memory::format_tag::nc});
}
createDescriptor(in_candidate, out_candidate, outputFormats);
createDescriptor(in_candidate, out_candidate);
}
void MKLDNNRNN::createDescriptor(const std::vector<TensorDesc> &inputDesc,
const std::vector<TensorDesc> &outputDesc,
const std::vector<memory::format> &outputFormats) {
MKLDNNDescriptor desc(std::shared_ptr<rnn_forward::desc>(
new rnn_forward::desc(forward_scoring, cell_desc,
direction,
/* In Data */ in_data_d,
/* In State */ in_state_d,
/* Weights data */ w_data_d,
/* Weights state */ w_state_d,
/* Bias */ w_bias_d,
/* Out Data */ out_data_d,
/* Out State */ out_state_d)));
descs.push_back(desc);
const std::vector<TensorDesc> &outputDesc) {
switch (cell_type) {
case mkldnn::algorithm::vanilla_rnn: {
MKLDNNDescriptor desc(std::shared_ptr<vanilla_rnn_forward::desc>(
new vanilla_rnn_forward::desc(prop_kind::forward_scoring, cell_act, direction,
/* In Data */ in_data_d,
/* In State */ in_states_d[0],
/* Weights data */ w_data_d,
/* Weights state */ w_state_d,
/* Bias */ w_bias_d,
/* Out Data */ out_data_d,
/* Out State */ out_states_d[0])));
descs.push_back(desc);
} break;
case mkldnn::algorithm::vanilla_gru: {
MKLDNNDescriptor desc(std::shared_ptr<gru_forward::desc>(
new gru_forward::desc(prop_kind::forward_scoring, direction,
/* In Data */ in_data_d,
/* In State */ in_states_d[0],
/* Weights data */ w_data_d,
/* Weights state */ w_state_d,
/* Bias */ w_bias_d,
/* Out Data */ out_data_d,
/* Out State */ out_states_d[0])));
descs.push_back(desc);
} break;
case mkldnn::algorithm::lbr_gru: {
MKLDNNDescriptor desc(std::shared_ptr<lbr_gru_forward::desc>(
new lbr_gru_forward::desc(prop_kind::forward_scoring, direction,
/* In Data */ in_data_d,
/* In State */ in_states_d[0],
/* Weights data */ w_data_d,
/* Weights state */ w_state_d,
/* Bias */ w_bias_d,
/* Out Data */ out_data_d,
/* Out State */ out_states_d[0])));
descs.push_back(desc);
} break;
case mkldnn::algorithm::vanilla_lstm: {
MKLDNNDescriptor desc(std::shared_ptr<lstm_forward::desc>(
new lstm_forward::desc(prop_kind::forward_scoring, direction,
/* In Data */ in_data_d,
/* In State H */ in_states_d[0],
/* In State C */ in_states_d[1],
/* Weights data */ w_data_d,
/* Weights state */ w_state_d,
/* Bias */ w_bias_d,
/* Out Data */ out_data_d,
/* Out State H */ out_states_d[0],
/* Out State C */ out_states_d[1])));
descs.push_back(desc);
} break;
default:
THROW_IE_EXCEPTION << "Unknown cell type";
}
// Fill supported config
InferenceEngine::LayerConfig config;
@ -326,7 +385,7 @@ void MKLDNNRNN::createDescriptor(const std::vector<TensorDesc> &inputDesc,
config.outConfs.push_back(dataConfig);
}
supportedPrimitiveDescriptors.emplace_back(config, ref_any, outputFormats);
supportedPrimitiveDescriptors.emplace_back(config, ref_any);
}
void MKLDNNRNN::createPrimitive() {
@ -342,8 +401,7 @@ void MKLDNNRNN::createPrimitive() {
&& getCnnLayer()->blobs["biases"]->getTensorDesc().getPrecision() != Precision::FP32)
THROW_IE_EXCEPTION << errorPrefix << " has invalid biases precision: " << getCnnLayer()->blobs["biases"]->getTensorDesc().getPrecision();
std::shared_ptr<rnn_forward::desc> d = descs[0];
rnn_forward::primitive_desc pd(*d, getEngine());
auto pd = descs[0].createPrimitiveDescriptorIterator(getEngine());
auto src_data_mem = getParentEdgeAt(0)->getMemoryPtr();
auto dst_data_mem = getChildEdgeAt(0)->getMemoryPtr();
@ -387,22 +445,22 @@ void MKLDNNRNN::createPrimitive() {
const int gate_map_lstm_size = sizeof(gate_map_lstm) / sizeof(int);
const int gate_map_gru_size = sizeof(gate_map_gru) / sizeof(int);
const int gate_map_rnn_size = sizeof(gate_map_rnn) / sizeof(int);
if (cell_desc.get_cell_kind() == vanilla_lstm) {
if (cell_type == algorithm::vanilla_lstm) {
gate_map = gate_map_lstm;
if (G > gate_map_lstm_size) {
THROW_IE_EXCEPTION << "G isn't equal to the size of gate_map";
}
} else if (cell_desc.get_cell_kind() == vanilla_gru) {
} else if (cell_type == algorithm::vanilla_gru) {
gate_map = gate_map_gru;
if (G > gate_map_gru_size) {
THROW_IE_EXCEPTION << "G isn't equal to the size of gate_map";
}
} else if (cell_desc.get_cell_kind() == gru_linear_before_reset) {
} else if (cell_type == algorithm::lbr_gru) {
gate_map = gate_map_gru;
if (G > gate_map_gru_size) {
THROW_IE_EXCEPTION << "G isn't equal to the size of gate_map";
}
} else if (cell_desc.get_cell_kind() == vanilla_rnn) {
} else if (cell_type == algorithm::vanilla_rnn) {
gate_map = gate_map_rnn;
if (G > gate_map_rnn_size) {
THROW_IE_EXCEPTION << "G isn't equal to the size of gate_map";
@ -448,76 +506,48 @@ void MKLDNNRNN::createPrimitive() {
}
}
auto src_state_mem = std::make_shared<MKLDNNMemory>(getEngine());
src_state_mem->Create(in_state_d);
internalBlobMemory.push_back(src_state_mem);
if (in_state_d) {
int offset = 0;
for (int i = 0; i < S; i++) {
/* create copy/concat primitive */
auto src_stat = getParentEdgeAt(i+1)->getMemory().GetPrimitive();
auto state_mem = std::make_shared<MKLDNNMemory>(getEngine());
state_mem->Create(
src_stat.get_primitive_desc().desc(),
static_cast<uint8_t *>(src_state_mem->GetPrimitive().get_data_handle()) + offset);
offset += src_stat.get_primitive_desc().get_size();
internalBlobMemory.push_back(state_mem);
exec_before.emplace_back(src_stat, state_mem->GetPrimitive());
}
}
auto dst_state_mem = std::make_shared<MKLDNNMemory>(getEngine());
dst_state_mem->Create(out_state_d);
internalBlobMemory.push_back(dst_state_mem);
if (out_state_d) {
int offset = 0;
int idx_start = is_cell ? 0 : 1;
for (int i = 0; i < S; i++) {
/* create copy/split primitive */
auto dst_stat = getChildEdgeAt(idx_start + i)->getMemory().GetPrimitive();
auto state_mem = std::make_shared<MKLDNNMemory>(getEngine());
state_mem->Create(
dst_stat.get_primitive_desc().desc(),
static_cast<uint8_t *>(dst_state_mem->GetPrimitive().get_data_handle()) + offset);
offset += dst_stat.get_primitive_desc().get_size();
internalBlobMemory.push_back(state_mem);
if (is_cell && i == 0) continue;
exec_after.emplace_back(state_mem->GetPrimitive(), dst_stat);
}
}
auto workspace_mem = std::make_shared<MKLDNNMemory>(getEngine());
workspace_mem->Create({}, memory::f32, memory::format_undef, nullptr); // stub, not in use
internalBlobMemory.push_back(workspace_mem);
auto p = new rnn_forward(pd,
/* In Data */ src_data_mem ->GetPrimitive(),
/* In State */ src_state_mem->GetPrimitive(),
/* Weights data */ w_data_mem ->GetPrimitive(),
/* Weights state */ w_state_mem ->GetPrimitive(),
/* Bias */ w_bias_mem ->GetPrimitive(),
/* Out Data */ dst_data_mem ->GetPrimitive(),
/* Out State */ dst_state_mem->GetPrimitive(),
/* Workspace */ workspace_mem->GetPrimitive());
prim.reset(p);
prim.reset(new mkldnn::primitive(pd));
}
void MKLDNNRNN::execute(mkldnn::stream strm) {
if (!exec_before.empty())
strm.submit({exec_before.begin(), exec_before.end()});
if (!prim)
THROW_IE_EXCEPTION << "No initialized primitive to execute";
if (prim)
strm.submit({*prim});
const auto src_data_mem = getParentEdgeAt(0)->getMemoryPtr();
const auto dst_data_mem = getChildEdgeAt(0)->getMemoryPtr();
if (!exec_after.empty())
strm.submit({exec_after.begin(), exec_after.end()});
const auto &wgh_data_mem = internalBlobMemory[0];
const auto &wgh_stat_mem = internalBlobMemory[1];
const auto &wgh_bias_mem = internalBlobMemory[2];
std::unordered_map<int, memory> args {
{DNNL_ARG_SRC_LAYER, src_data_mem->GetPrimitive()},
{DNNL_ARG_WEIGHTS_LAYER, wgh_data_mem->GetPrimitive()},
{DNNL_ARG_WEIGHTS_ITER, wgh_stat_mem->GetPrimitive()},
{DNNL_ARG_BIAS, wgh_bias_mem->GetPrimitive()},
{DNNL_ARG_DST_LAYER, dst_data_mem->GetPrimitive()},
};
int state_i_tags[] {DNNL_ARG_SRC_ITER, DNNL_ARG_SRC_ITER_C};
int state_o_tags[] {DNNL_ARG_DST_ITER, DNNL_ARG_DST_ITER_C};
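// SRC/DST_ITER carry the hidden state; the *_ITER_C tags address the LSTM cell
// state and are only reached when S == 2 (algorithm::vanilla_lstm).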
for (size_t s = 0; s < S; s++) {
args[state_i_tags[s]] = getParentEdgeAt(s+1)->getMemoryPtr()->GetPrimitive();
}
if (is_cell) {
for (size_t s = 0; s < S; s++) {
args[state_o_tags[s]] = getChildEdgesAtPort(s)[0]->getMemoryPtr()->GetPrimitive();
}
} else {
ptrdiff_t n_ports_with_init_states = outDims.size() - 1; // first is a sequence data
for (size_t s = 0; s < std::min(S, n_ports_with_init_states); s++) {
if (s < inDims.size()) {
args[state_o_tags[s]] = getChildEdgesAtPort(s+1)[0]->getMemoryPtr()->GetPrimitive();
}
}
}
(*prim).execute(strm, args);
}
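This is the general oneDNN v1.x execution model: primitives no longer capture their memories at construction, so every execute() call passes an argument map keyed by DNNL_ARG_* tags. A minimal sketch of the pattern (hypothetical memories, illustration only):

    // Hypothetical sketch of the v1.x stateless execution contract.
    std::unordered_map<int, mkldnn::memory> args {
        {DNNL_ARG_SRC_LAYER, src_mem},    // input sequence
        {DNNL_ARG_WEIGHTS_LAYER, w_mem},  // input-to-hidden weights
        {DNNL_ARG_DST_LAYER, dst_mem},    // output sequence
    };
    rnn_prim.execute(strm, args);         // enqueue on the stream, no submit()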
REG_MKLDNN_PRIM_FOR(MKLDNNRNN, RNNCell);

View File

@ -20,10 +20,8 @@ public:
void getSupportedDescriptors() override;
void createPrimitive() override;
bool created() const override;
using MKLDNNNode::createDescriptor;
void createDescriptor(const std::vector<InferenceEngine::TensorDesc>& inputDesc,
const std::vector<InferenceEngine::TensorDesc>& outputDesc,
const std::vector<mkldnn::memory::format> &outputFormats);
const std::vector<InferenceEngine::TensorDesc>& outputDesc) override;
void execute(mkldnn::stream strm) override;
@ -39,10 +37,13 @@ private:
bool nativeOrder = true;
/** Direction of iteration through sequence dimension */
mkldnn::rnn_direction direction = mkldnn::unidirectional;
mkldnn::rnn_direction direction = mkldnn::rnn_direction::unidirectional;
/** RNN Cell desc (type/activation_alg/clip)*/
mkldnn::rnn_cell::desc cell_desc { mkldnn::algorithm::vanilla_lstm };
/** RNN Cell type (type/activation_alg/clip)*/
mkldnn::algorithm cell_type = mkldnn::algorithm::vanilla_lstm;
/** activation type for vanilla RNN cell */
mkldnn::algorithm cell_act = mkldnn::algorithm::eltwise_tanh;
// Internal attributes
ptrdiff_t N = 0; /**< Batch value */
@ -58,8 +59,8 @@ private:
MKLDNNMemoryDesc in_data_d;
MKLDNNMemoryDesc out_data_d;
MKLDNNMemoryDesc in_state_d;
MKLDNNMemoryDesc out_state_d;
std::vector<MKLDNNMemoryDesc> in_states_d;
std::vector<MKLDNNMemoryDesc> out_states_d;
MKLDNNMemoryDesc w_data_d;
MKLDNNMemoryDesc w_state_d;

View File

@ -11,7 +11,7 @@
#include <mkldnn_extension_utils.h>
#include <mkldnn_types.h>
#include <utils/bfloat16.hpp>
#include <cpu_isa_traits.hpp>
#include <cpu/x64/cpu_isa_traits.hpp>
#include "ie_parallel.hpp"
#include <mkldnn_selective_build.h>
@ -19,6 +19,7 @@ using namespace MKLDNNPlugin;
using namespace InferenceEngine;
using namespace mkldnn;
using namespace mkldnn::impl::cpu;
using namespace mkldnn::impl::cpu::x64;
MKLDNNROIAlignNode::MKLDNNROIAlignNode(const InferenceEngine::CNNLayerPtr& layer, const mkldnn::engine& eng,
MKLDNNWeightsSharing::Ptr &cache)
@ -100,17 +101,17 @@ void MKLDNNROIAlignNode::initSupportedPrimitiveDescriptors() {
config.inConfs.resize(3);
config.outConfs.resize(1);
std::vector<std::pair<memory::format, memory::format> > supportedFormats {
{memory::nchw, memory::nchw},
{memory::nhwc, memory::nhwc},
{memory::nChw16c, memory::nChw16c},
{memory::nChw8c, memory::nChw8c}
std::vector<std::pair<memory::format_tag, memory::format_tag>> supportedFormats {
{memory::format_tag::nchw, memory::format_tag::nchw},
{memory::format_tag::nhwc, memory::format_tag::nhwc},
{memory::format_tag::nChw16c, memory::format_tag::nChw16c},
{memory::format_tag::nChw8c, memory::format_tag::nChw8c}
};
for (auto fmts : supportedFormats) {
config.inConfs[0].desc = MKLDNNMemoryDesc(getParentEdgeAt(0)->getDims(), inputDataType, fmts.first);
config.inConfs[1].desc = MKLDNNMemoryDesc(getParentEdgeAt(1)->getDims(), memory::f32, memory::nc);
config.inConfs[2].desc = MKLDNNMemoryDesc(getParentEdgeAt(2)->getDims(), memory::s32, memory::x);
config.inConfs[1].desc = MKLDNNMemoryDesc(getParentEdgeAt(1)->getDims(), memory::data_type::f32, memory::format_tag::nc);
config.inConfs[2].desc = MKLDNNMemoryDesc(getParentEdgeAt(2)->getDims(), memory::data_type::s32, memory::format_tag::x);
config.outConfs[0].desc = MKLDNNMemoryDesc(getChildEdgeAt(0)->getDims(), outputDataType, fmts.second);
supportedPrimitiveDescriptors.push_back({config, impl_desc_type::unknown, fmts.second});
}
@ -153,16 +154,17 @@ void MKLDNNROIAlignNode::executeSpecified() {
auto &srcMemory1 = getParentEdgeAt(1)->getMemory();
auto &dstMemory = getChildEdgeAt(0)->getMemory();
auto srcBlockDesc = srcMemory0.GetDescriptor().data.layout_desc.blocking;
auto dstBlockDesc = dstMemory.GetDescriptor().data.layout_desc.blocking;
auto srcBlockDesc = srcMemory0.GetDescriptor().data.format_desc.blocking;
auto dstBlockDesc = dstMemory.GetDescriptor().data.format_desc.blocking;
int blockSize = srcBlockDesc.block_dims[1];
auto selectedFmt = srcMemory0.GetDescriptor().data.format;
int blockSize = srcBlockDesc.inner_nblks > 0 ? srcBlockDesc.inner_blks[0] : 1;
auto isPlainFmt = srcMemory0.GetDesc().isPlainFormat();
auto isNhwcFmt = srcMemory0.GetDesc().isTailCFormat();
const auto *srcData = reinterpret_cast<const inputType *>(getDataPtr(getParentEdgeAt(0)->getMemory()));
const auto *srcRoi = reinterpret_cast<const float *>(getDataPtr(getParentEdgeAt(1)->getMemory()));
const auto *srcRoiIdx = reinterpret_cast<const int *>(getDataPtr(getParentEdgeAt(2)->getMemory()));
auto *dst = reinterpret_cast<outputType *>(getDataPtr(getChildEdgeAt(0)->getMemory()));
const auto *srcData = reinterpret_cast<const inputType *>(getParentEdgeAt(0)->getMemoryPtr()->GetPtr());
const auto *srcRoi = reinterpret_cast<const float *>(getParentEdgeAt(1)->getMemoryPtr()->GetPtr());
const auto *srcRoiIdx = reinterpret_cast<const int *>(getParentEdgeAt(2)->getMemoryPtr()->GetPtr());
auto *dst = reinterpret_cast<outputType *>(getChildEdgeAt(0)->getMemoryPtr()->GetPtr());
auto nominalRoiCount = static_cast<int>(srcMemory1.GetDims()[0]);
int realRois = 0;
@ -173,11 +175,11 @@ void MKLDNNROIAlignNode::executeSpecified() {
const int binCount = pooledH * pooledW;
const int hInputStride = srcBlockDesc.strides[0][2];
const int wInputStride = srcBlockDesc.strides[0][3];
const int hOutputStride = dstBlockDesc.strides[0][2];
const int wOutputStride = dstBlockDesc.strides[0][3];
const int chPadding = srcBlockDesc.padding_dims[1];
const int hInputStride = srcBlockDesc.strides[2];
const int wInputStride = srcBlockDesc.strides[3];
const int hOutputStride = dstBlockDesc.strides[2];
const int wOutputStride = dstBlockDesc.strides[3];
const int chPadding = srcMemory0.GetDescriptor().data.padded_dims[1];
const int blockCount = chPadding / blockSize;
for (; realRois < nominalRoiCount; realRois++) {
@ -317,7 +319,7 @@ void MKLDNNROIAlignNode::executeSpecified() {
xBinInd_ * wOutputStride + blockResidual_;
dst[dstIndex] = pooledValue;
};
if (selectedFmt == mkldnn_nhwc) {
if (isNhwcFmt) {
parallel_for2d(pooledH, pooledW, [&](int yBinInd, int xBinInd) {
for (int c = 0; c < C; c++) {
size_t binOffsetInput = roiBatchInd * C * H * W + c;
@ -330,7 +332,7 @@ void MKLDNNROIAlignNode::executeSpecified() {
int cStart = blkIdx * blockSize;
int cEnd = (blkIdx == blockCount - 1 ? C : cStart + blockSize);
for (int c = cStart; c < cEnd; c++) {
const int blockResidual = (selectedFmt == mkldnn_nchw ? 0 : c % blockSize);
const int blockResidual = (isPlainFmt ? 0 : c % blockSize);
const int blockIdx = (c / blockSize) * blockSize;
size_t binOffsetInput = (roiBatchInd * chPadding + blockIdx) * H * W;
size_t binOffsetOutput = (n * chPadding + blockIdx) * binCount;
@ -341,11 +343,6 @@ void MKLDNNROIAlignNode::executeSpecified() {
}
}
inline uint8_t* MKLDNNROIAlignNode::getDataPtr(const MKLDNNMemory& memoryPtr) const {
return reinterpret_cast<uint8_t*>(memoryPtr.GetData()) + memoryPtr.GetDescriptor().data.layout_desc.blocking.offset_padding *
MKLDNNExtensionUtils::sizeOfDataType(mkldnn::memory::data_type(memoryPtr.GetDescriptor().data.data_type));
}
bool MKLDNNROIAlignNode::created() const {
return getType() == ROIAlign;
}
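For reference, the blocking-descriptor migration applied in this node (v0.x -> v1.x):

    // layout_desc.blocking.block_dims[1]   -> format_desc.blocking.inner_blks[0]
    //                                         (guarded by inner_nblks > 0)
    // layout_desc.blocking.strides[0][i]   -> format_desc.blocking.strides[i]
    // layout_desc.blocking.padding_dims[1] -> data.padded_dims[1]
    // Raw enum checks (mkldnn_nhwc, mkldnn_nchw) give way to the plugin helpers
    // GetDesc().isPlainFormat() / isTailCFormat().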

View File

@ -39,7 +39,6 @@ private:
void executeSpecified();
template<typename T>
struct ROIAlignExecute;
inline uint8_t* getDataPtr(const MKLDNNMemory& memoryPtr) const;
};
} // namespace MKLDNNPlugin

View File

@ -3,16 +3,249 @@
//
#include "mkldnn_roi_pooling_node.h"
#include "desc_iterator.hpp"
#include <legacy/ie_layers.h>
#include <mkldnn.hpp>
#include <string>
#include <vector>
#include <math.h>
#include <mkldnn_extension_utils.h>
#include <cpu/x64/jit_generator.hpp>
#include "ie_parallel.hpp"
using namespace mkldnn;
using namespace MKLDNNPlugin;
using namespace InferenceEngine;
using namespace mkldnn;
using namespace mkldnn::impl;
using namespace mkldnn::impl::cpu::x64;
using namespace mkldnn::impl::utils;
using namespace Xbyak;
#define GET_OFF(field) offsetof(jit_roi_pooling_call_args, field)
template <cpu_isa_t isa>
struct jit_uni_roi_pooling_kernel_f32 : public jit_uni_roi_pooling_kernel, public jit_generator {
DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_uni_roi_pooling_kernel_f32)
explicit jit_uni_roi_pooling_kernel_f32(jit_roi_pooling_params jcp) : jit_uni_roi_pooling_kernel(jcp), jit_generator() {}
void create_ker() override {
jit_generator::create_kernel();
ker_ = (decltype(ker_))jit_ker();
};
void generate() override {
this->preamble();
Label exit_label;
Label tail_label;
mov(reg_input, ptr[this->param1 + GET_OFF(src)]);
mov(reg_output, ptr[this->param1 + GET_OFF(dst)]);
mov(reg_bin_area, ptr[this->param1 + GET_OFF(bin_area)]);
mov(reg_c_blocks, ptr[this->param1 + GET_OFF(c_blocks)]);
if (jpp_.alg == ROIPoolingOpType::Max) {
mov(reg_kh, ptr[this->param1 + GET_OFF(kh)]);
mov(reg_kw, ptr[this->param1 + GET_OFF(kw)]);
} else {
mov(reg_yf, ptr[this->param1 + GET_OFF(yf)]);
mov(reg_xf, ptr[this->param1 + GET_OFF(xf)]);
mov(reg_yoff, ptr[this->param1 + GET_OFF(yoff)]);
mov(reg_xoff, ptr[this->param1 + GET_OFF(xoff)]);
}
int nb_c_tail = jpp_.nb_c % jpp_.nb_c_blocking;
cmp(reg_c_blocks, jpp_.nb_c_blocking);
jne(nb_c_tail ? tail_label : exit_label, T_NEAR);
loop_body(jpp_.nb_c_blocking);
jmp(exit_label, T_NEAR);
if (nb_c_tail) {
L(tail_label);
loop_body(nb_c_tail);
}
L(exit_label);
this->postamble();
}
private:
using Vmm = typename conditional3<isa == cpu::x64::sse41, Xbyak::Xmm, isa == cpu::x64::avx2,
Xbyak::Ymm, Xbyak::Zmm>::type;
const int vlen = cpu_isa_traits<isa>::vlen;
Vmm vmm_mask = Vmm(0);
Vmm vmm_zero = Vmm(0);
Xmm xmm_yf = Xmm(0);
Vmm vmm_yf = Vmm(0);
Xmm xmm_xf = Xmm(1);
Vmm vmm_xf = Vmm(1);
Vmm get_acc_reg(int idx) { return Vmm(2*idx + 1); }
Vmm get_src_reg(int idx) { return Vmm(2*idx + 2); }
Opmask k_store_mask = Opmask(7);
const unsigned char _cmp_lt_os = 1;
using reg64_t = const Xbyak::Reg64;
reg64_t reg_input = r8;
reg64_t aux_reg_input = rax;
reg64_t aux_reg_input1 = rdx;
reg64_t reg_output = r9;
reg64_t reg_kh = r10;
reg64_t reg_kw = r11;
reg64_t h_iter = r14;
reg64_t w_iter = r15;
reg64_t reg_c_blocks = rbx;
reg64_t reg_bin_area = rdx;
reg64_t reg_yf = reg_kh;
reg64_t reg_xf = reg_kw;
reg64_t reg_yoff = h_iter;
reg64_t reg_xoff = r12;
void roi_pool_max(int c_blocks) {
Label h_loop_label;
Label w_loop_label;
mov(aux_reg_input, reg_input);
for (int i = 0; i < c_blocks; i++) {
Vmm vmm_max = get_acc_reg(i);
uni_vmovups(vmm_max, ptr[reg_input + i * jpp_.ih * jpp_.iw * jpp_.c_block * sizeof(float)]);
}
xor_(h_iter, h_iter);
L(h_loop_label); {
xor_(w_iter, w_iter);
mov(aux_reg_input1, aux_reg_input);
L(w_loop_label); {
for (int i = 0; i < c_blocks; i++) {
Vmm vmm_max = get_acc_reg(i);
Vmm vmm_src = get_src_reg(i);
uni_vmovups(vmm_src, ptr[aux_reg_input1 + i * jpp_.ih * jpp_.iw * jpp_.c_block * sizeof(float)]);
if (isa == cpu::x64::sse41) {
movups(vmm_mask, vmm_max);
cmpps(vmm_mask, vmm_src, _cmp_lt_os);
blendvps(vmm_max, vmm_src);
} else if (isa == cpu::x64::avx2) {
vcmpps(vmm_mask, vmm_max, vmm_src, _cmp_lt_os);
vblendvps(vmm_max, vmm_max, vmm_src, vmm_mask);
} else if (isa == cpu::x64::avx512_common) {
vcmpps(k_store_mask, vmm_max, vmm_src, _cmp_lt_os);
vblendmps(vmm_max | k_store_mask, vmm_max, vmm_src);
}
}
add(aux_reg_input1, jpp_.c_block * sizeof(float));
inc(w_iter);
cmp(w_iter, reg_kw);
jl(w_loop_label, T_NEAR);
}
add(aux_reg_input, jpp_.iw * jpp_.c_block * sizeof(float));
inc(h_iter);
cmp(h_iter, reg_kh);
jl(h_loop_label, T_NEAR);
}
for (int i = 0; i < c_blocks; i++) {
Vmm vmm_dst = get_acc_reg(i);
uni_vmovups(ptr[reg_output + i * jpp_.oh * jpp_.ow * jpp_.c_block * sizeof(float)], vmm_dst);
}
}
void roi_pool_bilinear(int c_blocks) {
movq(xmm_yf, reg_yf);
uni_vbroadcastss(vmm_yf, xmm_yf);
movq(xmm_xf, reg_xf);
uni_vbroadcastss(vmm_xf, xmm_xf);
Vmm vmm_src00 = get_src_reg(0);
Vmm vmm_src01 = get_src_reg(1);
Vmm vmm_src10 = get_src_reg(2);
Vmm vmm_src11 = get_src_reg(3);
for (int i = 0; i < c_blocks; i++) {
int src_c_off = i * jpp_.ih * jpp_.iw * jpp_.c_block * sizeof(float);
mov(aux_reg_input, reg_input);
uni_vmovups(vmm_src00, ptr[aux_reg_input + src_c_off]);
add(aux_reg_input, reg_xoff);
uni_vmovups(vmm_src01, ptr[aux_reg_input + src_c_off]);
add(aux_reg_input, reg_yoff);
uni_vmovups(vmm_src11, ptr[aux_reg_input + src_c_off]);
sub(aux_reg_input, reg_xoff);
uni_vmovups(vmm_src10, ptr[aux_reg_input + src_c_off]);
uni_vsubps(vmm_src01, vmm_src01, vmm_src00);
uni_vfmadd213ps(vmm_src01, vmm_xf, vmm_src00);
uni_vsubps(vmm_src11, vmm_src11, vmm_src10);
uni_vfmadd213ps(vmm_src11, vmm_xf, vmm_src10);
uni_vsubps(vmm_src11, vmm_src11, vmm_src01);
uni_vfmadd213ps(vmm_src11, vmm_yf, vmm_src01);
int dst_c_off = i * jpp_.oh * jpp_.ow * jpp_.c_block * sizeof(float);
uni_vmovups(ptr[reg_output + dst_c_off], vmm_src11);
}
}
void empty_roi(int c_blocks) {
uni_vpxor(vmm_zero, vmm_zero, vmm_zero);
for (int i = 0; i < c_blocks; i++) {
uni_vmovups(ptr[reg_output + i * jpp_.oh * jpp_.ow * jpp_.c_block * sizeof(float)], vmm_zero);
}
}
void loop_body(int c_blocks) {
Label empty_roi_label;
Label exit_label;
cmp(reg_bin_area, 0);
je(empty_roi_label, T_NEAR);
if (jpp_.alg == ROIPoolingOpType::Max)
roi_pool_max(c_blocks);
else
roi_pool_bilinear(c_blocks);
if (isa == cpu::x64::sse41) {
add(reg_input, 4 * sizeof(float));
add(reg_output, 4 * sizeof(float));
if (jpp_.alg == ROIPoolingOpType::Max)
roi_pool_max(c_blocks);
else
roi_pool_bilinear(c_blocks);
}
jmp(exit_label, T_NEAR);
L(empty_roi_label);
empty_roi(c_blocks);
if (isa == cpu::x64::sse41) {
add(reg_output, 4 * sizeof(float));
empty_roi(c_blocks);
}
L(exit_label);
}
};
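One detail worth noting in loop_body(): the plugin's channel block is 8 floats for both AVX2 and SSE4.1 (see jpp.c_block below), but an XMM register holds only four, so the SSE4.1 path runs the pooling body twice with src/dst advanced by 4 * sizeof(float). Compressed, with do_body standing in for roi_pool_max/roi_pool_bilinear/empty_roi, the pattern is:

    // Covering an 8-float channel block with 4-wide XMM operations.
    do_body(c_blocks);                      // floats 0..3 of each block
    if (isa == cpu::x64::sse41) {
        add(reg_input, 4 * sizeof(float));  // step to floats 4..7
        add(reg_output, 4 * sizeof(float));
        do_body(c_blocks);                  // second 4-wide pass
    }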
MKLDNNROIPoolingNode::MKLDNNROIPoolingNode(const InferenceEngine::CNNLayerPtr& layer, const mkldnn::engine& eng,
MKLDNNWeightsSharing::Ptr &cache)
@ -22,94 +255,312 @@ void MKLDNNROIPoolingNode::getSupportedDescriptors() {
if (!descs.empty())
return;
InferenceEngine::Precision precision = getCnnLayer()->insData[0].lock()->getPrecision();
if (precision != InferenceEngine::Precision::FP32)
precision = InferenceEngine::Precision::FP32;
auto inputDataType = MKLDNNExtensionUtils::IEPrecisionToDataType(precision);
precision = getCnnLayer()->outData[0]->getPrecision();
if (precision != InferenceEngine::Precision::FP32)
precision = InferenceEngine::Precision::FP32;
auto outputDataType = MKLDNNExtensionUtils::IEPrecisionToDataType(precision);
GenericLayer* genericLayer = getCnnLayer().get();
if (genericLayer == nullptr)
THROW_IE_EXCEPTION << "Cannot convert ROIPooling layer.";
if (getParentEdges().empty())
THROW_IE_EXCEPTION << "Incorrect number of input edges for layer " << getName();
std::string errorPrefix = "ROIPooling layer with name '" + getName() + "' ";
if (getParentEdges().size() != 2)
THROW_IE_EXCEPTION << errorPrefix << "has incorrect number of input edges: " << getParentEdges().size();
if (getChildEdges().empty())
THROW_IE_EXCEPTION << "Incorrect number of output edges for layer " << getName();
THROW_IE_EXCEPTION << errorPrefix << "has incorrect number of output edges: " << getChildEdges().size();
if (getParentEdgeAt(0)->getDims().ndims() != 4) {
THROW_IE_EXCEPTION << errorPrefix << "doesn't support 0th input with rank: " << getParentEdgeAt(0)->getDims().ndims();
}
if (getParentEdgeAt(1)->getDims().ndims() != 2) {
THROW_IE_EXCEPTION << errorPrefix << "doesn't support 1st input with rank: " << getParentEdgeAt(1)->getDims().ndims();
}
if (getChildEdgeAt(0)->getDims().ndims() != 4) {
THROW_IE_EXCEPTION << errorPrefix << "doesn't support output with rank: " << getChildEdgeAt(0)->getDims().ndims();
}
if (getParentEdgeAt(1)->getDims()[1] != 5) {
THROW_IE_EXCEPTION << errorPrefix << "has invalid shape on 1st input: ["
<< getParentEdgeAt(1)->getDims()[0] << "," << getParentEdgeAt(1)->getDims()[1] << "]";
}
pooled_h = genericLayer->GetParamAsInt("pooled_h");
pooled_w = genericLayer->GetParamAsInt("pooled_w");
spatial_scale = genericLayer->GetParamAsFloat("spatial_scale");
std::string m = genericLayer->GetParamAsString("method", "max");
if (m == "max") {
method = mkldnn::algorithm::roi_pooling_max;
opType = ROIPoolingOpType::Max;
} else if (m == "bilinear") {
method = mkldnn::algorithm::roi_pooling_bilinear;
opType = ROIPoolingOpType::Bilinear;
} else {
THROW_IE_EXCEPTION << "Unsupported roi pooling method";
THROW_IE_EXCEPTION << errorPrefix << "doesn't support roi pooling method: " << m;
}
}
void MKLDNNROIPoolingNode::initSupportedPrimitiveDescriptors() {
if (!supportedPrimitiveDescriptors.empty())
return;
InferenceEngine::LayerConfig config;
config.dynBatchSupport = false;
config.inConfs.resize(2);
config.inConfs[0].constant = false;
config.inConfs[0].inPlace = -1;
config.inConfs[1].constant = false;
config.inConfs[1].inPlace = -1;
config.outConfs.resize(1);
config.outConfs[0].constant = false;
config.outConfs[0].inPlace = -1;
auto parentDims = getParentEdgeAt(0)->getDims();
for (auto format : getAvailableFormatsForDims(parentDims)) {
std::vector<InferenceEngine::TensorDesc> srcs;
srcs.push_back(MKLDNNMemoryDesc(getParentEdgeAt(0)->getDims(), inputDataType, format));
srcs.push_back(MKLDNNMemoryDesc(getParentEdgeAt(1)->getDims(), inputDataType, memory::nc));
MKLDNNMemoryDesc out_candidate(getChildEdgeAt(0)->getDims(), outputDataType, format);
createDescriptor(srcs, {out_candidate});
auto format = mayiuse(avx512_common) ? memory::format_tag::nChw16c : memory::format_tag::nChw8c;
impl_desc_type impl_type;
if (mayiuse(cpu::x64::avx512_common)) {
impl_type = impl_desc_type::jit_avx512;
} else if (mayiuse(cpu::x64::avx2)) {
impl_type = impl_desc_type::jit_avx2;
} else if (mayiuse(cpu::x64::sse41)) {
impl_type = impl_desc_type::jit_sse42;
} else {
impl_type = impl_desc_type::ref;
}
config.inConfs[0].desc = MKLDNNMemoryDesc(getParentEdgeAt(0)->getDims(), memory::data_type::f32, format);
config.inConfs[1].desc = MKLDNNMemoryDesc(getParentEdgeAt(1)->getDims(), memory::data_type::f32, memory::format_tag::nc);
config.outConfs[0].desc = MKLDNNMemoryDesc(getChildEdgeAt(0)->getDims(), memory::data_type::f32, format);
supportedPrimitiveDescriptors.push_back({config, impl_type, format});
}
void MKLDNNROIPoolingNode::createPrimitive() {
if (prim)
return;
auto config = getSelectedPrimitiveDescriptor()->getConfig();
std::vector<memory::desc> srcs;
for (size_t i = 0; i < getParentEdges().size(); i++) {
srcs.push_back(getParentEdgeAt(i)->getMemory().GetDescriptor());
const int simd_w = mayiuse(cpu::x64::avx512_common) ? 16 : 8;
jpp.c_block = simd_w;
auto inDims = config.inConfs[0].desc.getDims();
auto outDims = config.outConfs[0].desc.getDims();
jpp.mb = outDims[0];
jpp.c = rnd_up(inDims[1], simd_w);
jpp.ih = inDims[2];
jpp.iw = inDims[3];
jpp.oh = outDims[2];
jpp.ow = outDims[3];
jpp.spatial_scale = spatial_scale;
jpp.pooled_h = pooled_h;
jpp.pooled_w = pooled_w;
jpp.nb_c = jpp.c / jpp.c_block;
jpp.nb_c_blocking = mayiuse(cpu::x64::avx512_common) ? 15 : 7;
jpp.alg = opType;
if (mayiuse(cpu::x64::avx512_common)) {
roi_pooling_kernel.reset(new jit_uni_roi_pooling_kernel_f32<cpu::x64::avx512_common>(jpp));
} else if (mayiuse(cpu::x64::avx2)) {
roi_pooling_kernel.reset(new jit_uni_roi_pooling_kernel_f32<cpu::x64::avx2>(jpp));
} else if (mayiuse(cpu::x64::sse41)) {
roi_pooling_kernel.reset(new jit_uni_roi_pooling_kernel_f32<cpu::x64::sse41>(jpp));
}
memory::desc out_candidate = getChildEdgeAt(0)->getMemory().GetDescriptor();
MKLDNNDescriptor desc(std::shared_ptr<roi_pooling_forward::desc>(
new roi_pooling_forward::desc(prop_kind::forward_scoring, method, srcs, out_candidate, pooled_h, pooled_w,
spatial_scale)));
descs[0] = desc;
std::shared_ptr<roi_pooling_forward::desc> selected_desc_ptr = descs[0];
const PrimitiveDescInfo *selected_pd = getSelectedPrimitiveDescriptor();
if (selected_pd == nullptr)
THROW_IE_EXCEPTION << "Preferable primitive descriptor is not set for node " << getName() << ".";
auto prim_desc = roi_pooling_forward::primitive_desc(*selected_desc_ptr, getEngine());
primitive_desc_iterator itpd = descs[0].createPrimitiveDescriptorIterator(getEngine());
std::vector<primitive::at> src_p;
for (size_t i = 0; i < getParentEdges().size(); i++) {
src_p.push_back(getParentEdgeAt(i)->getMemoryPtr()->GetPrimitive());
}
prim.reset(new roi_pooling_forward(prim_desc, src_p, getChildEdgeAt(0)->getMemory().GetPrimitive()));
if (roi_pooling_kernel)
roi_pooling_kernel->create_ker();
}
void MKLDNNROIPoolingNode::execute(mkldnn::stream strm) {
auto &srcMemory0 = getParentEdgeAt(0)->getMemory();
auto &srcMemory1 = getParentEdgeAt(1)->getMemory();
auto &dstMemory = getChildEdgeAt(0)->getMemory();
const auto *src_data = reinterpret_cast<const float *>(srcMemory0.GetPtr());
const auto *src_roi = reinterpret_cast<const float *>(srcMemory1.GetPtr());
float *dst = reinterpret_cast<float *>(dstMemory.GetPtr());
auto config = getSelectedPrimitiveDescriptor()->getConfig();
auto src_strides = config.inConfs[0].desc.getBlockingDesc().getStrides();
auto dst_strides = config.outConfs[0].desc.getBlockingDesc().getStrides();
int cb_work = impl::utils::div_up(jpp.nb_c, jpp.nb_c_blocking);
int MB = jpp.mb;
size_t src_roi_step = config.inConfs[1].desc.getBlockingDesc().getStrides()[0];
int real_rois = 0;
for (; real_rois < MB; real_rois++) {
size_t roi_off = real_rois * src_roi_step;
const float *src_roi_ptr = &src_roi[roi_off];
int roi_batch_ind = static_cast<int>(src_roi_ptr[0]);
if (roi_batch_ind == -1) {
break;
}
}
parallel_for4d(MB, cb_work, jpp.oh, jpp.ow, [&](int n, int cbb, int oh, int ow) {
auto arg = jit_roi_pooling_call_args();
int cb = cbb * jpp.nb_c_blocking;
int cb_num = jpp.nb_c_blocking;
int c_block = jpp.c_block;
arg.c_blocks = std::min(cb + cb_num, jpp.nb_c) - cb;
if (n >= real_rois) {
if (roi_pooling_kernel) {
arg.bin_area = 0;
arg.dst = &dst[n * dst_strides[0] + cb * dst_strides[1] + oh * dst_strides[2] + ow * dst_strides[3]];
(*roi_pooling_kernel)(&arg);
} else {
for (int c = 0; c < c_block; c++) {
dst[n * dst_strides[0] + cb * dst_strides[1] + oh * dst_strides[2] + ow * dst_strides[3] + c] = 0;
}
}
} else {
size_t roi_off = n * src_roi_step;
const float* src_roi_ptr = &src_roi[roi_off];
int roi_batch_ind = static_cast<int>(src_roi_ptr[0]);
if (jpp.alg == ROIPoolingOpType::Max) {
int roi_start_w = static_cast<int>(round(src_roi_ptr[1] * jpp.spatial_scale));
int roi_start_h = static_cast<int>(round(src_roi_ptr[2] * jpp.spatial_scale));
int roi_end_w = static_cast<int>(round(src_roi_ptr[3] * jpp.spatial_scale));
int roi_end_h = static_cast<int>(round(src_roi_ptr[4] * jpp.spatial_scale));
int roi_height = std::max(roi_end_h - roi_start_h + 1, 1);
int roi_width = std::max(roi_end_w - roi_start_w + 1, 1);
int hstart = (oh * roi_height) / jpp.pooled_h;
if ((hstart * jpp.pooled_h) > (oh * roi_height)) {
--hstart;
}
int wstart = (ow * roi_width) / jpp.pooled_w;
if ((wstart * jpp.pooled_w) > (ow * roi_width)) {
--wstart;
}
int hend = ((oh + 1) * roi_height) / jpp.pooled_h;
if ((hend * jpp.pooled_h) < ((oh + 1) * roi_height)) {
++hend;
}
int wend = ((ow + 1) * roi_width) / jpp.pooled_w;
if ((wend * jpp.pooled_w) < ((ow + 1) * roi_width)) {
++wend;
}
hstart = std::min(std::max(hstart + roi_start_h, 0), jpp.ih);
hend = std::min(std::max(hend + roi_start_h, 0), jpp.ih);
wstart = std::min(std::max(wstart + roi_start_w, 0), jpp.iw);
wend = std::min(std::max(wend + roi_start_w, 0), jpp.iw);
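// The adjust-by-one branches above compute exact integer floor/ceil of the
// proportional bin bounds: hstart = floor(oh * roi_height / pooled_h) and
// hend = ceil((oh + 1) * roi_height / pooled_h), likewise for wstart/wend,
// with both then clamped into [0, ih] and [0, iw].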
if (roi_pooling_kernel) {
arg.src = &src_data[roi_batch_ind * src_strides[0] + cb * src_strides[1] + hstart * src_strides[2] + wstart * src_strides[3]];
arg.dst = &dst[n * dst_strides[0] + cb * dst_strides[1] + oh * dst_strides[2] + ow * dst_strides[3]];
arg.bin_area = (hend - hstart) * (wend - wstart);
arg.kh = hend - hstart;
arg.kw = wend - wstart;
} else {
for (int c = 0; c < c_block; c++) {
const size_t pool_index = n * dst_strides[0] + cb * dst_strides[1] + oh * dst_strides[2] + ow * dst_strides[3] + c;
if ((hend <= hstart) || (wend <= wstart)) {
dst[pool_index] = 0;
} else {
for (int h = hstart; h < hend; ++h) {
for (int w = wstart; w < wend; ++w) {
float batch_data = src_data[roi_batch_ind * src_strides[0] + cb * src_strides[1] +
h * src_strides[2] + w * src_strides[3] + c];
if (batch_data > dst[pool_index]) {
dst[pool_index] = batch_data;
}
}
}
}
}
}
} else {
float roi_start_w_ = src_roi_ptr[1];
float roi_start_h_ = src_roi_ptr[2];
float roi_end_w_ = src_roi_ptr[3];
float roi_end_h_ = src_roi_ptr[4];
float height_scale = ((roi_end_h_ - roi_start_h_) * (jpp.ih - 1)) / (jpp.pooled_h - 1);
float width_scale = ((roi_end_w_ - roi_start_w_) * (jpp.iw - 1)) / (jpp.pooled_w - 1);
float in_y = (oh * height_scale + roi_start_h_ * (jpp.ih - 1));
float in_x = (ow * width_scale + roi_start_w_ * (jpp.iw - 1));
if (in_y < 0 || in_y > jpp.ih - 1 || in_x < 0 || in_x > jpp.iw - 1) {
if (roi_pooling_kernel) {
arg.bin_area = 0;
arg.dst = &dst[n * dst_strides[0] + cb * dst_strides[1] + oh * dst_strides[2] + ow * dst_strides[3]];
} else {
for (int c = 0; c < c_block; c++) {
dst[n * dst_strides[0] + cb * dst_strides[1] + oh * dst_strides[2] + ow * dst_strides[3] + c] = 0;
}
}
} else {
int top_y_index = static_cast<int>(floorf(in_y));
int bottom_y_index = static_cast<int>(ceilf(in_y));
int left_x_index = static_cast<int>(floorf(in_x));
int right_x_index = static_cast<int>(ceilf(in_x));
if (right_x_index > jpp.iw - 1)
right_x_index = jpp.iw - 1;
if (bottom_y_index > jpp.ih - 1)
bottom_y_index = jpp.ih - 1;
if (roi_pooling_kernel) {
arg.dst = &dst[n * dst_strides[0] + cb * dst_strides[1] + oh * dst_strides[2] + ow * dst_strides[3]];
arg.xf = in_x - left_x_index;
arg.yf = in_y - top_y_index;
arg.xoff = (size_t) ((right_x_index - left_x_index) * jpp.c_block * sizeof(float));
arg.yoff = (size_t) ((bottom_y_index - top_y_index) * jpp.iw * jpp.c_block * sizeof(float));
arg.src = &src_data[roi_batch_ind * src_strides[0] + cb * src_strides[1] +
top_y_index * src_strides[2] + left_x_index * src_strides[3]];
arg.bin_area = 1;
} else {
for (int c = 0; c < c_block; c++) {
const float top_left = src_data[roi_batch_ind * src_strides[0] + cb * src_strides[1] +
top_y_index * src_strides[2] + left_x_index * src_strides[3] + c];
const float top_right = src_data[roi_batch_ind * src_strides[0] + cb * src_strides[1] +
top_y_index * src_strides[2] + right_x_index * src_strides[3] + c];
const float bottom_left = src_data[roi_batch_ind * src_strides[0] + cb * src_strides[1] +
bottom_y_index * src_strides[2] + left_x_index * src_strides[3] + c];
const float bottom_right = src_data[roi_batch_ind * src_strides[0] + cb * src_strides[1] +
bottom_y_index * src_strides[2] + right_x_index * src_strides[3] + c];
const float top = top_left + (top_right - top_left) * (in_x - left_x_index);
const float bottom = bottom_left + (bottom_right - bottom_left) * (in_x - left_x_index);
dst[n * dst_strides[0] + cb * dst_strides[1] + oh * dst_strides[2] + ow * dst_strides[3] + c] =
top + (bottom - top) * (in_y - top_y_index);
}
}
}
}
if (roi_pooling_kernel) {
(*roi_pooling_kernel)(&arg);
}
}
});
}
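The scalar bilinear branch is a standard two-axis lerp. The same math as a hypothetical standalone helper, for illustration:

    // Lerp along x on the top and bottom rows, then lerp along y between them;
    // xf = in_x - left_x_index, yf = in_y - top_y_index as in the loop above.
    static inline float bilinear(float tl, float tr, float bl, float br,
                                 float xf, float yf) {
        const float top = tl + (tr - tl) * xf;     // x-lerp, top row
        const float bottom = bl + (br - bl) * xf;  // x-lerp, bottom row
        return top + (bottom - top) * yf;          // y-lerp between rows
    }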
bool MKLDNNROIPoolingNode::created() const {
return getType() == ROIPooling;
}
void MKLDNNROIPoolingNode::createDescriptor(const std::vector<InferenceEngine::TensorDesc> &inputDesc,
const std::vector<InferenceEngine::TensorDesc> &outputDesc) {
std::vector<memory::desc> srcs;
srcs.push_back(MKLDNNMemoryDesc(inputDesc[0]));
srcs.push_back(MKLDNNMemoryDesc(inputDesc[1]));
MKLDNNMemoryDesc out_candidate(outputDesc[0]);
MKLDNNDescriptor desc(std::shared_ptr<roi_pooling_forward::desc>(
new roi_pooling_forward::desc(prop_kind::forward_scoring, method, srcs, out_candidate, pooled_h, pooled_w,
spatial_scale)));
descs.push_back(desc);
}
REG_MKLDNN_PRIM_FOR(MKLDNNROIPoolingNode, ROIPooling);

View File

@ -12,22 +12,77 @@
namespace MKLDNNPlugin {
enum ROIPoolingOpType {
Max,
Bilinear
};
struct jit_roi_pooling_params {
int mb, c;
int ih, iw, oh, ow;
int c_block, nb_c, nb_c_blocking;
double spatial_scale;
int pooled_h;
int pooled_w;
ROIPoolingOpType alg;
};
struct jit_roi_pooling_call_args {
const float *src;
float *dst;
size_t kh;
size_t kw;
size_t bin_area;
size_t c_blocks;
float xf;
float yf;
size_t xoff;
size_t yoff;
};
struct jit_uni_roi_pooling_kernel {
void (*ker_)(const jit_roi_pooling_call_args *);
void operator()(const jit_roi_pooling_call_args *args) {
assert(ker_);
ker_(args);
}
explicit jit_uni_roi_pooling_kernel(jit_roi_pooling_params jpp) : ker_(nullptr), jpp_(jpp) {}
virtual ~jit_uni_roi_pooling_kernel() {}
virtual void create_ker() = 0;
jit_roi_pooling_params jpp_;
};
class MKLDNNROIPoolingNode : public MKLDNNNode {
public:
MKLDNNROIPoolingNode(const InferenceEngine::CNNLayerPtr& layer, const mkldnn::engine& eng, MKLDNNWeightsSharing::Ptr &cache);
~MKLDNNROIPoolingNode() override = default;
void getSupportedDescriptors() override;
void createDescriptor(const std::vector<InferenceEngine::TensorDesc>& inputDesc,
const std::vector<InferenceEngine::TensorDesc>& outputDesc) override;
void initSupportedPrimitiveDescriptors() override;
void createPrimitive() override;
void execute(mkldnn::stream strm) override;
bool created() const override;
private:
int pooled_h = 0;
int pooled_w = 0;
float spatial_scale = 0;
mkldnn::algorithm method = mkldnn::algorithm::roi_pooling_max;
ROIPoolingOpType opType = Max;
jit_roi_pooling_params jpp = {};
std::shared_ptr<jit_uni_roi_pooling_kernel> roi_pooling_kernel = nullptr;
};
} // namespace MKLDNNPlugin

View File

@ -3,8 +3,6 @@
//
#include "mkldnn_scatter_update_node.h"
#include "desc_iterator.hpp"
#include "mkldnn_quantize_node.h"
#include <legacy/ie_layers.h>
#include <mkldnn.hpp>
#include <string>
@ -192,13 +190,13 @@ void MKLDNNScatterUpdateNode::initSupportedPrimitiveDescriptors() {
config.inConfs[AXIS_ID].inPlace = -1;
}
auto pushDesc = [&](memory::format inFormat, memory::format idxFormat, memory::format updateFormat, memory::format outFormat) {
auto pushDesc = [&](memory::format_tag inFormat, memory::format_tag idxFormat, memory::format_tag updateFormat, memory::format_tag outFormat) {
config.inConfs[DATA_ID].desc = MKLDNNMemoryDesc(getParentEdgeAt(DATA_ID)->getDims(), dataType, inFormat);
config.inConfs[INDICES_ID].desc = MKLDNNMemoryDesc(getParentEdgeAt(INDICES_ID)->getDims(), indicesType, idxFormat);
config.inConfs[UPDATE_ID].desc = MKLDNNMemoryDesc(getParentEdgeAt(UPDATE_ID)->getDims(), dataType, updateFormat);
if (axisRelaxed)
config.inConfs[AXIS_ID].desc = MKLDNNMemoryDesc(getParentEdgeAt(AXIS_ID)->getDims(),
MKLDNNExtensionUtils::IEPrecisionToDataType(axisPrec), memory::x);
MKLDNNExtensionUtils::IEPrecisionToDataType(axisPrec), memory::format_tag::x);
config.outConfs[0].desc = MKLDNNMemoryDesc(getChildEdgeAt(0)->getDims(), dataType, outFormat);
supportedPrimitiveDescriptors.push_back({config, impl_desc_type::unknown, outFormat});
};
@ -264,14 +262,10 @@ void MKLDNNScatterUpdateNode::execute(mkldnn::stream strm) {
auto &indicesMemPtr = getParentEdgeAt(INDICES_ID)->getMemoryPtr();
auto &updateMemPtr = getParentEdgeAt(UPDATE_ID)->getMemoryPtr();
uint8_t *dstPtr = reinterpret_cast<uint8_t*>(dstMemPtr->GetData()) +
dstMemPtr->GetDescriptor().data.layout_desc.blocking.offset_padding * dataSize;
uint8_t *srcPtr = reinterpret_cast<uint8_t*>(srcMemPtr->GetData()) +
srcMemPtr->GetDescriptor().data.layout_desc.blocking.offset_padding * dataSize;
uint8_t *indicesPtr = reinterpret_cast<uint8_t*>(indicesMemPtr->GetData()) +
indicesMemPtr->GetDescriptor().data.layout_desc.blocking.offset_padding * indicesSize;
uint8_t *updatePtr = reinterpret_cast<uint8_t*>(updateMemPtr->GetData()) +
updateMemPtr->GetDescriptor().data.layout_desc.blocking.offset_padding * dataSize;
uint8_t *dstPtr = reinterpret_cast<uint8_t*>(dstMemPtr->GetPtr());
uint8_t *srcPtr = reinterpret_cast<uint8_t*>(srcMemPtr->GetPtr());
uint8_t *indicesPtr = reinterpret_cast<uint8_t*>(indicesMemPtr->GetPtr());
uint8_t *updatePtr = reinterpret_cast<uint8_t*>(updateMemPtr->GetPtr());
SizeVector srcDataDim = getParentEdgeAt(DATA_ID)->getDesc().getDims();
SizeVector indicesDim = getParentEdgeAt(INDICES_ID)->getDesc().getDims();
@ -281,7 +275,7 @@ void MKLDNNScatterUpdateNode::execute(mkldnn::stream strm) {
if (axisRelaxed) {
auto &axisMemPtr = getParentEdgeAt(AXIS_ID)->getMemoryPtr();
uint8_t *axisPtr = reinterpret_cast<uint8_t*>(axisMemPtr->GetData()) +
axisMemPtr->GetDescriptor().data.layout_desc.blocking.offset_padding * axisSize;
axisMemPtr->GetDescriptor().data.offset0 * axisSize;
if (axisSize == 4) {
auto *axisPtr32 = reinterpret_cast<int32_t*>(axisPtr);
axis = *axisPtr32;
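
The four GetPtr() calls above replace the v0.x idiom of adding layout_desc.blocking.offset_padding to GetData() by hand. A minimal sketch of what such a helper boils down to against the oneDNN v1.x descriptor (illustrative only: base_ptr and elem_size are names local to this sketch, not plugin API):

#include <mkldnn.hpp>
#include <cstdint>

// Illustrative: return a pointer to the first valid element of a memory
// object, applying the descriptor's starting offset. In v1.x that offset is
// data.offset0, replacing v0.x layout_desc.blocking.offset_padding.
static uint8_t* base_ptr(const mkldnn::memory& mem, size_t elem_size) {
    const auto md = mem.get_desc();
    return static_cast<uint8_t*>(mem.get_data_handle())
           + md.data.offset0 * elem_size;
}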

View File

@ -3,7 +3,7 @@
//
#include "mkldnn_softmax_node.h"
#include "desc_iterator.hpp"
#include <legacy/ie_layers.h>
#include <string>
#include <mkldnn_types.h>
@ -41,7 +41,7 @@ void MKLDNNSoftMaxNode::getSupportedDescriptors() {
}
if (getParentEdgeAt(0)->getDims().ndims() == 3) {
MKLDNNMemoryDesc in_candidate(getParentEdgeAt(0)->getDims(), inputDataType, memory::format::blocked);
MKLDNNMemoryDesc in_candidate(getParentEdgeAt(0)->getDims(), inputDataType, memory::format_tag::abc);
createDescriptor({in_candidate}, {});
}
@ -73,18 +73,22 @@ void MKLDNNSoftMaxNode::createPrimitive() {
auto prim_desc = softmax_forward::primitive_desc(*selected_desc_ptr, getEngine());
primitive_desc_iterator itpd = descs[0].createPrimitiveDescriptorIterator(getEngine());
while (itpd.is_not_end()) {
impl_desc_type impl_type = parse_impl_name(itpd.get_impl_info_str());
while (itpd) {
impl_desc_type impl_type = parse_impl_name(itpd.impl_info_str());
auto primitiveDescriptor = getSelectedPrimitiveDescriptor();
if ((primitiveDescriptor != nullptr) && (impl_type == primitiveDescriptor->getImplementationType())) {
itpd.getPrimitiveDescriptor(prim_desc);
prim_desc = itpd.get();
break;
}
itpd++;
if (!itpd.next_impl())
break;
}
prim.reset(new softmax_forward(prim_desc, getParentEdgeAt(0)->getMemory().GetPrimitive(),
getChildEdgeAt(0)->getMemory().GetPrimitive()));
prim.reset(new softmax_forward(prim_desc));
auto src = getParentEdgesAtPort(0)[0]->getMemoryPtr()->GetPrimitive();
auto dst = getChildEdgesAtPort(0)[0]->getMemoryPtr()->GetPrimitive();
primArgs = {{DNNL_ARG_SRC, src}, {DNNL_ARG_DST, dst}};
}
bool MKLDNNSoftMaxNode::created() const {
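
The softmax hunks above capture the core v1.x execution-model change: a primitive is constructed from its primitive descriptor alone, and memory objects are bound per call through an argument map instead of being baked into the primitive at construction time. A self-contained sketch of the same pattern outside the plugin (engine, stream, and shapes are invented for illustration):

#include <mkldnn.hpp>

using namespace mkldnn;

int main() {
    engine eng(engine::kind::cpu, 0);
    stream strm(eng);

    // Plain 2D f32 tensor; softmax over axis 1.
    memory::desc md({2, 10}, memory::data_type::f32, memory::format_tag::ab);
    memory src(md, eng), dst(md, eng);

    auto d = softmax_forward::desc(prop_kind::forward_inference, md, /*axis=*/1);
    auto pd = softmax_forward::primitive_desc(d, eng);

    // v1.x: the primitive is built from the descriptor alone...
    softmax_forward prim(pd);
    // ...and memory is supplied at execution time via an argument map,
    // matching the primArgs = {{DNNL_ARG_SRC, ...}} line above.
    prim.execute(strm, {{DNNL_ARG_SRC, src}, {DNNL_ARG_DST, dst}});
    strm.wait();
    return 0;
}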

View File

@ -58,11 +58,6 @@ static TensorDesc makeChannelBlockedTensorDesc(const Precision& precision, const
return TensorDesc(precision, srcDims, {blkDims, order});
}
static inline uint8_t* getDataPtr(const MKLDNNMemory& memoryPtr) {
return reinterpret_cast<uint8_t*>(memoryPtr.GetData()) + memoryPtr.GetDescriptor().data.layout_desc.blocking.offset_padding *
MKLDNNExtensionUtils::sizeOfDataType(mkldnn::memory::data_type(memoryPtr.GetDescriptor().data.data_type));
}
MKLDNNSplitNode::MKLDNNSplitNode(const InferenceEngine::CNNLayerPtr& layer, const mkldnn::engine& eng, MKLDNNWeightsSharing::Ptr &cache) :
MKLDNNNode(layer, eng, cache) {}
@ -139,7 +134,7 @@ void MKLDNNSplitNode::initSupportedPrimitiveDescriptors() {
config.inConfs[0].desc = getTensorDesc(precision, srcDims.ToSizeVector());
config.outConfs.resize(outDims.size());
std::vector<memory::format> outFormats;
std::vector<memory::format_tag> outFormats;
for (size_t i = 0; i < outDims.size(); i++) {
auto o_Dims = outDims[i];
@ -197,7 +192,7 @@ void MKLDNNSplitNode::initSupportedPrimitiveDescriptors() {
const auto& blkDims = refConfig.inConfs[0].desc.getBlockingDesc().getBlockDims();
auto numOfDim = blkDims.size();
std::vector<memory::format> outFormats;
std::vector<memory::format_tag> outFormats;
SizeVector offsets(numOfDim, 0lu);
SizeVector strides(numOfDim);
strides.back() = 1lu;
@ -245,7 +240,7 @@ void MKLDNNSplitNode::execute(mkldnn::stream strm) {
return;
int MB = batchToProcess();
uint8_t* srcData = getDataPtr(this->getParentEdgeAt(0)->getMemory());
uint8_t* srcData = reinterpret_cast<uint8_t*>(this->getParentEdgeAt(0)->getMemoryPtr()->GetPtr());
size_t batch = this->getParentEdgeAt(0)->getDims()[0];
if (batch != MB)
@ -385,9 +380,6 @@ void MKLDNNSplitNode::setDynamicBatchLim(int lim) {
THROW_ERROR << "Dynamic batch is not supported by split layer with axis == 0 parameter";
dynBatchLim = lim;
if (prim) {
prim.setBatchLimit(batchToProcess(), getParentEdges().size(), getChildEdges().size());
}
}
void MKLDNNSplitNode::prepareOptimizedParams() {
@ -418,7 +410,7 @@ void MKLDNNSplitNode::prepareOptimizedParams() {
optimizedParams.dataSize.resize(this->getChildEdges().size());
optimizedParams.dstMemPtrs.clear();
for (int i = 0; i < this->getChildEdges().size(); i++) {
if (uint8_t* dstData = getDataPtr(this->getChildEdgeAt(i)->getMemory())) {
if (uint8_t* dstData = reinterpret_cast<uint8_t*>(this->getChildEdgeAt(i)->getMemoryPtr()->GetPtr())) {
optimizedParams.dstMemPtrs.push_back(dstData);
} else {
THROW_ERROR << "can't get child edge index " << i << " data.";
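
A rename that recurs throughout these hunks: the v0.x memory::format enum becomes memory::format_tag in oneDNN v1.x, and bare enumerators gain the scope qualifier. A side-by-side reminder (comments only, no new API):

// v0.x (Intel MKL-DNN)               ->  v1.x (oneDNN)
// memory::format fmt = memory::nchw; ->  memory::format_tag fmt = memory::format_tag::nchw;
// memory::x                          ->  memory::format_tag::x
// memory::format::blocked            ->  an explicit tag such as memory::format_tag::abc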

View File

@ -3,7 +3,7 @@
//
#include "mkldnn_tensoriterator_node.h"
#include "desc_iterator.hpp"
#include <legacy/ie_layers.h>
#include <legacy/ie_layers_internal.hpp>
#include <string>
@ -50,7 +50,8 @@ static InferenceEngine::LayerConfig make_plain_config(const InferenceEngine::CNN
class PortIteratorHelper : public PortMapHelper {
public:
PortIteratorHelper(const MKLDNNMemoryPtr &from, const MKLDNNMemoryPtr &to, bool sliced_src,
const InferenceEngine::TensorIterator::PortMap &slice_rule, const mkldnn::engine& eng) {
const InferenceEngine::TensorIterator::PortMap &slice_rule, const mkldnn::engine& eng)
: sliced_src(sliced_src) {
const auto &full_blob = sliced_src ? from : to;
const auto &part_blob = !sliced_src ? from : to;
@ -71,56 +72,59 @@ public:
// make chunk view
auto chunk_desc = full_blob->GetDescriptor();
chunk_desc.data.dims[axis] = abs_stride;
chunk_desc.data.layout_desc.blocking.padding_dims[axis] = abs_stride; // TODO: assumption that the tensor is plain
chunk_desc.data.padded_dims[axis] = abs_stride; // TODO: assumption that the tensor is plain
mem_holder.push_back(full_blob->GetPrimitive());
auto full_mem_handler = full_blob->GetPrimitive().get_data_handle();
mem_holder.emplace_back(mkldnn::memory::primitive_desc(chunk_desc, eng), full_mem_handler);
auto &chunk_mem_prim = mem_holder.back();
full_mem = full_blob->GetPrimitive();
const auto full_mem_handler = full_mem.get_data_handle();
mkldnn::memory chunk_mem = {chunk_desc, eng, full_mem_handler};
auto elem_size = MKLDNNExtensionUtils::sizeOfDataType(mkldnn::memory::data_type(chunk_desc.data.data_type));
chunk_stride_in_byte = chunk_desc.data.layout_desc.blocking.strides[0][axis] * elem_size * abs_stride;
chunk_stride_in_byte = chunk_desc.data.format_desc.blocking.strides[axis] * elem_size * abs_stride;
chunk_offset_in_byte = sign_of_stride < 0 ? (iter_count - 1) * chunk_stride_in_byte : 0;
chunk_stride_in_byte *= sign_of_stride;
if (sliced_src) {
reorders.emplace_back(chunk_mem_prim, to->GetPrimitive());
mem_holder_src = chunk_mem;
mem_holder_dst = to->GetPrimitive();
} else {
reorders.emplace_back(from->GetPrimitive(), chunk_mem_prim);
mem_holder_src = from->GetPrimitive();
mem_holder_dst = chunk_mem;
}
reorder = {mem_holder_src, mem_holder_dst};
}
void execute(mkldnn::stream strm, int iter) override {
IE_ASSERT(iter >= 0 && iter < iter_count);
auto full_mem = mem_holder[FULL_DATA];
auto chunk_mem = mem_holder[CHUNK_DATA];
auto &chunk_mem = sliced_src ? mem_holder_src : mem_holder_dst;
chunk_mem.set_data_handle(static_cast<uint8_t *>(full_mem.get_data_handle()) +
chunk_offset_in_byte + chunk_stride_in_byte * iter);
strm.submit({reorders.begin(), reorders.end()});
reorder.execute(strm, mem_holder_src, mem_holder_dst);
}
private:
ptrdiff_t chunk_stride_in_byte = 0;
ptrdiff_t chunk_offset_in_byte = 0;
const int FULL_DATA = 0;
const int CHUNK_DATA = 1;
bool sliced_src;
mkldnn::memory full_mem;
int iter_count;
};
class BackEdgePortHelper : public PortMapHelper {
public:
BackEdgePortHelper(const MKLDNNMemoryPtr &from, const MKLDNNMemoryPtr &to, const mkldnn::engine& eng) {
reorders.emplace_back(from->GetPrimitive(), to->GetPrimitive());
mem_holder_src = from->GetPrimitive();
mem_holder_dst = to->GetPrimitive();
reorder = {mem_holder_src, mem_holder_dst};
}
void execute(mkldnn::stream strm, int iter) override {
if (iter != 0) {
strm.submit({reorders.begin(), reorders.end()});
reorder.execute(strm, mem_holder_src, mem_holder_dst);
}
}
};
@ -129,13 +133,13 @@ class IterCountPortHelper : public PortMapHelper {
public:
IterCountPortHelper(const MKLDNNMemoryPtr &to, const mkldnn::engine& eng) {
// Only a scalar I32 tensor is supported
IE_ASSERT(to->GetDataType() == memory::s32);
IE_ASSERT(to->GetDataType() == memory::data_type::s32);
IE_ASSERT(to->GetDims() == memory::dims{1});
mem_holder.push_back(to->GetPrimitive());
mem_holder_dst = to->GetPrimitive();
}
void execute(mkldnn::stream strm, int n_iter) override {
auto mem = mem_holder[0];
auto mem = mem_holder_dst;
auto data_ptr = static_cast<uint32_t*>(mem.get_data_handle());
*data_ptr = n_iter;
}
@ -144,14 +148,13 @@ public:
class asBoolCheck : public PortChecker {
public:
asBoolCheck(const MKLDNNMemoryPtr &mem) {
IE_ASSERT(mem->GetDataType() == memory::u8);
IE_ASSERT(mem->GetDataType() == memory::data_type::u8);
IE_ASSERT(mem->GetDims() == memory::dims{1});
mem_holder.push_back(mem->GetPrimitive());
mem_holder = mem->GetPrimitive();
}
int getStatus() override {
auto mem = mem_holder[0];
auto data_ptr = static_cast<uint8_t*>(mem.get_data_handle());
auto data_ptr = static_cast<uint8_t*>(mem_holder.get_data_handle());
return *data_ptr == static_cast<uint8_t>(0) ? 0 : 1;
}
};
@ -159,14 +162,13 @@ public:
class asIntCheck : public PortChecker {
public:
asIntCheck(const MKLDNNMemoryPtr &mem) {
IE_ASSERT(mem->GetDataType() == memory::s32);
IE_ASSERT(mem->GetDataType() == memory::data_type::s32);
IE_ASSERT(mem->GetDims() == memory::dims{1});
mem_holder.push_back(mem->GetPrimitive());
mem_holder = mem->GetPrimitive();
}
int getStatus() override {
auto mem = mem_holder[0];
auto data_ptr = static_cast<uint32_t*>(mem.get_data_handle());
auto data_ptr = static_cast<uint32_t*>(mem_holder.get_data_handle());
return *data_ptr;
}
};
@ -185,7 +187,8 @@ private:
} // namespace MKLDNNPlugin
MKLDNNTensorIteratorNode::MKLDNNTensorIteratorNode(InferenceEngine::CNNLayerPtr layer, const mkldnn::engine& eng, MKLDNNWeightsSharing::Ptr &cache) :
MKLDNNNode(layer, eng, cache) {}
MKLDNNNode(layer, eng, cache),
sub_graph(eng) {}
void MKLDNNTensorIteratorNode::getSupportedDescriptors() {
auto *ti = dynamic_cast<class InferenceEngine::TensorIterator*>(getCnnLayer().get());
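
PortIteratorHelper and BackEdgePortHelper above switch from queueing reorders into a stream (v0.x strm.submit) to executing a single mkldnn::reorder directly against its source and destination. A standalone sketch of the v1.x call (dims and formats are arbitrary):

#include <mkldnn.hpp>

using namespace mkldnn;

int main() {
    engine eng(engine::kind::cpu, 0);
    stream strm(eng);

    memory::dims dims = {1, 8, 4, 4};
    memory::desc src_md(dims, memory::data_type::f32, memory::format_tag::nchw);
    memory::desc dst_md(dims, memory::data_type::f32, memory::format_tag::nChw8c);
    memory src(src_md, eng), dst(dst_md, eng);

    // v1.x: a reorder is a primitive like any other; the convenience
    // execute(stream, src, dst) overload is what the helpers now call.
    reorder r(src, dst);
    r.execute(strm, src, dst);
    strm.wait();
    return 0;
}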

View File

@ -23,8 +23,9 @@ public:
virtual ~PortMapHelper() = default;
virtual void execute(mkldnn::stream strm, int n_iter = -1) = 0;
protected:
std::vector<mkldnn::reorder> reorders;
std::vector<mkldnn::memory> mem_holder;
mkldnn::reorder reorder;
mkldnn::memory mem_holder_src;
mkldnn::memory mem_holder_dst;
};
@ -38,7 +39,7 @@ public:
virtual ~PortChecker() = default;
virtual int getStatus() = 0;
protected:
std::vector<mkldnn::memory> mem_holder;
mkldnn::memory mem_holder;
};

View File

@ -45,7 +45,7 @@ void MKLDNNTileNode::initSupportedPrimitiveDescriptors() {
auto outputDataType = MKLDNNExtensionUtils::IEPrecisionToDataType(precision);
auto& inDims = getParentEdgeAt(0)->getDims();
memory::format fmt = MKLDNNMemory::GetPlainFormat(inDims);
memory::format_tag fmt = MKLDNNMemory::GetPlainFormat(inDims);
InferenceEngine::LayerConfig config;
config.dynBatchSupport = true;
@ -76,10 +76,8 @@ void MKLDNNTileNode::createPrimitive() {
void MKLDNNTileNode::execute(mkldnn::stream strm) {
auto& srcMemory = getParentEdgeAt(0)->getMemory();
const float *src_ptr = reinterpret_cast<const float*>(srcMemory.GetData()) +
srcMemory.GetDescriptor().data.layout_desc.blocking.offset_padding;
float *dst_ptr = reinterpret_cast<float*>(getChildEdgeAt(0)->getMemory().GetData()) +
getChildEdgeAt(0)->getMemory().GetDescriptor().data.layout_desc.blocking.offset_padding;
const float *src_ptr = reinterpret_cast<const float*>(srcMemory.GetPtr());
float *dst_ptr = reinterpret_cast<float*>(getChildEdgeAt(0)->getMemory().GetPtr());
int m_inner_dim = 1;
int m_outer_dim = 1;
@ -94,16 +92,13 @@ void MKLDNNTileNode::execute(mkldnn::stream strm) {
m_inner_dim *= batchToProcess();
}
if (m_inner_dim == 1 && m_outer_dim % 8 == 0 && ((inDims.size() == 4 && srcMemory.GetFormat() == memory::nChw8c) ||
(inDims.size() == 5 && srcMemory.GetFormat() == memory::nCdhw8c))) {
if (m_inner_dim == 1 && m_outer_dim % 8 == 0 && srcMemory.GetDesc().isBlockedCFormat(8)) {
/*
* We may enable tile processing directly to the appropriate output format (nChw8c)
*/
m_inner_dim *= 8;
m_outer_dim /= 8;
} else if (m_inner_dim == 1 && m_outer_dim % 16 == 0 &&
((inDims.size() == 4 && srcMemory.GetFormat() == memory::nChw16c) ||
(inDims.size() == 5 && srcMemory.GetFormat() == memory::nCdhw16c))) {
} else if (m_inner_dim == 1 && m_outer_dim % 16 == 0 && srcMemory.GetDesc().isBlockedCFormat(16)) {
/*
* We may enable tile processing directly to the appropriate output format (nChw16c)
*/

View File

@ -3,6 +3,7 @@
//
#include "base.hpp"
#include "utils/general_utils.h"
#include "common/defs.h"
#include "common/softmax.h"
#include "common/cpu_convert.h"
@ -13,13 +14,15 @@
#include <mkldnn_extension_utils.h>
#include "utils/bfloat16.hpp"
#include "common/cpu_memcpy.h"
#include "jit_generator.hpp"
#include "jit_uni_eltwise.hpp"
#include "mkldnn.hpp"
#include <cpu/x64/jit_generator.hpp>
#include <cpu/x64/jit_uni_eltwise_injector.hpp>
using namespace mkldnn;
using namespace MKLDNNPlugin;
using namespace InferenceEngine;
using namespace mkldnn::impl::cpu;
using namespace mkldnn::impl::cpu::x64;
using namespace mkldnn::impl::utils;
namespace InferenceEngine {
@ -46,6 +49,8 @@ struct jit_uni_logistic_kernel {
void operator()(const jit_args_logistic *args) { assert(ker_); ker_(args); }
virtual void create_ker() = 0;
jit_uni_logistic_kernel() : ker_(nullptr) {}
virtual ~jit_uni_logistic_kernel() {}
};
@ -54,8 +59,15 @@ template <cpu_isa_t isa>
struct jit_uni_logistic_kernel_f32 : public jit_uni_logistic_kernel, public jit_generator {
DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_uni_logistic_kernel_f32)
jit_uni_logistic_kernel_f32(jit_logistic_config_params jcp) : jit_uni_logistic_kernel(), jit_generator() {
exp_injector.reset(new jit_uni_eltwise_injector_f32<isa>(this, alg_kind::eltwise_exp, 0.f, 0.f));
jit_uni_logistic_kernel_f32(jit_logistic_config_params jcp) : jcp_(jcp), jit_uni_logistic_kernel(), jit_generator() {}
void create_ker() override {
jit_generator::create_kernel();
ker_ = (decltype(ker_))jit_ker();
}
void generate() override {
exp_injector.reset(new jit_uni_eltwise_injector_f32<isa>(this, mkldnn::impl::alg_kind::eltwise_exp, 0.f, 0.f, 1.f));
if (!mayiuse(avx512_core_bf16) && mayiuse(avx512_core))
emu_vcvtneps2bf16.reset(new jit_emu_vcvtneps2bf16(this, isa, nullptr));
@ -76,12 +88,12 @@ struct jit_uni_logistic_kernel_f32 : public jit_uni_logistic_kernel, public jit_
cmp(reg_work_amount, step);
jl(tail_loop_label, T_NEAR);
load_vector(vmm_src, ptr[reg_src], jcp.src_dt);
load_vector(vmm_src, ptr[reg_src], jcp_.src_dt);
compute_kernel();
store_vector(ptr[reg_dst], vmm_src, jcp.dst_dt);
store_vector(ptr[reg_dst], vmm_src, jcp_.dst_dt);
add(reg_src, step * jcp.src_data_size);
add(reg_dst, step * jcp.dst_data_size);
add(reg_src, step * jcp_.src_data_size);
add(reg_dst, step * jcp_.dst_data_size);
sub(reg_work_amount, step);
jmp(main_loop_label, T_NEAR);
@ -92,12 +104,12 @@ struct jit_uni_logistic_kernel_f32 : public jit_uni_logistic_kernel, public jit_
cmp(reg_work_amount, step);
jl(exit_label, T_NEAR);
load_scalar(xmm_src, ptr[reg_src], jcp.src_dt);
load_scalar(xmm_src, ptr[reg_src], jcp_.src_dt);
compute_kernel();
store_scalar(ptr[reg_dst], xmm_src, jcp.dst_dt);
store_scalar(ptr[reg_dst], xmm_src, jcp_.dst_dt);
add(reg_src, step * jcp.src_data_size);
add(reg_dst, step * jcp.dst_data_size);
add(reg_src, step * jcp_.src_data_size);
add(reg_dst, step * jcp_.dst_data_size);
sub(reg_work_amount, step);
jmp(tail_loop_label, T_NEAR);
@ -113,12 +125,10 @@ struct jit_uni_logistic_kernel_f32 : public jit_uni_logistic_kernel, public jit_
exp_injector->prepare_table();
prepare_table();
ker_ = (decltype(ker_))this->getCode();
}
private:
using Vmm = typename conditional3<isa == cpu::sse42, Xbyak::Xmm, isa == cpu::avx2, Xbyak::Ymm, Xbyak::Zmm>::type;
using Vmm = typename conditional3<isa == x64::sse41, Xbyak::Xmm, isa == x64::avx2, Xbyak::Ymm, Xbyak::Zmm>::type;
size_t vlen = cpu_isa_traits<isa>::vlen;
Xbyak::Address table_val(int index) { return ptr[reg_table + index * vlen]; }
@ -143,6 +153,8 @@ private:
std::shared_ptr<jit_uni_eltwise_injector_f32<isa>> exp_injector;
jit_logistic_config_params jcp_;
void compute_kernel() {
uni_vmovups(vmm_aux0, vmm_src);
uni_vandps(vmm_aux0, vmm_aux0, table_val(0));
@ -157,10 +169,10 @@ private:
uni_vmovups(vmm_aux2, table_val(1));
uni_vsubps(vmm_aux2, vmm_aux2, vmm_src);
if (isa == cpu::sse42) {
if (isa == x64::sse41) {
uni_vblendvps(vmm_aux2, vmm_aux2, vmm_src, vmm_aux0);
uni_vmovups(vmm_src, vmm_aux2);
} else if (isa == cpu::avx2) {
} else if (isa == x64::avx2) {
uni_vblendvps(vmm_src, vmm_aux2, vmm_src, vmm_aux0);
} else {
vptestmd(k_mask, vmm_aux0, vmm_aux0);
@ -281,19 +293,22 @@ public:
jcp.src_data_size = jcp.dst_data_size = output_prec.size();
block_size = 1;
if (mayiuse(cpu::avx512_common)) {
logistic_kernel.reset(new jit_uni_logistic_kernel_f32<cpu::avx512_common>(jcp));
if (mayiuse(x64::avx512_common)) {
logistic_kernel.reset(new jit_uni_logistic_kernel_f32<x64::avx512_common>(jcp));
block_size = 16;
} else if (mayiuse(cpu::avx2)) {
logistic_kernel.reset(new jit_uni_logistic_kernel_f32<cpu::avx2>(jcp));
} else if (mayiuse(x64::avx2)) {
logistic_kernel.reset(new jit_uni_logistic_kernel_f32<x64::avx2>(jcp));
block_size = 8;
} else if (mayiuse(cpu::sse42)) {
logistic_kernel.reset(new jit_uni_logistic_kernel_f32<cpu::sse42>(jcp));
} else if (mayiuse(x64::sse41)) {
logistic_kernel.reset(new jit_uni_logistic_kernel_f32<x64::sse41>(jcp));
block_size = 4;
}
softmax_kernel = std::make_shared<SoftmaxGeneric>(input_prec, output_prec);
if (logistic_kernel)
logistic_kernel->create_ker();
addConfig(layer, {DataConfigurator(ConfLayout::PLN, input_prec)}, {DataConfigurator(ConfLayout::PLN, output_prec)});
} catch (InferenceEngine::details::InferenceEngineException &ex) {
errorMsg = ex.what();
@ -413,7 +428,7 @@ private:
float_dst_data[i + start_index] = logistic_scalar(float_dst_data[i + start_index]);
}
} else if (Precision::BF16 == output_prec) {
auto bf16_dst_data = reinterpret_cast<bfloat16_t*>(dst_data);
auto bf16_dst_data = reinterpret_cast<MKLDNNPlugin::bfloat16_t*>(dst_data);
for (int i = 0; i < count; i++) {
bf16_dst_data[i + start_index] = logistic_scalar(bf16_dst_data[i + start_index]);
}
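
The logistic-kernel hunks above show oneDNN v1.6's two-phase JIT pattern: instruction emission moves from the constructor into a generate() override, and the host calls create_ker() once before taking the entry point. A stripped-down shape of that pattern (it relies on oneDNN internal headers from the plugin's build tree, so treat it as a sketch rather than a standalone program):

#include <cpu/x64/jit_generator.hpp>

struct my_kernel : public mkldnn::impl::cpu::x64::jit_generator {
    DECLARE_CPU_JIT_AUX_FUNCTIONS(my_kernel)

    void (*ker_)(const void*) = nullptr;

    // Phase 2: emit the code and take the entry point; this mirrors
    // create_ker() in jit_uni_logistic_kernel_f32 above.
    void create_ker() {
        jit_generator::create_kernel();   // runs generate() and finalizes the code buffer
        ker_ = (decltype(ker_))jit_ker();
    }

    // Phase 1 body: emission now lives here, not in the constructor.
    void generate() override {
        ret();   // a real kernel emits its loops and injector tables first
    }
};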

View File

@ -6,7 +6,6 @@
#include <cmath>
#include <limits>
#include "utils.hpp"
#include "nodes/common/emitter.h"
/**
@ -77,7 +76,7 @@ private:
class jit_emu_vcvtneps2bf16 : public jit_emitter {
public:
jit_emu_vcvtneps2bf16(mkldnn::impl::cpu::jit_generator* host, mkldnn::impl::cpu::cpu_isa_t host_isa, const MKLDNNNode* node,
jit_emu_vcvtneps2bf16(mkldnn::impl::cpu::x64::jit_generator* host, mkldnn::impl::cpu::x64::cpu_isa_t host_isa, const MKLDNNNode* node,
InferenceEngine::Precision exec_prc = InferenceEngine::Precision::BF16) : jit_emitter(host, host_isa, node, exec_prc) {
prepare_table();
};
@ -87,7 +86,7 @@ public:
private:
void emit_impl(const std::vector<size_t>& in_vec_idxs, const std::vector<size_t>& out_vec_idxs,
const std::vector<size_t>& pool_vec_idxs, const std::vector<size_t>& pool_gpr_idxs) {
if (host_isa_ == mkldnn::impl::cpu::cpu_isa_t::avx512_common) {
if (host_isa_ == mkldnn::impl::cpu::x64::cpu_isa_t::avx512_common) {
Xbyak::Zmm in = Xbyak::Zmm(in_vec_idxs[0]);
Xbyak::Ymm out = Xbyak::Ymm(out_vec_idxs[0]);
Xbyak::Zmm aux = Xbyak::Zmm(aux_vec_idxs[0]);

View File

@ -6,8 +6,7 @@
#include "blob_factory.hpp"
#include "mkldnn_memory.h"
// It's so bad to include by relative path :-(
#include "../../thirdparty/mkl-dnn/src/common/memory_desc_wrapper.hpp"
#include "common/memory_desc_wrapper.hpp"
#include <fstream>

View File

@ -0,0 +1,43 @@
// Copyright (C) 2020 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#pragma once
#include <cassert>
namespace MKLDNNPlugin {
template<typename T, typename U>
inline T div_up(const T a, const U b) {
assert(b);
return (a + b - 1) / b;
}
template<typename T, typename U>
inline T rnd_up(const T a, const U b) {
return div_up(a, b) * b;
}
template <typename T, typename P>
constexpr bool one_of(T val, P item) { return val == item; }
template <typename T, typename P, typename... Args>
constexpr bool one_of(T val, P item, Args... item_others) {
return val == item || one_of(val, item_others...);
}
template <typename T, typename P>
constexpr bool everyone_is(T val, P item) { return val == item; }
template <typename T, typename P, typename... Args>
constexpr bool everyone_is(T val, P item, Args... item_others) {
return val == item && everyone_is(val, item_others...);
}
constexpr inline bool implication(bool cause, bool cond) {
return !cause || !!cond;
}
} // namespace MKLDNNPlugin
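
For reference, the semantics of the new helpers in a few concrete checks (the values are arbitrary):

#include <cassert>
#include "utils/general_utils.h"   // the header added above

int main() {
    using namespace MKLDNNPlugin;
    assert(div_up(10, 4) == 3);         // ceiling division: (10 + 4 - 1) / 4
    assert(rnd_up(10, 4) == 12);        // round up to a multiple of 4
    assert(one_of(2, 1, 2, 3));         // true: 2 appears among the trailing items
    assert(everyone_is(5, 5, 5));       // true: every trailing item equals 5
    assert(implication(true, true));    // !cause || cond
    assert(implication(false, false));  // vacuously true when the cause is false
    return 0;
}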

View File

@ -373,6 +373,8 @@ bool fuse_type_to_constant(std::shared_ptr<Node> & node, element::Type to, const
new_const = change_constant_precision<element::Type_t::u8, element::Type_t::i32>(constant);
} else if (from == element::u16 && to == element::i32) {
new_const = change_constant_precision<element::Type_t::u16, element::Type_t::i32>(constant);
} else if (from == element::i16 && to == element::i32) {
new_const = change_constant_precision<element::Type_t::i16, element::Type_t::i32>(constant);
} else if (from == element::u32 && to == element::i32) {
new_const = change_constant_precision<element::Type_t::u32, element::Type_t::i32>(constant);
} else if (from == element::f16 && to == element::f32) {

View File

@ -100,7 +100,7 @@ protected:
// performance counters
expectedPrecisions["ADD_1"] = "FP32";
expectedPrecisions["CONV_1"] = "BF16";
expectedPrecisions["CONV_2"] = "FP32";
expectedPrecisions["CONV_2"] = "BF16";
}
};

View File

@ -164,7 +164,7 @@ protected:
// performance counters
expectedPrecisions["Convolution_1"] = "FP32";
expectedPrecisions["ReLU_1"] = "ndef";
expectedPrecisions["AvgPool_1"] = "BF16";
expectedPrecisions["AvgPool_1"] = netPrecision == Precision::BF16 ? "BF16" : "FP32";
expectedPrecisions["Convolution_2"] = "BF16";
expectedPrecisions["ReLU_2"] = "ndef";
expectedPrecisions["MaxPool_2"] = "BF16";

View File

@ -16,7 +16,9 @@ const std::vector<InferenceEngine::Precision> netPrecisions = {
InferenceEngine::Precision::FP16
};
const std::vector<std::vector<size_t>> inputShapes = {{1, 1, 1, 1}, {3, 10, 5, 6}};
const std::vector<std::vector<size_t>> inputShapes = {{1, 1}, {2, 6}, {1, 1, 1}, {2, 6, 13},
{1, 1, 1, 1}, {3, 10, 5, 6}, {2, 8, 5, 18}, {2, 16, 3, 18}, {3, 49, 5, 6},
{1, 1, 1, 1, 1}, {3, 10, 2, 5, 6}, {2, 8, 1, 5, 18}, {2, 16, 4, 3, 18}, {3, 49, 7, 5, 6}};
const std::vector<std::vector<size_t>> constShapes = {{1}};
const std::vector<size_t> levels = {16, 255, 256};
@ -24,7 +26,6 @@ const std::pair<std::string, std::map<std::string, std::string>> config = {};
const std::vector<float> fqArgs = {};
const std::vector<float> inputParams = {};
const auto fqParams = ::testing::Combine(
::testing::ValuesIn(levels),
::testing::ValuesIn(constShapes),
@ -45,4 +46,47 @@ INSTANTIATE_TEST_CASE_P(smoke_FakeQuantize, FakeQuantizeLayerTest,
::testing::Values(config)),
FakeQuantizeLayerTest::getTestCaseName);
const std::vector<std::vector<size_t>> inputShapesPerChannel = {{11, 10, 22, 19}, {11, 10, 5, 6}};
const std::vector<std::vector<size_t>> constShapesPerChannelAxis0 = {{11, 1, 1, 1}};
const std::vector<std::vector<size_t>> constShapesPerChannelAxis1 = {{1, 10, 1, 1}};
const auto fqParamsPerChannelAxis0 = ::testing::Combine(
::testing::ValuesIn(levels),
::testing::ValuesIn(constShapesPerChannelAxis0),
::testing::Values(fqArgs),
::testing::Values(inputParams)
);
const auto fqParamsPerChannelAxis1 = ::testing::Combine(
::testing::ValuesIn(levels),
::testing::ValuesIn(constShapesPerChannelAxis1),
::testing::Values(fqArgs),
::testing::Values(inputParams)
);
INSTANTIATE_TEST_CASE_P(smoke_FakeQuantizePerChannelAxis0, FakeQuantizeLayerTest,
::testing::Combine(
fqParamsPerChannelAxis0,
::testing::ValuesIn(netPrecisions),
::testing::Values(InferenceEngine::Precision::UNSPECIFIED),
::testing::Values(InferenceEngine::Precision::UNSPECIFIED),
::testing::Values(InferenceEngine::Layout::ANY),
::testing::Values(InferenceEngine::Layout::ANY),
::testing::ValuesIn(inputShapesPerChannel),
::testing::Values(CommonTestUtils::DEVICE_CPU),
::testing::Values(config)),
FakeQuantizeLayerTest::getTestCaseName);
INSTANTIATE_TEST_CASE_P(smoke_FakeQuantizePerChannelAxis1, FakeQuantizeLayerTest,
::testing::Combine(
fqParamsPerChannelAxis1,
::testing::ValuesIn(netPrecisions),
::testing::Values(InferenceEngine::Precision::UNSPECIFIED),
::testing::Values(InferenceEngine::Precision::UNSPECIFIED),
::testing::Values(InferenceEngine::Layout::ANY),
::testing::Values(InferenceEngine::Layout::ANY),
::testing::ValuesIn(inputShapesPerChannel),
::testing::Values(CommonTestUtils::DEVICE_CPU),
::testing::Values(config)),
FakeQuantizeLayerTest::getTestCaseName);
} // namespace

Some files were not shown because too many files have changed in this diff.