[CPU] Enable bf16 RNN primitives (#4942)

Egor Duplensky 2021-04-25 22:18:38 +03:00 committed by GitHub
parent 8bb73273f1
commit 39e1a21c42
12 changed files with 1425 additions and 222 deletions


@@ -5,12 +5,17 @@
 #include "mkldnn_rnn.h"
 #include "mkldnn_extension_utils.h"
+#include "mkldnn_node.h"
 #include "utils/general_utils.h"
 #include "nodes/common/cpu_memcpy.h"
+#include "utils/bfloat16.hpp"
+#include "nodes/common/cpu_convert.h"
 #include <string>
 #include <utility>
+
+#define THROW_ERROR IE_THROW() << NameFromType(getType()) << " layer '" << getName() << "' "
+
 using namespace mkldnn;
 using namespace InferenceEngine;
@@ -39,7 +44,7 @@ static algorithm ie2mkl(RNNCellBase::CellType cell_type) {
         case RNNCellBase::GRU: return algorithm::vanilla_gru;
         case RNNCellBase::GRU_LBR: return algorithm::lbr_gru;
         default:
-            IE_THROW() << "Unsupported cell type";
+            IE_THROW() << "RNN node. Unsupported cell type";
             return algorithm::undef;
     }
 }
@@ -51,7 +56,7 @@ size_t gatesCount(algorithm alg) {
         case algorithm::lbr_gru: return 3;
         case algorithm::vanilla_lstm: return 4;
         default:
-            IE_THROW() << "Unsupported cell type";
+            IE_THROW() << "RNN node. Unsupported cell type";
             return 0;
     }
 }
@@ -63,11 +68,24 @@ size_t statesCount(algorithm alg) {
         case algorithm::lbr_gru: return 1;
         case algorithm::vanilla_lstm: return 2;
         default:
-            IE_THROW() << "Unsupported cell type";
+            IE_THROW() << "RNN node. Unsupported cell type";
             return 0;
     }
 }
+
+bool haveCellState(algorithm alg) {
+    return alg == algorithm::vanilla_lstm;
+}
+
+const std::map<InferenceEngine::Precision, InferenceEngine::Precision> MKLDNNRNN::weightsByLayerPrec {
+    // layer precision,                weights precision
+    {InferenceEngine::Precision::FP32, InferenceEngine::Precision::FP32},
+    {InferenceEngine::Precision::BF16, InferenceEngine::Precision::BF16},
+    // FP16 and U8 are not supported yet
+    // {InferenceEngine::Precision::FP16, InferenceEngine::Precision::FP16},
+    // {InferenceEngine::Precision::U8,   InferenceEngine::Precision::I8},
+};
+
 MKLDNNRNN::MKLDNNRNN(const InferenceEngine::CNNLayerPtr& layer, const mkldnn::engine& eng, MKLDNNWeightsSharing::Ptr &cache) :
         MKLDNNNode(layer, eng, cache) {
     is_cell = one_of(layer->type, "LSTMCell", "GRUCell", "RNNCell");
@@ -78,6 +96,8 @@ bool MKLDNNRNN::created() const {
 }
 
 void MKLDNNRNN::getSupportedDescriptors() {
+    runtimePrecision = getCnnLayer()->insData[0].lock()->getPrecision();
+
     if (is_cell)
         fillCellDesc();
     else
@@ -89,14 +109,14 @@ void MKLDNNRNN::fillCellDesc() {
     auto cellLayer = std::dynamic_pointer_cast<RNNCellBase>(getCnnLayer());
 
     if (!cellLayer)
-        IE_THROW() << "No original layer for RNNCell.";
+        THROW_ERROR << "No original layer for RNNCell.";
 
     cell_type = ie2mkl(cellLayer->cellType);
     cell_act = ie2mkl(cellLayer->activations[0]);  // Works only for RNN with one gate
 
     if (cellLayer->clip != 0.0f) {
         // TODO [oneDNN]: No more supported
-        IE_THROW() << "Clipping is not supported for RNN primitive";
+        THROW_ERROR << "Clipping is not supported for RNN primitive";
         // cell_desc.set_clipping(cellLayer->clip);
     }
@@ -104,16 +124,16 @@ void MKLDNNRNN::fillCellDesc() {
     auto &outs = cellLayer->outData;
 
     if (!one_of(ins.size(), 3, 2))
-        IE_THROW() << "Incorrect number of input ports for layer " << getName();
+        THROW_ERROR << "Incorrect number of input ports for layer " << getName();
     if (!one_of(outs.size(), 2, 1))
-        IE_THROW() << "Incorrect number of output ports for layer " << getName();
+        THROW_ERROR << "Incorrect number of output ports for layer " << getName();
 
     auto in_data_dims = getParentEdgeAt(0)->getDims();
     auto in_h_state_dims = getParentEdgeAt(1)->getDims();
     auto out_h_state_dims = getChildEdgeAt(0)->getDims();
 
     if (in_data_dims.ndims() != 2 || in_h_state_dims.ndims() != 2)
-        IE_THROW() << "Incorrect shape of input/output ports for layer " << getName();
+        THROW_ERROR << "Incorrect shape of input/output ports for layer " << getName();
 
     G = gatesCount(cell_type);
     S = statesCount(cell_type);
@@ -130,7 +150,7 @@ void MKLDNNRNN::fillCellDesc() {
     if (in_data_dims != D_shape
         || in_h_state_dims != S_shape
         || out_h_state_dims != S_shape)
-        IE_THROW() << "Incorrect shape of input/output ports for layer " << getName();
+        THROW_ERROR << "Incorrect shape of input/output ports for layer " << getName();
 
     if (S == 2) {
         auto in_c_state_dims = getParentEdgeAt(2)->getDims();
@@ -138,7 +158,7 @@ void MKLDNNRNN::fillCellDesc() {
         if (in_c_state_dims != S_shape
             || out_c_state_dims != S_shape)
-            IE_THROW() << "Incorrect shape of input/output ports for layer " << getName();
+            THROW_ERROR << "Incorrect shape of input/output ports for layer " << getName();
     }
 
     auto blobs = cellLayer->blobs;
@@ -147,40 +167,53 @@ void MKLDNNRNN::fillCellDesc() {
     if (blobs.find("biases") != blobs.end()) bias = blobs["biases"];
 
     if (!weights)
-        IE_THROW() << "RNN Layer. Weights do not present.";
+        THROW_ERROR << "RNN Layer. Weights do not present.";
 
-    if (weights->size() != G*SC*(SC+DC))
-        IE_THROW() << "RNN Layer. Weights size is not correct. Expected size:" << G*SC*(SC+DC);
+    if (weights->size() != G * SC * (SC + DC))
+        THROW_ERROR << "RNN Layer. Weights size is not correct. Expected size:" << G * SC * (SC + DC);
 
-    if (bias && bias->size() != Gb*SC)
-        IE_THROW() << "RNN Layer. Biases size is not correct. Expected size:" << G*SC;
+    if (bias && bias->size() != Gb * SC)
+        THROW_ERROR << "RNN Layer. Biases size is not correct. Expected size:" << G * SC;
+
+    auto dataType = MKLDNNExtensionUtils::IEPrecisionToDataType(runtimePrecision);
+
+    // layer input plus states
+    in_data_d.resize(S + 1);
+    out_data_d.resize(S + 1);
 
     // Shapes and Attributes are correct. Can start internal stuff initialization.
-    for (size_t i = 0; i < S; i++) {
-        in_states_d.emplace_back(S_4D_shape, memory::data_type::f32, memory::format_tag::ldnc);
-        out_states_d.emplace_back(S_4D_shape, memory::data_type::f32, memory::format_tag::ldnc);
-    }
+    in_data_d[RNNInOutKind::Layer] = {{T, N, DC}, dataType, memory::format_tag::tnc};
+    out_data_d[RNNInOutKind::Layer] = {{T, N, SC}, dataType, memory::format_tag::tnc};
 
-    in_data_d = {{T, N, DC}, memory::data_type::f32, memory::format_tag::tnc};;
-    out_data_d = {{T, N, SC}, memory::data_type::f32, memory::format_tag::tnc};;
+    in_data_d[RNNInOutKind::HiddenState] = {S_4D_shape, dataType, memory::format_tag::ldnc};
+    out_data_d[RNNInOutKind::HiddenState] = {S_4D_shape, dataType, memory::format_tag::ldnc};
 
-    w_data_d = {{L, D, DC, G, SC}, memory::data_type::f32, memory::format_tag::ldigo};
-    w_state_d = {{L, D, SC, G, SC}, memory::data_type::f32, memory::format_tag::ldigo};
+    if (haveCellState(cell_type)) {
+        in_data_d[RNNInOutKind::CellState] = {S_4D_shape, memory::data_type::f32, memory::format_tag::ldnc};
+        out_data_d[RNNInOutKind::CellState] = {S_4D_shape, memory::data_type::f32, memory::format_tag::ldnc};
+    }
+
+    w_data_d = {{L, D, DC, G, SC}, dataType, memory::format_tag::ldigo};
+    w_state_d = {{L, D, SC, G, SC}, dataType, memory::format_tag::ldigo};
 
     if (bias)
         w_bias_d = {{L, D, Gb, SC}, memory::data_type::f32, memory::format_tag::ldgo};
 
     std::vector<TensorDesc> in_candidate, out_candidate;
-    std::vector<memory::format_tag> outputFormats;
 
-    in_candidate.emplace_back(MKLDNNMemoryDesc {D_shape, memory::data_type::f32, memory::format_tag::nc});
-    in_candidate.emplace_back(MKLDNNMemoryDesc {S_shape, memory::data_type::f32, memory::format_tag::nc});
-    out_candidate.emplace_back(MKLDNNMemoryDesc {S_shape, memory::data_type::f32, memory::format_tag::nc});
-    outputFormats.emplace_back(memory::format_tag::nc);
+    in_candidate.emplace_back(MKLDNNMemoryDesc {D_shape, dataType, memory::format_tag::nc});
+    in_candidate.emplace_back(MKLDNNMemoryDesc {S_shape, dataType, memory::format_tag::nc});
+    out_candidate.emplace_back(MKLDNNMemoryDesc {S_shape, dataType, memory::format_tag::nc});
 
-    if (S == 2) {
+    if (haveCellState(cell_type)) {
         in_candidate.emplace_back(MKLDNNMemoryDesc {S_shape, memory::data_type::f32, memory::format_tag::nc});
         out_candidate.emplace_back(MKLDNNMemoryDesc {S_shape, memory::data_type::f32, memory::format_tag::nc});
-        outputFormats.emplace_back(memory::format_tag::nc);
+    }
+
+    Precision weights_prec = as<MemoryBlob>(weights)->getTensorDesc().getPrecision();
+    if (!verifyWeightsPrecision(runtimePrecision, weights_prec)) {
+        if (runtimePrecision == Precision::BF16 && weights_prec == Precision::FP32)
+            convertWeightsBlobToBF16();
     }
 
     createDescriptor(in_candidate, out_candidate);
@@ -191,10 +224,10 @@ void MKLDNNRNN::fillSeqDesc() {
     auto rnnLayer = std::dynamic_pointer_cast<RNNSequenceLayer>(getCnnLayer());
 
     if (!rnnLayer)
-        IE_THROW() << "Wrong RNN layer representation. Cannot cast to RNNSequenceLayer.";
+        THROW_ERROR << "Wrong RNN layer representation. Cannot cast to RNNSequenceLayer.";
 
     if (!one_of(rnnLayer->cellType, _RNN::LSTM, _RNN::GRU, _RNN::GRU_LBR, _RNN::RNN))
-        IE_THROW() << "RNN layer supports only LSTM/GRU/RNN cell";
+        THROW_ERROR << "RNN layer supports only LSTM/GRU/RNN cell";
 
     cell_type = ie2mkl(rnnLayer->cellType);
     cell_act = algorithm::undef;
@@ -203,31 +236,31 @@ void MKLDNNRNN::fillSeqDesc() {
     // TODO [oneDNN]: No more supported
     if (rnnLayer->clip != 0.0f) {
-        IE_THROW() << "Clipping is not supported for RNN primitive";
+        THROW_ERROR << "Clipping is not supported for RNN primitive";
         // cell_desc.set_clipping(rnnLayer->clip);
     }
 
     if (!one_of(rnnLayer->axis, 0, 1))
-        IE_THROW() << "RNN layer supports only sequence axis 0 or 1";
+        THROW_ERROR << "RNN layer supports only sequence axis 0 or 1";
 
     nativeOrder = rnnLayer->axis == 0;
 
     if (!one_of(rnnLayer->direction, _RNN::FWD, _RNN::BWD))
-        IE_THROW() << "RNN layer supports only unidirectional RNN layer";
+        THROW_ERROR << "RNN layer supports only unidirectional RNN layer";
 
     direction = ie2mkl(rnnLayer->direction);
 
     auto &ins = rnnLayer->insData;
     auto &outs = rnnLayer->outData;
 
     if (!one_of(ins.size(), 3, 2, 1))
-        IE_THROW() << "Incorrect number of input ports for layer " << getName();
+        THROW_ERROR << "Incorrect number of input ports for layer " << getName();
     if (!one_of(outs.size(), 3, 2, 1))
-        IE_THROW() << "Incorrect number of output ports for layer " << getName();
+        THROW_ERROR << "Incorrect number of output ports for layer " << getName();
 
     auto in_data_dims = getParentEdgeAt(0)->getDims();
     auto out_data_dims = getChildEdgeAt(0)->getDims();
 
     if (in_data_dims.ndims() != 3 || out_data_dims.ndims() != 3)
-        IE_THROW() << "Incorrect shape of input/output ports for layer " << getName();
+        THROW_ERROR << "Incorrect shape of input/output ports for layer " << getName();
 
     if (!nativeOrder) {
         std::swap(in_data_dims[0], in_data_dims[1]);
@@ -246,125 +279,153 @@ void MKLDNNRNN::fillSeqDesc() {
     MKLDNNDims ID_shape {T, N, DC}, OD_shape {T, N, SC}, S_shape {N, SC}, S_4D_shape {L, D, N, SC};
 
     if (out_data_dims != OD_shape)
-        IE_THROW() << "Incorrect shape of input/output ports for layer " << getName();
+        THROW_ERROR << "Incorrect shape of input/output ports for layer " << getName();
 
-    in_states_d.resize(S);
-    out_states_d.resize(S);
-
-    for (int i = 1; i < ins.size(); i++) {
-        if (getParentEdgeAt(i)->getDims() != S_shape)
-            IE_THROW() << "Incorrect shape of state ports for layer " << getName();
-        in_states_d[i - 1] = {S_4D_shape, memory::data_type::f32, memory::format_tag::ldnc};
-    }
-
-    for (int i = 1; i < outs.size(); i++) {
-        if (getChildEdgeAt(i)->getDims() != S_shape)
-            IE_THROW() << "Incorrect shape of state ports for layer " << getName();
-        out_states_d[i - 1] = {S_4D_shape, memory::data_type::f32, memory::format_tag::ldnc};
-    }
-
-    auto blobs = rnnLayer->blobs;
+    auto& blobs = rnnLayer->blobs;
     Blob::Ptr weights, bias;
     if (blobs.find("weights") != blobs.end()) weights = blobs["weights"];
     if (blobs.find("biases") != blobs.end()) bias = blobs["biases"];
 
     if (!weights)
-        IE_THROW() << "RNN Layer. Weights do not present.";
+        THROW_ERROR << "RNN Layer. Weights do not present.";
 
-    if (weights->size() != G*SC*(SC+DC))
-        IE_THROW() << "RNN Layer. Weights size is not correct. Expected size:" << G*SC*(SC+DC);
+    if (weights->size() != G * SC * (SC + DC))
+        THROW_ERROR << "RNN Layer. Weights size is not correct. Expected size:" << G * SC * (SC + DC);
 
-    w_data_d = {{L, D, DC, G, SC}, memory::data_type::f32, memory::format_tag::ldigo};
-    w_state_d = {{L, D, SC, G, SC}, memory::data_type::f32, memory::format_tag::ldigo};
+    for (int i = 1; i < ins.size(); i++) {
+        if (getParentEdgeAt(i)->getDims() != S_shape)
+            THROW_ERROR << "Incorrect shape of state ports for layer " << getName();
+    }
 
-    if (bias && bias->size() != Gb*SC)
-        IE_THROW() << "RNN Layer. Biases size is not correct. Expected size:" << G*SC;
+    for (int i = 1; i < outs.size(); i++) {
+        if (getChildEdgeAt(i)->getDims() != S_shape)
+            THROW_ERROR << "Incorrect shape of state ports for layer " << getName();
+    }
+
+    // layer input plus states
+    in_data_d.resize(S + 1);
+    out_data_d.resize(S + 1);
+
+    auto dataType = MKLDNNExtensionUtils::IEPrecisionToDataType(runtimePrecision);
+
+    // Try to create descriptor and corresponding configuration
+    in_data_d[RNNInOutKind::Layer] = {in_data_dims, dataType, memory::format_tag::tnc};
+    out_data_d[RNNInOutKind::Layer] = {out_data_dims, dataType, memory::format_tag::tnc};
+
+    in_data_d[RNNInOutKind::HiddenState] = {S_4D_shape, dataType, memory::format_tag::ldnc};
+    out_data_d[RNNInOutKind::HiddenState] = {S_4D_shape, dataType, memory::format_tag::ldnc};
+
+    if (haveCellState(cell_type)) {
+        in_data_d[RNNInOutKind::CellState] = {S_4D_shape, memory::data_type::f32, memory::format_tag::ldnc};
+        out_data_d[RNNInOutKind::CellState] = {S_4D_shape, memory::data_type::f32, memory::format_tag::ldnc};
+    }
+
+    w_data_d = {{L, D, DC, G, SC}, dataType, memory::format_tag::ldigo};
+    w_state_d = {{L, D, SC, G, SC}, dataType, memory::format_tag::ldigo};
+
+    if (bias && bias->size() != Gb * SC)
+        THROW_ERROR << "RNN Layer. Biases size is not correct. Expected size:" << G * SC;
 
     if (bias)
         w_bias_d = {{L, D, Gb, SC}, memory::data_type::f32, memory::format_tag::ldgo};
 
-    // Try to create descriptor and corresponding configuration
-    in_data_d = {in_data_dims, memory::data_type::f32, memory::format_tag::tnc};
-    out_data_d = {out_data_dims, memory::data_type::f32, memory::format_tag::tnc};
-
-    std::vector<TensorDesc> in_candidate;
-    if (nativeOrder)
-        in_candidate.push_back(in_data_d);
-    else
-        in_candidate.push_back(MKLDNNMemoryDesc{{N, T, DC}, memory::data_type::f32, memory::format_tag::ntc});
-
-    for (int i = 1; i < ins.size(); i++)
-        in_candidate.emplace_back(MKLDNNMemoryDesc {S_shape, memory::data_type::f32, memory::format_tag::nc});
-
-    std::vector<TensorDesc> out_candidate;
+    std::vector<TensorDesc> in_candidate, out_candidate;
 
     if (nativeOrder) {
-        out_candidate.push_back(out_data_d);
+        in_candidate.push_back(in_data_d[RNNInOutKind::Layer]);
+        out_candidate.push_back(out_data_d[RNNInOutKind::Layer]);
     } else {
-        out_candidate.push_back(MKLDNNMemoryDesc{{N, T, SC}, memory::data_type::f32, memory::format_tag::ntc});
+        in_candidate.emplace_back(MKLDNNMemoryDesc{{N, T, DC}, dataType, memory::format_tag::ntc});
+        out_candidate.emplace_back(MKLDNNMemoryDesc{{N, T, SC}, dataType, memory::format_tag::ntc});
     }
 
-    for (int i = 1; i < outs.size(); i++) {
+    in_candidate.emplace_back(MKLDNNMemoryDesc{S_shape, dataType, memory::format_tag::nc});
+    out_candidate.emplace_back(MKLDNNMemoryDesc{S_shape, dataType, memory::format_tag::nc});
+
+    if (haveCellState(cell_type)) {
+        in_candidate.emplace_back(MKLDNNMemoryDesc{S_shape, memory::data_type::f32, memory::format_tag::nc});
         out_candidate.emplace_back(MKLDNNMemoryDesc{S_shape, memory::data_type::f32, memory::format_tag::nc});
     }
 
+    Precision weights_prec = as<MemoryBlob>(weights)->getTensorDesc().getPrecision();
+    if (!verifyWeightsPrecision(runtimePrecision, weights_prec)) {
+        if (runtimePrecision == Precision::BF16 && weights_prec == Precision::FP32)
+            convertWeightsBlobToBF16();
+    }
+
     createDescriptor(in_candidate, out_candidate);
 }
+void MKLDNNRNN::convertWeightsBlobToBF16() {
+    Blob::Ptr &weights = getCnnLayer()->blobs["weights"];
+
+    MemoryBlob::Ptr cur_weights = as<MemoryBlob>(weights);
+    TensorDesc td(Precision::BF16, cur_weights->getTensorDesc().getDims(), cur_weights->getTensorDesc().getLayout());
+    MemoryBlob::Ptr new_weights_blob = make_shared_blob<uint16_t>(td);
+    new_weights_blob->allocate();
+
+    bfloat16_t *dst = new_weights_blob->wmap();
+    float* fp32src = cur_weights->rmap().as<float*>();
+    cpu_convert(fp32src, dst, Precision::FP32, Precision::BF16, new_weights_blob->size());
+    weights = new_weights_blob;
+}
+
 void MKLDNNRNN::createDescriptor(const std::vector<TensorDesc> &inputDesc,
                                  const std::vector<TensorDesc> &outputDesc) {
     switch (cell_type) {
         case mkldnn::algorithm::vanilla_rnn: {
             MKLDNNDescriptor desc(std::shared_ptr<vanilla_rnn_forward::desc>(
                     new vanilla_rnn_forward::desc(prop_kind::forward_scoring, cell_act, direction,
-                            /* In Data       */ in_data_d,
-                            /* In State      */ in_states_d[0],
+                            /* In Data       */ in_data_d[RNNInOutKind::Layer],
+                            /* In State      */ in_data_d[RNNInOutKind::HiddenState],
                             /* Weights data  */ w_data_d,
                             /* Weights state */ w_state_d,
                             /* Bias          */ w_bias_d,
-                            /* Out Data      */ out_data_d,
-                            /* Out State     */ out_states_d[0])));
+                            /* Out Data      */ out_data_d[RNNInOutKind::Layer],
+                            /* Out State     */ out_data_d[RNNInOutKind::HiddenState])));
             descs.push_back(desc);
         } break;
         case mkldnn::algorithm::vanilla_gru: {
             MKLDNNDescriptor desc(std::shared_ptr<gru_forward::desc>(
                     new gru_forward::desc(prop_kind::forward_scoring, direction,
-                            /* In Data       */ in_data_d,
-                            /* In State      */ in_states_d[0],
+                            /* In Data       */ in_data_d[RNNInOutKind::Layer],
+                            /* In State      */ in_data_d[RNNInOutKind::HiddenState],
                             /* Weights data  */ w_data_d,
                             /* Weights state */ w_state_d,
                             /* Bias          */ w_bias_d,
-                            /* Out Data      */ out_data_d,
-                            /* Out State     */ out_states_d[0])));
+                            /* Out Data      */ out_data_d[RNNInOutKind::Layer],
+                            /* Out State     */ out_data_d[RNNInOutKind::HiddenState])));
             descs.push_back(desc);
         } break;
         case mkldnn::algorithm::lbr_gru: {
             MKLDNNDescriptor desc(std::shared_ptr<lbr_gru_forward::desc>(
                     new lbr_gru_forward::desc(prop_kind::forward_scoring, direction,
-                            /* In Data       */ in_data_d,
-                            /* In State      */ in_states_d[0],
+                            /* In Data       */ in_data_d[RNNInOutKind::Layer],
+                            /* In State      */ in_data_d[RNNInOutKind::HiddenState],
                             /* Weights data  */ w_data_d,
                             /* Weights state */ w_state_d,
                             /* Bias          */ w_bias_d,
-                            /* Out Data      */ out_data_d,
-                            /* Out State     */ out_states_d[0])));
+                            /* Out Data      */ out_data_d[RNNInOutKind::Layer],
+                            /* Out State     */ out_data_d[RNNInOutKind::HiddenState])));
             descs.push_back(desc);
         } break;
         case mkldnn::algorithm::vanilla_lstm: {
             MKLDNNDescriptor desc(std::shared_ptr<lstm_forward::desc>(
                     new lstm_forward::desc(prop_kind::forward_scoring, direction,
-                            /* In Data       */ in_data_d,
-                            /* In State H    */ in_states_d[0],
-                            /* In State C    */ in_states_d[1],
+                            /* In Data       */ in_data_d[RNNInOutKind::Layer],
+                            /* In State      */ in_data_d[RNNInOutKind::HiddenState],
+                            /* In State C    */ in_data_d[RNNInOutKind::CellState],
                             /* Weights data  */ w_data_d,
                             /* Weights state */ w_state_d,
                             /* Bias          */ w_bias_d,
-                            /* Out Data      */ out_data_d,
-                            /* Out State H   */ out_states_d[0],
-                            /* Out State C   */ out_states_d[1])));
+                            /* Out Data      */ out_data_d[RNNInOutKind::Layer],
+                            /* Out State     */ out_data_d[RNNInOutKind::HiddenState],
+                            /* Out State C   */ out_data_d[RNNInOutKind::CellState])));
             descs.push_back(desc);
         } break;
         default:
-            IE_THROW() << "Unknown cell type";
+            THROW_ERROR << "Unknown cell type";
     }
 
     // Fill supported config
@@ -389,130 +450,170 @@ void MKLDNNRNN::createDescriptor(const std::vector<TensorDesc> &inputDesc,
     supportedPrimitiveDescriptors.emplace_back(config, ref_any);
 }
 
+bool MKLDNNRNN::verifyWeightsPrecision(const Precision &layerPrec, const Precision &weightsPrec) {
+    if (!weightsByLayerPrec.count(layerPrec))
+        THROW_ERROR << "Unsupported layer precision " << layerPrec;
+    return weightsPrec == weightsByLayerPrec.at(layerPrec);
+}
+
+void MKLDNNRNN::verifyWeights() {
+    auto layer = getCnnLayer();
+    auto weightsIt = layer->blobs.find("weights");
+    if (weightsIt == layer->blobs.end())
+        THROW_ERROR << "Missed weights blob.";
+
+    const auto& weightsPrec = weightsIt->second->getTensorDesc().getPrecision();
+    if (!verifyWeightsPrecision(runtimePrecision, weightsPrec)) {
+        THROW_ERROR << "Weights precision " << weightsPrec <<
+                       " does not match runtime precision" << runtimePrecision;
+    }
+}
+
+void MKLDNNRNN::verifyBiases() {
+    auto layer = getCnnLayer();
+    if (layer->blobs.find("biases") != layer->blobs.end()
+            && layer->blobs["biases"]->getTensorDesc().getPrecision() != Precision::FP32)
+        THROW_ERROR << "Invalid biases precision: " << layer->blobs["biases"]->getTensorDesc().getPrecision();
+}
+
 void MKLDNNRNN::createPrimitive() {
     if (prim) return;
 
-    std::string errorPrefix = "RNN layer '" + getCnnLayer()->name + "'";
-    auto weightsIt = getCnnLayer()->blobs.find("weights");
-    if (weightsIt == getCnnLayer()->blobs.end())
-        IE_THROW() << errorPrefix << " does not have weights blob.";
-    if (weightsIt->second->getTensorDesc().getPrecision() != Precision::FP32)
-        IE_THROW() << errorPrefix << " has invalid weights precision: " << weightsIt->second->getTensorDesc().getPrecision();
-    if (getCnnLayer()->blobs.find("biases") != getCnnLayer()->blobs.end()
-            && getCnnLayer()->blobs["biases"]->getTensorDesc().getPrecision() != Precision::FP32)
-        IE_THROW() << errorPrefix << " has invalid biases precision: " << getCnnLayer()->blobs["biases"]->getTensorDesc().getPrecision();
-
-    auto pd = descs[0].createPrimitiveDescriptorIterator(getEngine());
-
-    auto src_data_mem = getParentEdgeAt(0)->getMemoryPtr();
-    auto dst_data_mem = getChildEdgeAt(0)->getMemoryPtr();
-
-    // create weight blobs (data and state part)
-    auto w_data_mem = std::make_shared<MKLDNNMemory>(getEngine());
-    w_data_mem->Create(w_data_d);
-    internalBlobMemory.push_back(w_data_mem);
-
-    auto w_state_mem = std::make_shared<MKLDNNMemory>(getEngine());
-    w_state_mem->Create(w_state_d);
-    internalBlobMemory.push_back(w_state_mem);
-
-    auto w_bias_mem = std::make_shared<MKLDNNMemory>(getEngine());
-    w_bias_mem->Create(w_bias_d);
-    internalBlobMemory.push_back(w_bias_mem);
-
-    {
-        /* Copy Weight data
-         * IE format:
-         *   W - [gates, out_state_size, in_data_size + in_state_size]
-         *   B - [gates, out_state_size]
-         *
-         * MKLDNN format:
-         *   W - [1, 1, in_date_size, gates, out_state_size]
-         *   R - [1, 1, in_state_size, gates, out_state_size]
-         *   B - [gates, out_state_size]
-         *
-         * Gate order
-         * ====== LSTM ======
-         * Caffe - IFOC, ONNX - IOFC
-         * IE - FICO, mkldnn - IFCO
-         *
-         * ====== GRU ======
-         * IE - URO, mkldnn - URO
-         */
-        const int gate_map_lstm[] = {1, 0, 2, 3};  // FICO -> IFCO
-        const int gate_map_gru[]  = {0, 1, 2, 3};
-        const int gate_map_rnn[]  = {0};
-        const int *gate_map;
-        const int gate_map_lstm_size = sizeof(gate_map_lstm) / sizeof(int);
-        const int gate_map_gru_size = sizeof(gate_map_gru) / sizeof(int);
-        const int gate_map_rnn_size = sizeof(gate_map_rnn) / sizeof(int);
-        if (cell_type == algorithm::vanilla_lstm) {
-            gate_map = gate_map_lstm;
-            if (G > gate_map_lstm_size) {
-                IE_THROW() << "G isn't equal to the size of gate_map";
-            }
-        } else if (cell_type == algorithm::vanilla_gru) {
-            gate_map = gate_map_gru;
-            if (G > gate_map_gru_size) {
-                IE_THROW() << "G isn't equal to the size of gate_map";
-            }
-        } else if (cell_type == algorithm::lbr_gru) {
-            gate_map = gate_map_gru;
-            if (G > gate_map_gru_size) {
-                IE_THROW() << "G isn't equal to the size of gate_map";
-            }
-        } else if (cell_type == algorithm::vanilla_rnn) {
-            gate_map = gate_map_rnn;
-            if (G > gate_map_rnn_size) {
-                IE_THROW() << "G isn't equal to the size of gate_map";
-            }
-        } else {
-            gate_map = gate_map_gru;
-            if (G > gate_map_gru_size) {
-                IE_THROW() << "G isn't equal to the size of gate_map";
-            }
-        }
-
-        auto ie_w_ptr = getCnnLayer()->blobs["weights"]->buffer().as<const float*>();
-        auto w_ptr = static_cast<float*>(w_data_mem->GetData());
-        auto r_ptr = static_cast<float*>(w_state_mem->GetData());
-        const int step = SC * G;
-
-        for (int g = 0; g < G; g++) {
-            for (int out_i = 0; out_i < SC; out_i++) {
-                float *l_w_ptr = w_ptr + gate_map[g]*SC + out_i;
-                float *l_r_ptr = r_ptr + gate_map[g]*SC+ out_i;
-                for (int in_i = 0; in_i < DC; in_i++) {
-                    *l_w_ptr = *ie_w_ptr;
-                    ie_w_ptr++;
-                    l_w_ptr += step;
-                }
-
-                for (int in_i = 0; in_i < SC; in_i++) {
-                    *l_r_ptr = *ie_w_ptr;
-                    ie_w_ptr++;
-                    l_r_ptr += step;
-                }
-            }
-        }
-
-        if (w_bias_d) {
-            auto ie_b_ptr = getCnnLayer()->blobs["biases"]->buffer().as<const float*>();
-            auto b_ptr = static_cast<float*>(w_bias_mem->GetData());
-            for (int g = 0; g < Gb; g++) {
-                float *l_b_ptr = b_ptr + gate_map[g]*SC;
-                const float *l_ie_b_ptr = ie_b_ptr + g * SC;
-                cpu_memcpy(l_b_ptr, l_ie_b_ptr, SC * sizeof(float));
-            }
-        }
-    }
-
-    prim.reset(new mkldnn::primitive(pd));
+    verifyWeights();
+    verifyBiases();
+
+    /*
+     * Gate order
+     * ====== LSTM ======
+     * Caffe - IFOC, ONNX - IOFC
+     * IE - FICO, mkldnn - IFCO
+     *
+     * ====== GRU ======
+     * IE - URO, mkldnn - URO
+     */
+    const int gate_map_lstm[] = {1, 0, 2, 3};  // FICO -> IFCO
+    const int gate_map_gru[]  = {0, 1, 2, 3};
+    const int gate_map_rnn[]  = {0};
+    const int *gate_map;
+    const int gate_map_lstm_size = sizeof(gate_map_lstm) / sizeof(int);
+    const int gate_map_gru_size = sizeof(gate_map_gru) / sizeof(int);
+    const int gate_map_rnn_size = sizeof(gate_map_rnn) / sizeof(int);
+    if (cell_type == algorithm::vanilla_lstm) {
+        gate_map = gate_map_lstm;
+        if (G > gate_map_lstm_size) {
+            THROW_ERROR << "G isn't equal to the size of gate_map";
+        }
+    } else if (cell_type == algorithm::vanilla_gru) {
+        gate_map = gate_map_gru;
+        if (G > gate_map_gru_size) {
+            THROW_ERROR << "G isn't equal to the size of gate_map";
+        }
+    } else if (cell_type == algorithm::lbr_gru) {
+        gate_map = gate_map_gru;
+        if (G > gate_map_gru_size) {
+            THROW_ERROR << "G isn't equal to the size of gate_map";
+        }
+    } else if (cell_type == algorithm::vanilla_rnn) {
+        gate_map = gate_map_rnn;
+        if (G > gate_map_rnn_size) {
+            THROW_ERROR << "G isn't equal to the size of gate_map";
+        }
+    } else {
+        gate_map = gate_map_gru;
+        if (G > gate_map_gru_size) {
+            THROW_ERROR << "G isn't equal to the size of gate_map";
+        }
+    }
+
+    if (runtimePrecision == Precision::BF16)
+        fillWeights<bfloat16_t>(gate_map);
+    else if (runtimePrecision == Precision::FP32)
+        fillWeights<float>(gate_map);
+    else // TODO FP16 and INT8 support
+        THROW_ERROR << "Unsupported data type";
+
+    if (runtimePrecision == Precision::BF16 ||
+        runtimePrecision == Precision::FP32)
+        fillBiases<float>(gate_map);
+
+    auto pd = descs[0].createPrimitiveDescriptorIterator(getEngine());
+    prim.reset(new mkldnn::primitive(pd));
+}
+
+/*
+ * IE format:
+ *   B - [gates, out_state_size]
+ *
+ * MKLDNN format:
+ *   B - [gates, out_state_size]
+ *
+ */
+template <typename Prec>
+void MKLDNNRNN::fillBiases(const int *gate_map) {
+    if (!w_bias_d)
+        return;
+
+    auto w_bias_mem = std::make_shared<MKLDNNMemory>(getEngine());
+    w_bias_mem->Create(w_bias_d);
+    internalBlobMemory.push_back(w_bias_mem);
+
+    auto ie_b_ptr = getCnnLayer()->blobs["biases"]->buffer().as<const Prec*>();
+    auto b_ptr = static_cast<Prec*>(w_bias_mem->GetData());
+    for (int g = 0; g < Gb; g++) {
+        Prec *l_b_ptr = b_ptr + gate_map[g]*SC;
+        const Prec *l_ie_b_ptr = ie_b_ptr + g * SC;
+        cpu_memcpy(l_b_ptr, l_ie_b_ptr, SC * sizeof(Prec));
+    }
+}
+
+/*
+ * IE format:
+ *   W - [gates, out_state_size, in_data_size + in_state_size]
+ *
+ * MKLDNN format:
+ *   W - [1, 1, in_date_size, gates, out_state_size]
+ *   R - [1, 1, in_state_size, gates, out_state_size]
+ *
+ */
+template <typename Prec>
+void MKLDNNRNN::fillWeights(const int *gate_map) {
+    // create weight blobs (data and state part)
+    auto w_data_mem = std::make_shared<MKLDNNMemory>(getEngine());
+    w_data_mem->Create(w_data_d);
+    internalBlobMemory.push_back(w_data_mem);
+
+    auto w_state_mem = std::make_shared<MKLDNNMemory>(getEngine());
+    w_state_mem->Create(w_state_d);
+    internalBlobMemory.push_back(w_state_mem);
+
+    auto ie_w_ptr = getCnnLayer()->blobs["weights"]->buffer().as<const Prec*>();
+    auto w_ptr = static_cast<Prec*>(w_data_mem->GetData());
+    auto r_ptr = static_cast<Prec*>(w_state_mem->GetData());
+    const int step = SC * G;
+
+    for (int g = 0; g < G; g++) {
+        for (int out_i = 0; out_i < SC; out_i++) {
+            Prec *l_w_ptr = w_ptr + gate_map[g]*SC + out_i;
+            Prec *l_r_ptr = r_ptr + gate_map[g]*SC+ out_i;
+            for (int in_i = 0; in_i < DC; in_i++) {
+                *l_w_ptr = *ie_w_ptr;
+                ie_w_ptr++;
+                l_w_ptr += step;
+            }
+
+            for (int in_i = 0; in_i < SC; in_i++) {
+                *l_r_ptr = *ie_w_ptr;
+                ie_w_ptr++;
+                l_r_ptr += step;
+            }
+        }
+    }
+}
 
 void MKLDNNRNN::execute(mkldnn::stream strm) {
     if (!prim)
-        IE_THROW() << "No initialized primitive to execute";
+        THROW_ERROR << "No initialized primitive to execute";
 
     const auto src_data_mem = getParentEdgeAt(0)->getMemoryPtr();
     const auto dst_data_mem = getChildEdgeAt(0)->getMemoryPtr();
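For reference, bfloat16 is just the upper 16 bits of an IEEE-754 float32 (same sign and exponent, 7 mantissa bits), so the FP32 -> BF16 weight repacking that convertWeightsBlobToBF16() above hands off to cpu_convert() amounts to rounding each float and keeping its high half. A minimal standalone sketch of that idea; the helper names and the round-to-nearest-even policy are illustrative assumptions, not the plugin's exact implementation:

#include <cstddef>
#include <cstdint>
#include <cstring>
#include <vector>

// Illustrative helper (assumption): convert one float to bfloat16 by rounding
// to nearest-even and keeping the upper 16 bits of the float32 bit pattern.
static uint16_t f32_to_bf16(float v) {
    uint32_t bits;
    std::memcpy(&bits, &v, sizeof(bits));
    const uint32_t rounding = 0x7FFFu + ((bits >> 16) & 1u);  // round to nearest even
    return static_cast<uint16_t>((bits + rounding) >> 16);
}

// Repack an FP32 weight buffer into a BF16 buffer element by element.
static std::vector<uint16_t> repack_weights_bf16(const float* src, size_t count) {
    std::vector<uint16_t> dst(count);
    for (size_t i = 0; i < count; ++i)
        dst[i] = f32_to_bf16(src[i]);
    return dst;
}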


@@ -28,8 +28,19 @@ public:
 private:
     void fillCellDesc();
     void fillSeqDesc();
+    bool verifyWeightsPrecision(const InferenceEngine::Precision& layerPrec,
+                                const InferenceEngine::Precision& weightsPrec);
+    void verifyWeights();
+    void verifyBiases();
+    void convertWeightsBlobToBF16();
+    template <typename Prec>
+    void fillWeights(const int* gate_map);
+    template <typename Prec>
+    void fillBiases(const int* gate_map);
 
 private:
+    InferenceEngine::Precision runtimePrecision;
     /** Specify mode Cell or Seq. true - Cell, false - Seq */
     bool is_cell = false;
@@ -56,11 +67,14 @@ private:
     const ptrdiff_t L = 1;   /**< What is it??. Constant for mkldnn impl */
     const ptrdiff_t D = 1;   /**< Num of direction. 1 or 2 */
 
-    MKLDNNMemoryDesc in_data_d;
-    MKLDNNMemoryDesc out_data_d;
-    std::vector<MKLDNNMemoryDesc> in_states_d;
-    std::vector<MKLDNNMemoryDesc> out_states_d;
+    std::vector<MKLDNNMemoryDesc> in_data_d;
+    std::vector<MKLDNNMemoryDesc> out_data_d;
+
+    enum RNNInOutKind {
+        Layer = 0,
+        HiddenState = 1,
+        CellState = 2
+    };
 
     MKLDNNMemoryDesc w_data_d;
     MKLDNNMemoryDesc w_state_d;
@@ -69,7 +83,7 @@ private:
     // List of in/out reorders if required
     std::vector<mkldnn::reorder> exec_before;
     std::vector<mkldnn::reorder> exec_after;
-};
+
+    static const std::map<InferenceEngine::Precision, InferenceEngine::Precision> weightsByLayerPrec;
+};  // class MKLDNNRNN
 
 }  // namespace MKLDNNPlugin
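The gate_map tables introduced above exist because the IE weight blob stores LSTM gates in FICO order while oneDNN expects IFCO; fillWeights() and fillBiases() scatter IE gate g into slot gate_map[g] while copying. A small self-contained sketch of just that reordering for a [gates, out_state_size] buffer; the function name and flat layout are assumptions for illustration:

#include <cstddef>
#include <vector>

// Reorder a [gates, out_state_size] buffer between gate orders:
// destination gate index = gate_map[source gate index], as in the plugin code.
std::vector<float> reorder_gates(const std::vector<float>& src, size_t gates,
                                 size_t out_size, const int* gate_map) {
    std::vector<float> dst(src.size());
    for (size_t g = 0; g < gates; ++g)
        for (size_t o = 0; o < out_size; ++o)
            dst[static_cast<size_t>(gate_map[g]) * out_size + o] = src[g * out_size + o];
    return dst;
}

// Example: for LSTM, gate_map = {1, 0, 2, 3} maps IE's FICO layout to oneDNN's IFCO,
// i.e. IE gate 0 (F) lands in oneDNN slot 1.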


@@ -0,0 +1,135 @@
// Copyright (C) 2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include "ngraph/op/gru_cell.hpp"
#include <shared_test_classes/single_layer/gru_cell.hpp>
#include "test_utils/cpu_test_utils.hpp"
#include "transformations/op_conversions/gru_cell_decomposition.hpp"
using namespace InferenceEngine;
using namespace CPUTestUtils;
namespace CPULayerTestsDefinitions {
using GRUCellCpuSpecificParams = typename std::tuple<LayerTestsDefinitions::GRUCellParams, CPUSpecificParams, std::map<std::string, std::string>>;
class GRUCellCPUTest : public testing::WithParamInterface<GRUCellCpuSpecificParams>,
virtual public LayerTestsUtils::LayerTestsCommon,
public CPUTestsBase {
public:
static std::string getTestCaseName(const testing::TestParamInfo<GRUCellCpuSpecificParams> &obj) {
CPUSpecificParams cpuParams;
LayerTestsDefinitions::GRUCellParams basicParamsSet;
std::map<std::string, std::string> additionalConfig;
std::tie(basicParamsSet, cpuParams, additionalConfig) = obj.param;
std::ostringstream result;
result << LayerTestsDefinitions::GRUCellTest::getTestCaseName(
testing::TestParamInfo<LayerTestsDefinitions::GRUCellParams>(basicParamsSet, 0));
result << CPUTestsBase::getTestCaseName(cpuParams);
if (!additionalConfig.empty()) {
result << "_PluginConf";
for (auto &item : additionalConfig) {
if (item.second == PluginConfigParams::YES)
result << "_" << item.first << "=" << item.second;
}
}
return result.str();
}
protected:
void SetUp() {
CPUSpecificParams cpuParams;
LayerTestsDefinitions::GRUCellParams basicParamsSet;
std::map<std::string, std::string> additionalConfig;
bool should_decompose;
size_t batch;
size_t hidden_size;
size_t input_size;
std::vector<std::string> activations;
std::vector<float> activations_alpha;
std::vector<float> activations_beta;
float clip;
bool linear_before_reset;
InferenceEngine::Precision netPrecision;
std::tie(basicParamsSet, cpuParams, additionalConfig) = this->GetParam();
std::tie(inFmts, outFmts, priority, selectedType) = cpuParams;
std::tie(should_decompose, batch, hidden_size, input_size, activations, clip, linear_before_reset, netPrecision, targetDevice) = basicParamsSet;
std::vector<std::vector<size_t>> inputShapes = {
{{batch, input_size},
{batch, hidden_size},
{3 * hidden_size, input_size},
{3 * hidden_size, hidden_size},
{(linear_before_reset ? 4 : 3) * hidden_size}},
};
configuration.insert(additionalConfig.begin(), additionalConfig.end());
if (additionalConfig[PluginConfigParams::KEY_ENFORCE_BF16] == PluginConfigParams::YES) {
inPrc = outPrc = Precision::BF16;
} else {
inPrc = outPrc = netPrecision;
}
selectedType += "_";
selectedType += outPrc.name();
auto ngPrc = FuncTestUtils::PrecisionUtils::convertIE2nGraphPrc(Precision::FP32);
auto params = ngraph::builder::makeParams(ngPrc, {inputShapes[0], inputShapes[1]});
std::vector<ngraph::Shape> WRB = {inputShapes[2], inputShapes[3], inputShapes[4]};
auto gru_cell = ngraph::builder::makeGRU(
ngraph::helpers::convert2OutputVector(ngraph::helpers::castOps2Nodes(params)), WRB, hidden_size, activations, {}, {}, clip, linear_before_reset);
ngraph::ResultVector results{std::make_shared<ngraph::opset1::Result>(gru_cell->output(0))};
function = makeNgraphFunction(ngPrc, params, gru_cell, "gru_cell");
}
};
TEST_P(GRUCellCPUTest, CompareWithRefs) {
SKIP_IF_CURRENT_TEST_IS_DISABLED()
Run();
CheckPluginRelatedResults(executableNetwork, "RNNCell");
}
namespace {
/* CPU PARAMS */
std::vector<std::map<std::string, std::string>> additionalConfig
= {{{PluginConfigParams::KEY_ENFORCE_BF16, PluginConfigParams::NO}},
{{PluginConfigParams::KEY_ENFORCE_BF16, PluginConfigParams::YES}}};
CPUSpecificParams cpuParams{{nc, nc}, {nc}, {"ref_any"}, "ref_any"};
std::vector<bool> should_decompose{false};
std::vector<size_t> batch{1, 5};
std::vector<size_t> hidden_size{1, 10};
std::vector<size_t> input_size{1, 30};
// oneDNN supports only sigmoid-tanh
std::vector<std::vector<std::string>> activations = {{"sigmoid", "tanh"}};
// oneDNN supports only zero clip
std::vector<float> clip = {0.f};
std::vector<bool> linear_before_reset = {true, false};
std::vector<InferenceEngine::Precision> netPrecisions = {InferenceEngine::Precision::FP32};
INSTANTIATE_TEST_CASE_P(smoke_GRUCellCPU,
GRUCellCPUTest,
::testing::Combine(::testing::Combine(::testing::ValuesIn(should_decompose),
::testing::ValuesIn(batch),
::testing::ValuesIn(hidden_size),
::testing::ValuesIn(input_size),
::testing::ValuesIn(activations),
::testing::ValuesIn(clip),
::testing::ValuesIn(linear_before_reset),
::testing::ValuesIn(netPrecisions),
::testing::Values(CommonTestUtils::DEVICE_CPU)),
::testing::Values(cpuParams),
::testing::ValuesIn(additionalConfig)),
GRUCellCPUTest::getTestCaseName);
} // namespace
} // namespace CPULayerTestsDefinitions
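The tests above flip bf16 execution through the ENFORCE_BF16 plugin config key; the same switch applies when loading a network from application code. A minimal sketch against the 2021-era Inference Engine API (model.xml is an assumed example path, not part of this commit):

#include <inference_engine.hpp>

int main() {
    InferenceEngine::Core ie;
    // Any IR containing LSTM/GRU/RNN layers would do; the path is illustrative.
    auto network = ie.ReadNetwork("model.xml");
    // Force bf16 inference on CPU, the mode these RNN primitives now support.
    auto execNet = ie.LoadNetwork(network, "CPU",
        {{InferenceEngine::PluginConfigParams::KEY_ENFORCE_BF16,
          InferenceEngine::PluginConfigParams::YES}});
    auto request = execNet.CreateInferRequest();
    request.Infer();
    return 0;
}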


@@ -0,0 +1,202 @@
// Copyright (C) 2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include "shared_test_classes/single_layer/gru_sequence.hpp"
#include "ngraph/pass/visualize_tree.hpp"
#include "test_utils/cpu_test_utils.hpp"
#include "transformations/op_conversions/bidirectional_sequences_decomposition.hpp"
#include "transformations/op_conversions/convert_sequences_to_tensor_iterator.hpp"
using namespace InferenceEngine;
using namespace CPUTestUtils;
namespace CPULayerTestsDefinitions {
using GRUSequenceCpuSpecificParams = typename std::tuple<LayerTestsDefinitions::GRUSequenceParams, CPUSpecificParams, std::map<std::string, std::string>>;
class GRUSequenceCPUTest : public testing::WithParamInterface<GRUSequenceCpuSpecificParams>,
virtual public LayerTestsUtils::LayerTestsCommon,
public CPUTestsBase {
public:
static std::string getTestCaseName(const testing::TestParamInfo<GRUSequenceCpuSpecificParams> &obj) {
CPUSpecificParams cpuParams;
LayerTestsDefinitions::GRUSequenceParams basicParamsSet;
std::map<std::string, std::string> additionalConfig;
std::tie(basicParamsSet, cpuParams, additionalConfig) = obj.param;
std::ostringstream result;
result << LayerTestsDefinitions::GRUSequenceTest::getTestCaseName(testing::TestParamInfo<LayerTestsDefinitions::GRUSequenceParams>(basicParamsSet, 0));
result << CPUTestsBase::getTestCaseName(cpuParams);
if (!additionalConfig.empty()) {
result << "_PluginConf";
for (auto &item : additionalConfig) {
if (item.second == PluginConfigParams::YES)
result << "_" << item.first << "=" << item.second;
}
}
return result.str();
}
protected:
void SetUp() {
LayerTestsDefinitions::GRUSequenceParams basicParamsSet;
CPUSpecificParams cpuParams;
std::map<std::string, std::string> additionalConfig;
size_t seq_lenghts;
size_t batch;
size_t hidden_size;
size_t input_size = 10;
std::vector<std::string> activations;
std::vector<float> activations_alpha;
std::vector<float> activations_beta;
float clip;
bool linear_before_reset;
ngraph::op::RecurrentSequenceDirection direction;
InferenceEngine::Precision netPrecision;
std::tie(basicParamsSet, cpuParams, additionalConfig) = this->GetParam();
std::tie(inFmts, outFmts, priority, selectedType) = cpuParams;
std::tie(m_mode, seq_lenghts, batch, hidden_size, activations, clip, linear_before_reset, direction, netPrecision, targetDevice) = basicParamsSet;
size_t num_directions = direction == ngraph::op::RecurrentSequenceDirection::BIDIRECTIONAL ? 2 : 1;
std::vector<std::vector<size_t>> inputShapes = {
{{batch, seq_lenghts, input_size},
{batch, num_directions, hidden_size},
{batch},
{num_directions, 3 * hidden_size, input_size},
{num_directions, 3 * hidden_size, hidden_size},
{num_directions, (linear_before_reset ? 4 : 3) * hidden_size}},
};
configuration.insert(additionalConfig.begin(), additionalConfig.end());
if (additionalConfig[PluginConfigParams::KEY_ENFORCE_BF16] == PluginConfigParams::YES) {
inPrc = outPrc = Precision::BF16;
} else {
inPrc = outPrc = netPrecision;
}
selectedType += "_";
selectedType += outPrc.name();
m_max_seq_len = seq_lenghts;
auto ngPrc = FuncTestUtils::PrecisionUtils::convertIE2nGraphPrc(Precision::FP32);
auto params = ngraph::builder::makeParams(ngPrc, {inputShapes[0], inputShapes[1]});
if (m_mode == ngraph::helpers::SequenceTestsMode::CONVERT_TO_TI_MAX_SEQ_LEN_PARAM
|| m_mode == ngraph::helpers::SequenceTestsMode::CONVERT_TO_TI_RAND_SEQ_LEN_PARAM) {
auto seq_lengths = ngraph::builder::makeParams(ngraph::element::i64, {inputShapes[2]}).at(0);
seq_lengths->set_friendly_name("seq_lengths");
params.push_back(seq_lengths);
}
std::vector<ngraph::Shape> WRB = {inputShapes[3], inputShapes[4], inputShapes[5], inputShapes[2]};
auto gru_sequence = ngraph::builder::makeGRU(ngraph::helpers::convert2OutputVector(ngraph::helpers::castOps2Nodes(params)),
WRB,
hidden_size,
activations,
{},
{},
clip,
linear_before_reset,
true,
direction,
m_mode);
ngraph::ResultVector results{std::make_shared<ngraph::opset1::Result>(gru_sequence->output(0)),
std::make_shared<ngraph::opset1::Result>(gru_sequence->output(1))};
function = makeNgraphFunction(ngPrc, params, gru_sequence, "gru_sequence");
if (m_mode != ngraph::helpers::SequenceTestsMode::PURE_SEQ) {
ngraph::pass::Manager manager;
if (direction == ngraph::op::RecurrentSequenceDirection::BIDIRECTIONAL)
manager.register_pass<ngraph::pass::BidirectionalGRUSequenceDecomposition>();
manager.register_pass<ngraph::pass::ConvertGRUSequenceToTensorIterator>();
manager.run_passes(function);
bool ti_found = ngraph::helpers::is_tensor_iterator_exist(function);
EXPECT_EQ(ti_found, true);
} else {
bool ti_found = ngraph::helpers::is_tensor_iterator_exist(function);
EXPECT_EQ(ti_found, false);
}
}
void GenerateInputs() {
for (const auto &input : executableNetwork.GetInputsInfo()) {
const auto &info = input.second;
auto blob = GenerateInput(*info);
if (input.first == "seq_lengths") {
blob = FuncTestUtils::createAndFillBlob(info->getTensorDesc(), m_max_seq_len, 0);
}
inputs.push_back(blob);
}
}
private:
ngraph::helpers::SequenceTestsMode m_mode;
int64_t m_max_seq_len = 0;
};
TEST_P(GRUSequenceCPUTest, CompareWithRefs) {
SKIP_IF_CURRENT_TEST_IS_DISABLED()
Run();
CheckPluginRelatedResults(executableNetwork, "RNNSeq");
}
namespace {
/* CPU PARAMS */
std::vector<std::map<std::string, std::string>> additionalConfig
= {{{PluginConfigParams::KEY_ENFORCE_BF16, PluginConfigParams::NO}}, {{PluginConfigParams::KEY_ENFORCE_BF16, PluginConfigParams::YES}}};
CPUSpecificParams cpuParams{{ntc, nc}, {ntc, nc}, {"ref_any"}, "ref_any"};
CPUSpecificParams cpuParamsBatchSizeOne{{tnc, nc}, {tnc, nc}, {"ref_any"}, "ref_any"};
std::vector<ngraph::helpers::SequenceTestsMode> mode{ngraph::helpers::SequenceTestsMode::PURE_SEQ};
// output values increase rapidly without clip, so use only seq_lenghts = 2
std::vector<size_t> seq_lengths_zero_clip{2};
std::vector<size_t> batch{10};
std::vector<size_t> batch_size_one{1};
std::vector<size_t> hidden_size{1, 10};
std::vector<std::vector<std::string>> activations = {{"sigmoid", "tanh"}};
std::vector<bool> linear_before_reset = {true, false};
std::vector<float> clip{0.f};
std::vector<ngraph::op::RecurrentSequenceDirection> direction = {ngraph::op::RecurrentSequenceDirection::FORWARD};
std::vector<InferenceEngine::Precision> netPrecisions = {InferenceEngine::Precision::FP32};
INSTANTIATE_TEST_CASE_P(smoke_GRUSequenceCPU,
GRUSequenceCPUTest,
::testing::Combine(::testing::Combine(::testing::ValuesIn(mode),
::testing::ValuesIn(seq_lengths_zero_clip),
::testing::ValuesIn(batch),
::testing::ValuesIn(hidden_size),
::testing::ValuesIn(activations),
::testing::ValuesIn(clip),
::testing::ValuesIn(linear_before_reset),
::testing::ValuesIn(direction),
::testing::ValuesIn(netPrecisions),
::testing::Values(CommonTestUtils::DEVICE_CPU)),
::testing::Values(cpuParams),
::testing::ValuesIn(additionalConfig)),
GRUSequenceCPUTest::getTestCaseName);
INSTANTIATE_TEST_CASE_P(smoke_GRUSequenceCPUBatchSizeOne,
GRUSequenceCPUTest,
::testing::Combine(::testing::Combine(::testing::ValuesIn(mode),
::testing::ValuesIn(seq_lengths_zero_clip),
::testing::ValuesIn(batch_size_one),
::testing::ValuesIn(hidden_size),
::testing::ValuesIn(activations),
::testing::ValuesIn(clip),
::testing::ValuesIn(linear_before_reset),
::testing::ValuesIn(direction),
::testing::ValuesIn(netPrecisions),
::testing::Values(CommonTestUtils::DEVICE_CPU)),
::testing::Values(cpuParamsBatchSizeOne),
::testing::ValuesIn(additionalConfig)),
GRUSequenceCPUTest::getTestCaseName);
} // namespace
} // namespace CPULayerTestsDefinitions


@@ -0,0 +1,132 @@
// Copyright (C) 2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include "ngraph/op/lstm_cell.hpp"
#include <shared_test_classes/single_layer/lstm_cell.hpp>
#include "test_utils/cpu_test_utils.hpp"
#include "transformations/op_conversions/lstm_cell_decomposition.hpp"
using namespace InferenceEngine;
using namespace CPUTestUtils;
namespace CPULayerTestsDefinitions {
using LSTMCellCpuSpecificParams = typename std::tuple<LayerTestsDefinitions::LSTMCellParams, CPUSpecificParams, std::map<std::string, std::string>>;
class LSTMCellLayerCPUTest : public testing::WithParamInterface<LSTMCellCpuSpecificParams>,
virtual public LayerTestsUtils::LayerTestsCommon,
public CPUTestsBase {
public:
static std::string getTestCaseName(const testing::TestParamInfo<LSTMCellCpuSpecificParams>& obj) {
CPUSpecificParams cpuParams;
LayerTestsDefinitions::LSTMCellParams basicParamsSet;
std::map<std::string, std::string> additionalConfig;
std::tie(basicParamsSet, cpuParams, additionalConfig) = obj.param;
std::ostringstream result;
result << LayerTestsDefinitions::LSTMCellTest::getTestCaseName(testing::TestParamInfo<LayerTestsDefinitions::LSTMCellParams>(
basicParamsSet, 0));
result << CPUTestsBase::getTestCaseName(cpuParams);
if (!additionalConfig.empty()) {
result << "_PluginConf";
for (auto& item : additionalConfig) {
if (item.second == PluginConfigParams::YES)
result << "_" << item.first << "=" << item.second;
}
}
return result.str();
}
protected:
void SetUp() {
LayerTestsDefinitions::LSTMCellParams basicParamsSet;
CPUSpecificParams cpuParams;
std::map<std::string, std::string> additionalConfig;
bool should_decompose;
size_t batch;
size_t hidden_size;
size_t input_size;
std::vector<std::string> activations;
std::vector<float> activations_alpha;
std::vector<float> activations_beta;
float clip;
InferenceEngine::Precision netPrecision;
threshold = 0.05;
std::tie(basicParamsSet, cpuParams, additionalConfig) = this->GetParam();
std::tie(inFmts, outFmts, priority, selectedType) = cpuParams;
std::tie(should_decompose, batch, hidden_size, input_size, activations, clip, netPrecision, targetDevice) = basicParamsSet;
std::vector<std::vector<size_t>> inputShapes = {
{{batch, input_size}, {batch, hidden_size}, {batch, hidden_size}, {4 * hidden_size, input_size}, {4 * hidden_size, hidden_size}, {4 * hidden_size}},
};
configuration.insert(additionalConfig.begin(), additionalConfig.end());
if (additionalConfig[PluginConfigParams::KEY_ENFORCE_BF16] == PluginConfigParams::YES) {
inPrc = outPrc = Precision::BF16;
} else {
inPrc = outPrc = netPrecision;
}
selectedType += "_";
selectedType += outPrc.name();
auto ngPrc = FuncTestUtils::PrecisionUtils::convertIE2nGraphPrc(Precision::FP32);
auto params = ngraph::builder::makeParams(ngPrc, {inputShapes[0], inputShapes[1], inputShapes[2]});
std::vector<ngraph::Shape> WRB = {inputShapes[3], inputShapes[4], inputShapes[5]};
auto lstm_cell = ngraph::builder::makeLSTM(
ngraph::helpers::convert2OutputVector(ngraph::helpers::castOps2Nodes(params)), WRB, hidden_size, activations, {}, {}, clip);
ngraph::ResultVector results{std::make_shared<ngraph::opset1::Result>(lstm_cell->output(0)),
std::make_shared<ngraph::opset1::Result>(lstm_cell->output(1))};
function = makeNgraphFunction(ngPrc, params, lstm_cell, "lstm_cell");
}
};
TEST_P(LSTMCellLayerCPUTest, CompareWithRefs) {
SKIP_IF_CURRENT_TEST_IS_DISABLED()
Run();
CheckPluginRelatedResults(executableNetwork, "RNNCell");
}
namespace {
/* CPU PARAMS */
std::vector<std::map<std::string, std::string>> additionalConfig
= {{{PluginConfigParams::KEY_ENFORCE_BF16, PluginConfigParams::YES}},
{{PluginConfigParams::KEY_ENFORCE_BF16, PluginConfigParams::NO}}};
CPUSpecificParams cpuParams{{nc, nc, nc}, {nc}, {"ref_any"}, "ref_any"};
std::vector<bool> should_decompose{false};
std::vector<size_t> batch{5};
std::vector<size_t> hidden_size{1, 10};
std::vector<size_t> input_size{1, 30};
// oneDNN supports only sigmoid-tanh-tanh
std::vector<std::vector<std::string>> activations = {{"sigmoid", "tanh", "tanh"}};
// oneDNN supports only zero clip
std::vector<float> clip{0.f};
std::vector<InferenceEngine::Precision> netPrecisions = {InferenceEngine::Precision::FP32, InferenceEngine::Precision::BF16};
INSTANTIATE_TEST_CASE_P(smoke_LSTMCellCPU,
LSTMCellLayerCPUTest,
::testing::Combine(::testing::Combine(::testing::ValuesIn(should_decompose),
::testing::ValuesIn(batch),
::testing::ValuesIn(hidden_size),
::testing::ValuesIn(input_size),
::testing::ValuesIn(activations),
::testing::ValuesIn(clip),
::testing::ValuesIn(netPrecisions),
::testing::Values(CommonTestUtils::DEVICE_CPU)),
::testing::Values(cpuParams),
::testing::ValuesIn(additionalConfig)),
LSTMCellLayerCPUTest::getTestCaseName);
} // namespace
} // namespace CPULayerTestsDefinitions


@@ -0,0 +1,205 @@
// Copyright (C) 2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include "shared_test_classes/single_layer/lstm_sequence.hpp"
#include "ngraph/pass/visualize_tree.hpp"
#include "test_utils/cpu_test_utils.hpp"
#include "transformations/op_conversions/bidirectional_sequences_decomposition.hpp"
#include "transformations/op_conversions/convert_sequences_to_tensor_iterator.hpp"
using namespace InferenceEngine;
using namespace CPUTestUtils;
namespace CPULayerTestsDefinitions {
using LSTMSequenceCpuSpecificParams = typename std::tuple<LayerTestsDefinitions::LSTMSequenceParams, CPUSpecificParams, std::map<std::string, std::string>>;
class LSTMSequenceCPUTest : public testing::WithParamInterface<LSTMSequenceCpuSpecificParams>,
virtual public LayerTestsUtils::LayerTestsCommon,
public CPUTestsBase {
public:
static std::string getTestCaseName(const testing::TestParamInfo<LSTMSequenceCpuSpecificParams> &obj) {
CPUSpecificParams cpuParams;
LayerTestsDefinitions::LSTMSequenceParams basicParamsSet;
std::map<std::string, std::string> additionalConfig;
std::tie(basicParamsSet, cpuParams, additionalConfig) = obj.param;
std::ostringstream result;
result << LayerTestsDefinitions::LSTMSequenceTest::getTestCaseName(
testing::TestParamInfo<LayerTestsDefinitions::LSTMSequenceParams>(basicParamsSet, 0));
result << CPUTestsBase::getTestCaseName(cpuParams);
if (!additionalConfig.empty()) {
result << "_PluginConf";
for (auto &item : additionalConfig) {
if (item.second == PluginConfigParams::YES)
result << "_" << item.first << "=" << item.second;
}
}
return result.str();
}
protected:
void SetUp() {
LayerTestsDefinitions::LSTMSequenceParams basicParamsSet;
CPUSpecificParams cpuParams;
std::map<std::string, std::string> additionalConfig;
size_t seq_lenghts;
size_t batch;
size_t hidden_size;
size_t input_size;
std::vector<std::string> activations;
std::vector<float> activations_alpha;
std::vector<float> activations_beta;
float clip;
ngraph::op::RecurrentSequenceDirection direction;
InferenceEngine::Precision netPrecision;
std::tie(basicParamsSet, cpuParams, additionalConfig) = this->GetParam();
std::tie(inFmts, outFmts, priority, selectedType) = cpuParams;
std::tie(m_mode, seq_lenghts, batch, hidden_size, input_size, activations, clip, direction, netPrecision, targetDevice) = basicParamsSet;
size_t num_directions = direction == ngraph::op::RecurrentSequenceDirection::BIDIRECTIONAL ? 2 : 1;
m_max_seq_len = seq_lenghts;
std::vector<std::vector<size_t>> inputShapes = {
{{batch, seq_lenghts, input_size},
{batch, num_directions, hidden_size},
{batch, num_directions, hidden_size},
{batch},
{num_directions, 4 * hidden_size, input_size},
{num_directions, 4 * hidden_size, hidden_size},
{num_directions, 4 * hidden_size}},
};
configuration.insert(additionalConfig.begin(), additionalConfig.end());
if (additionalConfig[PluginConfigParams::KEY_ENFORCE_BF16] == PluginConfigParams::YES) {
inPrc = outPrc = Precision::BF16;
} else {
inPrc = outPrc = netPrecision;
}
selectedType += "_";
selectedType += outPrc.name();
auto ngPrc = FuncTestUtils::PrecisionUtils::convertIE2nGraphPrc(Precision::FP32);
auto params = ngraph::builder::makeParams(ngPrc, {inputShapes[0], inputShapes[1], inputShapes[2]});
if (m_mode == ngraph::helpers::SequenceTestsMode::CONVERT_TO_TI_MAX_SEQ_LEN_PARAM
|| m_mode == ngraph::helpers::SequenceTestsMode::CONVERT_TO_TI_RAND_SEQ_LEN_PARAM) {
auto seq_lengths = ngraph::builder::makeParams(ngraph::element::i64, {inputShapes[3]}).at(0);
seq_lengths->set_friendly_name("seq_lengths");
params.push_back(seq_lengths);
}
std::vector<ngraph::Shape> WRB = {inputShapes[4], inputShapes[5], inputShapes[6], inputShapes[3]};
auto lstm_sequence = ngraph::builder::makeLSTM(ngraph::helpers::convert2OutputVector(ngraph::helpers::castOps2Nodes(params)),
WRB,
hidden_size,
activations,
{},
{},
clip,
true,
direction,
m_mode);
ngraph::ResultVector results{std::make_shared<ngraph::opset1::Result>(lstm_sequence->output(0)),
std::make_shared<ngraph::opset1::Result>(lstm_sequence->output(1)),
std::make_shared<ngraph::opset1::Result>(lstm_sequence->output(2))};
function = makeNgraphFunction(ngPrc, params, lstm_sequence, "lstm_sequence");
if (m_mode != ngraph::helpers::SequenceTestsMode::PURE_SEQ) {
ngraph::pass::Manager manager;
if (direction == ngraph::op::RecurrentSequenceDirection::BIDIRECTIONAL)
manager.register_pass<ngraph::pass::BidirectionalLSTMSequenceDecomposition>();
manager.register_pass<ngraph::pass::ConvertLSTMSequenceToTensorIterator>();
manager.run_passes(function);
bool ti_found = ngraph::helpers::is_tensor_iterator_exist(function);
EXPECT_EQ(ti_found, true);
} else {
bool ti_found = ngraph::helpers::is_tensor_iterator_exist(function);
EXPECT_EQ(ti_found, false);
}
}
void GenerateInputs() {
for (const auto &input : executableNetwork.GetInputsInfo()) {
const auto &info = input.second;
auto blob = GenerateInput(*info);
if (input.first == "seq_lengths") {
blob = FuncTestUtils::createAndFillBlob(info->getTensorDesc(), m_max_seq_len, 0);
}
inputs.push_back(blob);
}
}
private:
ngraph::helpers::SequenceTestsMode m_mode;
int64_t m_max_seq_len = 0;
};
TEST_P(LSTMSequenceCPUTest, CompareWithRefs) {
SKIP_IF_CURRENT_TEST_IS_DISABLED()
Run();
CheckPluginRelatedResults(executableNetwork, "RNNSeq");
}
namespace {
/* CPU PARAMS */
std::vector<std::map<std::string, std::string>> additionalConfig
= {{{PluginConfigParams::KEY_ENFORCE_BF16, PluginConfigParams::NO}},
{{PluginConfigParams::KEY_ENFORCE_BF16, PluginConfigParams::YES}}};
CPUSpecificParams cpuParams{{ntc, nc, nc}, {ntc, nc, nc}, {"ref_any"}, "ref_any"};
CPUSpecificParams cpuParamsBatchSizeOne{{tnc, nc, nc}, {tnc, nc, nc}, {"ref_any"}, "ref_any"};
std::vector<ngraph::helpers::SequenceTestsMode> mode{ngraph::helpers::SequenceTestsMode::PURE_SEQ};
std::vector<size_t> seq_lengths_zero_clip{2};
std::vector<size_t> batch_size_one{1};
std::vector<size_t> batch{10};
std::vector<size_t> hidden_size{1, 10};
std::vector<size_t> input_size{10};
// oneDNN supports only the sigmoid-tanh-tanh activation combination for LSTM
std::vector<std::vector<std::string>> activations = {{"sigmoid", "tanh", "tanh"}};
// oneDNN supports only zero clip
std::vector<float> clip{0.f};
std::vector<ngraph::op::RecurrentSequenceDirection> direction = {ngraph::op::RecurrentSequenceDirection::FORWARD};
std::vector<InferenceEngine::Precision> netPrecisions = {InferenceEngine::Precision::FP32};
INSTANTIATE_TEST_CASE_P(smoke_LSTMSequenceCPU,
LSTMSequenceCPUTest,
::testing::Combine(::testing::Combine(::testing::ValuesIn(mode),
::testing::ValuesIn(seq_lengths_zero_clip),
::testing::ValuesIn(batch),
::testing::ValuesIn(hidden_size),
::testing::ValuesIn(input_size),
::testing::ValuesIn(activations),
::testing::ValuesIn(clip),
::testing::ValuesIn(direction),
::testing::ValuesIn(netPrecisions),
::testing::Values(CommonTestUtils::DEVICE_CPU)),
::testing::Values(cpuParams),
::testing::ValuesIn(additionalConfig)),
LSTMSequenceCPUTest::getTestCaseName);
INSTANTIATE_TEST_CASE_P(smoke_LSTMSequenceCPUbatchSizeOne,
LSTMSequenceCPUTest,
::testing::Combine(::testing::Combine(::testing::ValuesIn(mode),
::testing::ValuesIn(seq_lengths_zero_clip),
::testing::ValuesIn(batch_size_one),
::testing::ValuesIn(hidden_size),
::testing::ValuesIn(input_size),
::testing::ValuesIn(activations),
::testing::ValuesIn(clip),
::testing::ValuesIn(direction),
::testing::ValuesIn(netPrecisions),
::testing::Values(CommonTestUtils::DEVICE_CPU)),
::testing::Values(cpuParamsBatchSizeOne),
::testing::ValuesIn(additionalConfig)),
LSTMSequenceCPUTest::getTestCaseName);
} // namespace
} // namespace CPULayerTestsDefinitions
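The two ENFORCE_BF16 entries in additionalConfig above are the same plugin option an application would pass to force the bf16 path this commit enables. A minimal usage sketch, assuming a hypothetical IR file model.xml; only the config key/value pair is taken from the tests, the rest is standard Inference Engine boilerplate:

#include <ie_core.hpp>
#include <ie_plugin_config.hpp>

int main() {
    InferenceEngine::Core ie;
    // "model.xml" is a placeholder path, not part of the tests above.
    InferenceEngine::CNNNetwork network = ie.ReadNetwork("model.xml");
    // Same key/value pair the tests inject through 'configuration'.
    InferenceEngine::ExecutableNetwork execNet = ie.LoadNetwork(network, "CPU",
        {{InferenceEngine::PluginConfigParams::KEY_ENFORCE_BF16,
          InferenceEngine::PluginConfigParams::YES}});
    InferenceEngine::InferRequest request = execNet.CreateInferRequest();
    request.Infer();
    return 0;
}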

@@ -0,0 +1,124 @@
// Copyright (C) 2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include "ngraph/op/rnn_cell.hpp"
#include <shared_test_classes/single_layer/rnn_cell.hpp>
#include "test_utils/cpu_test_utils.hpp"
using namespace InferenceEngine;
using namespace CPUTestUtils;
namespace CPULayerTestsDefinitions {
using RNNCellCpuSpecificParams = typename std::tuple<LayerTestsDefinitions::RNNCellParams, CPUSpecificParams, std::map<std::string, std::string>>;
class RNNCellCPUTest : public testing::WithParamInterface<RNNCellCpuSpecificParams>,
virtual public LayerTestsUtils::LayerTestsCommon,
public CPUTestsBase {
public:
static std::string getTestCaseName(const testing::TestParamInfo<RNNCellCpuSpecificParams> &obj) {
CPUSpecificParams cpuParams;
LayerTestsDefinitions::RNNCellParams basicParamsSet;
std::map<std::string, std::string> additionalConfig;
std::tie(basicParamsSet, cpuParams, additionalConfig) = obj.param;
std::ostringstream result;
result << LayerTestsDefinitions::RNNCellTest::getTestCaseName(
testing::TestParamInfo<LayerTestsDefinitions::RNNCellParams>(basicParamsSet, 0));
result << CPUTestsBase::getTestCaseName(cpuParams);
if (!additionalConfig.empty()) {
result << "_PluginConf";
for (auto &item : additionalConfig) {
if (item.second == PluginConfigParams::YES)
result << "_" << item.first << "=" << item.second;
}
}
return result.str();
}
protected:
void SetUp() {
CPUSpecificParams cpuParams;
LayerTestsDefinitions::RNNCellParams basicParamsSet;
std::map<std::string, std::string> additionalConfig;
bool should_decompose;
size_t batch;
size_t hidden_size;
size_t input_size;
std::vector<std::string> activations;
std::vector<float> activations_alpha;
std::vector<float> activations_beta;
float clip;
InferenceEngine::Precision netPrecision;
std::tie(basicParamsSet, cpuParams, additionalConfig) = this->GetParam();
std::tie(inFmts, outFmts, priority, selectedType) = cpuParams;
std::tie(should_decompose, batch, hidden_size, input_size, activations, clip, netPrecision, targetDevice) = basicParamsSet;
std::vector<std::vector<size_t>> inputShapes = {{batch, input_size}, {batch, hidden_size},
{hidden_size, input_size}, {hidden_size, hidden_size}, {hidden_size}};
configuration.insert(additionalConfig.begin(), additionalConfig.end());
if (additionalConfig[PluginConfigParams::KEY_ENFORCE_BF16] == PluginConfigParams::YES) {
inPrc = outPrc = Precision::BF16;
} else {
inPrc = outPrc = netPrecision;
}
selectedType += "_";
selectedType += outPrc.name();
auto ngPrc = FuncTestUtils::PrecisionUtils::convertIE2nGraphPrc(Precision::FP32);
auto params = ngraph::builder::makeParams(ngPrc, {inputShapes[0], inputShapes[1]});
std::vector<ngraph::Shape> WRB = {inputShapes[2], inputShapes[3], inputShapes[4]};
auto rnn_cell = ngraph::builder::makeRNN(
ngraph::helpers::convert2OutputVector(ngraph::helpers::castOps2Nodes(params)),
WRB, hidden_size, activations, {}, {}, clip);
ngraph::ResultVector results{std::make_shared<ngraph::opset1::Result>(rnn_cell)};
function = makeNgraphFunction(ngPrc, params, rnn_cell, "rnn_cell");
}
};
TEST_P(RNNCellCPUTest, CompareWithRefs) {
SKIP_IF_CURRENT_TEST_IS_DISABLED()
Run();
CheckPluginRelatedResults(executableNetwork, "RNNCell");
}
namespace {
/* CPU PARAMS */
std::vector<std::map<std::string, std::string>> additionalConfig
= {{{PluginConfigParams::KEY_ENFORCE_BF16, PluginConfigParams::NO}}, {{PluginConfigParams::KEY_ENFORCE_BF16, PluginConfigParams::YES}}};
CPUSpecificParams cpuParams{{nc, nc}, {nc}, {"ref_any"}, "ref_any"};
std::vector<bool> should_decompose{false};
std::vector<size_t> batch{1, 5};
std::vector<size_t> hidden_size{1, 10};
std::vector<size_t> input_size{1, 30};
std::vector<std::vector<std::string>> activations = {{"relu"}, {"sigmoid"}, {"tanh"}};
// oneDNN supports only zero clip
std::vector<float> clip = {0.f};
std::vector<InferenceEngine::Precision> netPrecisions = {InferenceEngine::Precision::FP32};
INSTANTIATE_TEST_CASE_P(smoke_RNNCellCPU,
RNNCellCPUTest,
::testing::Combine(::testing::Combine(::testing::ValuesIn(should_decompose),
::testing::ValuesIn(batch),
::testing::ValuesIn(hidden_size),
::testing::ValuesIn(input_size),
::testing::ValuesIn(activations),
::testing::ValuesIn(clip),
::testing::ValuesIn(netPrecisions),
::testing::Values(CommonTestUtils::DEVICE_CPU)),
::testing::Values(cpuParams),
::testing::ValuesIn(additionalConfig)),
RNNCellCPUTest::getTestCaseName);
} // namespace
} // namespace CPULayerTestsDefinitions
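For orientation, the reference these RNNCell tests compare against computes one recurrent step, Ht = act(Xt * W^T + Ht-1 * R^T + B), over the shapes declared in SetUp. A naive FP32 sketch of that step (an illustration only, not the plugin kernel; tanh is shown, while the tests also cover relu and sigmoid):

#include <cmath>
#include <vector>

// Naive vanilla RNN cell step: Ht = tanh(Xt * W^T + Ht_1 * R^T + B).
// Shapes: X[batch][input_size], H[batch][hidden_size],
//         W[hidden_size][input_size], R[hidden_size][hidden_size], B[hidden_size].
std::vector<std::vector<float>> rnn_cell_ref(const std::vector<std::vector<float>>& X,
                                             const std::vector<std::vector<float>>& H,
                                             const std::vector<std::vector<float>>& W,
                                             const std::vector<std::vector<float>>& R,
                                             const std::vector<float>& B) {
    const size_t batch = X.size();
    const size_t hidden = W.size();
    std::vector<std::vector<float>> Ht(batch, std::vector<float>(hidden, 0.0f));
    for (size_t b = 0; b < batch; ++b) {
        for (size_t h = 0; h < hidden; ++h) {
            float acc = B[h];
            for (size_t i = 0; i < X[b].size(); ++i)
                acc += X[b][i] * W[h][i];   // input projection
            for (size_t j = 0; j < H[b].size(); ++j)
                acc += H[b][j] * R[h][j];   // recurrent projection
            Ht[b][h] = std::tanh(acc);      // clip is 0 in the tests, so no clamping
        }
    }
    return Ht;
}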

@@ -0,0 +1,202 @@
// Copyright (C) 2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include "shared_test_classes/single_layer/rnn_sequence.hpp"
#include "ngraph/pass/visualize_tree.hpp"
#include "test_utils/cpu_test_utils.hpp"
#include "transformations/op_conversions/bidirectional_sequences_decomposition.hpp"
#include "transformations/op_conversions/convert_sequences_to_tensor_iterator.hpp"
using namespace InferenceEngine;
using namespace CPUTestUtils;
namespace CPULayerTestsDefinitions {
using RNNSequenceCpuSpecificParams = typename std::tuple<LayerTestsDefinitions::RNNSequenceParams, CPUSpecificParams, std::map<std::string, std::string>>;
class RNNSequenceCPUTest : public testing::WithParamInterface<RNNSequenceCpuSpecificParams>,
virtual public LayerTestsUtils::LayerTestsCommon,
public CPUTestsBase {
public:
static std::string getTestCaseName(const testing::TestParamInfo<RNNSequenceCpuSpecificParams> &obj) {
CPUSpecificParams cpuParams;
LayerTestsDefinitions::RNNSequenceParams basicParamsSet;
std::map<std::string, std::string> additionalConfig;
std::tie(basicParamsSet, cpuParams, additionalConfig) = obj.param;
std::ostringstream result;
result << LayerTestsDefinitions::RNNSequenceTest::getTestCaseName(testing::TestParamInfo<LayerTestsDefinitions::RNNSequenceParams>(basicParamsSet, 0));
result << CPUTestsBase::getTestCaseName(cpuParams);
if (!additionalConfig.empty()) {
result << "_PluginConf";
for (auto &item : additionalConfig) {
if (item.second == PluginConfigParams::YES)
result << "_" << item.first << "=" << item.second;
}
}
return result.str();
}
protected:
void SetUp() {
LayerTestsDefinitions::RNNSequenceParams basicParamsSet;
CPUSpecificParams cpuParams;
std::map<std::string, std::string> additionalConfig;
size_t seq_lenghts;
size_t batch;
size_t hidden_size;
size_t input_size;
std::vector<std::string> activations;
std::vector<float> activations_alpha;
std::vector<float> activations_beta;
float clip;
ngraph::op::RecurrentSequenceDirection direction;
InferenceEngine::Precision netPrecision;
std::tie(basicParamsSet, cpuParams, additionalConfig) = this->GetParam();
std::tie(inFmts, outFmts, priority, selectedType) = cpuParams;
std::tie(m_mode, seq_lenghts, batch, hidden_size, input_size, activations, clip, direction, netPrecision, targetDevice) = basicParamsSet;
size_t num_directions = direction == ngraph::op::RecurrentSequenceDirection::BIDIRECTIONAL ? 2 : 1;
std::vector<std::vector<size_t>> inputShapes = {
{{batch, seq_lenghts, input_size},
{batch, num_directions, hidden_size},
{batch},
{num_directions, hidden_size, input_size},
{num_directions, hidden_size, hidden_size},
{num_directions, hidden_size}},
};
configuration.insert(additionalConfig.begin(), additionalConfig.end());
if (additionalConfig[PluginConfigParams::KEY_ENFORCE_BF16] == PluginConfigParams::YES) {
inPrc = outPrc = Precision::BF16;
} else {
inPrc = outPrc = netPrecision;
}
selectedType += "_";
selectedType += outPrc.name();
m_max_seq_len = seq_lenghts;
auto ngPrc = FuncTestUtils::PrecisionUtils::convertIE2nGraphPrc(Precision::FP32);
auto params = ngraph::builder::makeParams(ngPrc, {inputShapes[0], inputShapes[1]});
if (m_mode == ngraph::helpers::SequenceTestsMode::CONVERT_TO_TI_MAX_SEQ_LEN_PARAM
|| m_mode == ngraph::helpers::SequenceTestsMode::CONVERT_TO_TI_RAND_SEQ_LEN_PARAM) {
auto seq_lengths = ngraph::builder::makeParams(ngraph::element::i64, {inputShapes[2]}).at(0);
seq_lengths->set_friendly_name("seq_lengths");
params.push_back(seq_lengths);
}
std::vector<ngraph::Shape> WRB = {inputShapes[3], inputShapes[4], inputShapes[5], inputShapes[2]};
auto rnn_sequence = ngraph::builder::makeRNN(ngraph::helpers::convert2OutputVector(ngraph::helpers::castOps2Nodes(params)),
WRB,
hidden_size,
activations,
{},
{},
clip,
true,
direction,
m_mode);
ngraph::ResultVector results{std::make_shared<ngraph::opset1::Result>(rnn_sequence->output(0)),
std::make_shared<ngraph::opset1::Result>(rnn_sequence->output(1))};
function = makeNgraphFunction(ngPrc, params, rnn_sequence, "rnn_sequence");
if (m_mode != ngraph::helpers::SequenceTestsMode::PURE_SEQ) {
ngraph::pass::Manager manager;
if (direction == ngraph::op::RecurrentSequenceDirection::BIDIRECTIONAL)
manager.register_pass<ngraph::pass::BidirectionalRNNSequenceDecomposition>();
manager.register_pass<ngraph::pass::ConvertRNNSequenceToTensorIterator>();
manager.run_passes(function);
bool ti_found = ngraph::helpers::is_tensor_iterator_exist(function);
EXPECT_EQ(ti_found, true);
} else {
bool ti_found = ngraph::helpers::is_tensor_iterator_exist(function);
EXPECT_EQ(ti_found, false);
}
}
void GenerateInputs() {
for (const auto &input : executableNetwork.GetInputsInfo()) {
const auto &info = input.second;
auto blob = GenerateInput(*info);
if (input.first == "seq_lengths") {
blob = FuncTestUtils::createAndFillBlob(info->getTensorDesc(), m_max_seq_len, 0);
}
inputs.push_back(blob);
}
}
private:
ngraph::helpers::SequenceTestsMode m_mode;
int64_t m_max_seq_len = 0;
};
TEST_P(RNNSequenceCPUTest, CompareWithRefs) {
SKIP_IF_CURRENT_TEST_IS_DISABLED()
Run();
CheckPluginRelatedResults(executableNetwork, "RNNSeq");
}
namespace {
/* CPU PARAMS */
std::vector<std::map<std::string, std::string>> additionalConfig
= {{{PluginConfigParams::KEY_ENFORCE_BF16, PluginConfigParams::NO}}, {{PluginConfigParams::KEY_ENFORCE_BF16, PluginConfigParams::YES}}};
CPUSpecificParams cpuParams{{ntc, nc}, {ntc, nc}, {"ref_any"}, "ref_any"};
CPUSpecificParams cpuParamsBatchSizeOne{{tnc, nc}, {tnc, nc}, {"ref_any"}, "ref_any"};
std::vector<ngraph::helpers::SequenceTestsMode> mode{ngraph::helpers::SequenceTestsMode::PURE_SEQ};
// output values increase rapidly without clip, so use only seq_lengths = 2
std::vector<size_t> seq_lengths_zero_clip{2};
std::vector<size_t> batch{10};
std::vector<size_t> batch_size_one{1};
std::vector<size_t> hidden_size{10};
// std::vector<size_t> hidden_size{1, 10};
std::vector<size_t> input_size{10};
std::vector<std::vector<std::string>> activations = {{"relu"}, {"sigmoid"}, {"tanh"}};
// oneDNN supports only zero clip
std::vector<float> clip{0.f};
std::vector<ngraph::op::RecurrentSequenceDirection> direction{ngraph::op::RecurrentSequenceDirection::FORWARD};
std::vector<InferenceEngine::Precision> netPrecisions = {InferenceEngine::Precision::FP32};
INSTANTIATE_TEST_CASE_P(smoke_RNNSequenceCPU,
RNNSequenceCPUTest,
::testing::Combine(::testing::Combine(::testing::ValuesIn(mode),
::testing::ValuesIn(seq_lengths_zero_clip),
::testing::ValuesIn(batch),
::testing::ValuesIn(hidden_size),
::testing::ValuesIn(input_size),
::testing::ValuesIn(activations),
::testing::ValuesIn(clip),
::testing::ValuesIn(direction),
::testing::ValuesIn(netPrecisions),
::testing::Values(CommonTestUtils::DEVICE_CPU)),
::testing::Values(cpuParams),
::testing::ValuesIn(additionalConfig)),
RNNSequenceCPUTest::getTestCaseName);
INSTANTIATE_TEST_CASE_P(smoke_RNNSequenceCPUBatchSizeOne,
RNNSequenceCPUTest,
::testing::Combine(::testing::Combine(::testing::ValuesIn(mode),
::testing::ValuesIn(seq_lengths_zero_clip),
::testing::ValuesIn(batch_size_one),
::testing::ValuesIn(hidden_size),
::testing::ValuesIn(input_size),
::testing::ValuesIn(activations),
::testing::ValuesIn(clip),
::testing::ValuesIn(direction),
::testing::ValuesIn(netPrecisions),
::testing::Values(CommonTestUtils::DEVICE_CPU)),
::testing::Values(cpuParamsBatchSizeOne),
::testing::ValuesIn(additionalConfig)),
RNNSequenceCPUTest::getTestCaseName);
} // namespace
} // namespace CPULayerTestsDefinitions
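The sequence tests exercise the same cell unrolled over time in forward direction (PURE_SEQ mode). A sketch of that unrolling, reusing the rnn_cell_ref helper sketched after the RNNCell tests above (again an illustration, not the oneDNN primitive):

#include <vector>

// Declaration of the single-step helper from the RNNCell sketch above.
std::vector<std::vector<float>> rnn_cell_ref(const std::vector<std::vector<float>>& X,
                                             const std::vector<std::vector<float>>& H,
                                             const std::vector<std::vector<float>>& W,
                                             const std::vector<std::vector<float>>& R,
                                             const std::vector<float>& B);

// Forward unrolling: Y collects Ht for every step; the last Ht corresponds to Ho.
// X is laid out as [seq_len][batch][input_size], H0 as [batch][hidden_size].
std::vector<std::vector<std::vector<float>>> rnn_seq_ref(
        const std::vector<std::vector<std::vector<float>>>& X,
        std::vector<std::vector<float>> H,
        const std::vector<std::vector<float>>& W,
        const std::vector<std::vector<float>>& R,
        const std::vector<float>& B) {
    std::vector<std::vector<std::vector<float>>> Y;
    Y.reserve(X.size());
    for (const auto& Xt : X) {
        H = rnn_cell_ref(Xt, H, W, R, B);  // one time step
        Y.push_back(H);                    // per-step hidden state
    }
    return Y;                              // Y.back() is the final hidden state
}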

@@ -8,16 +8,29 @@
 namespace CPUTestUtils {
 const char *CPUTestsBase::cpu_fmt2str(cpu_memory_format_t v) {
-    if (v == nchw) return "nchw";
-    if (v == nChw8c) return "nChw8c";
-    if (v == nChw16c) return "nChw16c";
-    if (v == nhwc) return "nhwc";
-    if (v == ncdhw) return "ncdhw";
-    if (v == nCdhw8c) return "nCdhw8c";
-    if (v == nCdhw16c) return "nCdhw16c";
-    if (v == ndhwc) return "ndhwc";
-    if (v == nc) return "nc";
-    if (v == x) return "x";
+#define CASE(_fmt) do { \
+    if (v == _fmt) return #_fmt; \
+} while (0)
+    CASE(undef);
+    CASE(nchw);
+    CASE(nChw8c);
+    CASE(nChw16c);
+    CASE(nhwc);
+    CASE(ncdhw);
+    CASE(nCdhw8c);
+    CASE(nCdhw16c);
+    CASE(ndhwc);
+    CASE(nc);
+    CASE(x);
+    CASE(tnc);
+    CASE(ntc);
+    CASE(ldnc);
+    CASE(ldigo);
+    CASE(ldgoi);
+    CASE(ldio);
+    CASE(ldoi);
+    CASE(ldgo);
+#undef CASE
     assert(!"unknown fmt");
     return "undef";
 }
@@ -39,6 +52,10 @@ cpu_memory_format_t CPUTestsBase::cpu_str2fmt(const char *str) {
     CASE(acdeb);
     CASE(aBcde8b);
     CASE(aBcde16b);
+    CASE(abc);
+    CASE(bac);
+    CASE(abdc);
+    CASE(abdec);
     CASE(nchw);
     CASE(nChw8c);
     CASE(nChw16c);
@@ -49,6 +66,14 @@ cpu_memory_format_t CPUTestsBase::cpu_str2fmt(const char *str) {
     CASE(ndhwc);
     CASE(nc);
     CASE(x);
+    CASE(tnc);
+    CASE(ntc);
+    CASE(ldnc);
+    CASE(ldigo);
+    CASE(ldgoi);
+    CASE(ldio);
+    CASE(ldoi);
+    CASE(ldgo);
 #undef CASE
     assert(!"unknown memory format");
     return undef;
@@ -120,18 +145,38 @@ void CPUTestsBase::CheckPluginRelatedResults(InferenceEngine::ExecutableNetwork
             auto shape = parentNode->get_output_tensor(0).get_shape();
             auto actualInputMemoryFormat = getExecValueOutputsLayout(parentNode);
-            if (!should_be_skipped(shape, inFmts[i]))
+            if (!should_be_skipped(shape, inFmts[i])) {
                 ASSERT_EQ(inFmts[i], cpu_str2fmt(actualInputMemoryFormat.c_str()));
+            }
         }
     }
-    for (int i = 0; i < outFmts.size(); i++) {
-        const auto actualOutputMemoryFormat = getExecValue(ExecGraphInfoSerialization::OUTPUT_LAYOUTS);
-        const auto shape = node->get_output_shape(i);
-        if (!should_be_skipped(shape, outFmts[i]))
-            ASSERT_EQ(outFmts[i], cpu_str2fmt(actualOutputMemoryFormat.c_str()));
-    }
+    /* actual output formats are represented as a single string, for example 'fmt1' or 'fmt1, fmt2, fmt3'
+     * convert it to the list of formats */
+    auto getActualOutputMemoryFormats = [] (const std::string& fmtStr) -> std::vector<std::string> {
+        std::vector<std::string> result;
+        std::stringstream ss(fmtStr);
+        std::string str;
+        while (std::getline(ss, str, ',')) {
+            result.push_back(str);
+        }
+        return result;
+    };
+    auto actualOutputMemoryFormats = getActualOutputMemoryFormats(getExecValueOutputsLayout(node));
+    for (size_t i = 0; i < outFmts.size(); i++) {
+        const auto shape = node->get_output_shape(i);
+        if (should_be_skipped(shape, outFmts[i]))
+            continue;
+        ASSERT_EQ(outFmts[i], cpu_str2fmt(actualOutputMemoryFormats[i].c_str()));
+    }
     auto primType = getExecValue(ExecGraphInfoSerialization::IMPL_TYPE);
     ASSERT_EQ(selectedType, primType);
 }
 }
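A small standalone illustration of the splitting approach used by getActualOutputMemoryFormats above, with a made-up layout string rather than one taken from a real execution graph; note that std::getline performs no whitespace trimming, so the tokens are exactly the characters between commas:

#include <iostream>
#include <sstream>
#include <string>
#include <vector>

int main() {
    const std::string fmtStr = "ntc,nc,nc";  // hypothetical three-output layout string
    std::vector<std::string> fmts;
    std::stringstream ss(fmtStr);
    std::string token;
    while (std::getline(ss, token, ','))
        fmts.push_back(token);
    for (const auto& f : fmts)
        std::cout << f << '\n';              // prints ntc, nc, nc on separate lines
    return 0;
}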
@@ -197,8 +242,11 @@ std::shared_ptr<ngraph::Function>
 CPUTestsBase::makeNgraphFunction(const ngraph::element::Type &ngPrc, ngraph::ParameterVector &params,
                                  const std::shared_ptr<ngraph::Node> &lastNode, std::string name) const {
     auto newLastNode = modifyGraph(ngPrc, params, lastNode);
-    ngraph::ResultVector results = {std::make_shared<ngraph::opset1::Result>(newLastNode)};
+    ngraph::ResultVector results;
+    for (int i = 0; i < newLastNode->get_output_size(); i++)
+        results.push_back(std::make_shared<ngraph::opset1::Result>(newLastNode->output(i)));
     return std::make_shared<ngraph::Function>(results, params, name);
 }

@@ -24,6 +24,11 @@ namespace CPUTestUtils {
         acdeb,
         aBcde8b,
         aBcde16b,
+        // RNN layouts
+        abc,
+        bac,
+        abdc,
+        abdec,
         x = a,
         nc = ab,
@@ -34,7 +39,41 @@
         ncdhw = abcde,
         nCdhw8c = aBcde8b,
         nCdhw16c = aBcde16b,
-        ndhwc = acdeb
+        ndhwc = acdeb,
+        // RNN layouts
+        tnc = abc,
+        /// 3D RNN data tensor in the format (batch, seq_length, input channels).
+        ntc = bac,
+        /// 4D RNN states tensor in the format (num_layers, num_directions,
+        /// batch, state channels).
+        ldnc = abcd,
+        /// 5D RNN weights tensor in the format (num_layers, num_directions,
+        /// input_channels, num_gates, output_channels).
+        ///
+        /// - For LSTM cells, the gates order is input, forget, candidate
+        ///   and output gate.
+        /// - For GRU cells, the gates order is update, reset and output gate.
+        ldigo = abcde,
+        /// 5D RNN weights tensor in the format (num_layers, num_directions,
+        /// num_gates, output_channels, input_channels).
+        ///
+        /// - For LSTM cells, the gates order is input, forget, candidate
+        ///   and output gate.
+        /// - For GRU cells, the gates order is update, reset and output gate.
+        ldgoi = abdec,
+        /// 4D LSTM projection tensor in the format (num_layers, num_directions,
+        /// num_channels_in_hidden_state, num_channels_in_recurrent_projection).
+        ldio = abcd,
+        /// 4D LSTM projection tensor in the format (num_layers, num_directions,
+        /// num_channels_in_recurrent_projection, num_channels_in_hidden_state).
+        ldoi = abdc,
+        /// 4D RNN bias tensor in the format (num_layers, num_directions,
+        /// num_gates, output_channels).
+        ///
+        /// - For LSTM cells, the gates order is input, forget, candidate
+        ///   and output gate.
+        /// - For GRU cells, the gates order is update, reset and output gate.
+        ldgo = abcd,
     } cpu_memory_format_t;

     using CPUSpecificParams = std::tuple<
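To make the layouts above concrete: for a hypothetical single-layer, unidirectional LSTM with input_size = 10, hidden_size = 32, batch = 4 and seq_len = 7 (numbers chosen only for illustration), the corresponding dense tensor shapes would be:

    tnc   (seq_length, batch, channels)                               -> {7, 4, 10}
    ntc   (batch, seq_length, channels)                               -> {4, 7, 10}
    ldnc  (num_layers, num_directions, batch, state_channels)         -> {1, 1, 4, 32}
    ldigo (num_layers, num_directions, input_channels, gates, out_ch) -> {1, 1, 10, 4, 32}  (4 LSTM gates)
    ldgoi (num_layers, num_directions, gates, out_ch, input_channels) -> {1, 1, 4, 32, 10}
    ldgo  (num_layers, num_directions, gates, out_ch)                 -> {1, 1, 4, 32}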

@@ -4,6 +4,7 @@
 #pragma once
+#include <gtest/gtest.h>
 #include <tuple>
 #include <string>
 #include <vector>
@ -26,7 +27,7 @@ using LSTMCellParams = typename std::tuple<
        std::string>;                     // Device name

class LSTMCellTest : public testing::WithParamInterface<LSTMCellParams >,
                     virtual public LayerTestsUtils::LayerTestsCommon {
public:
    static std::string getTestCaseName(const testing::TestParamInfo<LSTMCellParams> &obj);

@@ -1 +1 @@
-Subproject commit 0813c00df7558bc9b858d3a73c725bab2ce1b1eb
+Subproject commit 462982a2f9272ad26473ec13d983b10dbd193cd3