[CPU] Nspc layout enabling in the FP32/BF16 convolutions (#5292)

Maksim Kutakov 2021-05-25 11:41:23 +03:00 committed by GitHub
parent cc810297f4
commit 617636693a
19 changed files with 907 additions and 344 deletions
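
For context: nspc here denotes the channels-last memory formats (nhwc for 4D and ndhwc for 5D tensors), as opposed to the planar ncsp (nchw/ncdhw) and the blocked nCsp8c/nCsp16c formats that appear throughout the diff. A minimal illustrative sketch, not part of the commit, of how the two layouts index the same logical element:

// Illustrative only: linear offsets of element (n, c, h, w) in a dense tensor
// of shape [N, C, H, W] for the planar nchw vs the channels-last nhwc layout.
size_t offset_nchw(size_t n, size_t c, size_t h, size_t w, size_t C, size_t H, size_t W) {
    return ((n * C + c) * H + h) * W + w;  // channels vary slowest after batch
}
size_t offset_nhwc(size_t n, size_t c, size_t h, size_t w, size_t C, size_t H, size_t W) {
    return ((n * H + h) * W + w) * C + c;  // channels vary fastest (dense per pixel)
}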

View File

@ -278,7 +278,7 @@ class TRANSFORMATIONS_API LowPrecisionTransformer : public IParamsManager, ILaye
public:
static LowPrecisionTransformations getAllTransformations(const LayerTransformation::Params& params = LayerTransformation::Params());
static bool isFunctionQuantized(const std::shared_ptr<Function>& function);
static bool isFunctionQuantized(const std::shared_ptr<const Function>& function);
LowPrecisionTransformer();
LowPrecisionTransformer(const LowPrecisionTransformations& transformations);

View File

@ -259,7 +259,7 @@ LowPrecisionTransformations LowPrecisionTransformer::getAllTransformations(const
return transformer;
}
bool LowPrecisionTransformer::isFunctionQuantized(const std::shared_ptr<Function>& function) {
bool LowPrecisionTransformer::isFunctionQuantized(const std::shared_ptr<const Function>& function) {
std::set<std::shared_ptr<Node>> handledNodes;
std::deque<std::shared_ptr<Node>> nodes;
for (auto result : function->get_results()) {
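The rest of the function body is truncated in this hunk; a hedged sketch of how such a check typically proceeds (the traversal details below are assumptions, the actual implementation may differ):

// Sketch only: walk the graph backwards from the results; the function counts
// as quantized as soon as a FakeQuantize operation is reached.
for (const auto& result : function->get_results()) {
    nodes.push_front(result);
}
while (!nodes.empty()) {
    const auto node = nodes.front();
    nodes.pop_front();
    if (handledNodes.count(node))
        continue;
    handledNodes.insert(node);
    if (ngraph::is_type<ngraph::opset1::FakeQuantize>(node))
        return true;
    for (const auto& input : node->inputs())
        nodes.push_front(input.get_source_output().get_node_shared_ptr());
}
return false;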

View File

@ -45,6 +45,7 @@
#include <ngraph/variant.hpp>
#include <ngraph/ops.hpp>
#include <transformations/utils/utils.hpp>
#include <low_precision/transformer.hpp>
/*****************************************************
* Debug capability
@ -89,6 +90,9 @@ void MKLDNNGraph::Replicate(const std::shared_ptr<const ngraph::Function> &subgr
this->_name = "subgraph";
this->reuse_io_tensors = false;
isQuantizedFlag = (config.lpTransformsMode == Config::On) &&
ngraph::pass::low_precision::LowPrecisionTransformer::isFunctionQuantized(subgraph);
// Map data object onto producer node
std::map<std::shared_ptr<ngraph::Node>, std::pair<MKLDNNNodePtr, int>> op2node;
@ -109,6 +113,10 @@ void MKLDNNGraph::Replicate(const std::shared_ptr<const ngraph::Function> &subgr
for (const auto op : subgraph->get_ordered_ops()) {
const MKLDNNNodePtr node {MKLDNNNode::factory().create(op, getEngine(), extMgr, weightsCache)};
if (isQuantized()) {
node->setQuantizedGraphFlag(true);
}
graphNodes.push_back(node);
if (op->get_type_info() == ngraph::op::v0::Parameter::type_info) {
@ -180,6 +188,9 @@ void MKLDNNGraph::Replicate(const CNNNetwork &network, const MKLDNNExtensionMana
IE_THROW() << "Function pointer inside CNNNetwork is nullptr";
}
isQuantizedFlag = (config.lpTransformsMode == Config::On) &&
ngraph::pass::low_precision::LowPrecisionTransformer::isFunctionQuantized(func);
auto orderedOps = func->get_ordered_ops();
// TODO [NM]: unordered_map is preferred from performance perspective. Needs hash for ngraph::Node
@ -202,6 +213,9 @@ void MKLDNNGraph::Replicate(const CNNNetwork &network, const MKLDNNExtensionMana
// Replicate All Nodes in topological order
for (const auto& op : orderedOps) {
const MKLDNNNodePtr node(MKLDNNNode::factory().create(op, getEngine(), extMgr, weightsCache));
if (isQuantized()) {
node->setQuantizedGraphFlag(true);
}
graphNodes.push_back(node);
if (op->get_type_info() == ngraph::op::v0::Parameter::type_info) {
@ -1162,6 +1176,10 @@ bool MKLDNNGraph::InsertNode(MKLDNNNodePtr parent, MKLDNNNodePtr child, MKLDNNNo
afterNode->getParent()->childEdges.push_back(afterNode);
child->parentEdges.push_back(afterNode);
if (isQuantized()) {
node->setQuantizedGraphFlag(true);
}
if (initNode) {
node->getSupportedDescriptors();
node->initSupportedPrimitiveDescriptors();
@ -1178,15 +1196,9 @@ bool MKLDNNGraph::InsertNode(MKLDNNNodePtr parent, MKLDNNNodePtr child, MKLDNNNo
// Set the precision of all non-const data paths to BF16
void MKLDNNGraph::EnforceBF16() {
bool isQuantizedModel = false;
for (auto& node : graphNodes) {
if (node->getType() == FakeQuantize)
isQuantizedModel = true;
}
// Floating point parts of FP32 + INT8 or FP32 + BIN mixed-precision models will be executed in BF16 precision
// only if the enforceBF16 flag was set manually, because the current performance is not good enough to enable it by default
if (implication(isQuantizedModel, config.manualEnforceBF16)) {
if (implication(isQuantized(), config.manualEnforceBF16)) {
for (auto &node : graphNodes) {
if (node->getType() != Input && node->getType() != Output) {
for (size_t i = 0; i < node->getOriginalInputsNumber(); i++) {
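
For reference, implication(a, b) above is plain logical implication (a -> b, i.e. !a || b); a minimal sketch assuming the usual definition, since the utility header itself is not part of this diff:

inline bool implication(bool cause, bool cond) {
    // !cause || cond: BF16 is enforced either when the graph is not quantized,
    // or when the user explicitly set the manual enforceBF16 flag.
    return !cause || cond;
}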

View File

@ -153,6 +153,10 @@ public:
void SortTopologically();
bool isQuantized() const {
return isQuantizedFlag;
}
protected:
void VisitNode(MKLDNNNodePtr node, std::vector<MKLDNNNodePtr>& sortedNodes);
@ -185,6 +189,8 @@ protected:
std::map<std::string, MeanImage> _meanImages;
std::string _name;
bool isQuantizedFlag = false;
static mkldnn::engine eng;
void Replicate(const InferenceEngine::CNNNetwork &network, const MKLDNNExtensionManager::Ptr& extMgr);

View File

@ -586,6 +586,10 @@ public:
return false;
}
void setQuantizedGraphFlag(bool flag) {
isInQuantizedGraph = flag;
}
protected:
bool canBePerformedAsScaleShift(const MKLDNNNode *parentNode = nullptr) const;
bool canFuseSimpleOperation(const MKLDNNNodePtr& node) const;
@ -652,6 +656,8 @@ protected:
Algorithm algorithm = Algorithm::Undefined;
bool isInQuantizedGraph = false;
friend class MKLDNNEdge;
friend class MKLDNNGraph;
friend class MKLDNNGraphOptimizer;

View File

@ -9,6 +9,7 @@
#include "mkldnn_fake_quantize_node.h"
#include "mkldnn_pooling_node.h"
#include "mkldnn_concat_node.h"
#include "cpu/x64/cpu_isa_traits.hpp"
#include <string>
#include <vector>
#include <mkldnn_types.h>
@ -234,10 +235,10 @@ void MKLDNNConvolutionNode::getSupportedDescriptors() {
: memory::format_tag::nhwc);
createDescriptor({in_candidate}, {out_candidate});
} else {
inputDataType = (getOriginalInputPrecisionAtPort(0) == Precision::BF16 && !(isGrouped && ndims == 5)) ? memory::data_type::bf16
: memory::data_type::f32;
outputDataType = (getOriginalOutputPrecisionAtPort(0) == Precision::BF16 && !(isGrouped && ndims == 5)) ? memory::data_type::bf16
: memory::data_type::f32;
inputDataType = (getOriginalInputPrecisionAtPort(0) == Precision::BF16
&& !(isDepthWise() && ndims == 5)) ? memory::data_type::bf16 : memory::data_type::f32;
outputDataType = (getOriginalOutputPrecisionAtPort(0) == Precision::BF16
&& !(isDepthWise() && ndims == 5)) ? memory::data_type::bf16 : memory::data_type::f32;
eltwisePrecision = Precision::FP32;
for (int i = 0; i < fusedWith.size(); i++) {
if (fusedWith[i]->getAlgorithm() == EltwiseAdd) {
@ -263,52 +264,40 @@ void MKLDNNConvolutionNode::getSupportedDescriptors() {
eltwisePrecision = Precision::FP32;
}
if (ndims == 4) {
if (one_of(ndims, 4, 5)) {
memory::format_tag ncsp = ndims == 4 ? memory::format_tag::nchw : memory::format_tag::ncdhw;
memory::format_tag nspc = ndims == 4 ? memory::format_tag::nhwc : memory::format_tag::ndhwc;
memory::format_tag nCsp16c = ndims == 4 ? memory::format_tag::nChw16c : memory::format_tag::nCdhw16c;
memory::format_tag nCsp8c = ndims == 4 ? memory::format_tag::nChw8c : memory::format_tag::nCdhw8c;
if (IC == 1 && groupOC == 1) {
in_candidate = MKLDNNMemoryDesc(getParentEdgeAt(0)->getDims(), inputDataType, memory::format_tag::nchw);
out_candidate = MKLDNNMemoryDesc(getChildEdgeAt(0)->getDims(), outputDataType, memory::format_tag::nchw);
in_candidate = MKLDNNMemoryDesc(getParentEdgeAt(0)->getDims(), inputDataType, ncsp);
out_candidate = MKLDNNMemoryDesc(getChildEdgeAt(0)->getDims(), outputDataType, ncsp);
createDescriptor({in_candidate}, {out_candidate});
} else if (IC == 3 || IC == 1) {
in_candidate = MKLDNNMemoryDesc(getParentEdgeAt(0)->getDims(), inputDataType, memory::format_tag::nchw);
out_candidate = MKLDNNMemoryDesc(getChildEdgeAt(0)->getDims(), outputDataType, memory::format_tag::nChw16c);
} else if (IC < 4) {
in_candidate = MKLDNNMemoryDesc(getParentEdgeAt(0)->getDims(), inputDataType, ncsp);
out_candidate = MKLDNNMemoryDesc(getChildEdgeAt(0)->getDims(), outputDataType, nCsp16c);
createDescriptor({in_candidate}, {out_candidate});
out_candidate = MKLDNNMemoryDesc(getChildEdgeAt(0)->getDims(), outputDataType, memory::format_tag::nChw8c);
out_candidate = MKLDNNMemoryDesc(getChildEdgeAt(0)->getDims(), outputDataType, nCsp8c);
createDescriptor({in_candidate}, {out_candidate});
} else {
in_candidate = MKLDNNMemoryDesc(getParentEdgeAt(0)->getDims(), inputDataType, memory::format_tag::nChw16c);
out_candidate = MKLDNNMemoryDesc(getChildEdgeAt(0)->getDims(), outputDataType, memory::format_tag::nChw16c);
in_candidate = MKLDNNMemoryDesc(getParentEdgeAt(0)->getDims(), inputDataType, nCsp16c);
out_candidate = MKLDNNMemoryDesc(getChildEdgeAt(0)->getDims(), outputDataType, nCsp16c);
createDescriptor({in_candidate}, {out_candidate});
in_candidate = MKLDNNMemoryDesc(getParentEdgeAt(0)->getDims(), inputDataType, memory::format_tag::nChw8c);
out_candidate = MKLDNNMemoryDesc(getChildEdgeAt(0)->getDims(), outputDataType, memory::format_tag::nChw8c);
in_candidate = MKLDNNMemoryDesc(getParentEdgeAt(0)->getDims(), inputDataType, nCsp8c);
out_candidate = MKLDNNMemoryDesc(getChildEdgeAt(0)->getDims(), outputDataType, nCsp8c);
createDescriptor({in_candidate}, {out_candidate});
}
in_candidate = MKLDNNMemoryDesc(getParentEdgeAt(0)->getDims(), inputDataType, memory::format_tag::nchw);
out_candidate = MKLDNNMemoryDesc(getChildEdgeAt(0)->getDims(), outputDataType, memory::format_tag::nchw);
in_candidate = MKLDNNMemoryDesc(getParentEdgeAt(0)->getDims(), inputDataType, ncsp);
out_candidate = MKLDNNMemoryDesc(getChildEdgeAt(0)->getDims(), outputDataType, ncsp);
createDescriptor({in_candidate}, {out_candidate});
} else if (ndims == 5) {
if (IC == 1 && groupOC == 1) {
in_candidate = MKLDNNMemoryDesc(getParentEdgeAt(0)->getDims(), inputDataType, memory::format_tag::ncdhw);
out_candidate = MKLDNNMemoryDesc(getChildEdgeAt(0)->getDims(), outputDataType, memory::format_tag::ncdhw);
createDescriptor({in_candidate}, {out_candidate});
} else if (IC == 3 || IC == 1) {
in_candidate = MKLDNNMemoryDesc(getParentEdgeAt(0)->getDims(), inputDataType, memory::format_tag::ncdhw);
out_candidate = MKLDNNMemoryDesc(getChildEdgeAt(0)->getDims(), outputDataType, memory::format_tag::nCdhw16c);
createDescriptor({in_candidate}, {out_candidate});
out_candidate = MKLDNNMemoryDesc(getChildEdgeAt(0)->getDims(), outputDataType, memory::format_tag::nCdhw8c);
createDescriptor({in_candidate}, {out_candidate});
} else {
in_candidate = MKLDNNMemoryDesc(getParentEdgeAt(0)->getDims(), inputDataType, memory::format_tag::nCdhw16c);
out_candidate = MKLDNNMemoryDesc(getChildEdgeAt(0)->getDims(), outputDataType, memory::format_tag::nCdhw16c);
createDescriptor({in_candidate}, {out_candidate});
in_candidate = MKLDNNMemoryDesc(getParentEdgeAt(0)->getDims(), inputDataType, memory::format_tag::nCdhw8c);
out_candidate = MKLDNNMemoryDesc(getChildEdgeAt(0)->getDims(), outputDataType, memory::format_tag::nCdhw8c);
if (inputDataType != memory::data_type::bf16 && isNspcAvailable()) {
in_candidate = MKLDNNMemoryDesc(getParentEdgeAt(0)->getDims(), inputDataType, nspc);
out_candidate = MKLDNNMemoryDesc(getChildEdgeAt(0)->getDims(), outputDataType, nspc);
createDescriptor({in_candidate}, {out_candidate});
}
in_candidate = MKLDNNMemoryDesc(getParentEdgeAt(0)->getDims(), inputDataType, memory::format_tag::ncdhw);
out_candidate = MKLDNNMemoryDesc(getChildEdgeAt(0)->getDims(), outputDataType, memory::format_tag::ncdhw);
createDescriptor({in_candidate}, {out_candidate});
}
}
}
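For orientation, a hedged summary of the descriptor candidates the rewritten branch registers, as read from the diff (the exact order in the final file may differ slightly):

// IC == 1 && groupOC == 1 : planar (ncsp) in -> planar out only
// IC < 4                  : planar in -> blocked nCsp16c out, then planar in -> nCsp8c out
// otherwise               : blocked nCsp16c and nCsp8c for both in and out;
//                           plus nspc in/out when the input is not bf16 and
//                           isNspcAvailable() holds; plus a planar/planar fallback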
@ -747,4 +736,80 @@ InferenceEngine::Precision MKLDNNConvolutionNode::getRuntimePrecision() const {
return MKLDNNExtensionUtils::getMaxPrecision(inputPrecisions);
}
bool MKLDNNConvolutionNode::isNspcAvailable() const {
using impl::cpu::x64::mayiuse;
// do not use in non-quantized networks until it is enforced externally
if (!isInQuantizedGraph) {
auto predicate = [](memory::format_tag tag) {
return one_of(tag, memory::format_tag::nwc, memory::format_tag::nhwc, memory::format_tag::ndhwc);
};
if (std::none_of(inputMemoryFormatsFilter.begin(), inputMemoryFormatsFilter.end(), predicate)) {
return false;
}
}
// A set of heuristics designed to filter out cases where the nspc convolution is not optimal
auto inpDims = getParentEdgeAt(0)->getDims().ToSizeVector();
auto outDims = getChildEdgeAt(0)->getDims().ToSizeVector();
auto ndims = inpDims.size();
if (isDepthWise()) {
// 1D-equivalent cases (the next-to-innermost spatial dim equals 1) are painfully slow
if (1 == inpDims[inpDims.size() - 2]) {
return false;
}
} else {
// it was empirically observed that nspc convolutions perform much slower than blocked ones once the number of channels exceeds a certain threshold
size_t spatialRank = ndims - 2; // two is the batch dim plus the channels dim
bool is1x1 = false;
if (!isGrouped) {
auto weightDimsReversItr = weightDims.crbegin();
auto inpDimsReversItr = inpDims.crbegin();
auto outDimsReversItr = outDims.crbegin();
auto paddingLreversItr = paddingL.crbegin();
auto paddingRreversItr = paddingR.crbegin();
is1x1 = true;
for (size_t i = 0; i < spatialRank; ++i) {
// accumulate over all spatial dims; a plain assignment would keep only the last iteration's result
is1x1 = is1x1
&& *(weightDimsReversItr++) == 1
&& *(inpDimsReversItr++) == *(outDimsReversItr++)
&& *(paddingLreversItr++) == 0
&& *(paddingRreversItr++) == 0;
}
}
// if the spatial size of the activations is 1x1, the avx512 1x1 nspc convolution pollutes the caches, so the layer after the convolution performs slowly
if (mayiuse(impl::cpu::x64::avx512_common) && is1x1) {
auto end = inpDims.rbegin();
std::advance(end, spatialRank);
if (std::all_of(inpDims.rbegin(), end, [](size_t x) { return 1 == x; })) {
return false;
}
}
unsigned thresholdNumChannels = 128u; // for avx and below
if (is1x1) {
thresholdNumChannels = 2048u;
} else if (mayiuse(impl::cpu::x64::avx512_common)) {
thresholdNumChannels = 512u;
}
size_t OC = outDims[1];
if (std::max(IC, OC) >= thresholdNumChannels) {
return false;
}
if (!mayiuse(impl::cpu::x64::avx)) {
// SSE41 nspc convolutions do not support IC and OC tails yet, and the blocked implementation will be much faster than gemm
if ((IC % 8) || (OC % 8)) {
return false;
}
}
}
return true;
}
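To make the cut-offs above concrete, a few hypothetical walk-throughs (illustrative numbers, not taken from the commit):

// avx512, non-1x1: threshold = 512;  IC = 256, OC = 512 -> max(256, 512) >= 512,
//                  so isNspcAvailable() returns false and blocked layouts are kept
// avx512, 1x1:     threshold = 2048; IC = 256, OC = 512 -> max(256, 512) < 2048,
//                  so nspc remains available
// sse41 (no avx):  IC = 20, OC = 64  -> IC % 8 != 0, nspc is rejected (no tail support yet)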
REG_MKLDNN_PRIM_FOR(MKLDNNConvolutionNode, Convolution);

View File

@ -62,6 +62,7 @@ private:
void setPostOps(mkldnn::primitive_attr &attr, bool initWeights) const;
void filterSupportedDescriptors();
bool isPossibleToSkipInitConfig(MKLDNNDescriptor &desc) const;
bool isNspcAvailable() const;
bool withBiases;
bool withSum;

View File

@ -1210,6 +1210,11 @@ void MKLDNNEltwiseNode::createPrimitive() {
size_t startOff = outOrder.size() != config.outConfs[0].desc.getDims().size() &&
outOrder[outOrder.size() - 1] != inOrder[inOrder.size() - 1] ? 1 : 0;
// WA (workaround) to handle the nspc layout with 1D tensors
if (1 == inRank) {
if (outRank > 2 && 1 == outOrder.back()) startOff = 1;
}
for (int j = 0; j < inRank; j++) {
dims_in[i][dims_in[i].size() - 1 - j - startOff] = config.inConfs[i].desc.getBlockingDesc().getBlockDims()[inRank - 1 - j];
}

View File

@ -171,10 +171,10 @@ protected:
// threshold = 0.6f; // Max in fp32 network by output: 12.0983
// 3 channels, 4 x 4 size
threshold = 20.6f; // Max in fp32 network by output: 879.077
threshold = 30.6f; // Max in fp32 network by output: 879.077
// STAGE3:
// filling of expected precision of layer execution defined by precisoin of input tensor to the primitive and reflected in
// filling of expected precision of layer execution defined by precision of input tensor to the primitive and reflected in
// performance counters
expectedPrecisions["Convolution_1"] = "BF16";
expectedPrecisions["Convolution_2"] = "BF16";

View File

@ -50,9 +50,15 @@ std::vector<std::string> disabledTestPatterns() {
R"(.*BinaryConvolutionLayerTest.*)",
R"(.*ClampLayerTest.*netPrc=(I64|I32).*)",
R"(.*ClampLayerTest.*netPrc=U64.*)",
// TODO: 42538. Unexpected application crush
// TODO: 42538. Unexpected application crash
R"(.*CoreThreadingTestsWithIterations\.smoke_LoadNetwork.t.*)",
R"(.*CoreThreadingTestsWithIterations\.smoke_LoadNetworkAccuracy.*AUTO.*)",
// TODO: 53618. BF16 gemm ncsp convolution crash
R"(.*_GroupConv.*_inPRC=BF16.*_inFmts=nc.*_primitive=jit_gemm.*)",
// TODO: 53578. fork DW bf16 convolution does not support 3d cases yet
R"(.*_DW_GroupConv.*_inPRC=BF16.*_inFmts=(ndhwc|nCdhw16c).*)",
// TODO: 56143. Enable nspc convolutions for bf16 precision
R"(.*ConvolutionLayerCPUTest.*BF16.*_inFmts=(ndhwc|nhwc).*)",
// incorrect reference implementation
R"(.*NormalizeL2LayerTest.*axes=\(\).*)",

View File

@ -3,6 +3,7 @@
//
#include "test_utils/cpu_test_utils.hpp"
#include "test_utils/convolution_params.hpp"
#include "test_utils/fusing_test_utils.hpp"
#include "shared_test_classes/base/layer_test_utils.hpp"
#include "ngraph_functions/utils/ngraph_helpers.hpp"
@ -25,7 +26,7 @@ typedef std::tuple<
class ConvolutionLayerCPUTest : public testing::WithParamInterface<convLayerCPUTestParamsSet>,
virtual public LayerTestsUtils::LayerTestsCommon, public CpuTestWithFusing {
public:
static std::string getTestCaseName(testing::TestParamInfo<convLayerCPUTestParamsSet> obj) {
static std::string getTestCaseName(const testing::TestParamInfo<convLayerCPUTestParamsSet>& obj) {
convLayerTestParamsSet basicParamsSet;
CPUSpecificParams cpuParams;
fusingSpecificParams fusingParams;
@ -124,6 +125,16 @@ protected:
TEST_P(ConvolutionLayerCPUTest, CompareWithRefs) {
SKIP_IF_CURRENT_TEST_IS_DISABLED()
// Skip tests for the sse41 convolution where IC or OC cannot be exactly divided by the block size,
// since tail processing for the sse41 nspc layout is not supported yet (see 52736).
if (!inFmts.empty() && (inFmts.front() == nhwc || inFmts.front() == ndhwc) && selectedType.find("jit_sse") != std::string::npos) {
auto inpChannels = function->get_parameters().front()->get_shape()[1];
auto outChannels = function->get_output_shape(0)[1];
if ((inpChannels % 8) || (outChannels % 8)) {
GTEST_SKIP() << "Disabled test: the sse41 convolution kernel does not support tails for the nspc layout." << std::endl;
}
}
Run();
if (isBias) {
@ -137,50 +148,41 @@ namespace {
/* COMMON PARAMS */
const std::vector<fusingSpecificParams> fusingParamsSet{
emptyFusingSpec,
// activations
// eltwise
fusingRelu,
fusingElu,
fusingSigmoid,
fusingClamp,
fusingPReluPerChannel,
fusingSwish,
fusingHSwish,
fusingMish,
fusingSoftPlus,
// other patterns
fusingReluAdd,
fusingPRelu1D,
// depthwise
fusingReluScaleShift,
// fake quantize
fusingFakeQuantizePerTensorRelu,
fusingFakeQuantizePerChannelRelu,
// sum
fusingSumEluFQ,
fusingSum,
fusingPRelu1D,
fusingAddPerChannel // bias
// bias
fusingAddPerChannel
};
const std::vector<fusingSpecificParams> fusingParamsSetBF16{
emptyFusingSpec,
// activations
// eltwise
fusingRelu,
fusingElu,
fusingSigmoid,
fusingClamp,
fusingPReluPerChannel,
fusingSwish,
// other patterns
fusingReluAdd,
// depthwise
fusingReluScaleShift,
fusingSum
// sum
fusingSum,
// bias
fusingAddPerChannel
};
const std::map<std::string, std::string> cpuEmptyPluginConfig;
const std::map<std::string, std::string> cpuBF16PluginConfig = { { PluginConfigParams::KEY_ENFORCE_BF16, PluginConfigParams::YES } };
/* ============= Convolution params (planar layout) ============= */
const SizeVector numOutChannels_Planar = { 6 };
/* ============= Convolution params (GEMM layout) ============= */
const SizeVector numOutChannels_Gemm = { 6 };
/* ============= Convolution params (blocked layout) ============= */
const SizeVector numOutChannels_Blocked = { 64 };
/* ============= Convolution params (blocked and nspc layout) ============= */
const SizeVector numOutChannels = { 64, 63 };
/* ============= Convolution params (2D) ============= */
const std::vector<SizeVector> kernels2d = { {3, 3}, {1, 1} };
@ -188,6 +190,8 @@ const std::vector<SizeVector> strides2d = { {1, 1}, {2, 2} };
const std::vector<std::vector<ptrdiff_t>> padBegins2d = { {0, 0}, {1, 1} };
const std::vector<std::vector<ptrdiff_t>> padEnds2d = { {0, 0} };
const std::vector<SizeVector> dilations2d = { {1, 1}, {2, 2} };
const std::vector<SizeVector> inputShapes2d = { {1, 64, 7, 7}, {1, 67, 7, 7} };
const std::vector<SizeVector> inputShapesPlain2Blocked2d = { {1, 1, 7, 7}, {1, 2, 7, 7}, {1, 3, 7, 7} };
/* ============= Convolution params (3D) ============= */
const std::vector<SizeVector> kernels3d = { {3, 3, 3}, {1, 1, 1} };
@ -195,28 +199,31 @@ const std::vector<SizeVector> strides3d = { {1, 1, 1}, {2, 2, 2} };
const std::vector<std::vector<ptrdiff_t>> padBegins3d = { {0, 0, 0}, {1, 1, 1} };
const std::vector<std::vector<ptrdiff_t>> padEnds3d = { {0, 0, 0} };
const std::vector<SizeVector> dilations3d = { {1, 1, 1}, {2, 2, 2} };
const std::vector<SizeVector> inputShapes3d = { {1, 64, 7, 7, 7}, {1, 67, 7, 7, 7} };
const std::vector<SizeVector> inputShapesPlain2Blocked3d = { {1, 1, 7, 7, 7}, {1, 2, 7, 7, 7}, {1, 3, 7, 7, 7} };
/* ============= */
/* INSTANCES */
/* ============= Convolution (Planar 2D) ============= */
const auto convParams_ExplicitPadding_Planar_2D = ::testing::Combine(
/* ============= Convolution (Gemm 2D) ============= */
const auto convParams_ExplicitPadding_GEMM_2D = ::testing::Combine(
::testing::ValuesIn(kernels2d),
::testing::ValuesIn(strides2d),
::testing::ValuesIn(padBegins2d),
::testing::ValuesIn(padEnds2d),
::testing::ValuesIn(dilations2d),
::testing::ValuesIn(numOutChannels_Planar),
::testing::ValuesIn(numOutChannels_Gemm),
::testing::Values(ngraph::op::PadType::EXPLICIT)
);
const std::vector<CPUSpecificParams> CPUParams_Planar_2D = {
conv_gemm_2D
const std::vector<CPUSpecificParams> CPUParams_GEMM_2D = {
conv_gemm_2D,
conv_gemm_2D_nspc
};
INSTANTIATE_TEST_CASE_P(smoke_Conv_2D_Planar_FP32, ConvolutionLayerCPUTest,
INSTANTIATE_TEST_CASE_P(smoke_Conv_2D_GEMM_FP32, ConvolutionLayerCPUTest,
::testing::Combine(
::testing::Combine(
convParams_ExplicitPadding_Planar_2D,
convParams_ExplicitPadding_GEMM_2D,
::testing::Values(Precision::FP32),
::testing::Values(Precision::UNSPECIFIED),
::testing::Values(Precision::UNSPECIFIED),
@ -224,15 +231,15 @@ INSTANTIATE_TEST_CASE_P(smoke_Conv_2D_Planar_FP32, ConvolutionLayerCPUTest,
::testing::Values(Layout::ANY),
::testing::Values(std::vector<size_t >({ 2, 12, 7, 7 })),
::testing::Values(CommonTestUtils::DEVICE_CPU)),
::testing::ValuesIn(filterCPUInfoForDevice(CPUParams_Planar_2D)),
::testing::ValuesIn(filterCPUInfoForDevice(CPUParams_GEMM_2D)),
::testing::ValuesIn(fusingParamsSet),
::testing::Values(cpuEmptyPluginConfig)),
ConvolutionLayerCPUTest::getTestCaseName);
INSTANTIATE_TEST_CASE_P(smoke_Conv_2D_Planar_BF16, ConvolutionLayerCPUTest,
INSTANTIATE_TEST_CASE_P(smoke_Conv_2D_GEMM_BF16, ConvolutionLayerCPUTest,
::testing::Combine(
::testing::Combine(
convParams_ExplicitPadding_Planar_2D,
convParams_ExplicitPadding_GEMM_2D,
::testing::Values(Precision::FP32),
::testing::Values(Precision::BF16),
::testing::Values(Precision::BF16),
@ -240,15 +247,15 @@ INSTANTIATE_TEST_CASE_P(smoke_Conv_2D_Planar_BF16, ConvolutionLayerCPUTest,
::testing::Values(Layout::ANY),
::testing::Values(std::vector<size_t >({ 2, 12, 7, 7 })),
::testing::Values(CommonTestUtils::DEVICE_CPU)),
::testing::ValuesIn(filterCPUInfoForDevice(CPUParams_Planar_2D)),
::testing::ValuesIn(filterCPUInfoForDevice(CPUParams_GEMM_2D)),
::testing::ValuesIn(fusingParamsSetBF16),
::testing::Values(cpuBF16PluginConfig)),
ConvolutionLayerCPUTest::getTestCaseName);
INSTANTIATE_TEST_CASE_P(smoke_Conv_2D_Planar_I8, ConvolutionLayerCPUTest,
INSTANTIATE_TEST_CASE_P(smoke_Conv_2D_GEMM_I8, ConvolutionLayerCPUTest,
::testing::Combine(
::testing::Combine(
convParams_ExplicitPadding_Planar_2D,
convParams_ExplicitPadding_GEMM_2D,
::testing::Values(Precision::FP32),
::testing::Values(Precision::I8),
::testing::Values(Precision::UNSPECIFIED),
@ -256,30 +263,31 @@ INSTANTIATE_TEST_CASE_P(smoke_Conv_2D_Planar_I8, ConvolutionLayerCPUTest,
::testing::Values(Layout::ANY),
::testing::Values(std::vector<size_t >({ 2, 12, 7, 7 })),
::testing::Values(CommonTestUtils::DEVICE_CPU)),
::testing::ValuesIn(filterCPUInfoForDevice(CPUParams_Planar_2D)),
::testing::ValuesIn(filterCPUInfoForDevice(CPUParams_GEMM_2D)),
::testing::Values(fusingSum),
::testing::Values(cpuEmptyPluginConfig)),
ConvolutionLayerCPUTest::getTestCaseName);
/* ============= GroupConvolution (Planar 3D) ============= */
const auto convParams_ExplicitPadding_Planar_3D = ::testing::Combine(
/* ============= Convolution (GEMM 3D) ============= */
const auto convParams_ExplicitPadding_GEMM_3D = ::testing::Combine(
::testing::ValuesIn(kernels3d),
::testing::ValuesIn(strides3d),
::testing::ValuesIn(padBegins3d),
::testing::ValuesIn(padEnds3d),
::testing::ValuesIn(dilations3d),
::testing::ValuesIn(numOutChannels_Planar),
::testing::ValuesIn(numOutChannels_Gemm),
::testing::Values(ngraph::op::PadType::EXPLICIT)
);
const std::vector<CPUSpecificParams> CPUParams_Planar_3D = {
conv_gemm_3D
const std::vector<CPUSpecificParams> CPUParams_GEMM_3D = {
conv_gemm_3D,
conv_gemm_3D_nspc
};
INSTANTIATE_TEST_CASE_P(smoke_Conv_3D_Planar_FP32, ConvolutionLayerCPUTest,
INSTANTIATE_TEST_CASE_P(smoke_Conv_3D_GEMM_FP32, ConvolutionLayerCPUTest,
::testing::Combine(
::testing::Combine(
convParams_ExplicitPadding_Planar_3D,
convParams_ExplicitPadding_GEMM_3D,
::testing::Values(Precision::FP32),
::testing::Values(Precision::UNSPECIFIED),
::testing::Values(Precision::UNSPECIFIED),
@ -287,15 +295,15 @@ INSTANTIATE_TEST_CASE_P(smoke_Conv_3D_Planar_FP32, ConvolutionLayerCPUTest,
::testing::Values(Layout::ANY),
::testing::Values(std::vector<size_t >({ 2, 12, 7, 7, 7 })),
::testing::Values(CommonTestUtils::DEVICE_CPU)),
::testing::ValuesIn(filterCPUInfoForDevice(CPUParams_Planar_3D)),
::testing::ValuesIn(filterCPUInfoForDevice(CPUParams_GEMM_3D)),
::testing::ValuesIn(fusingParamsSet),
::testing::Values(cpuEmptyPluginConfig)),
ConvolutionLayerCPUTest::getTestCaseName);
INSTANTIATE_TEST_CASE_P(smoke_Conv_3D_Planar_BF16, ConvolutionLayerCPUTest,
INSTANTIATE_TEST_CASE_P(smoke_Conv_3D_GEMM_BF16, ConvolutionLayerCPUTest,
::testing::Combine(
::testing::Combine(
convParams_ExplicitPadding_Planar_3D,
convParams_ExplicitPadding_GEMM_3D,
::testing::Values(Precision::FP32),
::testing::Values(Precision::BF16),
::testing::Values(Precision::BF16),
@ -303,15 +311,15 @@ INSTANTIATE_TEST_CASE_P(smoke_Conv_3D_Planar_BF16, ConvolutionLayerCPUTest,
::testing::Values(Layout::ANY),
::testing::Values(std::vector<size_t >({ 2, 12, 7, 7, 7 })),
::testing::Values(CommonTestUtils::DEVICE_CPU)),
::testing::ValuesIn(filterCPUInfoForDevice(CPUParams_Planar_3D)),
::testing::ValuesIn(filterCPUInfoForDevice(CPUParams_GEMM_3D)),
::testing::ValuesIn(fusingParamsSetBF16),
::testing::Values(cpuBF16PluginConfig)),
ConvolutionLayerCPUTest::getTestCaseName);
INSTANTIATE_TEST_CASE_P(smoke_Conv_3D_Planar_I8, ConvolutionLayerCPUTest,
INSTANTIATE_TEST_CASE_P(smoke_Conv_3D_GEMM_I8, ConvolutionLayerCPUTest,
::testing::Combine(
::testing::Combine(
convParams_ExplicitPadding_Planar_3D,
convParams_ExplicitPadding_GEMM_3D,
::testing::Values(Precision::FP32),
::testing::Values(Precision::I8),
::testing::Values(Precision::UNSPECIFIED),
@ -319,141 +327,221 @@ INSTANTIATE_TEST_CASE_P(smoke_Conv_3D_Planar_I8, ConvolutionLayerCPUTest,
::testing::Values(Layout::ANY),
::testing::Values(std::vector<size_t >({ 2, 12, 7, 7, 7 })),
::testing::Values(CommonTestUtils::DEVICE_CPU)),
::testing::ValuesIn(filterCPUInfoForDevice(CPUParams_Planar_3D)),
::testing::ValuesIn(filterCPUInfoForDevice(CPUParams_GEMM_3D)),
::testing::Values(fusingSum),
::testing::Values(cpuEmptyPluginConfig)),
ConvolutionLayerCPUTest::getTestCaseName);
/* ============= GroupConvolution (Blocked 2D) ============= */
const auto convParams_ExplicitPadding_Blocked_2D = ::testing::Combine(
/* ============= Convolution (2D) ============= */
const auto convParams_ExplicitPadding_2D = ::testing::Combine(
::testing::ValuesIn(kernels2d),
::testing::ValuesIn(strides2d),
::testing::ValuesIn(padBegins2d),
::testing::ValuesIn(padEnds2d),
::testing::ValuesIn(dilations2d),
::testing::ValuesIn(numOutChannels_Blocked),
::testing::ValuesIn(numOutChannels),
::testing::Values(ngraph::op::PadType::EXPLICIT)
);
const std::vector<CPUSpecificParams> CPUParams_Blocked_2D = {
const std::vector<CPUSpecificParams> CPUParams_2D = {
conv_sse42_2D,
conv_avx2_2D,
conv_avx512_2D
conv_avx512_2D,
conv_sse42_2D_nspc,
conv_avx2_2D_nspc,
conv_avx512_2D_nspc
};
INSTANTIATE_TEST_CASE_P(smoke_Conv_2D_Blocked_FP32, ConvolutionLayerCPUTest,
INSTANTIATE_TEST_CASE_P(smoke_Conv_2D_FP32, ConvolutionLayerCPUTest,
::testing::Combine(
::testing::Combine(
convParams_ExplicitPadding_Blocked_2D,
convParams_ExplicitPadding_2D,
::testing::Values(Precision::FP32),
::testing::Values(Precision::UNSPECIFIED),
::testing::Values(Precision::UNSPECIFIED),
::testing::Values(Layout::ANY),
::testing::Values(Layout::ANY),
::testing::Values(std::vector<size_t >({ 2, 64, 7, 7 })),
::testing::ValuesIn(inputShapes2d),
::testing::Values(CommonTestUtils::DEVICE_CPU)),
::testing::ValuesIn(filterCPUInfoForDevice(CPUParams_Blocked_2D)),
::testing::ValuesIn(filterCPUInfoForDevice(CPUParams_2D)),
::testing::ValuesIn(fusingParamsSet),
::testing::Values(cpuEmptyPluginConfig)),
ConvolutionLayerCPUTest::getTestCaseName);
INSTANTIATE_TEST_CASE_P(smoke_Conv_2D_Blocked_BF16, ConvolutionLayerCPUTest,
INSTANTIATE_TEST_CASE_P(smoke_Conv_2D_BF16, ConvolutionLayerCPUTest,
::testing::Combine(
::testing::Combine(
convParams_ExplicitPadding_Blocked_2D,
convParams_ExplicitPadding_2D,
::testing::Values(Precision::FP32),
::testing::Values(Precision::BF16),
::testing::Values(Precision::BF16),
::testing::Values(Layout::ANY),
::testing::Values(Layout::ANY),
::testing::Values(std::vector<size_t >({ 2, 64, 7, 7 })),
::testing::ValuesIn(inputShapes2d),
::testing::Values(CommonTestUtils::DEVICE_CPU)),
::testing::ValuesIn(filterCPUInfoForDevice({conv_avx512_2D})),
::testing::ValuesIn(filterCPUInfoForDevice({conv_avx512_2D, conv_avx512_2D_nspc})),
::testing::ValuesIn(fusingParamsSetBF16),
::testing::Values(cpuBF16PluginConfig)),
ConvolutionLayerCPUTest::getTestCaseName);
INSTANTIATE_TEST_CASE_P(smoke_Conv_2D_Blocked_I8, ConvolutionLayerCPUTest,
INSTANTIATE_TEST_CASE_P(smoke_Conv_2D_I8, ConvolutionLayerCPUTest,
::testing::Combine(
::testing::Combine(
convParams_ExplicitPadding_Blocked_2D,
convParams_ExplicitPadding_2D,
::testing::Values(Precision::FP32),
::testing::Values(Precision::I8),
::testing::Values(Precision::UNSPECIFIED),
::testing::Values(Layout::ANY),
::testing::Values(Layout::ANY),
::testing::Values(std::vector<size_t >({ 2, 64, 7, 7 })),
::testing::ValuesIn(inputShapes2d),
::testing::Values(CommonTestUtils::DEVICE_CPU)),
::testing::ValuesIn(filterCPUInfoForDevice(CPUParams_Blocked_2D)),
::testing::ValuesIn(filterCPUInfoForDevice(CPUParams_2D)),
::testing::Values(fusingSum),
::testing::Values(cpuEmptyPluginConfig)),
ConvolutionLayerCPUTest::getTestCaseName);
/* ============= GroupConvolution (Blocked 3D) ============= */
const auto convParams_ExplicitPadding_Blocked_3D = ::testing::Combine(
const std::vector<CPUSpecificParams> CPUParams_2D_plain_to_blocked = {
conv_sse42_plain_to_blocked_2D,
conv_avx2_plain_to_blocked_2D,
conv_avx512_plain_to_blocked_2D,
};
INSTANTIATE_TEST_CASE_P(smoke_Conv_PlainToBlocked_2D_FP32, ConvolutionLayerCPUTest,
::testing::Combine(
::testing::Combine(
convParams_ExplicitPadding_2D,
::testing::Values(Precision::FP32),
::testing::Values(Precision::UNSPECIFIED),
::testing::Values(Precision::UNSPECIFIED),
::testing::Values(Layout::ANY),
::testing::Values(Layout::ANY),
::testing::ValuesIn(inputShapesPlain2Blocked2d),
::testing::Values(CommonTestUtils::DEVICE_CPU)),
::testing::ValuesIn(filterCPUInfoForDevice(CPUParams_2D_plain_to_blocked)),
::testing::Values(emptyFusingSpec),
::testing::Values(cpuEmptyPluginConfig)),
ConvolutionLayerCPUTest::getTestCaseName);
INSTANTIATE_TEST_CASE_P(smoke_Conv_PlainToBlocked_2D_BF16, ConvolutionLayerCPUTest,
::testing::Combine(
::testing::Combine(
convParams_ExplicitPadding_2D,
::testing::Values(Precision::FP32),
::testing::Values(Precision::BF16),
::testing::Values(Precision::BF16, Precision::FP32),
::testing::Values(Layout::ANY),
::testing::Values(Layout::ANY),
::testing::ValuesIn(inputShapesPlain2Blocked2d),
::testing::Values(CommonTestUtils::DEVICE_CPU)),
::testing::ValuesIn(filterCPUInfoForDevice({conv_avx512_plain_to_blocked_2D})),
::testing::Values(emptyFusingSpec),
::testing::Values(cpuEmptyPluginConfig)),
ConvolutionLayerCPUTest::getTestCaseName);
/* ============= Convolution (3D) ============= */
const auto convParams_ExplicitPadding_3D = ::testing::Combine(
::testing::ValuesIn(kernels3d),
::testing::ValuesIn(strides3d),
::testing::ValuesIn(padBegins3d),
::testing::ValuesIn(padEnds3d),
::testing::ValuesIn(dilations3d),
::testing::ValuesIn(numOutChannels_Blocked),
::testing::ValuesIn(numOutChannels),
::testing::Values(ngraph::op::PadType::EXPLICIT)
);
const std::vector<CPUSpecificParams> CPUParams_Blocked_3D = {
const std::vector<CPUSpecificParams> CPUParams_3D = {
// conv_sse42_3D, // jit_sse42 does not support 3D
conv_avx2_3D,
conv_avx512_3D
conv_avx512_3D,
conv_avx2_3D_nspc,
conv_avx512_3D_nspc
};
INSTANTIATE_TEST_CASE_P(smoke_Conv_3D_Blocked_FP32, ConvolutionLayerCPUTest,
INSTANTIATE_TEST_CASE_P(smoke_Conv_3D_FP32, ConvolutionLayerCPUTest,
::testing::Combine(
::testing::Combine(
convParams_ExplicitPadding_Blocked_3D,
convParams_ExplicitPadding_3D,
::testing::Values(Precision::FP32),
::testing::Values(Precision::UNSPECIFIED),
::testing::Values(Precision::UNSPECIFIED),
::testing::Values(Layout::ANY),
::testing::Values(Layout::ANY),
::testing::Values(std::vector<size_t >({ 2, 64, 7, 7, 7 })),
::testing::ValuesIn(inputShapes3d),
::testing::Values(CommonTestUtils::DEVICE_CPU)),
::testing::ValuesIn(filterCPUInfoForDevice(CPUParams_Blocked_3D)),
::testing::ValuesIn(filterCPUInfoForDevice(CPUParams_3D)),
::testing::ValuesIn(fusingParamsSet),
::testing::Values(cpuEmptyPluginConfig)),
ConvolutionLayerCPUTest::getTestCaseName);
INSTANTIATE_TEST_CASE_P(smoke_Conv_3D_Blocked_BF16, ConvolutionLayerCPUTest,
INSTANTIATE_TEST_CASE_P(smoke_Conv_3D_BF16, ConvolutionLayerCPUTest,
::testing::Combine(
::testing::Combine(
convParams_ExplicitPadding_Blocked_3D,
convParams_ExplicitPadding_3D,
::testing::Values(Precision::FP32),
::testing::Values(Precision::BF16),
::testing::Values(Precision::BF16),
::testing::Values(Layout::ANY),
::testing::Values(Layout::ANY),
::testing::Values(std::vector<size_t >({ 2, 64, 7, 7, 7 })),
::testing::ValuesIn(inputShapes3d),
::testing::Values(CommonTestUtils::DEVICE_CPU)),
::testing::ValuesIn(filterCPUInfoForDevice({conv_avx512_3D})),
::testing::ValuesIn(filterCPUInfoForDevice({conv_avx512_3D, conv_avx512_3D_nspc})),
::testing::ValuesIn(fusingParamsSetBF16),
::testing::Values(cpuBF16PluginConfig)),
ConvolutionLayerCPUTest::getTestCaseName);
INSTANTIATE_TEST_CASE_P(smoke_Conv_3D_Blocked_I8, ConvolutionLayerCPUTest,
INSTANTIATE_TEST_CASE_P(smoke_Conv_3D_I8, ConvolutionLayerCPUTest,
::testing::Combine(
::testing::Combine(
convParams_ExplicitPadding_Blocked_3D,
convParams_ExplicitPadding_3D,
::testing::Values(Precision::FP32),
::testing::Values(Precision::I8),
::testing::Values(Precision::UNSPECIFIED),
::testing::Values(Layout::ANY),
::testing::Values(Layout::ANY),
::testing::Values(std::vector<size_t >({ 2, 64, 7, 7, 7 })),
::testing::ValuesIn(inputShapes3d),
::testing::Values(CommonTestUtils::DEVICE_CPU)),
::testing::ValuesIn(filterCPUInfoForDevice(CPUParams_Blocked_3D)),
::testing::ValuesIn(filterCPUInfoForDevice(CPUParams_3D)),
::testing::Values(fusingSum),
::testing::Values(cpuEmptyPluginConfig)),
ConvolutionLayerCPUTest::getTestCaseName);
const std::vector<CPUSpecificParams> CPUParams_3D_plain_to_blocked = {
conv_avx2_plain_to_blocked_3D,
conv_avx512_plain_to_blocked_3D,
};
INSTANTIATE_TEST_CASE_P(smoke_Conv_PlainToBlocked_3D_FP32, ConvolutionLayerCPUTest,
::testing::Combine(
::testing::Combine(
convParams_ExplicitPadding_3D,
::testing::Values(Precision::FP32),
::testing::Values(Precision::UNSPECIFIED),
::testing::Values(Precision::UNSPECIFIED),
::testing::Values(Layout::ANY),
::testing::Values(Layout::ANY),
::testing::ValuesIn(inputShapesPlain2Blocked3d),
::testing::Values(CommonTestUtils::DEVICE_CPU)),
::testing::ValuesIn(filterCPUInfoForDevice(CPUParams_3D_plain_to_blocked)),
::testing::Values(emptyFusingSpec),
::testing::Values(cpuEmptyPluginConfig)),
ConvolutionLayerCPUTest::getTestCaseName);
INSTANTIATE_TEST_CASE_P(smoke_Conv_PlainToBlocked_3D_BF16, ConvolutionLayerCPUTest,
::testing::Combine(
::testing::Combine(
convParams_ExplicitPadding_3D,
::testing::Values(Precision::FP32),
::testing::Values(Precision::BF16),
::testing::Values(Precision::BF16, Precision::FP32),
::testing::Values(Layout::ANY),
::testing::Values(Layout::ANY),
::testing::ValuesIn(inputShapesPlain2Blocked3d),
::testing::Values(CommonTestUtils::DEVICE_CPU)),
::testing::ValuesIn(filterCPUInfoForDevice({conv_avx512_plain_to_blocked_3D})),
::testing::Values(emptyFusingSpec),
::testing::Values(cpuEmptyPluginConfig)),
ConvolutionLayerCPUTest::getTestCaseName);
/* ============= Kernel_1x1 (2D) ============= */
const auto convParams_ExplicitPadding_1x1_2D = ::testing::Combine(
@ -462,14 +550,17 @@ const auto convParams_ExplicitPadding_1x1_2D = ::testing::Combine(
::testing::Values(std::vector<ptrdiff_t>({0, 0})),
::testing::Values(std::vector<ptrdiff_t>({0, 0})),
::testing::Values(SizeVector({1, 1})),
::testing::ValuesIn(numOutChannels_Blocked),
::testing::Values(63),
::testing::Values(ngraph::op::PadType::EXPLICIT)
);
const std::vector<CPUSpecificParams> CPUParams_1x1_2D = {
conv_sse42_2D_1x1,
conv_avx2_2D_1x1,
conv_avx512_2D_1x1
conv_avx512_2D_1x1,
conv_sse42_2D_1x1_nspc,
conv_avx2_2D_1x1_nspc,
conv_avx512_2D_1x1_nspc
};
INSTANTIATE_TEST_CASE_P(smoke_Conv_2D_1x1_FP32, ConvolutionLayerCPUTest,
@ -481,7 +572,7 @@ INSTANTIATE_TEST_CASE_P(smoke_Conv_2D_1x1_FP32, ConvolutionLayerCPUTest,
::testing::Values(Precision::UNSPECIFIED),
::testing::Values(Layout::ANY),
::testing::Values(Layout::ANY),
::testing::Values(std::vector<size_t >({ 2, 64, 7, 7 })),
::testing::ValuesIn(inputShapes2d),
::testing::Values(CommonTestUtils::DEVICE_CPU)),
::testing::ValuesIn(filterCPUInfoForDevice(CPUParams_1x1_2D)),
::testing::ValuesIn(fusingParamsSet),
@ -497,9 +588,9 @@ INSTANTIATE_TEST_CASE_P(smoke_Conv_2D_1x1_BF16, ConvolutionLayerCPUTest,
::testing::Values(Precision::BF16),
::testing::Values(Layout::ANY),
::testing::Values(Layout::ANY),
::testing::Values(std::vector<size_t >({ 2, 64, 7, 7 })),
::testing::ValuesIn(inputShapes2d),
::testing::Values(CommonTestUtils::DEVICE_CPU)),
::testing::ValuesIn(filterCPUInfoForDevice({conv_avx512_2D_1x1})),
::testing::ValuesIn(filterCPUInfoForDevice({conv_avx512_2D_1x1, conv_avx512_2D_1x1_nspc})),
::testing::ValuesIn(fusingParamsSetBF16),
::testing::Values(cpuBF16PluginConfig)),
ConvolutionLayerCPUTest::getTestCaseName);
@ -513,7 +604,7 @@ INSTANTIATE_TEST_CASE_P(smoke_Conv_2D_1x1_I8, ConvolutionLayerCPUTest,
::testing::Values(Precision::UNSPECIFIED),
::testing::Values(Layout::ANY),
::testing::Values(Layout::ANY),
::testing::Values(std::vector<size_t >({ 2, 64, 7, 7 })),
::testing::ValuesIn(inputShapes2d),
::testing::Values(CommonTestUtils::DEVICE_CPU)),
::testing::ValuesIn(filterCPUInfoForDevice(CPUParams_1x1_2D)),
::testing::Values(fusingSum),
@ -534,7 +625,7 @@ const auto convParams_1D = ::testing::Combine(
::testing::ValuesIn(padBegins1d),
::testing::ValuesIn(padEnds1d),
::testing::ValuesIn(dilations1d),
::testing::ValuesIn(numOutChannels_Blocked),
::testing::ValuesIn(numOutChannels),
::testing::Values(ngraph::op::PadType::EXPLICIT)
);
@ -560,7 +651,75 @@ INSTANTIATE_TEST_CASE_P(smoke_Conv_1D, ConvolutionLayerCPUTest,
::testing::Values(cpuEmptyPluginConfig)),
ConvolutionLayerCPUTest::getTestCaseName);
/* ========= */
/* ============= Jit Planar ============= */
/* ============= Convolution planar params (2D) ============= */
const std::vector<CPUSpecificParams> CPUParams_Jit_Planar_2D = {
// sse42 is not supported
conv_avx2_planar_2D,
conv_avx512_planar_2D,
};
const auto convParams_Planar_ExplicitPadding_2D = ::testing::Combine(
::testing::ValuesIn(kernels2d),
::testing::Values(SizeVector{1, 1}),
::testing::ValuesIn(padBegins2d),
::testing::ValuesIn(padEnds2d),
::testing::ValuesIn(dilations2d),
::testing::Values(1),
::testing::Values(ngraph::op::PadType::EXPLICIT)
);
INSTANTIATE_TEST_CASE_P(smoke_Conv_Jit_Planar_2D_FP32, ConvolutionLayerCPUTest,
::testing::Combine(
::testing::Combine(
convParams_Planar_ExplicitPadding_2D,
::testing::Values(Precision::FP32),
::testing::Values(Precision::UNSPECIFIED),
::testing::Values(Precision::UNSPECIFIED),
::testing::Values(Layout::ANY),
::testing::Values(Layout::ANY),
::testing::ValuesIn(inputShapes2d),
::testing::Values(CommonTestUtils::DEVICE_CPU)),
::testing::ValuesIn(filterCPUInfoForDevice(CPUParams_Jit_Planar_2D)),
::testing::Values(emptyFusingSpec, fusingRelu),
::testing::Values(cpuEmptyPluginConfig)),
ConvolutionLayerCPUTest::getTestCaseName);
/* ============= Convolution planar params (3D) ============= */
const std::vector<CPUSpecificParams> CPUParams_Jit_Planar_3D = {
// sse42 is not supported
conv_avx2_planar_3D,
conv_avx512_planar_3D,
};
const auto convParams_Planar_ExplicitPadding_3D = ::testing::Combine(
::testing::ValuesIn(kernels3d),
::testing::Values(SizeVector{1, 1, 1}),
::testing::ValuesIn(padBegins3d),
::testing::ValuesIn(padEnds3d),
::testing::ValuesIn(dilations3d),
::testing::Values(1),
::testing::Values(ngraph::op::PadType::EXPLICIT)
);
INSTANTIATE_TEST_CASE_P(smoke_Conv_Jit_Planar_3D_FP32, ConvolutionLayerCPUTest,
::testing::Combine(
::testing::Combine(
convParams_Planar_ExplicitPadding_3D,
::testing::Values(Precision::FP32),
::testing::Values(Precision::UNSPECIFIED),
::testing::Values(Precision::UNSPECIFIED),
::testing::Values(Layout::ANY),
::testing::Values(Layout::ANY),
::testing::ValuesIn(inputShapes3d),
::testing::Values(CommonTestUtils::DEVICE_CPU)),
::testing::ValuesIn(filterCPUInfoForDevice(CPUParams_Jit_Planar_3D)),
::testing::Values(emptyFusingSpec, fusingRelu),
::testing::Values(cpuEmptyPluginConfig)),
ConvolutionLayerCPUTest::getTestCaseName);
/* ============= */
} // namespace
} // namespace CPULayerTestsDefinitions

View File

@ -3,6 +3,7 @@
//
#include "test_utils/cpu_test_utils.hpp"
#include "test_utils/convolution_params.hpp"
#include "test_utils/fusing_test_utils.hpp"
#include "shared_test_classes/base/layer_test_utils.hpp"
#include "ngraph_functions/utils/ngraph_helpers.hpp"

View File

@ -333,5 +333,61 @@ const auto params_5D_Planar_Blocked = ::testing::Combine(
INSTANTIATE_TEST_CASE_P(smoke_CompareWithRefs_5D_Planar_Blocked, EltwiseLayerCPUTest, params_5D_Planar_Blocked, EltwiseLayerCPUTest::getTestCaseName);
std::vector<std::vector<std::vector<size_t>>> inShapes_4D_1D = {
{{2, 17, 5, 4}, {4}},
{{1, 3, 3, 3}, {3}},
};
std::vector<CPUSpecificParams> cpuParams_4D_1D = {
CPUSpecificParams({nChw16c, x}, {nChw16c}, {}, {}),
CPUSpecificParams({nhwc, x}, {nhwc}, {}, {}),
CPUSpecificParams({nchw, x}, {nchw}, {}, {})
};
const auto params_4D_1D = ::testing::Combine(
::testing::Combine(
::testing::ValuesIn(inShapes_4D_1D),
::testing::Values(ngraph::helpers::EltwiseTypes::ADD, ngraph::helpers::EltwiseTypes::MULTIPLY),
::testing::ValuesIn(secondaryInputTypes),
::testing::ValuesIn(opTypes),
::testing::ValuesIn(netPrc),
::testing::Values(InferenceEngine::Precision::FP32),
::testing::Values(InferenceEngine::Precision::FP32),
::testing::Values(InferenceEngine::Layout::ANY),
::testing::Values(CommonTestUtils::DEVICE_CPU),
::testing::Values(additional_config)),
::testing::ValuesIn(filterCPUSpecificParams(cpuParams_4D_1D)));
INSTANTIATE_TEST_CASE_P(smoke_CompareWithRefs_4D_1D, EltwiseLayerCPUTest, params_4D_1D, EltwiseLayerCPUTest::getTestCaseName);
std::vector<std::vector<std::vector<size_t>>> inShapes_5D_1D = {
{{2, 17, 5, 4, 10}, {10}},
{{1, 3, 3, 3, 3}, {3}},
};
std::vector<CPUSpecificParams> cpuParams_5D_1D = {
CPUSpecificParams({nCdhw16c, x}, {nCdhw16c}, {}, {}),
CPUSpecificParams({ndhwc, x}, {ndhwc}, {}, {}),
CPUSpecificParams({ncdhw, x}, {ncdhw}, {}, {})
};
const auto params_5D_1D = ::testing::Combine(
::testing::Combine(
::testing::ValuesIn(inShapes_5D_1D),
::testing::Values(ngraph::helpers::EltwiseTypes::ADD, ngraph::helpers::EltwiseTypes::MULTIPLY),
::testing::ValuesIn(secondaryInputTypes),
::testing::ValuesIn(opTypes),
::testing::ValuesIn(netPrc),
::testing::Values(InferenceEngine::Precision::FP32),
::testing::Values(InferenceEngine::Precision::FP32),
::testing::Values(InferenceEngine::Layout::ANY),
::testing::Values(CommonTestUtils::DEVICE_CPU),
::testing::Values(additional_config)),
::testing::ValuesIn(filterCPUSpecificParams(cpuParams_5D_1D)));
INSTANTIATE_TEST_CASE_P(smoke_CompareWithRefs_5D_1D, EltwiseLayerCPUTest, params_5D_1D, EltwiseLayerCPUTest::getTestCaseName);
} // namespace
} // namespace CPULayerTestsDefinitions

View File

@ -4,6 +4,7 @@
#include <shared_test_classes/single_layer/group_convolution.hpp>
#include "test_utils/cpu_test_utils.hpp"
#include "test_utils/convolution_params.hpp"
#include "test_utils/fusing_test_utils.hpp"
using namespace InferenceEngine;
@ -83,18 +84,18 @@ protected:
auto netPrecision = InferenceEngine::Precision::UNSPECIFIED;
std::tie(groupConvParams, netPrecision, inPrc, outPrc, inLayout, outLayout, inputShape, targetDevice) = basicParamsSet;
if (inPrc == Precision::UNSPECIFIED) {
selectedType += std::string("_") + Precision(Precision::FP32).name();
} else {
selectedType += std::string("_") + inPrc.name();
}
ngraph::op::PadType padType;
InferenceEngine::SizeVector kernel, stride, dilation;
std::vector<ptrdiff_t> padBegin, padEnd;
size_t convOutChannels, numGroups;
std::tie(kernel, stride, padBegin, padEnd, dilation, convOutChannels, numGroups, padType) = groupConvParams;
if (inPrc == Precision::UNSPECIFIED) {
selectedType += std::string("_") + Precision(Precision::FP32).name();
} else {
selectedType += std::string("_") + inPrc.name();
}
auto ngPrc = FuncTestUtils::PrecisionUtils::convertIE2nGraphPrc(netPrecision);
auto params = ngraph::builder::makeParams(ngPrc, {inputShape});
auto paramOuts = ngraph::helpers::convert2OutputVector(
@ -145,31 +146,35 @@ std::vector<groupConvLayerCPUTestParamsSet> filterParamsSetForDevice(std::vector
/* ===================== */
/* COMMON PARAMS */
std::vector<fusingSpecificParams> fusingParamsSet {
const std::vector<fusingSpecificParams> fusingParamsSet {
emptyFusingSpec,
// activations
// eltwise
fusingRelu,
fusingElu,
fusingSigmoid,
fusingClamp,
fusingPReluPerChannel,
fusingSwish,
fusingHSwish,
fusingMish,
fusingSoftPlus,
// other patterns
fusingPRelu1D,
// depthwise
fusingReluScaleShift,
// fake quantize
fusingFakeQuantizePerTensorRelu,
fusingFakeQuantizePerChannelRelu,
// sum
fusingSumEluFQ,
fusingSum,
fusingPRelu1D
fusingSum
};
const std::vector<fusingSpecificParams> fusingParamsSetBF16{
emptyFusingSpec,
// eltwise
fusingRelu,
// depthwise
fusingReluScaleShift,
// sum
fusingSum
};
/* ============= GroupConvolution params (planar layout) ============= */
const SizeVector numOutChannels_Planar = {6};
const SizeVector numGroups_Planar = {2, 3};
const SizeVector numOutChannels_Gemm = {6};
const SizeVector numGroups_Gemm = {2, 3};
/* ============= GroupConvolution params (blocked layout) ============= */
const SizeVector numOutChannels_Blocked = {64};
@ -196,26 +201,27 @@ const std::vector<SizeVector> dilations3d = {{1, 1, 1}, {2, 2, 2}};
/* INSTANCES */
/* ============= GroupConvolution (Planar 2D) ============= */
const auto groupConvParams_ExplicitPadding_Planar_2D = ::testing::Combine(
/* ============= GroupConvolution (GEMM 2D) ============= */
const auto groupConvParams_ExplicitPadding_Gemm_2D = ::testing::Combine(
::testing::ValuesIn(kernels2d),
::testing::ValuesIn(strides2d),
::testing::ValuesIn(padBegins2d),
::testing::ValuesIn(padEnds2d),
::testing::ValuesIn(dilations2d),
::testing::ValuesIn(numOutChannels_Planar),
::testing::ValuesIn(numGroups_Planar),
::testing::ValuesIn(numOutChannels_Gemm),
::testing::ValuesIn(numGroups_Gemm),
::testing::Values(ngraph::op::PadType::EXPLICIT)
);
const std::vector<CPUSpecificParams> CPUParams_Planar_2D = {
conv_gemm_2D
const std::vector<CPUSpecificParams> CPUParams_Gemm_2D = {
conv_gemm_2D,
conv_gemm_2D_nspc
};
INSTANTIATE_TEST_CASE_P(smoke_GroupConv_2D_Planar_FP32, GroupConvolutionLayerCPUTest,
INSTANTIATE_TEST_CASE_P(smoke_GroupConv_2D_Gemm_FP32, GroupConvolutionLayerCPUTest,
::testing::Combine(
::testing::Combine(
groupConvParams_ExplicitPadding_Planar_2D,
groupConvParams_ExplicitPadding_Gemm_2D,
::testing::Values(Precision::FP32),
::testing::Values(InferenceEngine::Precision::UNSPECIFIED),
::testing::Values(InferenceEngine::Precision::UNSPECIFIED),
@ -223,30 +229,46 @@ INSTANTIATE_TEST_CASE_P(smoke_GroupConv_2D_Planar_FP32, GroupConvolutionLayerCPU
::testing::Values(InferenceEngine::Layout::ANY),
::testing::Values(std::vector<size_t >({2, 12, 7, 7})),
::testing::Values(CommonTestUtils::DEVICE_CPU)),
::testing::ValuesIn(filterCPUInfoForDevice(CPUParams_Planar_2D)),
::testing::ValuesIn(filterCPUInfoForDevice(CPUParams_Gemm_2D)),
::testing::ValuesIn(fusingParamsSet)),
GroupConvolutionLayerCPUTest::getTestCaseName);
/* ============= GroupConvolution (Planar 3D) ============= */
const auto groupConvParams_ExplicitPadding_Planar_3D = ::testing::Combine(
INSTANTIATE_TEST_CASE_P(smoke_GroupConv_2D_Gemm_BF16, GroupConvolutionLayerCPUTest,
::testing::Combine(
::testing::Combine(
groupConvParams_ExplicitPadding_Gemm_2D,
::testing::Values(Precision::FP32),
::testing::Values(Precision::BF16),
::testing::Values(Precision::BF16),
::testing::Values(InferenceEngine::Layout::ANY),
::testing::Values(InferenceEngine::Layout::ANY),
::testing::Values(std::vector<size_t >({2, 12, 7, 7})),
::testing::Values(CommonTestUtils::DEVICE_CPU)),
::testing::ValuesIn(filterCPUInfoForDevice(CPUParams_Gemm_2D)),
::testing::ValuesIn(fusingParamsSetBF16)),
GroupConvolutionLayerCPUTest::getTestCaseName);
/* ============= GroupConvolution (Gemm 3D) ============= */
const auto groupConvParams_ExplicitPadding_Gemm_3D = ::testing::Combine(
::testing::ValuesIn(kernels3d),
::testing::ValuesIn(strides3d),
::testing::ValuesIn(padBegins3d),
::testing::ValuesIn(padEnds3d),
::testing::ValuesIn(dilations3d),
::testing::ValuesIn(numOutChannels_Planar),
::testing::ValuesIn(numGroups_Planar),
::testing::ValuesIn(numOutChannels_Gemm),
::testing::ValuesIn(numGroups_Gemm),
::testing::Values(ngraph::op::PadType::EXPLICIT)
);
const std::vector<CPUSpecificParams> CPUParams_Planar_3D = {
conv_gemm_3D
const std::vector<CPUSpecificParams> CPUParams_Gemm_3D = {
conv_gemm_3D,
conv_gemm_3D_nspc
};
INSTANTIATE_TEST_CASE_P(smoke_GroupConv_3D_Planar_FP32, GroupConvolutionLayerCPUTest,
INSTANTIATE_TEST_CASE_P(smoke_GroupConv_3D_Gemm_FP32, GroupConvolutionLayerCPUTest,
::testing::Combine(
::testing::Combine(
groupConvParams_ExplicitPadding_Planar_3D,
groupConvParams_ExplicitPadding_Gemm_3D,
::testing::Values(Precision::FP32),
::testing::Values(InferenceEngine::Precision::UNSPECIFIED),
::testing::Values(InferenceEngine::Precision::UNSPECIFIED),
@ -254,12 +276,27 @@ INSTANTIATE_TEST_CASE_P(smoke_GroupConv_3D_Planar_FP32, GroupConvolutionLayerCPU
::testing::Values(InferenceEngine::Layout::ANY),
::testing::Values(std::vector<size_t >({2, 12, 7, 7, 7})),
::testing::Values(CommonTestUtils::DEVICE_CPU)),
::testing::ValuesIn(filterCPUInfoForDevice(CPUParams_Planar_3D)),
::testing::ValuesIn(filterCPUInfoForDevice(CPUParams_Gemm_3D)),
::testing::ValuesIn(fusingParamsSet)),
GroupConvolutionLayerCPUTest::getTestCaseName);
/* ============= GroupConvolution (Blocked 2D) ============= */
const auto groupConvParams_ExplicitPadding_Blocked_2D = ::testing::Combine(
INSTANTIATE_TEST_CASE_P(smoke_GroupConv_3D_Gemm_BF16, GroupConvolutionLayerCPUTest,
::testing::Combine(
::testing::Combine(
groupConvParams_ExplicitPadding_Gemm_3D,
::testing::Values(Precision::FP32),
::testing::Values(Precision::BF16),
::testing::Values(Precision::BF16),
::testing::Values(InferenceEngine::Layout::ANY),
::testing::Values(InferenceEngine::Layout::ANY),
::testing::Values(std::vector<size_t >({2, 12, 7, 7, 7})),
::testing::Values(CommonTestUtils::DEVICE_CPU)),
::testing::ValuesIn(filterCPUInfoForDevice(CPUParams_Gemm_3D)),
::testing::ValuesIn(fusingParamsSetBF16)),
GroupConvolutionLayerCPUTest::getTestCaseName);
/* ============= GroupConvolution (2D) ============= */
const auto groupConvParams_ExplicitPadding_2D = ::testing::Combine(
::testing::ValuesIn(kernels2d),
::testing::ValuesIn(strides2d),
::testing::ValuesIn(padBegins2d),
@ -270,16 +307,19 @@ const auto groupConvParams_ExplicitPadding_Blocked_2D = ::testing::Combine(
::testing::Values(ngraph::op::PadType::EXPLICIT)
);
const std::vector<CPUSpecificParams> CPUParams_Blocked_2D = {
const std::vector<CPUSpecificParams> CPUParams_2D = {
conv_sse42_2D,
conv_avx2_2D,
conv_avx512_2D
conv_avx512_2D,
conv_sse42_2D_nspc,
conv_avx2_2D_nspc,
conv_avx512_2D_nspc
};
INSTANTIATE_TEST_CASE_P(smoke_GroupConv_2D_Blocked_FP32, GroupConvolutionLayerCPUTest,
INSTANTIATE_TEST_CASE_P(smoke_GroupConv_2D_FP32, GroupConvolutionLayerCPUTest,
::testing::Combine(
::testing::Combine(
groupConvParams_ExplicitPadding_Blocked_2D,
groupConvParams_ExplicitPadding_2D,
::testing::Values(Precision::FP32),
::testing::Values(InferenceEngine::Precision::UNSPECIFIED),
::testing::Values(InferenceEngine::Precision::UNSPECIFIED),
@ -287,12 +327,27 @@ INSTANTIATE_TEST_CASE_P(smoke_GroupConv_2D_Blocked_FP32, GroupConvolutionLayerCP
::testing::Values(InferenceEngine::Layout::ANY),
::testing::Values(std::vector<size_t >({2, 64, 7, 7})),
::testing::Values(CommonTestUtils::DEVICE_CPU)),
::testing::ValuesIn(filterCPUInfoForDevice(CPUParams_Blocked_2D)),
::testing::ValuesIn(filterCPUInfoForDevice(CPUParams_2D)),
::testing::ValuesIn(fusingParamsSet)),
GroupConvolutionLayerCPUTest::getTestCaseName);
/* ============= GroupConvolution (Blocked 3D) ============= */
const auto groupConvParams_ExplicitPadding_Blocked_3D = ::testing::Combine(
INSTANTIATE_TEST_CASE_P(smoke_GroupConv_2D_BF16, GroupConvolutionLayerCPUTest,
::testing::Combine(
::testing::Combine(
groupConvParams_ExplicitPadding_2D,
::testing::Values(Precision::FP32),
::testing::Values(Precision::BF16),
::testing::Values(Precision::BF16),
::testing::Values(InferenceEngine::Layout::ANY),
::testing::Values(InferenceEngine::Layout::ANY),
::testing::Values(std::vector<size_t >({2, 64, 7, 7})),
::testing::Values(CommonTestUtils::DEVICE_CPU)),
::testing::ValuesIn(filterCPUInfoForDevice({conv_avx512_2D, conv_avx512_2D_nspc})),
::testing::ValuesIn(fusingParamsSetBF16)),
GroupConvolutionLayerCPUTest::getTestCaseName);
/* ============= GroupConvolution (3D) ============= */
const auto groupConvParams_ExplicitPadding_3D = ::testing::Combine(
::testing::ValuesIn(kernels3d),
::testing::ValuesIn(strides3d),
::testing::ValuesIn(padBegins3d),
@ -303,16 +358,18 @@ const auto groupConvParams_ExplicitPadding_Blocked_3D = ::testing::Combine(
::testing::Values(ngraph::op::PadType::EXPLICIT)
);
const std::vector<CPUSpecificParams> CPUParams_Blocked_3D = {
const std::vector<CPUSpecificParams> CPUParams_3D = {
// conv_sse42_3D, // jit_sse42 does not support 3D
conv_avx2_3D,
conv_avx512_3D
conv_avx512_3D,
conv_avx2_3D_nspc,
conv_avx512_3D_nspc
};
INSTANTIATE_TEST_CASE_P(smoke_GroupConv_3D_Blocked_FP32, GroupConvolutionLayerCPUTest,
INSTANTIATE_TEST_CASE_P(smoke_GroupConv_3D_FP32, GroupConvolutionLayerCPUTest,
::testing::Combine(
::testing::Combine(
groupConvParams_ExplicitPadding_Blocked_3D,
groupConvParams_ExplicitPadding_3D,
::testing::Values(Precision::FP32),
::testing::Values(InferenceEngine::Precision::UNSPECIFIED),
::testing::Values(InferenceEngine::Precision::UNSPECIFIED),
@ -320,10 +377,25 @@ INSTANTIATE_TEST_CASE_P(smoke_GroupConv_3D_Blocked_FP32, GroupConvolutionLayerCP
::testing::Values(InferenceEngine::Layout::ANY),
::testing::Values(std::vector<size_t >({2, 64, 7, 7, 7})),
::testing::Values(CommonTestUtils::DEVICE_CPU)),
::testing::ValuesIn(filterCPUInfoForDevice(CPUParams_Blocked_3D)),
::testing::ValuesIn(filterCPUInfoForDevice(CPUParams_3D)),
::testing::ValuesIn(fusingParamsSet)),
GroupConvolutionLayerCPUTest::getTestCaseName);
INSTANTIATE_TEST_CASE_P(smoke_GroupConv_3D_BF16, GroupConvolutionLayerCPUTest,
::testing::Combine(
::testing::Combine(
groupConvParams_ExplicitPadding_3D,
::testing::Values(Precision::FP32),
::testing::Values(Precision::BF16),
::testing::Values(Precision::BF16),
::testing::Values(InferenceEngine::Layout::ANY),
::testing::Values(InferenceEngine::Layout::ANY),
::testing::Values(std::vector<size_t >({2, 64, 7, 7, 7})),
::testing::Values(CommonTestUtils::DEVICE_CPU)),
::testing::ValuesIn(filterCPUInfoForDevice({conv_avx512_3D, conv_avx512_3D_nspc})),
::testing::ValuesIn(fusingParamsSetBF16)),
GroupConvolutionLayerCPUTest::getTestCaseName);
/* ============= GroupConvolution (DW 2D) ============= */
const auto groupConvParams_ExplicitPadding_DW_2D = ::testing::Combine(
::testing::ValuesIn(kernels2d),
@ -339,7 +411,10 @@ const auto groupConvParams_ExplicitPadding_DW_2D = ::testing::Combine(
const std::vector<CPUSpecificParams> CPUParams_DW_2D = {
conv_sse42_dw_2D,
conv_avx2_dw_2D,
conv_avx512_dw_2D
conv_avx512_dw_2D,
conv_sse42_dw_2D_nspc,
conv_avx2_dw_2D_nspc,
conv_avx512_dw_2D_nspc
};
INSTANTIATE_TEST_CASE_P(smoke_GroupConv_2D_DW_FP32, GroupConvolutionLayerCPUTest,
@ -357,6 +432,22 @@ INSTANTIATE_TEST_CASE_P(smoke_GroupConv_2D_DW_FP32, GroupConvolutionLayerCPUTest
::testing::ValuesIn(fusingParamsSet)),
GroupConvolutionLayerCPUTest::getTestCaseName);
INSTANTIATE_TEST_CASE_P(smoke_GroupConv_2D_DW_BF16, GroupConvolutionLayerCPUTest,
::testing::Combine(
::testing::Combine(
groupConvParams_ExplicitPadding_DW_2D,
::testing::Values(Precision::FP32),
::testing::Values(Precision::BF16),
::testing::Values(Precision::BF16),
::testing::Values(InferenceEngine::Layout::ANY),
::testing::Values(InferenceEngine::Layout::ANY),
::testing::Values(std::vector<size_t >({2, 32, 7, 7})),
::testing::Values(CommonTestUtils::DEVICE_CPU)),
::testing::ValuesIn(filterCPUInfoForDevice({conv_avx512_dw_2D, conv_avx512_dw_2D_nspc})),
::testing::ValuesIn(fusingParamsSetBF16)),
GroupConvolutionLayerCPUTest::getTestCaseName);
/* ============= GroupConvolution (DW 3D) ============= */
const auto groupConvParams_ExplicitPadding_DW_3D = ::testing::Combine(
::testing::ValuesIn(kernels3d),
@ -372,7 +463,10 @@ const auto groupConvParams_ExplicitPadding_DW_3D = ::testing::Combine(
const std::vector<CPUSpecificParams> CPUParams_DW_3D = {
conv_sse42_dw_3D,
conv_avx2_dw_3D,
conv_avx512_dw_3D
conv_avx512_dw_3D,
conv_sse42_dw_3D_nspc,
conv_avx2_dw_3D_nspc,
conv_avx512_dw_3D_nspc
};
INSTANTIATE_TEST_CASE_P(smoke_GroupConv_3D_DW_FP32, GroupConvolutionLayerCPUTest,
@ -393,10 +487,16 @@ INSTANTIATE_TEST_CASE_P(smoke_GroupConv_3D_DW_FP32, GroupConvolutionLayerCPUTest
/* ============= SINGLE TEST CASES ============= */
groupConvLayerCPUTestParamsSet makeSingleGroupConvCPUTestCase(SizeVector kernels, SizeVector strides, SizeVector dilations,
std::vector<ptrdiff_t> padBegins, std::vector<ptrdiff_t> padEnds, ngraph::op::PadType padType,
int groups, int mb, SizeVector spDims, int inGroupSize, int outGroupSize,
CPUSpecificParams CPUParams) {
using VecFusingParams = std::vector<fusingSpecificParams>;
using PrcConnectedParams = std::tuple<Precision, Precision, VecFusingParams>; // inPrc, outPrc, FusingParamsSet
using VecPrcConnectedParams = std::vector<PrcConnectedParams>;
std::vector<groupConvLayerCPUTestParamsSet> makeSingleGroupConvCPUTestCases(SizeVector kernels, SizeVector strides, SizeVector dilations,
std::vector<ptrdiff_t> padBegins, std::vector<ptrdiff_t> padEnds,
ngraph::op::PadType padType, int groups, int mb, SizeVector spDims,
int inGroupSize, int outGroupSize,
const std::vector<CPUSpecificParams>& CPUParams,
const VecPrcConnectedParams& vecPrcConnectedParams) {
int inChannels = groups * inGroupSize;
int outChannels = groups * outGroupSize;
@ -406,42 +506,92 @@ groupConvLayerCPUTestParamsSet makeSingleGroupConvCPUTestCase(SizeVector kernels
inputShapes.insert(inputShapes.end(), spDims.begin(), spDims.end());
groupConvSpecificParams specificParams(kernels, strides, padBegins, padEnds, dilations, outChannels, groups, padType);
groupConvLayerTestParamsSet basicParamsSet(specificParams, Precision::FP32,
InferenceEngine::Precision::UNSPECIFIED,
InferenceEngine::Precision::UNSPECIFIED,
InferenceEngine::Layout::ANY,
InferenceEngine::Layout::ANY, inputShapes, CommonTestUtils::DEVICE_CPU);
return groupConvLayerCPUTestParamsSet(basicParamsSet, CPUParams, emptyFusingSpec);
std::vector<groupConvLayerCPUTestParamsSet> retVector;
for (auto& prcConnectedParams : vecPrcConnectedParams) {
Precision inPrc, outPrc;
VecFusingParams fusingParams;
std::tie(inPrc, outPrc, fusingParams) = prcConnectedParams;
groupConvLayerTestParamsSet basicParamsSet(specificParams, Precision::FP32, inPrc, outPrc,
InferenceEngine::Layout::ANY, InferenceEngine::Layout::ANY,
inputShapes, CommonTestUtils::DEVICE_CPU);
for (auto &item : CPUParams) {
for (auto &fusingParam : fusingParams) {
retVector.push_back(groupConvLayerCPUTestParamsSet(basicParamsSet, item, fusingParam));
}
}
}
return retVector;
}
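For orientation, a minimal usage sketch of the generator above (hypothetical values; fusingRelu is assumed to be available from fusing_test_utils.hpp): each (inPrc, outPrc, fusingParams) tuple is crossed with every CPUSpecificParams entry, so two CPU configs and two fusing variants yield four parameter sets.
// Hypothetical usage sketch: 2 CPU configs x 1 precision tuple x 2 fusing variants = 4 cases.
const std::vector<CPUSpecificParams> cpuConfigs = {conv_avx512_2D, conv_avx512_2D_nspc};
const VecPrcConnectedParams prcParams = {
        PrcConnectedParams{Precision::FP32, Precision::FP32, VecFusingParams{emptyFusingSpec, fusingRelu}}};
const auto cases = makeSingleGroupConvCPUTestCases({3, 3}, {1, 1}, {1, 1}, {0, 0}, {0, 0},
                                                   ngraph::op::PadType::VALID, 2, 1, {5, 5}, 16, 16,
                                                   cpuConfigs, prcParams);
// cases.size() == 4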
template<typename T>
void concatTestCases(std::vector<groupConvLayerCPUTestParamsSet>& resultVec, T testCase) {
    resultVec.insert(resultVec.begin(), std::make_move_iterator(testCase.begin()), std::make_move_iterator(testCase.end()));
}
template<typename T, typename... Args>
void concatTestCases(std::vector<groupConvLayerCPUTestParamsSet>& resultVec, T&& testCase, Args&&... args) {
    concatTestCases(resultVec, std::forward<T>(testCase));
concatTestCases(resultVec, std::forward<Args>(args)...);
}
template<typename... Args>
std::vector<groupConvLayerCPUTestParamsSet> generateSingleGroupConvCPUTestCases(Args&&... args) {
std::vector<groupConvLayerCPUTestParamsSet> retVec;
concatTestCases(retVec, std::forward<Args>(args)...);
return retVec;
}
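A note on the helpers above: the base overload inserts each incoming vector at resultVec.begin(), so the concatenated result lists the argument vectors in reverse order (each vector's internal order is preserved). This is harmless here because gtest instantiates the whole set regardless of ordering, but it is worth knowing when reading generated test names.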
/* COMMON PARAMS */
const VecPrcConnectedParams vecPrcConnectParamsFP32 = {PrcConnectedParams{Precision::FP32, Precision::FP32, fusingParamsSet}};
const VecPrcConnectedParams vecPrcConnectParams = {PrcConnectedParams{Precision::FP32, Precision::FP32, fusingParamsSet},
PrcConnectedParams{Precision::BF16, Precision::BF16, fusingParamsSetBF16},
PrcConnectedParams{Precision::BF16, Precision::FP32, fusingParamsSetBF16}};
const VecPrcConnectedParams vecPrcConnectParamsFP32Default = {PrcConnectedParams{Precision::FP32, Precision::FP32, VecFusingParams{emptyFusingSpec}}};
const VecPrcConnectedParams vecPrcConnectParamsDefault = {PrcConnectedParams{Precision::FP32, Precision::FP32, VecFusingParams{emptyFusingSpec}},
PrcConnectedParams{Precision::BF16, Precision::BF16, VecFusingParams{emptyFusingSpec}},
PrcConnectedParams{Precision::BF16, Precision::FP32, VecFusingParams{emptyFusingSpec}}};
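These tables bind input/output precisions to the fusing sets they are allowed to run with: vecPrcConnectParams exercises FP32 plus both the BF16-to-BF16 and BF16-to-FP32 paths with the full fusing sets, while the *Default variants keep only emptyFusingSpec so that the expensive "hard" shapes below do not multiply by every fusing combination.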
/* ============= GEMM GroupConvolution ============= */
const std::vector<groupConvLayerCPUTestParamsSet> gemmGroupConvTestCases = {
const std::vector<groupConvLayerCPUTestParamsSet> gemmGroupConvTestCases = generateSingleGroupConvCPUTestCases(
// 1. is_depthwise (true, false)
// 2. jcp.im2col_sz (=0,>0)
// 3. is_blocking_applicable (true, false)
// is_depthwise == false, im2col_sz > 0
makeSingleGroupConvCPUTestCase({3, 3}, {1, 1}, {1, 1}, {0, 0}, {0, 0}, ngraph::op::PadType::VALID, 2, 1, {5, 5}, 2, 2, conv_gemm_2D),
makeSingleGroupConvCPUTestCases({3, 3}, {1, 1}, {1, 1}, {0, 0}, {0, 0}, ngraph::op::PadType::VALID,
2, 1, {5, 5}, 2, 2, CPUParams_Gemm_2D, vecPrcConnectParams),
// is_depthwise == true
makeSingleGroupConvCPUTestCase({3, 3}, {1, 1}, {1, 1}, {0, 0}, {0, 0}, ngraph::op::PadType::VALID, 2, 1, {5, 5}, 1, 1, conv_gemm_2D),
makeSingleGroupConvCPUTestCases({3, 3}, {1, 1}, {1, 1}, {0, 0}, {0, 0}, ngraph::op::PadType::VALID, 2, 1, {5, 5}, 1, 1,
CPUParams_Gemm_2D, vecPrcConnectParams),
// im2col_sz == 0, is_blocking_applicable == true
makeSingleGroupConvCPUTestCase({1, 1}, {1, 1}, {1, 1}, {0, 0}, {0, 0}, ngraph::op::PadType::VALID, 2, 1, {5, 5}, 2, 2, conv_gemm_2D),
makeSingleGroupConvCPUTestCases({1, 1}, {1, 1}, {1, 1}, {0, 0}, {0, 0}, ngraph::op::PadType::VALID,
2, 1, {5, 5}, 2, 2, CPUParams_Gemm_2D, vecPrcConnectParams),
// is_blocking_applicable == false ((jcp.im2col_sz == 0) && (jcp.ic / jcp.oc >= 42))
makeSingleGroupConvCPUTestCase({1, 1}, {1, 1}, {1, 1}, {0, 0}, {0, 0}, ngraph::op::PadType::VALID, 2, 1, {5, 5}, 42, 1, conv_gemm_2D),
makeSingleGroupConvCPUTestCases({1, 1}, {1, 1}, {1, 1}, {0, 0}, {0, 0}, ngraph::op::PadType::VALID,
2, 1, {5, 5}, 42, 1, CPUParams_Gemm_2D, vecPrcConnectParams),
// "hard" cases
makeSingleGroupConvCPUTestCase({3, 3}, {2, 2}, {1, 1}, {1, 1}, {1, 1}, ngraph::op::PadType::EXPLICIT, 3, 2, {129, 129}, 4, 2, conv_gemm_2D),
makeSingleGroupConvCPUTestCase({2, 4}, {1, 2}, {3, 2}, {2, 1}, {1, 0}, ngraph::op::PadType::EXPLICIT, 2, 1, {10, 10}, 3, 3, conv_gemm_2D),
makeSingleGroupConvCPUTestCase({3, 3, 3}, {2, 2, 2}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}, ngraph::op::PadType::EXPLICIT,
3, 2, {33, 33, 33}, 4, 2, conv_gemm_3D),
makeSingleGroupConvCPUTestCase({2, 3, 4}, {1, 2, 2}, {3, 1, 2}, {2, 2, 1}, {1, 1, 0}, ngraph::op::PadType::EXPLICIT,
2, 1, {10, 10, 10}, 3, 3, conv_gemm_3D),
};
makeSingleGroupConvCPUTestCases({3, 3}, {2, 2}, {1, 1}, {1, 1}, {1, 1}, ngraph::op::PadType::EXPLICIT,
3, 2, {129, 129}, 4, 2, CPUParams_Gemm_2D, vecPrcConnectParamsDefault),
makeSingleGroupConvCPUTestCases({2, 4}, {1, 2}, {3, 2}, {2, 1}, {1, 0}, ngraph::op::PadType::EXPLICIT,
2, 1, {10, 10}, 3, 3, CPUParams_Gemm_2D, vecPrcConnectParamsDefault),
makeSingleGroupConvCPUTestCases({3, 3, 3}, {2, 2, 2}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}, ngraph::op::PadType::EXPLICIT,
3, 2, {33, 33, 33}, 4, 2, CPUParams_Gemm_3D, vecPrcConnectParamsDefault),
makeSingleGroupConvCPUTestCases({2, 3, 4}, {1, 2, 2}, {3, 1, 2}, {2, 2, 1}, {1, 1, 0}, ngraph::op::PadType::EXPLICIT,
2, 1, {10, 10, 10}, 3, 3, CPUParams_Gemm_3D, vecPrcConnectParams)
);
INSTANTIATE_TEST_CASE_P(smoke_GEMM_GroupConv, GroupConvolutionLayerCPUTest, ::testing::ValuesIn(filterParamsSetForDevice(gemmGroupConvTestCases)));
INSTANTIATE_TEST_CASE_P(smoke_GEMM_GroupConv, GroupConvolutionLayerCPUTest, ::testing::ValuesIn(filterParamsSetForDevice(gemmGroupConvTestCases)),
GroupConvolutionLayerCPUTest::getTestCaseName);
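Passing GroupConvolutionLayerCPUTest::getTestCaseName as the fourth INSTANTIATE_TEST_CASE_P argument makes gtest derive readable test names from the parameters instead of bare indices, which matters now that a single generated set mixes precisions, layouts, and fusing variants.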
/* ============= JIT SSE42 GroupConvolution ============= */
const std::vector<groupConvLayerCPUTestParamsSet> JIT_SSE42_GroupConvTestCases = {
const std::vector<CPUSpecificParams> sse42_GroupConv = {conv_sse42_2D, conv_sse42_2D_nspc};
const std::vector<groupConvLayerCPUTestParamsSet> JIT_SSE42_GroupConvTestCases = generateSingleGroupConvCPUTestCases(
// 1. jcp.ur_w (=3,<3)
// 2. jcp.ur_w_tail (=0,>0)
// 3. jcp.kw (>7,<=7)
@ -450,37 +600,50 @@ const std::vector<groupConvLayerCPUTestParamsSet> JIT_SSE42_GroupConvTestCases =
// 6. ocb_work
// jcp.ur_w == 3, jcp.ur_w_tail == 2
makeSingleGroupConvCPUTestCase({3, 3}, {1, 1}, {1, 1}, {0, 0}, {0, 0}, ngraph::op::PadType::VALID, 2, 1, {5, 10}, 8, 8, conv_sse42_2D),
makeSingleGroupConvCPUTestCases({3, 3}, {1, 1}, {1, 1}, {0, 0}, {0, 0}, ngraph::op::PadType::VALID,
2, 1, {5, 10}, 8, 8, sse42_GroupConv, vecPrcConnectParamsFP32),
// jcp.ur_w < 3 (jcp.ur_w == jcp.ow)
makeSingleGroupConvCPUTestCase({3, 3}, {1, 1}, {1, 1}, {0, 0}, {0, 0}, ngraph::op::PadType::VALID, 2, 1, {5, 4}, 8, 8, conv_sse42_2D),
makeSingleGroupConvCPUTestCases({3, 3}, {1, 1}, {1, 1}, {0, 0}, {0, 0}, ngraph::op::PadType::VALID,
2, 1, {5, 4}, 8, 8, sse42_GroupConv, vecPrcConnectParamsFP32),
// jcp.ur_w == 3, jcp.ur_w_tail == 0
makeSingleGroupConvCPUTestCase({3, 3}, {1, 1}, {1, 1}, {0, 0}, {0, 0}, ngraph::op::PadType::VALID, 2, 1, {5, 11}, 8, 8, conv_sse42_2D),
makeSingleGroupConvCPUTestCases({3, 3}, {1, 1}, {1, 1}, {0, 0}, {0, 0}, ngraph::op::PadType::VALID,
2, 1, {5, 11}, 8, 8, sse42_GroupConv, vecPrcConnectParamsFP32),
// jcp.kw > 7
makeSingleGroupConvCPUTestCase({3, 8}, {1, 1}, {1, 1}, {0, 0}, {0, 0}, ngraph::op::PadType::VALID, 2, 1, {5, 10}, 8, 8, conv_sse42_2D),
makeSingleGroupConvCPUTestCases({3, 8}, {1, 1}, {1, 1}, {0, 0}, {0, 0}, ngraph::op::PadType::VALID,
2, 1, {5, 10}, 8, 8, sse42_GroupConv, vecPrcConnectParamsFP32),
// jcp.nb_oc == 2
makeSingleGroupConvCPUTestCase({3, 3}, {1, 1}, {1, 1}, {0, 0}, {0, 0}, ngraph::op::PadType::VALID, 2, 1, {5, 5}, 8, 16, conv_sse42_2D),
makeSingleGroupConvCPUTestCases({3, 3}, {1, 1}, {1, 1}, {0, 0}, {0, 0}, ngraph::op::PadType::VALID,
2, 1, {5, 5}, 8, 16, sse42_GroupConv, vecPrcConnectParamsFP32),
// jcp.nb_ic == 2
makeSingleGroupConvCPUTestCase({3, 3}, {1, 1}, {1, 1}, {0, 0}, {0, 0}, ngraph::op::PadType::VALID, 2, 1, {5, 5}, 16, 8, conv_sse42_2D),
makeSingleGroupConvCPUTestCases({3, 3}, {1, 1}, {1, 1}, {0, 0}, {0, 0}, ngraph::op::PadType::VALID,
2, 1, {5, 5}, 16, 8, sse42_GroupConv, vecPrcConnectParamsFP32),
// ocb_work > 1 (ocb_work == 2)
makeSingleGroupConvCPUTestCase({3, 3}, {1, 1}, {1, 1}, {0, 0}, {0, 0}, ngraph::op::PadType::VALID, 2, 1, {5, 5}, 8, 40, conv_sse42_2D),
makeSingleGroupConvCPUTestCases({3, 3}, {1, 1}, {1, 1}, {0, 0}, {0, 0}, ngraph::op::PadType::VALID,
2, 1, {5, 5}, 8, 40, sse42_GroupConv, vecPrcConnectParamsFP32),
// jcp.nb_ic == 2, ocb_work == 2
makeSingleGroupConvCPUTestCase({3, 3}, {1, 1}, {1, 1}, {0, 0}, {0, 0}, ngraph::op::PadType::VALID, 2, 1, {5, 5}, 16, 40, conv_sse42_2D),
makeSingleGroupConvCPUTestCases({3, 3}, {1, 1}, {1, 1}, {0, 0}, {0, 0}, ngraph::op::PadType::VALID,
2, 1, {5, 5}, 16, 40, sse42_GroupConv, vecPrcConnectParamsFP32),
// "hard" cases
makeSingleGroupConvCPUTestCase({3, 3}, {2, 2}, {1, 1}, {1, 1}, {1, 1}, ngraph::op::PadType::EXPLICIT, 3, 2, {129, 129}, 8, 8, conv_sse42_2D),
makeSingleGroupConvCPUTestCase({2, 4}, {1, 2}, {3, 2}, {2, 1}, {1, 0}, ngraph::op::PadType::EXPLICIT, 2, 1, {10, 10}, 8, 8, conv_sse42_2D),
makeSingleGroupConvCPUTestCases({3, 3}, {2, 2}, {1, 1}, {1, 1}, {1, 1}, ngraph::op::PadType::EXPLICIT,
3, 2, {129, 129}, 8, 8, sse42_GroupConv, vecPrcConnectParamsFP32Default),
makeSingleGroupConvCPUTestCases({2, 4}, {1, 2}, {3, 2}, {2, 1}, {1, 0}, ngraph::op::PadType::EXPLICIT,
2, 1, {10, 10}, 8, 8, sse42_GroupConv, vecPrcConnectParamsFP32Default)
// jit_sse42 does not support 3D
// makeSingleGroupConvCPUTestCase({3, 3, 3}, {2, 2, 2}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}, ngraph::op::PadType::EXPLICIT,
// makeSingleGroupConvCPUTestCases({3, 3, 3}, {2, 2, 2}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}, ngraph::op::PadType::EXPLICIT,
// 3, 2, {33, 33, 33}, 8, 8, {conv_sse42_3D}, vecPrcConnectParamsFP32),
// makeSingleGroupConvCPUTestCase({2, 3, 4}, {1, 2, 2}, {3, 1, 2}, {2, 2, 1}, {1, 1, 0}, ngraph::op::PadType::EXPLICIT,
// makeSingleGroupConvCPUTestCases({2, 3, 4}, {1, 2, 2}, {3, 1, 2}, {2, 2, 1}, {1, 1, 0}, ngraph::op::PadType::EXPLICIT,
// 2, 1, {10, 10, 10}, 8, 8, {conv_sse42_3D}, vecPrcConnectParamsFP32),
};
);
INSTANTIATE_TEST_CASE_P(smoke_JIT_SSE42_GroupConv, GroupConvolutionLayerCPUTest, ::testing::ValuesIn(filterParamsSetForDevice(JIT_SSE42_GroupConvTestCases)));
INSTANTIATE_TEST_CASE_P(smoke_JIT_SSE42_GroupConv, GroupConvolutionLayerCPUTest, ::testing::ValuesIn(filterParamsSetForDevice(JIT_SSE42_GroupConvTestCases)),
GroupConvolutionLayerCPUTest::getTestCaseName);
/* ============= JIT AVX2 GroupConvolution ============= */
const std::vector<groupConvLayerCPUTestParamsSet> JIT_AVX2_GroupConvTestCases = {
const std::vector<CPUSpecificParams> avx2_GroupConv_2D = {conv_avx2_2D, conv_avx2_2D_nspc};
const std::vector<CPUSpecificParams> avx2_GroupConv_3D = {conv_avx2_3D, conv_avx2_3D_nspc};
const std::vector<groupConvLayerCPUTestParamsSet> JIT_AVX2_GroupConvTestCases = generateSingleGroupConvCPUTestCases(
// 1. jcp.ur_w (=3,<3)
// 2. jcp.ur_w_tail (=0,>0)
// 3. jcp.kw (>7,<=7)
@ -489,144 +652,181 @@ const std::vector<groupConvLayerCPUTestParamsSet> JIT_AVX2_GroupConvTestCases =
// 6. ocb_work
// jcp.ur_w == 3, jcp.ur_w_tail == 2
makeSingleGroupConvCPUTestCase({3, 3}, {1, 1}, {1, 1}, {0, 0}, {0, 0}, ngraph::op::PadType::VALID, 2, 1, {5, 10}, 8, 8, conv_avx2_2D),
makeSingleGroupConvCPUTestCases({3, 3}, {1, 1}, {1, 1}, {0, 0}, {0, 0}, ngraph::op::PadType::VALID,
2, 1, {5, 10}, 8, 8, avx2_GroupConv_2D, vecPrcConnectParamsFP32),
// jcp.ur_w < 3 (jcp.ur_w == jcp.ow)
makeSingleGroupConvCPUTestCase({3, 3}, {1, 1}, {1, 1}, {0, 0}, {0, 0}, ngraph::op::PadType::VALID, 2, 1, {5, 4}, 8, 8, conv_avx2_2D),
makeSingleGroupConvCPUTestCases({3, 3}, {1, 1}, {1, 1}, {0, 0}, {0, 0}, ngraph::op::PadType::VALID,
2, 1, {5, 4}, 8, 8, avx2_GroupConv_2D, vecPrcConnectParamsFP32),
// jcp.ur_w == 3, jcp.ur_w_tail == 0
makeSingleGroupConvCPUTestCase({3, 3}, {1, 1}, {1, 1}, {0, 0}, {0, 0}, ngraph::op::PadType::VALID, 2, 1, {5, 11}, 8, 8, conv_avx2_2D),
makeSingleGroupConvCPUTestCases({3, 3}, {1, 1}, {1, 1}, {0, 0}, {0, 0}, ngraph::op::PadType::VALID,
2, 1, {5, 11}, 8, 8, avx2_GroupConv_2D, vecPrcConnectParamsFP32),
// jcp.kw > 7
makeSingleGroupConvCPUTestCase({3, 8}, {1, 1}, {1, 1}, {0, 0}, {0, 0}, ngraph::op::PadType::VALID, 2, 1, {5, 10}, 8, 8, conv_avx2_2D),
makeSingleGroupConvCPUTestCases({3, 8}, {1, 1}, {1, 1}, {0, 0}, {0, 0}, ngraph::op::PadType::VALID,
2, 1, {5, 10}, 8, 8, avx2_GroupConv_2D, vecPrcConnectParamsFP32),
// jcp.nb_oc == 2
makeSingleGroupConvCPUTestCase({3, 3}, {1, 1}, {1, 1}, {0, 0}, {0, 0}, ngraph::op::PadType::VALID, 2, 1, {5, 5}, 8, 16, conv_avx2_2D),
makeSingleGroupConvCPUTestCases({3, 3}, {1, 1}, {1, 1}, {0, 0}, {0, 0}, ngraph::op::PadType::VALID,
2, 1, {5, 5}, 8, 16, avx2_GroupConv_2D, vecPrcConnectParamsFP32),
// jcp.nb_ic == 2
makeSingleGroupConvCPUTestCase({3, 3}, {1, 1}, {1, 1}, {0, 0}, {0, 0}, ngraph::op::PadType::VALID, 2, 1, {5, 5}, 16, 8, conv_avx2_2D),
makeSingleGroupConvCPUTestCases({3, 3}, {1, 1}, {1, 1}, {0, 0}, {0, 0}, ngraph::op::PadType::VALID,
2, 1, {5, 5}, 16, 8, avx2_GroupConv_2D, vecPrcConnectParamsFP32),
// ocb_work > 1 (ocb_work == 2)
makeSingleGroupConvCPUTestCase({3, 3}, {1, 1}, {1, 1}, {0, 0}, {0, 0}, ngraph::op::PadType::VALID, 2, 1, {5, 5}, 8, 40, conv_avx2_2D),
makeSingleGroupConvCPUTestCases({3, 3}, {1, 1}, {1, 1}, {0, 0}, {0, 0}, ngraph::op::PadType::VALID,
2, 1, {5, 5}, 8, 40, avx2_GroupConv_2D, vecPrcConnectParamsFP32),
// jcp.nb_ic == 2, ocb_work == 2
makeSingleGroupConvCPUTestCase({3, 3}, {1, 1}, {1, 1}, {0, 0}, {0, 0}, ngraph::op::PadType::VALID, 2, 1, {5, 5}, 16, 40, conv_avx2_2D),
makeSingleGroupConvCPUTestCases({3, 3}, {1, 1}, {1, 1}, {0, 0}, {0, 0}, ngraph::op::PadType::VALID,
2, 1, {5, 5}, 16, 40, avx2_GroupConv_2D, vecPrcConnectParamsFP32),
// "hard" cases
makeSingleGroupConvCPUTestCase({3, 3}, {2, 2}, {1, 1}, {1, 1}, {1, 1}, ngraph::op::PadType::EXPLICIT, 3, 2, {129, 129}, 8, 8, conv_avx2_2D),
makeSingleGroupConvCPUTestCase({2, 4}, {1, 2}, {3, 2}, {2, 1}, {1, 0}, ngraph::op::PadType::EXPLICIT, 2, 1, {10, 10}, 8, 8, conv_avx2_2D),
makeSingleGroupConvCPUTestCase({3, 3, 3}, {2, 2, 2}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}, ngraph::op::PadType::EXPLICIT,
3, 2, {33, 33, 33}, 8, 8, conv_avx2_3D),
makeSingleGroupConvCPUTestCase({2, 3, 4}, {1, 2, 2}, {3, 1, 2}, {2, 2, 1}, {1, 1, 0}, ngraph::op::PadType::EXPLICIT,
2, 1, {10, 10, 10}, 8, 8, conv_avx2_3D),
};
makeSingleGroupConvCPUTestCases({3, 3}, {2, 2}, {1, 1}, {1, 1}, {1, 1}, ngraph::op::PadType::EXPLICIT,
3, 2, {129, 129}, 8, 8, avx2_GroupConv_2D, vecPrcConnectParamsFP32Default),
makeSingleGroupConvCPUTestCases({2, 4}, {1, 2}, {3, 2}, {2, 1}, {1, 0}, ngraph::op::PadType::EXPLICIT,
2, 1, {10, 10}, 8, 8, avx2_GroupConv_2D, vecPrcConnectParamsFP32Default),
makeSingleGroupConvCPUTestCases({3, 3, 3}, {2, 2, 2}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}, ngraph::op::PadType::EXPLICIT,
3, 2, {33, 33, 33}, 8, 8, avx2_GroupConv_3D, vecPrcConnectParamsFP32Default),
makeSingleGroupConvCPUTestCases({2, 3, 4}, {1, 2, 2}, {3, 1, 2}, {2, 2, 1}, {1, 1, 0}, ngraph::op::PadType::EXPLICIT,
2, 1, {10, 10, 10}, 8, 8, avx2_GroupConv_3D, vecPrcConnectParamsFP32)
);
INSTANTIATE_TEST_CASE_P(smoke_JIT_AVX2_GroupConv, GroupConvolutionLayerCPUTest, ::testing::ValuesIn(filterParamsSetForDevice(JIT_AVX2_GroupConvTestCases)));
INSTANTIATE_TEST_CASE_P(smoke_JIT_AVX2_GroupConv, GroupConvolutionLayerCPUTest, ::testing::ValuesIn(filterParamsSetForDevice(JIT_AVX2_GroupConvTestCases)),
GroupConvolutionLayerCPUTest::getTestCaseName);
/* ============= JIT AVX512 GroupConvolution ============= */
const std::vector<groupConvLayerCPUTestParamsSet> JIT_AVX512_GroupConvTestCases = {
const std::vector<CPUSpecificParams> avx512_GroupConv_2D = {conv_avx512_2D, conv_avx512_2D_nspc};
const std::vector<CPUSpecificParams> avx512_GroupConv_3D = {conv_avx512_3D, conv_avx512_3D_nspc};
const std::vector<groupConvLayerCPUTestParamsSet> JIT_AVX512_GroupConvTestCases = generateSingleGroupConvCPUTestCases(
// 1. "blocked to blocked" or "planar to blocked"
// 2. jcp.nb_ic, jcp.nb_oc
// blocked to blocked
makeSingleGroupConvCPUTestCase({3, 3}, {1, 1}, {1, 1}, {0, 0}, {0, 0}, ngraph::op::PadType::VALID, 2, 1, {5, 5}, 16, 16, conv_avx512_2D),
makeSingleGroupConvCPUTestCases({3, 3}, {1, 1}, {1, 1}, {0, 0}, {0, 0}, ngraph::op::PadType::VALID,
2, 1, {5, 5}, 16, 16, avx512_GroupConv_2D, vecPrcConnectParams),
// jcp.nb_ic == 2
makeSingleGroupConvCPUTestCase({3, 3}, {1, 1}, {1, 1}, {0, 0}, {0, 0}, ngraph::op::PadType::VALID, 2, 1, {5, 5}, 32, 16, conv_avx512_2D),
makeSingleGroupConvCPUTestCases({3, 3}, {1, 1}, {1, 1}, {0, 0}, {0, 0}, ngraph::op::PadType::VALID,
2, 1, {5, 5}, 32, 16, avx512_GroupConv_2D, vecPrcConnectParams),
// jcp.nb_oc == 2
makeSingleGroupConvCPUTestCase({3, 3}, {1, 1}, {1, 1}, {0, 0}, {0, 0}, ngraph::op::PadType::VALID, 2, 1, {5, 5}, 16, 32, conv_avx512_2D),
makeSingleGroupConvCPUTestCases({3, 3}, {1, 1}, {1, 1}, {0, 0}, {0, 0}, ngraph::op::PadType::VALID,
2, 1, {5, 5}, 16, 32, avx512_GroupConv_2D, vecPrcConnectParams),
// "hard" cases
makeSingleGroupConvCPUTestCase({3, 3}, {2, 2}, {1, 1}, {1, 1}, {1, 1}, ngraph::op::PadType::EXPLICIT, 3, 2, {129, 129}, 16, 16,
conv_avx512_2D),
makeSingleGroupConvCPUTestCase({2, 4}, {1, 2}, {3, 2}, {2, 1}, {1, 0}, ngraph::op::PadType::EXPLICIT, 2, 1, {10, 10}, 16, 16, conv_avx512_2D),
makeSingleGroupConvCPUTestCase({3, 3, 3}, {2, 2, 2}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}, ngraph::op::PadType::EXPLICIT,
3, 2, {33, 33, 33}, 16, 16, conv_avx512_3D),
makeSingleGroupConvCPUTestCase({2, 3, 4}, {1, 2, 2}, {3, 1, 2}, {2, 2, 1}, {1, 1, 0}, ngraph::op::PadType::EXPLICIT,
2, 1, {10, 10, 10}, 16, 16, conv_avx512_3D),
};
makeSingleGroupConvCPUTestCases({3, 3}, {2, 2}, {1, 1}, {1, 1}, {1, 1}, ngraph::op::PadType::EXPLICIT, 3, 2, {129, 129}, 16, 16,
avx512_GroupConv_2D, vecPrcConnectParams),
makeSingleGroupConvCPUTestCases({2, 4}, {1, 2}, {3, 2}, {2, 1}, {1, 0}, ngraph::op::PadType::EXPLICIT,
2, 1, {10, 10}, 16, 16, avx512_GroupConv_2D, vecPrcConnectParamsDefault),
makeSingleGroupConvCPUTestCases({3, 3, 3}, {2, 2, 2}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}, ngraph::op::PadType::EXPLICIT,
3, 2, {33, 33, 33}, 16, 16, avx512_GroupConv_3D, vecPrcConnectParamsDefault),
makeSingleGroupConvCPUTestCases({2, 3, 4}, {1, 2, 2}, {3, 1, 2}, {2, 2, 1}, {1, 1, 0}, ngraph::op::PadType::EXPLICIT,
2, 1, {10, 10, 10}, 16, 16, avx512_GroupConv_3D, vecPrcConnectParams)
);
INSTANTIATE_TEST_CASE_P(smoke_JIT_AVX512_GroupConv, GroupConvolutionLayerCPUTest, ::testing::ValuesIn(filterParamsSetForDevice(JIT_AVX512_GroupConvTestCases)));
INSTANTIATE_TEST_CASE_P(smoke_JIT_AVX512_GroupConv, GroupConvolutionLayerCPUTest, ::testing::ValuesIn(filterParamsSetForDevice(JIT_AVX512_GroupConvTestCases)),
GroupConvolutionLayerCPUTest::getTestCaseName);
/* ============= JIT SSE42 DW GroupConvolution ============= */
const std::vector<groupConvLayerCPUTestParamsSet> JIT_SSE42_DW_GroupConvTestCases = {
const std::vector<CPUSpecificParams> sse42_DW_2D = {conv_sse42_dw_2D, conv_sse42_dw_2D_nspc};
const std::vector<CPUSpecificParams> sse42_DW_3D = {conv_sse42_dw_3D, conv_sse42_dw_3D_nspc};
const std::vector<groupConvLayerCPUTestParamsSet> JIT_SSE42_DW_GroupConvTestCases = generateSingleGroupConvCPUTestCases(
// 1. jcp.ngroups % simd_w (=0,!=0)
// 2. jcp.nb_ch
// 3. jcp.nb_ch_blocking (=2,<2)
// 4. jcp.ur_w == 3
// jcp.ngroups % simd_w == 0, jcp.nb_ch == 1, jcp.nb_ch_blocking == 1 (jcp.ngroups == 8)
makeSingleGroupConvCPUTestCase({3, 3}, {1, 1}, {1, 1}, {0, 0}, {0, 0}, ngraph::op::PadType::VALID, 8, 1, {5, 5}, 1, 1, conv_sse42_dw_2D),
makeSingleGroupConvCPUTestCases({3, 3}, {1, 1}, {1, 1}, {0, 0}, {0, 0}, ngraph::op::PadType::VALID,
8, 1, {5, 5}, 1, 1, sse42_DW_2D, vecPrcConnectParamsFP32),
// jcp.ngroups % simd_w == 0, jcp.nb_ch == 2, jcp.nb_ch_blocking == 2 (jcp.ngroups == 16)
makeSingleGroupConvCPUTestCase({3, 3}, {1, 1}, {1, 1}, {0, 0}, {0, 0}, ngraph::op::PadType::VALID, 16, 1, {5, 5}, 1, 1, conv_sse42_dw_2D),
makeSingleGroupConvCPUTestCases({3, 3}, {1, 1}, {1, 1}, {0, 0}, {0, 0}, ngraph::op::PadType::VALID,
16, 1, {5, 5}, 1, 1, sse42_DW_2D, vecPrcConnectParamsFP32),
// jcp.ngroups % simd_w != 0, jcp.nb_ch == 3, jcp.nb_ch_blocking == 2 (jcp.ngroups == 17) TODO: pad channels not supported for SSE42
// makeSingleGroupConvCPUTestCase({3, 3}, {1, 1}, {1, 1}, {0, 0}, {0, 0}, ngraph::op::PadType::VALID, 17, 1, {5, 5}, 1, 1, conv_sse42_dw_2D),
// makeSingleGroupConvCPUTestCases({3, 3}, {1, 1}, {1, 1}, {0, 0}, {0, 0}, ngraph::op::PadType::VALID,
// 17, 1, {5, 5}, 1, 1, sse42_DW_2D, vecPrcConnectParamsFP32),
// jcp.ow > jcp.ur_w (jcp.ow == 7)
makeSingleGroupConvCPUTestCase({3, 3}, {1, 1}, {1, 1}, {0, 0}, {0, 0}, ngraph::op::PadType::VALID, 8, 1, {5, 9}, 1, 1, conv_sse42_dw_2D),
makeSingleGroupConvCPUTestCases({3, 3}, {1, 1}, {1, 1}, {0, 0}, {0, 0}, ngraph::op::PadType::VALID,
8, 1, {5, 9}, 1, 1, sse42_DW_2D, vecPrcConnectParamsFP32),
// "hard" cases
makeSingleGroupConvCPUTestCase({3, 3}, {2, 2}, {1, 1}, {1, 1}, {1, 1}, ngraph::op::PadType::EXPLICIT, 8, 2, {129, 129}, 1, 1,
conv_sse42_dw_2D),
makeSingleGroupConvCPUTestCase({2, 4}, {1, 2}, {3, 2}, {2, 1}, {1, 0}, ngraph::op::PadType::EXPLICIT, 8, 1, {10, 10}, 1, 1, conv_sse42_dw_2D),
makeSingleGroupConvCPUTestCase({3, 3, 3}, {2, 2, 2}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}, ngraph::op::PadType::EXPLICIT,
8, 2, {33, 33, 33}, 1, 1, conv_sse42_dw_3D),
makeSingleGroupConvCPUTestCase({2, 3, 4}, {1, 2, 2}, {3, 1, 2}, {2, 2, 1}, {1, 1, 0}, ngraph::op::PadType::EXPLICIT,
8, 1, {10, 10, 10}, 1, 1, conv_sse42_dw_3D),
};
makeSingleGroupConvCPUTestCases({3, 3}, {2, 2}, {1, 1}, {1, 1}, {1, 1}, ngraph::op::PadType::EXPLICIT, 8, 2, {129, 129}, 1, 1,
sse42_DW_2D, vecPrcConnectParamsFP32),
makeSingleGroupConvCPUTestCases({2, 4}, {1, 2}, {3, 2}, {2, 1}, {1, 0}, ngraph::op::PadType::EXPLICIT,
8, 1, {10, 10}, 1, 1, sse42_DW_2D, vecPrcConnectParamsFP32Default),
makeSingleGroupConvCPUTestCases({3, 3, 3}, {2, 2, 2}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}, ngraph::op::PadType::EXPLICIT,
8, 2, {33, 33, 33}, 1, 1, sse42_DW_3D, vecPrcConnectParamsFP32Default),
makeSingleGroupConvCPUTestCases({2, 3, 4}, {1, 2, 2}, {3, 1, 2}, {2, 2, 1}, {1, 1, 0}, ngraph::op::PadType::EXPLICIT,
8, 1, {10, 10, 10}, 1, 1, sse42_DW_3D, vecPrcConnectParamsFP32)
);
INSTANTIATE_TEST_CASE_P(smoke_JIT_SSE42_DW_GroupConv, GroupConvolutionLayerCPUTest, ::testing::ValuesIn(filterParamsSetForDevice
(JIT_SSE42_DW_GroupConvTestCases)));
(JIT_SSE42_DW_GroupConvTestCases)), GroupConvolutionLayerCPUTest::getTestCaseName);
/* ============= JIT AVX2 DW GroupConvolution ============= */
const std::vector<groupConvLayerCPUTestParamsSet> JIT_AVX2_DW_GroupConvTestCases = {
const std::vector<CPUSpecificParams> avx2_DW_2D = {conv_avx2_dw_2D, conv_avx2_dw_2D_nspc};
const std::vector<CPUSpecificParams> avx2_DW_3D = {conv_avx2_dw_3D, conv_avx2_dw_3D_nspc};
const std::vector<groupConvLayerCPUTestParamsSet> JIT_AVX2_DW_GroupConvTestCases = generateSingleGroupConvCPUTestCases(
// 1. jcp.ngroups % simd_w (=0,!=0)
// 2. jcp.nb_ch
// 3. jcp.nb_ch_blocking (=3,<3)
// 4. jcp.ur_w == 4
// jcp.ngroups % simd_w == 0, jcp.nb_ch == 1, jcp.nb_ch_blocking == 1 (jcp.ngroups == 8)
makeSingleGroupConvCPUTestCase({3, 3}, {1, 1}, {1, 1}, {0, 0}, {0, 0}, ngraph::op::PadType::VALID, 8, 1, {5, 5}, 1, 1, conv_avx2_dw_2D),
makeSingleGroupConvCPUTestCases({3, 3}, {1, 1}, {1, 1}, {0, 0}, {0, 0}, ngraph::op::PadType::VALID,
8, 1, {5, 5}, 1, 1, avx2_DW_2D, vecPrcConnectParamsFP32),
// jcp.ngroups % simd_w == 0, jcp.nb_ch == 3, jcp.nb_ch_blocking == 3 (jcp.ngroups == 24)
makeSingleGroupConvCPUTestCase({3, 3}, {1, 1}, {1, 1}, {0, 0}, {0, 0}, ngraph::op::PadType::VALID, 24, 1, {5, 5}, 1, 1, conv_avx2_dw_2D),
makeSingleGroupConvCPUTestCases({3, 3}, {1, 1}, {1, 1}, {0, 0}, {0, 0}, ngraph::op::PadType::VALID,
24, 1, {5, 5}, 1, 1, avx2_DW_2D, vecPrcConnectParamsFP32),
// jcp.ngroups % simd_w != 0, jcp.nb_ch == 4, jcp.nb_ch_blocking == 3 (jcp.ngroups == 25)
makeSingleGroupConvCPUTestCase({3, 3}, {1, 1}, {1, 1}, {0, 0}, {0, 0}, ngraph::op::PadType::VALID, 25, 1, {5, 5}, 1, 1, conv_avx2_dw_2D),
makeSingleGroupConvCPUTestCases({3, 3}, {1, 1}, {1, 1}, {0, 0}, {0, 0}, ngraph::op::PadType::VALID,
25, 1, {5, 5}, 1, 1, avx2_DW_2D, vecPrcConnectParamsFP32),
// jcp.ow > jcp.ur_w (jcp.ow == 7)
makeSingleGroupConvCPUTestCase({3, 3}, {1, 1}, {1, 1}, {0, 0}, {0, 0}, ngraph::op::PadType::VALID, 8, 1, {5, 9}, 1, 1, conv_avx2_dw_2D),
makeSingleGroupConvCPUTestCases({3, 3}, {1, 1}, {1, 1}, {0, 0}, {0, 0}, ngraph::op::PadType::VALID,
8, 1, {5, 9}, 1, 1, avx2_DW_2D, vecPrcConnectParamsFP32),
// "hard" cases
makeSingleGroupConvCPUTestCase({3, 3}, {2, 2}, {1, 1}, {1, 1}, {1, 1}, ngraph::op::PadType::EXPLICIT, 8, 2, {129, 129}, 1, 1,
conv_avx2_dw_2D),
makeSingleGroupConvCPUTestCase({2, 4}, {1, 2}, {3, 2}, {2, 1}, {1, 0}, ngraph::op::PadType::EXPLICIT, 8, 1, {10, 10}, 1, 1, conv_avx2_dw_2D),
makeSingleGroupConvCPUTestCase({3, 3, 3}, {2, 2, 2}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}, ngraph::op::PadType::EXPLICIT,
8, 2, {33, 33, 33}, 1, 1, conv_avx2_dw_3D),
makeSingleGroupConvCPUTestCase({2, 3, 4}, {1, 2, 2}, {3, 1, 2}, {2, 2, 1}, {1, 1, 0}, ngraph::op::PadType::EXPLICIT,
8, 1, {10, 10, 10}, 1, 1, conv_avx2_dw_3D),
};
makeSingleGroupConvCPUTestCases({3, 3}, {2, 2}, {1, 1}, {1, 1}, {1, 1}, ngraph::op::PadType::EXPLICIT, 8, 2, {129, 129}, 1, 1,
avx2_DW_2D, vecPrcConnectParamsFP32Default),
makeSingleGroupConvCPUTestCases({2, 4}, {1, 2}, {3, 2}, {2, 1}, {1, 0}, ngraph::op::PadType::EXPLICIT,
8, 1, {10, 10}, 1, 1, avx2_DW_2D, vecPrcConnectParamsFP32Default),
makeSingleGroupConvCPUTestCases({3, 3, 3}, {2, 2, 2}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}, ngraph::op::PadType::EXPLICIT,
8, 2, {33, 33, 33}, 1, 1, avx2_DW_3D, vecPrcConnectParamsFP32Default),
makeSingleGroupConvCPUTestCases({2, 3, 4}, {1, 2, 2}, {3, 1, 2}, {2, 2, 1}, {1, 1, 0}, ngraph::op::PadType::EXPLICIT,
8, 1, {10, 10, 10}, 1, 1, avx2_DW_3D, vecPrcConnectParamsFP32)
);
INSTANTIATE_TEST_CASE_P(smoke_JIT_AVX2_DW_GroupConv, GroupConvolutionLayerCPUTest, ::testing::ValuesIn(filterParamsSetForDevice
(JIT_AVX2_DW_GroupConvTestCases)));
(JIT_AVX2_DW_GroupConvTestCases)), GroupConvolutionLayerCPUTest::getTestCaseName);
/* ============= JIT AVX512 DW GroupConvolution ============= */
const std::vector<groupConvLayerCPUTestParamsSet> JIT_AVX512_DW_GroupConvTestCases = {
const std::vector<CPUSpecificParams> avx512_DW_2D = {conv_avx512_dw_2D, conv_avx512_dw_2D_nspc};
const std::vector<CPUSpecificParams> avx512_DW_3D = {conv_avx512_dw_3D, conv_avx512_dw_3D_nspc};
const std::vector<groupConvLayerCPUTestParamsSet> JIT_AVX512_DW_GroupConvTestCases = generateSingleGroupConvCPUTestCases(
// 1. jcp.ngroups % simd_w (=0,!=0)
// 2. jcp.nb_ch
// 3. jcp.nb_ch_blocking (=4,<4)
// 4. jcp.ur_w == 6
// jcp.ngroups % simd_w == 0, jcp.nb_ch == 1, jcp.nb_ch_blocking == 1 (jcp.ngroups == 16)
makeSingleGroupConvCPUTestCase({3, 3}, {1, 1}, {1, 1}, {0, 0}, {0, 0}, ngraph::op::PadType::VALID, 16, 1, {5, 5}, 1, 1, conv_avx512_dw_2D),
makeSingleGroupConvCPUTestCases({3, 3}, {1, 1}, {1, 1}, {0, 0}, {0, 0}, ngraph::op::PadType::VALID,
16, 1, {5, 5}, 1, 1, avx512_DW_2D, vecPrcConnectParams),
// jcp.ngroups % simd_w == 0, jcp.nb_ch == 4, jcp.nb_ch_blocking == 4 (jcp.ngroups == 64)
makeSingleGroupConvCPUTestCase({3, 3}, {1, 1}, {1, 1}, {0, 0}, {0, 0}, ngraph::op::PadType::VALID, 64, 1, {5, 5}, 1, 1, conv_avx512_dw_2D),
makeSingleGroupConvCPUTestCases({3, 3}, {1, 1}, {1, 1}, {0, 0}, {0, 0}, ngraph::op::PadType::VALID,
64, 1, {5, 5}, 1, 1, avx512_DW_2D, vecPrcConnectParams),
// jcp.ngroups % simd_w != 0, jcp.nb_ch == 5, jcp.nb_ch_blocking == 4 (jcp.ngroups == 65)
makeSingleGroupConvCPUTestCase({3, 3}, {1, 1}, {1, 1}, {0, 0}, {0, 0}, ngraph::op::PadType::VALID, 65, 1, {5, 5}, 1, 1, conv_avx512_dw_2D),
makeSingleGroupConvCPUTestCases({3, 3}, {1, 1}, {1, 1}, {0, 0}, {0, 0}, ngraph::op::PadType::VALID,
65, 1, {5, 5}, 1, 1, avx512_DW_2D, vecPrcConnectParams),
// jcp.ow > jcp.ur_w (jcp.ow == 7)
makeSingleGroupConvCPUTestCase({3, 3}, {1, 1}, {1, 1}, {0, 0}, {0, 0}, ngraph::op::PadType::VALID, 8, 1, {5, 9}, 1, 1, conv_avx512_dw_2D),
makeSingleGroupConvCPUTestCases({3, 3}, {1, 1}, {1, 1}, {0, 0}, {0, 0}, ngraph::op::PadType::VALID,
8, 1, {5, 9}, 1, 1, avx512_DW_2D, vecPrcConnectParams),
// "hard" cases
makeSingleGroupConvCPUTestCase({3, 3}, {2, 2}, {1, 1}, {1, 1}, {1, 1}, ngraph::op::PadType::EXPLICIT, 16, 2, {129, 129}, 1, 1,
conv_avx512_dw_2D),
makeSingleGroupConvCPUTestCase({2, 4}, {1, 2}, {3, 2}, {2, 1}, {1, 0}, ngraph::op::PadType::EXPLICIT, 16, 1, {10, 10}, 1, 1,
conv_avx512_dw_2D),
makeSingleGroupConvCPUTestCase({3, 3, 3}, {2, 2, 2}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}, ngraph::op::PadType::EXPLICIT,
16, 2, {33, 33, 33}, 1, 1, conv_avx512_dw_3D),
makeSingleGroupConvCPUTestCase({2, 3, 4}, {1, 2, 2}, {3, 1, 2}, {2, 2, 1}, {1, 1, 0}, ngraph::op::PadType::EXPLICIT,
16, 1, {10, 10, 10}, 1, 1, conv_avx512_dw_3D),
};
makeSingleGroupConvCPUTestCases({3, 3}, {2, 2}, {1, 1}, {1, 1}, {1, 1}, ngraph::op::PadType::EXPLICIT, 16, 2, {129, 129}, 1, 1,
avx512_DW_2D, vecPrcConnectParamsDefault),
makeSingleGroupConvCPUTestCases({2, 4}, {1, 2}, {3, 2}, {2, 1}, {1, 0}, ngraph::op::PadType::EXPLICIT, 16, 1, {10, 10}, 1, 1,
avx512_DW_2D, vecPrcConnectParamsDefault),
makeSingleGroupConvCPUTestCases({3, 3, 3}, {2, 2, 2}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}, ngraph::op::PadType::EXPLICIT,
16, 2, {33, 33, 33}, 1, 1, avx512_DW_3D, vecPrcConnectParamsDefault),
makeSingleGroupConvCPUTestCases({2, 3, 4}, {1, 2, 2}, {3, 1, 2}, {2, 2, 1}, {1, 1, 0}, ngraph::op::PadType::EXPLICIT,
16, 1, {10, 10, 10}, 1, 1, avx512_DW_3D, vecPrcConnectParams)
);
INSTANTIATE_TEST_CASE_P(smoke_JIT_AVX512_DW_GroupConv, GroupConvolutionLayerCPUTest, ::testing::ValuesIn(filterParamsSetForDevice
(JIT_AVX512_DW_GroupConvTestCases)));
(JIT_AVX512_DW_GroupConvTestCases)), GroupConvolutionLayerCPUTest::getTestCaseName);
/* ============= JIT SSE42 1x1 Convolution (not supported with groups) ============= */
/* ============= JIT AVX2 1x1 Convolution (not supported with groups) ============= */

View File

@ -4,6 +4,7 @@
#include <shared_test_classes/single_layer/group_convolution_backprop_data.hpp>
#include "test_utils/cpu_test_utils.hpp"
#include "test_utils/convolution_params.hpp"
#include "test_utils/fusing_test_utils.hpp"
using namespace InferenceEngine;

View File

@ -2,6 +2,7 @@
// SPDX-License-Identifier: Apache-2.0
//
#include "test_utils/convolution_params.hpp"
#include "subgraph_tests/include/conv_concat.hpp"
using namespace InferenceEngine;

View File

@ -0,0 +1,73 @@
// Copyright (C) 2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#pragma once
#include "cpu_test_utils.hpp"
namespace CPUTestUtils {
const auto conv_sse42_1D = CPUSpecificParams{{}, {}, {"jit_sse42"}, "jit_sse42"};
const auto conv_avx2_1D = CPUSpecificParams{{}, {}, {"jit_avx2"}, "jit_avx2"};
const auto conv_avx512_1D = CPUSpecificParams{{}, {}, {"jit_avx512"}, "jit_avx512"};
const auto conv_ref_2D = CPUSpecificParams{{nchw}, {nchw}, {"ref_any"}, "ref_any"};
const auto conv_ref_3D = CPUSpecificParams{{ncdhw}, {ncdhw}, {"ref_any"}, "ref_any"};
const auto conv_gemm_2D = CPUSpecificParams{{nchw}, {nchw}, {"gemm_any"}, "jit_gemm"};
const auto conv_gemm_3D = CPUSpecificParams{{ncdhw}, {ncdhw}, {"gemm_any"}, "jit_gemm"};
const auto conv_gemm_2D_nspc = CPUSpecificParams{{nhwc}, {nhwc}, {"jit_gemm"}, "jit_gemm"};
const auto conv_gemm_3D_nspc = CPUSpecificParams{{ndhwc}, {ndhwc}, {"jit_gemm"}, "jit_gemm"};
const auto conv_sse42_2D = CPUSpecificParams{{nChw8c}, {nChw8c}, {"jit_sse42"}, "jit_sse42"};
const auto conv_sse42_3D = CPUSpecificParams{{nCdhw8c}, {nCdhw8c}, {"jit_sse42"}, "jit_sse42"};
const auto conv_sse42_dw_2D = CPUSpecificParams{{nChw8c}, {nChw8c}, {"jit_sse42_dw"}, "jit_sse42_dw"};
const auto conv_sse42_dw_3D = CPUSpecificParams{{nCdhw8c}, {nCdhw8c}, {"jit_sse42_dw"}, "jit_sse42_dw"};
const auto conv_sse42_plain_to_blocked_2D = CPUSpecificParams{{nchw}, {nChw8c}, {"jit_sse42"}, "jit_sse42"};
const auto conv_sse42_plain_to_blocked_3D = CPUSpecificParams{{ncdhw}, {nCdhw8c}, {"jit_sse42"}, "jit_sse42"};
const auto conv_sse42_2D_nspc = CPUSpecificParams{{nhwc}, {nhwc}, {"jit_sse42"}, "jit_sse42"};
const auto conv_sse42_3D_nspc = CPUSpecificParams{{ndhwc}, {ndhwc}, {"jit_sse42"}, "jit_sse42"};
const auto conv_sse42_dw_2D_nspc = CPUSpecificParams{{nhwc}, {nhwc}, {"jit_sse42_dw"}, "jit_sse42_dw"};
const auto conv_sse42_dw_3D_nspc = CPUSpecificParams{{ndhwc}, {ndhwc}, {"jit_sse42_dw"}, "jit_sse42_dw"};
const auto conv_avx2_2D = CPUSpecificParams{{nChw8c}, {nChw8c}, {"jit_avx2"}, "jit_avx2"};
const auto conv_avx2_3D = CPUSpecificParams{{nCdhw8c}, {nCdhw8c}, {"jit_avx2"}, "jit_avx2"};
const auto conv_avx2_dw_2D = CPUSpecificParams{{nChw8c}, {nChw8c}, {"jit_avx2_dw"}, "jit_avx2_dw"};
const auto conv_avx2_dw_3D = CPUSpecificParams{{nCdhw8c}, {nCdhw8c}, {"jit_avx2_dw"}, "jit_avx2_dw"};
const auto conv_avx2_planar_2D = CPUSpecificParams{{nchw}, {nchw}, {"jit_avx2"}, "jit_avx2"};
const auto conv_avx2_planar_3D = CPUSpecificParams{{ncdhw}, {ncdhw}, {"jit_avx2"}, "jit_avx2"};
const auto conv_avx2_plain_to_blocked_2D = CPUSpecificParams{{nchw}, {nChw8c}, {"jit_avx2"}, "jit_avx2"};
const auto conv_avx2_plain_to_blocked_3D = CPUSpecificParams{{ncdhw}, {nCdhw8c}, {"jit_avx2"}, "jit_avx2"};
const auto conv_avx2_2D_nspc = CPUSpecificParams{{nhwc}, {nhwc}, {"jit_avx2"}, "jit_avx2"};
const auto conv_avx2_3D_nspc = CPUSpecificParams{{ndhwc}, {ndhwc}, {"jit_avx2"}, "jit_avx2"};
const auto conv_avx2_dw_2D_nspc = CPUSpecificParams{{nhwc}, {nhwc}, {"jit_avx2_dw"}, "jit_avx2_dw"};
const auto conv_avx2_dw_3D_nspc = CPUSpecificParams{{ndhwc}, {ndhwc}, {"jit_avx2_dw"}, "jit_avx2_dw"};
const auto conv_avx512_2D = CPUSpecificParams{{nChw16c}, {nChw16c}, {"jit_avx512"}, "jit_avx512"};
const auto conv_avx512_3D = CPUSpecificParams{{nCdhw16c}, {nCdhw16c}, {"jit_avx512"}, "jit_avx512"};
const auto conv_avx512_dw_2D = CPUSpecificParams{{nChw16c}, {nChw16c}, {"jit_avx512_dw"}, "jit_avx512_dw"};
const auto conv_avx512_dw_3D = CPUSpecificParams{{nCdhw16c}, {nCdhw16c}, {"jit_avx512_dw"}, "jit_avx512_dw"};
const auto conv_avx512_planar_2D = CPUSpecificParams{{nchw}, {nchw}, {"jit_avx512"}, "jit_avx512"};
const auto conv_avx512_planar_3D = CPUSpecificParams{{ncdhw}, {ncdhw}, {"jit_avx512"}, "jit_avx512"};
const auto conv_avx512_plain_to_blocked_2D = CPUSpecificParams{{nchw}, {nChw16c}, {"jit_avx512"}, "jit_avx512"};
const auto conv_avx512_plain_to_blocked_3D = CPUSpecificParams{{ncdhw}, {nCdhw16c}, {"jit_avx512"}, "jit_avx512"};
const auto conv_avx512_2D_nspc = CPUSpecificParams{{nhwc}, {nhwc}, {"jit_avx512"}, "jit_avx512"};
const auto conv_avx512_3D_nspc = CPUSpecificParams{{ndhwc}, {ndhwc}, {"jit_avx512"}, "jit_avx512"};
const auto conv_avx512_dw_2D_nspc = CPUSpecificParams{{nhwc}, {nhwc}, {"jit_avx512_dw"}, "jit_avx512_dw"};
const auto conv_avx512_dw_3D_nspc = CPUSpecificParams{{ndhwc}, {ndhwc}, {"jit_avx512_dw"}, "jit_avx512_dw"};
const auto conv_sse42_2D_1x1 = CPUSpecificParams{{nChw8c}, {nChw8c}, {"jit_sse42_1x1"}, "jit_sse42_1x1"};
const auto conv_avx2_2D_1x1 = CPUSpecificParams{{nChw8c}, {nChw8c}, {"jit_avx2_1x1"}, "jit_avx2_1x1"};
const auto conv_avx512_2D_1x1 = CPUSpecificParams{{nChw16c}, {nChw16c}, {"jit_avx512_1x1"}, "jit_avx512_1x1"};
const auto conv_sse42_2D_1x1_nspc = CPUSpecificParams{{nhwc}, {nhwc}, {"jit_sse42_1x1"}, "jit_sse42_1x1"};
const auto conv_avx2_2D_1x1_nspc = CPUSpecificParams{{nhwc}, {nhwc}, {"jit_avx2_1x1"}, "jit_avx2_1x1"};
const auto conv_avx512_2D_1x1_nspc = CPUSpecificParams{{nhwc}, {nhwc}, {"jit_avx512_1x1"}, "jit_avx512_1x1"};
} // namespace CPUTestUtils
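For readers new to these tables: CPUSpecificParams (declared in cpu_test_utils.hpp) pairs the expected input memory formats, expected output memory formats, an implementation-priority filter, and the primitive type the test must end up selecting; the _nspc entries pin the channels-last (nhwc/ndhwc) layouts this commit enables. A minimal sketch of unpacking one entry, assuming that tuple layout:
// Minimal sketch, assuming CPUSpecificParams is the
// (inFmts, outFmts, priority, selectedType) tuple from cpu_test_utils.hpp.
std::vector<cpu_memory_format_t> inFmts, outFmts;
std::vector<std::string> priority;
std::string selectedType;
std::tie(inFmts, outFmts, priority, selectedType) = conv_avx512_2D_nspc;
// inFmts == {nhwc}, outFmts == {nhwc}, priority == {"jit_avx512"}, selectedType == "jit_avx512"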

View File

@ -148,35 +148,6 @@ protected:
const auto emptyCPUSpec = CPUSpecificParams{{}, {}, {}, {}};
const auto conv_sse42_1D = CPUSpecificParams{{}, {}, {"jit_sse42"}, "jit_sse42"};
const auto conv_avx2_1D = CPUSpecificParams{{}, {}, {"jit_avx2"}, "jit_avx2"};
const auto conv_avx512_1D = CPUSpecificParams{{}, {}, {"jit_avx512"}, "jit_avx512"};
const auto conv_ref_2D = CPUSpecificParams{{nchw}, {nchw}, {"ref_any"}, "ref_any"};
const auto conv_ref_3D = CPUSpecificParams{{ncdhw}, {ncdhw}, {"ref_any"}, "ref_any"};
const auto conv_gemm_2D = CPUSpecificParams{{nchw}, {nchw}, {"gemm_any"}, "jit_gemm"};
const auto conv_gemm_3D = CPUSpecificParams{{ncdhw}, {ncdhw}, {"gemm_any"}, "jit_gemm"};
const auto conv_sse42_2D = CPUSpecificParams{{nChw8c}, {nChw8c}, {"jit_sse42"}, "jit_sse42"};
const auto conv_sse42_3D = CPUSpecificParams{{nCdhw8c}, {nCdhw8c}, {"jit_sse42"}, "jit_sse42"};
const auto conv_sse42_dw_2D = CPUSpecificParams{{nChw8c}, {nChw8c}, {"jit_sse42_dw"}, "jit_sse42_dw"};
const auto conv_sse42_dw_3D = CPUSpecificParams{{nCdhw8c}, {nCdhw8c}, {"jit_sse42_dw"}, "jit_sse42_dw"};
const auto conv_avx2_2D = CPUSpecificParams{{nChw8c}, {nChw8c}, {"jit_avx2"}, "jit_avx2"};
const auto conv_avx2_3D = CPUSpecificParams{{nCdhw8c}, {nCdhw8c}, {"jit_avx2"}, "jit_avx2"};
const auto conv_avx2_dw_2D = CPUSpecificParams{{nChw8c}, {nChw8c}, {"jit_avx2_dw"}, "jit_avx2_dw"};
const auto conv_avx2_dw_3D = CPUSpecificParams{{nCdhw8c}, {nCdhw8c}, {"jit_avx2_dw"}, "jit_avx2_dw"};
const auto conv_avx512_2D = CPUSpecificParams{{nChw16c}, {nChw16c}, {"jit_avx512"}, "jit_avx512"};
const auto conv_avx512_3D = CPUSpecificParams{{nCdhw16c}, {nCdhw16c}, {"jit_avx512"}, "jit_avx512"};
const auto conv_avx512_dw_2D = CPUSpecificParams{{nChw16c}, {nChw16c}, {"jit_avx512_dw"}, "jit_avx512_dw"};
const auto conv_avx512_dw_3D = CPUSpecificParams{{nCdhw16c}, {nCdhw16c}, {"jit_avx512_dw"}, "jit_avx512_dw"};
const auto conv_sse42_2D_1x1 = CPUSpecificParams{{nChw8c}, {nChw8c}, {"jit_sse42_1x1"}, "jit_sse42_1x1"};
const auto conv_avx2_2D_1x1 = CPUSpecificParams{{nChw8c}, {nChw8c}, {"jit_avx2_1x1"}, "jit_avx2_1x1"};
const auto conv_avx512_2D_1x1 = CPUSpecificParams{{nChw16c}, {nChw16c}, {"jit_avx512_1x1"}, "jit_avx512_1x1"};
// utility functions
std::vector<CPUSpecificParams> filterCPUSpecificParams(std::vector<CPUSpecificParams>& paramsVector);
std::vector<CPUSpecificParams> filterCPUInfoForDevice(std::vector<CPUSpecificParams> CPUParams);

@ -1 +1 @@
Subproject commit a81b4753105bb0a1622790256b02f19916cce77c
Subproject commit 2f19a90c0273415e832520264c23d365b2dc43ed