[CPU] PSROIPooling node enhancements (#3851)

- bf16 support for PSROIPooling - nhwc, blocking formats support - code refactor & performance improvements - cpu specific tests
2021-01-28 11:55:54 +03:00 · 2021-01-28 11:55:54 +03:00 · a1422a49d7
commit a1422a49d7
parent 46f0775c09
5 changed files with 738 additions and 340 deletions
--- a/inference-engine/src/mkldnn_plugin/bf16transformer.h
+++ b/inference-engine/src/mkldnn_plugin/bf16transformer.h
@ -14,7 +14,7 @@ namespace MKLDNNPlugin {

 class BF16Transformer {
    const InferenceEngine::details::caseless_set<std::string> _initbf16 =
-        { "convolution", "fullyconnected", "innerproduct", "gemm", "RegionYolo", "Interpolate" };
+        { "convolution", "fullyconnected", "innerproduct", "gemm", "RegionYolo", "Interpolate", "PSROIPooling" };
    const InferenceEngine::details::caseless_set<std::string> _complementbf16 =
        { "relu", "tanh", "elu", "square", "abs", "sqrt", "linear", "bounded_relu", "soft_relu", "normalize",
          "sigmoid", "ReLU6", "not", "activation", "HSwish", "mish", "logistic", "mod", "resample",
--- a/inference-engine/src/mkldnn_plugin/nodes/psroi.cpp
+++ b/inference-engine/src/mkldnn_plugin/nodes/psroi.cpp
@ -6,8 +6,12 @@
 #include <cmath>
 #include <vector>
 #include <string>
-#include <algorithm>
+#include <mkldnn_types.h>
 #include "ie_parallel.hpp"
+#include "utils/bfloat16.hpp"
+#include <mkldnn_selective_build.h>
+
+using namespace MKLDNNPlugin;

 namespace InferenceEngine {
 namespace Extensions {
@ -17,18 +21,18 @@ class PSROIPoolingImpl: public ExtLayerBase {
 public:
    explicit PSROIPoolingImpl(const CNNLayer* layer) {
        try {
-            mode_ = layer->GetParamAsString("mode", "average");
-            if (mode_ != "bilinear_deformable")
+            mode = layer->GetParamAsString("mode", "average");
+            if (mode != "bilinear_deformable")
                if (layer->insData.size() !=  2 || layer->outData.size() != 1)
                    THROW_IE_EXCEPTION << "Incorrect number of input/output edges!";
            // LayerSetUp
-            output_dim_ = static_cast<size_t>(layer->GetParamAsInt("output_dim"));
-            group_size_ = static_cast<size_t>(layer->GetParamAsInt("group_size"));
-            spatial_scale_ = layer->GetParamAsFloat("spatial_scale");
-            pooled_height_ = static_cast<size_t>(layer->GetParamAsInt("pooled_height", static_cast<int>(group_size_)));
-            pooled_width_ = static_cast<size_t>(layer->GetParamAsInt("pooled_width", static_cast<int>(group_size_)));
-            spatial_bins_x_ = static_cast<size_t>(layer->GetParamAsInt("spatial_bins_x", 1));
-            spatial_bins_y_ = static_cast<size_t>(layer->GetParamAsInt("spatial_bins_y", 1));
+            outputDim = static_cast<size_t>(layer->GetParamAsInt("output_dim"));
+            groupSize = static_cast<size_t>(layer->GetParamAsInt("group_size"));
+            spatialScale = layer->GetParamAsFloat("spatial_scale");
+            pooledHeight = static_cast<size_t>(layer->GetParamAsInt("pooled_height", static_cast<int>(groupSize)));
+            pooledWidth = static_cast<size_t>(layer->GetParamAsInt("pooled_width", static_cast<int>(groupSize)));
+            spatialBinsX = static_cast<size_t>(layer->GetParamAsInt("spatial_bins_x", 1));
+            spatialBinsY = static_cast<size_t>(layer->GetParamAsInt("spatial_bins_y", 1));

            SizeVector inDims = layer->insData[0].lock()->getTensorDesc().getDims();
            channels = static_cast<int>(inDims[1]);
@ -42,250 +46,460 @@ public:
            nw = static_cast<int>(outDims[3]);

             //  for Deformable PSROIPooling
-            no_trans_ = layer->GetParamAsBool("no_trans", true);
-            part_size_ = layer->GetParamAsInt("part_size", 1);
-            trans_std_ = layer->GetParamAsFloat("trans_std", 1);
+            noTrans = layer->GetParamAsBool("no_trans", true);
+            partSize = layer->GetParamAsInt("part_size", 1);
+            transStd = layer->GetParamAsFloat("trans_std", 1);

-            if (no_trans_) {
-                addConfig(layer, {DataConfigurator(ConfLayout::PLN), DataConfigurator(ConfLayout::PLN)}, {DataConfigurator(ConfLayout::PLN)});
+            auto supportedPrecision = (layer->insData[0].lock()->getTensorDesc().getPrecision() == Precision::BF16 ? Precision::BF16 : Precision::FP32);
+
+            std::vector<std::pair<Layout, Layout> > plainConfs{
+                {NCHW, NCHW},
+                {NHWC, NHWC}
+            };
+
+            std::vector<std::pair<ConfLayout, ConfLayout> > blockConfs {
+                    {ConfLayout::BLK16, ConfLayout::BLK16},
+                    {ConfLayout::BLK8, ConfLayout::BLK8}
+            };
+
+            if (mode != "bilinear_deformable") {
+                for (auto conf : plainConfs) {
+                    LayerConfig config;
+                    DataConfig inConfig0, inConfig1, inConfig2;
+                    SizeVector propDims = layer->insData[1].lock()->getTensorDesc().getDims();
+                    inConfig0.desc = TensorDesc(supportedPrecision, inDims, conf.first);
+                    inConfig1.desc = TensorDesc(Precision::FP32, propDims, NC);
+                    config.inConfs.push_back(inConfig0);
+                    config.inConfs.push_back(inConfig1);
+                    DataConfig outConfig;
+                    outConfig.desc = TensorDesc(supportedPrecision, outDims, conf.second);
+                    config.outConfs.push_back(outConfig);
+                    confs.push_back(config);
+                }
+                for (auto conf : blockConfs) {
+                    addConfig(layer, {DataConfigurator(conf.first, supportedPrecision),
+                                      DataConfigurator(ConfLayout::PLN, Precision::FP32)},
+                              {DataConfigurator(conf.second, supportedPrecision)});
+                }
+            } else if (noTrans) {
+                addConfig(layer, {DataConfigurator(ConfLayout::PLN, supportedPrecision), DataConfigurator(ConfLayout::PLN, Precision::FP32)},
+                            {DataConfigurator(ConfLayout::PLN, supportedPrecision)});
            } else {
-                addConfig(layer, {DataConfigurator(ConfLayout::PLN), DataConfigurator(ConfLayout::PLN),
-                                  DataConfigurator(ConfLayout::PLN)}, {DataConfigurator(ConfLayout::PLN)});
+                addConfig(layer, {DataConfigurator(ConfLayout::PLN, supportedPrecision),
+                                  DataConfigurator(ConfLayout::PLN, Precision::FP32),
+                                  DataConfigurator(ConfLayout::PLN)}, {DataConfigurator(ConfLayout::PLN, supportedPrecision)});
            }
        } catch (InferenceEngine::details::InferenceEngineException &ex) {
            errorMsg = ex.what();
        }
    }

-    StatusCode execute(std::vector<Blob::Ptr>& inputs, std::vector<Blob::Ptr>& outputs,
-                       ResponseDesc *resp) noexcept override {
-        float* dst_data = outputs[0]->buffer();
-        const float *bottom_data_beginning = inputs[0]->buffer();
-        const float *bottom_rois_beginning = inputs[1]->buffer();
+    struct PSROIPoolingContext {  // argument bundle consumed by PSROIPoolingExecute::operator()
+        PSROIPoolingImpl &node;  // node whose executeSpecified<>() gets invoked
+        std::vector<Blob::Ptr>& inputs;  // passed through to executeSpecified<>() as-is
+        std::vector<Blob::Ptr>& outputs;  // passed through to executeSpecified<>() as-is
+    };

-        int real_rois = 0;
-        for (; real_rois < nn; real_rois++) {
-            const float *bottom_rois = bottom_rois_beginning + real_rois * 5;
-            int roi_batch_ind = static_cast<int>(bottom_rois[0]);
-            if (roi_batch_ind == -1) {
+    template<typename T>
+    struct PSROIPoolingExecute {  // functor that runs executeSpecified for the type pair carried by tuple-like T
+        using srcT = typename std::tuple_element<0, T>::type;  // element 0 of T: source data type
+        using dstT = typename std::tuple_element<1, T>::type;  // element 1 of T: destination data type
+
+        void operator()(PSROIPoolingContext & ctx) {
+            ctx.node.executeSpecified<srcT, dstT>(ctx.inputs, ctx.outputs);  // forward to the typed implementation
+        }
+    };
+
+    /**
+     * Derives the addressing parameters used by the PSROI kernels from the source and
+     * destination tensor descriptors. All results are written through the reference parameters:
+     *  - inFmt / outFmt: layouts reported by the descriptors;
+     *  - inBlockSize / outBlockSize: block dim [4] for Layout::BLOCKED, otherwise 1;
+     *  - inputChannelsPadding / outputChannelsPadding: block dim [1] multiplied by the block size;
+     *  - outBlockCount: outputChannelsPadding / outBlockSize;
+     *  - hInputStride, wInputStride, hOutputStride, wOutputStride: strides of the blocking-desc
+     *    positions whose order value is 2 (H) and 3 (W); position 0 is used if the value is absent.
+     *
+     * Throws (THROW_IE_EXCEPTION) when the number of block dims is not 5 for Layout::BLOCKED
+     * or 4 for any other layout.
+     */
+    static void unpackParams(const TensorDesc& srcDesc, const TensorDesc& dstDesc,
+                      int& hInputStride, int& wInputStride,
+                      int& hOutputStride, int& wOutputStride,
+                      Layout& inFmt, Layout& outFmt,
+                      int& inBlockSize, int& outBlockSize,
+                      int& outBlockCount,
+                      unsigned long& inputChannelsPadding, unsigned long& outputChannelsPadding) {
+        inFmt = srcDesc.getLayout();
+        outFmt = dstDesc.getLayout();
+
+        // Bind once by reference instead of re-querying (and copying) the descriptors on every use.
+        const auto& inBlkDesc = srcDesc.getBlockingDesc();
+        const auto& outBlkDesc = dstDesc.getBlockingDesc();
+        const auto& inBlkDims = inBlkDesc.getBlockDims();
+        const auto& outBlkDims = outBlkDesc.getBlockDims();
+
+        // size_t keeps the comparisons with size() free of signed/unsigned mixing.
+        const size_t expectedInBlockDimsSize = (inFmt == Layout::BLOCKED ? 5 : 4);
+        const size_t expectedOutBlockDimsSize = (outFmt == Layout::BLOCKED ? 5 : 4);
+        if (inBlkDims.size() != expectedInBlockDimsSize)
+            THROW_IE_EXCEPTION << "Unexpected size of blocking dims in input (given " << inBlkDims.size() << ", expected " << expectedInBlockDimsSize << ")";
+        if (outBlkDims.size() != expectedOutBlockDimsSize)
+            THROW_IE_EXCEPTION << "Unexpected size of blocking dims in output (given " << outBlkDims.size() << ", expected " << expectedOutBlockDimsSize << ")";
+
+        inBlockSize = (inFmt == Layout::BLOCKED ? static_cast<int>(inBlkDims[4]) : 1);
+        outBlockSize = (outFmt == Layout::BLOCKED ? static_cast<int>(outBlkDims[4]) : 1);
+        inputChannelsPadding = inBlkDims[1] * inBlockSize;
+        outputChannelsPadding = outBlkDims[1] * outBlockSize;
+        outBlockCount = static_cast<int>(outputChannelsPadding / outBlockSize);
+
+        // Locate the positions of the H (2) and W (3) axes inside the blocking order.
+        size_t hOutStrIndex = 0, wOutStrIndex = 0, hInStrIndex = 0, wInStrIndex = 0;
+        const auto& outOrder = outBlkDesc.getOrder();
+        const auto& inOrder = inBlkDesc.getOrder();
+        for (size_t i = 0; i < outOrder.size(); i++) {
+            if (outOrder[i] == 2) hOutStrIndex = i;
+            if (outOrder[i] == 3) wOutStrIndex = i;
+        }
+        for (size_t i = 0; i < inOrder.size(); i++) {
+            if (inOrder[i] == 2) hInStrIndex = i;
+            if (inOrder[i] == 3) wInStrIndex = i;
+        }
+
+        const auto& inStrides = inBlkDesc.getStrides();
+        const auto& outStrides = outBlkDesc.getStrides();
+        hInputStride = static_cast<int>(inStrides[hInStrIndex]);
+        wInputStride = static_cast<int>(inStrides[wInStrIndex]);
+        hOutputStride = static_cast<int>(outStrides[hOutStrIndex]);
+        wOutputStride = static_cast<int>(outStrides[wOutStrIndex]);
+    }
+
+    /**
+     * "average" mode PSROI pooling for a single ROI.
+     *
+     * For every output element (c, h, w) the source values inside the corresponding bin of
+     * input channel gc = (c * groupSize + h) * groupSize + w are averaged; an empty bin yields 0.
+     *
+     * @param srcData      source feature map (layout described by srcDesc)
+     * @param dstData      destination buffer (layout described by dstDesc)
+     * @param bottomRois   ROI record; elements [1..4] are read as start W, start H, end W, end H
+     * @param n            index of the ROI in the output
+     * @param roiBatchInd  batch index of the source image this ROI refers to
+     * @param srcDesc      source tensor descriptor (NHWC, NCHW or blocked)
+     * @param dstDesc      destination tensor descriptor
+     */
+    template <typename inputType, typename outputType>
+    void executeAverage(const inputType *srcData, outputType *dstData, const float *bottomRois,
+                        const int n, const int roiBatchInd,
+                        const TensorDesc& srcDesc, const TensorDesc& dstDesc) {
+        Layout inFmt, outFmt;
+        int inBlockSize, outBlockSize, outBlockCount, hInputStride, wInputStride, hOutputStride, wOutputStride;
+        unsigned long inputChannelsPadding, outputChannelsPadding;
+        unpackParams(srcDesc, dstDesc, hInputStride, wInputStride, hOutputStride, wOutputStride,
+            inFmt, outFmt, inBlockSize, outBlockSize, outBlockCount, inputChannelsPadding, outputChannelsPadding);
+        const float roiStartW = static_cast<float>(round(bottomRois[1])) * spatialScale;
+        const float roiStartH = static_cast<float>(round(bottomRois[2])) * spatialScale;
+        // The end coordinate is rounded first and then extended by one, the same way
+        // executeBilinearDeformable computes it.
+        const float roiEndW   = static_cast<float>(round(bottomRois[3]) + 1.0f) * spatialScale;
+        const float roiEndH   = static_cast<float>(round(bottomRois[4]) + 1.0f) * spatialScale;
+        // Force too small ROIs to be 1x1
+        const float roiWidth  = std::max<float>(roiEndW - roiStartW, 0.1f);  // avoid 0
+        const float roiHeight = std::max<float>(roiEndH - roiStartH, 0.1f);
+        // Bin sizes depend only on the ROI, so they are computed once rather than per element.
+        const float binSizeH = roiHeight / static_cast<float>(pooledHeight);
+        const float binSizeW = roiWidth / static_cast<float>(pooledWidth);
+
+        // Averages one bin. binOffIn/binOffOut are the base offsets of the channel (block) in the
+        // source/destination, inBlkRes/outBlkRes the position inside a channel block.
+        auto avgPsroi = [&] (int c, int h, int w, int binOffIn, int binOffOut, int inBlkRes, int outBlkRes) {
+            int hStart = static_cast<int>(floor(static_cast<float>(h + 0) * binSizeH + roiStartH));
+            int hEnd = static_cast<int>(ceil(static_cast<float>(h + 1) * binSizeH + roiStartH));
+
+            hStart = std::min<int>(std::max<int>(hStart, 0), height);
+            hEnd = std::min<int>(std::max<int>(hEnd, 0), height);
+            int wStart = static_cast<int>(floor(static_cast<float>(w + 0) * binSizeW + roiStartW));
+            int wEnd = static_cast<int>(ceil(static_cast<float>(w + 1) * binSizeW + roiStartW));
+
+            wStart = std::min<int>(std::max<int>(wStart, 0), width);
+            wEnd = std::min<int>(std::max<int>(wEnd, 0), width);
+
+            const float binArea = static_cast<float>((hEnd - hStart) * (wEnd - wStart));
+
+            size_t dstIndex = binOffOut + h * hOutputStride + w * wOutputStride + outBlkRes;
+            dstData[dstIndex] = 0;
+            if (binArea) {
+                float outSum = 0.0f;
+                // Loop directly over element offsets so the stride multiplication is done once per bound.
+                const int heightIndexBound = hEnd * hInputStride;
+                const int widthIndexBound = wEnd * wInputStride;
+                for (int hh = hStart * hInputStride; hh < heightIndexBound; hh += hInputStride) {
+                    for (int ww = wStart * wInputStride; ww < widthIndexBound; ww += wInputStride) {
+                        outSum += srcData[binOffIn + hh + ww + inBlkRes];
+                    }
+                }
+                dstData[dstIndex] = outSum / binArea;
+            }
+        };
+        if (inFmt == Layout::NHWC) {
+            parallel_for2d(nh, nw, [&](int h, int w) {
+                const int binOffsetOutput = n * nc * nh * nw;
+                const int binOffsetInput = roiBatchInd * channels * height * width;
+                for (int c = 0; c < nc; c++) {
+                    const int gc = (c * groupSize + h) * groupSize + w;
+                    // Channels are the innermost axis: the channel index is added to the base offset.
+                    avgPsroi(c, h, w, binOffsetInput + gc, binOffsetOutput + c, 0, 0);
+                }
+            });
+        } else if (inFmt == Layout::NCHW) {
+            parallel_for3d(nc, nh, nw, [&](int c, int h, int w) {
+                const int gc = (c * groupSize + h) * groupSize + w;
+                const int outputBlockResidual = (outFmt == Layout::NCHW ? 0 : c % outBlockSize);
+                const int outputBlockIdx = (c / outBlockSize) * outBlockSize;
+                const int binOffsetInput = (roiBatchInd * inputChannelsPadding + gc) * height * width;
+                const int binOffsetOutput = (n * outputChannelsPadding + outputBlockIdx) * nh * nw;
+                avgPsroi(c, h, w, binOffsetInput, binOffsetOutput, 0, outputBlockResidual);
+            });
+        } else {  // nChw16c, nChw8c
+            parallel_for3d(outBlockCount, nh, nw, [&](int blkIdx, int h, int w) {
+                int cStart = blkIdx * outBlockSize;
+                // The last block may be only partially filled with real channels.
+                int cEnd = (blkIdx == outBlockCount - 1 ? nc : cStart + outBlockSize);
+                for (int c = cStart; c < cEnd; c++) {
+                    const int gc = (c * groupSize + h) * groupSize + w;
+                    const int inputBlockResidual = (inFmt == Layout::NCHW ? 0 : gc % inBlockSize);
+                    const int outputBlockResidual = (outFmt == Layout::NCHW ? 0 : c % outBlockSize);
+                    const int inputBlockIdx = (gc / inBlockSize) * inBlockSize;
+                    const int outputBlockIdx = (c / outBlockSize) * outBlockSize;
+                    const int binOffsetInput = (roiBatchInd * inputChannelsPadding + inputBlockIdx) * height * width;
+                    const int binOffsetOutput = (n * outputChannelsPadding + outputBlockIdx) * nh * nw;
+                    avgPsroi(c, h, w, binOffsetInput, binOffsetOutput, inputBlockResidual, outputBlockResidual);
+                }
+            });
+        }
+    }
+
+    /**
+     * "bilinear" mode PSROI pooling for a single ROI.
+     *
+     * For every output element (c, h, w) one bilinearly interpolated sample is taken per spatial
+     * bin (spatialBinsX * spatialBinsY bins, each read from input channel
+     * gc = c + (binY * spatialBinsX + binX) * nc); the samples are summed and divided by the
+     * number of bins. Samples whose coordinates fall outside the feature map contribute 0.
+     *
+     * @param srcData      source feature map (layout described by srcDesc)
+     * @param dstData      destination buffer (layout described by dstDesc)
+     * @param bottomRois   ROI record; elements [1..4] are read as start W, start H, end W, end H
+     * @param currentRoi   index of the ROI in the output
+     * @param roiBatchInd  batch index of the source image this ROI refers to
+     * @param srcDesc      source tensor descriptor (NHWC, NCHW or blocked)
+     * @param dstDesc      destination tensor descriptor
+     */
+    template <typename inputType, typename outputType>
+    void executeBilinear(const inputType *srcData, outputType *dstData, const float *bottomRois,
+                                     const int currentRoi, const int roiBatchInd,
+                                     const TensorDesc& srcDesc, const TensorDesc& dstDesc) {
+        Layout inFmt, outFmt;
+        int inBlockSize, outBlockSize, outBlockCount, hInputStride, wInputStride, hOutputStride, wOutputStride;
+        unsigned long inputChannelsPadding, outputChannelsPadding;
+        unpackParams(srcDesc, dstDesc, hInputStride, wInputStride, hOutputStride, wOutputStride,
+                     inFmt, outFmt, inBlockSize, outBlockSize, outBlockCount, inputChannelsPadding, outputChannelsPadding);
+        const float roiStartW = bottomRois[1] * spatialScale;
+        const float roiStartH = bottomRois[2] * spatialScale;
+        const float roiEndW = bottomRois[3] * spatialScale;
+        const float roiEndH = bottomRois[4] * spatialScale;
+        const float roiWidth  = roiEndW - roiStartW;
+        const float roiHeight = roiEndH - roiStartH;
+        size_t numBins = spatialBinsX * spatialBinsY;
+        const int binCount = nh * nw;
+
+        // Computes one output element. binOffOut is the base offset of the output channel (block),
+        // outBlkRes the position of the channel inside its block.
+        auto bilinearPsroi = [&] (int c, int h, int w, int binOffOut, int outBlkRes) {
+            float accum = 0.0f;
+            int binOffIn, inBlkRes;
+            size_t dstIndex = binOffOut + h * hOutputStride + w * wOutputStride + outBlkRes;
+            dstData[dstIndex] = 0;
+
+            for (size_t binY = 0; binY < spatialBinsY; binY++) {
+                const float boxYmin = roiStartH + (binY + 0) * (roiHeight / spatialBinsY);
+                const float boxYmax = roiStartH + (binY + 1) * (roiHeight / spatialBinsY);
+                const float heightScale = nh > 1 ? (boxYmax - boxYmin) * (height - 1) / (pooledHeight - 1) : 0.0f;
+                const float inY = nh > 1 ? (h * heightScale + boxYmin * (height - 1)) : 0.5f * (boxYmin + boxYmax) * (height - 1);
+                for (size_t binX = 0; binX < spatialBinsX; binX++) {
+                    size_t gc = c + (binY * spatialBinsX + binX) * nc;
+                    if (inFmt == Layout::NHWC) {
+                        binOffIn = roiBatchInd * channels * height * width + gc;
+                        inBlkRes = 0;
+                    } else {  // nchw, nChw16c, nChw8c
+                        const int inputBlockIdx = (gc / inBlockSize) * inBlockSize;
+                        binOffIn = (roiBatchInd * inputChannelsPadding + inputBlockIdx) * height * width;
+                        inBlkRes = (inFmt == Layout::BLOCKED ? gc % inBlockSize : 0);
+                    }
+                    const auto *bottomData = srcData + binOffIn;
+
+                    const float boxXmin = roiStartW + (binX + 0) * (roiWidth / spatialBinsX);
+                    const float boxXmax = roiStartW + (binX + 1) * (roiWidth / spatialBinsX);
+
+                    const float widthScale = nw > 1 ? (boxXmax - boxXmin) * (width - 1) / (pooledWidth - 1) : 0.0f;
+                    const float inX = nw > 1 ? (w * widthScale + boxXmin * (width - 1)) : 0.5f * (boxXmin + boxXmax) * (width - 1);
+
+                    // Only sample points that lie inside the feature map are accumulated.
+                    if (!(inY < 0 || inY > height - 1 || inX < 0 || inX > width - 1)) {
+                        const int topYIndex = static_cast<int>(floorf(inY));
+                        int bottomYIndex = static_cast<int>(ceilf(inY));
+                        const int leftXIndex = static_cast<int>(floorf(inX));
+                        int rightXIndex = static_cast<int>(ceilf(inX));
+
+                        if (rightXIndex > width - 1) rightXIndex = width - 1;
+                        if (bottomYIndex > height - 1) bottomYIndex = height - 1;
+
+                        auto topLeftIndex = topYIndex * hInputStride + leftXIndex * wInputStride + inBlkRes;
+                        auto topRightIndex = topYIndex * hInputStride + rightXIndex * wInputStride + inBlkRes;
+                        auto bottomLeftIndex = bottomYIndex * hInputStride + leftXIndex * wInputStride + inBlkRes;
+                        auto bottomRightIndex = bottomYIndex * hInputStride + rightXIndex * wInputStride + inBlkRes;
+
+                        const float topLeft = bottomData[topLeftIndex];
+                        const float topRight = bottomData[topRightIndex];
+                        const float bottomLeft = bottomData[bottomLeftIndex];
+                        const float bottomRight = bottomData[bottomRightIndex];
+
+                        // Interpolate along X on both rows, then along Y between the rows.
+                        const float top = topLeft + (topRight - topLeft) * (inX - leftXIndex);
+                        const float bottom = bottomLeft + (bottomRight - bottomLeft) * (inX - leftXIndex);
+
+                        accum += top + (bottom - top) * (inY - topYIndex);
+                    }
+                }
+            }
+            accum /= numBins;
+            dstData[dstIndex] = accum;
+        };
+
+        if (inFmt == Layout::NHWC) {
+            const int binOffsetOutput = currentRoi * nc * nh * nw;
+            parallel_for2d(nh, nw, [&](int h, int w) {
+                for (int c = 0; c < nc; c++) {
+                    // Channels are the innermost axis: the channel index is added to the base offset.
+                    bilinearPsroi(c, h, w, binOffsetOutput + c, 0);
+                }
+            });
+        } else if (inFmt == Layout::NCHW) {
+            parallel_for3d(nc, nh, nw, [&](int c, int h, int w) {
+                bilinearPsroi(c, h, w, (currentRoi * outputChannelsPadding + c) * binCount, 0);
+            });
+        } else {  // nChw16c, nChw8c
+            parallel_for3d(outBlockCount, nh, nw, [&](int blkIdx, int h, int w) {
+                int cStart = blkIdx * outBlockSize;
+                // The last block may be only partially filled with real channels.
+                int cEnd = (blkIdx == outBlockCount - 1 ? nc : cStart + outBlockSize);
+                for (int c = cStart; c < cEnd; c++) {
+                    // Output addressing is derived from the output descriptor's block size and layout.
+                    const int outputBlockIdx = (c / outBlockSize) * outBlockSize;
+                    const int binOffsetOutput = (currentRoi * outputChannelsPadding + outputBlockIdx) * binCount;
+                    const int outputBlockResidual = (outFmt == Layout::BLOCKED ? c % outBlockSize : 0);
+                    bilinearPsroi(c, h, w, binOffsetOutput, outputBlockResidual);
+                }
+            });
+        }
+    }
+
+    /**
+     * "bilinear_deformable" mode PSROI pooling for a single ROI (plain layout addressing).
+     *
+     * Every output element (c, h, w) is the mean of up to spatialBinsX * spatialBinsY bilinearly
+     * interpolated samples taken inside its bin; the bin start is shifted by the per-part offsets
+     * read from bottomTrans unless noTrans is set. Samples outside the feature map are skipped,
+     * and the result is 0 when no sample was taken.
+     *
+     * @param srcData            source feature map
+     * @param dstData            destination buffer, indexed as ((roi * nc + c) * nh + h) * nw + w
+     * @param bottomRois         ROI record; elements [1..4] are read as start W, start H, end W, end H
+     * @param bottomTrans        offsets indexed as [roi][classId * 2 + {0: x, 1: y}][partH][partW];
+     *                           not dereferenced when noTrans is set
+     * @param numClasses         number of classes used to index bottomTrans
+     * @param channelsEachClass  number of output channels mapped to one class
+     * @param currentRoi         index of the ROI in the output
+     * @param roiBatchInd        batch index of the source image this ROI refers to
+     */
+    template <typename inputType, typename outputType>
+    void executeBilinearDeformable(const inputType *srcData, outputType *dstData, const float *bottomRois,
+                                   const float *bottomTrans, const int numClasses, const int channelsEachClass,
+                                   const int currentRoi, const int roiBatchInd) {
+        const float roiStartW = static_cast<float>(round(bottomRois[1])) * spatialScale - 0.5f;
+        const float roiStartH = static_cast<float>(round(bottomRois[2])) * spatialScale - 0.5f;
+        const float roiEndW   = static_cast<float>(round(bottomRois[3]) + 1.0f) * spatialScale - 0.5f;
+        const float roiEndH   = static_cast<float>(round(bottomRois[4]) + 1.0f) * spatialScale - 0.5f;
+        // Force too small ROIs to be 1x1
+        const float roiWidth  = std::max<float>(roiEndW - roiStartW, 0.1f);  // avoid 0
+        const float roiHeight = std::max<float>(roiEndH - roiStartH, 0.1f);
+        parallel_for3d(nc, nh, nw, [&](int c, int h, int w) {
+            size_t dstIndex = ((currentRoi * nc + c) * nh + h) * nw + w;
+            dstData[dstIndex] = 0;
+            // Compute w and h at bottom
+            float binSizeH = roiHeight / static_cast<float>(pooledHeight);
+            float binSizeW = roiWidth / static_cast<float>(pooledWidth);
+
+            // The sampling loops below take spatialBinsY steps along H and spatialBinsX steps
+            // along W, so each step size is derived from the bin count of its own axis.
+            float subBinSizeH = binSizeH / static_cast<float>(spatialBinsY);
+            float subBinSizeW = binSizeW / static_cast<float>(spatialBinsX);
+
+            int partH = h * partSize / pooledHeight;
+            int partW = w * partSize / pooledWidth;
+            int classId = c / channelsEachClass;
+            float transX = noTrans ? 0 :
+                           bottomTrans[(((currentRoi * numClasses + classId) * 2) * partSize + partH)
+                                       * partSize + partW] * transStd;
+            float transY = noTrans ? 0 :
+                           bottomTrans[(((currentRoi * numClasses + classId) * 2 + 1) * partSize + partH)
+                                       * partSize + partW] * transStd;
+
+            float wStart = w * binSizeW + roiStartW + transX * roiWidth;
+            float hStart = h * binSizeH + roiStartH + transY * roiHeight;
+
+            float sum = 0;
+            int count = 0;
+            int gw = w * groupSize / pooledWidth;
+            int gh = h * groupSize / pooledHeight;
+            gw = (std::min)((std::max)(gw, 0), static_cast<int>(groupSize - 1));
+            gh = (std::min)((std::max)(gh, 0), static_cast<int>(groupSize - 1));
+
+            const inputType* offsetBottomData = srcData + (roiBatchInd * channels) * height * width;
+            for (size_t ih = 0; ih < spatialBinsY; ih++) {
+                for (size_t iw = 0; iw < spatialBinsX; iw++) {
+                    float w1 = wStart + iw * subBinSizeW;
+                    float h1 = hStart + ih * subBinSizeH;
+                    // bilinear interpolation
+                    if (w1 < -0.5 || w1 > width - 0.5 || h1 < -0.5 || h1 > height - 0.5)
+                        continue;
+                    // Clamp the accepted sample point onto the valid pixel range.
+                    w1 = static_cast<float>((std::min)((std::max)(static_cast<double>(w1), 0.0), width - 1.0));
+                    h1 = static_cast<float>((std::min)((std::max)(static_cast<double>(h1), 0.0), height - 1.0));
+                    int c1 = static_cast<int>((c * groupSize + gh) * groupSize + gw);
+                    float val = bilinearInterp<inputType>(offsetBottomData +
+                                                          c1 * height * width, w1, h1, width);
+
+                    sum += val;
+                    count++;
+                }
+            }
+            dstData[dstIndex] = count == 0 ? 0 : sum / count;
+        });
+    }
+
+    template <typename inputType, typename outputType>
+    void executeSpecified(std::vector<Blob::Ptr>& inputs, std::vector<Blob::Ptr>& outputs) {
+        const auto *srcData = inputs[0]->cbuffer().as<const inputType*>() + inputs[0]->getTensorDesc().getBlockingDesc().getOffsetPadding();
+        const float *bottomRoisBeginning = inputs[1]->cbuffer().as<const float*>() + inputs[1]->getTensorDesc().getBlockingDesc().getOffsetPadding();
+        auto *dstData = outputs[0]->buffer().as<outputType*>() + outputs[0]->getTensorDesc().getBlockingDesc().getOffsetPadding();
+
+        auto srcDesc = inputs[0]->getTensorDesc();
+        auto dstDesc = outputs[0]->getTensorDesc();
+
+        int realRois = 0;
+        for (; realRois < nn; realRois++) {
+            int roiBatchInd = static_cast<int>(bottomRoisBeginning[realRois * 5]);
+            if (roiBatchInd == -1) {
                break;
            }
        }

        //  for Deformable PSROIPooling
-        float *bottom_trans = nullptr;
-        int num_classes = 1;
-        int channels_each_class = output_dim_;
-        if (!no_trans_) {
-            bottom_trans = inputs[2]->buffer();
-            num_classes = static_cast<int>(inputs[2]->getTensorDesc().getDims()[1]) / 2;
-            channels_each_class /= num_classes;
+        float *bottomTrans = nullptr;
+        int numClasses = 1;
+        int channelsEachClass = outputDim;
+        if (!noTrans) {
+            bottomTrans = inputs[2]->cbuffer().as<float*>() + inputs[2]->getTensorDesc().getBlockingDesc().getOffsetPadding();
+            numClasses = static_cast<int>(inputs[2]->getTensorDesc().getDims()[1]) / 2;
+            channelsEachClass /= numClasses;
        }

-        size_t num_bins = spatial_bins_x_*spatial_bins_y_;
-
-        parallel_for(real_rois, [&](int n) {
-            const float* bottom_rois = bottom_rois_beginning + n * 5;
-            int roi_batch_ind = static_cast<int>(bottom_rois[0]);
-            float roi_start_w = 0.0f;
-            float roi_start_h = 0.0f;
-            float roi_end_w   = 0.0f;
-            float roi_end_h   = 0.0f;
-            float roi_width   = 0.0f;
-            float roi_height  = 0.0f;
-
-            if (mode_ == "bilinear") {
-                roi_start_w = bottom_rois[1] * spatial_scale_;
-                roi_start_h = bottom_rois[2] * spatial_scale_;
-                roi_end_w = bottom_rois[3] * spatial_scale_;
-                roi_end_h = bottom_rois[4] * spatial_scale_;
-                roi_width  = roi_end_w - roi_start_w;
-                roi_height = roi_end_h - roi_start_h;
-            } else if (mode_ == "average") {
-                roi_start_w = static_cast<float>(round(bottom_rois[1])) * spatial_scale_;
-                roi_start_h = static_cast<float>(round(bottom_rois[2])) * spatial_scale_;
-                roi_end_w   = static_cast<float>(round(bottom_rois[3]) + 1.0f) * spatial_scale_;
-                roi_end_h   = static_cast<float>(round(bottom_rois[4]) + 1.0f) * spatial_scale_;
-                // Force too small ROIs to be 1x1
-                roi_width  = std::max<float>(roi_end_w - roi_start_w, 0.1f);  // avoid 0
-                roi_height = std::max<float>(roi_end_h - roi_start_h, 0.1f);
-            } else if (mode_ == "bilinear_deformable") {
-                roi_start_w = static_cast<float>(round(bottom_rois[1])) * spatial_scale_ - 0.5f;
-                roi_start_h = static_cast<float>(round(bottom_rois[2])) * spatial_scale_ - 0.5f;
-                roi_end_w   = static_cast<float>(round(bottom_rois[3]) + 1.0f) * spatial_scale_ - 0.5f;
-                roi_end_h   = static_cast<float>(round(bottom_rois[4]) + 1.0f) * spatial_scale_ - 0.5f;
-                // Force too small ROIs to be 1x1
-                roi_width  = std::max<float>(roi_end_w - roi_start_w, 0.1f);  // avoid 0
-                roi_height = std::max<float>(roi_end_h - roi_start_h, 0.1f);
-            }
-
-            for (int c = 0; c < nc; c++) {
-                for (int h = 0; h < nh; h++) {
-                    for (int w = 0; w < nw; w++) {
-                        size_t index = n*nc*nh*nw + c*nh*nw + h*nw + w;
-                        dst_data[index] = 0.0f;
-
-                        if (mode_ == "average") {
-                            float bin_size_h = roi_height / static_cast<float>(pooled_height_);
-                            float bin_size_w = roi_width  / static_cast<float>(pooled_width_);
-
-                            int hstart = static_cast<int>(floor(static_cast<float>(h + 0) * bin_size_h + roi_start_h));
-                            int hend = static_cast<int>(ceil(static_cast<float>(h + 1) * bin_size_h + roi_start_h));
-
-                            hstart = std::min<int>(std::max<int>(hstart, 0), height);
-                            hend = std::min<int>(std::max<int>(hend, 0), height);
-                            int wstart = static_cast<int>(floor(static_cast<float>(w + 0) * bin_size_w + roi_start_w));
-                            int wend = static_cast<int>(ceil(static_cast<float>(w + 1) * bin_size_w + roi_start_w));
-
-                            wstart = std::min<int>(std::max<int>(wstart, 0), width);
-                            wend = std::min<int>(std::max<int>(wend, 0), width);
-
-                            float bin_area = static_cast<float>((hend - hstart) * (wend - wstart));
-                            if (bin_area) {
-                                int gc = (c * group_size_ + h) * group_size_ + w;
-                                const float *bottom_data =
-                                        bottom_data_beginning + ((roi_batch_ind * channels + gc) * height * width);
-
-                                float out_sum = 0.0f;
-                                for (int hh = hstart; hh < hend; ++hh)
-                                    for (int ww = wstart; ww < wend; ++ww)
-                                        out_sum += bottom_data[hh * width + ww];
-
-                                dst_data[index] = out_sum / bin_area;
-                            }
-                        } else if (mode_ == "bilinear") {
-                            for (size_t bin_y = 0; bin_y < spatial_bins_y_; bin_y++) {
-                                for (size_t bin_x = 0; bin_x < spatial_bins_x_; bin_x++) {
-                                    float box_xmin = roi_start_w + (bin_x + 0) * (roi_width / spatial_bins_x_);
-                                    float box_xmax = roi_start_w + (bin_x + 1) * (roi_width / spatial_bins_x_);
-                                    float box_ymin = roi_start_h + (bin_y + 0) * (roi_height / spatial_bins_y_);
-                                    float box_ymax = roi_start_h + (bin_y + 1) * (roi_height / spatial_bins_y_);
-
-                                    size_t gc = c + (bin_y*spatial_bins_x_ + bin_x)*nc;
-                                    size_t src_idx = (roi_batch_ind * channels + gc) * height * width;
-                                    const float *bottom_data = bottom_data_beginning + src_idx;
-
-                                    float height_scale = nh > 1 ? (box_ymax - box_ymin) * (height - 1) / (pooled_height_ - 1)
-                                                                : 0.0f;
-                                    float width_scale = nw > 1 ? (box_xmax - box_xmin) * (width - 1) / (pooled_width_ - 1)
-                                                               : 0.0f;
-
-                                    float in_y = nh > 1 ? (h * height_scale + box_ymin * (height - 1))
-                                                        : 0.5f * (box_ymin + box_ymax) * (height - 1);
-                                    float in_x = nw > 1 ? (w * width_scale + box_xmin * (width - 1))
-                                                        : 0.5f * (box_xmin + box_xmax) * (width - 1);
-
-                                    if (!(in_y < 0 || in_y > height - 1 || in_x < 0 || in_x > width - 1)) {
-                                        int top_y_index = static_cast<int>(floorf(in_y));
-                                        int bottom_y_index = static_cast<int>(ceilf(in_y));
-                                        int left_x_index = static_cast<int>(floorf(in_x));
-                                        int right_x_index = static_cast<int>(ceilf(in_x));
-
-                                        if (right_x_index > width - 1)
-                                            right_x_index = width - 1;
-
-                                        if (bottom_y_index > height - 1)
-                                            bottom_y_index = height - 1;
-
-                                        const float top_left = bottom_data[top_y_index * width + left_x_index];
-                                        const float top_right = bottom_data[top_y_index * width + right_x_index];
-                                        const float bottom_left = bottom_data[bottom_y_index * width + left_x_index];
-                                        const float bottom_right = bottom_data[bottom_y_index * width + right_x_index];
-
-                                        const float top = top_left + (top_right - top_left) * (in_x - left_x_index);
-                                        const float bottom = bottom_left + (bottom_right - bottom_left) * (in_x - left_x_index);
-
-                                        dst_data[index] += top + (bottom - top) * (in_y - top_y_index);
-                                    }
-                                }
-                            }
-                            dst_data[index] /= num_bins;
-                        } else if (mode_ == "bilinear_deformable") {
-                            // Compute w and h at bottom
-                            float bin_size_h = roi_height / static_cast<float>(pooled_height_);
-                            float bin_size_w = roi_width  / static_cast<float>(pooled_width_);
-
-                            float sub_bin_size_h = bin_size_h / static_cast<float>(spatial_bins_x_);
-                            float sub_bin_size_w = bin_size_w / static_cast<float>(spatial_bins_y_);
-
-                            int part_h = h * part_size_ / pooled_height_;
-                            int part_w = w * part_size_ / pooled_width_;
-                            int class_id = c / channels_each_class;
-                            float trans_x = no_trans_ ? 0 :
-                                            bottom_trans[(((n * num_classes + class_id) * 2) * part_size_ + part_h)
-                                                         * part_size_ + part_w] * trans_std_;
-                            float trans_y = no_trans_ ? 0 :
-                                            bottom_trans[(((n * num_classes + class_id) * 2 + 1) * part_size_ + part_h)
-                                                         * part_size_ + part_w] * trans_std_;
-
-                            float wstart = w * bin_size_w + roi_start_w + trans_x * roi_width;
-                            float hstart = h * bin_size_h + roi_start_h + trans_y * roi_height;
-
-                            float sum = 0;
-                            int count = 0;
-                            int gw = w * group_size_ / pooled_width_;
-                            int gh = h * group_size_ / pooled_height_;
-                            gw = (std::min)((std::max)(gw, 0), static_cast<int>(group_size_ - 1));
-                            gh = (std::min)((std::max)(gh, 0), static_cast<int>(group_size_ - 1));
-
-                            const float* offset_bottom_data = bottom_data_beginning + (roi_batch_ind * channels) * height * width;
-                            for (size_t ih = 0; ih < spatial_bins_y_; ih++) {
-                                for (size_t iw = 0; iw < spatial_bins_x_; iw++) {
-                                    float w1 = wstart + iw * sub_bin_size_w;
-                                    float h1 = hstart + ih * sub_bin_size_h;
-                                    // bilinear interpolation
-                                    if (w1 < -0.5 || w1 > width - 0.5 || h1 < -0.5 || h1 > height - 0.5)
-                                        continue;
-                                    w1 = static_cast<float>((std::min)((std::max)(static_cast<double>(w1), 0.0), width - 1.0));
-                                    h1 = static_cast<float>((std::min)((std::max)(static_cast<double>(h1), 0.0), height - 1.0));
-                                    int c1 = static_cast<int>((c * group_size_ + gh) * group_size_ + gw);
-                                    float val = bilinear_interp(offset_bottom_data + c1 * height * width, w1, h1, width);
-                                    sum += val;
-                                    count++;
-                                }
-                            }
-                            dst_data[index] = count == 0 ? 0 : sum / count;
-                        }
-                    }
-                }
+        parallel_for(realRois, [&](int currentRoi) {
+            const float *bottomRois = bottomRoisBeginning + currentRoi * 5;
+            int roiBatchInd = static_cast<int>(bottomRois[0]);
+            if (mode == "average") {
+                executeAverage(srcData, dstData, bottomRois, currentRoi, roiBatchInd, srcDesc, dstDesc);
+            } else if (mode == "bilinear") {
+                executeBilinear(srcData, dstData, bottomRois, currentRoi, roiBatchInd, srcDesc, dstDesc);
+            } else if (mode == "bilinear_deformable") {
+                executeBilinearDeformable(srcData, dstData, bottomRois, bottomTrans,
+                        numClasses, channelsEachClass, currentRoi, roiBatchInd);
            }
        });

-        for (int n = real_rois; n < nn; n++) {
-            parallel_for3d(nc, nh, nw, [&](int c, int h, int w) {
-                int index = n * nc * nh * nw + c * nh * nw + h * nw + w;
-                dst_data[index] = 0.0f;
-            });
-        }
-
-        return OK;
+        memset(dstData + realRois * nc * nh * nw, 0, (nn - realRois) * nc * nh * nw * sizeof(outputType));
    }

-    inline float bilinear_interp(const float* data, const float x, const float y, const int width) {
+    /**
+     * Layer entry point: selects the typed implementation from the input/output precision pair.
+     *
+     * Only FP32->FP32 and BF16->BF16 are handled; any other combination yields NOT_IMPLEMENTED.
+     *
+     * @param inputs  input blobs; the precision of inputs[0] drives the dispatch
+     * @param outputs output blobs; the precision of outputs[0] drives the dispatch
+     * @param resp    optional response descriptor that receives the exception text on failure
+     * @return OK on success, NOT_IMPLEMENTED for unsupported precisions, GENERAL_ERROR if anything throws
+     */
+    StatusCode execute(std::vector<Blob::Ptr>& inputs, std::vector<Blob::Ptr>& outputs, ResponseDesc *resp) noexcept override {
+        try {
+            auto inputPrec = inputs[0]->getTensorDesc().getPrecision();
+            auto outputPrec = outputs[0]->getTensorDesc().getPrecision();
+
+            const bool bothBF16 = inputPrec == Precision::BF16 && outputPrec == Precision::BF16;
+            const bool bothFP32 = inputPrec == Precision::FP32 && outputPrec == Precision::FP32;
+            if (!(bothBF16 || bothFP32))
+                return NOT_IMPLEMENTED;
+
+            PSROIPoolingContext ctx = {
+                    *this,
+                    inputs,
+                    outputs
+            };
+
+            OV_SWITCH(MKLDNNPlugin, PSROIPoolingExecute, ctx, std::tie(inputPrec, outputPrec),
+                      OV_CASE2(Precision::FP32, Precision::FP32, float, float),
+                      OV_CASE2(Precision::BF16, Precision::BF16, bfloat16_t, bfloat16_t))
+
+            return OK;
+        }
+        catch (const std::exception& excp) {
+            // The function is noexcept, so the error is reported through resp instead of propagating.
+            // resp is a raw pointer supplied by the caller; do not dereference it when it is null.
+            if (resp != nullptr)
+                snprintf(resp->msg, sizeof(resp->msg), "%s", excp.what());
+            return GENERAL_ERROR;
+        }
+        catch(...) {
+            return GENERAL_ERROR;
+        }
+    }
+
+    // Bilinear interpolation of `data` at the fractional position (x, y).
+    // `data` is read as data[row * width_ + col]. The four neighbours sit at floor/ceil of x and y
+    // and are blended with weights derived from the fractional parts (x - floor(x), y - floor(y)).
+    // No bounds checking is done here: the caller must keep floor/ceil of x and y inside the plane.
+    template <typename inputType>
+    inline float bilinearInterp(const inputType* data, const float x, const float y, const int width_) {
        int x1 = static_cast<int>(std::floor(x));
        int x2 = static_cast<int>(std::ceil(x));
        int y1 = static_cast<int>(std::floor(y));
        int y2 = static_cast<int>(std::ceil(y));
-        float dist_x = x - x1;
-        float dist_y = y - y1;
-        float value11 = data[y1 * width + x1];
-        float value12 = data[y2 * width + x1];
-        float value21 = data[y1 * width + x2];
-        float value22 = data[y2 * width + x2];
-        float value = (1 - dist_x) * (1 - dist_y) * value11 + (1 - dist_x) * dist_y * value12
-                      + dist_x * (1 - dist_y) * value21 + dist_x * dist_y * value22;
+        float distX = x - x1;
+        float distY = y - y1;
+
+        float value11 = data[y1 * width_ + x1];
+        float value12 = data[y2 * width_ + x1];
+        float value21 = data[y1 * width_ + x2];
+        float value22 = data[y2 * width_ + x2];
+        float value = (1 - distX) * (1 - distY) * value11 + (1 - distX) * distY * value12
+                      + distX * (1 - distY) * value21 + distX * distY * value22;
        return value;
    }

 private:
-    size_t output_dim_ = 0;
-    size_t group_size_ = 0;
-    float spatial_scale_ = 0;
-    size_t pooled_height_ = 0;
-    size_t pooled_width_ = 0;
-    size_t spatial_bins_x_ = 0;
-    size_t spatial_bins_y_ = 0;
-    std::string mode_ = "";
+    size_t outputDim = 0;
+    size_t groupSize = 0;
+    float spatialScale = 0;
+    size_t pooledHeight = 0;
+    size_t pooledWidth = 0;
+    size_t spatialBinsX = 0;
+    size_t spatialBinsY = 0;
+    std::string mode = "";

    int channels = 0;
    int height = 0;
@ -297,9 +511,9 @@ private:
    int nw = 0;

    //  for Deformable PSROIPolling
-    bool no_trans_;
-    int part_size_;
-    float trans_std_;
+    bool noTrans;
+    int partSize;
+    float transStd;
 };

 REG_FACTORY_FOR(PSROIPoolingImpl, PSROIPooling);
--- a/inference-engine/tests/functional/plugin/cpu/single_layer_tests/psroi_pooling.cpp
+++ b/inference-engine/tests/functional/plugin/cpu/single_layer_tests/psroi_pooling.cpp
@ -0,0 +1,184 @@
+// Copyright (C) 2020 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "test_utils/cpu_test_utils.hpp"
+
+#include "ngraph_functions/builders.hpp"
+#include "ngraph_functions/utils/ngraph_helpers.hpp"
+
+using namespace InferenceEngine;
+using namespace CPUTestUtils;
+
+namespace CPULayerTestsDefinitions {
+namespace {
+    // File-local scratch state that the fixture fills from the current test parameters.
+
+    // Input description.
+    std::vector<size_t> featureMapShape{};
+    std::vector<float> proposal{};
+
+    // PSROIPooling attributes.
+    std::string mode{};
+    size_t outputDim = 0;
+    size_t groupSize = 0;
+    float spatialScale = 0.0f;
+    size_t spatialBinsX = 0;
+    size_t spatialBinsY = 0;
+}  // namespace
+
+// Operation attributes, listed in the order the fixture unpacks them.
+using PSROIPoolingSpecificParams = std::tuple<
+        std::vector<size_t>,            // feature map shape
+        std::vector<float>,             // flattened ROI values (the fixture shapes them as {size / 5, 5})
+        size_t,                         // output_dim
+        size_t,                         // group_size
+        float,                          // Spatial scale
+        size_t,                         // spatial_bins_x
+        size_t,                         // spatial_bins_y
+        std::string>;                   // mode
+
+// Operation attributes plus the settings shared by every layer test.
+using PSROIPoolingLayerTestParams = std::tuple<
+        PSROIPoolingSpecificParams,
+        InferenceEngine::Precision,     // Net precision
+        LayerTestsUtils::TargetDevice>; // Device name
+
+// Full parameter set: layer test parameters combined with the CPU-specific configuration.
+using PSROIPoolingLayerCPUTestParamsSet = std::tuple<
+        PSROIPoolingLayerTestParams,
+        CPUSpecificParams>;
+
+/**
+ * CPU-plugin layer test for PSROIPooling.
+ *
+ * Each instance is parameterised by the operation attributes, the network precision, the target
+ * device and a CPUSpecificParams entry (unpacked below as input formats, output formats, priority
+ * and selected type).
+ */
+class PSROIPoolingLayerCPUTest : public testing::WithParamInterface<PSROIPoolingLayerCPUTestParamsSet>,
+                             virtual public LayerTestsUtils::LayerTestsCommon, public CPUTestsBase {
+public:
+    /**
+     * Builds the readable test name from the parameter set.
+     *
+     * Values are unpacked into locals only, so generating a name never touches the file-scope
+     * variables that SetUp() fills for the running test.
+     */
+    static std::string getTestCaseName(testing::TestParamInfo<PSROIPoolingLayerCPUTestParamsSet> obj) {
+        CPULayerTestsDefinitions::PSROIPoolingLayerTestParams basicParamsSet;
+        CPUSpecificParams cpuParams;
+        std::tie(basicParamsSet, cpuParams) = obj.param;
+        std::string td;
+        Precision netPr;
+        PSROIPoolingSpecificParams psroiPar;
+        std::tie(psroiPar, netPr, td) = basicParamsSet;
+
+        // The feature map shape and the ROI list are not part of the name; obj.index keeps names unique.
+        size_t nameOutputDim = 0;
+        size_t nameGroupSize = 0;
+        float nameSpatialScale = 0.0f;
+        size_t nameBinsX = 0;
+        size_t nameBinsY = 0;
+        std::string nameMode;
+        std::tie(std::ignore, std::ignore, nameOutputDim, nameGroupSize,
+                 nameSpatialScale, nameBinsX, nameBinsY, nameMode) = psroiPar;
+
+        std::ostringstream result;
+        result << "PSROIPoolingTest_";
+        result << std::to_string(obj.index) << "_";
+        result << "binsX=" << nameBinsX << "_";
+        result << "binsY=" << nameBinsY << "_";
+        result << "spatialScale=" << nameSpatialScale << "_";
+        result << "outputD=" << nameOutputDim << "_";
+        result << "groupS=" << nameGroupSize << "_";
+        result << netPr.name() << "_";
+        result << nameMode << "_";
+        result << CPUTestsBase::getTestCaseName(cpuParams);
+        return result.str();
+    }
+protected:
+    /// Unpacks the current parameters and builds a one-node PSROIPooling function.
+    void SetUp() override {
+        CPULayerTestsDefinitions::PSROIPoolingLayerTestParams basicParamsSet;
+        CPUSpecificParams cpuParams;
+        std::tie(basicParamsSet, cpuParams) = this->GetParam();
+        std::tie(inFmts, outFmts, priority, selectedType) = cpuParams;
+
+        CPULayerTestsDefinitions::PSROIPoolingSpecificParams psroiPoolingParams;
+        auto netPrecision = InferenceEngine::Precision::UNSPECIFIED;
+        std::tie(psroiPoolingParams, netPrecision, targetDevice) = basicParamsSet;
+        inPrc = outPrc = netPrecision;
+        std::tie(featureMapShape, proposal, outputDim, groupSize,
+                 spatialScale, spatialBinsX, spatialBinsY, mode) = psroiPoolingParams;
+
+
+        // The flat proposal list is interpreted as rows of five values.
+        ngraph::Shape proposalShape = { proposal.size() / 5, 5 };
+
+        // ROIs are a constant; only the feature map is a network input. Both are created as f32.
+        auto coords = ngraph::builder::makeConstant<float>(ngraph::element::f32, proposalShape, proposal);
+        auto params = ngraph::builder::makeParams(ngraph::element::f32, {featureMapShape});
+
+        auto psroi = std::make_shared<ngraph::op::v0::PSROIPooling>(params[0], coords, outputDim, groupSize,
+                                                       spatialScale, spatialBinsX, spatialBinsY, mode);
+        psroi->get_rt_info() = getCPUInfo();
+        // Replaces the selectedType value unpacked from cpuParams above.
+        selectedType = std::string("unknown_") + inPrc.name();
+
+        threshold = 0.001f;
+        const ngraph::ResultVector results{std::make_shared<ngraph::opset3::Result>(psroi)};
+        function = std::make_shared<ngraph::Function>(results, params, "PSROIPooling");
+    }
+};
+
+// Executes the function built in SetUp() via Run(), then passes the executable network to
+// CheckPluginRelatedResults() for the "PSROIPooling" node type.
+// NOTE(review): both helpers are defined outside this file; presumably Run() compares against the
+// reference implementation and the second call checks layouts / primitive type - confirm.
+TEST_P(PSROIPoolingLayerCPUTest, CompareWithRefs) {
+    SKIP_IF_CURRENT_TEST_IS_DISABLED()
+    Run();
+    CheckPluginRelatedResults(executableNetwork, "PSROIPooling");
+}
+
+namespace {
+
+/* CPU PARAMS */
+// Each entry is {input formats, output formats, priority, selected type}; the last two are left empty.
+// Two input formats are listed per entry: one layout under test plus `nc`.
+// NOTE(review): presumably the first is for the feature map and `nc` for the 2D ROI input - confirm.
+std::vector<CPUSpecificParams> resCPUParams {
+    CPUSpecificParams{{nchw, nc}, {nchw}, {}, {}},
+    CPUSpecificParams{{nhwc, nc}, {nhwc}, {}, {}},
+    CPUSpecificParams{{nChw16c, nc}, {nChw16c}, {}, {}}
+};
+
+// Network precisions to run every case with.
+const std::vector<InferenceEngine::Precision> netPrecisions = {
+        InferenceEngine::Precision::FP32,
+        InferenceEngine::Precision::BF16
+};
+
+const std::vector<float> spatialScaleVector = { 1.0f };
+
+// Feature map shapes for "average" mode. 200 channels equals output_dim (50) * group_size (2) squared.
+const std::vector<std::vector<size_t>> inputShapeVector = {
+        SizeVector({ 2, 200, 20, 20 }),
+        SizeVector({ 2, 200, 20, 16 }),
+        SizeVector({ 2, 200, 16, 20 }),
+        SizeVector({ 3, 200, 16, 16 })
+};
+
+// ROI lists for "average" mode. The fixture reshapes each list to {size / 5, 5}, i.e. five values per ROI.
+// NOTE(review): the leading value of each row looks like a batch index followed by box corners - confirm.
+const std::vector<std::vector<float>> averagePropVector = {
+        { 0, 0.9, 0.9, 18.9, 18.9,
+          1, 0.9, 0.9, 18.9, 18.9 },
+        { 1, 1, 1, 15, 15 }
+};
+
+// ROI lists for "bilinear" mode, same five-values-per-ROI layout; coordinates here lie within [0, 1].
+const std::vector<std::vector<float>> bilinearPropVector = {
+        { 0, 0.1, 0.1, 0.9, 0.9,
+          1, 0.1, 0.1, 0.9, 0.9 },
+        { 1, 0.1, 0.1, 0.9, 0.9 }
+};
+
+// Generator arguments follow the PSROIPoolingSpecificParams order:
+// shape, ROIs, output_dim, group_size, spatial scale, spatial_bins_x, spatial_bins_y, mode.
+const auto psroiPoolingAverageParams = ::testing::Combine(
+        ::testing::ValuesIn(inputShapeVector),
+        ::testing::ValuesIn(averagePropVector),
+        ::testing::Values(50),
+        ::testing::Values(2),
+        ::testing::ValuesIn(spatialScaleVector),
+        ::testing::Values(1),
+        ::testing::Values(1),
+        ::testing::Values("average")
+);
+
+// Same argument order as above. 32 input channels equals output_dim (4) * bins_x (4) * bins_y (2).
+const auto psroiPoolingBilinearParams = ::testing::Combine(
+        ::testing::Values(std::vector<size_t>{3, 32, 20, 20}),
+        ::testing::ValuesIn(bilinearPropVector),
+        ::testing::Values(4),
+        ::testing::Values(3),
+        ::testing::ValuesIn(spatialScaleVector),
+        ::testing::Values(4),
+        ::testing::Values(2),
+        ::testing::Values("bilinear")
+);
+
+// "average" mode: every shape / ROI list / precision combined with the CPU configurations that
+// filterCPUSpecificParams() returns for resCPUParams.
+INSTANTIATE_TEST_CASE_P(smoke_PSROIPoolingAverageLayoutTest, PSROIPoolingLayerCPUTest,
+                        ::testing::Combine(
+                                ::testing::Combine(
+                                        psroiPoolingAverageParams,
+                                        ::testing::ValuesIn(netPrecisions),
+                                        ::testing::Values(CommonTestUtils::DEVICE_CPU)),
+                                ::testing::ValuesIn(filterCPUSpecificParams(resCPUParams))),
+                        PSROIPoolingLayerCPUTest::getTestCaseName);
+
+// "bilinear" mode: same combination scheme with the bilinear parameter set.
+INSTANTIATE_TEST_CASE_P(smoke_PSROIPoolingBilinearLayoutTest, PSROIPoolingLayerCPUTest,
+                        ::testing::Combine(
+                                ::testing::Combine(
+                                        psroiPoolingBilinearParams,
+                                        ::testing::ValuesIn(netPrecisions),
+                                        ::testing::Values(CommonTestUtils::DEVICE_CPU)),
+                                ::testing::ValuesIn(filterCPUSpecificParams(resCPUParams))),
+                        PSROIPoolingLayerCPUTest::getTestCaseName);
+} // namespace
+} // namespace CPULayerTestsDefinitions
--- a/inference-engine/tests/functional/plugin/cpu/test_utils/cpu_test_utils.hpp
+++ b/inference-engine/tests/functional/plugin/cpu/test_utils/cpu_test_utils.hpp
@ -34,7 +34,7 @@ namespace CPUTestUtils {
        ncdhw = abcde,
        nCdhw8c = aBcde8b,
        nCdhw16c = aBcde16b,
-        ndhwc = acdeb,
+        ndhwc = acdeb
    } cpu_memory_format_t;

    using CPUSpecificParams =  std::tuple<
--- a/inference-engine/tests/functional/shared_test_classes/src/single_layer/psroi_pooling.cpp
+++ b/inference-engine/tests/functional/shared_test_classes/src/single_layer/psroi_pooling.cpp
@ -7,124 +7,124 @@

 namespace LayerTestsDefinitions {

-    std::string PSROIPoolingLayerTest::getTestCaseName(testing::TestParamInfo<psroiParams> obj) {
-        std::vector<size_t> inputShape;
-        std::vector<size_t> coordsShape;
-        size_t outputDim;
-        size_t groupSize;
-        float spatialScale;
-        size_t spatialBinsX;
-        size_t spatialBinsY;
-        std::string mode;
-        InferenceEngine::Precision netPrecision;
-        std::string targetDevice;
-        std::tie(inputShape, coordsShape, outputDim, groupSize, spatialScale, spatialBinsX, spatialBinsY, mode, netPrecision, targetDevice) = obj.param;
+std::string PSROIPoolingLayerTest::getTestCaseName(testing::TestParamInfo<psroiParams> obj) {
+    std::vector<size_t> inputShape;
+    std::vector<size_t> coordsShape;
+    size_t outputDim;
+    size_t groupSize;
+    float spatialScale;
+    size_t spatialBinsX;
+    size_t spatialBinsY;
+    std::string mode;
+    InferenceEngine::Precision netPrecision;
+    std::string targetDevice;
+    std::tie(inputShape, coordsShape, outputDim, groupSize, spatialScale, spatialBinsX, spatialBinsY, mode, netPrecision, targetDevice) = obj.param;

-        std::ostringstream result;
+    std::ostringstream result;

-        result << "in_shape=" << CommonTestUtils::vec2str(inputShape) << "_";
-        result << "coord_shape=" << CommonTestUtils::vec2str(coordsShape) << "_";
-        result << "out_dim=" << outputDim << "_";
-        result << "group_size=" << groupSize << "_";
-        result << "scale=" << spatialScale << "_";
-        result << "bins_x=" << spatialBinsX << "_";
-        result << "bins_y=" << spatialBinsY << "_";
-        result << "mode=" << mode << "_";
-        result << "prec=" << netPrecision.name() << "_";
-        result << "dev=" << targetDevice;
-        return result.str();
+    result << "in_shape=" << CommonTestUtils::vec2str(inputShape) << "_";
+    result << "coord_shape=" << CommonTestUtils::vec2str(coordsShape) << "_";
+    result << "out_dim=" << outputDim << "_";
+    result << "group_size=" << groupSize << "_";
+    result << "scale=" << spatialScale << "_";
+    result << "bins_x=" << spatialBinsX << "_";
+    result << "bins_y=" << spatialBinsY << "_";
+    result << "mode=" << mode << "_";
+    result << "prec=" << netPrecision.name() << "_";
+    result << "dev=" << targetDevice;
+    return result.str();
+}
+
+/**
+ * Returns a uniformly distributed random integer in the closed range [low, high].
+ *
+ * The engine is seeded from std::random_device once per thread and then reused, instead of
+ * building a new device and engine on every call.
+ * Callers must pass low <= high (a requirement of std::uniform_int_distribution).
+ */
+static int randInt(int low, int high) {
+    static thread_local std::mt19937 gen{std::random_device{}()};
+    std::uniform_int_distribution<int> dis(low, high);
+    return dis(gen);
+}
+
+static void fillROITensor(float* buffer, int numROIs, int batchSize,
+                          int height, int width, int groupSize,
+                          float spatialScale, int spatialBinsX, int spatialBinsY, const std::string& mode) {
+    int minRoiWidth = groupSize;
+    int maxRoiWidth = width / groupSize * groupSize;
+    int minRoiHeight = groupSize;
+    int maxRoiHeight = height / groupSize * groupSize;
+    float scaleX = spatialScale;
+    float scaleY = spatialScale;
+    if (mode == "bilinear") {
+        minRoiWidth = spatialBinsX;
+        maxRoiWidth = width / spatialBinsX * spatialBinsX;
+        minRoiHeight = spatialBinsY;
+        maxRoiHeight = height / spatialBinsY * spatialBinsY;
+        scaleX *= width;
+        scaleY *= height;
    }
+    int batchId = 0;
+    for (int i = 0; i < numROIs; i++) {
+        int sizeX = std::min(width, randInt(minRoiWidth, maxRoiWidth));
+        int sizeY = std::min(height, randInt(minRoiHeight, maxRoiHeight));
+        int startX = randInt(0, std::max(1, width - sizeX - 1));
+        int startY = randInt(0, std::max(1, height - sizeY - 1));

-    static int randInt(int low, int high) {
-        std::random_device rd;
-        std::mt19937 gen(rd());
-        std::uniform_int_distribution<int> dis(low, high);
-        return dis(gen);
+        float* roi = buffer + i * 5;
+        roi[0] = batchId;
+        roi[1] = startX / scaleX;
+        roi[2] = startY / scaleY;
+        roi[3] = (startX + sizeX - 1) / scaleX;
+        roi[4] = (startY + sizeY - 1) / scaleY;
+
+        batchId = (batchId + 1) % batchSize;
    }
+}

-    static void fillROITensor(float* buffer, int numROIs, int batchSize,
-                              int height, int width, int groupSize,
-                              float spatialScale, int spatialBinsX, int spatialBinsY, const std::string& mode) {
-        int minRoiWidth = groupSize;
-        int maxRoiWidth = width / groupSize * groupSize;
-        int minRoiHeight = groupSize;
-        int maxRoiHeight = height / groupSize * groupSize;
-        float scaleX = spatialScale;
-        float scaleY = spatialScale;
-        if (mode == "bilinear") {
-            minRoiWidth = spatialBinsX;
-            maxRoiWidth = width / spatialBinsX * spatialBinsX;
-            minRoiHeight = spatialBinsY;
-            maxRoiHeight = height / spatialBinsY * spatialBinsY;
-            scaleX *= width;
-            scaleY *= height;
-        }
-        int batchId = 0;
-        for (int i = 0; i < numROIs; i++) {
-            int sizeX = std::min(width, randInt(minRoiWidth, maxRoiWidth));
-            int sizeY = std::min(height, randInt(minRoiHeight, maxRoiHeight));
-            int startX = randInt(0, std::max(1, width - sizeX - 1));
-            int startY = randInt(0, std::max(1, height - sizeY - 1));
-
-            float* roi = buffer + i * 5;
-            roi[0] = batchId;
-            roi[1] = startX / scaleX;
-            roi[2] = startY / scaleY;
-            roi[3] = (startX + sizeX - 1) / scaleX;
-            roi[4] = (startY + sizeY - 1) / scaleY;
-
-            batchId = (batchId + 1) % batchSize;
+void PSROIPoolingLayerTest::Infer() {
+    inferRequest = executableNetwork.CreateInferRequest();
+    inputs.clear();
+
+    auto inputShape = cnnNetwork.getInputShapes().begin()->second;
+
+    size_t it = 0;
+    for (const auto &input : cnnNetwork.getInputsInfo()) {
+        const auto &info = input.second;
+        InferenceEngine::Blob::Ptr blob;
+
+        if (it == 1) {
+            blob = make_blob_with_precision(info->getTensorDesc());
+            blob->allocate();
+            fillROITensor(blob->buffer(), blob->size() / 5,
+                          inputShape[0], inputShape[2], inputShape[3], groupSize_,
+                          spatialScale_, spatialBinsX_, spatialBinsY_, mode_);
+        } else {
+            blob = GenerateInput(*info);
        }
+        inferRequest.SetBlob(info->name(), blob);
+        inputs.push_back(blob);
+        it++;
    }
+    inferRequest.Infer();
+}

-    void PSROIPoolingLayerTest::Infer() {
-        inferRequest = executableNetwork.CreateInferRequest();
-        inputs.clear();
+void PSROIPoolingLayerTest::SetUp() {
+    std::vector<size_t> inputShape;
+    std::vector<size_t> coordsShape;
+    size_t outputDim;
+    InferenceEngine::Precision netPrecision;
+    std::tie(inputShape, coordsShape, outputDim, groupSize_, spatialScale_,
+             spatialBinsX_, spatialBinsY_, mode_, netPrecision, targetDevice) = this->GetParam();

-        auto inputShape = cnnNetwork.getInputShapes().begin()->second;
-
-        size_t it = 0;
-        for (const auto &input : cnnNetwork.getInputsInfo()) {
-            const auto &info = input.second;
-            InferenceEngine::Blob::Ptr blob;
-
-            if (it == 1) {
-                blob = make_blob_with_precision(info->getTensorDesc());
-                blob->allocate();
-                fillROITensor(blob->buffer(), blob->size() / 5,
-                              inputShape[0], inputShape[2], inputShape[3], groupSize_,
-                              spatialScale_, spatialBinsX_, spatialBinsY_, mode_);
-            } else {
-                blob = GenerateInput(*info);
-            }
-            inferRequest.SetBlob(info->name(), blob);
-            inputs.push_back(blob);
-            it++;
-        }
-        inferRequest.Infer();
-    }
-
-    void PSROIPoolingLayerTest::SetUp() {
-        std::vector<size_t> inputShape;
-        std::vector<size_t> coordsShape;
-        size_t outputDim;
-        InferenceEngine::Precision netPrecision;
-        std::tie(inputShape, coordsShape, outputDim, groupSize_, spatialScale_,
-                 spatialBinsX_, spatialBinsY_, mode_, netPrecision, targetDevice) = this->GetParam();
-
-        auto ngPrc = FuncTestUtils::PrecisionUtils::convertIE2nGraphPrc(netPrecision);
-        auto params = ngraph::builder::makeParams(ngPrc, {inputShape, coordsShape});
-        auto paramOuts = ngraph::helpers::convert2OutputVector(
-                ngraph::helpers::castOps2Nodes<ngraph::op::Parameter>(params));
-        std::shared_ptr<ngraph::Node> psroiPooling = std::make_shared<ngraph::op::v0::PSROIPooling>(paramOuts[0],
-                                                                                                    paramOuts[1],
-                                                                                                    outputDim,
-                                                                                                    groupSize_,
-                                                                                                    spatialScale_,
-                                                                                                    spatialBinsX_,
-                                                                                                    spatialBinsY_,
-                                                                                                    mode_);
-        ngraph::ResultVector results{std::make_shared<ngraph::opset3::Result>(psroiPooling)};
-        function = std::make_shared<ngraph::Function>(results, params, "psroi_pooling");
-    }
+    auto ngPrc = FuncTestUtils::PrecisionUtils::convertIE2nGraphPrc(netPrecision);
+    auto params = ngraph::builder::makeParams(ngPrc, {inputShape, coordsShape});
+    auto paramOuts = ngraph::helpers::convert2OutputVector(
+            ngraph::helpers::castOps2Nodes<ngraph::op::Parameter>(params));
+    std::shared_ptr<ngraph::Node> psroiPooling = std::make_shared<ngraph::op::v0::PSROIPooling>(paramOuts[0],
+                                                                                                paramOuts[1],
+                                                                                                outputDim,
+                                                                                                groupSize_,
+                                                                                                spatialScale_,
+                                                                                                spatialBinsX_,
+                                                                                                spatialBinsY_,
+                                                                                                mode_);
+    ngraph::ResultVector results{std::make_shared<ngraph::opset3::Result>(psroiPooling)};
+    function = std::make_shared<ngraph::Function>(results, params, "psroi_pooling");
+}
 }  // namespace LayerTestsDefinitions