* [CPU] Extend Reduce node to support blocked layouts nC[d]hw8/16C (#580)

Chen Xu 2020-09-07 20:35:11 +08:00 committed by GitHub
parent ab6d3a5227
commit cba0892832
10 changed files with 2215 additions and 437 deletions


@@ -46,6 +46,7 @@ set(LAYERS
${CMAKE_CURRENT_SOURCE_DIR}/nodes/mkldnn_normalize_node.cpp
${CMAKE_CURRENT_SOURCE_DIR}/nodes/mkldnn_scatter_update_node.cpp
${CMAKE_CURRENT_SOURCE_DIR}/nodes/mkldnn_interpolate_node.cpp
${CMAKE_CURRENT_SOURCE_DIR}/nodes/mkldnn_reduce_node.cpp
${CMAKE_CURRENT_SOURCE_DIR}/nodes/list.cpp
${CMAKE_CURRENT_SOURCE_DIR}/nodes/batch_to_space.cpp
@@ -77,7 +78,6 @@ set(LAYERS
${CMAKE_CURRENT_SOURCE_DIR}/nodes/proposal_onnx.cpp
${CMAKE_CURRENT_SOURCE_DIR}/nodes/psroi.cpp
${CMAKE_CURRENT_SOURCE_DIR}/nodes/range.cpp
${CMAKE_CURRENT_SOURCE_DIR}/nodes/reduce.cpp
${CMAKE_CURRENT_SOURCE_DIR}/nodes/region_yolo.cpp
${CMAKE_CURRENT_SOURCE_DIR}/nodes/reorg_yolo.cpp
${CMAKE_CURRENT_SOURCE_DIR}/nodes/reverse_sequence.cpp


@@ -42,6 +42,7 @@
#include <nodes/mkldnn_mvn_node.h>
#include <nodes/mkldnn_resample_node.h>
#include <nodes/mkldnn_normalize_node.h>
#include <nodes/mkldnn_reduce_node.h>
#include <nodes/mkldnn_tensoriterator_node.h>
#include <nodes/mkldnn_scatter_update_node.h>
#include <nodes/mkldnn_interpolate_node.h>
@@ -124,6 +125,18 @@ static const InferenceEngine::details::caseless_unordered_map<std::string, Type>
{ "ScatterElementsUpdate", ScatterElementsUpdate},
{ "ScatterNDUpdate", ScatterNDUpdate},
{ "Interpolate", Interpolate},
{ "ReduceAnd", ReduceAnd},
{ "ReduceL1", ReduceL1},
{ "ReduceL2", ReduceL2},
{ "ReduceLogSum", ReduceLogSum},
{ "ReduceLogSumExp", ReduceLogSumExp},
{ "ReduceMax", ReduceMax},
{ "ReduceMean", ReduceMean},
{ "ReduceMin", ReduceMin},
{ "ReduceOr", ReduceOr},
{ "ReduceProd", ReduceProd},
{ "ReduceSum", ReduceSum},
{ "ReduceSumSquare", ReduceSumSquare},
};
Type TypeFromName(const std::string type) {


@@ -77,7 +77,19 @@ enum Type {
ScatterUpdate,
ScatterElementsUpdate,
ScatterNDUpdate,
Interpolate
Interpolate,
ReduceAnd,
ReduceL1,
ReduceL2,
ReduceLogSum,
ReduceLogSumExp,
ReduceMax,
ReduceMean,
ReduceMin,
ReduceOr,
ReduceProd,
ReduceSum,
ReduceSumSquare
};
Type TypeFromName(const std::string type);
@@ -168,6 +180,30 @@ static std::string NameFromType(Type type) {
return "ScatterNDUpdate";
case Interpolate:
return "Interpolate";
case ReduceAnd:
return "ReduceAnd";
case ReduceL1:
return "ReduceL1";
case ReduceL2:
return "ReduceL2";
case ReduceLogSum:
return "ReduceLogSum";
case ReduceLogSumExp:
return "ReduceLogSumExp";
case ReduceMax:
return "ReduceMax";
case ReduceMean:
return "ReduceMean";
case ReduceMin:
return "ReduceMin";
case ReduceOr:
return "ReduceOr";
case ReduceProd:
return "ReduceProd";
case ReduceSum:
return "ReduceSum";
case ReduceSumSquare:
return "ReduceSumSquare";
default:
return "Unknown";
}


@@ -102,7 +102,7 @@ protected:
const bool isInt8 = (data->getPrecision() == Precision::I8 || data->getPrecision() == Precision::U8);
if (conf.layout == ConfLayout::BLK8 || conf.layout == ConfLayout::BLK16) {
if (data_dims.size() < 4 && data_dims.size() > 5)
if (data_dims.size() < 4 || data_dims.size() > 5)
THROW_IE_EXCEPTION << "Inapplicable blocking layout."
<< "Tensor should be 4D or 5D.";


@@ -76,18 +76,6 @@ MKLDNN_EXTENSION_NODE(GatherImpl, Gather);
MKLDNN_EXTENSION_NODE(ProposalImpl, Proposal);
MKLDNN_EXTENSION_NODE(RangeImpl, Range);
MKLDNN_EXTENSION_NODE(SelectImpl, Select);
MKLDNN_EXTENSION_NODE(ReduceImpl, ReduceAnd);
MKLDNN_EXTENSION_NODE(ReduceImpl, ReduceL1);
MKLDNN_EXTENSION_NODE(ReduceImpl, ReduceL2);
MKLDNN_EXTENSION_NODE(ReduceImpl, ReduceLogSum);
MKLDNN_EXTENSION_NODE(ReduceImpl, ReduceLogSumExp);
MKLDNN_EXTENSION_NODE(ReduceImpl, ReduceMax);
MKLDNN_EXTENSION_NODE(ReduceImpl, ReduceMean);
MKLDNN_EXTENSION_NODE(ReduceImpl, ReduceMin);
MKLDNN_EXTENSION_NODE(ReduceImpl, ReduceOr);
MKLDNN_EXTENSION_NODE(ReduceImpl, ReduceProd);
MKLDNN_EXTENSION_NODE(ReduceImpl, ReduceSum);
MKLDNN_EXTENSION_NODE(ReduceImpl, ReduceSumSquare);
MKLDNN_EXTENSION_NODE(GatherTreeImpl, GatherTree);
MKLDNN_EXTENSION_NODE(PriorBoxClusteredImpl, PriorBoxClustered);
MKLDNN_EXTENSION_NODE(SpaceToBatchImpl, SpaceToBatch);

File diff suppressed because it is too large.


@@ -0,0 +1,126 @@
// Copyright (C) 2018-2020 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#pragma once
#include <ie_common.h>
#include <mkldnn_node.h>
#include <string>
#include <memory>
#include <vector>
namespace MKLDNNPlugin {
enum class Reduce {
And,
L1,
L2,
LogSum,
LogSumExp,
Max,
Mean,
Min,
Or,
Prod,
Sum,
SumSquare
};
struct jit_reduce_config_params {
bool planar_layout;
Reduce reduce_mode;
mkldnn::memory::data_type src_dt;
mkldnn::memory::data_type dst_dt;
int src_data_size;
int dst_data_size;
};
struct jit_reduce_call_args {
const void *src;
void *dst;
size_t work_amount;
size_t reduce_w = 2; // only used in planar layout [1: reduce width dimension] [0: reduce other dimension] [other value: N/A]
size_t reduce_c = 2; // only used in blocked layout [1: reduce channel dimension] [0: reduce other dimension] [other value: N/A]
const float *divisor; // mean = sum / divisor
};
struct jit_uni_reduce_kernel {
void (*ker_)(const jit_reduce_call_args *);
void operator()(const jit_reduce_call_args *args) {
assert(ker_);
ker_(args);
}
explicit jit_uni_reduce_kernel(jit_reduce_config_params jcp) : ker_(nullptr), jcp_(jcp) {}
virtual ~jit_uni_reduce_kernel() {}
jit_reduce_config_params jcp_;
};
struct jit_uni_reduce_post_kernel {
void (*ker_)(const jit_reduce_call_args *);
void operator()(const jit_reduce_call_args *args) {
assert(ker_);
ker_(args);
}
explicit jit_uni_reduce_post_kernel(jit_reduce_config_params jcp) : ker_(nullptr), jcp_(jcp) {}
virtual ~jit_uni_reduce_post_kernel() {}
jit_reduce_config_params jcp_;
};
class MKLDNNReduceNode : public MKLDNNNode {
public:
MKLDNNReduceNode(const InferenceEngine::CNNLayerPtr& layer, const mkldnn::engine& eng, MKLDNNWeightsSharing::Ptr &cache);
~MKLDNNReduceNode() override = default;
void getSupportedDescriptors() override;
void initSupportedPrimitiveDescriptors() override;
void createPrimitive() override;
bool created() const override;
void execute(mkldnn::stream strm) override;
bool canBeInPlace() const override {
return false;
}
private:
void reduce_type(const uint8_t *in_ptr, uint8_t *out_ptr, size_t dst_size);
void reduce_PLN(const uint8_t *in_ptr, uint8_t *out_ptr);
void reduce_BLK(const uint8_t *in_ptr, uint8_t *out_ptr);
void reduce_BLK_concern_padding(const uint8_t *in_ptr, uint8_t *out_ptr);
inline void reduce_kernel_process(const uint8_t *in_p, uint8_t *out_p, size_t work_amount, size_t reduce_w = 2);
inline void reduce_kernel_post_process(uint8_t *out_ptr);
inline void init_dst_data(uint8_t *out_ptr, size_t dst_size);
inline void calc_process_dst_dims(const int32_t *idx_data);
inline void reduce_ref(const float *in_ptr, float *out_ptr);
void reduce_ref_process(const float *in_ptr, float *out_ptr, float init_value, std::function<float(float, float)> func);
inline void reduce_ref_map(float *out_ptr, size_t work_amount_dst, size_t reduced_dims_work_amount);
Reduce reduceMode = Reduce::Sum;
size_t blk_size;
size_t dims_size;
const size_t REDUCE_DATA = 0;
const size_t REDUCE_INDEXES = 1;
bool planar_layout = true;
bool jit_mode = true;
bool keep_dims = true;
bool ReduceN, ReduceC, ReduceD, ReduceH, ReduceW;
size_t IB, IC, ID, IH, IW;
size_t OB, OC, OD, OH, OW;
size_t src_data_size, dst_data_size;
InferenceEngine::Precision input_prec, output_prec;
InferenceEngine::SizeVector src_dims;
InferenceEngine::SizeVector src_strides;
InferenceEngine::SizeVector process_dst_dims;
InferenceEngine::SizeVector axes_for_reduction;
std::shared_ptr<jit_uni_reduce_kernel> reduce_kernel;
std::shared_ptr<jit_uni_reduce_post_kernel> reduce_post_kernel;
};
} // namespace MKLDNNPlugin
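
A minimal sketch of how the kernel interface declared above might be driven for a blocked-layout channel reduction. It is illustrative only: make_reduce_kernel is a hypothetical stand-in for the jit generator defined in mkldnn_reduce_node.cpp (whose diff is suppressed above), and the pointer/size bookkeeping done by MKLDNNReduceNode is omitted.

#include <memory>
#include <mkldnn.hpp>
#include "mkldnn_reduce_node.h"   // the header added above

// Hypothetical factory standing in for the jit kernel generator in mkldnn_reduce_node.cpp.
std::shared_ptr<MKLDNNPlugin::jit_uni_reduce_kernel>
make_reduce_kernel(const MKLDNNPlugin::jit_reduce_config_params &jcp);

void run_channel_mean(const float *src, float *dst, size_t work_amount) {
    using namespace MKLDNNPlugin;

    jit_reduce_config_params jcp{};
    jcp.planar_layout = false;                    // blocked layout (nChw8c / nChw16c)
    jcp.reduce_mode   = Reduce::Mean;
    jcp.src_dt = jcp.dst_dt = mkldnn::memory::data_type::f32;
    jcp.src_data_size = jcp.dst_data_size = static_cast<int>(sizeof(float));

    auto kernel = make_reduce_kernel(jcp);

    float divisor = static_cast<float>(work_amount);  // mean = sum / divisor
    jit_reduce_call_args args{};
    args.src = src;
    args.dst = dst;
    args.work_amount = work_amount;
    args.reduce_c = 1;                            // blocked layout: reduce the channel dimension
    args.divisor = &divisor;
    (*kernel)(&args);                             // operator() asserts ker_ and dispatches
}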


@@ -1,406 +0,0 @@
// Copyright (C) 2018-2020 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include "base.hpp"
#include <cmath>
#include <limits>
#include <cfloat>
#include <string>
#include <vector>
#include <cassert>
#include <legacy/ie_util_internal.hpp>
#include "ie_parallel.hpp"
namespace InferenceEngine {
namespace Extensions {
namespace Cpu {
class ReduceImpl: public ExtLayerBase {
public:
explicit ReduceImpl(const CNNLayer* layer) {
try {
if (layer->insData.empty() || layer->outData.empty())
THROW_IE_EXCEPTION << layer->name << " Incorrect number of input/output edges!";
if (layer->insData.size() != 2)
THROW_IE_EXCEPTION << layer->name << " Incorrect number of input edges!";
idx_dims = layer->insData[REDUCE_INDEXES].lock()->getTensorDesc().getDims();
if (idx_dims.size() > 1)
THROW_IE_EXCEPTION << layer->name << " Index vector should be 1 dimension";
if (layer->insData[REDUCE_DATA].lock()->getTensorDesc().getPrecision() != Precision::FP32 &&
layer->insData[REDUCE_DATA].lock()->getTensorDesc().getPrecision() != Precision::I32 &&
layer->insData[REDUCE_DATA].lock()->getTensorDesc().getPrecision() != Precision::U8)
THROW_IE_EXCEPTION << layer->name << " Incorrect input data tensor precision. Only FP32/I32/U8 are supported!";
if (layer->insData[REDUCE_INDEXES].lock()->getTensorDesc().getPrecision() != Precision::I32)
THROW_IE_EXCEPTION << layer->name << " Incorrect 'axes_to_reduction' input precision. Only I32 is supported!";
data_dims = layer->insData[REDUCE_DATA].lock()->getTensorDesc().getDims();
SizeVector dst_dims = layer->outData[0]->getTensorDesc().getDims();
keep_dims = layer->GetParamAsBool("keep_dims", true);
if (keep_dims) {
if (data_dims.size() != dst_dims.size())
THROW_IE_EXCEPTION << layer->name << " Incorrect number of input/output dimensions!";
} else {
if (data_dims.size() <= dst_dims.size())
THROW_IE_EXCEPTION << layer->name << " Incorrect number of input/output dimensions!";
}
std::string reduce_mode = layer->type;
if (reduce_mode == "ReduceAnd") reduceMode = Reduce::And;
else if (reduce_mode == "ReduceL1") reduceMode = Reduce::L1;
else if (reduce_mode == "ReduceL2") reduceMode = Reduce::L2;
else if (reduce_mode == "ReduceLogSum") reduceMode = Reduce::LogSum;
else if (reduce_mode == "ReduceLogSumExp") reduceMode = Reduce::LogSumExp;
else if (reduce_mode == "ReduceMax") reduceMode = Reduce::Max;
else if (reduce_mode == "ReduceMean") reduceMode = Reduce::Mean;
else if (reduce_mode == "ReduceMin") reduceMode = Reduce::Min;
else if (reduce_mode == "ReduceOr") reduceMode = Reduce::Or;
else if (reduce_mode == "ReduceProd") reduceMode = Reduce::Prod;
else if (reduce_mode == "ReduceSum") reduceMode = Reduce::Sum;
else if (reduce_mode == "ReduceSumSquare") reduceMode = Reduce::SumSquare;
else
THROW_IE_EXCEPTION << layer->name << " Incorrect Reduce layer type!";
src_dims = layer->insData[REDUCE_DATA].lock()->getTensorDesc().getDims();
srcStrides = layer->insData[REDUCE_DATA].lock()->getTensorDesc().getBlockingDesc().getStrides();
addConfig(layer, { { ConfLayout::PLN, false }, { ConfLayout::PLN, false } }, { { ConfLayout::PLN, false } });
} catch (InferenceEngine::details::InferenceEngineException &ex) {
errorMsg = ex.what();
}
}
StatusCode execute(std::vector<Blob::Ptr>& inputs, std::vector<Blob::Ptr>& outputs, ResponseDesc *resp) noexcept override {
int32_t *idx_data = inputs[REDUCE_INDEXES]->cbuffer().as<int32_t *>() +
inputs[REDUCE_INDEXES]->getTensorDesc().getBlockingDesc().getOffsetPadding();
SizeVector axes;
const size_t axesIter = idx_dims.empty() ? 1 : idx_dims[0];
for (size_t i = 0; i < axesIter; i++) {
int32_t axis = idx_data[i];
if (axis < 0)
axis += data_dims.size();
if (static_cast<size_t>(axis) > data_dims.size()) {
if (resp) {
std::string errorMsg = "Index to reduce exceeds data tensor dimension";
errorMsg.copy(resp->msg, sizeof(resp->msg) - 1);
}
return PARAMETER_MISMATCH;
}
axes.push_back(static_cast<size_t>(axis));
}
size_t reduced_dims_work_amount = 1;
InferenceEngine::SizeVector our_dims, out_dims, axes_for_reduction;
for (size_t i = 0; i < src_dims.size(); i++) {
bool found = false;
for (size_t axis : axes)
if (i == axis) found = true;
if (found) {
axes_for_reduction.push_back(i);
reduced_dims_work_amount *= src_dims[i];
if (keep_dims) out_dims.push_back(1);
our_dims.push_back(1);
} else {
out_dims.push_back(src_dims[i]);
our_dims.push_back(src_dims[i]);
}
}
if (!our_dims.size())
our_dims = InferenceEngine::SizeVector(1, 1);
InferenceEngine::SizeVector dst_dims = outputs[0]->getTensorDesc().getDims();
for (size_t i = 0; i < (std::min)(out_dims.size(), dst_dims.size()); i++) {
if (out_dims[i] != dst_dims[i]) {
if (resp) {
std::string errorMsg = "Incorrect number of output dimensions!";
errorMsg.copy(resp->msg, sizeof(resp->msg) - 1);
}
return PARAMETER_MISMATCH;
}
}
size_t work_amount_dst;
if (!dst_dims.size()) {
work_amount_dst = 1;
} else {
size_t stride = !outputs[0]->getTensorDesc().getBlockingDesc().getStrides().empty()
? outputs[0]->getTensorDesc().getBlockingDesc().getStrides()[0]
: 1;
work_amount_dst = stride * dst_dims[0];
}
auto compare = getPrecisionMask(inputs[REDUCE_DATA]->getTensorDesc().getPrecision(), outputs[0]->getTensorDesc().getPrecision());
switch (compare) {
case getPrecisionMask(Precision::FP32, Precision::FP32):
return reduce_type<float , float>(inputs, outputs, work_amount_dst, reduced_dims_work_amount, axes_for_reduction, our_dims);
case getPrecisionMask(Precision::I32, Precision::I64):
return reduce_type<int32_t , int64_t>(inputs, outputs, work_amount_dst, reduced_dims_work_amount, axes_for_reduction, our_dims);
case getPrecisionMask(Precision::I32, Precision::U64):
return reduce_type<int32_t , uint64_t>(inputs, outputs, work_amount_dst, reduced_dims_work_amount, axes_for_reduction, our_dims);
case getPrecisionMask(Precision::I32, Precision::FP32):
return reduce_type<int32_t , float>(inputs, outputs, work_amount_dst, reduced_dims_work_amount, axes_for_reduction, our_dims);
case getPrecisionMask(Precision::I32, Precision::I32):
return reduce_type<int32_t , int32_t>(inputs, outputs, work_amount_dst, reduced_dims_work_amount, axes_for_reduction, our_dims);
case getPrecisionMask(Precision::U8, Precision::U8):
return reduce_type<int8_t , int8_t>(inputs, outputs, work_amount_dst, reduced_dims_work_amount, axes_for_reduction, our_dims);
case getPrecisionMask(Precision::FP32, Precision::U8):
return reduce_type<float , uint8_t>(inputs, outputs, work_amount_dst, reduced_dims_work_amount, axes_for_reduction, our_dims);
default:
if (resp) {
std::string errorMsg = "Incorrect Reduce layer type";
errorMsg.copy(resp->msg, sizeof(resp->msg) - 1);
}
return GENERAL_ERROR;
}
}
private:
template <typename src_d, typename dst_t, typename F1, typename F2>
void reduce(const src_d *src_data, dst_t* dst_data, size_t work_amount_dst, size_t reduced_dims_work_amount,
SizeVector axes_for_reduction, SizeVector dst_dims, dst_t init_value, F1 func1, F2 func2);
template <typename src_d, typename dst_t>
StatusCode reduce_type(std::vector<Blob::Ptr>& inputs, std::vector<Blob::Ptr>& outputs, size_t work_amount_dst, size_t reduced_dims_work_amount,
SizeVector axes_for_reduction, SizeVector dst_dims);
enum class Reduce { And, L1, L2, LogSum, LogSumExp, Max, Mean, Min, Or, Prod, Sum, SumSquare };
const size_t REDUCE_DATA = 0;
const size_t REDUCE_INDEXES = 1;
bool keep_dims = true;
Reduce reduceMode = Reduce::Sum;
SizeVector data_dims;
SizeVector idx_dims;
SizeVector src_dims;
SizeVector srcStrides;
};
template <typename src_d, typename dst_t>
StatusCode ReduceImpl::reduce_type(
std::vector<Blob::Ptr>& inputs,
std::vector<Blob::Ptr>& outputs,
size_t work_amount_dst,
size_t reduced_dims_work_amount,
SizeVector axes_for_reduction,
SizeVector our_dims
) {
const src_d *src_data = inputs[REDUCE_DATA]->cbuffer().as<src_d *>() +
inputs[REDUCE_DATA]->getTensorDesc().getBlockingDesc().getOffsetPadding();
dst_t* dst_data = outputs[0]->cbuffer().as<dst_t *>() +
outputs[0]->getTensorDesc().getBlockingDesc().getOffsetPadding();
switch (reduceMode) {
case Reduce::And:
reduce<src_d, dst_t>(src_data, dst_data, work_amount_dst, reduced_dims_work_amount, axes_for_reduction, our_dims, static_cast<dst_t>(1),
[](dst_t x, src_d y)->dst_t { return x && y; },
[](dst_t x, src_d y)->dst_t { return x && y; });
break;
case Reduce::L1:
reduce<src_d, dst_t>(src_data, dst_data, work_amount_dst, reduced_dims_work_amount, axes_for_reduction, our_dims, static_cast<dst_t>(0),
[](dst_t old, src_d y)->dst_t { return old + (std::abs)(y); },
[](dst_t x, src_d y)->dst_t { return x + y; });
break;
case Reduce::L2:
reduce<src_d, dst_t>(src_data, dst_data, work_amount_dst, reduced_dims_work_amount, axes_for_reduction, our_dims, static_cast<dst_t>(0),
[](dst_t old, src_d y)->dst_t { return old + y * y;},
[](dst_t x, src_d y)->dst_t { return x + y; });
parallel_for(work_amount_dst, [&](size_t i) {
dst_data[i] = sqrt(dst_data[i]);
});
break;
case Reduce::LogSum:
reduce<src_d, dst_t>(src_data, dst_data, work_amount_dst, reduced_dims_work_amount, axes_for_reduction, our_dims, static_cast<dst_t>(0),
[](dst_t x, src_d y)->dst_t { return x + y; },
[](dst_t x, src_d y)->dst_t { return x + y; });
parallel_for(work_amount_dst, [&](size_t i) {
dst_data[i] = logf(dst_data[i]);
});
break;
case Reduce::LogSumExp:
reduce<src_d, dst_t>(src_data, dst_data, work_amount_dst, reduced_dims_work_amount, axes_for_reduction, our_dims, static_cast<dst_t>(0),
[](dst_t old, src_d y)->dst_t { return old + expf(y); },
[](dst_t x, src_d y)->dst_t { return x + y; });
parallel_for(work_amount_dst, [&](size_t i) {
dst_data[i] = logf(dst_data[i]);
});
break;
case Reduce::Max:
reduce<src_d, dst_t>(src_data, dst_data, work_amount_dst, reduced_dims_work_amount, axes_for_reduction, our_dims,
(std::numeric_limits<dst_t>::min)(),
[](dst_t x, src_d y)->dst_t { return x > y ? x : y; },
[](dst_t x, src_d y)->dst_t { return x > y ? x : y; });
break;
case Reduce::Mean:
reduce<src_d, dst_t>(src_data, dst_data, work_amount_dst, reduced_dims_work_amount, axes_for_reduction, our_dims, static_cast<dst_t>(0),
[](dst_t x, src_d y)->dst_t { return x + y; },
[](dst_t x, src_d y)->dst_t { return x + y; });
parallel_for(work_amount_dst, [&](size_t i) {
dst_data[i] /= static_cast<dst_t>(reduced_dims_work_amount);
});
break;
case Reduce::Min:
reduce<src_d, dst_t>(src_data, dst_data, work_amount_dst, reduced_dims_work_amount, axes_for_reduction, our_dims,
(std::numeric_limits<dst_t>::max)(),
[](dst_t x, src_d y)->dst_t { return x < y ? x : y; },
[](dst_t x, src_d y)->dst_t { return x < y ? x : y; });
break;
case Reduce::Or:
reduce<src_d, dst_t>(src_data, dst_data, work_amount_dst, reduced_dims_work_amount, axes_for_reduction, our_dims, static_cast<dst_t>(0),
[](dst_t x, src_d y)->dst_t { return x || y; },
[](dst_t x, src_d y)->dst_t { return x || y; });
break;
case Reduce::Prod:
reduce<src_d, dst_t>(src_data, dst_data, work_amount_dst, reduced_dims_work_amount, axes_for_reduction, our_dims, static_cast<dst_t>(1),
[](dst_t x, src_d y)->dst_t { return x * y; },
[](dst_t x, src_d y)->dst_t { return x * y; });
break;
case Reduce::Sum:
reduce(src_data, dst_data, work_amount_dst, reduced_dims_work_amount, axes_for_reduction, our_dims, static_cast<dst_t>(0),
[](dst_t x, src_d y)->dst_t { return x + y; },
[](dst_t x, src_d y)->dst_t { return x + y; });
break;
case Reduce::SumSquare:
reduce<src_d, dst_t>(src_data, dst_data, work_amount_dst, reduced_dims_work_amount, axes_for_reduction, our_dims, static_cast<dst_t>(0),
[](dst_t old, src_d y)->dst_t { return old + y * y; },
[](dst_t x, src_d y)->dst_t { return x + y; });
break;
default:
return GENERAL_ERROR;
}
return OK;
}
template <typename src_d, typename dst_t, typename F1, typename F2>
void ReduceImpl::reduce(
const src_d *src_data,
dst_t *dst_data,
size_t work_amount_dst,
size_t reduced_dims_work_amount,
SizeVector axes_for_reduction,
SizeVector dst_dims,
dst_t init_value,
F1 func1,
F2 func2
) {
unsigned int nthr = parallel_get_max_threads();
if ((work_amount_dst + 1) >= nthr) {
parallel_nt(0, [&](const int ithr, const int nthr) {
int j;
size_t i, start = 0, end = 0;
SizeVector dst_counters(dst_dims.size(), 0);
splitter(work_amount_dst, nthr, ithr, start, end);
for (j = dst_dims.size() - 1, i = start; j >= 0; j--) {
dst_counters[j] = i % dst_dims[j];
i /= dst_dims[j];
}
for (size_t src_idx = 0, dst_idx = start; dst_idx < end; ++dst_idx) {
dst_t reduce_prod = init_value;
bool update_idx = true;
SizeVector src_counters = dst_counters;
for (i = 0; i < reduced_dims_work_amount; ++i) {
if (update_idx) {
src_idx = 0;
for (j = 0; j < static_cast<int>(src_dims.size()); ++j)
src_idx += (src_counters[j] % src_dims[j]) * srcStrides[j];
update_idx = false;
}
reduce_prod = func1(reduce_prod, src_data[src_idx]);
for (j = axes_for_reduction.size() - 1; j >= 0; j--) {
src_counters[axes_for_reduction[j]]++;
if (src_counters[axes_for_reduction[j]] < src_dims[axes_for_reduction[j]]) {
src_idx += srcStrides[axes_for_reduction[j]];
break;
} else {
src_counters[axes_for_reduction[j]] = 0;
update_idx = true;
}
}
}
dst_data[dst_idx] = reduce_prod;
for (j = dst_dims.size() - 1; j >= 0; j--) {
dst_counters[j]++;
if (dst_counters[j] < dst_dims[j])
break;
else
dst_counters[j] = 0;
}
}
});
} else {
std::vector<dst_t> reduce_prod((nthr * work_amount_dst), init_value);
if (work_amount_dst == 1) {
parallel_nt(nthr, [&](const int ithr, const int nthr) {
size_t i, start = 0, end = 0;
splitter((srcStrides[0] * src_dims[0]), nthr, ithr, start, end);
for (i = start; i < end; ++i)
reduce_prod[ithr] = func1(reduce_prod[ithr], src_data[i]);
});
} else {
SizeVector dstStrides(dst_dims.size(), 1);
for (int j = dst_dims.size() - 1; j >= 1; --j)
dstStrides[j - 1] = dstStrides[j] * dst_dims[j];
parallel_nt(nthr, [&](const int ithr, const int nthr) {
int j;
bool update_idx = true;
size_t i, src_idx, dst_idx = 0, start = 0, end = 0;
splitter((srcStrides[0] * src_dims[0]), nthr, ithr, start, end);
SizeVector src_counters(src_dims.size(), 0);
for (j = src_dims.size() - 1, src_idx = start; j >= 0; j--) {
src_counters[j] = src_idx % src_dims[j];
src_idx /= src_dims[j];
}
for (src_idx = start; src_idx < end; ++src_idx) {
if (update_idx) {
for (i = 0, dst_idx = 0; i < dst_dims.size(); ++i)
dst_idx += (src_counters[i] % dst_dims[i]) * dstStrides[i];
update_idx = false;
}
reduce_prod[ithr * work_amount_dst + dst_idx] = func1(reduce_prod[ithr * work_amount_dst + dst_idx], src_data[src_idx]);
for (j = src_dims.size() - 1; j >= 0; j--) {
src_counters[j]++;
if (src_counters[j] < src_dims[j]) {
if (dst_dims[j] > 1) dst_idx += dstStrides[j];
break;
} else {
src_counters[j] = 0;
update_idx = true;
}
}
}
});
}
for (size_t dst_idx = 0; dst_idx < work_amount_dst; dst_idx++) {
for (size_t ithr = work_amount_dst; ithr < (nthr * work_amount_dst); ithr += work_amount_dst)
reduce_prod[dst_idx] = func2(reduce_prod[dst_idx], reduce_prod[dst_idx + ithr]);
dst_data[dst_idx] = reduce_prod[dst_idx];
}
}
}
REG_FACTORY_FOR(ReduceImpl, ReduceAnd);
REG_FACTORY_FOR(ReduceImpl, ReduceL1);
REG_FACTORY_FOR(ReduceImpl, ReduceL2);
REG_FACTORY_FOR(ReduceImpl, ReduceLogSum);
REG_FACTORY_FOR(ReduceImpl, ReduceLogSumExp);
REG_FACTORY_FOR(ReduceImpl, ReduceMax);
REG_FACTORY_FOR(ReduceImpl, ReduceMean);
REG_FACTORY_FOR(ReduceImpl, ReduceMin);
REG_FACTORY_FOR(ReduceImpl, ReduceOr);
REG_FACTORY_FOR(ReduceImpl, ReduceProd);
REG_FACTORY_FOR(ReduceImpl, ReduceSum);
REG_FACTORY_FOR(ReduceImpl, ReduceSumSquare);
} // namespace Cpu
} // namespace Extensions
} // namespace InferenceEngine
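
The removed reference implementation above composes every reduction from two callbacks: func1 folds each source element into a partial accumulator, and func2 merges per-thread partial results, followed by an optional post-step such as sqrt for L2 or division for Mean. A minimal single-threaded sketch of that scheme, not taken from the plugin sources:

#include <cmath>
#include <cstdio>
#include <vector>

// Sketch of the two-callback reduction scheme used by the removed ReduceImpl,
// shown for ReduceL2: func1 accumulates squares, func2 merges partial results,
// and the post-step takes the square root (as in the Reduce::L2 branch above).
static float reduce_l2(const std::vector<float> &src) {
    auto func1 = [](float acc, float y) { return acc + y * y; };  // per-element accumulate
    auto func2 = [](float x, float y) { return x + y; };          // merge partials
    float partial_a = 0.f, partial_b = 0.f;
    const size_t half = src.size() / 2;
    for (size_t i = 0; i < half; ++i)
        partial_a = func1(partial_a, src[i]);
    for (size_t i = half; i < src.size(); ++i)
        partial_b = func1(partial_b, src[i]);
    return std::sqrt(func2(partial_a, partial_b));
}

int main() {
    std::printf("%f\n", reduce_l2({3.f, 4.f}));  // prints 5.000000
}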


@@ -12,15 +12,37 @@ using namespace LayerTestsDefinitions;
namespace {
const std::vector<InferenceEngine::Precision> netPrecisions = {
InferenceEngine::Precision::FP32,
InferenceEngine::Precision::I32,
InferenceEngine::Precision::U8,
InferenceEngine::Precision::I8,
};
const std::vector<bool> keepDims = {
true,
false,
};
const std::vector<std::vector<size_t>> inputShapes = {
std::vector<size_t>{10, 20, 40},
std::vector<size_t>{5, 6, 10, 11},
std::vector<size_t>{10, 20, 30, 40},
std::vector<size_t>{3, 5, 7, 9},
};
const std::vector<std::vector<int>> axes = {
{0},
{1},
{2},
{3},
{0, 1},
{0, 2},
{0, 3},
{1, 2},
{1, 3},
{2, 3},
{0, 1, 2},
{0, 1, 3},
{0, 2, 3},
{1, 2, 3},
{0, 1, 2, 3},
{1, -1}
};
@@ -45,11 +67,57 @@ const auto paramsOneAxis = testing::Combine(
testing::ValuesIn(opTypes),
testing::Values(true, false),
testing::ValuesIn(reductionTypes),
testing::ValuesIn(netPrecisions),
testing::Values(InferenceEngine::Precision::FP32),
testing::ValuesIn(inputShapes),
testing::Values(CommonTestUtils::DEVICE_CPU)
);
const auto params_Precisions = testing::Combine(
testing::Values(std::vector<int>{1, 3}),
testing::Values(opTypes[1]),
testing::ValuesIn(keepDims),
testing::Values(ngraph::helpers::ReductionType::Sum),
testing::Values(InferenceEngine::Precision::FP32,
InferenceEngine::Precision::I32),
testing::Values(std::vector<size_t>{2, 2, 2, 2}),
testing::Values(CommonTestUtils::DEVICE_CPU)
);
const auto params_InputShapes = testing::Combine(
testing::Values(std::vector<int>{0}),
testing::Values(opTypes[1]),
testing::ValuesIn(keepDims),
testing::Values(ngraph::helpers::ReductionType::Mean),
testing::Values(InferenceEngine::Precision::FP32),
testing::Values(std::vector<size_t>{3},
std::vector<size_t>{3, 5},
std::vector<size_t>{2, 4, 6},
std::vector<size_t>{2, 4, 6, 8},
std::vector<size_t>{2, 2, 2, 2, 2},
std::vector<size_t>{2, 2, 2, 2, 2, 2}),
testing::Values(CommonTestUtils::DEVICE_CPU)
);
const auto params_Axes = testing::Combine(
testing::ValuesIn(axes),
testing::Values(opTypes[1]),
testing::ValuesIn(keepDims),
testing::Values(ngraph::helpers::ReductionType::Mean),
testing::Values(InferenceEngine::Precision::FP32),
testing::ValuesIn(inputShapes),
testing::Values(CommonTestUtils::DEVICE_CPU)
);
const auto params_ReductionTypes = testing::Combine(
testing::Values(std::vector<int>{0, 1, 3}),
testing::Values(opTypes[1]),
testing::ValuesIn(keepDims),
testing::ValuesIn(reductionTypes),
testing::Values(InferenceEngine::Precision::FP32),
testing::Values(std::vector<size_t>{2, 9, 2, 9}),
testing::Values(CommonTestUtils::DEVICE_CPU)
);
INSTANTIATE_TEST_CASE_P(
ReduceOneAxis,
ReduceOpsLayerTest,
@@ -57,21 +125,31 @@ INSTANTIATE_TEST_CASE_P(
ReduceOpsLayerTest::getTestCaseName
);
const auto params = testing::Combine(
testing::ValuesIn(axes),
testing::Values(opTypes[1]),
testing::Values(true, false),
testing::ValuesIn(reductionTypes),
testing::ValuesIn(netPrecisions),
testing::ValuesIn(inputShapes),
testing::Values(CommonTestUtils::DEVICE_CPU)
);
INSTANTIATE_TEST_CASE_P(
Reduce,
Reduce_Precisions,
ReduceOpsLayerTest,
params,
params_Precisions,
ReduceOpsLayerTest::getTestCaseName
);
INSTANTIATE_TEST_CASE_P(
Reduce_InputShapes,
ReduceOpsLayerTest,
params_InputShapes,
ReduceOpsLayerTest::getTestCaseName
);
INSTANTIATE_TEST_CASE_P(
Reduce_Axes,
ReduceOpsLayerTest,
params_Axes,
ReduceOpsLayerTest::getTestCaseName
);
INSTANTIATE_TEST_CASE_P(
Reduce_ReductionTypes,
ReduceOpsLayerTest,
params_ReductionTypes,
ReduceOpsLayerTest::getTestCaseName
);
} // namespace

@@ -1 +1 @@
Subproject commit e759306d9500a958033954390be0faeac5e31f99
Subproject commit ae3c03550796c2131dfb683a8eefb286cf7e8db3