[CPU] Optimize post ops processing

2022-03-11 16:39:02 +03:00
parent 8ee5514629
commit 5e1a5aef3e
3 changed files with 151 additions and 65 deletions
--- a/src/plugins/intel_cpu/src/nodes/conv.cpp
+++ b/src/plugins/intel_cpu/src/nodes/conv.cpp
@@ -513,7 +513,6 @@ void Convolution::getSupportedDescriptors() {

 void Convolution::setPostOps(dnnl::primitive_attr &attr, const VectorDims &dims, bool initWeights = false) {
    dnnl::post_ops ops;
-    const bool useLegacyPostOps = true; // @todo remove after issue with performance of binary post ops fixed

    auto getBinPostOpShape = [&](){
        const auto outShape = getOutputShapeAtPort(0).getStaticDims();
@@ -536,7 +535,7 @@ void Convolution::setPostOps(dnnl::primitive_attr &attr, const VectorDims &dims,
                }
                ops.append_sum(1.0, DnnlExtensionUtils::IEPrecisionToDataType(eltwisePrecision));
            } else {
-                if (useLegacyPostOps || eltwiseNode->getOneDnnAlgorithm() != dnnl::algorithm::undef) {
+                if (eltwiseNode->getOneDnnAlgorithm() != dnnl::algorithm::undef) {
                    eltwiseNode->appendPostOps(ops, dims, postOpsArgs);
                } else {
                    eltwiseNode->appendBinPostOps(ops, getBinPostOpShape(), postOpsArgs);
@@ -546,83 +545,111 @@ void Convolution::setPostOps(dnnl::primitive_attr &attr, const VectorDims &dims,
        }

        if (auto* fakeQuantizeNode = dynamic_cast<FakeQuantize *>(node.get())) {
-            if (useLegacyPostOps) {
-                if (i == 0) {
-                    bool hasSubsequentSum = false;
-                    bool hasSubsequentFQ = false;
-                    for (int j = i + 1; j < fusedWith.size(); j++) {
-                        auto &nextNode = fusedWith[j];
+            if (i == 0) {
+                bool hasSubsequentSum = false;
+                bool hasSubsequentFQ = false;
+                for (int j = i + 1; j < fusedWith.size(); j++) {
+                    auto &nextNode = fusedWith[j];

-                        auto *nextEltwiseNode = dynamic_cast<Eltwise *>(nextNode.get());
-                        if (nextEltwiseNode && nextEltwiseNode->isSpecialConvolutionAddFusing()) {
-                            hasSubsequentSum = true;
-                        }
-
-                        auto *nextQuantizeNode = dynamic_cast<FakeQuantize *>(nextNode.get());
-                        if (nextQuantizeNode) {
-                            hasSubsequentFQ = true;
-                        }
+                    auto *nextEltwiseNode = dynamic_cast<Eltwise *>(nextNode.get());
+                    if (nextEltwiseNode && nextEltwiseNode->isSpecialConvolutionAddFusing()) {
+                        hasSubsequentSum = true;
                    }

-                    if (fakeQuantizeNode->getAlgorithm() == Algorithm::FQCommon &&
-                        hasSubsequentSum &&
-                        hasSubsequentFQ) {
-                        std::vector<float> fqScale = fakeQuantizeNode->getFQScales();
-                        if (!fqScale.empty()) {
-                            size_t size = fqScale.size();
-                            size_t OC = getOutputShapeAtPort(0).getStaticDims()[1];
-                            if (size == 1) {
-                                fqScale.resize(OC);
-                                for (size_t k = 0; k < OC; k++)
-                                    fqScale[k] = fqScale[0];
-                            }
-
-                            attr.set_output_scales(1 << 1, fqScale);
-
-                            continue;
-                        }
+                    auto *nextQuantizeNode = dynamic_cast<FakeQuantize *>(nextNode.get());
+                    if (nextQuantizeNode) {
+                        hasSubsequentFQ = true;
                    }
                }

-                if (node == fusedWith[fusedWith.size() - 1] &&
-                    outputDataType == memory::data_type::u8 &&
-                    fakeQuantizeNode->getAlgorithm() == Algorithm::FQQuantization
-                    /*levels == 256*/) {
-                    auto &cl = fakeQuantizeNode->getCropLow();
-                    auto &isc = fakeQuantizeNode->getInputScale();
-                    auto &ish = fakeQuantizeNode->getInputShift();
+                if (fakeQuantizeNode->getAlgorithm() == Algorithm::FQCommon &&
+                    hasSubsequentSum &&
+                    hasSubsequentFQ) {
+                    std::vector<float> fqScale = fakeQuantizeNode->getFQScales();
+                    if (!fqScale.empty()) {
+                        size_t size = fqScale.size();
+                        size_t OC = getOutputShapeAtPort(0).getStaticDims()[1];
+                        if (size == 1) {
+                            fqScale.resize(OC);
+                            for (size_t k = 0; k < OC; k++)
+                                fqScale[k] = fqScale[0];
+                        }

-                    if (std::all_of(cl.cbegin(), cl.cend(), [](float val) { return val == 0.0f; }) &&
-                        std::all_of(isc.cbegin(), isc.cend(), [&](float val) { return val == isc[0]; }) &&
-                        std::all_of(ish.cbegin(), ish.cend(), [&](float val) { return val == ish[0]; })) {
-                        ops.append_eltwise(1.0f, dnnl::algorithm::eltwise_linear, isc[0], ish[0]);
+                        attr.set_output_scales(1 << 1, fqScale);

                        continue;
                    }
-//                    } else if (std::all_of(cl.cbegin(), cl.cend(), [](float val) { return val == 0.0f; })) {
-//                        std::vector<float> new_isc = isc;
-//                        new_isc.resize(rnd_up(isc.size(), 16), 0);
-//
-//                        std::vector<float> new_ish = ish;
-//                        new_ish.resize(rnd_up(ish.size(), 16), 0);
-//
-//                        fakeQuantizeNode->setInputScale(new_isc);
-//                        fakeQuantizeNode->setInputShift(new_ish);
-//
-//                        ops.append_depthwise(mkldnn::algorithm::depthwise_scale_shift,
-//                                             &fakeQuantizeNode->getInputScale()[0],
-//                                             &fakeQuantizeNode->getInputShift()[0]);
-//
-//                        continue;
-//                    }
                }

-                fakeQuantizeNode->appendPostOps(ops, dims, postOpsArgs);
+                if (node == fusedWith[fusedWith.size() - 1]) {
+                    auto &cl = fakeQuantizeNode->getCropLow();
+                    auto &ch = fakeQuantizeNode->getCropHigh();
+                    auto &isc = fakeQuantizeNode->getInputScale();
+                    auto &ish = fakeQuantizeNode->getInputShift();
+                    auto &osc = fakeQuantizeNode->getOutputScale();
+                    auto &osh = fakeQuantizeNode->getOutputShift();
+                    if (fakeQuantizeNode->getAlgorithm() == Algorithm::FQQuantization) {
+                        if (outputDataType == memory::data_type::u8 &&
+                            std::all_of(cl.cbegin(), cl.cend(), [](float val) { return val == 0.0f; }) &&
+                            std::all_of(ish.cbegin(), ish.cend(), [](float val) { return val == 0.0f; })) {
+                            std::vector<float> outScale = isc;
+                            if (!outScale.empty()) {
+                                size_t size = outScale.size();
+                                size_t OC = getOutputShapeAtPort(0).getStaticDims()[1];
+                                if (size == 1) {
+                                    outScale.resize(OC);
+                                    for (size_t k = 0; k < OC; k++)
+                                        outScale[k] = outScale[0];
+                                }

-                continue;
-            } else {
-                fakeQuantizeNode->appendBinPostOps(ops, getBinPostOpShape(), postOpsArgs);
+                                attr.set_output_scales(1 << 1, outScale);
+
+                                continue;
+                            }
+                        }
+                    }
+
+                    if (outputDataType == memory::data_type::s8 &&
+                        std::all_of(ish.cbegin(), ish.cend(), [](float val) { return std::abs(val - 128.f) < 0.0001f; }) &&
+                        std::all_of(osc.cbegin(), osc.cend(), [](float val) { return val == 1.f; }) &&
+                        std::all_of(osh.cbegin(), osh.cend(), [](float val) { return std::abs(val + 128.f) < 0.0001f; })) {
+                        bool isCropAligned = true;
+                        for (int i = 0; i < std::max(cl.size(), isc.size()); i++) {
+                            if (std::abs(cl[cl.size() == 1 ? 0 : i] * isc[isc.size() == 1 ? 0 : i] + 128.f) > 0.0001f) {
+                                isCropAligned = false;
+                            }
+                        }
+
+                        for (int i = 0; i < std::max(ch.size(), isc.size()); i++) {
+                            if (std::abs(ch[ch.size() == 1 ? 0 : i] * isc[isc.size() == 1 ? 0 : i] - 127.f) > 0.0001f) {
+                                isCropAligned = false;
+                            }
+                        }
+
+                        if (isCropAligned) {
+                            std::vector<float> outScale = isc;
+                            if (!outScale.empty()) {
+                                size_t size = outScale.size();
+                                size_t OC = getOutputShapeAtPort(0).getStaticDims()[1];
+                                if (size == 1) {
+                                    outScale.resize(OC);
+                                    for (size_t k = 0; k < OC; k++)
+                                        outScale[k] = outScale[0];
+                                }
+
+                                attr.set_output_scales(1 << 1, outScale);
+
+                                continue;
+                            }
+                        }
+                    }
+                }
            }
+
+            fakeQuantizeNode->appendBinPostOpsOptimized(ops, getBinPostOpShape(), postOpsArgs,
+                    node == fusedWith[fusedWith.size() - 1], outputDataType);
+
+            continue;
        }

        auto* convolutionNode = dynamic_cast<Convolution *>(node.get());
--- a/src/plugins/intel_cpu/src/nodes/fake_quantize.cpp
+++ b/src/plugins/intel_cpu/src/nodes/fake_quantize.cpp
@@ -1919,6 +1919,63 @@ void FakeQuantize::appendBinPostOps(dnnl::post_ops& ops, const VectorDims& postO
    appendBinary(dnnl::algorithm::binary_add, outputShiftSize, outputShiftMemory, &outputShiftData.shifts_[0]);
 }

+void FakeQuantize::appendBinPostOpsOptimized(dnnl::post_ops& ops, const VectorDims &postOpDims, std::vector<MemoryPtr>& binaryPostOpsMem,  // appends this FakeQuantize to `ops` as a chain of binary/eltwise post-ops
+                                             bool isLastPostOp, dnnl::memory::data_type outDataType) {  // the two trailing flags only gate the shortened u8 chain below
+    static const size_t bufferAlignment = 1;
+
+    initializePostOpData(postOpDims, bufferAlignment);  // presumably prepares the *Data / *Size members read below -- TODO confirm
+
+    VectorDims broadcastBinaryShape(postOpDims.size(), 1);  // same rank as postOpDims, every dimension equal to 1
+
+    auto appendBinary = [&](const dnnl::algorithm alg, const size_t dataSize, MemoryPtr &memPtr, const void *data) {  // helper: append one binary post-op and create its memory on demand
+        DnnlBlockedMemoryDesc memoryDesc(Precision::FP32, dataSize == 1 ? Shape(broadcastBinaryShape) : Shape(postOpDims));  // single-value data uses the all-ones shape, otherwise postOpDims
+        ops.append_binary(alg, memoryDesc.getDnnlDesc());
+
+        if (!memPtr) {  // memory is created, and registered, only while memPtr is still null
+            memPtr.reset(new Memory(getEngine()));
+            memPtr->Create(memoryDesc, data);
+
+            binaryPostOpsMem.push_back(memPtr);  // NOTE(review): not pushed again when memPtr already exists -- confirm callers never expect a re-registration
+        }
+    };
+
+    dnnl::algorithm alg = getAlgorithm() == Algorithm::FQCommon || getAlgorithm() == Algorithm::FQRequantization  // used below only to decide whether a rounding step is appended
+                                ? dnnl::algorithm::quantization_quantize_dequantize
+                                : dnnl::algorithm::quantization_quantize;
+
+    if (isLastPostOp &&  // shortened chains apply only to the last post-op, with u8 output, for FQQuantization
+        outDataType == memory::data_type::u8 &&
+        getAlgorithm() == Algorithm::FQQuantization
+        /*levels == 256*/) {
+        auto &cl = getCropLow();
+        auto &isc = getInputScale();
+        auto &ish = getInputShift();
+
+        if (std::all_of(cl.cbegin(), cl.cend(), [](float val) { return val == 0.0f; }) &&  // every crop-low value is exactly zero
+            std::all_of(isc.cbegin(), isc.cend(), [&](float val) { return val == isc[0]; }) &&  // every input scale equals the first one
+            std::all_of(ish.cbegin(), ish.cend(), [&](float val) { return val == ish[0]; })) {  // every input shift equals the first one
+            ops.append_eltwise(1.0f, dnnl::algorithm::eltwise_linear, isc[0], ish[0]);  // NOTE(review): all_of is also true for empty vectors, yet isc[0]/ish[0] are read here -- confirm they are never empty
+
+            return;
+        } else if (std::all_of(cl.cbegin(), cl.cend(), [](float val) { return val == 0.0f; })) {  // zero crop-low but non-uniform scale/shift: multiply and add only, no crop steps
+            appendBinary(dnnl::algorithm::binary_mul, inputScaleSize, inputScaleMemory, &inputScaleData.scales_[0]);
+            appendBinary(dnnl::algorithm::binary_add, inputShiftSize, inputShiftMemory, &inputShiftData.shifts_[0]);
+
+            return;
+        }
+    }
+
+    appendBinary(dnnl::algorithm::binary_min, cropHighSize, cropHighMemory, &cropHighData.shifts_[0]);  // general chain starts here: min with crop-high
+    appendBinary(dnnl::algorithm::binary_max, cropLowSize, cropLowMemory, &cropLowData.shifts_[0]);  // then max with crop-low
+    appendBinary(dnnl::algorithm::binary_mul, inputScaleSize, inputScaleMemory, &inputScaleData.scales_[0]);  // then multiply by input scale
+    appendBinary(dnnl::algorithm::binary_add, inputShiftSize, inputShiftMemory, &inputShiftData.shifts_[0]);  // then add input shift
+    if (alg == dnnl::algorithm::quantization_quantize_dequantize) {  // rounding is appended only for the quantize_dequantize variant
+        ops.append_eltwise(1.0f, dnnl::algorithm::eltwise_round_half_to_even, 0, 0);
+    }
+    appendBinary(dnnl::algorithm::binary_mul, outputScaleSize, outputScaleMemory, &outputScaleData.scales_[0]);  // output scale and shift are appended for both variants
+    appendBinary(dnnl::algorithm::binary_add, outputShiftSize, outputShiftMemory, &outputShiftData.shifts_[0]);
+}
+
 FakeQuantize::FakeQuantizeJitExecutor::FakeQuantizeJitExecutor(const jit_quantize_params &_jqp) {
    bool isBinarization = _jqp.op_type == Algorithm::FQBinarization;
    if (mayiuse(cpu::x64::avx512_core)) {
--- a/src/plugins/intel_cpu/src/nodes/fake_quantize.h
+++ b/src/plugins/intel_cpu/src/nodes/fake_quantize.h
@@ -127,6 +127,8 @@ public:
    void appendPostOps(dnnl::post_ops& ops, const VectorDims &postOpDims, std::vector<MemoryPtr>& postOpsMem) override;
    void appendPostOps(dnnl::post_ops& ops, const VectorDims &postOpDims, std::vector<const void*>& postOpsMem) override;
    void appendBinPostOps(dnnl::post_ops& ops, const VectorDims &postOpDims, std::vector<MemoryPtr>& binaryPostOpsMem) override;
+    void appendBinPostOpsOptimized(dnnl::post_ops& ops, const VectorDims &postOpDims, std::vector<MemoryPtr>& binaryPostOpsMem,
+            bool isLastPostOp, dnnl::memory::data_type outDataType);  // not marked override, unlike the appendPostOps/appendBinPostOps declarations around it

    static bool isSupportedOperation(const std::shared_ptr<const ngraph::Node>& op, std::string& errorMessage) noexcept;