[CPU] Rnn weights repacking (#16992)

Maksim Kutakov 2023-04-24 13:48:57 +02:00 committed by GitHub
parent f410658d32
commit f8522a6ea1
6 changed files with 101 additions and 75 deletions
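For orientation before the diff: the change reuses the plugin's weights cache so that an RNN weight blob repacked into a given format is created once and then shared. Below is a minimal, self-contained sketch of that find-or-create pattern; SimpleWeightsCache and makeKey are illustrative names invented here, not plugin classes, but the key layout mirrors the one assembled in Node::prepareMemory further down.

#include <cstdint>
#include <functional>
#include <memory>
#include <mutex>
#include <string>
#include <unordered_map>
#include <vector>

using Blob = std::vector<uint8_t>;
using BlobPtr = std::shared_ptr<Blob>;

// Thread-safe "find or create" storage: the expensive repack (create) runs only on a cache miss.
class SimpleWeightsCache {
public:
    BlobPtr findOrCreate(const std::string& key, const std::function<BlobPtr()>& create) {
        std::lock_guard<std::mutex> lock(m_mutex);
        auto it = m_cache.find(key);
        if (it != m_cache.end())
            return it->second;
        auto blob = create();
        m_cache.emplace(key, blob);
        return blob;
    }

private:
    std::mutex m_mutex;
    std::unordered_map<std::string, BlobPtr> m_cache;
};

// The cache key mirrors the layout used in prepareMemory: name_index_format_byteSize_dataHash,
// so identical weights requested in the same target format map to the same repacked buffer.
std::string makeKey(const std::string& nodeName, size_t idx, const std::string& format, const Blob& data) {
    const size_t dataHash = std::hash<std::string>{}(std::string(data.begin(), data.end()));
    return nodeName + "_" + std::to_string(idx) + "_" + format + "_" +
           std::to_string(data.size()) + "_" + std::to_string(dataHash);
}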

View File

@@ -771,6 +771,52 @@ void Node::initDescriptor(const NodeConfig& config) {
selectedPD->setConfig(updatedConfig);
}
void Node::prepareMemory(const DnnlMemoryDescPtr& intDesc, size_t indx) {
size_t minSize = indx + 1;
if (internalBlobMemory.size() < minSize) {
internalBlobMemory.resize(minSize);
}
if (minSize > internalBlobs.size()) {
IE_THROW() << "Can't prepare memory for internal blob, requested index: " << indx <<
" is out of bounds of the internalBlobs vector of size " << internalBlobs.size();
}
const auto &internalBlob = internalBlobs[indx];
auto create = [&] () {
// TODO [DS]: internal blobs should be removed or rewritten using Memory object
auto newDesc = MemoryDescUtils::convertToDnnlBlockedMemoryDesc(internalBlob->getTensorDesc());
Memory memory{ engine };
memory.Create(newDesc, internalBlob->buffer());
MemoryPtr _ptr = std::make_shared<Memory>(engine);
_ptr->Create(intDesc);
node::Reorder::reorderData(memory, *_ptr, context->getParamsCache());
return _ptr;
};
MemoryPtr ptr;
auto weightCache = context->getWeightsCache();
if (weightCache != nullptr && memory::format_kind::blocked == intDesc->getDnnlDesc().get_format_kind()) {
const auto& format = intDesc->serializeFormat();
const uint64_t data_hash = weightCache->GetHashFunc().hash(
internalBlob->buffer(), internalBlob->byteSize());
const std::string string_hash = name + "_" + std::to_string(indx)
+ "_" + format
+ "_" + std::to_string(internalBlob->byteSize())
+ "_" + std::to_string(data_hash);
ptr = *weightCache->findOrCreate(string_hash, create);
} else {
ptr = create();
}
internalBlobMemory[indx] = ptr;
}
void Node::prepareMemory(const std::vector<DnnlMemoryDescPtr>& intDescs) {
if (internalBlobs.size() != intDescs.size()) {
IE_THROW() << "Can't prepare memory for internal blob, internal blob and internal descs number do not match "
@@ -779,38 +825,7 @@ void Node::prepareMemory(const std::vector<DnnlMemoryDescPtr>& intDescs) {
internalBlobMemory.clear();
for (size_t i = 0; i < internalBlobs.size(); i++) {
const auto &internalBlob = internalBlobs[i];
auto create = [&] () {
// TODO [DS]: internal blobs should be removed or rewritten using Memory object
auto newDesc = MemoryDescUtils::convertToDnnlBlockedMemoryDesc(internalBlob->getTensorDesc());
Memory memory{ engine };
memory.Create(newDesc, internalBlob->buffer());
MemoryPtr _ptr = std::make_shared<Memory>(engine);
_ptr->Create(*intDescs[i]);
_ptr->SetData(memory);
return _ptr;
};
MemoryPtr ptr;
auto weightCache = context->getWeightsCache();
if (weightCache != nullptr) {
const uint64_t data_hash = weightCache->GetHashFunc().hash(
internalBlob->buffer(), internalBlob->byteSize());
const std::string string_hash = name + "_" + std::to_string(i)
+ "_" + std::to_string(internalBlob->byteSize())
+ "_" + std::to_string(data_hash);
ptr = *weightCache->findOrCreate(string_hash, create);
} else {
ptr = create();
}
internalBlobMemory.push_back(ptr);
prepareMemory(intDescs[i], i);
}
}
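"Repacking" in the new per-index prepareMemory above means reordering the original blob into the layout the selected primitive expects (node::Reorder::reorderData performs the actual oneDNN reorder). A toy, self-contained illustration of the idea follows, with a row-major to column-major transpose standing in for the real reorder; repackToColumnMajor is a made-up helper, not plugin code.

#include <cstddef>
#include <vector>

// Copy src (rows x cols, row-major) into a column-major buffer, i.e. "repack" it into the
// layout a consumer prefers. oneDNN reorders do the same job for blocked weight formats.
std::vector<float> repackToColumnMajor(const std::vector<float>& src, size_t rows, size_t cols) {
    std::vector<float> dst(rows * cols);
    for (size_t r = 0; r < rows; ++r)
        for (size_t c = 0; c < cols; ++c)
            dst[c * rows + r] = src[r * cols + c];
    return dst;
}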

View File

@@ -648,6 +648,7 @@ protected:
bool dynBatchSupport = false);
void prepareMemory(const std::vector<DnnlMemoryDescPtr>& intDescs);
void prepareMemory(const DnnlMemoryDescPtr& intDesc, size_t indx);
void prepareMemory(dnnl::primitive_desc_iterator& itpd);
MemoryPtr prepareWeightMemory(DnnlMemoryDescPtr weightDesc);

View File

@@ -1072,40 +1072,38 @@ void RNN::prepareParams() {
key.wDescs,
key.attr);
return std::make_shared<DnnlExecutor>(descPtr);
return descPtr ? std::make_shared<RnnDnnlExecutor>(descPtr) : nullptr;
};
auto cache = context->getParamsCache();
auto result = cache->getOrCreate(key, builder);
auto prevExecPtr = execPtr;
execPtr = result.first;
if (!execPtr) {
IE_THROW() << "Primitive descriptor was not found for node " << getName() << ".";
}
scratchpadMem = getScratchPadMem(execPtr->getScratchPadDesc());
if (!wasMemoryPrepared || wFormatWasChanged) {
auto pd = execPtr->getPrimitiveDesc();
auto query_weights_md = [&](int idx = 0) -> dnnl::memory::desc {
auto what = dnnl::convert_to_c(dnnl::query::weights_md);
const_dnnl_memory_desc_t cdesc = dnnl_primitive_desc_query_md(pd, what, idx);
if (!cdesc)
IE_THROW() << "query_weights_md failed for node " << getName() << " idx " << idx << ".";
dnnl_memory_desc_t cloned_md = nullptr;
dnnl_memory_desc_clone(&cloned_md, cdesc);
return dnnl::memory::desc(cloned_md);
};
std::vector<DnnlMemoryDescPtr> intDescs {
DnnlExtensionUtils::makeDescriptor(query_weights_md(0)),
DnnlExtensionUtils::makeDescriptor(query_weights_md(1)),
DnnlExtensionUtils::makeDescriptor(query_weights_md(2))
};
prepareMemory(intDescs);
wasMemoryPrepared = true;
if (!primArgs.count(DNNL_ARG_WEIGHTS_LAYER) || !prevExecPtr ||
!execPtr->getWeightDesc()->isCompatible(*(prevExecPtr->getWeightDesc()))) {
prepareMemory(execPtr->getWeightDesc(), 0);
primArgs[DNNL_ARG_WEIGHTS_LAYER] = internalBlobMemory[0]->GetPrimitive();
}
if (!primArgs.count(DNNL_ARG_WEIGHTS_ITER) || !prevExecPtr ||
!execPtr->getWeightIterDesc()->isCompatible(*(prevExecPtr->getWeightIterDesc()))) {
prepareMemory(execPtr->getWeightIterDesc(), 1);
primArgs[DNNL_ARG_WEIGHTS_ITER] = internalBlobMemory[1]->GetPrimitive();
}
if (!primArgs.count(DNNL_ARG_BIAS) || !prevExecPtr ||
!execPtr->getBiasDesc()->isCompatible(*(prevExecPtr->getBiasDesc()))) {
prepareMemory(execPtr->getBiasDesc(), 2);
primArgs[DNNL_ARG_BIAS] = internalBlobMemory[2]->GetPrimitive();
}
auto scratchpadMem = getScratchPadMem(execPtr->getScratchPadDesc());
primArgs[DNNL_ARG_SCRATCHPAD] = scratchpadMem->GetPrimitive();
}
std::shared_ptr<MemoryDesc> RNN::getSrcMemDesc(dnnl::primitive_desc_iterator& primitive_desc_it, size_t idx) {
@@ -1123,18 +1121,10 @@ void RNN::execute(dnnl::stream strm) {
const auto src_data_mem = getParentEdgeAt(0)->getMemoryPtr();
const auto dst_data_mem = getChildEdgeAt(0)->getMemoryPtr();
const auto &wgh_data_mem = internalBlobMemory[0];
const auto &wgh_stat_mem = internalBlobMemory[1];
const auto &wgh_bias_mem = internalBlobMemory[2];
auto args = primArgs;
std::unordered_map<int, memory> args {
{DNNL_ARG_SRC_LAYER, src_data_mem->GetPrimitive()},
{DNNL_ARG_WEIGHTS_LAYER, wgh_data_mem->GetPrimitive()},
{DNNL_ARG_WEIGHTS_ITER, wgh_stat_mem->GetPrimitive()},
{DNNL_ARG_BIAS, wgh_bias_mem->GetPrimitive()},
{DNNL_ARG_DST_LAYER, dst_data_mem->GetPrimitive()},
{DNNL_ARG_SCRATCHPAD, scratchpadMem->GetPrimitive()}
};
args[DNNL_ARG_SRC_LAYER] = src_data_mem->GetPrimitive();
args[DNNL_ARG_DST_LAYER] = dst_data_mem->GetPrimitive();
int state_i_tags[] {DNNL_ARG_SRC_ITER, DNNL_ARG_SRC_ITER_C};
int state_o_tags[] {DNNL_ARG_DST_ITER, DNNL_ARG_DST_ITER_C};
@@ -1180,6 +1170,11 @@ void RNN::cleanup() {
}
}
RNN::RnnDnnlExecutor::RnnDnnlExecutor(const dnnl::primitive_desc& pd) : DnnlExecutor(pd) {
wghts_iter_md = DnnlExtensionUtils::makeDescriptor(pd.weights_desc(1));
bias_md = DnnlExtensionUtils::makeDescriptor(pd.weights_desc(2));
}
} // namespace node
} // namespace intel_cpu
} // namespace ov
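The prepareParams() hunk above repacks a given weight only when no previous executor exists or when the newly selected weight descriptor is incompatible with the one already prepared. A simplified, self-contained sketch of that gate, where Desc and Executor are stand-ins for the plugin's DnnlMemoryDesc and RnnDnnlExecutor types:

#include <memory>
#include <string>

struct Desc {
    std::string format;                       // e.g. "ldigo" vs. a blocked layout
    bool isCompatible(const Desc& o) const { return format == o.format; }
};

struct Executor {
    Desc weightDesc;                          // layout the weights were last repacked into
};

// Repack only on the first run or when the freshly selected layout differs from the prepared one.
bool weightsNeedRepack(const Desc& requested, const std::shared_ptr<Executor>& prevExec) {
    return !prevExec || !requested.isCompatible(prevExec->weightDesc);
}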

View File

@@ -68,7 +68,24 @@ private:
void copyWeightsData();
using executorPtr = std::shared_ptr<DnnlExecutor>;
class RnnDnnlExecutor : public DnnlExecutor {
public:
RnnDnnlExecutor(const dnnl::primitive_desc& pd);
DnnlMemoryDescPtr getWeightIterDesc() const {
return wghts_iter_md;
}
DnnlMemoryDescPtr getBiasDesc() const {
return bias_md;
}
private:
DnnlMemoryDescPtr wghts_iter_md;
DnnlMemoryDescPtr bias_md;
};
using executorPtr = std::shared_ptr<RnnDnnlExecutor>;
executorPtr execPtr = nullptr;
/** Specify mode Cell or Seq. true - Cell, false - Seq */
@@ -143,9 +160,6 @@ private:
static constexpr size_t optimalBatchSize = 16lu;
static constexpr size_t batchDimDummyValue = 64lu;
bool wasMemoryPrepared = false;
MemoryPtr scratchpadMem;
float inputScale = 0.f;
float inputShift = 0.f;
std::vector<float> weightsScales;

View File

@@ -167,11 +167,6 @@ std::vector<std::string> disabledTestPatterns() {
// The kernel does not have such garbage. The diff 0.000000745 is taken into account in calculations and affects further type conversion.
// Reorder->GridSample->Reorder also does not work here. Potential fix is to use nearest conversion instead of truncation.
R"(.*GridSampleLayerTestCPU.*(BILINEAR|BICUBIC).*(i32|i8).*)",
// // Issue: 95915
R"(smoke_dynamic/AUGRUCellCPUTest.CompareWithRefs/IS=\(\[\?\.1\]_\[\?\.1\]_\[\?\.1\]_\)_TS=\{\(1\.1\)_\(1\.1\)_\(1\.1\)\}_\{\(3\.1\)_\(3\.1\)_\(3\.1\)\}_\{\(5\.1\)_\(5\.1\)_\(5\.1\)\}_decompose=0_activations=\(sigmoid\.tanh\)_clip=0_linear=0_netPrec=f32__inFmts=nc\.nc_outFmts=nc_primitive=ref_any_PluginConf_ENFORCE_BF16=YES)", // NOLINT
R"(smoke_dynamic/GRUCellCPUTest.CompareWithRefs/IS=\(\[\?.1\]_\[\?\.1\]_\)_TS=\{\(1\.1\)_\(1\.1\)\}_\{\(3\.1\)_\(3\.1\)\}_\{\(5\.1\)_\(5\.1\)\}_decompose=0_activations=\(sigmoid\.tanh\)_clip=0_linear=0_netPrec=f32__inFmts=nc\.nc_outFmts=nc_primitive=ref_any_PluginConf_ENFORCE_BF16=YES)", // NOLINT
R"(nightly_dynamic_bf16/RNNSequenceCPUTest.*activations=\(relu\).*)",
R"(smoke_dynamic_BatchSizeOne/RNNSequenceCPUTest.*IS=\(\[1\.\?\.10\]_\[1\.1\.10\]_\[\?\]_\)_TS=\{\(1\.2\.10\)_\(1\.1\.10\)_\(1\)\}_\{\(1\.4\.10\)_\(1\.1\.10\)_\(1\)\}_\{\(1\.8\.10\)_\(1\.1\.10\)_\(1\)\}_seqMode=PURE_SEQ_activations=\(relu\)_clip=0_direction=forward_netPrec=f32__inFmts=ncw\.ntc_outFmts=ncw\.ncw_primitive=ref_any)", // NOLINT
// 98151. Not valid sorting for slices in reference.
R"(.*UniqueLayerTestCPU.*axis.*True.*)",
};

View File

@@ -100,6 +100,12 @@ protected:
selectedType = makeSelectedTypeStr(selectedType, netPrecision);
}
if (selectedType.find("BF16") != std::string::npos) {
rel_threshold = 5e-2;
} else if (selectedType.find("FP32") != std::string::npos) {
rel_threshold = 1e-4;
}
auto params = ngraph::builder::makeDynamicParams(netPrecision, inputDynamicShapes);
const size_t batchSize = inputDynamicShapes[0][0].is_static() ? inputDynamicShapes[0][0].get_length() :
inputDynamicShapes[1][0].is_static() ? inputDynamicShapes[1][0].get_length() :