[CPU] Rnn weights repacking (#16992)

Maksim Kutakov 2023-04-24 13:48:57 +02:00 committed by GitHub
parent f410658d32
commit f8522a6ea1
6 changed files with 101 additions and 75 deletions
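For orientation before the diff: the change reuses the plugin's weights cache so that an RNN weight blob repacked into a given format is created once and then shared. Below is a minimal, self-contained sketch of that find-or-create pattern; SimpleWeightsCache and makeKey are illustrative names invented here, not plugin classes, but the key layout mirrors the one assembled in Node::prepareMemory further down.

#include <cstdint>
#include <functional>
#include <memory>
#include <mutex>
#include <string>
#include <unordered_map>
#include <vector>

using Blob = std::vector<uint8_t>;
using BlobPtr = std::shared_ptr<Blob>;

// Thread-safe "find or create" storage: the expensive repack (create) runs only on a cache miss.
class SimpleWeightsCache {
public:
    BlobPtr findOrCreate(const std::string& key, const std::function<BlobPtr()>& create) {
        std::lock_guard<std::mutex> lock(m_mutex);
        auto it = m_cache.find(key);
        if (it != m_cache.end())
            return it->second;
        auto blob = create();
        m_cache.emplace(key, blob);
        return blob;
    }

private:
    std::mutex m_mutex;
    std::unordered_map<std::string, BlobPtr> m_cache;
};

// The cache key mirrors the layout used in prepareMemory: name_index_format_byteSize_dataHash,
// so identical weights requested in the same target format map to the same repacked buffer.
std::string makeKey(const std::string& nodeName, size_t idx, const std::string& format, const Blob& data) {
    const size_t dataHash = std::hash<std::string>{}(std::string(data.begin(), data.end()));
    return nodeName + "_" + std::to_string(idx) + "_" + format + "_" +
           std::to_string(data.size()) + "_" + std::to_string(dataHash);
}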

View File

@@ -771,6 +771,52 @@ void Node::initDescriptor(const NodeConfig& config) {
selectedPD->setConfig(updatedConfig);
}
void Node::prepareMemory(const DnnlMemoryDescPtr& intDesc, size_t indx) {
size_t minSize = indx + 1;
if (internalBlobMemory.size() < minSize) {
internalBlobMemory.resize(minSize);
}
if (minSize > internalBlobs.size()) {
IE_THROW() << "Can't prepare memory for internal blob, requested index: " << indx <<
" is out of bounds of the internalBlobs vector of size " << internalBlobs.size();
}
const auto &internalBlob = internalBlobs[indx];
auto create = [&] () {
// TODO [DS]: internal blobs should be removed or rewritten using Memory object
auto newDesc = MemoryDescUtils::convertToDnnlBlockedMemoryDesc(internalBlob->getTensorDesc());
Memory memory{ engine };
memory.Create(newDesc, internalBlob->buffer());
MemoryPtr _ptr = std::make_shared<Memory>(engine);
_ptr->Create(intDesc);
node::Reorder::reorderData(memory, *_ptr, context->getParamsCache());
return _ptr;
};
MemoryPtr ptr;
auto weightCache = context->getWeightsCache();
if (weightCache != nullptr && memory::format_kind::blocked == intDesc->getDnnlDesc().get_format_kind()) {
const auto& format = intDesc->serializeFormat();
const uint64_t data_hash = weightCache->GetHashFunc().hash(
internalBlob->buffer(), internalBlob->byteSize());
const std::string string_hash = name + "_" + std::to_string(indx)
+ "_" + format
+ "_" + std::to_string(internalBlob->byteSize())
+ "_" + std::to_string(data_hash);
ptr = *weightCache->findOrCreate(string_hash, create);
} else {
ptr = create();
}
internalBlobMemory[indx] = ptr;
}
void Node::prepareMemory(const std::vector<DnnlMemoryDescPtr>& intDescs) {
if (internalBlobs.size() != intDescs.size()) {
IE_THROW() << "Can't prepare memory for internal blob, internal blob and internal descs number do not match "
@@ -779,38 +825,7 @@ void Node::prepareMemory(const std::vector<DnnlMemoryDescPtr>& intDescs) {
internalBlobMemory.clear();
for (size_t i = 0; i < internalBlobs.size(); i++) {
const auto &internalBlob = internalBlobs[i];
auto create = [&] () {
// TODO [DS]: internal blobs should be removed or rewritten using Memory object
auto newDesc = MemoryDescUtils::convertToDnnlBlockedMemoryDesc(internalBlob->getTensorDesc());
Memory memory{ engine };
memory.Create(newDesc, internalBlob->buffer());
MemoryPtr _ptr = std::make_shared<Memory>(engine);
_ptr->Create(*intDescs[i]);
_ptr->SetData(memory);
return _ptr;
};
MemoryPtr ptr;
auto weightCache = context->getWeightsCache();
if (weightCache != nullptr) {
const uint64_t data_hash = weightCache->GetHashFunc().hash(
internalBlob->buffer(), internalBlob->byteSize());
const std::string string_hash = name + "_" + std::to_string(i)
+ "_" + std::to_string(internalBlob->byteSize())
+ "_" + std::to_string(data_hash);
ptr = *weightCache->findOrCreate(string_hash, create);
} else {
ptr = create();
}
internalBlobMemory.push_back(ptr);
prepareMemory(intDescs[i], i);
}
}
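"Repacking" in the new per-index prepareMemory above means reordering the original blob into the layout the selected primitive expects (node::Reorder::reorderData performs the actual oneDNN reorder). A toy, self-contained illustration of the idea follows, with a row-major to column-major transpose standing in for the real reorder; repackToColumnMajor is a made-up helper, not plugin code.

#include <cstddef>
#include <vector>

// Copy src (rows x cols, row-major) into a column-major buffer, i.e. "repack" it into the
// layout a consumer prefers. oneDNN reorders do the same job for blocked weight formats.
std::vector<float> repackToColumnMajor(const std::vector<float>& src, size_t rows, size_t cols) {
    std::vector<float> dst(rows * cols);
    for (size_t r = 0; r < rows; ++r)
        for (size_t c = 0; c < cols; ++c)
            dst[c * rows + r] = src[r * cols + c];
    return dst;
}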

View File

@@ -648,6 +648,7 @@ protected:
bool dynBatchSupport = false);
void prepareMemory(const std::vector<DnnlMemoryDescPtr>& intDescs);
void prepareMemory(const DnnlMemoryDescPtr& intDesc, size_t indx);
void prepareMemory(dnnl::primitive_desc_iterator& itpd);
MemoryPtr prepareWeightMemory(DnnlMemoryDescPtr weightDesc);

View File

@@ -1072,40 +1072,38 @@ void RNN::prepareParams() {
key.wDescs,
key.attr);
return std::make_shared<DnnlExecutor>(descPtr);
return descPtr ? std::make_shared<RnnDnnlExecutor>(descPtr) : nullptr;
};
auto cache = context->getParamsCache();
auto result = cache->getOrCreate(key, builder);
auto prevExecPtr = execPtr;
execPtr = result.first;
if (!execPtr) {
IE_THROW() << "Primitive descriptor was not found for node " << getName() << ".";
}
scratchpadMem = getScratchPadMem(execPtr->getScratchPadDesc());
if (!wasMemoryPrepared || wFormatWasChanged) {
auto pd = execPtr->getPrimitiveDesc();
auto query_weights_md = [&](int idx = 0) -> dnnl::memory::desc {
auto what = dnnl::convert_to_c(dnnl::query::weights_md);
const_dnnl_memory_desc_t cdesc = dnnl_primitive_desc_query_md(pd, what, idx);
if (!cdesc)
IE_THROW() << "query_weights_md failed for node " << getName() << " idx " << idx << ".";
dnnl_memory_desc_t cloned_md = nullptr;
dnnl_memory_desc_clone(&cloned_md, cdesc);
return dnnl::memory::desc(cloned_md);
};
std::vector<DnnlMemoryDescPtr> intDescs {
DnnlExtensionUtils::makeDescriptor(query_weights_md(0)),
DnnlExtensionUtils::makeDescriptor(query_weights_md(1)),
DnnlExtensionUtils::makeDescriptor(query_weights_md(2))
};
prepareMemory(intDescs);
wasMemoryPrepared = true;
if (!primArgs.count(DNNL_ARG_WEIGHTS_LAYER) || !prevExecPtr ||
!execPtr->getWeightDesc()->isCompatible(*(prevExecPtr->getWeightDesc()))) {
prepareMemory(execPtr->getWeightDesc(), 0);
primArgs[DNNL_ARG_WEIGHTS_LAYER] = internalBlobMemory[0]->GetPrimitive();
}
if (!primArgs.count(DNNL_ARG_WEIGHTS_ITER) || !prevExecPtr ||
!execPtr->getWeightIterDesc()->isCompatible(*(prevExecPtr->getWeightIterDesc()))) {
prepareMemory(execPtr->getWeightIterDesc(), 1);
primArgs[DNNL_ARG_WEIGHTS_ITER] = internalBlobMemory[1]->GetPrimitive();
}
if (!primArgs.count(DNNL_ARG_BIAS) || !prevExecPtr ||
!execPtr->getBiasDesc()->isCompatible(*(prevExecPtr->getBiasDesc()))) {
prepareMemory(execPtr->getBiasDesc(), 2);
primArgs[DNNL_ARG_BIAS] = internalBlobMemory[2]->GetPrimitive();
}
auto scratchpadMem = getScratchPadMem(execPtr->getScratchPadDesc());
primArgs[DNNL_ARG_SCRATCHPAD] = scratchpadMem->GetPrimitive();
}
std::shared_ptr<MemoryDesc> RNN::getSrcMemDesc(dnnl::primitive_desc_iterator& primitive_desc_it, size_t idx) {
@@ -1123,18 +1121,10 @@ void RNN::execute(dnnl::stream strm) {
const auto src_data_mem = getParentEdgeAt(0)->getMemoryPtr();
const auto dst_data_mem = getChildEdgeAt(0)->getMemoryPtr();
const auto &wgh_data_mem = internalBlobMemory[0];
const auto &wgh_stat_mem = internalBlobMemory[1];
const auto &wgh_bias_mem = internalBlobMemory[2];
auto args = primArgs;
std::unordered_map<int, memory> args {
{DNNL_ARG_SRC_LAYER, src_data_mem->GetPrimitive()},
{DNNL_ARG_WEIGHTS_LAYER, wgh_data_mem->GetPrimitive()},
{DNNL_ARG_WEIGHTS_ITER, wgh_stat_mem->GetPrimitive()},
{DNNL_ARG_BIAS, wgh_bias_mem->GetPrimitive()},
{DNNL_ARG_DST_LAYER, dst_data_mem->GetPrimitive()},
{DNNL_ARG_SCRATCHPAD, scratchpadMem->GetPrimitive()}
};
args[DNNL_ARG_SRC_LAYER] = src_data_mem->GetPrimitive();
args[DNNL_ARG_DST_LAYER] = dst_data_mem->GetPrimitive();
int state_i_tags[] {DNNL_ARG_SRC_ITER, DNNL_ARG_SRC_ITER_C};
int state_o_tags[] {DNNL_ARG_DST_ITER, DNNL_ARG_DST_ITER_C};
@@ -1180,6 +1170,11 @@ void RNN::cleanup() {
}
}
RNN::RnnDnnlExecutor::RnnDnnlExecutor(const dnnl::primitive_desc& pd) : DnnlExecutor(pd) {
wghts_iter_md = DnnlExtensionUtils::makeDescriptor(pd.weights_desc(1));
bias_md = DnnlExtensionUtils::makeDescriptor(pd.weights_desc(2));
}
} // namespace node
} // namespace intel_cpu
} // namespace ov
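The prepareParams() hunk above repacks a given weight only when no previous executor exists or when the newly selected weight descriptor is incompatible with the one already prepared. A simplified, self-contained sketch of that gate, where Desc and Executor are stand-ins for the plugin's DnnlMemoryDesc and RnnDnnlExecutor types:

#include <memory>
#include <string>

struct Desc {
    std::string format;                       // e.g. "ldigo" vs. a blocked layout
    bool isCompatible(const Desc& o) const { return format == o.format; }
};

struct Executor {
    Desc weightDesc;                          // layout the weights were last repacked into
};

// Repack only on the first run or when the freshly selected layout differs from the prepared one.
bool weightsNeedRepack(const Desc& requested, const std::shared_ptr<Executor>& prevExec) {
    return !prevExec || !requested.isCompatible(prevExec->weightDesc);
}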

View File

@@ -68,7 +68,24 @@ private:
void copyWeightsData();
using executorPtr = std::shared_ptr<DnnlExecutor>;
class RnnDnnlExecutor : public DnnlExecutor {
public:
RnnDnnlExecutor(const dnnl::primitive_desc& pd);
DnnlMemoryDescPtr getWeightIterDesc() const {
return wghts_iter_md;
}
DnnlMemoryDescPtr getBiasDesc() const {
return bias_md;
}
private:
DnnlMemoryDescPtr wghts_iter_md;
DnnlMemoryDescPtr bias_md;
};
using executorPtr = std::shared_ptr<RnnDnnlExecutor>;
executorPtr execPtr = nullptr;
/** Specify mode Cell or Seq. true - Cell, false - Seq */
@@ -143,9 +160,6 @@ private:
static constexpr size_t optimalBatchSize = 16lu;
static constexpr size_t batchDimDummyValue = 64lu;
bool wasMemoryPrepared = false;
MemoryPtr scratchpadMem;
float inputScale = 0.f;
float inputShift = 0.f;
std::vector<float> weightsScales;

View File

@@ -167,11 +167,6 @@ std::vector<std::string> disabledTestPatterns() {
// The kernel does not have such garbage. The diff 0.000000745 is taken into account in calculations and affects further type conversion.
// Reorder->GridSample->Reorder also does not work here. Potential fix is to use nearest conversion instead of truncation.
R"(.*GridSampleLayerTestCPU.*(BILINEAR|BICUBIC).*(i32|i8).*)",
// // Issue: 95915
R"(smoke_dynamic/AUGRUCellCPUTest.CompareWithRefs/IS=\(\[\?\.1\]_\[\?\.1\]_\[\?\.1\]_\)_TS=\{\(1\.1\)_\(1\.1\)_\(1\.1\)\}_\{\(3\.1\)_\(3\.1\)_\(3\.1\)\}_\{\(5\.1\)_\(5\.1\)_\(5\.1\)\}_decompose=0_activations=\(sigmoid\.tanh\)_clip=0_linear=0_netPrec=f32__inFmts=nc\.nc_outFmts=nc_primitive=ref_any_PluginConf_ENFORCE_BF16=YES)", // NOLINT
R"(smoke_dynamic/GRUCellCPUTest.CompareWithRefs/IS=\(\[\?.1\]_\[\?\.1\]_\)_TS=\{\(1\.1\)_\(1\.1\)\}_\{\(3\.1\)_\(3\.1\)\}_\{\(5\.1\)_\(5\.1\)\}_decompose=0_activations=\(sigmoid\.tanh\)_clip=0_linear=0_netPrec=f32__inFmts=nc\.nc_outFmts=nc_primitive=ref_any_PluginConf_ENFORCE_BF16=YES)", // NOLINT
R"(nightly_dynamic_bf16/RNNSequenceCPUTest.*activations=\(relu\).*)",
R"(smoke_dynamic_BatchSizeOne/RNNSequenceCPUTest.*IS=\(\[1\.\?\.10\]_\[1\.1\.10\]_\[\?\]_\)_TS=\{\(1\.2\.10\)_\(1\.1\.10\)_\(1\)\}_\{\(1\.4\.10\)_\(1\.1\.10\)_\(1\)\}_\{\(1\.8\.10\)_\(1\.1\.10\)_\(1\)\}_seqMode=PURE_SEQ_activations=\(relu\)_clip=0_direction=forward_netPrec=f32__inFmts=ncw\.ntc_outFmts=ncw\.ncw_primitive=ref_any)", // NOLINT
// 98151. Not valid sorting for slices in reference.
R"(.*UniqueLayerTestCPU.*axis.*True.*)",
};

View File

@@ -100,6 +100,12 @@ protected:
selectedType = makeSelectedTypeStr(selectedType, netPrecision);
}
if (selectedType.find("BF16") != std::string::npos) {
rel_threshold = 5e-2;
} else if (selectedType.find("FP32") != std::string::npos) {
rel_threshold = 1e-4;
}
auto params = ngraph::builder::makeDynamicParams(netPrecision, inputDynamicShapes);
const size_t batchSize = inputDynamicShapes[0][0].is_static() ? inputDynamicShapes[0][0].get_length() :
inputDynamicShapes[1][0].is_static() ? inputDynamicShapes[1][0].get_length() :