[CPU] Rnn weights repacking (#16992)
parent f410658d32
commit f8522a6ea1
@@ -771,6 +771,52 @@ void Node::initDescriptor(const NodeConfig& config) {
    selectedPD->setConfig(updatedConfig);
}

void Node::prepareMemory(const DnnlMemoryDescPtr& intDesc, size_t indx) {
    size_t minSize = indx + 1;
    if (internalBlobMemory.size() < minSize) {
        internalBlobMemory.resize(minSize);
    }

    if (minSize > internalBlobs.size()) {
        IE_THROW() << "Can't prepare memory for internal blob, requested index: " << indx <<
            " is out of bounds of the internalBlobs vector of size " << internalBlobs.size();
    }

    const auto &internalBlob = internalBlobs[indx];

    auto create = [&] () {
        // TODO [DS]: internal blobs should be removed or rewritten using Memory object
        auto newDesc = MemoryDescUtils::convertToDnnlBlockedMemoryDesc(internalBlob->getTensorDesc());

        Memory memory{ engine };
        memory.Create(newDesc, internalBlob->buffer());

        MemoryPtr _ptr = std::make_shared<Memory>(engine);
        _ptr->Create(intDesc);
        node::Reorder::reorderData(memory, *_ptr, context->getParamsCache());
        return _ptr;
    };

    MemoryPtr ptr;
    auto weightCache = context->getWeightsCache();
    if (weightCache != nullptr && memory::format_kind::blocked == intDesc->getDnnlDesc().get_format_kind()) {
        const auto& format = intDesc->serializeFormat();
        const uint64_t data_hash = weightCache->GetHashFunc().hash(
            internalBlob->buffer(), internalBlob->byteSize());

        const std::string string_hash = name + "_" + std::to_string(indx)
                                        + "_" + format
                                        + "_" + std::to_string(internalBlob->byteSize())
                                        + "_" + std::to_string(data_hash);

        ptr = *weightCache->findOrCreate(string_hash, create);
    } else {
        ptr = create();
    }

    internalBlobMemory[indx] = ptr;
}
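Note on the caching scheme above: the key now folds in the serialized target layout, so the same source blob repacked into two different oneDNN formats yields two distinct cache entries, and the cache is only consulted for blocked layouts where serializeFormat() is meaningful. A minimal sketch of how the key is composed; makeWeightCacheKey is a hypothetical name for illustration, not the plugin's API:

    #include <cstddef>
    #include <cstdint>
    #include <string>

    // Node name, blob index, serialized layout, byte size and a content hash
    // together identify one repacked copy of one weights blob.
    std::string makeWeightCacheKey(const std::string& nodeName,
                                   std::size_t index,
                                   const std::string& format,
                                   std::size_t byteSize,
                                   std::uint64_t dataHash) {
        return nodeName + "_" + std::to_string(index)
               + "_" + format
               + "_" + std::to_string(byteSize)
               + "_" + std::to_string(dataHash);
    }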

void Node::prepareMemory(const std::vector<DnnlMemoryDescPtr>& intDescs) {
    if (internalBlobs.size() != intDescs.size()) {
        IE_THROW() << "Can't prepare memory for internal blob, internal blob and internal descs number do not match "
@@ -779,38 +825,7 @@ void Node::prepareMemory(const std::vector<DnnlMemoryDescPtr>& intDescs) {

    internalBlobMemory.clear();
    for (size_t i = 0; i < internalBlobs.size(); i++) {
        const auto &internalBlob = internalBlobs[i];

        auto create = [&] () {
            // TODO [DS]: internal blobs should be removed or rewritten using Memory object
            auto newDesc = MemoryDescUtils::convertToDnnlBlockedMemoryDesc(internalBlob->getTensorDesc());

            Memory memory{ engine };
            memory.Create(newDesc, internalBlob->buffer());

            MemoryPtr _ptr = std::make_shared<Memory>(engine);
            _ptr->Create(*intDescs[i]);
            _ptr->SetData(memory);

            return _ptr;
        };

        MemoryPtr ptr;
        auto weightCache = context->getWeightsCache();
        if (weightCache != nullptr) {
            const uint64_t data_hash = weightCache->GetHashFunc().hash(
                internalBlob->buffer(), internalBlob->byteSize());

            const std::string string_hash = name + "_" + std::to_string(i)
                                            + "_" + std::to_string(internalBlob->byteSize())
                                            + "_" + std::to_string(data_hash);

            ptr = *weightCache->findOrCreate(string_hash, create);
        } else {
            ptr = create();
        }

        internalBlobMemory.push_back(ptr);
        prepareMemory(intDescs[i], i);
    }
}
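The vector overload is reduced here to size validation plus a loop that forwards to the indexed overload, so the reorder-and-cache logic (the removed create lambda and cache lookup rendered above) lives in exactly one place. A minimal sketch of that delegation shape with stand-in types; Desc and NodeSketch are illustrative, not the plugin's classes:

    #include <cstddef>
    #include <vector>

    struct Desc {};  // stand-in for DnnlMemoryDescPtr

    struct NodeSketch {
        // indexed overload: owns the reorder + caching logic (see above)
        void prepareMemory(const Desc& /*desc*/, std::size_t /*idx*/) {}

        // vector overload: a thin wrapper, one code path for every internal blob
        void prepareMemory(const std::vector<Desc>& descs) {
            for (std::size_t i = 0; i < descs.size(); ++i)
                prepareMemory(descs[i], i);
        }
    };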
@@ -648,6 +648,7 @@ protected:
                       bool dynBatchSupport = false);

    void prepareMemory(const std::vector<DnnlMemoryDescPtr>& intDescs);
    void prepareMemory(const DnnlMemoryDescPtr& intDesc, size_t indx);
    void prepareMemory(dnnl::primitive_desc_iterator& itpd);

    MemoryPtr prepareWeightMemory(DnnlMemoryDescPtr weightDesc);
@@ -1072,40 +1072,38 @@ void RNN::prepareParams() {
            key.wDescs,
            key.attr);

        return std::make_shared<DnnlExecutor>(descPtr);
        return descPtr ? std::make_shared<RnnDnnlExecutor>(descPtr) : nullptr;
    };

    auto cache = context->getParamsCache();
    auto result = cache->getOrCreate(key, builder);

    auto prevExecPtr = execPtr;
    execPtr = result.first;

    if (!execPtr) {
        IE_THROW() << "Primitive descriptor was not found for node " << getName() << ".";
    }

    scratchpadMem = getScratchPadMem(execPtr->getScratchPadDesc());

    if (!wasMemoryPrepared || wFormatWasChanged) {
        auto pd = execPtr->getPrimitiveDesc();
        auto query_weights_md = [&](int idx = 0) -> dnnl::memory::desc {
            auto what = dnnl::convert_to_c(dnnl::query::weights_md);
            const_dnnl_memory_desc_t cdesc = dnnl_primitive_desc_query_md(pd, what, idx);
            if (!cdesc)
                IE_THROW() << "query_weights_md failed for node " << getName() << " idx " << idx << ".";
            dnnl_memory_desc_t cloned_md = nullptr;
            dnnl_memory_desc_clone(&cloned_md, cdesc);

            return dnnl::memory::desc(cloned_md);
        };
        std::vector<DnnlMemoryDescPtr> intDescs {
            DnnlExtensionUtils::makeDescriptor(query_weights_md(0)),
            DnnlExtensionUtils::makeDescriptor(query_weights_md(1)),
            DnnlExtensionUtils::makeDescriptor(query_weights_md(2))
        };
        prepareMemory(intDescs);
        wasMemoryPrepared = true;
    if (!primArgs.count(DNNL_ARG_WEIGHTS_LAYER) || !prevExecPtr ||
            !execPtr->getWeightDesc()->isCompatible(*(prevExecPtr->getWeightDesc()))) {
        prepareMemory(execPtr->getWeightDesc(), 0);
        primArgs[DNNL_ARG_WEIGHTS_LAYER] = internalBlobMemory[0]->GetPrimitive();
    }

    if (!primArgs.count(DNNL_ARG_WEIGHTS_ITER) || !prevExecPtr ||
            !execPtr->getWeightIterDesc()->isCompatible(*(prevExecPtr->getWeightIterDesc()))) {
        prepareMemory(execPtr->getWeightIterDesc(), 1);
        primArgs[DNNL_ARG_WEIGHTS_ITER] = internalBlobMemory[1]->GetPrimitive();
    }

    if (!primArgs.count(DNNL_ARG_BIAS) || !prevExecPtr ||
            !execPtr->getBiasDesc()->isCompatible(*(prevExecPtr->getBiasDesc()))) {
        prepareMemory(execPtr->getBiasDesc(), 2);
        primArgs[DNNL_ARG_BIAS] = internalBlobMemory[2]->GetPrimitive();
    }

    auto scratchpadMem = getScratchPadMem(execPtr->getScratchPadDesc());
    primArgs[DNNL_ARG_SCRATCHPAD] = scratchpadMem->GetPrimitive();
}
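With this change prepareParams stops querying the C API by hand: the removed query_weights_md lambda (and the wasMemoryPrepared flag) give way to descriptors cached on the executor, and each of the three weight blobs is repacked only when primArgs has no entry for it yet, there is no previous executor, or the new layout is incompatible with the old one. A minimal sketch, assuming oneDNN's C++ API, of the query the lambda used to spell out; queryRnnWeightDescs is an illustrative name, and for RNN primitives weights indices 0, 1 and 2 correspond to layer weights, recurrent weights and bias, matching the primArgs mapping above:

    #include <vector>
    #include "dnnl.hpp"

    // pd.weights_desc(idx) wraps the same dnnl_primitive_desc_query_md call the
    // removed lambda made by hand, including the descriptor clone.
    std::vector<dnnl::memory::desc> queryRnnWeightDescs(const dnnl::primitive_desc& pd) {
        return {pd.weights_desc(0),    // layer weights     -> DNNL_ARG_WEIGHTS_LAYER
                pd.weights_desc(1),    // recurrent weights -> DNNL_ARG_WEIGHTS_ITER
                pd.weights_desc(2)};   // bias              -> DNNL_ARG_BIAS
    }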

std::shared_ptr<MemoryDesc> RNN::getSrcMemDesc(dnnl::primitive_desc_iterator& primitive_desc_it, size_t idx) {
@@ -1123,18 +1121,10 @@ void RNN::execute(dnnl::stream strm) {
    const auto src_data_mem = getParentEdgeAt(0)->getMemoryPtr();
    const auto dst_data_mem = getChildEdgeAt(0)->getMemoryPtr();

    const auto &wgh_data_mem = internalBlobMemory[0];
    const auto &wgh_stat_mem = internalBlobMemory[1];
    const auto &wgh_bias_mem = internalBlobMemory[2];
    auto args = primArgs;

    std::unordered_map<int, memory> args {
        {DNNL_ARG_SRC_LAYER, src_data_mem->GetPrimitive()},
        {DNNL_ARG_WEIGHTS_LAYER, wgh_data_mem->GetPrimitive()},
        {DNNL_ARG_WEIGHTS_ITER, wgh_stat_mem->GetPrimitive()},
        {DNNL_ARG_BIAS, wgh_bias_mem->GetPrimitive()},
        {DNNL_ARG_DST_LAYER, dst_data_mem->GetPrimitive()},
        {DNNL_ARG_SCRATCHPAD, scratchpadMem->GetPrimitive()}
    };
    args[DNNL_ARG_SRC_LAYER] = src_data_mem->GetPrimitive();
    args[DNNL_ARG_DST_LAYER] = dst_data_mem->GetPrimitive();

    int state_i_tags[] {DNNL_ARG_SRC_ITER, DNNL_ARG_SRC_ITER_C};
    int state_o_tags[] {DNNL_ARG_DST_ITER, DNNL_ARG_DST_ITER_C};
@@ -1180,6 +1170,11 @@ void RNN::cleanup() {
    }
}

RNN::RnnDnnlExecutor::RnnDnnlExecutor(const dnnl::primitive_desc& pd) : DnnlExecutor(pd) {
    wghts_iter_md = DnnlExtensionUtils::makeDescriptor(pd.weights_desc(1));
    bias_md = DnnlExtensionUtils::makeDescriptor(pd.weights_desc(2));
}

} // namespace node
} // namespace intel_cpu
} // namespace ov
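The new executor subclass snapshots the recurrent-weights and bias descriptors once, at construction, so prepareParams can compare them against the previous executor's and skip the repack when nothing changed. A minimal sketch of that skip-if-compatible idea in plain oneDNN terms; repackIfNeeded is illustrative, and the plugin actually compares DnnlMemoryDesc objects via isCompatible rather than raw descriptors:

    #include "dnnl.hpp"

    // Reorder weights into the layout the new primitive expects, but only when
    // the current layout differs -- otherwise hand back the memory unchanged.
    dnnl::memory repackIfNeeded(dnnl::memory src,
                                const dnnl::memory::desc& wanted,
                                const dnnl::engine& eng,
                                dnnl::stream& strm) {
        if (src.get_desc() == wanted)
            return src;  // compatible layout: no reorder, no extra copy
        dnnl::memory dst(wanted, eng);
        dnnl::reorder(src, dst).execute(strm, src, dst);
        return dst;
    }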
@@ -68,7 +68,24 @@ private:

    void copyWeightsData();

    using executorPtr = std::shared_ptr<DnnlExecutor>;
    class RnnDnnlExecutor : public DnnlExecutor {
    public:
        RnnDnnlExecutor(const dnnl::primitive_desc& pd);

        DnnlMemoryDescPtr getWeightIterDesc() const {
            return wghts_iter_md;
        }

        DnnlMemoryDescPtr getBiasDesc() const {
            return bias_md;
        }

    private:
        DnnlMemoryDescPtr wghts_iter_md;
        DnnlMemoryDescPtr bias_md;
    };

    using executorPtr = std::shared_ptr<RnnDnnlExecutor>;
    executorPtr execPtr = nullptr;

    /** Specify mode Cell or Seq. true - Cell, false - Seq */
@@ -143,9 +160,6 @@ private:
    static constexpr size_t optimalBatchSize = 16lu;
    static constexpr size_t batchDimDummyValue = 64lu;

    bool wasMemoryPrepared = false;
    MemoryPtr scratchpadMem;

    float inputScale = 0.f;
    float inputShift = 0.f;
    std::vector<float> weightsScales;
@@ -167,11 +167,6 @@ std::vector<std::string> disabledTestPatterns() {
        // The kernel does not have such garbage. The diff 0.000000745 is taken into account in calculations and affects further type conversion.
        // Reorder->GridSample->Reorder also does not work here. Potential fix is to use nearest conversion instead of truncation.
        R"(.*GridSampleLayerTestCPU.*(BILINEAR|BICUBIC).*(i32|i8).*)",
        // Issue: 95915
        R"(smoke_dynamic/AUGRUCellCPUTest.CompareWithRefs/IS=\(\[\?\.1\]_\[\?\.1\]_\[\?\.1\]_\)_TS=\{\(1\.1\)_\(1\.1\)_\(1\.1\)\}_\{\(3\.1\)_\(3\.1\)_\(3\.1\)\}_\{\(5\.1\)_\(5\.1\)_\(5\.1\)\}_decompose=0_activations=\(sigmoid\.tanh\)_clip=0_linear=0_netPrec=f32__inFmts=nc\.nc_outFmts=nc_primitive=ref_any_PluginConf_ENFORCE_BF16=YES)", // NOLINT
        R"(smoke_dynamic/GRUCellCPUTest.CompareWithRefs/IS=\(\[\?.1\]_\[\?\.1\]_\)_TS=\{\(1\.1\)_\(1\.1\)\}_\{\(3\.1\)_\(3\.1\)\}_\{\(5\.1\)_\(5\.1\)\}_decompose=0_activations=\(sigmoid\.tanh\)_clip=0_linear=0_netPrec=f32__inFmts=nc\.nc_outFmts=nc_primitive=ref_any_PluginConf_ENFORCE_BF16=YES)", // NOLINT
        R"(nightly_dynamic_bf16/RNNSequenceCPUTest.*activations=\(relu\).*)",
        R"(smoke_dynamic_BatchSizeOne/RNNSequenceCPUTest.*IS=\(\[1\.\?\.10\]_\[1\.1\.10\]_\[\?\]_\)_TS=\{\(1\.2\.10\)_\(1\.1\.10\)_\(1\)\}_\{\(1\.4\.10\)_\(1\.1\.10\)_\(1\)\}_\{\(1\.8\.10\)_\(1\.1\.10\)_\(1\)\}_seqMode=PURE_SEQ_activations=\(relu\)_clip=0_direction=forward_netPrec=f32__inFmts=ncw\.ntc_outFmts=ncw\.ncw_primitive=ref_any)", // NOLINT
        // 98151. Not valid sorting for slices in reference.
        R"(.*UniqueLayerTestCPU.*axis.*True.*)",
    };
@@ -100,6 +100,12 @@ protected:
        selectedType = makeSelectedTypeStr(selectedType, netPrecision);
    }

    if (selectedType.find("BF16") != std::string::npos) {
        rel_threshold = 5e-2;
    } else if (selectedType.find("FP32") != std::string::npos) {
        rel_threshold = 1e-4;
    }
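A note on the thresholds just above: bfloat16 keeps only 8 significand bits, so a single rounding step already carries a relative error near 2^-8 (about 0.4%), and an RNN cell chains many multiply-accumulates per output, hence the much looser 5e-2 tolerance versus 1e-4 for FP32. A self-contained illustration (truncation used for simplicity; hardware typically rounds to nearest):

    #include <cstdint>
    #include <cstring>
    #include <iostream>

    // Drop a float's low 16 bits -- the crudest float -> bf16 -> float round trip.
    float throughBf16(float x) {
        std::uint32_t bits;
        std::memcpy(&bits, &x, sizeof bits);
        bits &= 0xFFFF0000u;
        float y;
        std::memcpy(&y, &bits, sizeof y);
        return y;
    }

    int main() {
        float x = 1.00390625f;                    // 1 + 2^-8: one bit past bf16
        std::cout << x - throughBf16(x) << '\n';  // prints 0.00390625, ~0.4% of x
    }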

    auto params = ngraph::builder::makeDynamicParams(netPrecision, inputDynamicShapes);
    const size_t batchSize = inputDynamicShapes[0][0].is_static() ? inputDynamicShapes[0][0].get_length() :
                             inputDynamicShapes[1][0].is_static() ? inputDynamicShapes[1][0].get_length() :