[CPU] Simple fix of redundant const-weight reordering for brgconv node in dynamic model (#16305)

Tingqian Li
2023-03-29 16:27:08 +08:00
committed by GitHub
parent 556d469f6b
commit 05ab0f32d7
12 changed files with 163 additions and 110 deletions

View File

@@ -450,11 +450,19 @@ void Graph::InitDescriptors() {
node->filterSupportedPrimitiveDescriptors();
#ifdef CPU_DEBUG_CAPS
DEBUG_LOG("==================");
for (auto & pd : node->getSupportedPrimitiveDescriptors())
DEBUG_LOG("#", node->getExecIndex(),
" ", node->getName(),
" SupportedPrimitiveDescriptor:\n", pd);
const auto& SPDs = node->getSupportedPrimitiveDescriptors();
for (int i = 0; i < SPDs.size(); i++) {
DEBUG_LOG("#",
node->getExecIndex(),
" ",
node->getName(),
" SupportedPrimitiveDescriptors [",
i,
"/",
SPDs.size(),
"]: \n",
SPDs[i]);
}
#endif
}

View File

@@ -821,6 +821,51 @@ void Node::prepareMemory(dnnl::primitive_desc_iterator& itpd) {
Node::prepareMemory(intDescs);
}
MemoryPtr Node::prepareWeightMemory(DnnlMemoryDescPtr weightDesc) {
if (!getParentEdgeAt(1)->getParent()->isConstant())
IE_THROW() << "Weight input is not const for node " << getName() << ".";
auto edgeMem = getParentEdgeAt(1)->getMemoryPtr();
if (!edgeMem)
IE_THROW() << "Cannot get const weights edgeMem for node " << getName() << ".";
auto constDnnlMemOutDesc = edgeMem->GetDescWithType<DnnlMemoryDesc>();
auto weightSrcDesc = constDnnlMemOutDesc->getDnnlDesc();
weightSrcDesc = weightSrcDesc.reshape(weightDesc->getDnnlDesc().get_dims());
auto create = [&] () {
auto newSrcDesc = DnnlExtensionUtils::makeDescriptor(weightSrcDesc);
Memory srcMemory{ getEngine() };
srcMemory.Create(newSrcDesc, edgeMem->GetData());
MemoryPtr _ptr = std::make_shared<Memory>(getEngine());
_ptr->Create(weightDesc);
node::Reorder::reorderData(srcMemory, *_ptr, context->getParamsCache());
return _ptr;
};
MemoryPtr ptr;
const auto& format = weightDesc->serializeFormat();
auto itr = privateWeightCache.find(format);
if (privateWeightCache.end() != itr) {
ptr = itr->second;
} else {
auto weightCache = context->getWeightsCache();
if (weightCache != nullptr) {
const std::string string_hash = getName() + "_" + format
+ "_" + std::to_string(edgeMem->GetSize())
+ "_" + std::to_string(reinterpret_cast<uint64_t>(edgeMem->GetData()));
ptr = *weightCache->findOrCreate(string_hash, create);
} else {
ptr = create();
}
privateWeightCache[format] = ptr;
}
return ptr;
}
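A note on the reshape above: the constant blob may be stored with dims that differ from what the primitive-side descriptor expects (for example grouped vs. ungrouped weights), so the plain source descriptor is reinterpreted to the target dims before the reorder runs. A minimal oneDNN sketch of just that step, with illustrative dims:

```cpp
#include <oneapi/dnnl/dnnl.hpp>

using namespace dnnl;

int main() {
    // Plain f32 weight blob as stored by the constant node: OIHW, 64x3x7x7.
    memory::desc plain({64, 3, 7, 7}, memory::data_type::f32, memory::format_tag::abcd);

    // The primitive expects grouped 5-D weight dims (g = 1 here), so the same
    // dense buffer is reinterpreted with those dims before being reordered
    // into the blocked target descriptor.
    memory::desc grouped = plain.reshape({1, 64, 3, 7, 7});
    (void)grouped;
    return 0;
}
```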
bool Node::isInPlace() {
if (inplace == InPlaceType::Unknown) {
auto selected_pd = getSelectedPrimitiveDescriptor();

View File

@@ -619,6 +619,8 @@ protected:
void prepareMemory(const std::vector<DnnlMemoryDescPtr>& intDescs);
void prepareMemory(dnnl::primitive_desc_iterator& itpd);
MemoryPtr prepareWeightMemory(DnnlMemoryDescPtr weightDesc);
bool isDynamic = false;
bool isInputTensorAtPortEmpty(size_t port) const;
@@ -687,6 +689,14 @@ private:
enum LOOK { LOOK_UP = 1, LOOK_DOWN = 2 };
ConstantType checkConstant(LOOK look, std::vector<NodePtr>& checkNodes);
// we cannot rely on the per-NUMA weightCache for caching weights because:
// 1. it may not exist (in a single-stream configuration)
// 2. it only holds weak references; the life-cycle of a cached item
// is still under the control of strong references outside of the cache.
// privateWeightCache holds strong references to constant weight
// copies of the same content with different layouts.
std::unordered_map<std::string, MemoryPtr> privateWeightCache;
#ifdef CPU_DEBUG_CAPS
friend class Verbose;
#endif
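The weak-reference caveat in the comment is the crux: a cache of weak_ptr entries keeps nothing alive on its own, which is why each node needs privateWeightCache as a strong-reference holder. A standalone toy (not the actual WeightsSharing implementation) showing the failure mode:

```cpp
#include <cassert>
#include <memory>
#include <string>
#include <unordered_map>

// Toy cache holding only weak references, like the per-NUMA weightCache.
static std::unordered_map<std::string, std::weak_ptr<int>> cache;

static std::shared_ptr<int> findOrCreate(const std::string& key) {
    if (auto hit = cache[key].lock())
        return hit;
    auto created = std::make_shared<int>(42);  // stands in for a reordered blob
    cache[key] = created;
    return created;
}

int main() {
    findOrCreate("w");              // strong reference dropped immediately...
    assert(cache["w"].expired());   // ...so the cached entry is already gone
    auto keep = findOrCreate("w");  // recreated: a redundant reorder
    assert(!cache["w"].expired());  // alive only while `keep` exists
    return 0;
}
```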

View File

@@ -57,6 +57,8 @@ struct ConvKey {
dnnl::primitive_attr attr;
impl_desc_type implType;
bool constWeight;
size_t hash() const;
bool operator==(const ConvKey& rhs) const;
};
@@ -80,6 +82,7 @@ size_t ConvKey::hash() const {
seed = hash_combine(seed, get_attr_hash(*attr.get()));
seed = hash_combine(seed, implType);
seed = hash_combine(seed, constWeight);
return seed;
}
@@ -103,7 +106,7 @@ bool ConvKey::operator==(const ConvKey &rhs) const {
retVal = retVal && paddingL == rhs.paddingL;
retVal = retVal && paddingR == rhs.paddingR;
retVal = retVal && *attr.get() == *rhs.attr.get() && implType == rhs.implType;
retVal = retVal && *attr.get() == *rhs.attr.get() && implType == rhs.implType && constWeight == rhs.constWeight;
return retVal;
}
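Folding constWeight into both hash() and operator== keeps the executor cache sound: two convolutions that differ only in the constness of the weight input must not share a cached executor. A rough sketch of the pattern, assuming a boost-style combine (the real ov::intel_cpu::hash_combine may differ in detail):

```cpp
#include <cassert>
#include <cstddef>
#include <functional>

// Boost-style combine; assumed to approximate the helper used by ConvKey.
template <typename T>
size_t hash_combine(size_t seed, const T& v) {
    return seed ^ (std::hash<T>{}(v) + 0x9e3779b9 + (seed << 6) + (seed >> 2));
}

struct Key {
    int implType;
    bool constWeight;

    size_t hash() const {
        size_t seed = 0;
        seed = hash_combine(seed, implType);
        seed = hash_combine(seed, constWeight);  // omit this and const/non-const
        return seed;                             // keys would collide
    }
    bool operator==(const Key& rhs) const {
        return implType == rhs.implType && constWeight == rhs.constWeight;
    }
};

int main() {
    Key a{1, true}, b{1, false};
    assert(!(a == b));  // distinct keys -> separate cache entries
    return 0;
}
```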
@@ -851,6 +854,14 @@ createDescriptorInternal(const dnnl::engine& engine,
}
} // namespace
static memory::data_type deriveWeightDataType(memory::data_type src_dt) {
memory::data_type wdt = src_dt;
if (one_of(src_dt, memory::data_type::s8, memory::data_type::u8)) {
wdt = memory::data_type::s8;
}
return wdt;
}
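The factored-out helper encodes oneDNN's int8 convention: quantized convolutions take s8 weights for both u8 and s8 activations, while other source types keep the weight type unchanged. A self-contained check of that behaviour (the helper body is mirrored here for illustration):

```cpp
#include <cassert>
#include <oneapi/dnnl/dnnl.hpp>

// Mirror of the helper above.
static dnnl::memory::data_type deriveWeightDataType(dnnl::memory::data_type src_dt) {
    using dt = dnnl::memory::data_type;
    return (src_dt == dt::s8 || src_dt == dt::u8) ? dt::s8 : src_dt;
}

int main() {
    using dt = dnnl::memory::data_type;
    assert(deriveWeightDataType(dt::u8) == dt::s8);    // int8 conv: s8 weights
    assert(deriveWeightDataType(dt::s8) == dt::s8);
    assert(deriveWeightDataType(dt::f32) == dt::f32);  // float types unchanged
    return 0;
}
```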
void Convolution::createDescriptor(const std::vector<MemoryDescPtr>& inputDesc,
const std::vector<MemoryDescPtr>& outputDesc) {
MemoryDescPtr inpDesc;
@@ -874,12 +885,7 @@ void Convolution::createDescriptor(const std::vector<MemoryDescPtr>& inputDesc,
const auto& inDnnlDesc = definedInpMemDesc->getDnnlDesc();
const auto& outDnnlDesc = definedOutMemDesc->getDnnlDesc();
memory::data_type dt = inDnnlDesc.get_data_type();
memory::data_type wdt = dt;
if (one_of(dt, memory::data_type::s8, memory::data_type::u8)) {
wdt = memory::data_type::s8;
}
memory::data_type wdt = deriveWeightDataType(inDnnlDesc.get_data_type());
dnnl::memory::desc weightDnnlDesc(DnnlExtensionUtils::convertToDnnlDims(weightDims), wdt, memory::format_tag::any);
dnnl::memory::desc biasDnnlDesc;
@@ -1143,6 +1149,11 @@ bool Convolution::isPossibleToSkipInitConfig(const dnnl::primitive_desc &desc) c
}
std::shared_ptr<MemoryDesc> Convolution::getSrcMemDesc(dnnl::primitive_desc_iterator &primitive_desc_it, size_t idx) {
if (idx == 1) {
// report original plain layout for weight since it needs to be reordered dynamically at runtime
return std::make_shared<CpuBlockedMemoryDesc>(getOriginalInputPrecisionAtPort(idx),
Shape(getInputShapeAtPort(idx).getStaticDims()));
}
auto desc = idx > 0 ? primitive_desc_it.weights_desc(idx - 1) : primitive_desc_it.src_desc(idx);
if (getInputShapeAtPort(idx).isDynamic()) {
return DnnlExtensionUtils::makeUndefinedDesc(desc, getInputShapeAtPort(idx));
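Reporting a plain layout here pairs with the format_tag::any weight descriptor built in prepareParams() below: the primitive is left free to pick its preferred blocked weight layout, which is then queried and used as the reorder target. A minimal standalone sketch against the oneDNN 3.x API, with illustrative shapes:

```cpp
#include <oneapi/dnnl/dnnl.hpp>

using namespace dnnl;

int main() {
    engine eng(engine::kind::cpu, 0);

    memory::desc src({1, 64, 56, 56}, memory::data_type::f32, memory::format_tag::nchw);
    memory::desc dst({1, 64, 56, 56}, memory::data_type::f32, memory::format_tag::nchw);
    // Leave the weight layout unconstrained: the implementation decides.
    memory::desc wei_any({64, 64, 3, 3}, memory::data_type::f32, memory::format_tag::any);

    auto pd = convolution_forward::primitive_desc(
            eng, prop_kind::forward_inference, algorithm::convolution_direct,
            src, wei_any, dst,
            /*strides=*/{1, 1}, /*padding_l=*/{1, 1}, /*padding_r=*/{1, 1});

    // The layout the implementation actually wants; the constant weights are
    // reordered into it once and the result is cached for reuse.
    memory::desc chosen = pd.weights_desc();
    (void)chosen;
    return 0;
}
```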
@@ -1352,10 +1363,17 @@ void Convolution::prepareParams() {
paddingL,
paddingR,
*pAttrLocal,
selected_pd->getImplementationType()};
selected_pd->getImplementationType(),
getParentEdgeAt(1)->getParent()->isConstant()};
auto engine = getEngine();
auto builder = [&engine](const ConvKey& key) -> executorPtr {
// remove the requirement on the weight memory layout so the primitive
// reports its preferred weight layout, to be reordered dynamically at runtime
auto wghDescAny =
dnnl::memory::desc(DnnlExtensionUtils::convertToDnnlDims(key.inp1->getShape().getStaticDims()),
deriveWeightDataType(key.inp0->getDataType()),
memory::format_tag::any);
auto createDnnlConvDesc = [](const dnnl::engine engine,
const dnnl::memory::desc& srcDesc,
const dnnl::memory::desc& wghDesc,
@@ -1390,7 +1408,7 @@ void Convolution::prepareParams() {
const auto alg = (key.implType & impl_desc_type::winograd) ? dnnl::algorithm::convolution_winograd : dnnl::algorithm::convolution_direct;
dnnl::primitive_desc desc = createDnnlConvDesc(engine,
key.inp0->getDnnlDesc(),
key.inp1->getDnnlDesc(),
wghDescAny,
key.out->getDnnlDesc(),
key.bias,
key.stride,
@@ -1401,7 +1419,6 @@ void Convolution::prepareParams() {
key.attr);
auto itpd = desc;
executorPtr execPtr = nullptr;
while (static_cast<bool>(itpd)) {
impl_desc_type impl_type = parse_impl_name(itpd.impl_info_str());
@@ -1412,7 +1429,8 @@ void Convolution::prepareParams() {
key.inp0->getDnnlDesc(),
key.inp1->getDnnlDesc(),
key.out->getDnnlDesc(),
engine);
engine,
key.constWeight);
break;
}
@@ -1425,16 +1443,13 @@ void Convolution::prepareParams() {
auto inDesc = dnnl::memory::desc(DnnlExtensionUtils::convertToDnnlDims(key.inp0->getShape().getStaticDims()),
key.inp0->getDataType(),
memory::format_tag::any);
auto wghDesc = dnnl::memory::desc(DnnlExtensionUtils::convertToDnnlDims(key.inp1->getShape().getStaticDims()),
key.inp1->getDataType(),
memory::format_tag::any);
auto outDesc = dnnl::memory::desc(DnnlExtensionUtils::convertToDnnlDims(key.out->getShape().getStaticDims()),
key.out->getDataType(),
memory::format_tag::any);
auto reorderConvDesc = createDnnlConvDesc(engine,
inDesc,
wghDesc,
wghDescAny,
outDesc,
key.bias,
key.stride,
@@ -1450,13 +1465,15 @@ void Convolution::prepareParams() {
key.inp0->getDnnlDesc(),
key.inp1->getDnnlDesc(),
key.out->getDnnlDesc(),
engine);
engine,
key.constWeight);
}
}
return execPtr;
};
auto prevExecPtr = execPtr;
execPtr = nullptr;
auto cache = context->getParamsCache();
auto result = cache->getOrCreate(key, builder);
@@ -1465,9 +1482,22 @@ void Convolution::prepareParams() {
if (execPtr) {
primArgs[DNNL_ARG_SRC] = srcMemPtr->GetPrimitive();
primArgs[DNNL_ARG_WEIGHTS] = wghMemPtr->GetPrimitive();
primArgs[DNNL_ARG_DST] = dstMemPtr->GetPrimitive();
if (key.constWeight) {
// const weight preparation/reordering needs to be done once, at the next execution,
// when the input weight data is guaranteed to be ready (considering possible const-folding
// subgraphs inserted between the constant weight node and the conv)
auto it = primArgs.find(DNNL_ARG_WEIGHTS);
if (it == primArgs.end() || !prevExecPtr ||
!execPtr->getWeightDesc()->isCompatible(*(prevExecPtr->getWeightDesc()))) {
pendingConstWeightReorder = true;
}
} else {
// non-const weight will be reordered by executor on every exec
primArgs[DNNL_ARG_WEIGHTS] = wghMemPtr->GetPrimitive();
}
if (withBiases) {
primArgs[DNNL_ARG_BIAS] = biasMemPtr->GetPrimitive();
}
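This gating is the heart of the fix: in a dynamic model prepareParams() runs for every new shape, but the freshly selected executor often wants the same blocked weight layout as before, in which case the already reordered constant weights can simply be kept. A toy model of the control flow (the names and the string-based layout stand-in are invented for illustration):

```cpp
#include <iostream>
#include <string>

struct Executor {
    std::string weightFormat;  // stands in for the dnnl weight memory desc
};

struct ConvNode {
    const Executor* prev = nullptr;
    bool pendingConstWeightReorder = false;

    void prepareParams(const Executor& cur) {
        // Reorder only when there was no executor yet, or the newly selected
        // one wants an incompatible weight layout.
        if (!prev || prev->weightFormat != cur.weightFormat)
            pendingConstWeightReorder = true;
        prev = &cur;
    }
    void execute() {
        if (pendingConstWeightReorder) {
            std::cout << "reordering const weights once\n";
            pendingConstWeightReorder = false;
        }
    }
};

int main() {
    Executor e1{"aBcd16b"}, e2{"aBcd16b"};   // two shapes, same chosen layout
    ConvNode conv;
    conv.prepareParams(e1); conv.execute();  // reorders once
    conv.prepareParams(e2); conv.execute();  // no redundant reorder
    return 0;
}
```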
@@ -1497,12 +1527,14 @@ Convolution::ConvolutionExecutor::ConvolutionExecutor(const dnnl::convolution_fo
const dnnl::memory::desc& inMemDesc,
const dnnl::memory::desc& weightMemDesc,
const dnnl::memory::desc& outMemDesc,
const dnnl::engine& engine) : DnnlExecutor(pd) {
const dnnl::engine& engine,
bool constWeight) : DnnlExecutor(pd) {
if (inMemDesc != getDnnlSrcDesc()) {
inputReorders.insert({DNNL_ARG_SRC, IntermReorder(inMemDesc, getDnnlSrcDesc(), engine)});
}
if (weightMemDesc != getDnnlWeightDesc()) {
if (!constWeight && weightMemDesc != getDnnlWeightDesc()) {
// const weight will be reordered at the first execution
inputReorders.insert({DNNL_ARG_WEIGHTS, IntermReorder(weightMemDesc, getDnnlWeightDesc(), engine)});
}
@@ -1516,6 +1548,11 @@ void Convolution::execute(dnnl::stream strm) {
IE_THROW() << "Can't execute Convolution node with name: " << getName() << ", because executor is not compiled";
}
if (pendingConstWeightReorder) {
primArgs[DNNL_ARG_WEIGHTS] = prepareWeightMemory(execPtr->getWeightDesc())->GetPrimitive();
pendingConstWeightReorder = false;
}
execPtr->exec(primArgs, strm);
}
@@ -1630,13 +1667,8 @@ void Convolution::appendZeroPointsArgs() {
}
}
// brgconv will be enabled by default:
// 1, static shape (a dynamic shape may change the weights layout when the input shape changes, causing performance issue 86948)
// 2, hw supports avx512+
// brgconv will be enabled by default when HW supports avx512+
void Convolution::initTryBrgconvFlag() {
if (isDynamicNode())
return;
if (dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx512_core)) {
shouldTryBrgconv = true;
}

View File

@@ -94,8 +94,10 @@ private:
const dnnl::memory::desc& inMemDesc,
const dnnl::memory::desc& weightMemDesc,
const dnnl::memory::desc& outMemDesc,
const dnnl::engine& engine);
const dnnl::engine& engine,
bool constWeight);
};
bool pendingConstWeightReorder = false;
void prepareParams() override;
void execute(dnnl::stream strm) override;

View File

@@ -913,51 +913,6 @@ bool FullyConnected::canBeExecutedInConv1x1() const {
return retVal;
}
MemoryPtr FullyConnected::prepareWeightMemory(DnnlMemoryDescPtr weightDesc) {
if (!getParentEdgeAt(1)->getParent()->isConstant())
IE_THROW() << "Weight input is not const for node " << getName() << ".";
auto blob = getParentEdgeAt(1)->getMemoryPtr();
if (!blob)
IE_THROW() << "Cannot get const weights blob for node " << getName() << ".";
auto constDnnlMemOutDesc = blob->GetDescWithType<DnnlMemoryDesc>();
auto weightSrcDesc = constDnnlMemOutDesc->getDnnlDesc();
weightSrcDesc = weightSrcDesc.reshape(weightDesc->getDnnlDesc().get_dims());
auto create = [&] () {
auto newSrcDesc = DnnlExtensionUtils::makeDescriptor(weightSrcDesc);
Memory srcMemory{ getEngine() };
srcMemory.Create(newSrcDesc, blob->GetData());
MemoryPtr _ptr = std::make_shared<Memory>(getEngine());
_ptr->Create(weightDesc);
node::Reorder::reorderData(srcMemory, *_ptr, context->getParamsCache());
return _ptr;
};
MemoryPtr ptr;
const auto& format = weightDesc->serializeFormat();
auto itr = privateWeightCache.find(format);
if (privateWeightCache.end() != itr) {
ptr = itr->second;
} else {
auto weightCache = context->getWeightsCache();
if (weightCache != nullptr) {
const std::string string_hash = getName() + "_" + format
+ "_" + std::to_string(blob->GetSize())
+ "_" + std::to_string(reinterpret_cast<uint64_t>(blob->GetData()));
ptr = *weightCache->findOrCreate(string_hash, create);
} else {
ptr = create();
}
privateWeightCache[format] = ptr;
}
return ptr;
}
bool FullyConnected::useSparseWeightsDecompression() {
// minSparseRate == 1 means that sparse feature is switched off
if (minSparseRate == 1.f) {

View File

@@ -83,11 +83,6 @@ private:
bool useConv1x1 = false;
impl_desc_type implementationTypeIP;
MemoryDescPtr weightDescIP;
// when weightCache is not enabled (such as stream=1), brgconv weights may change due to
// different shapes, so they are cached in privateWeightCache.
// When weightCache is enabled, this map still holds a strong reference to the weight
// ptr, since weightCache itself holds only weak references.
std::unordered_map<std::string, MemoryPtr> privateWeightCache;
dnnl::primitive_attr attr;
static dnnl::convolution_forward::primitive_desc
@@ -99,7 +94,6 @@ private:
const dnnl::engine& engine);
bool canBeExecutedInConv1x1() const;
MemoryPtr prepareWeightMemory(const DnnlMemoryDescPtr weightDesc);
// sparse weights
bool useSparseWeights = false;

View File

@@ -21,6 +21,17 @@
namespace ov {
namespace intel_cpu {
namespace {
size_t replace_all(std::string & inout, std::string what, std::string with) {
std::size_t count{};
for (std::string::size_type pos{}; inout.npos != (pos = inout.find(what.data(), pos, what.length()));
pos += with.length(), ++count) {
inout.replace(pos, what.length(), with.data(), with.length());
}
return count;
}
}
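The helper is hoisted out of the Node printer below (where it used to be a local lambda) so the NodeDesc printer can use it too, collapsing the verbose dynamic-dimension rendering. An illustrative use (the descriptor string is made up):

```cpp
std::string s = "I32 [0 - ?, 0 - ?, 16]";
size_t n = replace_all(s, "0 - ?", "?");  // s == "I32 [?, ?, 16]", n == 2
```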
DebugLogEnabled::DebugLogEnabled(const char* file, const char* func, int line, const char* name) {
// check ENV
const char* p_filters = std::getenv("OV_CPU_DEBUG_LOG");
@@ -96,19 +107,27 @@ std::ostream & operator<<(std::ostream & os, const MemoryDesc& desc) {
}
std::ostream & operator<<(std::ostream & os, const NodeDesc& desc) {
os << " ImplementationType: " << impl_type_to_string(desc.getImplementationType()) << std::endl;
std::stringstream ss;
ss << " " << impl_type_to_string(desc.getImplementationType()) << "(";
const char * sep = "";
for (auto & conf : desc.getConfig().inConfs) {
os << " inConfs: " << *conf.getMemDesc();
if (conf.inPlace() >= 0) os << " inPlace:" << conf.inPlace();
if (conf.constant()) os << " constant";
os << std::endl;
ss << sep << *conf.getMemDesc();
if (conf.inPlace() >= 0) ss << " inPlace:" << conf.inPlace();
if (conf.constant()) ss << " constant";
sep = ",";
}
ss << ") -> (";
sep = "";
for (auto & conf : desc.getConfig().outConfs) {
os << " outConfs: " << *conf.getMemDesc();
if (conf.inPlace() >= 0) os << " inPlace:" << conf.inPlace();
if (conf.constant()) os << " constant";
os << std::endl;
ss << sep << *conf.getMemDesc();
if (conf.inPlace() >= 0) ss << " inPlace:" << conf.inPlace();
if (conf.constant()) ss << " constant";
sep = ",";
}
ss << ")" << std::endl;
auto str = ss.str();
replace_all(str, "0 - ?", "?");
os << str;
return os;
}
@@ -137,15 +156,7 @@ std::ostream & operator<<(std::ostream & os, const Node &c_node) {
}
return true;
};
auto replace_all = [](std::string& inout, std::string what, std::string with) {
std::size_t count{};
for (std::string::size_type pos{};
inout.npos != (pos = inout.find(what.data(), pos, what.length()));
pos += with.length(), ++count) {
inout.replace(pos, what.length(), with.data(), with.length());
}
return count;
};
auto nodeDesc = node.getSelectedPrimitiveDescriptor();
std::stringstream leftside;

View File

@@ -42,8 +42,6 @@ const std::vector<LayerTestsDefinitions::ElementwiseBranchSelectionTestValues> p
},
{ 256ul, ngraph::Shape { 1, 1, 1, 1 }, { 0.f }, { 2.55f }, { 0.f }, { 2.55f } },
{
{"Constant", "convolution1"},
{"Constant", "convolution2"},
{"fakeQuantizeBefore1", "convolution1"},
{"fakeQuantizeBefore2", "convolution2"},
{"maxPool", "result"}
@@ -75,8 +73,6 @@ const std::vector<LayerTestsDefinitions::ElementwiseBranchSelectionTestValues> p
},
{ 256ul, ngraph::Shape { 1, 1, 1, 1 }, { 0.f }, { 2.55f }, { 0.f }, { 2.55f } },
{
{"Constant", "convolution1"},
{"Constant", "convolution2"},
{"fakeQuantizeBefore1", "convolution1"},
{"fakeQuantizeBefore2", "convolution2"},
{"maxPool", "result"}

View File

@@ -16,7 +16,7 @@ INSTANTIATE_TEST_SUITE_P(smoke_Snippets_ConvAdd, ConvEltwise,
::testing::Values(convInputShape),
::testing::Values(convInputShape),
::testing::Values(std::shared_ptr<ov::Node> (std::make_shared<ov::op::v1::Add>())), // non-tokenizable
::testing::Values(6), // num nodes = 6: Convert + Convolution + 4 Reorders on Convs in&outs
::testing::Values(5), // num nodes = 5: Convert + Convolution + 3 Reorders on Convs in&outs
::testing::Values(0), // num subgraphs = 0: no Subgraph since all eltwise ops are fused into Convolution
::testing::Values(CommonTestUtils::DEVICE_CPU)),
ConvEltwise::getTestCaseName);
@@ -26,7 +26,7 @@ INSTANTIATE_TEST_SUITE_P(smoke_Snippets_ConvMul, ConvEltwise,
::testing::Values(convInputShape),
::testing::Values(convInputShape),
::testing::Values(std::shared_ptr<ov::Node> (std::make_shared<ov::op::v1::Multiply>())), // fully-tokenizable
::testing::Values(7), //num nodes = 7: Convert + Convolution + Subgraph + Reorders
::testing::Values(6), //num nodes = 6: Convert + Convolution + Subgraph + Reorders
::testing::Values(1), // num subgraphs = 1: Mul (2 inputs) can't be fused into Conv => Subgraph is created
::testing::Values(CommonTestUtils::DEVICE_CPU)),
ConvEltwise::getTestCaseName);

View File

@@ -121,8 +121,8 @@ INSTANTIATE_TEST_SUITE_P(
::testing::Combine(
::testing::ValuesIn(testValuesLegacyFuse),
::testing::ValuesIn(operations),
// reorder (nChw[16|8]c) + MaxPool + reorder(nhwc) + reorder(ABcd16b16a) + Convolution + reorder(nchw)
::testing::Values(std::pair<size_t, size_t>{6, 0}),
// reorder (nChw[16|8]c) + MaxPool + reorder(nhwc) + Convolution(with internal weight reordering) + reorder(nchw)
::testing::Values(std::pair<size_t, size_t>{5, 0}),
::testing::Values(CommonTestUtils::DEVICE_CPU)),
FakeQuantizeDecompositionTest::getTestCaseName);

View File

@@ -71,9 +71,9 @@ namespace {
TEST_P(ConcatConstantInPlaceTest, smoke_ConcatConstantInPlaceTest_CPU) {
Run();
if (this->GetParam() == Precision::BF16)
CheckNumberOfNodesWithType(executableNetwork, "Reorder", 4);
else
CheckNumberOfNodesWithType(executableNetwork, "Reorder", 3);
else
CheckNumberOfNodesWithType(executableNetwork, "Reorder", 2);
}
INSTANTIATE_TEST_SUITE_P(smoke_ConcatConstantInPlaceTest_CPU, ConcatConstantInPlaceTest,