[CPU] Use brgconv1x1 instead of inner product in some cases (#13715)

Luo Cheng 2022-11-21 19:00:49 +08:00 committed by GitHub
parent 47e80200dd
commit cf7b174bf9
15 changed files with 525 additions and 118 deletions


@@ -156,5 +156,13 @@ DnnlMemoryDescPtr DnnlExtensionUtils::query_md(const const_dnnl_primitive_desc_t
return DnnlExtensionUtils::makeDescriptor(*cdesc);
}
std::string DnnlExtensionUtils::query_impl_info_str(const const_dnnl_primitive_desc_t& pd) {
const char *res;
dnnl_status_t status = dnnl_primitive_desc_query(pd, dnnl_query_impl_info_str, 0, &res);
if (status != dnnl_success)
IE_THROW() << "query_impl_info_str failed.";
return std::string(res);
}
} // namespace intel_cpu
} // namespace ov
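A minimal usage sketch of the new helper (a hypothetical call site, not code from the patch; `executor` stands for any DnnlExecutor from this commit, and `parse_impl_name` is the existing mapper used later in fullyconnected.cpp):

// Hypothetical call site: resolve which oneDNN kernel a primitive descriptor chose.
const_dnnl_primitive_desc_t pd = executor->getPrimitiveDesc();
const std::string implName = DnnlExtensionUtils::query_impl_info_str(pd); // e.g. "brgconv_avx512_1x1"
const impl_desc_type implType = parse_impl_name(implName);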


@@ -49,6 +49,7 @@ public:
static size_t getMemSizeForDnnlDesc(const dnnl::memory::desc& desc);
static std::shared_ptr<DnnlMemoryDesc> query_md(const const_dnnl_primitive_desc_t& pd, const dnnl::query& what, int idx = 0);
static std::string query_impl_info_str(const const_dnnl_primitive_desc_t& pd);
};
} // namespace intel_cpu


@@ -50,6 +50,8 @@
#include <transformations/utils/utils.hpp>
#include <low_precision/low_precision.hpp>
#include "memory_desc/dnnl_blocked_memory_desc.h"
#include <common/primitive_desc.hpp>
#include <common/primitive_desc_iface.hpp>
using namespace dnnl;
using namespace InferenceEngine;
@@ -843,6 +845,13 @@ void Graph::CreatePrimitives() {
OV_ITT_SCOPE(FIRST_INFERENCE, itt::domains::intel_cpu_LT, node->profiling.createPrimitive);
DEBUG_LOG(*node);
node->createPrimitive();
#ifdef CPU_DEBUG_CAPS
if (node->prim) {
auto pd_c = (*node->prim).get_primitive_desc();
auto* pd = reinterpret_cast<const dnnl_primitive_desc*>(pd_c);
DEBUG_LOG("verbose##", node->getName(), "##", pd->info(), "\n");
}
#endif
}
}


@@ -60,6 +60,8 @@
#include "nodes/common/cpu_convert.h"
#include "memory_desc/cpu_memory_desc_utils.h"
#include "memory_desc/dnnl_blocked_memory_desc.h"
#include <common/primitive_desc.hpp>
#include <common/primitive_desc_iface.hpp>
using namespace dnnl;
using namespace openvino;
@@ -528,6 +530,13 @@ void Node::executeDynamic(dnnl::stream strm) {
DEBUG_LOG(" prepareParams() on #", getExecIndex(), " ", getTypeStr(), " ", algToString(getAlgorithm()),
" ", getName(), " ", getOriginalLayers());
prepareParams();
#ifdef CPU_DEBUG_CAPS
if (prim) {
auto pd_c = (*prim).get_primitive_desc();
auto* pd = reinterpret_cast<const dnnl_primitive_desc*>(pd_c);
DEBUG_LOG("verbose##", getName(), "##", pd->info(), "\n");
}
#endif
}
executeDynamicImpl(strm);
}


@@ -54,5 +54,35 @@ Primitive DnnlExecutor::getExecPrim() const {
return execPrim;
}
const_dnnl_primitive_desc_t DnnlExecutor::getPrimitiveDesc() const {
return (*execPrim).get_primitive_desc();
}
dnnl::memory::desc DnnlExecutor::getSrcDesc() const {
auto pd = getPrimitiveDesc();
auto md = DnnlExtensionUtils::query_md(pd, dnnl::query::src_md);
return md->getDnnlDesc();
}
dnnl::memory::desc DnnlExecutor::getWeightDesc() const {
auto pd = getPrimitiveDesc();
auto md = DnnlExtensionUtils::query_md(pd, dnnl::query::weights_md);
return md->getDnnlDesc();
}
dnnl::memory::desc DnnlExecutor::getDstDesc() const {
auto pd = getPrimitiveDesc();
auto md = DnnlExtensionUtils::query_md(pd, dnnl::query::dst_md);
return md->getDnnlDesc();
}
impl_desc_type DnnlExecutor::getImplementationType() const {
auto pd = getPrimitiveDesc();
return parse_impl_name(DnnlExtensionUtils::query_impl_info_str(pd));
}
} // namespace intel_cpu
} // namespace ov


@@ -6,6 +6,7 @@
#include <cpu_memory.h>
#include <primitive.h>
#include <onednn/iml_type_mapper.h>
namespace ov {
namespace intel_cpu {
@@ -30,6 +31,11 @@ class DnnlExecutor {
bool needReordering() const;
virtual ~DnnlExecutor() = default;
Primitive getExecPrim() const;
const_dnnl_primitive_desc_t getPrimitiveDesc() const;
dnnl::memory::desc getSrcDesc() const;
dnnl::memory::desc getWeightDesc() const;
dnnl::memory::desc getDstDesc() const;
impl_desc_type getImplementationType() const;
protected:
DnnlExecutor() = default;


@@ -24,6 +24,8 @@
#include "utils/cpu_utils.hpp"
#include <common/primitive_hashing_utils.hpp>
#include <cpu/cpu_primitive.hpp>
#include <common/primitive_desc.hpp>
#include <common/primitive_desc_iface.hpp>
using namespace dnnl;
using namespace InferenceEngine;
@@ -1445,9 +1447,14 @@ void Convolution::prepareParams() {
Node::appendPostOpArgs(*pAttrLocal, primArgs, convPostOpsArgs[preferLegacyPostOps]);
auto pd = (*(execPtr->getExecPrim())).get_primitive_desc();
auto pd = execPtr->getPrimitiveDesc();
auto scratchpadMem = getScratchPadMem(pd);
primArgs[DNNL_ARG_SCRATCHPAD] = scratchpadMem->GetPrimitive();
#ifdef CPU_DEBUG_CAPS
if (result.second == CacheEntryBase::LookUpStatus::Miss) {
DEBUG_LOG("verbose##", getName(), "##", pd->info(), "\n");
}
#endif
} else {
IE_THROW() << "Primitive descriptor was not found for node " << getName() << ".";
}


@@ -23,6 +23,8 @@
#include <ie_ngraph_utils.hpp>
#include "convolution_shape_inference.hpp"
#include <common/primitive_hashing_utils.hpp>
#include <common/primitive_desc.hpp>
#include <common/primitive_desc_iface.hpp>
using namespace dnnl;
using namespace InferenceEngine;
@@ -935,9 +937,14 @@ void Deconvolution::prepareParams() {
}
Node::appendPostOpArgs(*pAttrLocal, primArgs, postOpsArgs);
auto pd = (*(execPtr->getExecPrim())).get_primitive_desc();
auto pd = execPtr->getPrimitiveDesc();
auto scratchpadMem = getScratchPadMem(pd);
primArgs[DNNL_ARG_SCRATCHPAD] = scratchpadMem->GetPrimitive();
#ifdef CPU_DEBUG_CAPS
if (result.second == CacheEntryBase::LookUpStatus::Miss) {
DEBUG_LOG("verbose##", getName(), "##", pd->info(), "\n");
}
#endif
} else {
IE_THROW() << "Primitive descriptor was not found for node " << getName() << ".";
}


@@ -5,6 +5,8 @@
#include "fullyconnected.h"
#include "eltwise.h"
#include "fake_quantize.h"
#include "input.h"
#include "reorder.h"
#include "ngraph_transformations/op/fully_connected.hpp"
#include <ngraph/opsets/opset1.hpp>
#include <string>
@@ -12,10 +14,14 @@
#include <dnnl_extension_utils.h>
#include <onednn/dnnl.h>
#include "utils/general_utils.h"
#include "cpu/x64/cpu_isa_traits.hpp"
#include <memory_desc/cpu_memory_desc_utils.h>
#include "memory_desc/dnnl_blocked_memory_desc.h"
#include "utils/cpu_utils.hpp"
#include <common/primitive_hashing_utils.hpp>
#include <common/primitive_desc.hpp>
#include <common/primitive_desc_iface.hpp>
#include "onednn/dnnl.h"
using namespace dnnl;
using namespace InferenceEngine;
@@ -32,6 +38,7 @@ struct FCKey {
DnnlMemoryDescCPtr out;
dnnl::primitive_attr attr;
impl_desc_type implType;
bool useConv1x1;
size_t hash() const;
bool operator==(const FCKey& rhs) const;
@@ -51,6 +58,7 @@ size_t FCKey::hash() const {
seed = hash_combine(seed, get_attr_hash(*attr.get()));
seed = hash_combine(seed, implType);
seed = hash_combine(seed, useConv1x1);
return seed;
}
@@ -69,7 +77,7 @@ bool FCKey::operator==(const FCKey &rhs) const {
retVal = retVal && out && rhs.out && out->getDnnlDesc() == rhs.out->getDnnlDesc();
}
retVal = retVal && *attr.get() == *rhs.attr.get() &&
implType == rhs.implType;
implType == rhs.implType && useConv1x1 == rhs.useConv1x1;
return retVal;
}
@@ -205,14 +213,11 @@ void FullyConnected::getSupportedDescriptors() {
void FullyConnected::prepareParams() {
auto srcMemPtr = getParentEdgesAtPort(0)[0]->getMemoryPtr();
auto wghMemPtr = getParentEdgesAtPort(1)[0]->getMemoryPtr();
auto dstMemPtr = getChildEdgesAtPort(0)[0]->getMemoryPtr();
if (!dstMemPtr || !dstMemPtr->isAllocated())
IE_THROW() << "Destination memory hasn't been allocated.";
if (!srcMemPtr || !srcMemPtr->isAllocated())
IE_THROW() << "Input memory hasn't been allocated.";
if (!wghMemPtr || !wghMemPtr->isAllocated())
IE_THROW() << "Weight memory hasn't been allocated.";
MemoryPtr biasMemPtr = nullptr;
if (withBiases) {
biasMemPtr = getParentEdgesAtPort(2)[0]->getMemoryPtr();
@@ -220,7 +225,7 @@ void FullyConnected::prepareParams() {
IE_THROW() << "Input memory hasn't been allocated.";
}
const NodeDesc *selected_pd = getSelectedPrimitiveDescriptor();
NodeDesc *selected_pd = getSelectedPrimitiveDescriptor();
if (selected_pd == nullptr)
IE_THROW() << "Preferable primitive descriptor is not set for node " << getName() << ".";
@@ -228,7 +233,7 @@ void FullyConnected::prepareParams() {
setPostOps(*attr, dstMemPtr->getStaticDims());
(*attr).set_scratchpad_mode(dnnl::scratchpad_mode::user);
DnnlMemoryDescCPtr weightDesc = wghMemPtr->GetDescWithType<DnnlMemoryDesc>();
DnnlMemoryDescPtr weightDesc = MemoryDescUtils::convertToDnnlMemoryDesc(weightDescIP);
DnnlMemoryDescCPtr biasDesc = nullptr;
if (biasMemPtr) {
biasDesc = biasMemPtr->GetDescWithType<DnnlMemoryDesc>();
@@ -237,60 +242,88 @@ void FullyConnected::prepareParams() {
DnnlMemoryDescCPtr inDesc = srcMemPtr->GetDescWithType<DnnlMemoryDesc>();
DnnlMemoryDescCPtr outDesc = dstMemPtr->GetDescWithType<DnnlMemoryDesc>();
useConv1x1 = canBeExecutedInConv1x1();
FCKey key = {inDesc,
weightDesc,
biasDesc,
outDesc,
*attr,
selected_pd->getImplementationType()};
implementationTypeIP,
useConv1x1};
auto engine = getEngine();
auto builder = [&engine](const FCKey& key) -> std::shared_ptr<dnnl::primitive> {
auto inDesc = key.inp0->getDnnlDesc();
if (inDesc.dims().size() == 3) {
auto inDims = inDesc.dims();
auto normalizedInDims = {inDims[0] * inDims[1], inDims[2]};
inDesc = inDesc.reshape(normalizedInDims);
}
auto builder = [&engine](const FCKey& key) -> executorPtr {
executorPtr execPtr = nullptr;
if (key.useConv1x1) {
auto desc = createDescriptorInternalForConv(key.inp0, key.inp1, key.bias, key.out);
primitive_desc_iterator itpd = desc.createPrimitiveDescriptorIterator(engine, key.attr);
convolution_forward::primitive_desc prim_desc;
auto outDesc = key.out->getDnnlDesc();
if (outDesc.dims().size() == 3) {
auto outDims = outDesc.dims();
auto normalizedOutDims = { outDims[0] * outDims[1], outDims[2] };
outDesc = outDesc.reshape(normalizedOutDims);
}
while (static_cast<bool>(itpd)) {
impl_desc_type impl_type = parse_impl_name(itpd.impl_info_str());
std::shared_ptr<dnnl::inner_product_forward::desc> fcDsc;
if (key.bias) {
fcDsc = std::make_shared<dnnl::inner_product_forward::desc>(dnnl::prop_kind::forward_scoring,
inDesc,
key.inp1->getDnnlDesc(),
key.bias->getDnnlDesc(),
outDesc);
} else {
fcDsc = std::make_shared<dnnl::inner_product_forward::desc>(dnnl::prop_kind::forward_scoring,
inDesc,
key.inp1->getDnnlDesc(),
outDesc);
}
DnnlDesriptor desc(fcDsc);
primitive_desc_iterator itpd = desc.createPrimitiveDescriptorIterator(engine, key.attr);
inner_product_forward::primitive_desc prim_desc;
while (static_cast<bool>(itpd)) {
impl_desc_type impl_type = parse_impl_name(itpd.impl_info_str());
if (impl_type == key.implType) {
prim_desc = itpd.get();
break;
if (impl_type == brgconv_avx512_1x1) {
prim_desc = itpd.get();
break;
}
if (!itpd.next_impl()) {
break;
}
}
if (!itpd.next_impl()) {
return nullptr;
if (prim_desc) {
execPtr = std::make_shared<ExecutorConv1x1>(prim_desc);
}
}
// fallback to the inner product path
if (!execPtr) {
auto inDesc = key.inp0->getDnnlDesc();
if (inDesc.dims().size() == 3) {
auto inDims = inDesc.dims();
auto normalizedInDims = {inDims[0] * inDims[1], inDims[2]};
inDesc = inDesc.reshape(normalizedInDims);
}
return std::make_shared<inner_product_forward>(prim_desc);
auto outDesc = key.out->getDnnlDesc();
if (outDesc.dims().size() == 3) {
auto outDims = outDesc.dims();
auto normalizedOutDims = { outDims[0] * outDims[1], outDims[2] };
outDesc = outDesc.reshape(normalizedOutDims);
}
std::shared_ptr<dnnl::inner_product_forward::desc> fcDsc;
if (key.bias) {
fcDsc = std::make_shared<dnnl::inner_product_forward::desc>(dnnl::prop_kind::forward_scoring,
inDesc,
key.inp1->getDnnlDesc(),
key.bias->getDnnlDesc(),
outDesc);
} else {
fcDsc = std::make_shared<dnnl::inner_product_forward::desc>(dnnl::prop_kind::forward_scoring,
inDesc,
key.inp1->getDnnlDesc(),
outDesc);
}
DnnlDesriptor desc(fcDsc);
primitive_desc_iterator itpd = desc.createPrimitiveDescriptorIterator(engine, key.attr);
inner_product_forward::primitive_desc prim_desc;
while (static_cast<bool>(itpd)) {
impl_desc_type impl_type = parse_impl_name(itpd.impl_info_str());
if (impl_type == key.implType) {
prim_desc = itpd.get();
break;
}
if (!itpd.next_impl()) {
return nullptr;
}
}
execPtr = std::make_shared<ExecutorInnerProduct>(prim_desc);
}
return execPtr;
};
auto cache = getRuntimeCache();
@@ -300,41 +333,62 @@ void FullyConnected::prepareParams() {
IE_THROW() << "Primitive descriptor was not found for node " << getName() << ".";
}
prim = result.first;
auto prevExecPtr = execPtr;
execPtr = result.first;
primArgs[DNNL_ARG_SRC] = srcMemPtr->GetPrimitive();
primArgs[DNNL_ARG_WEIGHTS] = wghMemPtr->GetPrimitive();
primArgs[DNNL_ARG_DST] = dstMemPtr->GetPrimitive();
if (withBiases) {
primArgs[DNNL_ARG_BIAS] = biasMemPtr->GetPrimitive();
}
appendPostOpArgs(*attr, primArgs, postOpsArgs);
auto pd = (*prim).get_primitive_desc();
auto scratchpadMem = getScratchPadMem(pd);
primArgs[DNNL_ARG_SCRATCHPAD] = scratchpadMem->GetPrimitive();
auto reshapeMemory = [this](int argType) {
auto param = primArgs.find(argType);
if (param != primArgs.end()) {
auto oldMem = param->second;
auto dims = oldMem.get_desc().dims();
if (dims.size() == 3) {
std::vector<dnnl::memory::dim> normalizedDims({dims[0] * dims[1], dims[2]});
dnnl::memory::desc newMemDesc(oldMem.get_desc().reshape(normalizedDims));
dnnl::memory newMem(newMemDesc, oldMem.get_engine(), oldMem.get_data_handle());
primArgs.at(argType) = newMem;
if (execPtr) {
// no executor yet or shapes changed
if (!prevExecPtr || prevExecPtr->getSrcDesc() != execPtr->getSrcDesc()) {
auto oldMem = srcMemPtr->GetPrimitive();
// fast path: the wanted desc matches the parent node output; typical for static shapes with inner product
if (execPtr->getSrcDesc() == inDesc->getDnnlDesc()) {
primArgs[DNNL_ARG_SRC] = std::move(oldMem);
} else {
primArgs[DNNL_ARG_SRC] = dnnl::memory(execPtr->getSrcDesc(), oldMem.get_engine(), oldMem.get_data_handle());
}
}
};
reshapeMemory(DNNL_ARG_SRC);
reshapeMemory(DNNL_ARG_DST);
if (!prevExecPtr || prevExecPtr->getDstDesc() != execPtr->getDstDesc()) {
auto oldMem = dstMemPtr->GetPrimitive();
if (execPtr->getDstDesc() == outDesc->getDnnlDesc()) {
primArgs[DNNL_ARG_DST] = std::move(oldMem);
} else {
primArgs[DNNL_ARG_DST] = dnnl::memory(execPtr->getDstDesc(), oldMem.get_engine(), oldMem.get_data_handle());
}
}
if (!prevExecPtr || prevExecPtr->getWeightDesc() != execPtr->getWeightDesc()) {
primArgs[DNNL_ARG_WEIGHTS] = prepareWeightMemory(DnnlExtensionUtils::makeDescriptor(execPtr->getWeightDesc()))->GetPrimitive();
}
// changed shapes may also cause the kernel type to change
selected_pd->setImplementationType(execPtr->getImplementationType());
// the expected 1x1 conv may not have been created; update the flag based on the real implementation type
useConv1x1 = execPtr->getImplementationType() == brgconv_avx512_1x1;
if (withBiases) {
primArgs[DNNL_ARG_BIAS] = biasMemPtr->GetPrimitive();
}
appendPostOpArgs(*attr, primArgs, postOpsArgs);
auto pd = execPtr->getPrimitiveDesc();
auto scratchpadMem = getScratchPadMem(pd);
primArgs[DNNL_ARG_SCRATCHPAD] = scratchpadMem->GetPrimitive();
#ifdef CPU_DEBUG_CAPS
if (result.second == CacheEntryBase::LookUpStatus::Miss) {
DEBUG_LOG("verbose##", getName(), "##", pd->info(), "\n");
}
#endif
} else {
IE_THROW() << "Executor is not created for node " << getName() << ".";
}
}
void FullyConnected::setDynamicBatchLim(int lim) {
dynBatchLim = lim;
if (!execPtr) {
IE_THROW() << "Can't set dynamic batch for FullyConnected node with name: " << getName() << ", because executor is not compiled";
}
if (execPtr->needReordering()) {
IE_THROW() << "Can't execute FullyConnected node with dynamic batch via executor with reorders";
}
auto setBatchPrimArgs = [this](int argType, const dnnl::memory& oldMem) {
dnnl::memory::desc newMemDesc(oldMem.get_desc());
@@ -350,31 +404,38 @@ void FullyConnected::setDynamicBatchLim(int lim) {
primArgs.at(argType) = dnnl::memory(newMemDesc, oldMem.get_engine(), oldMem.get_data_handle());
};
setBatchPrimArgs(DNNL_ARG_SRC, getParentEdgesAtPort(0)[0]->getMemory().GetPrimitive());
setBatchPrimArgs(DNNL_ARG_DST, getChildEdgesAtPort(0)[0]->getMemory().GetPrimitive());
if (useConv1x1) {
Node::setDynamicBatchLim(lim);
} else {
dynBatchLim = lim;
setBatchPrimArgs(DNNL_ARG_SRC, getParentEdgesAtPort(0)[0]->getMemory().GetPrimitive());
setBatchPrimArgs(DNNL_ARG_DST, getChildEdgesAtPort(0)[0]->getMemory().GetPrimitive());
}
}
void FullyConnected::execute(dnnl::stream strm) {
if (prim) {
// in cases like parameter -> FullyConnected or dynamic shapes,
// we keep the old data pointer in primArgs on the second iteration with the same input shapes
auto updateMemoryPtr = [this](int argType) {
auto param = primArgs.find(argType);
if (param != primArgs.end()) {
if (argType == DNNL_ARG_SRC && getInputShapeAtPort(DATA_ID).getRank() == 3) {
primArgs.at(argType).set_data_handle(getParentEdgesAtPort(0)[0]->getMemoryPtr()->GetData());
}
if (argType == DNNL_ARG_DST && getOutputShapeAtPort(0).getRank() == 3) {
primArgs.at(argType).set_data_handle(getChildEdgesAtPort(0)[0]->getMemoryPtr()->GetData());
}
}
};
updateMemoryPtr(DNNL_ARG_SRC);
updateMemoryPtr(DNNL_ARG_DST);
(*prim).execute(strm, primArgs);
if (!execPtr) {
IE_THROW() << "Can't execute FullyConnected node with name: " << getName() << ", because executor is not compiled";
}
// in cases like parameter -> FullyConnected or dynamic shapes,
// we keep the old data pointer in primArgs on the second iteration with the same input shapes
auto updateMemoryPtr = [this](int argType) {
auto param = primArgs.find(argType);
if (param != primArgs.end()) {
if (argType == DNNL_ARG_SRC && (getInputShapeAtPort(DATA_ID).getRank() == 3 || useConv1x1)) {
primArgs.at(argType).set_data_handle(getParentEdgesAtPort(0)[0]->getMemoryPtr()->GetData());
}
if (argType == DNNL_ARG_DST && (getOutputShapeAtPort(0).getRank() == 3 || useConv1x1)) {
primArgs.at(argType).set_data_handle(getChildEdgesAtPort(0)[0]->getMemoryPtr()->GetData());
}
}
};
updateMemoryPtr(DNNL_ARG_SRC);
updateMemoryPtr(DNNL_ARG_DST);
execPtr->exec(primArgs, strm);
}
void FullyConnected::executeDynamicImpl(dnnl::stream strm) {
@@ -647,6 +708,166 @@ InferenceEngine::Precision FullyConnected::getRuntimePrecision() const {
return getMaxPrecision(inputPrecisions);
}
void FullyConnected::initOptimalPrimitiveDescriptor() {
Node::initOptimalPrimitiveDescriptor();
auto selectedPD = getSelectedPrimitiveDescriptor();
implementationTypeIP = selectedPD->getImplementationType();
// if convolution is selected, the reorder for inner product is useless; the reorder for inner product will be done in prepareParams
auto constParent = getParentEdgeAt(1)->getParent();
auto selectedParentPD = constParent->getSelectedPrimitiveDescriptor();
auto config = selectedPD->getConfig();
weightDescIP = config.inConfs[1].getMemDesc();
config.inConfs[1].setMemDesc(selectedParentPD->getConfig().outConfs[0].getMemDesc());
selectedPD->setConfig(config);
}
DnnlDesriptor FullyConnected::createDescriptorInternalForConv(DnnlMemoryDescCPtr inputDescPtr,
DnnlMemoryDescCPtr weightDescPtr,
DnnlMemoryDescCPtr biasDescPtr,
DnnlMemoryDescCPtr outputDescPtr) {
const dnnl::memory::desc &inputDesc = inputDescPtr->getDnnlDesc();
const dnnl::memory::desc &outputDesc = outputDescPtr->getDnnlDesc();
const dnnl::memory::desc &weightDesc = weightDescPtr->getDnnlDesc();
// make a fake shape: N, IC, W
auto inDims = inputDesc.dims();
dnnl::memory::dims normalizedInDims;
if (inDims.size() == 3) {
normalizedInDims = {inDims[0], inDims[2], inDims[1]};
} else if (inDims.size() == 2) {
normalizedInDims = {dnnl::memory::dim{1}, inDims[1], inDims[0]};
}
auto convInDesc = dnnl::memory::desc(normalizedInDims, inputDesc.data_type(), memory::format_tag::nwc);
// make a fake shape: N, OC, W
auto outDims = outputDesc.dims();
dnnl::memory::dims normalizedOutDims;
if (outDims.size() == 3) {
normalizedOutDims = { outDims[0], outDims[2], outDims[1]};
} else if (outDims.size() == 2) {
normalizedOutDims = { dnnl::memory::dim{1}, outDims[1], outDims[0]};
}
auto convOutDesc = dnnl::memory::desc(normalizedOutDims, outputDesc.data_type(), memory::format_tag::nwc);
// make a fake shape: OC, IC, 1
auto weightDims = weightDesc.dims();
dnnl::memory::dims normalizedWeightDims;
normalizedWeightDims = {static_cast<dnnl::memory::dim>(weightDims[0]),
static_cast<dnnl::memory::dim>(weightDims[1]),
dnnl::memory::dim{1}};
auto convWeightDescAny = dnnl::memory::desc(normalizedWeightDims, weightDesc.data_type(), dnnl::memory::format_tag::any);
std::shared_ptr<dnnl::convolution_forward::desc> desc;
if (biasDescPtr) {
desc = std::make_shared<dnnl::convolution_forward::desc>(prop_kind::forward_scoring, dnnl::algorithm::convolution_direct,
convInDesc, convWeightDescAny, biasDescPtr->getDnnlDesc(), convOutDesc,
dnnl::memory::dims{1}, // stride
dnnl::memory::dims{0}, // dilation
dnnl::memory::dims{0}, // paddingL
dnnl::memory::dims{0}); // paddingR
} else {
desc = std::make_shared<dnnl::convolution_forward::desc>(prop_kind::forward_scoring, dnnl::algorithm::convolution_direct,
convInDesc, convWeightDescAny, convOutDesc,
dnnl::memory::dims{1}, // stride
dnnl::memory::dims{0}, // dilation
dnnl::memory::dims{0}, // paddingL
dnnl::memory::dims{0}); // paddingR
}
return DnnlDesriptor(desc);
}
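To make the fake-shape mapping concrete, an illustrative sketch (the dims come from one of the 2D test shapes added below; this is not literal code from the patch):

// FC problem: src {256, 188} = M x K, weights {120, 188} = N x K, dst {256, 120} = M x N,
// mapped onto a 1D convolution with kernel width 1:
dnnl::memory::dims convSrcDims {1, 188, 256};  // N, IC, W  (nwc)
dnnl::memory::dims convWeiDims {120, 188, 1};  // OC, IC, KW (format_tag::any)
dnnl::memory::dims convDstDims {1, 120, 256};  // N, OC, W  (nwc)
// stride {1}, dilation {0}, paddingL/paddingR {0}, exactly as built above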
bool FullyConnected::canBeExecutedInConv1x1() const {
bool retVal = false;
const auto inRank = getInputShapeAtPort(DATA_ID).getRank();
const auto weightRank = getInputShapeAtPort(WEIGHTS_ID).getRank();
// disable rank=4:
// if the layout is nhwc:
//   A matrix: N * IC * H * W --> N * (IC*H*W); the M, N', K of the matrix multiply become
//   M = 1, K = (IC*H*W); with M = 1 it is not efficient, since it acts as a vector multiply
// if the layout is nchw/nChw16c: brg1x1 does not support it; although jit supports it,
//   it would have the same problem as above
if (dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx512_core) &&
getOriginalInputPrecisionAtPort(DATA_ID) == InferenceEngine::Precision::FP32 &&
one_of(inRank, 2, 3) && weightRank == 2) {
auto dstMemPtr = getChildEdgesAtPort(0)[0]->getMemoryPtr();
DnnlMemoryDescCPtr outDesc = dstMemPtr->GetDescWithType<DnnlMemoryDesc>();
// brg convolution does not support stride
if (outDesc->getDnnlDesc().data.offset0 == 0)
retVal = true;
}
if (retVal) {
auto srcMemPtr = getParentEdgesAtPort(0)[0]->getMemoryPtr();
const auto& srcDims = srcMemPtr->getStaticDims();
auto weightMemPtr = getParentEdgesAtPort(1)[0]->getMemoryPtr();
const auto& weightDims = weightMemPtr->getStaticDims();
Dim M, N, K;
M = srcDims[inRank - 2];
K = srcDims[inRank - 1];
N = weightDims[0];
if (!(M >= 49 && M <= 3136 &&
K >= 96 && K <= 4096 &&
N >= 96 && N <= K * 4))
retVal = false;
}
return retVal;
}
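For illustration, how the M/K/N window above plays out on a few of the shapes exercised by the tests in this patch:

// eligible:  M=49,  K=96,  N=96   ({49, 96} x {96, 96}: the lower corner of the window)
// eligible:  M=256, K=188, N=120  ({256, 188} x {188, 120})
// rejected:  M=39 < 49            ({39, 120} x {120, 120}: stays on the inner product path)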
FullyConnected::ExecutorInnerProduct::ExecutorInnerProduct(const dnnl::inner_product_forward::primitive_desc& pd) {
execPrim.reset(new dnnl::inner_product_forward(pd));
}
FullyConnected::ExecutorConv1x1::ExecutorConv1x1(const dnnl::convolution_forward::primitive_desc& pd) {
execPrim.reset(new dnnl::convolution_forward(pd));
}
MemoryPtr FullyConnected::prepareWeightMemory(DnnlMemoryDescPtr weightDesc) {
if (!getParentEdgeAt(1)->getParent()->isConstant())
IE_THROW() << "Weight input is not const for node " << getName() << ".";
auto blob = getParentEdgeAt(1)->getMemoryPtr();
if (!blob)
IE_THROW() << "Cannot get const weights blob for node " << getName() << ".";
auto constDnnlMemOutDesc = blob->GetDescWithType<DnnlMemoryDesc>();
auto weightSrcDesc = constDnnlMemOutDesc->getDnnlDesc();
weightSrcDesc = weightSrcDesc.reshape(weightDesc->getDnnlDesc().dims());
auto create = [&] () {
auto newSrcDesc = DnnlExtensionUtils::makeDescriptor(weightSrcDesc);
Memory srcMemory{ getEngine() };
srcMemory.Create(newSrcDesc, blob->GetData());
MemoryPtr _ptr = std::make_shared<Memory>(getEngine());
_ptr->Create(weightDesc);
node::Reorder::reorderData(srcMemory, *_ptr, getRuntimeCache());
return _ptr;
};
MemoryPtr ptr;
const auto& format = weightDesc->serializeFormat();
auto itr = privateWeightCache.find(format);
if (privateWeightCache.end() != itr) {
ptr = itr->second;
} else {
if (weightCache != nullptr) {
const std::string string_hash = getName() + "_" + format
+ "_" + std::to_string(blob->GetSize())
+ "_" + std::to_string(reinterpret_cast<uint64_t>(blob->GetData()));
ptr = *weightCache->findOrCreate(string_hash, create);
} else {
ptr = create();
}
privateWeightCache[format] = ptr;
}
return ptr;
}
} // namespace node
} // namespace intel_cpu
} // namespace ov


@@ -9,6 +9,7 @@
#include <memory>
#include <string>
#include <vector>
#include "common/dnnl_executor.h"
namespace ov {
namespace intel_cpu {
@@ -40,6 +41,7 @@ public:
}
void initSupportedPrimitiveDescriptors() override;
void initOptimalPrimitiveDescriptor() override;
std::shared_ptr<MemoryDesc> getSrcMemDesc(dnnl::primitive_desc_iterator &primitive_desc_it, size_t idx) override;
std::shared_ptr<MemoryDesc> getDstMemDesc(dnnl::primitive_desc_iterator &primitive_desc_it, size_t idx) override;
@@ -75,6 +77,35 @@ private:
static const size_t WEIGHTS_ID = 1;
static const size_t BIAS_ID = 2;
dnnl::memory::data_type outputDataType;
using executorPtr = std::shared_ptr<DnnlExecutor>;
executorPtr execPtr = nullptr;
bool useConv1x1 = false;
impl_desc_type implementationTypeIP;
MemoryDescPtr weightDescIP;
// When weightCache is not enabled (e.g. stream=1), brgconv weights may change with
// different shapes, so they are cached in privateWeightCache.
// When weightCache is enabled, privateWeightCache still keeps the weight ptr alive,
// since weightCache itself does not hold a strong reference.
std::unordered_map<std::string, MemoryPtr> privateWeightCache;
class ExecutorInnerProduct : public DnnlExecutor {
public:
ExecutorInnerProduct(const dnnl::inner_product_forward::primitive_desc& pd);
};
class ExecutorConv1x1 : public DnnlExecutor {
public:
ExecutorConv1x1(const dnnl::convolution_forward::primitive_desc& pd);
};
static DnnlDesriptor createDescriptorInternalForConv(DnnlMemoryDescCPtr inputDescPtr,
DnnlMemoryDescCPtr weightDescPtr,
DnnlMemoryDescCPtr biasDescPtr,
DnnlMemoryDescCPtr outputDescPtr);
bool canBeExecutedInConv1x1() const;
MemoryPtr prepareWeightMemory(const DnnlMemoryDescPtr weightDesc);
};
} // namespace node


@@ -412,7 +412,7 @@ std::string Reorder::getReorderArgs(const MemoryDesc &parentDesc, const MemoryDe
return inArgs + "_" + outArgs;
}
void Reorder::reorderData(const Memory &input, const Memory &output) {
void Reorder::reorderData(const Memory &input, const Memory &output, MultiCachePtr cache) {
if (!input.getDesc().isDefined() || !output.getDesc().isDefined())
IE_THROW() << "Can't reorder data with dynamic shapes";
@@ -427,17 +427,44 @@ void Reorder::reorderData(const Memory &input, const Memory &output) {
auto copySize = output.GetSize();
cpu_memcpy(dstPtr, srcPtr, copySize);
} else {
std::unique_ptr<dnnl::reorder> pReorder;
dnnl::memory srcMemory;
auto getReorder = [] (MultiCachePtr& cache, const dnnl::memory& srcMemory, const dnnl::memory& dstMemory)
-> std::shared_ptr<dnnl::reorder> {
const auto& engine = dstMemory.get_engine();
auto builder = [&engine](const ReorderKey& key) -> std::shared_ptr<dnnl::reorder> {
dnnl::primitive_attr attr;
reorder::primitive_desc pd = dnnl::reorder::primitive_desc(engine, key.src, engine, key.dest, attr, true);
DEBUG_LOG(key.src, "->", key.dest);
if (!pd)
return nullptr;
return std::make_shared<dnnl::reorder>(pd);
};
std::shared_ptr<dnnl::reorder> reorder;
auto src_desc = srcMemory.get_desc();
auto dst_desc = dstMemory.get_desc();
ReorderKey key = {src_desc, dst_desc};
if (!cache) {
reorder = builder(key);
} else {
auto result = cache->getOrCreate(key, builder);
reorder = std::move(result.first);
}
return reorder;
};
std::shared_ptr<dnnl::reorder> pReorder;
std::vector<uint8_t> tmpBuff;
try {
pReorder = std::unique_ptr<dnnl::reorder>(new dnnl::reorder(input.GetPrimitive(), output.GetPrimitive()));
srcMemory = input.GetPrimitive();
}
catch (const dnnl::error& err) {
if (dnnl_unimplemented == err.status && output.GetDataType() != input.GetDataType() && Convert::isSupportedDesc(input.getDesc()) &&
Convert::isSupportedDesc(output.getDesc())) {
auto srcMemory = input.GetPrimitive();
auto dstMemory = output.GetPrimitive();
auto engine = output.getEngine();
// try a direct reorder first
pReorder = getReorder(cache, srcMemory, dstMemory);
if (!pReorder) {
// try precision conversion then do the reorder
if (output.GetDataType() != input.GetDataType() && Convert::isSupportedDesc(input.getDesc()) &&
Convert::isSupportedDesc(output.getDesc())) {
// we probably could not create the reorder because no implementation supports this precision conversion;
// let's try to convert the data first using cpu_convert
auto data = static_cast<const uint8_t *>(input.GetPtr());
@@ -447,19 +474,20 @@ void Reorder::reorderData(const Memory &input, const Memory &output) {
cpu_convert(data, tmpBuff.data(), DnnlExtensionUtils::DataTypeToIEPrecision(input.GetDataType()),
outPrc, input.GetSize() / input.getDesc().getPrecision().size());
Memory tmpMem(output.getEngine());
Memory tmpMem(engine);
auto tmpDesc = input.getDesc().cloneWithNewPrecision(outPrc);
tmpMem.Create(std::move(tmpDesc), tmpBuff.data());
pReorder = std::unique_ptr<dnnl::reorder>(new dnnl::reorder(tmpMem.GetPrimitive(), output.GetPrimitive()));
srcMemory = tmpMem.GetPrimitive();
} else {
throw;
pReorder = getReorder(cache, srcMemory, dstMemory);
}
if (!pReorder) {
IE_THROW() << "No reorder available for the following tensor descriptors: "
<< input.getDesc().serializeFormat() << " and " << output.getDesc().serializeFormat();
}
}
if (pReorder) {
dnnl::stream loc_stream(output.getEngine(), dnnl::stream::flags::in_order);
auto dstMemory = output.GetPrimitive();
dnnl::stream loc_stream(engine, dnnl::stream::flags::in_order);
pReorder->execute(loc_stream, srcMemory, dstMemory);
} else {
IE_THROW() << "Could not make onednn reorder.";


@@ -65,7 +65,7 @@ public:
static std::string getReorderArgs(const MemoryDesc &parentDesc, const MemoryDesc &childDesc);
static void reorderData(const Memory &input, const Memory &output);
static void reorderData(const Memory &input, const Memory &output, MultiCachePtr cache = nullptr);
private:
std::shared_ptr<MemoryDesc> input;


@@ -14,7 +14,7 @@ Primitive::operator bool() const {
return prim ? true : false;
}
dnnl::primitive Primitive::operator*() {
dnnl::primitive Primitive::operator*() const {
return *prim;
}


@@ -19,7 +19,7 @@ public:
Primitive();
operator bool() const;
Primitive& operator=(const std::shared_ptr<dnnl::primitive>& primitive);
dnnl::primitive operator*();
dnnl::primitive operator*() const;
void reset(dnnl::primitive* primitive);
private:


@@ -248,6 +248,16 @@ std::vector<CPUSpecificParams> filterSpecificParams_BrgemmAmx() {
return specificParams;
}
std::vector<CPUSpecificParams> filterSpecificParams_Brgconv1x1() {
std::vector<CPUSpecificParams> specificParams;
if (with_cpu_x86_avx512_core()) {
specificParams.push_back(CPUSpecificParams{{}, {}, {"brgconv_avx512_1x1"}, "brgconv_avx512_1x1"});
}
return specificParams;
}
/* ============= FullyConnected ============= */
namespace fullyConnected {
@@ -534,7 +544,7 @@ INSTANTIATE_TEST_SUITE_P(nightly_FC_3D, MatMulLayerCPUTest, testParams3D_nightly
INSTANTIATE_TEST_SUITE_P(nightly_FC_3D_BF16, MatMulLayerCPUTest, testParams3DBF16_nightly, MatMulLayerCPUTest::getTestCaseName);
const std::vector<ShapeRelatedParams> IS2D_Brgemm_smoke = {
{static_shapes_to_test_representation({{59, 120}, {120, 120}}), {true, false}},
{static_shapes_to_test_representation({{39, 120}, {120, 120}}), {true, false}},
{static_shapes_to_test_representation({{59, 16}, {16, 120}}), {true, false}},
{static_shapes_to_test_representation({{59, 16}, {16, 120}}), {true, true}},
@@ -596,6 +606,46 @@ const auto testParams2D_Brgemm_smoke = ::testing::Combine(fullyConnectedParams2D
INSTANTIATE_TEST_SUITE_P(smoke_FC_2D_Brgemm, MatMulLayerCPUTest, testParams2D_Brgemm_smoke, MatMulLayerCPUTest::getTestCaseName);
const std::vector<ShapeRelatedParams> IS2D_Brgconv1x1_smoke = {
{static_shapes_to_test_representation({{49, 96}, {96, 96}}), {true, false}},
{static_shapes_to_test_representation({{256, 188}, {188, 120}}), {true, false}},
{static_shapes_to_test_representation({{256, 188}, {188, 120}}), {true, true}},
{static_shapes_to_test_representation({{71, 128}, {128, 200}}), {false, false}},
{static_shapes_to_test_representation({{71, 128}, {128, 200}}), {false, true}},
{
{
{{-1, -1}, {{49, 96}, {59, 96}, {69, 96}, {79, 96}}},
{{96, 96}, {{96, 96}, {96, 96}, {96, 96}, {96, 96}}}
},
{false, false}
},
{
{
{{{0, 200}, {0, 200}}, {{98, 128}, {199, 128}}},
{{128, 166}, {{128, 166}, {128, 166}}}
},
{true, true}
},
};
const auto fullyConnectedParams2D_Brgconv1x1_smoke = ::testing::Combine(::testing::ValuesIn(IS2D_Brgconv1x1_smoke),
::testing::Values(ElementType::f32),
::testing::Values(ElementType::undefined),
::testing::Values(ElementType::undefined),
::testing::Values(helpers::InputLayerType::CONSTANT),
::testing::Values(CommonTestUtils::DEVICE_CPU),
::testing::Values(emptyAdditionalConfig));
const auto testParams2D_Brgconv1x1_smoke = ::testing::Combine(fullyConnectedParams2D_Brgconv1x1_smoke,
::testing::Values(MatMulNodeType::FullyConnected),
::testing::ValuesIn(fusingParamsSet2D_Brgemm_smoke),
::testing::ValuesIn(filterSpecificParams_Brgconv1x1()));
INSTANTIATE_TEST_SUITE_P(smoke_FC_2D_Brgconv1x1, MatMulLayerCPUTest, testParams2D_Brgconv1x1_smoke, MatMulLayerCPUTest::getTestCaseName);
const auto fullyConnectedParams2D_Brgemm_Amx_smoke = ::testing::Combine(::testing::ValuesIn(IS2D_Brgemm_smoke),
::testing::Values(ElementType::f32),
::testing::Values(ElementType::undefined),