[CPU] Optimize post ops processing

This commit is contained in:
dmitrygo
2022-03-11 16:39:02 +03:00
committed by Luo Cheng
parent 8ee5514629
commit 5e1a5aef3e
3 changed files with 151 additions and 65 deletions

View File

@@ -513,7 +513,6 @@ void Convolution::getSupportedDescriptors() {
void Convolution::setPostOps(dnnl::primitive_attr &attr, const VectorDims &dims, bool initWeights = false) {
dnnl::post_ops ops;
const bool useLegacyPostOps = true; // @todo remove after issue with performance of binary post ops fixed
auto getBinPostOpShape = [&](){
const auto outShape = getOutputShapeAtPort(0).getStaticDims();
@@ -536,7 +535,7 @@ void Convolution::setPostOps(dnnl::primitive_attr &attr, const VectorDims &dims,
}
ops.append_sum(1.0, DnnlExtensionUtils::IEPrecisionToDataType(eltwisePrecision));
} else {
if (useLegacyPostOps || eltwiseNode->getOneDnnAlgorithm() != dnnl::algorithm::undef) {
if (eltwiseNode->getOneDnnAlgorithm() != dnnl::algorithm::undef) {
eltwiseNode->appendPostOps(ops, dims, postOpsArgs);
} else {
eltwiseNode->appendBinPostOps(ops, getBinPostOpShape(), postOpsArgs);
@@ -546,83 +545,111 @@ void Convolution::setPostOps(dnnl::primitive_attr &attr, const VectorDims &dims,
}
if (auto* fakeQuantizeNode = dynamic_cast<FakeQuantize *>(node.get())) {
if (useLegacyPostOps) {
if (i == 0) {
bool hasSubsequentSum = false;
bool hasSubsequentFQ = false;
for (int j = i + 1; j < fusedWith.size(); j++) {
auto &nextNode = fusedWith[j];
if (i == 0) {
bool hasSubsequentSum = false;
bool hasSubsequentFQ = false;
for (int j = i + 1; j < fusedWith.size(); j++) {
auto &nextNode = fusedWith[j];
auto *nextEltwiseNode = dynamic_cast<Eltwise *>(nextNode.get());
if (nextEltwiseNode && nextEltwiseNode->isSpecialConvolutionAddFusing()) {
hasSubsequentSum = true;
}
auto *nextQuantizeNode = dynamic_cast<FakeQuantize *>(nextNode.get());
if (nextQuantizeNode) {
hasSubsequentFQ = true;
}
auto *nextEltwiseNode = dynamic_cast<Eltwise *>(nextNode.get());
if (nextEltwiseNode && nextEltwiseNode->isSpecialConvolutionAddFusing()) {
hasSubsequentSum = true;
}
if (fakeQuantizeNode->getAlgorithm() == Algorithm::FQCommon &&
hasSubsequentSum &&
hasSubsequentFQ) {
std::vector<float> fqScale = fakeQuantizeNode->getFQScales();
if (!fqScale.empty()) {
size_t size = fqScale.size();
size_t OC = getOutputShapeAtPort(0).getStaticDims()[1];
if (size == 1) {
fqScale.resize(OC);
for (size_t k = 0; k < OC; k++)
fqScale[k] = fqScale[0];
}
attr.set_output_scales(1 << 1, fqScale);
continue;
}
auto *nextQuantizeNode = dynamic_cast<FakeQuantize *>(nextNode.get());
if (nextQuantizeNode) {
hasSubsequentFQ = true;
}
}
if (node == fusedWith[fusedWith.size() - 1] &&
outputDataType == memory::data_type::u8 &&
fakeQuantizeNode->getAlgorithm() == Algorithm::FQQuantization
/*levels == 256*/) {
auto &cl = fakeQuantizeNode->getCropLow();
auto &isc = fakeQuantizeNode->getInputScale();
auto &ish = fakeQuantizeNode->getInputShift();
if (fakeQuantizeNode->getAlgorithm() == Algorithm::FQCommon &&
hasSubsequentSum &&
hasSubsequentFQ) {
std::vector<float> fqScale = fakeQuantizeNode->getFQScales();
if (!fqScale.empty()) {
size_t size = fqScale.size();
size_t OC = getOutputShapeAtPort(0).getStaticDims()[1];
if (size == 1) {
fqScale.resize(OC);
for (size_t k = 0; k < OC; k++)
fqScale[k] = fqScale[0];
}
if (std::all_of(cl.cbegin(), cl.cend(), [](float val) { return val == 0.0f; }) &&
std::all_of(isc.cbegin(), isc.cend(), [&](float val) { return val == isc[0]; }) &&
std::all_of(ish.cbegin(), ish.cend(), [&](float val) { return val == ish[0]; })) {
ops.append_eltwise(1.0f, dnnl::algorithm::eltwise_linear, isc[0], ish[0]);
attr.set_output_scales(1 << 1, fqScale);
continue;
}
// } else if (std::all_of(cl.cbegin(), cl.cend(), [](float val) { return val == 0.0f; })) {
// std::vector<float> new_isc = isc;
// new_isc.resize(rnd_up(isc.size(), 16), 0);
//
// std::vector<float> new_ish = ish;
// new_ish.resize(rnd_up(ish.size(), 16), 0);
//
// fakeQuantizeNode->setInputScale(new_isc);
// fakeQuantizeNode->setInputShift(new_ish);
//
// ops.append_depthwise(mkldnn::algorithm::depthwise_scale_shift,
// &fakeQuantizeNode->getInputScale()[0],
// &fakeQuantizeNode->getInputShift()[0]);
//
// continue;
// }
}
fakeQuantizeNode->appendPostOps(ops, dims, postOpsArgs);
if (node == fusedWith[fusedWith.size() - 1]) {
auto &cl = fakeQuantizeNode->getCropLow();
auto &ch = fakeQuantizeNode->getCropHigh();
auto &isc = fakeQuantizeNode->getInputScale();
auto &ish = fakeQuantizeNode->getInputShift();
auto &osc = fakeQuantizeNode->getOutputScale();
auto &osh = fakeQuantizeNode->getOutputShift();
if (fakeQuantizeNode->getAlgorithm() == Algorithm::FQQuantization) {
if (outputDataType == memory::data_type::u8 &&
std::all_of(cl.cbegin(), cl.cend(), [](float val) { return val == 0.0f; }) &&
std::all_of(ish.cbegin(), ish.cend(), [](float val) { return val == 0.0f; })) {
std::vector<float> outScale = isc;
if (!outScale.empty()) {
size_t size = outScale.size();
size_t OC = getOutputShapeAtPort(0).getStaticDims()[1];
if (size == 1) {
outScale.resize(OC);
for (size_t k = 0; k < OC; k++)
outScale[k] = outScale[0];
}
continue;
} else {
fakeQuantizeNode->appendBinPostOps(ops, getBinPostOpShape(), postOpsArgs);
attr.set_output_scales(1 << 1, outScale);
continue;
}
}
}
if (outputDataType == memory::data_type::s8 &&
std::all_of(ish.cbegin(), ish.cend(), [](float val) { return std::abs(val - 128.f) < 0.0001f; }) &&
std::all_of(osc.cbegin(), osc.cend(), [](float val) { return val == 1.f; }) &&
std::all_of(osh.cbegin(), osh.cend(), [](float val) { return std::abs(val + 128.f) < 0.0001f; })) {
bool isCropAligned = true;
for (int i = 0; i < std::max(cl.size(), isc.size()); i++) {
if (std::abs(cl[cl.size() == 1 ? 0 : i] * isc[isc.size() == 1 ? 0 : i] + 128.f) > 0.0001f) {
isCropAligned = false;
}
}
for (int i = 0; i < std::max(ch.size(), isc.size()); i++) {
if (std::abs(ch[ch.size() == 1 ? 0 : i] * isc[isc.size() == 1 ? 0 : i] - 127.f) > 0.0001f) {
isCropAligned = false;
}
}
if (isCropAligned) {
std::vector<float> outScale = isc;
if (!outScale.empty()) {
size_t size = outScale.size();
size_t OC = getOutputShapeAtPort(0).getStaticDims()[1];
if (size == 1) {
outScale.resize(OC);
for (size_t k = 0; k < OC; k++)
outScale[k] = outScale[0];
}
attr.set_output_scales(1 << 1, outScale);
continue;
}
}
}
}
}
fakeQuantizeNode->appendBinPostOpsOptimized(ops, getBinPostOpShape(), postOpsArgs,
node == fusedWith[fusedWith.size() - 1], outputDataType);
continue;
}
auto* convolutionNode = dynamic_cast<Convolution *>(node.get());

View File

@@ -1919,6 +1919,63 @@ void FakeQuantize::appendBinPostOps(dnnl::post_ops& ops, const VectorDims& postO
appendBinary(dnnl::algorithm::binary_add, outputShiftSize, outputShiftMemory, &outputShiftData.shifts_[0]);
}
// Appends this FakeQuantize node to `ops` as oneDNN post ops, choosing a cheaper encoding
// than appendBinPostOps (above) when the quantization parameters allow it.
//
// Fast paths apply only when this node is the LAST fused post op, the primitive's output type
// is u8, and the algorithm is FQQuantization (the "/*levels == 256*/" note suggests this path
// presumes 256-level quantization -- TODO confirm):
//   1) crop-low all zero AND scalar-uniform input scale/shift -> a single eltwise_linear
//      post op (y = isc[0]*x + ish[0]);
//   2) crop-low all zero (per-channel scale/shift) -> binary mul + add only, crop stages dropped.
// Otherwise the full binary chain is emitted: min/max crop, input mul/add, optional rounding,
// output mul/add.
//
// ops              - post-op list being built; entries are appended in order.
// postOpDims       - dims used to shape per-channel binary post-op memory descriptors.
// binaryPostOpsMem - receives newly created Memory objects backing the binary post-op data.
// isLastPostOp     - true when this node is the final entry in the fusion chain.
// outDataType      - final output data type of the fused primitive.
void FakeQuantize::appendBinPostOpsOptimized(dnnl::post_ops& ops, const VectorDims &postOpDims, std::vector<MemoryPtr>& binaryPostOpsMem,
bool isLastPostOp, dnnl::memory::data_type outDataType) {
// Binary post-op buffers need no extra alignment padding.
static const size_t bufferAlignment = 1;
initializePostOpData(postOpDims, bufferAlignment);
// All-ones shape used when a parameter is a single scalar broadcast over the output.
VectorDims broadcastBinaryShape(postOpDims.size(), 1);
// Appends one binary post op; lazily creates the backing Memory the first time each
// parameter is used and registers it in binaryPostOpsMem so it outlives the primitive setup.
auto appendBinary = [&](const dnnl::algorithm alg, const size_t dataSize, MemoryPtr &memPtr, const void *data) {
DnnlBlockedMemoryDesc memoryDesc(Precision::FP32, dataSize == 1 ? Shape(broadcastBinaryShape) : Shape(postOpDims));
ops.append_binary(alg, memoryDesc.getDnnlDesc());
if (!memPtr) {
memPtr.reset(new Memory(getEngine()));
memPtr->Create(memoryDesc, data);
binaryPostOpsMem.push_back(memPtr);
}
};
// quantize+dequantize needs an explicit rounding step between the input and output affine
// stages; plain quantize does not.
dnnl::algorithm alg = getAlgorithm() == Algorithm::FQCommon || getAlgorithm() == Algorithm::FQRequantization
? dnnl::algorithm::quantization_quantize_dequantize
: dnnl::algorithm::quantization_quantize;
if (isLastPostOp &&
outDataType == memory::data_type::u8 &&
getAlgorithm() == Algorithm::FQQuantization
/*levels == 256*/) {
auto &cl = getCropLow();
auto &isc = getInputScale();
auto &ish = getInputShift();
// NOTE(review): isc[0]/ish[0] are read without an emptiness check; std::all_of is
// trivially true on empty ranges, so this assumes the scale/shift vectors are
// non-empty -- confirm upstream guarantees.
if (std::all_of(cl.cbegin(), cl.cend(), [](float val) { return val == 0.0f; }) &&
std::all_of(isc.cbegin(), isc.cend(), [&](float val) { return val == isc[0]; }) &&
std::all_of(ish.cbegin(), ish.cend(), [&](float val) { return val == ish[0]; })) {
// Zero crop-low and uniform scale/shift: collapse the whole FQ to one affine eltwise.
// Only crop-low is checked here -- presumably u8 output saturation supplies the
// upper clamp; TODO confirm.
ops.append_eltwise(1.0f, dnnl::algorithm::eltwise_linear, isc[0], ish[0]);
return;
} else if (std::all_of(cl.cbegin(), cl.cend(), [](float val) { return val == 0.0f; })) {
// Zero crop-low but per-channel scale/shift: keep mul+add, drop both crop stages.
appendBinary(dnnl::algorithm::binary_mul, inputScaleSize, inputScaleMemory, &inputScaleData.scales_[0]);
appendBinary(dnnl::algorithm::binary_add, inputShiftSize, inputShiftMemory, &inputShiftData.shifts_[0]);
return;
}
}
// General path: clamp to [cropLow, cropHigh] (min against high, then max against low),
// apply input scale/shift, round only when a dequantize stage follows, then output scale/shift.
appendBinary(dnnl::algorithm::binary_min, cropHighSize, cropHighMemory, &cropHighData.shifts_[0]);
appendBinary(dnnl::algorithm::binary_max, cropLowSize, cropLowMemory, &cropLowData.shifts_[0]);
appendBinary(dnnl::algorithm::binary_mul, inputScaleSize, inputScaleMemory, &inputScaleData.scales_[0]);
appendBinary(dnnl::algorithm::binary_add, inputShiftSize, inputShiftMemory, &inputShiftData.shifts_[0]);
if (alg == dnnl::algorithm::quantization_quantize_dequantize) {
ops.append_eltwise(1.0f, dnnl::algorithm::eltwise_round_half_to_even, 0, 0);
}
appendBinary(dnnl::algorithm::binary_mul, outputScaleSize, outputScaleMemory, &outputScaleData.scales_[0]);
appendBinary(dnnl::algorithm::binary_add, outputShiftSize, outputShiftMemory, &outputShiftData.shifts_[0]);
}
FakeQuantize::FakeQuantizeJitExecutor::FakeQuantizeJitExecutor(const jit_quantize_params &_jqp) {
bool isBinarization = _jqp.op_type == Algorithm::FQBinarization;
if (mayiuse(cpu::x64::avx512_core)) {

View File

@@ -127,6 +127,8 @@ public:
void appendPostOps(dnnl::post_ops& ops, const VectorDims &postOpDims, std::vector<MemoryPtr>& postOpsMem) override;
void appendPostOps(dnnl::post_ops& ops, const VectorDims &postOpDims, std::vector<const void*>& postOpsMem) override;
void appendBinPostOps(dnnl::post_ops& ops, const VectorDims &postOpDims, std::vector<MemoryPtr>& binaryPostOpsMem) override;
void appendBinPostOpsOptimized(dnnl::post_ops& ops, const VectorDims &postOpDims, std::vector<MemoryPtr>& binaryPostOpsMem,
bool isLastPostOp, dnnl::memory::data_type outDataType);
static bool isSupportedOperation(const std::shared_ptr<const ngraph::Node>& op, std::string& errorMessage) noexcept;