Optimize FullyConnected FakeQuantize post-ops (#11819)
* Optimize FullyConnected FakeQuantize post-ops
* MatMul bias fuse
* Add simplifyToScale for FakeQuantize and use it in FC and Conv
* Add FakeQuantize documentation
* Update doc and fix accuracy issue
* Update doc
* Fix accuracy regression
* Generalize the judgment criteria for fake quantization with scale
* Update document

Co-authored-by: Zhang Yi3 <yi3.zhang@intel.com>
Co-authored-by: xuchen-intel <chen.xu@intel.com>
Parent: 35ee842446
Commit: a571539107
```diff
@@ -157,6 +157,7 @@ std::shared_ptr<opset1::FakeQuantize> FakeQuantizeTransformation::fuseElementwis
     if (ov::is_type<opset1::Convolution>(fq::getDataNode(eltwise)) ||
         ov::is_type<opset1::GroupConvolution>(fq::getDataNode(eltwise)) ||
         ov::is_type<opset1::ConvolutionBackpropData>(fq::getDataNode(eltwise)) ||
+        ov::is_type<opset1::MatMul>(fq::getDataNode(eltwise)) ||
         ov::is_type<opset1::GroupConvolutionBackpropData>(fq::getDataNode(eltwise))) {
         return nullptr;
     }
```
New file: src/plugins/intel_cpu/src/docs/fake_quantize.md (184 lines)
# FakeQuantize in OpenVINO

https://docs.openvino.ai/latest/openvino_docs_ops_quantization_FakeQuantize_1.html

Definition:

```
if x <= min(input_low, input_high):
    output = output_low
elif x > max(input_low, input_high):
    output = output_high
else:
    # input_low < x <= input_high
    output = round((x - input_low) / (input_high - input_low) * (levels-1)) / (levels-1) * (output_high - output_low) + output_low
```

- x <= min(input_low, input_high): output = output_low
- x > max(input_low, input_high): output = output_high
- input_low < x <= input_high:

$$
\begin{align}
q &= round(\frac{x - il}{ih - il} * (levels-1)) \\
output &= q * \frac{oh - ol}{levels-1} + ol
\end{align}
$$

Simplified, assuming ih > il:

$$
\begin{align}
q &= round(\frac{(x - il)}{(ih - il)} * (levels-1)) \\
q &= clamp(q, 0, levels-1) \\
output &= q * \frac{(oh - ol)}{levels-1} + ol
\end{align}
$$
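
For illustration only, a minimal standalone C++ sketch of the simplified per-element formula above (not the plugin implementation; `il`/`ih`/`ol`/`oh` as defined above, assuming `ih > il`):

```cpp
#include <algorithm>
#include <cmath>

// Per-element FakeQuantize following the simplified formula above.
// il/ih: input_low/input_high, ol/oh: output_low/output_high.
float fake_quantize(float x, float il, float ih, float ol, float oh, int levels) {
    float q = std::round((x - il) / (ih - il) * (levels - 1));          // quantize
    q = std::min(std::max(q, 0.0f), static_cast<float>(levels - 1));    // clamp
    return q * (oh - ol) / (levels - 1) + ol;                           // dequantize
}
```
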
----------------------------

## Interpretation as Q+DQ

Give names to the parameters: scale (S) and shift (Z):

$$
\begin{align}
S_i &= \frac{ih - il}{levels-1} \\
Z_i &= \frac{-il}{S_i} \\
S_o &= \frac{oh - ol}{levels-1} \\
Z_{out} &= \frac{-ol}{S_o}
\end{align}
$$

Using these parameters, FQ becomes

$$
\begin{align}
q' &= round(x*\frac{1}{S_i} + Z_i) \tag{a}
\end{align}
$$

$$
\begin{align}
q_{U} &= clamp(q', 0, levels-1) \tag{b}
\end{align}
$$

$$
\begin{align}
output &= (q_{U} - Z_{out})* S_o \tag{c}
\end{align}
$$
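
A minimal C++ sketch of equations (a), (b) and (c), assuming the scale/shift parameters are already computed (illustrative only, not plugin code):

```cpp
#include <algorithm>
#include <cmath>

// Equations (a)-(c): quantize to an unsigned level index, clamp, then dequantize.
float fq_as_q_dq(float x, float S_i, float Z_i, float S_o, float Z_out, int levels) {
    float q  = std::round(x / S_i + Z_i);                                    // (a)
    float qU = std::min(std::max(q, 0.0f), static_cast<float>(levels - 1));  // (b)
    return (qU - Z_out) * S_o;                                               // (c)
}
```
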
$q_U$ is an unsigned quantized tensor. A small change can make it a signed quantized tensor:

$$
\begin{align}
Z_0 &= \frac{levels}{2} \\
q' &= round(x*\frac{1}{S_i} + Z_i - Z_0) \\
q_{I} &= clamp(q', -Z_0, Z_0-1) \\
output &= (q_{I} + Z_0 - Z_{out})* S_o
\end{align}
$$

Here the center value $Z_0$ is subtracted before the clamp to produce a signed quantized value $q_I$, and it is added back after the clamp for mathematical equivalence.

Notice:

- equation (a) is traditional quantization of x into q only if Zi is an integer;
- equation (c) is traditional dequantization only if Zo is an integer.

Thus inputLow/inputHigh/outputLow/outputHigh are gently tuned away from the statistical result to satisfy these requirements.

# Symmetric quantization

In symmetric quantization, choosing `il` to be `-ih` results in a non-integer zero point (since levels is an even number):

$$
Z_i = \frac{-il*(levels-1)}{ih - il} = (levels-1)/2
$$

Instead, Zi is chosen to be `levels/2`; we can then increase the range a little by pushing il to a smaller (more negative) number:

$$
\begin{align}
(levels-1)/Z_i = -(ih - il)/il = 1 - ih/il \\
2(1-1/levels) = 1 - ih/il \\
il = -ih/(1 - 2/levels)
\end{align}
$$

For example:

- levels=256, U8, Zi=128, il = -1.0078740157480315 * ih (see the quick check below)
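
A quick arithmetic check of the levels=256 case, following the formula above:

$$
il = \frac{-ih}{1 - 2/256} = -\frac{256}{254}\,ih \approx -1.0078740\,ih
$$
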
I8 is a better choice for symmetric quantization because the zero-point can also be made 0 if we use I8:

$$
q'_{U8} = clamp(round(x*\frac{1}{S_i} + 128), 0, 255)
$$

$$
q'_{I8} = clamp(round(x*\frac{1}{S_i}), -128, 127)
$$
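
A small standalone C++ sketch contrasting the two forms above (illustrative only); with I8 the zero-point term disappears:

```cpp
#include <algorithm>
#include <cmath>
#include <cstdint>

// Symmetric quantization with input scale S_i = (ih - il) / (levels - 1).
uint8_t quantize_u8(float x, float S_i) {   // zero-point 128
    return static_cast<uint8_t>(std::min(std::max(std::round(x / S_i + 128.0f), 0.0f), 255.0f));
}
int8_t quantize_i8(float x, float S_i) {    // zero-point 0
    return static_cast<int8_t>(std::min(std::max(std::round(x / S_i), -128.0f), 127.0f));
}
```
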
# Asymmetric quantization

In asymmetric quantization there is a special case where inputLow = outputLow = 0; we can use the U8 equation, and in this case Zi == Zo == 0.

Otherwise there is no easy way: either `U8` or `I8` requires non-zero zero-points.

# Quantize-only FQ

The actual tensor in memory is stored in quantized form, so FQ is split as:

- `Quantize(clamp)`, which is fused into the `Producer` node as a post-op.
- `Dequantize`, which is fused into a `Consumer` node capable of benefiting from the quantized representation, given additional zero-point and scale information.

In the CPU plugin, most FQs have been split by LPT into a `Quantize-only FQ` (with Zo==0 and S_o==1) followed by a Dequantize (further represented as, and split into, a `Subtract` and a `Multiply`).

Many oneDNN primitives have standard support for a `Quantize-only FQ` post-op, in the form of a zero-point & output scale, and this is usually the last post-op before storing to memory as a quantized tensor.

To recognize a `Quantize-only FQ` that can be optimized with an output-scales post-op, we need to check the following two cases (a standalone sketch of the check appears at the end of this section):

- output U8
  - Zi=0 (i.e. inputLow==0)
  - So=1
  - Zo=0

$$
q'_{U8} = clamp(round(x*\frac{1}{S_i}), 0, 255)
$$

- output I8
  - Zi=128 (which can be optimized as output I8 with Zi=0)
  - So=1
  - Zout=128 (outputLow = -128)

$$
q'_{I8} = clamp(round(x*\frac{1}{S_i}), -128, 127)
$$

`Quantize-only FQ` post-ops optimization examples:

- `Quantize-only FQ` is the only post-op of the parent node. We optimize the FQ by setting the output scales of the parent node. For example, in the pattern below, we set $\frac{1}{S_i}$ as the output scale of `conv` or `inner_product` to optimize the pattern.

```
conv --> FQ
inner_product --> FQ
```

- `Quantize-only FQ` is the last post-op and an `eltwise` post-op comes before it. We optimize the FQ by setting the output scale of the `eltwise` node. For example, in the pattern below, we set $\frac{1}{S_i}$ as the output scale of `eltwise`.

```
conv --> ... --> eltwise --> FQ
inner_product --> ... --> eltwise --> FQ
```
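
A standalone C++ sketch of the U8 recognition case above (illustrative names; the plugin implements this check, together with the I8 case, in `FakeQuantize::simplifyToScale`, shown further below):

```cpp
#include <algorithm>
#include <cstddef>
#include <vector>

// If crop-low and input-shift are all zero, the FQ is a pure quantization and its
// per-channel output scales are simply the input scales 1/S_i; otherwise return empty.
std::vector<float> quantizeOnlyScales(const std::vector<float>& cropLow,
                                      const std::vector<float>& inputShift,
                                      const std::vector<float>& inputScale,
                                      std::size_t OC) {
    auto allZero = [](const std::vector<float>& v) {
        return std::all_of(v.cbegin(), v.cend(), [](float x) { return x == 0.0f; });
    };
    if (!allZero(cropLow) || !allZero(inputShift))
        return {};                        // not a quantize-only FQ
    std::vector<float> scales = inputScale;
    if (scales.size() == 1)
        scales.assign(OC, scales[0]);     // broadcast a per-tensor scale to per-channel
    return scales;
}
```
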
# FQCommon

Here the actual tensor is stored in memory as a floating-point type, so `round` is not needed and the output can be simplified as:

$$
\begin{align}
y &= (x-il)*\frac{oh-ol}{ih-il} + ol \\
y &= x*\frac{oh-ol}{ih-il} + c \\
c &= -il*\frac{oh-ol}{ih-il} + ol
\end{align}
$$

If the following condition is true, FQ can be optimized with the output scale $\frac{oh-ol}{ih-il}$:

$$
|c/(oh-ol)| = \left|\frac{ol}{oh-ol} - \frac{il}{ih-il}\right| < 0.01
$$
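
A minimal standalone sketch of this check (illustrative; the FakeQuantize node applies a per-channel criterion of the same form, with a tighter threshold, when it builds `fqScales`, see the constructor change below):

```cpp
#include <cmath>

// Returns true when the FQ collapses to a single output scale (oh - ol) / (ih - il),
// i.e. when the constant term c is negligible relative to the output range.
bool collapsesToScale(float il, float ih, float ol, float oh, float* scale) {
    if (il == ih || ol == oh)
        return false;
    if (std::abs(ol / (oh - ol) - il / (ih - il)) >= 0.01f)
        return false;
    *scale = (oh - ol) / (ih - il);
    return true;
}
```
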
```diff
@@ -592,6 +592,7 @@ void Convolution::setPostOps(dnnl::primitive_attr &attr, const VectorDims &dims,

         if (auto* fakeQuantizeNode = dynamic_cast<FakeQuantize *>(node.get())) {
             const Dim OC = dims[1];
+            auto scale = fakeQuantizeNode->simplifyToScale(outputDataType, OC);
             if (i == 0) {
                 bool hasSubsequentSum = false;
                 bool hasSubsequentFQ = false;
```
```diff
@@ -627,92 +628,24 @@
                     }
                 }

-                if (node == fusedWith[fusedWith.size() - 1]) {
-                    auto &cl = fakeQuantizeNode->getCropLow();
-                    auto &ch = fakeQuantizeNode->getCropHigh();
-                    auto &isc = fakeQuantizeNode->getInputScale();
-                    auto &ish = fakeQuantizeNode->getInputShift();
-                    auto &osc = fakeQuantizeNode->getOutputScale();
-                    auto &osh = fakeQuantizeNode->getOutputShift();
-                    if (fakeQuantizeNode->getAlgorithm() == Algorithm::FQQuantization) {
-                        if (outputDataType == memory::data_type::u8 &&
-                            std::all_of(cl.cbegin(), cl.cend(), [](float val) { return val == 0.0f; }) &&
-                            std::all_of(ish.cbegin(), ish.cend(), [](float val) { return val == 0.0f; })) {
-                            std::vector<float> outScale = isc;
-                            if (!outScale.empty()) {
-                                size_t size = outScale.size();
-                                if (size == 1) {
-                                    outScale.resize(OC);
-                                    for (size_t k = 0; k < OC; k++)
-                                        outScale[k] = outScale[0];
-                                }
-
-                                attr.set_output_scales(1 << 1, outScale);
-
-                                continue;
-                            }
-                        }
-                    }
-
-                    if (outputDataType == memory::data_type::s8 &&
-                        std::all_of(ish.cbegin(), ish.cend(), [](float val) { return std::abs(val - 128.f) < 0.0001f; }) &&
-                        std::all_of(osc.cbegin(), osc.cend(), [](float val) { return val == 1.f; }) &&
-                        std::all_of(osh.cbegin(), osh.cend(), [](float val) { return std::abs(val + 128.f) < 0.0001f; })) {
-                        bool isCropAligned = true;
-                        for (int i = 0; i < std::max(cl.size(), isc.size()); i++) {
-                            if (std::abs(cl[cl.size() == 1 ? 0 : i] * isc[isc.size() == 1 ? 0 : i] + 128.f) > 0.0001f) {
-                                isCropAligned = false;
-                            }
-                        }
-
-                        for (int i = 0; i < std::max(ch.size(), isc.size()); i++) {
-                            if (std::abs(ch[ch.size() == 1 ? 0 : i] * isc[isc.size() == 1 ? 0 : i] - 127.f) > 0.0001f) {
-                                isCropAligned = false;
-                            }
-                        }
-
-                        if (isCropAligned) {
-                            std::vector<float> outScale = isc;
-                            if (!outScale.empty()) {
-                                size_t size = outScale.size();
-                                if (size == 1) {
-                                    outScale.resize(OC);
-                                    for (size_t k = 0; k < OC; k++)
-                                        outScale[k] = outScale[0];
-                                }
-
-                                attr.set_output_scales(1 << 1, outScale);
-
-                                continue;
-                            }
-                        }
-                    }
+                if (node == fusedWith[fusedWith.size() - 1] && !scale.empty()) {
+                    attr.set_output_scales(1 << 1, scale);
+                    continue;
                 }
             }

-            if (node == fusedWith[fusedWith.size() - 1] &&
-                outputDataType == memory::data_type::u8 &&
-                fakeQuantizeNode->getAlgorithm() == Algorithm::FQQuantization &&
-                ops.len() == 1 && ops.kind(0) == primitive::kind::sum
-                /*levels == 256*/) {
-                auto &cl = fakeQuantizeNode->getCropLow();
-                auto &isc = fakeQuantizeNode->getInputScale();
-                auto &ish = fakeQuantizeNode->getInputShift();
-
-                if (std::all_of(cl.cbegin(), cl.cend(), [](float val) { return val == 0.0f; }) &&
-                    std::all_of(isc.cbegin(), isc.cend(), [&](float val) { return val == isc[0]; }) &&
-                    std::all_of(ish.cbegin(), ish.cend(), [&](float val) { return val == 0; })) {
+            if (node == fusedWith[fusedWith.size() - 1] && !scale.empty()) {
+                if (ops.len() == 1 && ops.kind(0) == primitive::kind::sum &&
+                    outputDataType == memory::data_type::u8 &&
+                    std::all_of(scale.cbegin(), scale.cend(), [&](float val) { return val == scale[0]; })) {
                     std::vector<float> outScales;
                     int mask = 1 << 1;
                     attr.get_output_scales(mask, outScales);

                     for (int j = 0; j < outScales.size(); j++) {
-                        outScales[j] *= isc[0];
+                        outScales[j] *= scale[0];
                     }
                     attr.set_output_scales(mask, outScales);
-
-                    ops.get()->entry_[0].sum.scale = isc[0];
+                    ops.get()->entry_[0].sum.scale = scale[0];

                     continue;
                 }
             }
```
```diff
@@ -1168,15 +1168,18 @@ FakeQuantize::FakeQuantize(const std::shared_ptr<ngraph::Node>& op, const dnnl::
             float oh = outputHighData[isOutputHighBroadcasted ? 0 : i];

             isFakeQuantization = isFakeQuantization && il == ol && ih == oh;
-            isFakeQuantizationWithScale = isFakeQuantizationWithScale && ol != 0 && oh != 0 && (il / ol - ih / oh < 0.1f);
+            isFakeQuantizationWithScale = isFakeQuantizationWithScale && il != ih && ol != oh &&
+                                          (abs(ol / (oh - ol) - il / (ih - il)) < 0.001f);
         }

         if (isFakeQuantizationWithScale) {
             for (int i = 0; i < std::max(inputLowAxisSize, std::max(outputLowAxisSize, std::max(inputHighAxisSize, outputHighAxisSize))); i++) {
                 float il = inputLowData[isInputLowBroadcasted ? 0 : i];
                 float ol = outputLowData[isOutputLowBroadcasted ? 0 : i];
+                float ih = inputHighData[isInputHighBroadcasted ? 0 : i];
+                float oh = outputHighData[isOutputHighBroadcasted ? 0 : i];

-                fqScales.push_back(1 / (il / ol));
+                fqScales.push_back(1 / ((ih - il) / (oh - ol)));
             }
         }

```
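For intuition, a hypothetical per-channel range that satisfies the generalized criterion above (values invented for illustration): il=0, ih=2.55, ol=0, oh=255 gives |ol/(oh-ol) - il/(ih-il)| = 0, so the FQ collapses to a single scale of 100. A minimal standalone check:

```cpp
#include <cassert>
#include <cmath>

int main() {
    // Hypothetical FQ range: il=0, ih=2.55 maps onto ol=0, oh=255.
    float il = 0.f, ih = 2.55f, ol = 0.f, oh = 255.f;
    bool withScale = il != ih && ol != oh &&
                     std::abs(ol / (oh - ol) - il / (ih - il)) < 0.001f;
    assert(withScale);
    float fqScale = 1 / ((ih - il) / (oh - ol));   // same formula as fqScales above: 100
    (void)fqScale;
    return 0;
}
```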
```diff
@@ -1976,6 +1979,64 @@ void FakeQuantize::appendBinPostOpsOptimized(dnnl::post_ops& ops, const VectorDi
     appendBinary(dnnl::algorithm::binary_add, outputShiftSize, outputShiftMemory, &outputShiftData.shifts_[0]);
 }

+std::vector<float> FakeQuantize::simplifyToScale(dnnl::memory::data_type outDataType, size_t OC) {
+    auto &cl = getCropLow();
+    auto &ch = getCropHigh();
+    auto &isc = getInputScale();
+    auto &ish = getInputShift();
+    auto &osc = getOutputScale();
+    auto &osh = getOutputShift();
+
+    std::vector<float> outScale;
+
+    if (outDataType == memory::data_type::u8 &&
+        getAlgorithm() == Algorithm::FQQuantization &&
+        std::all_of(cl.cbegin(), cl.cend(), [](float val) { return val == 0.0f; }) &&
+        std::all_of(ish.cbegin(), ish.cend(), [](float val) { return val == 0.0f; })) {
+        outScale = isc;
+        if (!outScale.empty()) {
+            size_t size = outScale.size();
+            if (size == 1 && Shape::UNDEFINED_DIM != OC) {
+                outScale.resize(OC);
+                for (size_t k = 0; k < OC; k++)
+                    outScale[k] = outScale[0];
+            }
+        }
+    }
+
+    if (outDataType == memory::data_type::s8 &&
+        std::all_of(ish.cbegin(), ish.cend(), [](float val) { return std::abs(val - 128.f) < 0.0001f; }) &&
+        std::all_of(osc.cbegin(), osc.cend(), [](float val) { return val == 1.f; }) &&
+        std::all_of(osh.cbegin(), osh.cend(), [](float val) { return std::abs(val + 128.f) < 0.0001f; })) {
+        bool isCropAligned = true;
+        for (int i = 0; i < std::max(cl.size(), isc.size()); i++) {
+            if (std::abs(cl[cl.size() == 1 ? 0 : i] * isc[isc.size() == 1 ? 0 : i] + 128.f) > 0.0001f) {
+                isCropAligned = false;
+            }
+        }
+
+        for (int i = 0; i < std::max(ch.size(), isc.size()); i++) {
+            if (std::abs(ch[ch.size() == 1 ? 0 : i] * isc[isc.size() == 1 ? 0 : i] - 127.f) > 0.0001f) {
+                isCropAligned = false;
+            }
+        }
+
+        if (isCropAligned) {
+            outScale = isc;
+            if (!outScale.empty()) {
+                size_t size = outScale.size();
+                if (size == 1 && Shape::UNDEFINED_DIM != OC) {
+                    outScale.resize(OC);
+                    for (size_t k = 0; k < OC; k++)
+                        outScale[k] = outScale[0];
+                }
+            }
+        }
+    }
+
+    return outScale;
+}
+
 FakeQuantize::FakeQuantizeJitExecutor::FakeQuantizeJitExecutor(const jit_quantize_params &_jqp) {
     bool isBinarization = _jqp.op_type == Algorithm::FQBinarization;
     if (mayiuse(cpu::x64::avx512_core)) {
```
```diff
@@ -131,7 +131,7 @@
                       bool isLastPostOp, dnnl::memory::data_type outDataType);

     static bool isSupportedOperation(const std::shared_ptr<const ngraph::Node>& op, std::string& errorMessage) noexcept;
-
+    std::vector<float> simplifyToScale(dnnl::memory::data_type outDataType, size_t OC);
     enum BroadcastingPolicy {
         PerChannel, // all FQ operations are per channel
         PerTensor, // all FQ operations are per tensor
```
```diff
@@ -165,7 +165,7 @@ void FullyConnected::getSupportedDescriptors() {
        IE_THROW() << errorPrefix << " has incorrect number of output edges";

    auto inputDataType = DnnlExtensionUtils::IEPrecisionToDataType(getOriginalInputPrecisionAtPort(DATA_ID));
-   auto outputDataType = DnnlExtensionUtils::IEPrecisionToDataType(getOriginalOutputPrecisionAtPort(DATA_ID));
+   outputDataType = DnnlExtensionUtils::IEPrecisionToDataType(getOriginalOutputPrecisionAtPort(DATA_ID));

    if (inputDataType == memory::data_type::f32) {
        outputDataType = memory::data_type::f32;
```
```diff
@@ -393,9 +393,46 @@ void FullyConnected::setPostOps(dnnl::primitive_attr &attr, const VectorDims &di
        return binaryShape;
    };

-   for (auto &node : fusedWith) {
+   const auto channelAxis = getFusingAxis();
+   size_t OC = getOutputShapeAtPort(0).getDims()[channelAxis];
+
+   for (int i = 0; i < fusedWith.size(); i++) {
+       auto& node = fusedWith[i];
+
        if (auto* fakeQuantizeNode = dynamic_cast<FakeQuantize *>(node.get())) {
-           fakeQuantizeNode->appendBinPostOps(ops, getBinPostOpShape(), postOpsArgs);
+           auto scale = fakeQuantizeNode->simplifyToScale(outputDataType, OC);
+
+           if (fusedWith.size() == 1 && !scale.empty()) {
+               attr.set_output_scales(1 << 1, scale);
+               continue;
+           }
+
+           if (node == fusedWith[fusedWith.size() - 1] && !scale.empty()) {
+               if (ops.len() == 1 && ops.kind(0) == primitive::kind::sum &&
+                   outputDataType == memory::data_type::u8 &&
+                   std::all_of(scale.cbegin(), scale.cend(), [&](float val) { return val == scale[0]; })) {
+                   std::vector<float> outScales;
+                   int mask = 1 << 1;
+                   attr.get_output_scales(mask, outScales);
+                   for (int j = 0; j < outScales.size(); j++) {
+                       outScales[j] *= scale[0];
+                   }
+                   attr.set_output_scales(mask, outScales);
+                   ops.get()->entry_[0].sum.scale = scale[0];
+                   continue;
+               }
+
+               if (ops.len() != 0 && ops.kind(ops.len() - 1) == primitive::kind::eltwise &&
+                   std::all_of(scale.cbegin(), scale.cend(), [&](float val) { return val == scale[0]; })) {
+                   auto len = ops.len();
+                   ops.get()->entry_[len - 1].eltwise.scale = scale[0];
+                   continue;
+               }
+           }
+
+           fakeQuantizeNode->appendBinPostOpsOptimized(ops, getBinPostOpShape(), postOpsArgs,
+                                                       node == fusedWith[fusedWith.size() - 1], outputDataType);
+
            continue;
        }

```
```diff
@@ -74,6 +74,7 @@ private:
    static const size_t DATA_ID = 0;
    static const size_t WEIGHTS_ID = 1;
    static const size_t BIAS_ID = 2;
+   dnnl::memory::data_type outputDataType;
};

} // namespace node
```