diff --git a/src/common/low_precision_transformations/src/fake_quantize.cpp b/src/common/low_precision_transformations/src/fake_quantize.cpp
index 5c502211374..52fc92af57e 100644
--- a/src/common/low_precision_transformations/src/fake_quantize.cpp
+++ b/src/common/low_precision_transformations/src/fake_quantize.cpp
@@ -157,6 +157,7 @@ std::shared_ptr FakeQuantizeTransformation::fuseElementwis
     if (ov::is_type(fq::getDataNode(eltwise)) ||
         ov::is_type(fq::getDataNode(eltwise)) ||
         ov::is_type(fq::getDataNode(eltwise)) ||
+        ov::is_type(fq::getDataNode(eltwise)) ||
         ov::is_type(fq::getDataNode(eltwise))) {
         return nullptr;
     }
diff --git a/src/plugins/intel_cpu/src/docs/fake_quantize.md b/src/plugins/intel_cpu/src/docs/fake_quantize.md
new file mode 100644
index 00000000000..23e78151d7c
--- /dev/null
+++ b/src/plugins/intel_cpu/src/docs/fake_quantize.md
@@ -0,0 +1,184 @@
+# FakeQuantize in OpenVINO
+https://docs.openvino.ai/latest/openvino_docs_ops_quantization_FakeQuantize_1.html
+
+Definition:
+```
+if x <= min(input_low, input_high):
+    output = output_low
+elif x > max(input_low, input_high):
+    output = output_high
+else:
+    # input_low < x <= input_high
+    output = round((x - input_low) / (input_high - input_low) * (levels-1)) / (levels-1) * (output_high - output_low) + output_low
+```
+
+ - x <= min(input_low, input_high): output = output_low
+ - x > max(input_low, input_high): output = output_high
+ - input_low < x <= input_high:
+
+$$
+\begin{align}
+    q = round(\frac{x - il}{ih - il} * (levels-1)) \\
+    output = q * \frac{oh - ol}{levels-1} + ol
+\end{align}
+$$
+
+Simplified, supposing ih > il:
+
+$$
+\begin{align}
+    q = round(\frac{(x - il)}{(ih - il)} * (levels-1)) \\
+    q = clamp(q, 0, levels-1) \\
+    output = q * \frac{(oh - ol)}{levels-1} + ol
+\end{align}
+$$
+
+----------------------------
+## Interpretation as Q+DQ
+Give names to the parameters: scale (S) and shift (Z).
+
+$$
+\begin{align}
+    S_i &= \frac{ih - il}{levels-1} \\
+    Z_i &= \frac{-il}{S_i} \\
+    S_o &= \frac{oh - ol}{levels-1} \\
+    Z_o &= \frac{-ol}{S_o}
+\end{align}
+$$
+
+Using these parameters, FQ becomes:
+
+$$
+\begin{align}
+    q' &= round(x*\frac{1}{S_i} + Z_i) \tag{a}
+\end{align}
+$$
+
+$$
+\begin{align}
+    q_{U} &= clamp(q', 0, levels-1) \tag{b}
+\end{align}
+$$
+
+$$
+\begin{align}
+    output &= (q_{U} - Z_o) * S_o \tag{c}
+\end{align}
+$$
+
+$q_U$ is an unsigned quantized tensor. A small change makes it a signed quantized tensor:
+
+$$
+\begin{align}
+    Z_0 &= \frac{levels}{2} \\
+    q' &= round(x*\frac{1}{S_i} + Z_i - Z_0) \\
+    q_{I} &= clamp(q', -Z_0, Z_0-1) \\
+    output &= (q_{I} + Z_0 - Z_o) * S_o
+\end{align}
+$$
+
+Here the center value $Z_0$ is subtracted before the clamp to make a signed quantized value $q_I$, and it is added back after the clamp for mathematical equivalence.
+
+Notice:
+ - equation (a) is a traditional quantization of x into q only if $Z_i$ is an integer;
+ - equation (c) is a traditional dequantization only if $Z_o$ is an integer.
+
+Thus inputLow/inputHigh/outputLow/outputHigh are gently tuned from the statistical results to satisfy these requirements.
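+
+A minimal NumPy sketch checking that the Q+DQ form (a)-(c) matches the simplified FQ formula (helper names, ranges and `levels` below are arbitrary example values):
+
+```
+import numpy as np
+
+def fq_reference(x, il, ih, ol, oh, levels):
+    # simplified FQ definition from above (assumes ih > il)
+    q = np.clip(np.round((x - il) / (ih - il) * (levels - 1)), 0, levels - 1)
+    return q * (oh - ol) / (levels - 1) + ol
+
+def fq_q_dq(x, il, ih, ol, oh, levels):
+    # the same mapping written as quantize (a), clamp (b), dequantize (c)
+    S_i = (ih - il) / (levels - 1)
+    Z_i = -il / S_i
+    S_o = (oh - ol) / (levels - 1)
+    Z_o = -ol / S_o
+    q = np.round(x / S_i + Z_i)        # (a)
+    q_u = np.clip(q, 0, levels - 1)    # (b)
+    return (q_u - Z_o) * S_o           # (c)
+
+x = np.random.default_rng(0).uniform(-2.0, 2.0, 1000)
+ref = fq_reference(x, il=-1.0, ih=1.54, ol=-2.0, oh=3.08, levels=256)
+qdq = fq_q_dq(x, il=-1.0, ih=1.54, ol=-2.0, oh=3.08, levels=256)
+assert np.allclose(ref, qdq)
+```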
+
+# Symmetric quantization
+In symmetric quantization, choosing `il` to be `-ih` results in a non-integer zero point (since `levels` is an even number):
+
+$$
+    Z_i = \frac{-il*(levels-1)}{ih - il} = (levels-1)/2
+$$
+
+In symmetric quantization $Z_i$ is chosen to be `levels/2`, so we increase the range a little by pushing `il` to a smaller (more negative) number:
+
+$$
+\begin{align}
+    (levels-1)/Z_i = -(ih - il)/il = 1 - ih/il \\
+    2(1-1/levels) = 1 - ih/il \\
+    il = -ih/(1 - 2/levels)
+\end{align}
+$$
+
+For example:
+ - levels=256, U8, Zi=128, il = -1.0078740157480315 * ih
+
+I8 is a better choice for symmetric quantization because it also makes the zero-point 0:
+
+$$
+    q'_{U8} = clamp(round(x*\frac{1}{S_i} + 128), 0, 255)
+$$
+
+$$
+    q'_{I8} = clamp(round(x*\frac{1}{S_i}), -128, 127)
+$$
+
+# Asymmetric quantization
+
+In asymmetric quantization there is a special case: when inputLow = outputLow = 0 we can use the U8 equation, and in this case Zi == Zo == 0.
+
+Otherwise there is no easy way; either `U8` or `I8` requires non-zero zero-points.
+
+# Quantize-only FQ
+
+The actual tensor in memory is stored in quantized form, so FQ is split into:
+
+ - `Quantize(clamp)`, which is fused into the `Producer` node as a post-op.
+ - `Dequantize`, which is fused into a `Consumer` node capable of benefiting from the quantized representation, given the additional zero-point and scale information.
+
+In the CPU plugin, most FQs have been split by LPT into a `Quantize-only FQ` (with Zo==0 and So==1) followed by a Dequantize (further represented as, and split into, a `Subtract` and a `Multiply`).
+
+Many oneDNN primitives have standard support for the `Quantize-only FQ` post-op in the form of zero-points and output scales, and this is usually the last post-op before storing to memory as a quantized tensor.
+
+To recognize a `Quantize-only FQ` that can be optimized with the output-scales post-op, we need to check the following two cases:
+
+ - output U8
+   - Zi=0 (i.e. inputLow==0)
+   - So=1
+   - Zo=0
+
+$$
+    q'_{U8} = clamp(round(x*\frac{1}{S_i}), 0, 255)
+$$
+
+ - output I8
+   - Zi=128 (which can be optimized as output I8 with Zi=0)
+   - So=1
+   - Zo=128 (outputLow = -128)
+
+$$
+    q'_{I8} = clamp(round(x*\frac{1}{S_i}), -128, 127)
+$$
+
+`Quantize-only FQ` post-op optimization examples:
+- `Quantize-only FQ` is the only post-op of the parent node. We optimize FQ by setting the output scales of the parent node. For example, in the pattern below, we set
+$\frac{1}{S_i}$ as the output scale of `conv` or `inner_product`.
+```
+ conv --> FQ
+ inner_product --> FQ
+```
+- `Quantize-only FQ` is the last post-op and an `eltwise` post-op precedes it. We optimize FQ by setting the output scale of the `eltwise` node. For example, in the pattern below, we set $\frac{1}{S_i}$ as the output scale of `eltwise`.
+```
+ conv --> ... --> eltwise --> FQ
+ inner_product --> ... --> eltwise --> FQ
+```
+
+# FQCommon
+
+The actual tensor is stored in memory as a floating-point type, so `round` is not needed in this case. The output can be simplified as:
+
+$$
+\begin{align}
+    y = (x-il)*\frac{oh-ol}{ih-il} + ol \\
+    y = x*\frac{oh-ol}{ih-il} + c \\
+    c = -il*\frac{oh-ol}{ih-il} + ol
+\end{align}
+$$
+
+If the following condition is true, FQ can be optimized with the output scale $\frac{oh-ol}{ih-il}$:
+
+$$
+    |c/(oh-ol)| = |\frac{ol}{oh-ol} - \frac{il}{ih-il}| < 0.001
+$$
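+
+For example, a minimal Python sketch of this per-channel check (the function name and example values are illustrative only; `eps` mirrors the 0.001 threshold above):
+
+```
+def fq_common_scale(il, ih, ol, oh, eps=0.001):
+    # y = x * (oh - ol)/(ih - il) + c,  where c = -il * (oh - ol)/(ih - il) + ol
+    scale = (oh - ol) / (ih - il)
+    c = -il * scale + ol
+    # fold FQ into a single output scale only when the constant term is negligible
+    if abs(c / (oh - ol)) < eps:
+        return scale
+    return None  # otherwise keep the full FQ post-op
+
+print(fq_common_scale(il=-1.27, ih=1.27, ol=-1.27, oh=1.27))  # 1.0
+print(fq_common_scale(il=0.0, ih=2.54, ol=0.0, oh=1.27))      # 0.5
+print(fq_common_scale(il=-1.0, ih=1.0, ol=0.0, oh=2.0))       # None
+```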
\ No newline at end of file
diff --git a/src/plugins/intel_cpu/src/nodes/conv.cpp b/src/plugins/intel_cpu/src/nodes/conv.cpp
index 40996cd7788..66b41ec947d 100644
--- a/src/plugins/intel_cpu/src/nodes/conv.cpp
+++ b/src/plugins/intel_cpu/src/nodes/conv.cpp
@@ -592,6 +592,7 @@ void Convolution::setPostOps(dnnl::primitive_attr &attr, const VectorDims &dims,
         if (auto* fakeQuantizeNode = dynamic_cast<FakeQuantize *>(node.get())) {
             const Dim OC = dims[1];

+            auto scale = fakeQuantizeNode->simplifyToScale(outputDataType, OC);
             if (i == 0) {
                 bool hasSubsequentSum = false;
                 bool hasSubsequentFQ = false;
@@ -627,92 +628,24 @@
                     }
                 }
-                if (node == fusedWith[fusedWith.size() - 1]) {
-                    auto &cl = fakeQuantizeNode->getCropLow();
-                    auto &ch = fakeQuantizeNode->getCropHigh();
-                    auto &isc = fakeQuantizeNode->getInputScale();
-                    auto &ish = fakeQuantizeNode->getInputShift();
-                    auto &osc = fakeQuantizeNode->getOutputScale();
-                    auto &osh = fakeQuantizeNode->getOutputShift();
-                    if (fakeQuantizeNode->getAlgorithm() == Algorithm::FQQuantization) {
-                        if (outputDataType == memory::data_type::u8 &&
-                            std::all_of(cl.cbegin(), cl.cend(), [](float val) { return val == 0.0f; }) &&
-                            std::all_of(ish.cbegin(), ish.cend(), [](float val) { return val == 0.0f; })) {
-                            std::vector<float> outScale = isc;
-                            if (!outScale.empty()) {
-                                size_t size = outScale.size();
-                                if (size == 1) {
-                                    outScale.resize(OC);
-                                    for (size_t k = 0; k < OC; k++)
-                                        outScale[k] = outScale[0];
-                                }
-
-                                attr.set_output_scales(1 << 1, outScale);
-
-                                continue;
-                            }
-                        }
-                    }
-
-                    if (outputDataType == memory::data_type::s8 &&
-                        std::all_of(ish.cbegin(), ish.cend(), [](float val) { return std::abs(val - 128.f) < 0.0001f; }) &&
-                        std::all_of(osc.cbegin(), osc.cend(), [](float val) { return val == 1.f; }) &&
-                        std::all_of(osh.cbegin(), osh.cend(), [](float val) { return std::abs(val + 128.f) < 0.0001f; })) {
-                        bool isCropAligned = true;
-                        for (int i = 0; i < std::max(cl.size(), isc.size()); i++) {
-                            if (std::abs(cl[cl.size() == 1 ? 0 : i] * isc[isc.size() == 1 ? 0 : i] + 128.f) > 0.0001f) {
-                                isCropAligned = false;
-                            }
-                        }
-
-                        for (int i = 0; i < std::max(ch.size(), isc.size()); i++) {
-                            if (std::abs(ch[ch.size() == 1 ? 0 : i] * isc[isc.size() == 1 ? 0 : i] - 127.f) > 0.0001f) {
-                                isCropAligned = false;
-                            }
-                        }
-
-                        if (isCropAligned) {
-                            std::vector<float> outScale = isc;
-                            if (!outScale.empty()) {
-                                size_t size = outScale.size();
-                                if (size == 1) {
-                                    outScale.resize(OC);
-                                    for (size_t k = 0; k < OC; k++)
-                                        outScale[k] = outScale[0];
-                                }
-
-                                attr.set_output_scales(1 << 1, outScale);
-
-                                continue;
-                            }
-                        }
-                    }
+                if (node == fusedWith[fusedWith.size() - 1] && !scale.empty()) {
+                    attr.set_output_scales(1 << 1, scale);
+                    continue;
                 }
             }

-            if (node == fusedWith[fusedWith.size() - 1] &&
-                outputDataType == memory::data_type::u8 &&
-                fakeQuantizeNode->getAlgorithm() == Algorithm::FQQuantization &&
-                ops.len() == 1 && ops.kind(0) == primitive::kind::sum
-                /*levels == 256*/) {
-                auto &cl = fakeQuantizeNode->getCropLow();
-                auto &isc = fakeQuantizeNode->getInputScale();
-                auto &ish = fakeQuantizeNode->getInputShift();
-
-                if (std::all_of(cl.cbegin(), cl.cend(), [](float val) { return val == 0.0f; }) &&
-                    std::all_of(isc.cbegin(), isc.cend(), [&](float val) { return val == isc[0]; }) &&
-                    std::all_of(ish.cbegin(), ish.cend(), [&](float val) { return val == 0; })) {
+            if (node == fusedWith[fusedWith.size() - 1] && !scale.empty()) {
+                if (ops.len() == 1 && ops.kind(0) == primitive::kind::sum &&
+                    outputDataType == memory::data_type::u8 &&
+                    std::all_of(scale.cbegin(), scale.cend(), [&](float val) { return val == scale[0]; })) {
                     std::vector<float> outScales;
                     int mask = 1 << 1;
                     attr.get_output_scales(mask, outScales);
-
                     for (int j = 0; j < outScales.size(); j++) {
-                        outScales[j] *= isc[0];
+                        outScales[j] *= scale[0];
                     }
                     attr.set_output_scales(mask, outScales);
-
-                    ops.get()->entry_[0].sum.scale = isc[0];
-
+                    ops.get()->entry_[0].sum.scale = scale[0];
                     continue;
                 }
             }
diff --git a/src/plugins/intel_cpu/src/nodes/fake_quantize.cpp b/src/plugins/intel_cpu/src/nodes/fake_quantize.cpp
index 6f9ebd4404b..a6014ca6fb5 100644
--- a/src/plugins/intel_cpu/src/nodes/fake_quantize.cpp
+++ b/src/plugins/intel_cpu/src/nodes/fake_quantize.cpp
@@ -1168,15 +1168,18 @@ FakeQuantize::FakeQuantize(const std::shared_ptr& op, const dnnl::
             float oh = outputHighData[isOutputHighBroadcasted ? 0 : i];

             isFakeQuantization = isFakeQuantization && il == ol && ih == oh;
-            isFakeQuantizationWithScale = isFakeQuantizationWithScale && ol != 0 && oh != 0 && (il / ol - ih / oh < 0.1f);
+            isFakeQuantizationWithScale = isFakeQuantizationWithScale && il != ih && ol != oh &&
+                                          (abs(ol / (oh - ol) - il / (ih - il)) < 0.001f);
         }

         if (isFakeQuantizationWithScale) {
             for (int i = 0; i < std::max(inputLowAxisSize, std::max(outputLowAxisSize, std::max(inputHighAxisSize, outputHighAxisSize))); i++) {
                 float il = inputLowData[isInputLowBroadcasted ? 0 : i];
                 float ol = outputLowData[isOutputLowBroadcasted ? 0 : i];
+                float ih = inputHighData[isInputHighBroadcasted ? 0 : i];
+                float oh = outputHighData[isOutputHighBroadcasted ? 0 : i];

-                fqScales.push_back(1 / (il / ol));
+                fqScales.push_back(1 / ((ih - il) / (oh - ol)));
             }
         }
@@ -1976,6 +1979,64 @@ void FakeQuantize::appendBinPostOpsOptimized(dnnl::post_ops& ops, const VectorDi
     appendBinary(dnnl::algorithm::binary_add, outputShiftSize, outputShiftMemory, &outputShiftData.shifts_[0]);
 }

+std::vector<float> FakeQuantize::simplifyToScale(dnnl::memory::data_type outDataType, size_t OC) {
+    auto &cl = getCropLow();
+    auto &ch = getCropHigh();
+    auto &isc = getInputScale();
+    auto &ish = getInputShift();
+    auto &osc = getOutputScale();
+    auto &osh = getOutputShift();
+
+    std::vector<float> outScale;
+
+    if (outDataType == memory::data_type::u8 &&
+        getAlgorithm() == Algorithm::FQQuantization &&
+        std::all_of(cl.cbegin(), cl.cend(), [](float val) { return val == 0.0f; }) &&
+        std::all_of(ish.cbegin(), ish.cend(), [](float val) { return val == 0.0f; })) {
+        outScale = isc;
+        if (!outScale.empty()) {
+            size_t size = outScale.size();
+            if (size == 1 && Shape::UNDEFINED_DIM != OC) {
+                outScale.resize(OC);
+                for (size_t k = 0; k < OC; k++)
+                    outScale[k] = outScale[0];
+            }
+        }
+    }
+
+    if (outDataType == memory::data_type::s8 &&
+        std::all_of(ish.cbegin(), ish.cend(), [](float val) { return std::abs(val - 128.f) < 0.0001f; }) &&
+        std::all_of(osc.cbegin(), osc.cend(), [](float val) { return val == 1.f; }) &&
+        std::all_of(osh.cbegin(), osh.cend(), [](float val) { return std::abs(val + 128.f) < 0.0001f; })) {
+        bool isCropAligned = true;
+        for (int i = 0; i < std::max(cl.size(), isc.size()); i++) {
+            if (std::abs(cl[cl.size() == 1 ? 0 : i] * isc[isc.size() == 1 ? 0 : i] + 128.f) > 0.0001f) {
+                isCropAligned = false;
+            }
+        }
+
+        for (int i = 0; i < std::max(ch.size(), isc.size()); i++) {
+            if (std::abs(ch[ch.size() == 1 ? 0 : i] * isc[isc.size() == 1 ? 0 : i] - 127.f) > 0.0001f) {
+                isCropAligned = false;
+            }
+        }
+
+        if (isCropAligned) {
+            outScale = isc;
+            if (!outScale.empty()) {
+                size_t size = outScale.size();
+                if (size == 1 && Shape::UNDEFINED_DIM != OC) {
+                    outScale.resize(OC);
+                    for (size_t k = 0; k < OC; k++)
+                        outScale[k] = outScale[0];
+                }
+            }
+        }
+    }
+
+    return outScale;
+}
+
 FakeQuantize::FakeQuantizeJitExecutor::FakeQuantizeJitExecutor(const jit_quantize_params &_jqp) {
     bool isBinarization = _jqp.op_type == Algorithm::FQBinarization;
     if (mayiuse(cpu::x64::avx512_core)) {
diff --git a/src/plugins/intel_cpu/src/nodes/fake_quantize.h b/src/plugins/intel_cpu/src/nodes/fake_quantize.h
index f18866a0c8e..31039af2409 100644
--- a/src/plugins/intel_cpu/src/nodes/fake_quantize.h
+++ b/src/plugins/intel_cpu/src/nodes/fake_quantize.h
@@ -131,7 +131,7 @@ public:
                           bool isLastPostOp, dnnl::memory::data_type outDataType);
     static bool isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept;
-
+    std::vector<float> simplifyToScale(dnnl::memory::data_type outDataType, size_t OC);

     enum BroadcastingPolicy {
         PerChannel, // all FQ operations are per channel
         PerTensor,  // all FQ operations are per tensor
diff --git a/src/plugins/intel_cpu/src/nodes/fullyconnected.cpp b/src/plugins/intel_cpu/src/nodes/fullyconnected.cpp
index 2bcef119e16..7618de1dfa7 100644
--- a/src/plugins/intel_cpu/src/nodes/fullyconnected.cpp
+++ b/src/plugins/intel_cpu/src/nodes/fullyconnected.cpp
@@ -165,7 +165,7 @@ void FullyConnected::getSupportedDescriptors() {
         IE_THROW()<< errorPrefix << " has incorrect number of output edges";

     auto inputDataType = DnnlExtensionUtils::IEPrecisionToDataType(getOriginalInputPrecisionAtPort(DATA_ID));
-    auto outputDataType = DnnlExtensionUtils::IEPrecisionToDataType(getOriginalOutputPrecisionAtPort(DATA_ID));
+    outputDataType = DnnlExtensionUtils::IEPrecisionToDataType(getOriginalOutputPrecisionAtPort(DATA_ID));

     if (inputDataType == memory::data_type::f32) {
         outputDataType = memory::data_type::f32;
@@ -393,9 +393,46 @@ void FullyConnected::setPostOps(dnnl::primitive_attr &attr, const VectorDims &di
         return binaryShape;
     };

-    for (auto &node : fusedWith) {
+    const auto channelAxis = getFusingAxis();
+    size_t OC = getOutputShapeAtPort(0).getDims()[channelAxis];
+
+    for (int i = 0; i < fusedWith.size(); i++) {
+        auto& node = fusedWith[i];
+
         if (auto* fakeQuantizeNode = dynamic_cast<FakeQuantize *>(node.get())) {
-            fakeQuantizeNode->appendBinPostOps(ops, getBinPostOpShape(), postOpsArgs);
+            auto scale = fakeQuantizeNode->simplifyToScale(outputDataType, OC);
+
+            if (fusedWith.size() == 1 && !scale.empty()) {
+                attr.set_output_scales(1 << 1, scale);
+                continue;
+            }
+
+            if (node == fusedWith[fusedWith.size() - 1] && !scale.empty()) {
+                if (ops.len() == 1 && ops.kind(0) == primitive::kind::sum &&
+                    outputDataType == memory::data_type::u8 &&
+                    std::all_of(scale.cbegin(), scale.cend(), [&](float val) { return val == scale[0]; })) {
+                    std::vector<float> outScales;
+                    int mask = 1 << 1;
+                    attr.get_output_scales(mask, outScales);
+                    for (int j = 0; j < outScales.size(); j++) {
+                        outScales[j] *= scale[0];
+                    }
+                    attr.set_output_scales(mask, outScales);
+                    ops.get()->entry_[0].sum.scale = scale[0];
+                    continue;
+                }
+
+                if (ops.len() != 0 && ops.kind(ops.len() - 1) == primitive::kind::eltwise &&
+                    std::all_of(scale.cbegin(), scale.cend(), [&](float val) { return val == scale[0]; })) {
+                    auto len = ops.len();
+                    ops.get()->entry_[len - 1].eltwise.scale = scale[0];
+                    continue;
+                }
+            }
+
+            fakeQuantizeNode->appendBinPostOpsOptimized(ops, getBinPostOpShape(), postOpsArgs,
+                                                        node == fusedWith[fusedWith.size() - 1], outputDataType);
+            continue;
+        }
diff --git a/src/plugins/intel_cpu/src/nodes/fullyconnected.h b/src/plugins/intel_cpu/src/nodes/fullyconnected.h
index 89d1a3445b5..b1944d2e1d0 100644
--- a/src/plugins/intel_cpu/src/nodes/fullyconnected.h
+++ b/src/plugins/intel_cpu/src/nodes/fullyconnected.h
@@ -74,6 +74,7 @@ private:
     static const size_t DATA_ID = 0;
     static const size_t WEIGHTS_ID = 1;
     static const size_t BIAS_ID = 2;
+    dnnl::memory::data_type outputDataType;
 };

 }   // namespace node