[GNA] Improved accuracy on model after Accuracy Aware (#7576)
* improved accuracy on model after Accuracy Aware
* refactoring + test
* removed unnecessary FakeQuantI8/I16
* added comments, moved fake_quantized from UpdateInputScaleFromNetwork(), removed _Np template param from QuantDescTmpl
@@ -16,7 +16,6 @@ struct GNAFlags {
float pwlMaxErrorPercent = 1.0f;
bool gna_openmp_multithreading = false;
bool sw_fp32 = false;
bool fake_quantized = false;
bool performance_counting = false;
bool input_low_precision = false;
};

@@ -39,14 +39,12 @@ struct QuantDescTmpl {
InferenceEngine::TPrecision<Op> _Op;
InferenceEngine::TPrecision<Wp> _Wp;
InferenceEngine::TPrecision<Bp> _Bp;
InferenceEngine::TPrecision<Np> _Np;

QuantDescTmpl() = default;
QuantDescTmpl(InferenceEngine::TPrecision<Ip> _Ip,
InferenceEngine::TPrecision<Op> _Op,
InferenceEngine::TPrecision<Wp> _Wp,
InferenceEngine::TPrecision<Bp> _Bp,
InferenceEngine::TPrecision<Np> _Np) : _Op(_Op), _Ip(_Ip), _Wp(_Wp), _Bp(_Bp), _Np(_Np) {
InferenceEngine::TPrecision<Bp> _Bp) : _Op(_Op), _Ip(_Ip), _Wp(_Wp), _Bp(_Bp) {
}

InferenceEngine::Precision getInputPrecision() const {
@@ -58,9 +56,6 @@ struct QuantDescTmpl {
InferenceEngine::Precision getBiasesPrecision() const {
return _Bp;
}
InferenceEngine::Precision getNetPrecision() const {
return _Np;
}
InferenceEngine::Precision getOutputPrecision() const {
return _Op;
}

@@ -74,23 +69,16 @@ typename InferenceEngine::PrecisionTrait<InferenceEngine::Precision::X>::value_t

struct QuantI16 : public QuantDescTmpl<PRECISION_TYPE(I16, I32, I16, I32, MIXED)> {
QuantI16() {
_Np = InferenceEngine::Precision::MIXED;
}
};
struct QuantI8 : public QuantDescTmpl<P_TYPE(I16), P_TYPE(I32), P_TYPE(I8), gna_compound_bias_t, P_TYPE(MIXED)> {
QuantI8() {
_Np = InferenceEngine::Precision::MIXED;
}
};
// Low precision path quantizer (I8 inputs, weights, biases)
struct QuantI8_I8 : public QuantDescTmpl<PRECISION_TYPE(I8, I32, I8, I8, MIXED)> {
QuantI8_I8() {
_Np = InferenceEngine::Precision::MIXED;
}
};

// for support proper trait instantiation for quantization function callback
struct FakeQuant : public QuantDescTmpl<P_TYPE(I16), P_TYPE(I32), P_TYPE(MIXED), P_TYPE(MIXED), P_TYPE(MIXED)> {
};
struct FakeQuantI16 : public QuantI16 {};
struct FakeQuantI8 : public QuantI8 {};

@@ -654,9 +642,24 @@ class DataQuantizer<Desc, InferenceEngine::WeightableLayer *> : public DataQuant
public:
explicit DataQuantizer(float scaleFactor) : DataQuantizerBase(scaleFactor) {}
bool operator()(InferenceEngine::WeightableLayer *wl) const {
quantizeWeightsBiases<typename Desc::MandatoryType>(Desc::mandatory(), wl, Quant<typename Desc::MandatoryType>());
(*this)(wl, typename Desc::MandatoryType());
return true;
}

template<typename T>
void operator()(InferenceEngine::WeightableLayer *wl, const T&) const {
quantizeWeightsBiases<T>(T(), wl, Quant<T>());
}

void operator()(InferenceEngine::WeightableLayer *wl, const FakeQuant&) const {
auto quantData = InferenceEngine::getInjectedData<QuantizedLayerParams>(*wl);
IE_ASSERT(quantData->_weights_quant.IsStatsSet());
if (quantData->_weights_quant.GetLevels() <= std::numeric_limits<uint8_t>::max()) {
quantizeWeightsBiases<FakeQuantI8>(FakeQuantI8(), wl, Quant<FakeQuantI8>());
} else {
quantizeWeightsBiases<FakeQuantI16>(FakeQuantI16(), wl, Quant<FakeQuantI16>());
}
}
};

template<class Desc>
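The FakeQuant overload above is the heart of the accuracy fix: instead of always quantizing weights to I16, the quantizer inspects the statistics recorded on the FakeQuantize layer and picks an 8-bit or 16-bit descriptor per layer. The following self-contained sketch mimics that tag-dispatch pattern in plain C++; QuantI8, QuantI16 and Layer here are illustrative stand-ins, not the plugin's actual types.

    #include <cstdint>
    #include <iostream>
    #include <limits>

    // Illustrative stand-ins for the plugin's quantization descriptors.
    struct QuantI8  { static constexpr int weightsBytes = 1; };
    struct QuantI16 { static constexpr int weightsBytes = 2; };

    // Minimal stand-in for a layer carrying FakeQuantize statistics.
    struct Layer {
        std::uint32_t fqLevels;  // number of quantization levels taken from the FQ layer
    };

    template <typename Desc>
    void quantizeWeightsBiases(const Layer& l) {
        std::cout << l.fqLevels << " levels -> " << Desc::weightsBytes << "-byte weights\n";
    }

    // Levels that fit into uint8_t take the I8 path, everything else keeps I16
    // (mirrors the GetLevels() check in the diff above).
    void quantize(const Layer& l) {
        if (l.fqLevels <= std::numeric_limits<std::uint8_t>::max()) {
            quantizeWeightsBiases<QuantI8>(l);
        } else {
            quantizeWeightsBiases<QuantI16>(l);
        }
    }

    int main() {
        quantize(Layer{255});    // fits in 8 bits -> 1-byte weights
        quantize(Layer{65535});  // needs 16 bits  -> 2-byte weights
    }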
@@ -691,13 +694,18 @@ class LayersQuantizer : public frontend::DataQuantizerBase {
}
};

/*
* The majority of layers will be executed in I16 mode,
* including most auto-generated primitives such as the one for alignment support.
* GNA 1.0 and 2.0 do not support I8 for the convolution layer.
* Some layers will be switched into I16 mode to avoid losing accuracy, while the memory and
* runtime performance of layers like scale-shifts is still OK since it is O(N).
*/
using QuantI16 = frontend::QuantPair<frontend::QuantI16, frontend::QuantI16>;
using QuantI8 = frontend::QuantPair<frontend::QuantI8, frontend::QuantI16>;
using QuantI8_I8 = frontend::QuantPair<frontend::QuantI8_I8, frontend::QuantI8_I8>;

using FakeQuantI16 = frontend::QuantPair<frontend::FakeQuantI16, frontend::FakeQuantI16>;
using FakeQuantI8 = frontend::QuantPair<frontend::FakeQuantI8, frontend::FakeQuantI16>;
using FakeQuant = frontend::QuantPair<frontend::FakeQuant, frontend::FakeQuantI16>;

enum class QuantizedDataType {
input,

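Each QuantPair alias above couples a mandatory descriptor with an optional one: the mandatory precision is applied to most weightable layers, while the optional (wider) precision is kept for layers that must stay at I16, such as convolutions and scale-shifts, as the comment block explains. A rough standalone sketch of that idea, with invented names rather than the plugin's QuantPair:

    #include <iostream>
    #include <string>

    // Byte widths standing in for the frontend::Quant* descriptors.
    struct DescI8  { static constexpr int weightsBytes = 1; };
    struct DescI16 { static constexpr int weightsBytes = 2; };

    // A pair of descriptors: Mandatory is the default, Optional is the fallback
    // for layers that must keep the wider precision (convolution, scale-shift).
    template <typename Mandatory, typename Optional>
    struct DescPair {
        static int weightsBytesFor(const std::string& layerType) {
            if (layerType == "Convolution" || layerType == "ScaleShift") {
                return Optional::weightsBytes;
            }
            return Mandatory::weightsBytes;
        }
    };

    // Analogue of: using QuantI8 = frontend::QuantPair<frontend::QuantI8, frontend::QuantI16>;
    using MixedI8 = DescPair<DescI8, DescI16>;

    int main() {
        std::cout << "FullyConnected weights: " << MixedI8::weightsBytesFor("FullyConnected") << " byte(s)\n";
        std::cout << "Convolution weights:    " << MixedI8::weightsBytesFor("Convolution") << " byte(s)\n";
    }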
@@ -83,9 +83,7 @@ class ModelQuantizer {
scaleIndex++;
}

bool isFakeQuantize = std::is_same<T, FakeQuantI8>() || std::is_same<T, FakeQuantI16>();
propagateScaleFactor(sortedNewNet, T::mandatory().getWeightsPrecision().size(), T::optional().getWeightsPrecision().size(),
T::mandatory().getInputPrecision().size(), isFakeQuantize);
propagateScaleFactor(sortedNewNet);

// sorted order makes it possible to propagate quantisation along dependent layers
for (auto &&layer : sortedNewNet) {
@@ -96,9 +94,8 @@ class ModelQuantizer {
}

private :
void propagateScaleFactor(std::vector<InferenceEngine::CNNLayerPtr> & net, int mandWeightsBytesSize,
int optWeightsBytesSize, int inputsBytesSize, bool fakeQuantize) const {
ScaleFactorCalculator sf(net, mandWeightsBytesSize, optWeightsBytesSize, inputsBytesSize, fakeQuantize);
void propagateScaleFactor(std::vector<InferenceEngine::CNNLayerPtr> & net) const {
ScaleFactorCalculator<T> sf(net);

int infiniteLoopCount = 0;
std::vector<std::string> infiniteLoopPattern;

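propagateScaleFactor now needs only the topologically sorted layer list; the byte sizes and the fake-quantize flag come from the descriptor type T. The propagation itself is a fixed-point sweep: layers are visited in sorted order, and whenever a layer has to change a producer's scale factor the sweep restarts, with a guard against infinite loops. A much simplified, self-contained sketch of that loop (illustrative only; the real ScaleFactorCalculator works on CNNLayerPtr objects through ScaleFactorPerLayer):

    #include <cstddef>
    #include <iostream>
    #include <stdexcept>
    #include <vector>

    struct Layer {
        float scale = 1.0f;
        float maxAllowedScale = 4.0f;  // pretend hardware limit for the example
    };

    // Process one layer; return false if an earlier layer's scale had to change,
    // which forces the sweep over the sorted list to restart (mirrors needRestart).
    bool processLayer(std::vector<Layer>& net, std::size_t i) {
        if (net[i].scale > net[i].maxAllowedScale && i > 0) {
            net[i - 1].scale /= 2.0f;   // requantize the producer
            net[i].scale /= 2.0f;
            return false;               // restart required
        }
        return true;
    }

    void propagateScaleFactors(std::vector<Layer>& sortedNet, int maxRestarts = 100) {
        int restarts = 0;
        for (std::size_t i = 0; i < sortedNet.size();) {
            if (processLayer(sortedNet, i)) {
                ++i;                    // next layer in topological order
            } else {
                if (++restarts > maxRestarts) {
                    throw std::runtime_error("infinite loop in scale factor propagation");
                }
                i = 0;                  // restart the sweep from the beginning
            }
        }
    }

    int main() {
        std::vector<Layer> net{{2.0f}, {8.0f}, {3.0f}};
        propagateScaleFactors(net);
        for (const auto& l : net) std::cout << l.scale << ' ';
        std::cout << '\n';
    }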
@@ -22,6 +22,10 @@
#include "round_float_define.hpp"

namespace GNAPluginNS {

template<typename QUANT_DESC>
class ScaleFactorCalculator;

namespace frontend {
static const float MIN_SEARCH_WEIGHTS_VAL = 1.0f;
static const float MAX_SEARCH_WEIGHTS_VAL = 1024.0f;
@@ -133,7 +137,8 @@ static float selectBestWeightsScaleFactors(float inScale, float outScale, std::v
for (size_t j = 0; j < slopes.size(); ++j) {
auto s = gna_slope(slopes[j], inScale * weightScale, outScale);
auto slope = static_cast<uint32_t>(s.slope * s.slope_scale);
if (slope < static_cast<uint32_t>(std::numeric_limits<int16_t>::min()) && slope > static_cast<uint32_t>(std::numeric_limits<int16_t>::max())) {
if (slope < static_cast<uint32_t>(std::numeric_limits<int16_t>::min()) &&
slope > static_cast<uint32_t>(std::numeric_limits<int16_t>::max())) {
sd += std::numeric_limits<int8_t>::max();
continue;
}

@@ -206,24 +211,23 @@ static double calculateWeightsReducerFromDstStats(Quantization dst_quant) {
* @brief calculates output scale factor per layer
* @tparam T
*/
template<class T>
template<typename T, typename QUANT_DESC>
class ScaleFactorPerLayer {
public:
/**
* @brief calculates weights scale factor to fit dynamic range into target bitsize,
* also calculates output scale factor for the given layer
* @param cnnLayer
* @param weightsSize
* @param result
* @return
*/
bool operator()(T cnnLayer, int weightsSize, int inputsSize, ScaleFactorUpdateResult &result, bool fakeQuantize, int infiniteLoopCount) {
bool operator()(T cnnLayer, ScaleFactorUpdateResult &result, int infiniteLoopCount) {
return false;
}
};

template<>
class ScaleFactorPerLayer<InferenceEngine::CNNLayer *> {
template<typename QUANT_DESC>
class ScaleFactorPerLayer<InferenceEngine::CNNLayer*, QUANT_DESC> {
private :
const float activation_scale_factor = 2048.f;
const float low_prec_activation_scale_factor = 4.f;

@@ -450,11 +454,14 @@ class ScaleFactorPerLayer<InferenceEngine::CNNLayer *> {
}

public :
bool operator()(InferenceEngine::CNNLayer *cnnLayer, int weightsSize, int inputsSize, ScaleFactorUpdateResult &result, bool fakeQuantize,
int infiniteLoopCount) {
bool operator()(InferenceEngine::CNNLayer *cnnLayer, ScaleFactorUpdateResult &result, int infiniteLoopCount) {
if ( !cnnLayer ) {
IE_THROW() << "Incorrect Convolutional Layer pointer \n";
}

int inputsSize = ScaleFactorCalculator<QUANT_DESC>::GetInputsBytesSize();
bool fakeQuantize = ScaleFactorCalculator<QUANT_DESC>::IsFakeQuantize();

LayerInfo layerInfo(*cnnLayer);
// TODO: current approach set input scale factor for true input layer(s) equals to provided factor,
auto quant = InferenceEngine::getInjectedData<QuantizedLayerParams>(*cnnLayer);
@@ -656,8 +663,8 @@ class ScaleFactorPerLayer<InferenceEngine::CNNLayer *> {
}
};

template<>
class ScaleFactorPerLayer<InferenceEngine::EltwiseLayer*> {
template<typename QUANT_DESC>
class ScaleFactorPerLayer<InferenceEngine::EltwiseLayer*, QUANT_DESC> {
private:
bool requantizeEltwiseInput(InferenceEngine::EltwiseLayer* eltwiseLayer, uint8_t inputIx, int16_t maxValue,
bool fakeQuantize, ScaleFactorUpdateResult &result) {
@@ -726,11 +733,12 @@ class ScaleFactorPerLayer<InferenceEngine::EltwiseLayer*> {
}

public:
bool operator()(InferenceEngine::EltwiseLayer* eltwiseLayer, int weightsSize, int inputsSize, ScaleFactorUpdateResult &result,
bool fakeQuantize, int infiniteLoopCount) {
bool operator()(InferenceEngine::EltwiseLayer* eltwiseLayer, ScaleFactorUpdateResult &result, int infiniteLoopCount) {
if ( !eltwiseLayer ) {
THROW_GNA_EXCEPTION << "Incorrect Eltwise Layer pointer \n";
}
int inputsSize = ScaleFactorCalculator<QUANT_DESC>::GetInputsBytesSize();
bool fakeQuantize = ScaleFactorCalculator<QUANT_DESC>::IsFakeQuantize();
bool lowPrecision = (inputsSize == sizeof(int8_t));

auto in0 = InferenceEngine::CNNNetPrevLayer(eltwiseLayer, 0);

@@ -836,15 +844,16 @@ class ScaleFactorPerLayer<InferenceEngine::EltwiseLayer*> {
}
};

template<>
class ScaleFactorPerLayer<InferenceEngine::ConcatLayer*> {
template<typename QUANT_DESC>
class ScaleFactorPerLayer<InferenceEngine::ConcatLayer*, QUANT_DESC> {
public:
bool operator()(InferenceEngine::ConcatLayer* concatLayer, int weightsSize, int inputsSize, ScaleFactorUpdateResult &result,
bool fakeQuantize, int infiniteLoopCount) {
bool operator()(InferenceEngine::ConcatLayer* concatLayer, ScaleFactorUpdateResult &result, int infiniteLoopCount) {
if ( !concatLayer ) {
THROW_GNA_EXCEPTION << "Incorrect Concat Layer pointer \n";
}

bool fakeQuantize = ScaleFactorCalculator<QUANT_DESC>::IsFakeQuantize();

if (concatLayer->insData.size() < 2) {
THROW_GNA_EXCEPTION << "Concat layer has unsupported number of incoming layers.";
}

@@ -1061,8 +1070,8 @@ class ScaleFactorPerLayer<InferenceEngine::ConcatLayer*> {
}
};

template<>
class ScaleFactorPerLayer<InferenceEngine::WeightableLayer*> {
template<typename QUANT_DESC>
class ScaleFactorPerLayer<InferenceEngine::WeightableLayer*, QUANT_DESC> {
private:
std::vector<std::tuple<uint16_t const, float const, float const>> thresholds {
// tuple values: scale factor threshold, scale factor reduction factor for I16 precision, for I8 precision
@@ -1074,14 +1083,15 @@ class ScaleFactorPerLayer<InferenceEngine::WeightableLayer*> {
};

public:
bool operator()(InferenceEngine::WeightableLayer *wl, int weightsSize, int inputsSize, ScaleFactorUpdateResult &result,
bool fakeQuantize, int infiniteLoopCount) {
bool operator()(InferenceEngine::WeightableLayer *wl, ScaleFactorUpdateResult &result, int infiniteLoopCount) {
if ( !wl ) {
THROW_GNA_EXCEPTION << "Incorrect Weightable Layer pointer \n";
} else if (!wl->_weights) {
THROW_GNA_EXCEPTION << "Incorrect weight value for " << wl->name << ":" << wl->type << "\n";
}

int inputsSize = ScaleFactorCalculator<QUANT_DESC>::GetInputsBytesSize();
bool fakeQuantize = ScaleFactorCalculator<QUANT_DESC>::IsFakeQuantize();
auto prevLayer = CNNNetPrevLayer(wl);
auto quantDataForInputLayer =
InferenceEngine::getInjectedData<QuantizedLayerParams>(*InferenceEngine::CNNNetPrevLayer(wl).get());

@@ -1111,6 +1121,7 @@ class ScaleFactorPerLayer<InferenceEngine::WeightableLayer*> {
}

// TODO: pass 8 bits somehow
int weightsSize = ScaleFactorCalculator<QUANT_DESC>::GetMandatoryWeightsBytesSize(wl);
if (!quant->_weights_quant.IsScaleSet()) {
size_t scaleRange = 0;
if (weightsSize == 2) {
@@ -1217,19 +1228,20 @@ class ScaleFactorPerLayer<InferenceEngine::WeightableLayer*> {
}
};

template<>
class ScaleFactorPerLayer<InferenceEngine::ScaleShiftLayer*> : public ScaleFactorPerLayer<InferenceEngine::WeightableLayer*> {
template<typename QUANT_DESC>
class ScaleFactorPerLayer<InferenceEngine::ScaleShiftLayer*, QUANT_DESC> :
public ScaleFactorPerLayer<InferenceEngine::WeightableLayer*, QUANT_DESC> {
};

template<>
class ScaleFactorPerLayer<InferenceEngine::ConvolutionLayer*> : public ScaleFactorPerLayer<InferenceEngine::WeightableLayer*> {
template<typename QUANT_DESC>
class ScaleFactorPerLayer<InferenceEngine::ConvolutionLayer*, QUANT_DESC> :
public ScaleFactorPerLayer<InferenceEngine::WeightableLayer*, QUANT_DESC> {
};

template<>
class ScaleFactorPerLayer<InferenceEngine::GemmLayer*> {
template<typename QUANT_DESC>
class ScaleFactorPerLayer<InferenceEngine::GemmLayer*, QUANT_DESC> {
public:
bool operator() (InferenceEngine::GemmLayer* gemmLayer, int weightsSize, int inputSize, ScaleFactorUpdateResult &result,
bool fakeQuantize, int infiniteLoopCount) {
bool operator() (InferenceEngine::GemmLayer* gemmLayer, ScaleFactorUpdateResult &result, int infiniteLoopCount) {
if ( !gemmLayer ) {
THROW_GNA_EXCEPTION << "Incorrect Gemm Layer pointer \n";
}

@@ -1278,21 +1290,16 @@ public:
* @brief scale factor calculator will calculate only output scale factors for the layer
* if scale factor propagation is not possible, it will indicate a restart condition
*/
template<typename QUANT_DESC>
class ScaleFactorCalculator {
using Cnt = std::vector<InferenceEngine::CNNLayerPtr>;
Cnt net;
mutable Cnt::const_iterator idx;
mutable bool needRestart = false;
int mandWeightsBytesSize;
int optWeightsBytesSize;
bool isFakeQuantize;
int inputsBytesSize;
int infiniteLoopCount = 0;

public:
ScaleFactorCalculator(Cnt &net, int mandWeightsBytesSize, int optWeightsBytesSize, int inputsBytesSize, bool fakeQuantize)
: net(net), mandWeightsBytesSize(mandWeightsBytesSize), optWeightsBytesSize(optWeightsBytesSize),
inputsBytesSize(inputsBytesSize), isFakeQuantize(fakeQuantize) {
ScaleFactorCalculator(Cnt &net) : net(net) {
idx = std::begin(this->net);
}
bool needToRestart() const {
@@ -1311,13 +1318,7 @@ class ScaleFactorCalculator {
bool operator()(T ptr) const {
needRestart = false;
frontend::ScaleFactorUpdateResult result;
auto weightsBytesSize = mandWeightsBytesSize;

if (LayerInfo(ptr).isConvolution() || LayerInfo(ptr).isScaleShift()) {
weightsBytesSize = optWeightsBytesSize;
}

if (!frontend::ScaleFactorPerLayer<T>()(ptr, weightsBytesSize, inputsBytesSize, result, isFakeQuantize, infiniteLoopCount)) {
if (!frontend::ScaleFactorPerLayer<T, QUANT_DESC>()(ptr, result, infiniteLoopCount)) {
return false;
}
if (result) {
@@ -1337,6 +1338,39 @@ class ScaleFactorCalculator {
needRestart = true;
return true;
}
};

template<class T>
static int GetMandatoryWeightsBytesSize(T ptr) {
auto info = LayerInfo(ptr);
if (info.isConvolution() || info.isScaleShift()) {
return GetOptionalWeightsBytesSize();
}

if (IsFakeQuantize()) {
auto quantData = InferenceEngine::getInjectedData<QuantizedLayerParams>(*ptr);
if (quantData->_weights_quant.IsStatsSet()) {
if (quantData->_weights_quant.GetLevels() <= std::numeric_limits<uint8_t>::max()) {
return frontend::FakeQuantI8().getWeightsPrecision().size();
} else {
return frontend::FakeQuantI16().getWeightsPrecision().size();
}
}
}

return QUANT_DESC::mandatory().getWeightsPrecision().size();
}

static int GetOptionalWeightsBytesSize() {
return QUANT_DESC::optional().getWeightsPrecision().size();
}

static int GetInputsBytesSize() {
return QUANT_DESC::mandatory().getInputPrecision().size();
}

static bool IsFakeQuantize() {
return std::is_same<QUANT_DESC, FakeQuant>();
}
}; // class ScaleFactorCalculator

} // namespace GNAPluginNS

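The configuration that used to be threaded through constructors and operator() parameters (weights byte size, input byte size, the fakeQuantize flag) is now derived from the QUANT_DESC template parameter through the static helpers above, so every ScaleFactorPerLayer specialization can query it without extra arguments. A reduced standalone illustration of the pattern; the descriptor and calculator names below are invented for the example:

    #include <iostream>
    #include <type_traits>

    // Stand-ins for the quantization descriptors the calculator is parameterized with.
    struct I16Desc  { static constexpr int inputBytes = 2; static constexpr int weightsBytes = 2; };
    struct I8Desc   { static constexpr int inputBytes = 2; static constexpr int weightsBytes = 1; };
    struct FakeDesc { static constexpr int inputBytes = 2; static constexpr int weightsBytes = 2; };

    // The calculator exposes the configuration as static queries keyed on the descriptor type,
    // so per-layer code no longer receives weightsSize / inputsSize / fakeQuantize as arguments.
    template <typename QuantDesc>
    struct Calculator {
        static int GetInputsBytesSize()  { return QuantDesc::inputBytes; }
        static int GetWeightsBytesSize() { return QuantDesc::weightsBytes; }
        static bool IsFakeQuantize()     { return std::is_same<QuantDesc, FakeDesc>::value; }
    };

    // A per-layer routine pulls what it needs from the calculator's statics.
    template <typename QuantDesc>
    void computeLayerScaleFactor() {
        std::cout << "inputs: " << Calculator<QuantDesc>::GetInputsBytesSize()
                  << " B, weights: " << Calculator<QuantDesc>::GetWeightsBytesSize()
                  << " B, fake-quantized: " << std::boolalpha
                  << Calculator<QuantDesc>::IsFakeQuantize() << '\n';
    }

    int main() {
        computeLayerScaleFactor<I16Desc>();
        computeLayerScaleFactor<I8Desc>();
        computeLayerScaleFactor<FakeDesc>();
    }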
@@ -429,43 +429,7 @@ void GNAPlugin::InitGNADevice() {
graphCompiler.setGNAMemoryPtr(gnamem);
}

void GNAPlugin::UpdateGnaQuantModeFromNetwork(InferenceEngine::CNNNetwork & network) {
OV_ITT_SCOPED_TASK(itt::domains::GNA_LT, "UpdateGnaQuantModeFromNetwork");
// fp32 emulation mode doesn't need any modifications to the configuration
if (config.gnaFlags.sw_fp32) return;

// search for FQ layers
// only supports cases of int16 or int8
auto it = details::CNNNetworkIterator(network), end = details::CNNNetworkIterator();
for (; it != end; it++) {
if (!LayerInfo(*it).isFakeQuantize()) {
continue;
}

GNAFakeQuantizeLayer fqLayer(*it);
auto inputLayer = fqLayer.getInputLayer();

// this fake quantize represents data quantization - not weights
if (!LayerInfo(inputLayer).isConst()) {
continue;
}
// also in mixed mode i8 should be stated as target precision
if (fqLayer.getLevels() <= std::numeric_limits<uint8_t>::max()) {
config.gnaPrecision = InferenceEngine::Precision::I8;
} else if (fqLayer.getLevels() <= std::numeric_limits<uint16_t>::max()) {
config.gnaPrecision = InferenceEngine::Precision::I16;
} else {
THROW_GNA_LAYER_EXCEPTION(*it)
<< "unsupported quantisation scheme: number of levels is " << fqLayer.getLevels() << " while only up to "
<< std::numeric_limits<uint16_t>::max() << " is supported";
}

gnaFlags->fake_quantized = true;
config.gnaFlags.fake_quantized = true;
}
}

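UpdateGnaQuantModeFromNetwork maps the level count of a weight FakeQuantize to the target GNA precision: up to 255 levels selects I8, up to 65535 selects I16, and anything larger is rejected. A tiny worked example of the same thresholds, independent of the plugin:

    #include <cstdint>
    #include <iostream>
    #include <limits>
    #include <stdexcept>

    enum class Precision { I8, I16 };

    // Same thresholds as UpdateGnaQuantModeFromNetwork: the level count of a
    // weight FakeQuantize decides the target GNA precision.
    Precision precisionFromLevels(std::uint32_t levels) {
        if (levels <= std::numeric_limits<std::uint8_t>::max()) {         // <= 255
            return Precision::I8;
        } else if (levels <= std::numeric_limits<std::uint16_t>::max()) { // <= 65535
            return Precision::I16;
        }
        throw std::runtime_error("unsupported quantisation scheme: too many levels");
    }

    int main() {
        std::cout << (precisionFromLevels(255)   == Precision::I8)  << '\n';  // 1: I8 path
        std::cout << (precisionFromLevels(65535) == Precision::I16) << '\n';  // 1: I16 path
        try {
            precisionFromLevels(100000);
        } catch (const std::exception& e) {
            std::cout << e.what() << '\n';
        }
    }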
void GNAPlugin::UpdateInputScaleFromNetwork(InferenceEngine::CNNNetwork & network) {
void GNAPlugin::UpdateInputScaleFromNetwork(InferenceEngine::CNNNetwork& network) {
OV_ITT_SCOPED_TASK(itt::domains::GNA_LT, "UpdateInputScaleFromNetwork");
// fp32 emulation mode doesn't need any modifications to the configuration
if (config.gnaFlags.sw_fp32) return;
@@ -480,6 +444,7 @@ void GNAPlugin::UpdateInputScaleFromNetwork(InferenceEngine::CNNNetwork & networ
if (!LayerInfo(nextToInputLayer.second).isFakeQuantize()) {
continue;
}

// replacing scale factor from this fq layer
GNAFakeQuantizeLayer fqLayer(nextToInputLayer.second);
auto inputRange = fqLayer.getInputRange();

@@ -714,12 +679,13 @@ void GNAPlugin::LoadNetwork(CNNNetwork & _network) {
}

bool isNgraphPassesUsed = false;

bool fake_quantized = false;
if (_network.getFunction()) {
CNNNetwork clonedNetwork = InferenceEngine::cloneNetwork(_network);
const auto& graph = clonedNetwork.getFunction();
ngraph::pass::Manager manager;
manager.register_pass<ngraph::pass::InitNodeInfo>();
fake_quantized = ngraph::op::util::has_op_with_type<ngraph::opset7::FakeQuantize>(graph);
// WA: ConvertPriorBox must be executed before the 1st ConstantFolding pass
manager.register_pass<ngraph::pass::ConvertPriorBox>();
manager.register_pass<ngraph::pass::CommonOptimizations>();
@@ -783,9 +749,9 @@ void GNAPlugin::LoadNetwork(CNNNetwork & _network) {
THROW_GNA_EXCEPTION << error.c_str();
}

// FQ networks now replace certain flags in the plugin - the flags will be overwritten
UpdateGnaQuantModeFromNetwork(network);
UpdateInputScaleFromNetwork(network);
if (fake_quantized) {
UpdateInputScaleFromNetwork(network);
}

// Set input and output information from original network
UpdateInputsAndOutputsInfoFromNetwork(network);

@@ -849,19 +815,9 @@ void GNAPlugin::LoadNetwork(CNNNetwork & _network) {
// to run all passes, two calls to the pass manager are needed
run_passes(newNet, true, gnaFlags->input_low_precision);
run_passes(newNet, false, gnaFlags->input_low_precision);
} else if (gnaFlags->fake_quantized) {
switch (config.gnaPrecision) {
case Precision::I16:
ModelQuantizer<FakeQuantI16> q16;
newNet = q16.quantize(network, run_passes, inputsDesc->inputScaleFactors);
break;
case Precision::I8:
ModelQuantizer<FakeQuantI8> q8;
newNet = q8.quantize(network, run_passes, inputsDesc->inputScaleFactors);
break;
default:
THROW_GNA_EXCEPTION << "unsupported GNA precision for quantisation: " << config.gnaPrecision;
}
} else if (fake_quantized) {
ModelQuantizer<FakeQuant> modelQuantizer;
newNet = modelQuantizer.quantize(network, run_passes, inputsDesc->inputScaleFactors);
} else {
switch (config.gnaPrecision) {
case Precision::I16:

@@ -211,8 +211,7 @@ class GNAPlugin : public InferenceEngine::IInferencePlugin {
int idx = 0);

void UpdateFieldsFromConfig();
void UpdateGnaQuantModeFromNetwork(InferenceEngine::CNNNetwork &);
void UpdateInputScaleFromNetwork(InferenceEngine::CNNNetwork &);
void UpdateInputScaleFromNetwork(InferenceEngine::CNNNetwork& network);
void UpdateInputsAndOutputsInfoFromNetwork(InferenceEngine::CNNNetwork &);
/**
* @brief Tries to init an output on the base of a layer data

@@ -0,0 +1,30 @@
// Copyright (C) 2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#include <vector>

#include "common_test_utils/test_constants.hpp"
#include "subgraph_tests/fq_with_mixed_levels.hpp"

namespace SubgraphTestsDefinitions {
namespace {
const std::vector<InferenceEngine::Precision> netPrecisions = {
InferenceEngine::Precision::FP32,
InferenceEngine::Precision::FP16
};

const std::vector<std::map<std::string, std::string>> configs = {
{
{"GNA_DEVICE_MODE", "GNA_SW_EXACT"}
}
};

INSTANTIATE_TEST_SUITE_P(smoke_FqWithMixedLevelsTest, FqWithMixedLevelsTest,
::testing::Combine(
::testing::ValuesIn(netPrecisions),
::testing::Values(CommonTestUtils::DEVICE_GNA),
::testing::ValuesIn(configs)),
FqWithMixedLevelsTest::getTestCaseName);
} // namespace
} // namespace SubgraphTestsDefinitions

@@ -0,0 +1,18 @@
// Copyright (C) 2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#ifndef TEST_FQ_WITH_MIXED_LEVELS_HPP
#define TEST_FQ_WITH_MIXED_LEVELS_HPP

#include "shared_test_classes/subgraph/fq_with_mixed_levels.hpp"

namespace SubgraphTestsDefinitions {

TEST_P(FqWithMixedLevelsTest, CompareWithRefImpl) {
Run();
};

} // namespace SubgraphTestsDefinitions

#endif // TEST_FQ_WITH_MIXED_LEVELS_HPP

@@ -0,0 +1,36 @@
// Copyright (C) 2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#ifndef FQ_WITH_MIXED_LEVELS_HPP
#define FQ_WITH_MIXED_LEVELS_HPP

#include <memory>
#include <string>
#include <tuple>
#include <vector>

#include "shared_test_classes/base/layer_test_utils.hpp"
#include "ngraph_functions/builders.hpp"
#include "ngraph_functions/utils/ngraph_helpers.hpp"

namespace SubgraphTestsDefinitions {

typedef std::tuple<
InferenceEngine::Precision, // Network Precision
std::string, // Target Device
std::map<std::string, std::string> // Configuration
> FqWithMixedLevelsParams;

class FqWithMixedLevelsTest : public testing::WithParamInterface<FqWithMixedLevelsParams>,
public LayerTestsUtils::LayerTestsCommon {
public:
static std::string getTestCaseName(const testing::TestParamInfo<FqWithMixedLevelsParams>& obj);

protected:
void SetUp() override;
};

} // namespace SubgraphTestsDefinitions

#endif // FQ_WITH_MIXED_LEVELS_HPP

@@ -0,0 +1,74 @@
// Copyright (C) 2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#include "shared_test_classes/subgraph/fq_with_mixed_levels.hpp"
#include "ngraph_functions/builders.hpp"

namespace SubgraphTestsDefinitions {

std::string FqWithMixedLevelsTest::getTestCaseName(const testing::TestParamInfo<FqWithMixedLevelsParams>& obj) {
InferenceEngine::Precision netPrecision;
std::string targetDevice;
std::map<std::string, std::string> configuration;
std::tie(netPrecision, targetDevice, configuration) = obj.param;

std::ostringstream result;
result << "netPRC=" << netPrecision.name() << "_";
result << "targetDevice=" << targetDevice;
for (auto const& configItem : configuration) {
result << "_configItem=" << configItem.first << "_" << configItem.second;
}
return result.str();
}

void FqWithMixedLevelsTest::SetUp() {
InferenceEngine::Precision netPrecision;
std::map<std::string, std::string> tempConfig;
std::tie(netPrecision, targetDevice, tempConfig) = this->GetParam();
configuration.insert(tempConfig.begin(), tempConfig.end());

auto ngPrc = FuncTestUtils::PrecisionUtils::convertIE2nGraphPrc(netPrecision);
auto unit = [=](const std::shared_ptr<ngraph::Node>& input,
const std::vector<std::vector<size_t>>& shapes,
float weights_min, float weights_max,
size_t level1, const std::vector<std::vector<float>>& data1,
size_t level2, const std::vector<std::vector<float>>& data2,
size_t level3, const std::vector<std::vector<float>>& data3) {
auto sigmoid = std::make_shared<ngraph::opset7::Sigmoid>(input);
auto fake1 = ngraph::builder::makeFakeQuantize(sigmoid, ngPrc, level1, { 1 }, data1[0], data1[1], data1[2], data1[3]);
std::vector<float> weights = CommonTestUtils::generate_float_numbers(shapes[1][0] * shapes[1][1], weights_min, weights_max);
auto constant = std::make_shared<ngraph::opset7::Constant>(ngPrc, ngraph::Shape{shapes[1][0], shapes[1][1]}, weights);
auto fake2 = ngraph::builder::makeFakeQuantize(constant, ngPrc, level2, { 1 }, data2[0], data2[1], data2[2], data2[3]);
auto matmul = ngraph::builder::makeMatMul(fake1, fake2, false, true);
auto bias = ngraph::builder::makeConstant(ngPrc, std::vector<size_t>{shapes[0][0], shapes[1][0]}, std::vector<float>{ 1.0 });
auto add = ngraph::builder::makeEltwise(matmul, bias, ngraph::helpers::EltwiseTypes::ADD);
return ngraph::builder::makeFakeQuantize(add, ngPrc, level3, { 1 }, data3[0], data3[1], data3[2], data3[3]);
};

auto params = ngraph::builder::makeParams(ngPrc, {{ 1, 8 }});
auto input = ngraph::builder::makeFakeQuantize(params[0], ngPrc, std::numeric_limits<uint32_t>::max(), { 1 },
{ -10. }, { 10. }, { -10. }, { 10. });
input = unit(input,
{{1, 8}, {8, 8}},
-20., 20.,
std::numeric_limits<uint16_t>::max(), {{ -1.0 }, { 1.0 }, { -1.0 }, { 1.0 }},
std::numeric_limits<uint8_t>::max(), {{ -2.5 }, { 2.5 }, { -2.5 }, { 2.5 }},
std::numeric_limits<uint32_t>::max(), {{ -5. } , { 5. }, { -5. }, { 5. }});
input = unit(input,
{{ 1, 8 }, { 8, 8 }},
-13., 13.,
std::numeric_limits<uint16_t>::max(), {{ -1.0 }, { 1.0 }, { -1.0 }, { 1.0 }},
std::numeric_limits<uint16_t>::max(), {{ -2.5 }, { 2.5 }, { -2.5 }, { 2.5 }},
std::numeric_limits<uint32_t>::max(), {{ -5. } , { 5. }, { -5. }, { 5. }});
input = unit(input,
{{1, 8}, {8, 8}},
-20., 20.,
std::numeric_limits<uint16_t>::max(), {{ -1.0 }, { 1.0 }, { -1.0 }, { 1.0 }},
std::numeric_limits<uint8_t>::max(), {{ -2.5 }, { 2.5 }, { -2.5 }, { 2.5 }},
std::numeric_limits<uint32_t>::max(), {{ -5. } , { 5. }, { -5. }, { 5. }});
auto result = std::make_shared<ngraph::opset7::Result>(input);
function = std::make_shared<ngraph::Function>(ngraph::ResultVector{result}, params, "FqWithMixedLevelsTest");
}

} // namespace SubgraphTestsDefinitions