[GNA] Remove extra FQ layers from the final network (#10599)

* [GNA] Fuse all FakeQuantize layers with their previous layers

* [GNA] Fuse FQ with previous layer if it's not required for precision change

* [GNA] Fixed MatMulOverloadCorrectionTest
Elizaveta Lobanova 2022-03-31 13:21:27 +03:00 committed by GitHub
parent 79e3272237
commit 3578ee9c3f
7 changed files with 76 additions and 66 deletions

View File

@@ -229,7 +229,7 @@ void make_gna_pwl(const DnnActivation& fun,
     int32_t x_upper = INT32_MAX;
     int16_t y_lower = y_min;
     int16_t y_upper = y_max;
-    if (fun == kActFakeQuantize && fun.fqParams.set) {
+    if ((fun == kActFakeQuantize || fun == kActIdentity) && fun.fqParams.set) {
         x_lower = std::max(static_cast<int64_t>(*fun.fqParams.input_low * in_scale), static_cast<int64_t>(x_lower));
         x_upper = std::min(static_cast<int64_t>(*fun.fqParams.input_high * in_scale), static_cast<int64_t>(x_upper));
         y_lower = std::max(static_cast<int32_t>(*fun.fqParams.input_low * out_scale), static_cast<int32_t>(y_lower));
@@ -253,7 +253,7 @@ void make_gna_pwl(const DnnActivation& fun,
                 x_upper = FLOAT_TO_INT32(y_upper * in_scale / out_scale);
             }
         }
-    } else if (fun == kActIdentity) {
+    } else if (fun == kActIdentity && !fun.fqParams.set) {
         if (x_lower < y_lower * in_scale / out_scale) x_lower = FLOAT_TO_INT32(y_lower * in_scale / out_scale);
         if (x_upper > y_upper * in_scale / out_scale) x_upper = FLOAT_TO_INT32(y_upper * in_scale / out_scale);
         if (y_lower < x_lower * out_scale / in_scale) y_lower = FLOAT_TO_INT16(x_lower * out_scale / in_scale);
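
For context, the clamping in this hunk reduces to scaling the FakeQuantize range into the integer input/output domains and narrowing the default saturation limits. A minimal standalone sketch of that arithmetic (the scale factors and FQ range are made-up example values, not taken from the plugin):

    #include <algorithm>
    #include <cstdint>
    #include <iostream>

    int main() {
        // Example values only: in_scale/out_scale come from quantization in the real code.
        const float in_scale = 16384.0f, out_scale = 2048.0f;
        const float fq_low = -1.0f, fq_high = 1.0f;  // FakeQuantize input range

        // Default saturation limits of the PWL input (int32) and output (int16).
        int64_t x_lower = INT32_MIN, x_upper = INT32_MAX;
        int32_t y_lower = INT16_MIN, y_upper = INT16_MAX;

        // Narrow the limits by the FQ range scaled into each domain, mirroring
        // the branch now taken for both kActFakeQuantize and kActIdentity.
        x_lower = std::max(static_cast<int64_t>(fq_low * in_scale), x_lower);
        x_upper = std::min(static_cast<int64_t>(fq_high * in_scale), x_upper);
        y_lower = std::max(static_cast<int32_t>(fq_low * out_scale), y_lower);
        y_upper = std::min(static_cast<int32_t>(fq_high * out_scale), y_upper);

        std::cout << "x in [" << x_lower << ", " << x_upper << "], y in ["
                  << y_lower << ", " << y_upper << "]\n";  // x in [-16384, 16384], y in [-2048, 2048]
    }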

View File

@@ -538,7 +538,8 @@ class ScaleFactorPerLayer<InferenceEngine::CNNLayer*, QUANT_DESC> {
     auto maxOutValue = quantizedParams->_dst_quant.GetMaxValues().front();
     auto absMax = std::max(std::abs(minOutValue), std::abs(maxOutValue));
-    result = CalculateScaleFactorFromStats(quantizedParams->_dst_quant.GetLevels(), minOutValue, maxOutValue);
+    auto levels = std::min(quantizedParams->_dst_quant.GetLevels(), static_cast<size_t>(std::numeric_limits<uint16_t>::max()) + 1);
+    result = CalculateScaleFactorFromStats(levels, minOutValue, maxOutValue);
     if (std::isinf(result) || fp32eq(absMax, 0.0f)) {
         result = max_activation_scale_factor;
     }
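
The cap on the level count matters because GNA activation outputs are at most 16-bit, while statistics propagated from a fused FQ can advertise far more levels. A hedged sketch of the computation (the (levels - 1) / range formula is an assumption about CalculateScaleFactorFromStats, not a copy of it):

    #include <algorithm>
    #include <cstddef>
    #include <cstdint>
    #include <iostream>
    #include <limits>

    // Cap the level count at 2^16 before deriving a scale factor from stats.
    float calcScaleFactor(std::size_t levels, float minVal, float maxVal) {
        const std::size_t maxLevels =
            static_cast<std::size_t>(std::numeric_limits<std::uint16_t>::max()) + 1;  // 65536
        levels = std::min(levels, maxLevels);
        const float range = maxVal - minVal;
        // An infinite result signals the caller to fall back to max_activation_scale_factor.
        return range == 0.0f ? std::numeric_limits<float>::infinity()
                             : (levels - 1) / range;
    }

    int main() {
        // A fused FQ may report 2^20 levels; the cap keeps the factor within 16-bit reach.
        std::cout << calcScaleFactor(std::size_t{1} << 20, -1.0f, 1.0f) << "\n";  // 32767.5
    }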

View File

@@ -74,7 +74,8 @@ static const char softSignLayersCounter[] = "numSoftSignLayers";
 static void insertDiagonalLayerBetween(InferenceEngine::CNNLayerPtr prevLayer,
                                        InferenceEngine::CNNLayerPtr nextLayer,
                                        std::shared_ptr<IPassManager> passmanager,
-                                       float fillValue) {
+                                       float fillValue,
+                                       size_t in_data_idx = invalid_data_idx) {
     auto quantized = InferenceEngine::getInjectedData<QuantizedLayerParams>(prevLayer);
     auto diagName = std::string("SyntheticScaleShift_") + std::to_string(passmanager->getIntVar(diagonalLayersCounterName)++);
     gnalog() << "Inserted Diagonal Layer " << diagName <<" between: " << prevLayer->name << " and " << nextLayer->name << "\n" << std::flush;
@@ -104,7 +105,7 @@ static void insertDiagonalLayerBetween(InferenceEngine::CNNLayerPtr prevLayer,
     getCreatorLayer(dataPtr) = diagonalWithQuant;
     diagonalWithQuant->outData.push_back(dataPtr);
     // actual insertion
-    CNNNetworkInsertLayer(prevLayer, nextLayer, diagonalWithQuant);
+    CNNNetworkInsertLayer(prevLayer, nextLayer, diagonalWithQuant, invalid_data_idx, in_data_idx);
 }
 /**
@@ -137,7 +138,8 @@ static bool hasNextFuncLayer(const CNNLayerPtr layer) {
     });
 }
-static std::vector<CNNLayerPtr> getCandidatesForIdentityInsertion(const CNNLayerPtr l, std::shared_ptr<IPassManager> passmanager) {
+static std::vector<CNNLayerPtr> getCandidatesForIdentityInsertion(const CNNLayerPtr l, std::shared_ptr<IPassManager> passmanager,
+                                                                  bool skipFq = false) {
     std::vector<CNNLayerPtr> prevLayers;
     // skipping memory inputs and true inputs layers
@@ -146,9 +148,9 @@ static std::vector<CNNLayerPtr> getCandidatesForIdentityInsertion(const CNNLayer
     auto eltwise = dynamic_cast<InferenceEngine::EltwiseLayer *>(l.get());
     auto concat = dynamic_cast<InferenceEngine::ConcatLayer *>(l.get());
-    auto PrevFunctionalLayer = [](CNNLayerPtr l, int idx = 0) {
-        auto prevLayer = CNNNetPrevLayerSkipCertain(l, idx, [](CNNLayerPtr ptr) {
-            return LayerInfo(ptr).isNonFunctional();
+    auto PrevFunctionalLayer = [skipFq](CNNLayerPtr l, int idx = 0) {
+        auto prevLayer = CNNNetPrevLayerSkipCertain(l, idx, [skipFq](CNNLayerPtr ptr) {
+            return LayerInfo(ptr).isNonFunctional() || skipFq && LayerInfo(ptr).isFakeQuantize();
         });
         gnalog() << "CNNNetPrevLayerSkipCertain for :: " << l->name << "returned: " << prevLayer->name << std::endl;
         return prevLayer;
@@ -308,7 +310,7 @@ void InsertDiagonalLayerPass::run() {
             continue;
         }
         auto prevDirectLayer = CNNNetPrevLayer(l, 0);
-        insertDiagonalLayerBetween(prevDirectLayer, l, getPassManager(), 1.f);
+        insertDiagonalLayerBetween(prevDirectLayer, l, getPassManager(), 1.f, 0);
     }
 }
@@ -820,18 +822,6 @@ void InsertIdentityLayerPass::run() {
     gnalog() << "Inserted "<< identityLayer->name << " between: " << prev->name << " and " << true_layer->name << "\n" << std::flush;
-    // wether 1 identity or all outputs TODO possible grouping here, need to implement special grouped inserter
-    bool notAll = false;
-    for (auto && nextData : prev->outData) {
-        for (auto && nextLayer : getInputTo(nextData)) {
-            if (nextLayer.second.get() == l.get())
-                continue;
-            if (getCandidatesForIdentityInsertion(nextLayer.second, getPassManager()).empty() &&
-                hasNextFuncLayer(nextLayer.second)) {
-                notAll = true;
-            }
-        }
-    }
     // copy offset - to be used while connecting outputs
     if (prev->params.find("output_offset") != prev->params.end()) {
         identityLayer->params["output_offset"] = prev->params["output_offset"];
@@ -841,7 +831,7 @@ void InsertIdentityLayerPass::run() {
         identityLayer->params["original_num_rows"] = prev->params["original_num_rows"];
     }
-    CNNNetworkInsertLayer(prev, notAll ? true_layer : CNNLayerPtr(nullptr), identityLayer);
+    CNNNetworkInsertLayer(prev, CNNLayerPtr(nullptr), identityLayer);
 }
 }
 }
@@ -1341,6 +1331,12 @@ void InsertSplitAligningFilterPass::run() {
     if (getInputTo(splitOutput).empty()) {
         gnalog() << "Output port: " << splitOutIndex << " of " << l->name << " unconnected, skipping\n";
     } else {
+        auto lastDimSize = GetDataDimSize(splitOutput, 1);
+        if (lastDimSize != outputSize) {
+            THROW_GNA_EXCEPTION << l->name << " Convolution Filter doesn't support these input dimensions: lastDimSize="
+                                << lastDimSize << ", outputSize=" << outputSize;
+        }
         // this split output not beginning from 64 bytes aligned boundary - need to correct by aligning filter layer
         // insert the filter
         auto filterName = std::string("AlignFilter_") + std::to_string(numOfFilterLayers++);
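
For reference, the condition behind this branch is the 64-byte alignment comment in the hunk above: GNA reads each layer input from a 64-byte-aligned address, so a split output that starts at an unaligned byte offset needs a synthetic affine filter to shift the data. A toy illustration of just the alignment test (the offsets are invented):

    #include <cstddef>
    #include <iostream>

    // True when a split output would need the AlignFilter inserted above.
    bool needsAlignFilter(std::size_t byteOffset) {
        return byteOffset % 64 != 0;
    }

    int main() {
        for (std::size_t off : {std::size_t{0}, std::size_t{64}, std::size_t{100}})
            std::cout << off << " -> " << (needsAlignFilter(off) ? "insert AlignFilter" : "aligned") << "\n";
    }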
@@ -2054,32 +2050,25 @@ void MoveFakeQuantizeLayerIntoQuantParamsPass :: run() {
         return false;
     };
-    auto allowFQFuse = [](CNNLayerPtr layer) -> bool {
-        auto doNotSkip = [](CNNLayerPtr layer) {
-            return false;
+    auto allowFQFuse = [this](CNNLayerPtr layer) -> bool {
+        auto skipNonFunctionalOrMemory = [](CNNLayerPtr layer) {
+            return LayerInfo(layer).isNonFunctional() || LayerInfo(layer).isMemory();
         };
-        if (CNNNetGetAllNextLayersSkipCertain(layer, -1, doNotSkip).empty()) {
-            return false;
-        }
         auto skipNonFunctional = [](CNNLayerPtr layer) {
             return LayerInfo(layer).isNonFunctional();
         };
-        auto prevLayer = CNNNetPrevLayerSkipCertain(layer, 0, skipNonFunctional);
-        if (LayerInfo(prevLayer).isActivation() || LayerInfo(prevLayer).isConst() || LayerInfo(prevLayer).isMemory()) {
-            return true;
+        // Don't fuse FQ if it's the output layer for the network
+        if (CNNNetGetAllNextLayersSkipCertain(layer, -1, skipNonFunctionalOrMemory).empty()) {
+            return false;
         }
+        // Fuse FQ if it's not required to change precision from int32 to int16
         auto nextLayers = CNNNetGetAllNextLayersSkipCertain(layer, -1, skipNonFunctional);
         for (auto& l : nextLayers) {
-            if (!LayerInfo(l).isActivation()) {
-                return false;
+            if (getCandidatesForIdentityInsertion(l, getPassManager(), true).empty()) {
+                return true;
             }
         }
-        return true;
+        return false;
     };
     std::function<void(QuantizedLayerParams*, CNNLayerPtr)> propagateStatistics =
@@ -2213,8 +2202,7 @@ void MoveFakeQuantizeLayerIntoQuantParamsPass :: run() {
         }
         // Allow FQ Fuse checks if FQ layer can be fused to a layer before or after.
-        // FQ Layer is fused only when previous layer is const, memory or activation layer
-        // or a next layer is activation layer.
+        // FQ Layer is fused if it's not required for precision change.
         bool isFQFuseAllowed = allowFQFuse(l);
         auto prevData = *prevDataIt;
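
Taken together, the new allowFQFuse boils down to: keep the FQ when it drives a network output, or when every consumer would still need an identity activation for the int32-to-int16 precision change; otherwise fold its statistics into the neighbouring layers and drop it. A minimal standalone sketch of that decision (Layer and needsIdentity are illustrative stand-ins, not the plugin's types):

    #include <iostream>
    #include <vector>

    // Illustrative stand-in for a graph node; not the GNA plugin API.
    struct Layer {
        bool needsIdentity = false;  // a precision-changing activation would be required here
    };

    // Sketch of the fusion rule: fuse unless the FQ feeds a network output
    // (no functional consumers) or every consumer still needs the precision change.
    bool allowFQFuse(const std::vector<Layer>& consumers) {
        if (consumers.empty())
            return false;  // FQ feeds a network output: keep it
        for (const auto& c : consumers)
            if (!c.needsIdentity)
                return true;  // some consumer can absorb the FQ statistics directly
        return false;
    }

    int main() {
        std::cout << std::boolalpha
                  << allowFQFuse({}) << "\n"                            // false: network output
                  << allowFQFuse({Layer{true}}) << "\n"                 // false: precision change needed
                  << allowFQFuse({Layer{false}, Layer{true}}) << "\n";  // true: fuse
    }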

View File

@@ -93,7 +93,7 @@ protected:
     const ngraph::Shape shape = {1, 128};
     auto params = ngraph::builder::makeParams(ngPrc, {shape});
-    auto lowNodeIn = ngraph::builder::makeConstant<float>(ngPrc, {1}, { 100 * inputDataMin });
+    auto lowNodeIn = ngraph::builder::makeConstant<float>(ngPrc, {1}, { 100 * -inputDataMax });
     auto highNodeIn = ngraph::builder::makeConstant<float>(ngPrc, {1}, { 100 * inputDataMax });
     auto fqIn = std::make_shared<ngraph::opset8::FakeQuantize>(params[0], lowNodeIn, highNodeIn,
         lowNodeIn, highNodeIn, levels16);

View File

@@ -67,6 +67,8 @@ protected:
     const ngraph::Shape shape2 = {1, inputShape[1] * inputShape[1]};
     const float maxInputValue = 10.0f;
     auto params = ngraph::builder::makeParams(ngPrc, {shape1});
+    auto relu = std::make_shared<ngraph::opset8::Relu>(params[0]);
     std::shared_ptr<ngraph::Node> input2;
     if (isSecondInputConst) {
         input2 = ngraph::builder::makeConstant<float>(ngPrc, ngraph::Shape{shape1[1], shape1[1]},
@@ -78,7 +80,7 @@ protected:
     auto lowNodeIn1 = ngraph::builder::makeConstant<float>(ngPrc, {1}, { -maxInputValue });
     auto highNodeIn1 = ngraph::builder::makeConstant<float>(ngPrc, {1}, { maxInputValue });
-    auto fqIn1 = std::make_shared<ngraph::opset8::FakeQuantize>(params[0], lowNodeIn1, highNodeIn1,
+    auto fqIn1 = std::make_shared<ngraph::opset8::FakeQuantize>(relu, lowNodeIn1, highNodeIn1,
         lowNodeIn1, highNodeIn1, levels16);
     auto lowNodeIn2 = ngraph::builder::makeConstant<float>(ngPrc, {1}, { -maxInputValue });
@@ -138,4 +140,4 @@ INSTANTIATE_TEST_SUITE_P(smoke_base, MatMulOverloadCorrectionTest,
     ::testing::ValuesIn({true, false}),
     ::testing::ValuesIn({true, false})),
     MatMulOverloadCorrectionTest::getTestCaseName);
-} // namespace LayerTestsDefinitions
\ No newline at end of file
+} // namespace LayerTestsDefinitions

View File

@@ -56,22 +56,23 @@ protected:
     auto ngPrc = FuncTestUtils::PrecisionUtils::convertIE2nGraphPrc(netPrecision);
     auto params = ngraph::builder::makeParams(ngPrc, {inputShape});
+    auto relu = std::make_shared<ngraph::opset8::Relu>(params[0]);
     auto fq1 = std::make_shared<ngraph::opset8::FakeQuantize>(
-        params[0],
-        ngraph::opset8::Constant::create(ngraph::element::f32, {1}, {1.}),
-        ngraph::opset8::Constant::create(ngraph::element::f32, {1}, {1.}),
-        ngraph::opset8::Constant::create(ngraph::element::f32, {1}, {1.}),
-        ngraph::opset8::Constant::create(ngraph::element::f32, {1}, {1.}),
-        255);
+        relu,
+        ngraph::opset8::Constant::create(ngraph::element::f32, {1}, {-10.}),
+        ngraph::opset8::Constant::create(ngraph::element::f32, {1}, {10.}),
+        ngraph::opset8::Constant::create(ngraph::element::f32, {1}, {-10.}),
+        ngraph::opset8::Constant::create(ngraph::element::f32, {1}, {10.}),
+        static_cast<uint32_t>(std::numeric_limits<uint16_t>::max()) + 1);
     auto constant = ngraph::builder::makeConstant(ngPrc, constantShape, std::vector<float>{}, true);
     auto fq2 = std::make_shared<ngraph::opset8::FakeQuantize>(
         constant,
-        ngraph::opset8::Constant::create(ngraph::element::f32, {1}, {1}),
-        ngraph::opset8::Constant::create(ngraph::element::f32, {1}, {1.}),
-        ngraph::opset8::Constant::create(ngraph::element::f32, {1}, {1.}),
-        ngraph::opset8::Constant::create(ngraph::element::f32, {1}, {1.}),
-        255);
-    auto concat = ngraph::builder::makeConcat({fq1, fq2}, 0);
+        ngraph::opset8::Constant::create(ngraph::element::f32, {1}, {-10}),
+        ngraph::opset8::Constant::create(ngraph::element::f32, {1}, {10.}),
+        ngraph::opset8::Constant::create(ngraph::element::f32, {1}, {-10.}),
+        ngraph::opset8::Constant::create(ngraph::element::f32, {1}, {10.}),
+        static_cast<uint32_t>(std::numeric_limits<uint16_t>::max()) + 1);
+    auto concat = ngraph::builder::makeConcat({fq1, fq2}, 1);
     function = std::make_shared<ngraph::Function>(concat, params, "WeighableLayerWithoutFq");
 }
 }; // class WeighableLayerWithoutFqTest
@@ -91,7 +92,7 @@ const std::vector<std::vector<size_t>> inputShapes = {
 };
 const std::vector<std::vector<size_t>> constantShapes = {
-    {{16, 5}}
+    {{1, 16}}
 };
 const std::vector<std::map<std::string, std::string>> configs = {

View File

@@ -27,15 +27,33 @@ void MultipleInputTest::SetUp() {
     std::tie(targetDevice, netPrecision, inputSize, config) = this->GetParam();
     configuration.insert(config.begin(), config.end());
     auto ngPrc = FuncTestUtils::PrecisionUtils::convertIE2nGraphPrc(netPrecision);
+    const float minInput = -10.0;
+    const float maxInput = 10.0;
     auto input = ngraph::builder::makeParams(ngPrc, {{1, inputSize}, {1, inputSize}, {1, inputSize}});
-    auto fake1 = ngraph::builder::makeFakeQuantize(input[0], ngPrc, 255, { 1 }, { -0.5 }, { 0.5 }, { -0.5 }, { 0.5 });
-    auto mul1 = ngraph::builder::makeEltwise(input[0], fake1, ngraph::helpers::EltwiseTypes::ADD);
-    auto fake2 = ngraph::builder::makeFakeQuantize(input[1], ngPrc, 255, { 1 }, { -0.5 }, { 0.5 }, { -0.5 }, { 0.5 });
-    auto mul2 = ngraph::builder::makeEltwise(input[1], fake2, ngraph::helpers::EltwiseTypes::ADD);
-    auto mul3 = ngraph::builder::makeEltwise(mul1, mul2, ngraph::helpers::EltwiseTypes::ADD);
-    auto fake3 = ngraph::builder::makeFakeQuantize(input[2], ngPrc, 255, { 1 }, { -0.5 }, { 0.5 }, { -0.5 }, { 0.5 });
-    auto mul4 = ngraph::builder::makeEltwise(fake3, mul3, ngraph::helpers::EltwiseTypes::ADD);
-    auto result = std::make_shared<ngraph::opset7::Result>(mul4);
+    auto fake1 = ngraph::builder::makeFakeQuantize(input[0], ngPrc, std::numeric_limits<uint16_t>::max(), { 1 },
+        { minInput }, { maxInput }, { minInput }, { maxInput });
+    auto add1 = ngraph::builder::makeEltwise(input[0], fake1, ngraph::helpers::EltwiseTypes::ADD);
+    auto fake_add1 = ngraph::builder::makeFakeQuantize(add1, ngPrc, std::numeric_limits<uint16_t>::max(), { 1 },
+        { 2 * minInput }, { 2 * maxInput }, { 2 * minInput }, { 2 * maxInput });
+    auto fake2 = ngraph::builder::makeFakeQuantize(input[1], ngPrc, std::numeric_limits<uint16_t>::max(), { 1 },
+        { minInput }, { maxInput }, { minInput }, { maxInput });
+    auto add2 = ngraph::builder::makeEltwise(input[1], fake2, ngraph::helpers::EltwiseTypes::ADD);
+    auto fake_add2 = ngraph::builder::makeFakeQuantize(add2, ngPrc, std::numeric_limits<uint16_t>::max(), { 1 },
+        { 2 * minInput }, { 2 * maxInput }, { 2 * minInput }, { 2 * maxInput });
+    auto add3 = ngraph::builder::makeEltwise(fake_add1, fake_add2, ngraph::helpers::EltwiseTypes::ADD);
+    auto fake_add3 = ngraph::builder::makeFakeQuantize(add3, ngPrc, std::numeric_limits<uint16_t>::max(), { 1 },
+        { 4 * minInput }, { 4 * maxInput }, { 4 * minInput }, { 4 * maxInput });
+    auto fake3 = ngraph::builder::makeFakeQuantize(input[2], ngPrc, std::numeric_limits<uint16_t>::max(), { 1 },
+        { minInput }, { maxInput }, { minInput }, { maxInput });
+    auto add4 = ngraph::builder::makeEltwise(fake3, fake_add3, ngraph::helpers::EltwiseTypes::ADD);
+    auto fake_add4 = ngraph::builder::makeFakeQuantize(add4, ngPrc, std::numeric_limits<uint16_t>::max(), { 1 },
+        { 5 * minInput }, { 5 * maxInput }, { 5 * minInput }, { 5 * maxInput });
+    auto result = std::make_shared<ngraph::opset7::Result>(fake_add4);
     function = std::make_shared<ngraph::Function>(ngraph::ResultVector{result}, input, "multiple_input");
 }
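
The widened FQ ranges in this rewrite follow plain interval arithmetic on the Adds: summing two tensors bounded by [a, b] and [c, d] gives [a + c, b + d], which is where the 2x, 4x, and 5x multiples of the +/-10 input range come from. A quick check (illustrative only):

    #include <iostream>
    #include <utility>

    using Range = std::pair<float, float>;

    // Interval sum for an eltwise Add of two bounded tensors.
    Range addRange(Range a, Range b) {
        return {a.first + b.first, a.second + b.second};
    }

    int main() {
        const Range in{-10.f, 10.f};              // range of fake1/fake2/fake3
        const Range add1 = addRange(in, in);      // [-20, 20] -> 2 * maxInput
        const Range add3 = addRange(add1, add1);  // [-40, 40] -> 4 * maxInput
        const Range add4 = addRange(in, add3);    // [-50, 50] -> 5 * maxInput
        std::cout << add4.first << " .. " << add4.second << "\n";
    }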