[GNA] Remove extra FQ layers from the final network (#10599)
* [GNA] Fuse all FakeQuantize layers with their previous layers
* [GNA] Fuse FQ with previous layer if it's not required for precision change
* [GNA] Fixed MatMulOverloadCorrectionTest
This commit is contained in:
parent: 79e3272237
commit: 3578ee9c3f
@ -229,7 +229,7 @@ void make_gna_pwl(const DnnActivation& fun,
|
||||
int32_t x_upper = INT32_MAX;
|
||||
int16_t y_lower = y_min;
|
||||
int16_t y_upper = y_max;
|
||||
if (fun == kActFakeQuantize && fun.fqParams.set) {
|
||||
if ((fun == kActFakeQuantize || fun == kActIdentity) && fun.fqParams.set) {
|
||||
x_lower = std::max(static_cast<int64_t>(*fun.fqParams.input_low * in_scale), static_cast<int64_t>(x_lower));
|
||||
x_upper = std::min(static_cast<int64_t>(*fun.fqParams.input_high * in_scale), static_cast<int64_t>(x_upper));
|
||||
y_lower = std::max(static_cast<int32_t>(*fun.fqParams.input_low * out_scale), static_cast<int32_t>(y_lower));
|
||||
@ -253,7 +253,7 @@ void make_gna_pwl(const DnnActivation& fun,
|
||||
x_upper = FLOAT_TO_INT32(y_upper * in_scale / out_scale);
|
||||
}
|
||||
}
|
||||
} else if (fun == kActIdentity) {
|
||||
} else if (fun == kActIdentity && !fun.fqParams.set) {
|
||||
if (x_lower < y_lower * in_scale / out_scale) x_lower = FLOAT_TO_INT32(y_lower * in_scale / out_scale);
|
||||
if (x_upper > y_upper * in_scale / out_scale) x_upper = FLOAT_TO_INT32(y_upper * in_scale / out_scale);
|
||||
if (y_lower < x_lower * out_scale / in_scale) y_lower = FLOAT_TO_INT16(x_lower * out_scale / in_scale);
|
||||
|
@ -538,7 +538,8 @@ class ScaleFactorPerLayer<InferenceEngine::CNNLayer*, QUANT_DESC> {
|
||||
auto maxOutValue = quantizedParams->_dst_quant.GetMaxValues().front();
|
||||
auto absMax = std::max(std::abs(minOutValue), std::abs(maxOutValue));
|
||||
|
||||
result = CalculateScaleFactorFromStats(quantizedParams->_dst_quant.GetLevels(), minOutValue, maxOutValue);
|
||||
auto levels = std::min(quantizedParams->_dst_quant.GetLevels(), static_cast<size_t>(std::numeric_limits<uint16_t>::max()) + 1);
|
||||
result = CalculateScaleFactorFromStats(levels, minOutValue, maxOutValue);
|
||||
if (std::isinf(result) || fp32eq(absMax, 0.0f)) {
|
||||
result = max_activation_scale_factor;
|
||||
}
|
||||
|
@ -74,7 +74,8 @@ static const char softSignLayersCounter[] = "numSoftSignLayers";
|
||||
static void insertDiagonalLayerBetween(InferenceEngine::CNNLayerPtr prevLayer,
|
||||
InferenceEngine::CNNLayerPtr nextLayer,
|
||||
std::shared_ptr<IPassManager> passmanager,
|
||||
float fillValue) {
|
||||
float fillValue,
|
||||
size_t in_data_idx = invalid_data_idx) {
|
||||
auto quantized = InferenceEngine::getInjectedData<QuantizedLayerParams>(prevLayer);
|
||||
auto diagName = std::string("SyntheticScaleShift_") + std::to_string(passmanager->getIntVar(diagonalLayersCounterName)++);
|
||||
gnalog() << "Inserted Diagonal Layer " << diagName <<" between: " << prevLayer->name << " and " << nextLayer->name << "\n" << std::flush;
|
||||
@ -104,7 +105,7 @@ static void insertDiagonalLayerBetween(InferenceEngine::CNNLayerPtr prevLayer,
|
||||
getCreatorLayer(dataPtr) = diagonalWithQuant;
|
||||
diagonalWithQuant->outData.push_back(dataPtr);
|
||||
// actual insertion
|
||||
CNNNetworkInsertLayer(prevLayer, nextLayer, diagonalWithQuant);
|
||||
CNNNetworkInsertLayer(prevLayer, nextLayer, diagonalWithQuant, invalid_data_idx, in_data_idx);
|
||||
}
|
||||
|
||||
/**
|
||||
@ -137,7 +138,8 @@ static bool hasNextFuncLayer(const CNNLayerPtr layer) {
|
||||
});
|
||||
}
|
||||
|
||||
static std::vector<CNNLayerPtr> getCandidatesForIdentityInsertion(const CNNLayerPtr l, std::shared_ptr<IPassManager> passmanager) {
|
||||
static std::vector<CNNLayerPtr> getCandidatesForIdentityInsertion(const CNNLayerPtr l, std::shared_ptr<IPassManager> passmanager,
|
||||
bool skipFq = false) {
|
||||
std::vector<CNNLayerPtr> prevLayers;
|
||||
|
||||
// skipping memory inputs and true inputs layers
|
||||
@ -146,9 +148,9 @@ static std::vector<CNNLayerPtr> getCandidatesForIdentityInsertion(const CNNLayer
|
||||
auto eltwise = dynamic_cast<InferenceEngine::EltwiseLayer *>(l.get());
|
||||
auto concat = dynamic_cast<InferenceEngine::ConcatLayer *>(l.get());
|
||||
|
||||
auto PrevFunctionalLayer = [](CNNLayerPtr l, int idx = 0) {
|
||||
auto prevLayer = CNNNetPrevLayerSkipCertain(l, idx, [](CNNLayerPtr ptr) {
|
||||
return LayerInfo(ptr).isNonFunctional();
|
||||
auto PrevFunctionalLayer = [skipFq](CNNLayerPtr l, int idx = 0) {
|
||||
auto prevLayer = CNNNetPrevLayerSkipCertain(l, idx, [skipFq](CNNLayerPtr ptr) {
|
||||
return LayerInfo(ptr).isNonFunctional() || skipFq && LayerInfo(ptr).isFakeQuantize();
|
||||
});
|
||||
gnalog() << "CNNNetPrevLayerSkipCertain for :: " << l->name << "returned: " << prevLayer->name << std::endl;
|
||||
return prevLayer;
|
||||
@ -308,7 +310,7 @@ void InsertDiagonalLayerPass::run() {
|
||||
continue;
|
||||
}
|
||||
auto prevDirectLayer = CNNNetPrevLayer(l, 0);
|
||||
insertDiagonalLayerBetween(prevDirectLayer, l, getPassManager(), 1.f);
|
||||
insertDiagonalLayerBetween(prevDirectLayer, l, getPassManager(), 1.f, 0);
|
||||
}
|
||||
}
|
||||
|
||||
@ -820,18 +822,6 @@ void InsertIdentityLayerPass::run() {
|
||||
|
||||
gnalog() << "Inserted "<< identityLayer->name << " between: " << prev->name << " and " << true_layer->name << "\n" << std::flush;
|
||||
|
||||
// wether 1 identity or all outputs TODO possible grouping here, need to implement special grouped inserter
|
||||
bool notAll = false;
|
||||
for (auto && nextData : prev->outData) {
|
||||
for (auto && nextLayer : getInputTo(nextData)) {
|
||||
if (nextLayer.second.get() == l.get())
|
||||
continue;
|
||||
if (getCandidatesForIdentityInsertion(nextLayer.second, getPassManager()).empty() &&
|
||||
hasNextFuncLayer(nextLayer.second)) {
|
||||
notAll = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
// copy offset - to be used while connecting outputs
|
||||
if (prev->params.find("output_offset") != prev->params.end()) {
|
||||
identityLayer->params["output_offset"] = prev->params["output_offset"];
|
||||
@ -841,7 +831,7 @@ void InsertIdentityLayerPass::run() {
|
||||
identityLayer->params["original_num_rows"] = prev->params["original_num_rows"];
|
||||
}
|
||||
|
||||
CNNNetworkInsertLayer(prev, notAll ? true_layer : CNNLayerPtr(nullptr), identityLayer);
|
||||
CNNNetworkInsertLayer(prev, CNNLayerPtr(nullptr), identityLayer);
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -1341,6 +1331,12 @@ void InsertSplitAligningFilterPass::run() {
|
||||
if (getInputTo(splitOutput).empty()) {
|
||||
gnalog() << "Output port: " << splitOutIndex << " of " << l->name << " unconnected, skipping\n";
|
||||
} else {
|
||||
auto lastDimSize = GetDataDimSize(splitOutput, 1);
|
||||
if (lastDimSize != outputSize) {
|
||||
THROW_GNA_EXCEPTION << l->name << " Convolution Filter doesn't support these input dimensions: lastDimSize="
|
||||
<< lastDimSize << ", outputSize=" << outputSize;
|
||||
}
|
||||
|
||||
// this split output not beginning from 64 bytes aligned boundary - need to correct by aligning filter layer
|
||||
// insert the filter
|
||||
auto filterName = std::string("AlignFilter_") + std::to_string(numOfFilterLayers++);
|
||||
@ -2054,32 +2050,25 @@ void MoveFakeQuantizeLayerIntoQuantParamsPass :: run() {
|
||||
return false;
|
||||
};
|
||||
|
||||
auto allowFQFuse = [](CNNLayerPtr layer) -> bool {
|
||||
auto doNotSkip = [](CNNLayerPtr layer) {
|
||||
return false;
|
||||
auto allowFQFuse = [this](CNNLayerPtr layer) -> bool {
|
||||
auto skipNonFunctionalOrMemory = [](CNNLayerPtr layer) {
|
||||
return LayerInfo(layer).isNonFunctional() || LayerInfo(layer).isMemory();
|
||||
};
|
||||
|
||||
if (CNNNetGetAllNextLayersSkipCertain(layer, -1, doNotSkip).empty()) {
|
||||
return false;
|
||||
}
|
||||
|
||||
auto skipNonFunctional = [](CNNLayerPtr layer) {
|
||||
return LayerInfo(layer).isNonFunctional();
|
||||
};
|
||||
|
||||
auto prevLayer = CNNNetPrevLayerSkipCertain(layer, 0, skipNonFunctional);
|
||||
if (LayerInfo(prevLayer).isActivation() || LayerInfo(prevLayer).isConst() || LayerInfo(prevLayer).isMemory()) {
|
||||
return true;
|
||||
// Don't fuse FQ if it's the output layer for the network
|
||||
if (CNNNetGetAllNextLayersSkipCertain(layer, -1, skipNonFunctionalOrMemory).empty()) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// Fuse FQ if it's not required to change precision from int32 to int16
|
||||
auto nextLayers = CNNNetGetAllNextLayersSkipCertain(layer, -1, skipNonFunctional);
|
||||
for (auto& l : nextLayers) {
|
||||
if (!LayerInfo(l).isActivation()) {
|
||||
return false;
|
||||
if (getCandidatesForIdentityInsertion(l, getPassManager(), true).empty()) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
return false;
|
||||
};
|
||||
|
||||
std::function<void(QuantizedLayerParams*, CNNLayerPtr)> propagateStatistics =
|
||||
@ -2213,8 +2202,7 @@ void MoveFakeQuantizeLayerIntoQuantParamsPass :: run() {
|
||||
}
|
||||
|
||||
// Allow FQ Fuse checks if FQ layer can be fused to a layer before or after.
|
||||
// FQ Layer is fused only when previous layer is const, memory or activation layer
|
||||
// or a next layer is activation layer.
|
||||
// FQ Layer is fused if it's not required for precision change.
|
||||
bool isFQFuseAllowed = allowFQFuse(l);
|
||||
auto prevData = *prevDataIt;
|
||||
|
||||
|
@ -93,7 +93,7 @@ protected:
|
||||
const ngraph::Shape shape = {1, 128};
|
||||
auto params = ngraph::builder::makeParams(ngPrc, {shape});
|
||||
|
||||
auto lowNodeIn = ngraph::builder::makeConstant<float>(ngPrc, {1}, { 100 * inputDataMin });
|
||||
auto lowNodeIn = ngraph::builder::makeConstant<float>(ngPrc, {1}, { 100 * -inputDataMax });
|
||||
auto highNodeIn = ngraph::builder::makeConstant<float>(ngPrc, {1}, { 100 * inputDataMax });
|
||||
auto fqIn = std::make_shared<ngraph::opset8::FakeQuantize>(params[0], lowNodeIn, highNodeIn,
|
||||
lowNodeIn, highNodeIn, levels16);
|
||||
|
@ -67,6 +67,8 @@ protected:
|
||||
const ngraph::Shape shape2 = {1, inputShape[1] * inputShape[1]};
|
||||
const float maxInputValue = 10.0f;
|
||||
auto params = ngraph::builder::makeParams(ngPrc, {shape1});
|
||||
auto relu = std::make_shared<ngraph::opset8::Relu>(params[0]);
|
||||
|
||||
std::shared_ptr<ngraph::Node> input2;
|
||||
if (isSecondInputConst) {
|
||||
input2 = ngraph::builder::makeConstant<float>(ngPrc, ngraph::Shape{shape1[1], shape1[1]},
|
||||
@ -78,7 +80,7 @@ protected:
|
||||
|
||||
auto lowNodeIn1 = ngraph::builder::makeConstant<float>(ngPrc, {1}, { -maxInputValue });
|
||||
auto highNodeIn1 = ngraph::builder::makeConstant<float>(ngPrc, {1}, { maxInputValue });
|
||||
auto fqIn1 = std::make_shared<ngraph::opset8::FakeQuantize>(params[0], lowNodeIn1, highNodeIn1,
|
||||
auto fqIn1 = std::make_shared<ngraph::opset8::FakeQuantize>(relu, lowNodeIn1, highNodeIn1,
|
||||
lowNodeIn1, highNodeIn1, levels16);
|
||||
|
||||
auto lowNodeIn2 = ngraph::builder::makeConstant<float>(ngPrc, {1}, { -maxInputValue });
|
||||
@ -138,4 +140,4 @@ INSTANTIATE_TEST_SUITE_P(smoke_base, MatMulOverloadCorrectionTest,
|
||||
::testing::ValuesIn({true, false}),
|
||||
::testing::ValuesIn({true, false})),
|
||||
MatMulOverloadCorrectionTest::getTestCaseName);
|
||||
} // namespace LayerTestsDefinitions
|
||||
} // namespace LayerTestsDefinitions
|
||||
|
@ -56,22 +56,23 @@ protected:
|
||||
|
||||
auto ngPrc = FuncTestUtils::PrecisionUtils::convertIE2nGraphPrc(netPrecision);
|
||||
auto params = ngraph::builder::makeParams(ngPrc, {inputShape});
|
||||
auto relu = std::make_shared<ngraph::opset8::Relu>(params[0]);
|
||||
auto fq1 = std::make_shared<ngraph::opset8::FakeQuantize>(
|
||||
params[0],
|
||||
ngraph::opset8::Constant::create(ngraph::element::f32, {1}, {1.}),
|
||||
ngraph::opset8::Constant::create(ngraph::element::f32, {1}, {1.}),
|
||||
ngraph::opset8::Constant::create(ngraph::element::f32, {1}, {1.}),
|
||||
ngraph::opset8::Constant::create(ngraph::element::f32, {1}, {1.}),
|
||||
255);
|
||||
relu,
|
||||
ngraph::opset8::Constant::create(ngraph::element::f32, {1}, {-10.}),
|
||||
ngraph::opset8::Constant::create(ngraph::element::f32, {1}, {10.}),
|
||||
ngraph::opset8::Constant::create(ngraph::element::f32, {1}, {-10.}),
|
||||
ngraph::opset8::Constant::create(ngraph::element::f32, {1}, {10.}),
|
||||
static_cast<uint32_t>(std::numeric_limits<uint16_t>::max()) + 1);
|
||||
auto constant = ngraph::builder::makeConstant(ngPrc, constantShape, std::vector<float>{}, true);
|
||||
auto fq2 = std::make_shared<ngraph::opset8::FakeQuantize>(
|
||||
constant,
|
||||
ngraph::opset8::Constant::create(ngraph::element::f32, {1}, {1}),
|
||||
ngraph::opset8::Constant::create(ngraph::element::f32, {1}, {1.}),
|
||||
ngraph::opset8::Constant::create(ngraph::element::f32, {1}, {1.}),
|
||||
ngraph::opset8::Constant::create(ngraph::element::f32, {1}, {1.}),
|
||||
255);
|
||||
auto concat = ngraph::builder::makeConcat({fq1, fq2}, 0);
|
||||
ngraph::opset8::Constant::create(ngraph::element::f32, {1}, {-10}),
|
||||
ngraph::opset8::Constant::create(ngraph::element::f32, {1}, {10.}),
|
||||
ngraph::opset8::Constant::create(ngraph::element::f32, {1}, {-10.}),
|
||||
ngraph::opset8::Constant::create(ngraph::element::f32, {1}, {10.}),
|
||||
static_cast<uint32_t>(std::numeric_limits<uint16_t>::max()) + 1);
|
||||
auto concat = ngraph::builder::makeConcat({fq1, fq2}, 1);
|
||||
function = std::make_shared<ngraph::Function>(concat, params, "WeighableLayerWithoutFq");
|
||||
}
|
||||
}; // class WeighableLayerWithoutFqTest
|
||||
@ -91,7 +92,7 @@ const std::vector<std::vector<size_t>> inputShapes = {
|
||||
};
|
||||
|
||||
const std::vector<std::vector<size_t>> constantShapes = {
|
||||
{{16, 5}}
|
||||
{{1, 16}}
|
||||
};
|
||||
|
||||
const std::vector<std::map<std::string, std::string>> configs = {
|
||||
|
@ -27,15 +27,33 @@ void MultipleInputTest::SetUp() {
|
||||
std::tie(targetDevice, netPrecision, inputSize, config) = this->GetParam();
|
||||
configuration.insert(config.begin(), config.end());
|
||||
auto ngPrc = FuncTestUtils::PrecisionUtils::convertIE2nGraphPrc(netPrecision);
|
||||
|
||||
const float minInput = -10.0;
|
||||
const float maxInput = 10.0;
|
||||
auto input = ngraph::builder::makeParams(ngPrc, {{1, inputSize}, {1, inputSize}, {1, inputSize}});
|
||||
auto fake1 = ngraph::builder::makeFakeQuantize(input[0], ngPrc, 255, { 1 }, { -0.5 }, { 0.5 }, { -0.5 }, { 0.5 });
|
||||
auto mul1 = ngraph::builder::makeEltwise(input[0], fake1, ngraph::helpers::EltwiseTypes::ADD);
|
||||
auto fake2 = ngraph::builder::makeFakeQuantize(input[1], ngPrc, 255, { 1 }, { -0.5 }, { 0.5 }, { -0.5 }, { 0.5 });
|
||||
auto mul2 = ngraph::builder::makeEltwise(input[1], fake2, ngraph::helpers::EltwiseTypes::ADD);
|
||||
auto mul3 = ngraph::builder::makeEltwise(mul1, mul2, ngraph::helpers::EltwiseTypes::ADD);
|
||||
auto fake3 = ngraph::builder::makeFakeQuantize(input[2], ngPrc, 255, { 1 }, { -0.5 }, { 0.5 }, { -0.5 }, { 0.5 });
|
||||
auto mul4 = ngraph::builder::makeEltwise(fake3, mul3, ngraph::helpers::EltwiseTypes::ADD);
|
||||
auto result = std::make_shared<ngraph::opset7::Result>(mul4);
|
||||
auto fake1 = ngraph::builder::makeFakeQuantize(input[0], ngPrc, std::numeric_limits<uint16_t>::max(), { 1 },
|
||||
{ minInput }, { maxInput }, { minInput }, { maxInput });
|
||||
auto add1 = ngraph::builder::makeEltwise(input[0], fake1, ngraph::helpers::EltwiseTypes::ADD);
|
||||
auto fake_add1 = ngraph::builder::makeFakeQuantize(add1, ngPrc, std::numeric_limits<uint16_t>::max(), { 1 },
|
||||
{ 2 * minInput }, { 2 * maxInput }, { 2 * minInput }, { 2 * maxInput });
|
||||
|
||||
auto fake2 = ngraph::builder::makeFakeQuantize(input[1], ngPrc, std::numeric_limits<uint16_t>::max(), { 1 },
|
||||
{ minInput }, { maxInput }, { minInput }, { maxInput });
|
||||
auto add2 = ngraph::builder::makeEltwise(input[1], fake2, ngraph::helpers::EltwiseTypes::ADD);
|
||||
auto fake_add2 = ngraph::builder::makeFakeQuantize(add2, ngPrc, std::numeric_limits<uint16_t>::max(), { 1 },
|
||||
{ 2 * minInput }, { 2 * maxInput }, { 2 * minInput }, { 2 * maxInput });
|
||||
|
||||
auto add3 = ngraph::builder::makeEltwise(fake_add1, fake_add2, ngraph::helpers::EltwiseTypes::ADD);
|
||||
auto fake_add3 = ngraph::builder::makeFakeQuantize(add3, ngPrc, std::numeric_limits<uint16_t>::max(), { 1 },
|
||||
{ 4 * minInput }, { 4 * maxInput }, { 4 * minInput }, { 4 * maxInput });
|
||||
|
||||
auto fake3 = ngraph::builder::makeFakeQuantize(input[2], ngPrc, std::numeric_limits<uint16_t>::max(), { 1 },
|
||||
{ minInput }, { maxInput }, { minInput }, { maxInput });
|
||||
auto add4 = ngraph::builder::makeEltwise(fake3, fake_add3, ngraph::helpers::EltwiseTypes::ADD);
|
||||
auto fake_add4 = ngraph::builder::makeFakeQuantize(add4, ngPrc, std::numeric_limits<uint16_t>::max(), { 1 },
|
||||
{ 5 * minInput }, { 5 * maxInput }, { 5 * minInput }, { 5 * maxInput });
|
||||
|
||||
auto result = std::make_shared<ngraph::opset7::Result>(fake_add4);
|
||||
function = std::make_shared<ngraph::Function>(ngraph::ResultVector{result}, input, "multiple_input");
|
||||
}
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user