[IE CLDNN] Improve network outputs detection in quantized FP16+INT8 IR to avoid converting them to FP16 precision (#3407)

Author: Jedrzej Hajduczenia, 2020-12-01 11:27:51 +01:00 (committed by GitHub)
parent c902eb5df7
commit 4a91f914e2


@@ -405,6 +405,8 @@ Program::Program(InferenceEngine::CNNNetwork& network, std::shared_ptr<const cld
         }
     }
 
+    OutputsDataMap outputsMap = network.getOutputsInfo();
+
     // [WA part2] Try to find non-quantized layers and convert them back to FP16
     if (config.enableInt8) {
         if (fqFound && baselineIsFP16 && config.enable_fp16_for_quantized_models) {
@@ -417,14 +419,42 @@ Program::Program(InferenceEngine::CNNNetwork& network, std::shared_ptr<const cld
                 if (layer->outData.empty() || layer->insData.empty())
                     continue;
 
-                auto canReduceOutputPrecision = [](const CNNLayerPtr& l) -> bool {
-                    auto type = LayerTypeFromStr(l->type);
-                    // Don't do conversion for outputs
-                    auto next = GetNextLayers(l);
-                    if (next.empty()) {
-                        return false;
-                    }
-
+                auto isOutputLayer = [](const CNNLayerPtr& l, const OutputsDataMap& networkOutputs) -> bool {
+                    bool is_output = false;
+
+                    if (GetNextLayers(l).empty())
+                        is_output = true;
+
+                    // Condition above is not enough, as network output layer
+                    // can still be used in other parts of the graph
+                    // (e.g. 1st output from TopK primitive may become network output
+                    // while 2nd output from the same primitive may still be used
+                    // in the graph).
+                    if (!is_output) {
+                        for (auto layerOutput : l->outData) {
+                            for (auto networkOutput : networkOutputs) {
+                                if (layerOutput->getName() == networkOutput.second->getName()) {
+                                    is_output = true;
+                                    break;
+                                }
+                            }
+
+                            if (is_output)
+                                break;
+                        }
+                    }
+
+                    return is_output;
+                };
+
+                auto canReduceOutputPrecision = [](const CNNLayerPtr& l, const bool isNetworkOutput) -> bool {
+                    // Don't do the conversion for network outputs
+                    if (isNetworkOutput)
+                        return false;
+
+                    auto type = LayerTypeFromStr(l->type);
+                    auto next = GetNextLayers(l);
+
                     if (type == LayerType::ScaleShift) {
                         // ScaleShift is supposed to return Dequantized values, so in most of the cases we can convert its output to FP16
                         // The exception is when the next node is Eltwise, so LPT keeps modified ScaleShift node on one of the branches
@@ -462,9 +492,11 @@ Program::Program(InferenceEngine::CNNNetwork& network, std::shared_ptr<const cld
                     return result;
                 };
 
+                bool is_network_output = isOutputLayer(layer, outputsMap);
+
                 if (canReducePrecision(layer)) {
-                    convertLayerPrecision<Precision::FP32, Precision::FP16>(layer, GetNextLayers(layer).empty());
-                } else if (canReduceOutputPrecision(layer)) {
+                    convertLayerPrecision<Precision::FP32, Precision::FP16>(layer, is_network_output);
+                } else if (canReduceOutputPrecision(layer, is_network_output)) {
                     for (auto &out_data : layer->outData) {
                         if (out_data->getPrecision() == Precision::FP32)
                             out_data->setPrecision(Precision::FP16);
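
For readers skimming the change: the rule the diff introduces is that a layer counts as a network output either when it has no consumers at all, or when any one of its output blobs is registered in the network's output map (so a TopK whose first output is a network output while its second output still feeds the graph is handled correctly). Below is a minimal, self-contained sketch of that rule; Data, Layer, and the name-based matching are illustrative stand-ins, not the InferenceEngine CNNLayerPtr/OutputsDataMap API used in the diff.

// Sketch of the output-detection rule added by this commit.
// Data/Layer are simplified stand-ins, not the InferenceEngine classes.
#include <map>
#include <memory>
#include <string>
#include <vector>

struct Data {
    std::string name;
    std::vector<std::string> consumers;   // names of layers reading this blob
};
using DataPtr = std::shared_ptr<Data>;

struct Layer {
    std::string name;
    std::vector<DataPtr> outData;
};

// networkOutputs plays the role of getOutputsInfo(): output name -> Data.
bool isNetworkOutput(const Layer& layer,
                     const std::map<std::string, DataPtr>& networkOutputs) {
    // Rule 1: no downstream consumers at all -> clearly a network output.
    bool hasConsumers = false;
    for (const auto& out : layer.outData)
        if (!out->consumers.empty())
            hasConsumers = true;
    if (!hasConsumers)
        return true;

    // Rule 2: even with consumers, any single output blob listed in the
    // network's output map makes the layer a network output
    // (covers the multi-output TopK case from the diff's comment).
    for (const auto& out : layer.outData)
        for (const auto& netOut : networkOutputs)
            if (out->name == netOut.second->name)
                return true;

    return false;
}

Gating the FP32-to-FP16 reduction on this check matches the commit message: blobs the user reads back as network outputs keep their original precision, while the rest of the quantized FP16+INT8 graph is still converted to FP16.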