[IE CLDNN] Fallback to FP16 for non-quantized layers in quantized FP16+INT8 IR (#941)
This commit is contained in:
parent 37a7bbc08c
commit b976782c70
@@ -59,9 +59,12 @@ DECLARE_CLDNN_CONFIG_KEY(GRAPH_DUMPS_DIR);
DECLARE_CLDNN_CONFIG_KEY(SOURCES_DUMPS_DIR);

/**
 * @brief This key turns usage of int8 optimizations and quantized models on.
 * @brief This key enables FP16 precision for quantized models.
 * By default, the model is converted to FP32 precision before running LPT. If this key is enabled (default), then non-quantized layers
 * will be converted back to FP16 after LPT, which might improve the performance if a model has a lot of compute operations in
 * the non-quantized path. This key has no effect if the current device doesn't have INT8 optimization capabilities.
 */
DECLARE_CLDNN_CONFIG_KEY(INT8_ENABLED);
DECLARE_CLDNN_CONFIG_KEY(ENABLE_FP16_FOR_QUANTIZED_MODELS);

/**
 * @brief This key should be set to correctly handle NV12 input without pre-processing.
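For context, a minimal usage sketch of the new key (not part of this commit), assuming the classic InferenceEngine Core API of this release; the header path, model file name, and "GPU" device string are illustrative assumptions:

#include <ie_core.hpp>
#include <ie_plugin_config.hpp>
#include <cldnn/cldnn_config.hpp>  // assumed public header that defines the CLDNNConfigParams keys

int main() {
    using namespace InferenceEngine;
    Core core;
    CNNNetwork network = core.ReadNetwork("quantized_fp16_int8_model.xml");  // hypothetical quantized FP16+INT8 IR
    // The FP16 fallback is ON by default; set the key to NO to keep the whole non-quantized path in FP32.
    ExecutableNetwork exeNetwork = core.LoadNetwork(network, "GPU",
        {{CLDNNConfigParams::KEY_CLDNN_ENABLE_FP16_FOR_QUANTIZED_MODELS, PluginConfigParams::NO}});
    return 0;
}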
@@ -189,6 +189,14 @@ void Config::UpdateFromMap(const std::map<std::string, std::string>& configMap)
            } else {
                THROW_IE_EXCEPTION << NOT_FOUND_str << "Unsupported NV12 flag value: " << val;
            }
        } else if (key.compare(CLDNNConfigParams::KEY_CLDNN_ENABLE_FP16_FOR_QUANTIZED_MODELS) == 0) {
            if (val.compare(PluginConfigParams::YES) == 0) {
                enable_fp16_for_quantized_models = true;
            } else if (val.compare(PluginConfigParams::NO) == 0) {
                enable_fp16_for_quantized_models = false;
            } else {
                THROW_IE_EXCEPTION << NOT_FOUND_str << "Unsupported KEY_CLDNN_ENABLE_FP16_FOR_QUANTIZED_MODELS flag value: " << val;
            }
        } else {
            THROW_IE_EXCEPTION << NOT_FOUND_str << "Unsupported property key by plugin: " << key;
        }
@@ -228,6 +236,11 @@ void Config::adjustKeyMapValues() {
    else
        key_config_map[CLDNNConfigParams::KEY_CLDNN_NV12_TWO_INPUTS] = PluginConfigParams::NO;

    if (enable_fp16_for_quantized_models)
        key_config_map[CLDNNConfigParams::KEY_CLDNN_ENABLE_FP16_FOR_QUANTIZED_MODELS] = PluginConfigParams::YES;
    else
        key_config_map[CLDNNConfigParams::KEY_CLDNN_ENABLE_FP16_FOR_QUANTIZED_MODELS] = PluginConfigParams::NO;

    {
        std::string qp = "0";
        switch (queuePriority) {
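A quick way to observe the value published through key_config_map is the standard config query; this is a sketch under the assumption that Core::GetConfig forwards to the plugin's key map and that the device is named "GPU":

#include <ie_core.hpp>
#include <cldnn/cldnn_config.hpp>  // assumed header for the CLDNNConfigParams keys
#include <iostream>
#include <string>

int main() {
    InferenceEngine::Core core;
    // Query the GPU plugin for the current value of the new key (expected "YES" by default).
    InferenceEngine::Parameter value =
        core.GetConfig("GPU", InferenceEngine::CLDNNConfigParams::KEY_CLDNN_ENABLE_FP16_FOR_QUANTIZED_MODELS);
    std::cout << value.as<std::string>() << std::endl;
    return 0;
}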
@@ -27,6 +27,7 @@ struct Config {
        enableDynamicBatch(false),
        enableInt8(true),
        nv12_two_inputs(false),
        enable_fp16_for_quantized_models(true),
        queuePriority(cldnn::priority_mode_types::disabled),
        queueThrottle(cldnn::throttle_mode_types::disabled),
        max_dynamic_batch(1),
@@ -49,6 +50,7 @@ struct Config {
    bool enableDynamicBatch;
    bool enableInt8;
    bool nv12_two_inputs;
    bool enable_fp16_for_quantized_models;
    cldnn::priority_mode_types queuePriority;
    cldnn::throttle_mode_types queueThrottle;
    int max_dynamic_batch;
@@ -78,6 +78,7 @@
#include <legacy/graph_tools.hpp>
#include <legacy/ie_layers_internal.hpp>
#include <legacy/net_pass.h>
#include <precision_utils.h>
#include "cldnn_infer_request.h"
#include <threading/ie_executor_manager.hpp>
#include "caseless.hpp"
@@ -98,6 +99,176 @@
using namespace InferenceEngine;
using namespace InferenceEngine::details;

namespace {

std::vector<CNNLayerPtr> BFSSort(const ICNNNetwork& network) {
    std::vector<CNNLayerPtr> ordered;
    std::unordered_set<std::string> used;

    OutputsDataMap outputs;
    network.getOutputsInfo(outputs);

    InputsDataMap inputs;
    network.getInputsInfo(inputs);

    auto get_consumers = [](const CNNLayerPtr& node) -> std::vector<CNNLayerPtr> {
        std::vector<CNNLayerPtr> consumers;
        for (const auto& output : node->outData) {
            for (const auto& consumer : getInputTo(output)) {
                consumers.push_back(consumer.second);
            }
        }
        return consumers;
    };
    auto bfs = [&used, &ordered, &get_consumers](const CNNLayerPtr& start_node, bool traverse_via_outputs = false) {
        if (!start_node) return;
        std::deque<CNNLayerPtr> q;
        q.push_front(start_node);
        while (!q.empty()) {
            auto node = q.front();
            q.pop_front();
            if (used.insert(node->name).second) {
                ordered.push_back(node);
            }

            // Traverse via inputs
            for (const auto& input : node->insData) {
                auto locked_input = input.lock();
                if (!locked_input) {
                    THROW_IE_EXCEPTION << "insData for " << node->name << " is not valid.";
                }
                if (auto next_node = getCreatorLayer(locked_input).lock()) {
                    if (!used.count(next_node->name)) {
                        // Check that all consumers were used
                        bool all_consumers_used(true);
                        for (const auto& consumer : get_consumers(next_node)) {
                            if (!used.count(consumer->name)) all_consumers_used = false;
                        }
                        if (all_consumers_used) {
                            q.push_front(next_node);
                        }
                    }
                }
            }

            // Traverse via outputs
            if (traverse_via_outputs) {
                for (const auto& consumer : get_consumers(node)) {
                    if (!used.count(consumer->name)) {
                        q.push_front(consumer);
                    }
                }
            }
        }
    };
    // First we run bfs starting from outputs, which provides a deterministic graph traversal
    for (const auto& output : outputs) {
        if (!used.count(output.first)) {
            bfs(getCreatorLayer(output.second).lock());
        }
    }

    // For cases when the graph has no outputs we start bfs from inputs to ensure a topological sort
    for (const auto& input : inputs) {
        const auto data_ptr = input.second->getInputData();
        for (const auto& consumer : getInputTo(data_ptr))
            if (!used.count(consumer.first)) {
                bfs(consumer.second, true);
            }
    }

    std::reverse(ordered.begin(), ordered.end());
    return ordered;
}
template <Precision::ePrecision PREC_FROM, Precision::ePrecision PREC_TO>
void convertArrayPrecision(typename PrecisionTrait<PREC_TO>::value_type* dst,
                           const typename PrecisionTrait<PREC_FROM>::value_type* src, size_t nelem) {
    using dst_type = typename PrecisionTrait<PREC_TO>::value_type;

    for (size_t i = 0; i < nelem; i++) {
        dst[i] = static_cast<dst_type>(src[i]);
    }
}

template <>
void convertArrayPrecision<Precision::FP16, Precision::FP32>(float* dst, const short* src, size_t nelem) {
    InferenceEngine::PrecisionUtils::f16tof32Arrays(dst, src, nelem, 1.0f, 0.0f);
}

template <>
void convertArrayPrecision<Precision::FP32, Precision::FP16>(short* dst, const float* src, size_t nelem) {
    InferenceEngine::PrecisionUtils::f32tof16Arrays(dst, src, nelem, 1.0f, 0.0f);
}

template <Precision::ePrecision PREC_FROM, Precision::ePrecision PREC_TO>
Blob::Ptr convertBlobPrecision(const Blob::Ptr& blob) {
    using from_d_type = typename PrecisionTrait<PREC_FROM>::value_type;
    using to_d_type = typename PrecisionTrait<PREC_TO>::value_type;

    auto tensor_desc = blob->getTensorDesc();
    Blob::Ptr new_blob = make_shared_blob<to_d_type>(TensorDesc {PREC_TO, tensor_desc.getDims(), tensor_desc.getLayout()});
    new_blob->allocate();
    auto target = new_blob->buffer().as<to_d_type*>();
    auto source = blob->buffer().as<from_d_type*>();
    convertArrayPrecision<PREC_FROM, PREC_TO>(target, source, blob->size());
    return new_blob;
}
template <Precision::ePrecision PREC_FROM, Precision::ePrecision PREC_TO>
void convertLayerPrecision(const CNNLayerPtr& layer, bool isOutput = false) {
    if (layer->type == "TensorIterator" && dynamic_cast<TensorIterator*>(layer.get()) != nullptr) {
        return;
    }

    using LayerType = CLDNNPlugin::Program::LayerType;

    if (!isOutput) {
        for (auto& out_data : layer->outData) {
            if (PREC_FROM == out_data->getPrecision())
                out_data->setPrecision(PREC_TO);
        }
    }

    for (auto& in_data : layer->insData) {
        auto data = in_data.lock();
        if (PREC_FROM == data->getPrecision())
            data->setPrecision(PREC_TO);

        auto prev_layer = getCreatorLayer(data).lock();

        if (CLDNNPlugin::Program::LayerTypeFromStr(prev_layer->type) == LayerType::ConstantBlob &&
            CLDNNPlugin::Program::LayerTypeFromStr(layer->type) != LayerType::Quantize) {
            convertLayerPrecision<Precision::FP32, Precision::FP16>(prev_layer, false);
        }
    }

    if (layer->precision == PREC_FROM)
        layer->precision = PREC_TO;

    auto wLayer = dynamic_cast<InferenceEngine::WeightableLayer*>(layer.get());
    if (wLayer) {
        if (wLayer->_weights && wLayer->_weights->getTensorDesc().getPrecision() == PREC_FROM) {
            wLayer->_weights = convertBlobPrecision<PREC_FROM, PREC_TO>(wLayer->_weights);
        }
        if (wLayer->_biases && wLayer->_biases->getTensorDesc().getPrecision() == PREC_FROM) {
            wLayer->_biases = convertBlobPrecision<PREC_FROM, PREC_TO>(wLayer->_biases);
        }
    }

    for (auto& blob : layer->blobs) {
        auto& data = blob.second;
        if (nullptr != data) {
            if (data->getTensorDesc().getPrecision() == PREC_FROM) {
                data = convertBlobPrecision<PREC_FROM, PREC_TO>(data);
            }
        }
    }
}

} // namespace

namespace CLDNNPlugin {

const cldnn::primitive_id Program::m_preProcessTag("_cldnn_input_preprocess");
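The BFSSort helper above emits nodes while walking backwards from the network outputs, deferring a producer until all of its consumers have been emitted, and then reverses the list to obtain a topological order. A small self-contained sketch of the same idea on a hypothetical four-node graph (standard C++ only, not part of this commit):

#include <algorithm>
#include <deque>
#include <iostream>
#include <map>
#include <set>
#include <string>
#include <vector>

int main() {
    // producer -> consumers for a tiny DAG: input feeds conv and a skip connection into eltwise
    std::map<std::string, std::vector<std::string>> consumers = {
        {"input", {"conv", "eltwise"}},
        {"conv", {"relu"}},
        {"relu", {"eltwise"}},
        {"eltwise", {}}
    };
    // build reverse edges: consumer -> producers
    std::map<std::string, std::vector<std::string>> producers;
    for (auto& p : consumers)
        for (auto& c : p.second)
            producers[c].push_back(p.first);

    std::vector<std::string> ordered;
    std::set<std::string> used;
    std::deque<std::string> q = {"eltwise"};  // start from the graph output
    while (!q.empty()) {
        auto node = q.front();
        q.pop_front();
        if (used.insert(node).second)
            ordered.push_back(node);
        for (auto& prev : producers[node]) {
            if (used.count(prev))
                continue;
            // enqueue a producer only when all of its consumers are already emitted
            bool all_consumers_used = true;
            for (auto& c : consumers[prev])
                if (!used.count(c)) all_consumers_used = false;
            if (all_consumers_used)
                q.push_front(prev);
        }
    }
    std::reverse(ordered.begin(), ordered.end());
    for (auto& n : ordered) std::cout << n << " ";  // prints: input conv relu eltwise
    std::cout << std::endl;
    return 0;
}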
@@ -242,29 +413,103 @@ Program::Program(InferenceEngine::ICNNNetwork& network, std::shared_ptr<const cl
        .add<FullyConnectedTransformation>(LayerTransformation::Params(params).setSupportAsymmetricQuantization(false), "FullyConnected")
        .add<GemmTransformation>(LayerTransformation::Params(params).setSupportAsymmetricQuantization(false), "GEMM");

    auto it = details::CNNNetworkIterator(&network);
    auto end = details::CNNNetworkIterator();
    bool fqFound = false;
    bool allFQareSupported = true;
    while (it != end) {
        if (CaselessEq<std::string>()((*it)->type, "FakeQuantize")) {
            fqFound = true;
            auto levels = (*it)->GetParamAsUInt("levels");
            if (levels != 255 && levels != 256) {
                allFQareSupported = false;
                break;
    bool baselineIsFP16 = false;
    {
        auto it = details::CNNNetworkIterator(&network);
        auto end = details::CNNNetworkIterator();
        while (it != end) {
            auto& layer = *it;
            if (layer->precision == Precision::FP16) {
                baselineIsFP16 = true;
            }

            if (CaselessEq<std::string>()(layer->type, "FakeQuantize")) {
                fqFound = true;
                auto levels = layer->GetParamAsUInt("levels");
                if (levels != 255 && levels != 256) {
                    allFQareSupported = false;
                }
            }
            it++;
        }
        it++;
    }

    // [WA] Convert quantized FP16 model to FP32 to avoid possible overflow and mixed precision errors
    // [WA part1] Convert quantized FP16 model to FP32 to avoid possible overflow and mixed precision errors
    if (fqFound && allFQareSupported) {
        NetPass::ConvertPrecision(network, Precision::FP16, Precision::FP32);
    }

    LowPrecisionTransformer transformer(transforms);
    transformer.transform(network);

    // [WA part2] Try to find non-quantized layers and convert them back to FP16
    if (fqFound && baselineIsFP16 && config.enable_fp16_for_quantized_models) {
        auto layersSorted = BFSSort(network);

        for (auto& layer : layersSorted) {
            if (layer == nullptr)
                continue;

            if (layer->outData.empty() || layer->insData.empty())
                continue;

            auto canReduceOutputPrecision = [](const CNNLayerPtr& l) -> bool {
                auto type = LayerTypeFromStr(l->type);
                // Don't do conversion for outputs
                auto next = GetNextLayers(l);
                if (next.empty()) {
                    return false;
                }
                if (type == LayerType::ScaleShift) {
                    // ScaleShift is supposed to return dequantized values, so in most cases we can convert its output to FP16.
                    // The exception is when the next node is Eltwise: LPT keeps a modified ScaleShift node on one of the branches,
                    // and this node doesn't do requantization, thus we have to keep the result in FP32 precision.
                    for (auto n : next) {
                        if (LayerTypeFromStr(n->type) == LayerType::Eltwise)
                            return false;
                    }
                    return true;
                }

                if (type == LayerType::Quantize) {
                    auto in = getCreatorLayer(l->insData[0].lock()).lock();
                    if (l->outData[0]->getPrecision() == Precision::FP32 && in->type != "Input")
                        return true;
                }

                return false;
            };

            auto canReducePrecision = [](const CNNLayerPtr& l) -> bool {
                auto layerType = LayerTypeFromStr(l->type);

                bool result = true;
                for (auto& in : l->insData) {
                    auto input = in.lock();
                    auto precision = input->getPrecision();
                    auto in_type = LayerTypeFromStr(getCreatorLayer(input).lock()->type);
                    if (precision != Precision::FP16 && in_type != LayerType::ConstantBlob) {
                        result = false;
                        break;
                    }
                }

                return result;
            };

            if (canReducePrecision(layer)) {
                convertLayerPrecision<Precision::FP32, Precision::FP16>(layer, GetNextLayers(layer).empty());
            } else if (canReduceOutputPrecision(layer)) {
                for (auto& out_data : layer->outData) {
                    if (out_data->getPrecision() == Precision::FP32)
                        out_data->setPrecision(Precision::FP16);
                }
            }
        }
    }
    }

    NetPass::CombineRNNSeq(network);
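The FP16 fallback above ultimately bottoms out in the convertLayerPrecision/convertBlobPrecision helpers, whose FP32/FP16 specializations call PrecisionUtils with identity scale and bias. A standalone round-trip sketch (not part of this commit), assuming only <precision_utils.h> from the Inference Engine headers:

#include <precision_utils.h>
#include <iostream>
#include <vector>

int main() {
    std::vector<float> src = {0.5f, -1.25f, 3.0f};
    std::vector<short> fp16(src.size());
    std::vector<float> back(src.size());

    // FP32 -> FP16 -> FP32, with the same identity scale/bias as used in the plugin code above
    InferenceEngine::PrecisionUtils::f32tof16Arrays(fp16.data(), src.data(), src.size(), 1.0f, 0.0f);
    InferenceEngine::PrecisionUtils::f16tof32Arrays(back.data(), fp16.data(), fp16.size(), 1.0f, 0.0f);

    for (size_t i = 0; i < src.size(); i++)
        std::cout << src[i] << " -> " << back[i] << std::endl;
    return 0;
}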
@@ -135,7 +135,7 @@ JitConstants EltwiseKernel_b_fs_yx_fsv16::GetJitConstants(const eltwise_params&
        jit.AddConstant(MakeJitConstant("INPUT_STRIDED", 1));
    }

    jit.Merge(MakeActivationJitConstants(params.activations, GetAccumulatorType(params), "_TYPED"));
    jit.Merge(MakeActivationJitConstants(params.activations, params.output.GetDType(), "_TYPED"));

    if (params.output.Feature().v % 16 != 0)
        jit.AddConstant(MakeJitConstant("LEFTOVERS", params.output.Feature().v % 16));