diff --git a/inference-engine/include/gna/gna_config.hpp b/inference-engine/include/gna/gna_config.hpp
index fd7dd701bdd..41e287832a9 100644
--- a/inference-engine/include/gna/gna_config.hpp
+++ b/inference-engine/include/gna/gna_config.hpp
@@ -92,6 +92,13 @@ DECLARE_GNA_CONFIG_KEY(COMPACT_MODE);
*/
DECLARE_GNA_CONFIG_KEY(PWL_UNIFORM_DESIGN);
+/**
+* @brief The option to specify the maximum error percent that the algorithm searching for the
+* optimal PWL functions is allowed to use.
+* By default (when no value is set), 1.0 is used.
+*/
+DECLARE_GNA_CONFIG_KEY(PWL_MAX_ERROR_PERCENT);
+
/**
* @brief By default, the GNA plugin uses one worker thread for inference computations.
* This parameter allows you to create up to 127 threads for software modes.
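
For reference, a minimal sketch (not part of the patch) of how an application could pass the new key through the standard Inference Engine configuration map; the model path and device mode below are placeholders, and the 0.5 value is an arbitrary example:

    #include <map>
    #include <string>
    #include <ie_core.hpp>
    #include <gna/gna_config.hpp>

    int main() {
        InferenceEngine::Core core;
        auto network = core.ReadNetwork("model.xml");  // placeholder model
        std::map<std::string, std::string> config = {
            { GNA_CONFIG_KEY(DEVICE_MODE), "GNA_SW_EXACT" },  // placeholder device mode
            // allow up to 0.5% error when searching for PWL segments
            { GNA_CONFIG_KEY(PWL_MAX_ERROR_PERCENT), std::to_string(0.5) }
        };
        auto executableNetwork = core.LoadNetwork(network, "GNA", config);
        return 0;
    }
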
diff --git a/inference-engine/samples/speech_sample/main.cpp b/inference-engine/samples/speech_sample/main.cpp
index 8f3b3b5cd15..e117ca79da9 100644
--- a/inference-engine/samples/speech_sample/main.cpp
+++ b/inference-engine/samples/speech_sample/main.cpp
@@ -519,6 +519,10 @@ bool ParseAndCheckCommandLine(int argc, char *argv[]) {
throw std::logic_error("Invalid value for 'cw_l' argument. It must be greater than or equal to 0");
}
+ if (FLAGS_pwl_me < 0.0 || FLAGS_pwl_me > 100.0) {
+ throw std::logic_error("Invalid value for 'pwl_me' argument. It must be greater than 0.0 and less than 100.0");
+ }
+
return true;
}
@@ -671,6 +675,7 @@ int main(int argc, char *argv[]) {
gnaPluginConfig[GNAConfigParams::KEY_GNA_LIB_N_THREADS] = std::to_string((FLAGS_cw_r > 0 || FLAGS_cw_l > 0) ? 1 : FLAGS_nthreads);
gnaPluginConfig[GNA_CONFIG_KEY(COMPACT_MODE)] = CONFIG_VALUE(NO);
+ gnaPluginConfig[GNA_CONFIG_KEY(PWL_MAX_ERROR_PERCENT)] = std::to_string(FLAGS_pwl_me);
// -----------------------------------------------------------------------------------------------------
// --------------------------- 5. Write model to file --------------------------------------------------
diff --git a/inference-engine/samples/speech_sample/speech_sample.hpp b/inference-engine/samples/speech_sample/speech_sample.hpp
index 5fc905ae689..1409d557d60 100644
--- a/inference-engine/samples/speech_sample/speech_sample.hpp
+++ b/inference-engine/samples/speech_sample/speech_sample.hpp
@@ -91,6 +91,10 @@ static const char input_layer_names_message[] = "Optional. Layer names for input
"The names are separated with \",\" " \
"Example: Input1,Input2 ";
+/// @brief message for PWL max error percent
+static const char pwl_max_error_percent_message[] = "Optional. The maximum percent of error for PWL function. " \
+ "The value must be in <0, 100> range. The default value is 1.0.";
+
/// \brief Define flag for showing help message
DEFINE_bool(h, false, help_message);
@@ -161,6 +165,9 @@ DEFINE_string(oname, "", output_layer_names_message);
/// @brief Input layer name
DEFINE_string(iname, "", input_layer_names_message);
+/// @brief PWL max error percent
+DEFINE_double(pwl_me, 1.0, pwl_max_error_percent_message);
+
/**
* \brief This function show a help message
*/
@@ -191,5 +198,6 @@ static void showUsage() {
std::cout << " -cw_r \"\" " << context_window_message_r << std::endl;
std::cout << " -oname \"\" " << output_layer_names_message << std::endl;
std::cout << " -iname \"\" " << input_layer_names_message << std::endl;
+ std::cout << " -pwl_me \"\" " << pwl_max_error_percent_message << std::endl;
}
diff --git a/inference-engine/src/gna_plugin/backend/am_intel_dnn.cpp b/inference-engine/src/gna_plugin/backend/am_intel_dnn.cpp
index 0f641e5473f..4a758649e94 100644
--- a/inference-engine/src/gna_plugin/backend/am_intel_dnn.cpp
+++ b/inference-engine/src/gna_plugin/backend/am_intel_dnn.cpp
@@ -1243,15 +1243,15 @@ void GNAPluginNS::backend::AMIntelDNN::WriteDnnText(const char *filename, intel_
break;
case kActFakeQuantize :
out_file << " " <<
- std::dec << component[i].op.pwl.func_id.args.fakeQuantize.levels << "\n";
+ std::dec << component[i].op.pwl.func_id.fqParams.levels << "\n";
out_file << " " <<
- std::setprecision(12) << std::scientific << component[i].op.pwl.func_id.args.fakeQuantize.input_low << "\n";
+ std::setprecision(12) << std::scientific << component[i].op.pwl.func_id.fqParams.input_low << "\n";
out_file << " " <<
- std::setprecision(12) << std::scientific << component[i].op.pwl.func_id.args.fakeQuantize.input_high << "\n";
+ std::setprecision(12) << std::scientific << component[i].op.pwl.func_id.fqParams.input_high << "\n";
out_file << " " <<
- std::setprecision(12) << std::scientific << component[i].op.pwl.func_id.args.fakeQuantize.output_low << "\n";
+ std::setprecision(12) << std::scientific << component[i].op.pwl.func_id.fqParams.output_low << "\n";
out_file << " " <<
- std::setprecision(12) << std::scientific << component[i].op.pwl.func_id.args.fakeQuantize.output_high << "\n";
+ std::setprecision(12) << std::scientific << component[i].op.pwl.func_id.fqParams.output_high << "\n";
break;
default:
break;
diff --git a/inference-engine/src/gna_plugin/backend/dnn_types.h b/inference-engine/src/gna_plugin/backend/dnn_types.h
index 341f890490e..ea0b5a1e399 100644
--- a/inference-engine/src/gna_plugin/backend/dnn_types.h
+++ b/inference-engine/src/gna_plugin/backend/dnn_types.h
@@ -34,9 +34,25 @@ enum DnnActivationType : uint8_t {
kActNumType
};
+struct FakeQuantizeParams {
+ int8_t set;
+ int32_t levels;
+ // if input is per-channel quantization - input pointers contain per-channel ranges
+ int8_t inputPerChannel;
+ float* input_low;
+ float* input_high;
+ // if output is per-channel quantization - output pointers contain per-channel ranges
+ int8_t outputPerChannel;
+ float* output_low;
+ float* output_high;
+};
+
struct DnnActivation {
// for prelu
DnnActivationType type;
+ FakeQuantizeParams fqParams;
+ FakeQuantizeParams srcFQParams;
+
union {
struct {
float negative_slope;
@@ -50,17 +66,6 @@ struct DnnActivation {
float low;
float high;
} clamp;
- struct {
- int32_t levels;
- // if input is per-channel quantization - input pointers contains per-channel ranges
- int8_t inputPerChannel;
- float *input_low;
- float *input_high;
- // if output is per-channel quantization - output pointers contains per-channel ranges
- int8_t outputPerChannel;
- float *output_low;
- float *output_high;
- } fakeQuantize;
} args;
operator DnnActivationType () const noexcept {
return type;
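
Since FakeQuantizeParams now lives outside the union, an activation can keep its own arguments (for example args.lrelu) and still carry fused FakeQuantize statistics. Below is a hedged sketch of how a conversion pass might fill such a descriptor; the include path, the kActLeakyRelu enumerator and all numeric values are assumptions for illustration only:

    #include "backend/dnn_types.h"  // assumed include path inside the GNA plugin sources

    static float in_low = -2.0f, in_high = 2.0f;    // example per-tensor input range
    static float out_low = -2.0f, out_high = 2.0f;  // example per-tensor output range

    DnnActivation makeLeakyReluWithStats() {
        DnnActivation act{};
        act.type = kActLeakyRelu;              // assumed enumerator from DnnActivationType
        act.args.lrelu.negative_slope = 0.01f; // activation-specific argument still available
        act.fqParams.set = 1;                  // statistics below are valid
        act.fqParams.levels = 65535;
        act.fqParams.inputPerChannel = 0;      // single range for the whole tensor
        act.fqParams.input_low = &in_low;
        act.fqParams.input_high = &in_high;
        act.fqParams.outputPerChannel = 0;
        act.fqParams.output_low = &out_low;
        act.fqParams.output_high = &out_high;
        return act;
    }
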
diff --git a/inference-engine/src/gna_plugin/backend/make_pwl.cpp b/inference-engine/src/gna_plugin/backend/make_pwl.cpp
index f7f34d33270..ddf73975b88 100644
--- a/inference-engine/src/gna_plugin/backend/make_pwl.cpp
+++ b/inference-engine/src/gna_plugin/backend/make_pwl.cpp
@@ -34,15 +34,20 @@ void make_gna_pwl(const DnnActivation fun,
gna_pwl[0].xBase = static_cast<int32_t> (INT32_MIN & XBASEMASK); // zero out the 2 lsb
if (fun == kActSigmoid) {
gnalog() << "=========================== Sigmoid Segments ===========================\n";
- gna_pwl[0].yBase = gna_pwl[1].yBase = 0;
+ auto minVal = fun.fqParams.set ? FLOAT_TO_INT16(*fun.fqParams.input_low * out_scale) : 0;
+ gna_pwl[0].yBase = gna_pwl[1].yBase = minVal;
gna_pwl[1].xBase = (static_cast<int32_t> (in_scale * (-pwl[0].b / pwl[0].m))) & XBASEMASK;
} else if (fun == kActTanh) {
gnalog() << "=========================== Tanh Segments ===========================\n";
- gna_pwl[0].yBase = gna_pwl[1].yBase = static_cast<int16_t>(-1.0 * out_scale);
+ auto minVal = fun.fqParams.set ? FLOAT_TO_INT16(*fun.fqParams.input_low * out_scale) :
+ static_cast<int16_t>(-1.0 * out_scale);
+ gna_pwl[0].yBase = gna_pwl[1].yBase = minVal;
gna_pwl[1].xBase = (static_cast<int32_t> (in_scale * (-1.0 - pwl[0].b) / pwl[0].m)) & XBASEMASK;
} else {
gnalog() << "=========================== SoftSign Segments ===========================\n";
- gna_pwl[0].yBase = gna_pwl[1].yBase = static_cast<int16_t>(-1.0 * out_scale);
+ auto minVal = fun.fqParams.set ? FLOAT_TO_INT16(*fun.fqParams.input_low * out_scale) :
+ static_cast<int16_t>(-1.0 * out_scale);
+ gna_pwl[0].yBase = gna_pwl[1].yBase = minVal;
gna_pwl[1].xBase = (static_cast<int32_t> (in_scale * (-1.0 - pwl[0].b) / pwl[0].m)) & XBASEMASK;
}
gna_pwl[0].slope = 0;
@@ -74,9 +79,10 @@ void make_gna_pwl(const DnnActivation fun,
<< "\n";
}
// insert extra segment for xvalues > u_bound
+ auto maxVal = fun.fqParams.set ? *fun.fqParams.input_high : 1.0;
gna_pwl[n_segments - 1].xBase =
((uint32_t) (in_scale * (1.0 - pwl[pwl_size - 2].b) / pwl[pwl_size - 2].m)) & XBASEMASK;
- gna_pwl[n_segments - 1].yBase = FLOAT_TO_INT16(1.0 * out_scale);
+ gna_pwl[n_segments - 1].yBase = FLOAT_TO_INT16(maxVal * out_scale);
gna_pwl[n_segments - 1].slope = 0;
gnalog() << (gna_pwl[n_segments - 1].xBase / in_scale)
@@ -223,9 +229,19 @@ void make_gna_pwl(const DnnActivation fun,
else
gnalog() << "=========================== LeakyReLU Segments ======================\n";
int32_t x_lower = INT32_MIN;
+ int32_t x_upper = INT32_MAX;
int16_t y_lower = INT16_MIN;
- if (x_lower < y_lower * in_scale / out_scale) x_lower = FLOAT_TO_INT32(y_lower * in_scale / out_scale);
- if (y_lower < x_lower * out_scale / in_scale) y_lower = FLOAT_TO_INT16(x_lower * out_scale / in_scale);
+ int16_t y_upper = INT16_MAX;
+ if (fun.fqParams.set) {
+ x_lower = FLOAT_TO_INT32(*fun.fqParams.input_low * 1.25 * in_scale);
+ x_upper = FLOAT_TO_INT32(*fun.fqParams.input_high * 1.25 * in_scale);
+ y_lower = FLOAT_TO_INT16(*fun.fqParams.input_low * 1.25 * out_scale);
+ y_upper = FLOAT_TO_INT16(*fun.fqParams.input_high * 1.25 * out_scale);
+ } else {
+ if (x_lower < y_lower * in_scale / out_scale) x_lower = FLOAT_TO_INT32(y_lower * in_scale / out_scale);
+ if (y_lower < x_lower * out_scale / in_scale) y_lower = FLOAT_TO_INT16(x_lower * out_scale / in_scale);
+ }
+
gna_pwl[0].yBase = y_lower * fun.args.lrelu.negative_slope;
s = gna_slope(fun.args.lrelu.negative_slope, in_scale, out_scale);
gna_pwl[0].xBase = (x_lower & XBASEMASK) | s.slope_scale_index; // zero out the 2 lsb
@@ -244,6 +260,18 @@ void make_gna_pwl(const DnnActivation fun,
<< " " << 0.0
<< " " << (gna_pwl[1].slope * in_scale) / (out_scale*s.slope_scale)
<< "\n";
+
+ if (fun.fqParams.set) { // need a right segment
+ gna_pwl.push_back({
static_cast<int32_t>(x_upper & XBASEMASK), // zero out the 2 lsb
+ y_upper,
+ 0 });
+
+ gnalog() << (x_upper & XBASEMASK) / in_scale
+ << " " << gna_pwl[n_segments].yBase / out_scale
+ << " " << 0
+ << "\n";
+ }
break;
}
case kActSign: {
@@ -281,11 +309,18 @@ void make_gna_pwl(const DnnActivation fun,
break;
}
case kActIdentity:
- case kActKaldiLstmClipping: {
+ case kActKaldiLstmClipping:
+ case kActFakeQuantize: {
int32_t x_lower = INT32_MIN;
int32_t x_upper = INT32_MAX;
int16_t y_lower = INT16_MIN;
int16_t y_upper = INT16_MAX;
+ if (fun == kActFakeQuantize && fun.fqParams.set) {
+ x_lower = *fun.fqParams.input_low * in_scale;
+ x_upper = *fun.fqParams.input_high * in_scale;
+ y_lower = *fun.fqParams.input_low * out_scale;
+ y_upper = *fun.fqParams.input_high * out_scale;
+ }
auto n_segments = 2;
if (fun == kActKaldiLstmClipping) {
gnalog() << "=========================== Clipping Segments ===========================\n";
@@ -311,6 +346,8 @@ void make_gna_pwl(const DnnActivation fun,
if (x_upper > y_upper * in_scale / out_scale) x_upper = FLOAT_TO_INT32(y_upper * in_scale / out_scale);
if (y_lower < x_lower * out_scale / in_scale) y_lower = FLOAT_TO_INT16(x_lower * out_scale / in_scale);
if (y_upper > x_upper * out_scale / in_scale) y_upper = FLOAT_TO_INT16(x_upper * out_scale / in_scale);
+ } else if (fun == kActFakeQuantize) {
+ gnalog() << "=========================== Fake Quantize Segments ===========================\n";
}
gna_pwl.resize(n_segments);
gna_pwl[0].xBase = INT32_MIN & XBASEMASK; // zero out the 2 lsb
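
Several branches above pack a slope scale index into the two least-significant bits of xBase, which is why segment boundaries are masked with XBASEMASK ("zero out the 2 lsb") before the index is OR-ed in. A self-contained sketch of that packing follows; the mask value and the numbers are assumptions chosen only to illustrate the bit layout:

    #include <cstdint>
    #include <cstdio>

    int main() {
        const uint32_t XBASEMASK = 0xFFFFFFFC;  // assumed: keep everything except the 2 lsb
        int32_t x_lower = -123456;              // example segment start (already a multiple of 4)
        uint32_t slope_scale_index = 3;         // example index, stored in the 2 lsb

        uint32_t xBase = (static_cast<uint32_t>(x_lower) & XBASEMASK) | slope_scale_index;
        std::printf("packed xBase = 0x%08X\n", xBase);

        // Masking the index back out restores the segment boundary the hardware will use.
        int32_t restored = static_cast<int32_t>(xBase & XBASEMASK);
        std::printf("restored x   = %d\n", restored);
        return 0;
    }
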
diff --git a/inference-engine/src/gna_plugin/descriptions/gna_flags.hpp b/inference-engine/src/gna_plugin/descriptions/gna_flags.hpp
index 34af49e5586..9bb0169183a 100644
--- a/inference-engine/src/gna_plugin/descriptions/gna_flags.hpp
+++ b/inference-engine/src/gna_plugin/descriptions/gna_flags.hpp
@@ -13,6 +13,7 @@ struct GNAFlags {
bool compact_mode = false;
bool exclusive_async_requests = false;
bool uniformPwlDesign = false;
+ float pwlMaxErrorPercent = 1.0f;
bool gna_openmp_multithreading = false;
bool sw_fp32 = false;
bool fake_quantized = false;
diff --git a/inference-engine/src/gna_plugin/frontend/layer_quantizer.hpp b/inference-engine/src/gna_plugin/frontend/layer_quantizer.hpp
index dba694b8055..6f38366f6e5 100644
--- a/inference-engine/src/gna_plugin/frontend/layer_quantizer.hpp
+++ b/inference-engine/src/gna_plugin/frontend/layer_quantizer.hpp
@@ -95,6 +95,15 @@ struct QuantPair {
static B optional () { return B();}
};
+struct FakeQuantizeParams {
+ bool paramsSet = false;
+ uint32_t levelsNum = 1;
+ float inputMinValue = 1.0f;
+ float inputMaxValue = 1.0f;
+ float outputMinValue = 1.0f;
+ float outputMaxValue = 1.0f;
+};
+
/**
* @brief should allocated blob for specific data type, in case of src blob is nullptr
* @tparam T
@@ -170,14 +179,41 @@ class Quant {
template <class T>
-inline InferenceEngine::Blob::Ptr fp32_to_precision_blob(InferenceEngine::Blob::Ptr fp32_blob, InferenceEngine::Precision precision, float scale_factor) {
+inline InferenceEngine::Blob::Ptr fp32_to_precision_blob(InferenceEngine::Blob::Ptr fp32_blob, InferenceEngine::Precision precision,
+ float scale_factor, const FakeQuantizeParams& fqParams) {
auto prec_blob = InferenceEngine::make_shared_blob<T>({ precision,
fp32_blob->getTensorDesc().getDims(), fp32_blob->getTensorDesc().getLayout() });
prec_blob->allocate();
+ auto input_low = 0.0f;
+ auto input_high = 0.0f;
+ auto output_low = 0.0f;
+ auto output_high = 0.0f;
+ auto levels = 1;
+ if (fqParams.paramsSet) {
+ input_low = fqParams.inputMinValue;
+ input_high = fqParams.inputMaxValue;
+ output_low = fqParams.outputMinValue;
+ output_high = fqParams.outputMaxValue;
+ levels = fqParams.levelsNum;
+ }
+
int i = 0;
for (auto& precValue : *prec_blob) {
- auto f32Value = fp32_blob->buffer().template as<InferenceEngine::PrecisionTrait<InferenceEngine::Precision::FP32>::value_type*>()[i++] * scale_factor;
+ auto f32Value = fp32_blob->buffer().template as<InferenceEngine::PrecisionTrait<InferenceEngine::Precision::FP32>::value_type*>()[i++];
+ if (fqParams.paramsSet) {
+ auto x = f32Value;
+ if (x <= std::min(input_low, input_high)) {
+ f32Value = output_low;
+ } else if (x > std::max(input_low, input_high)) {
+ f32Value = output_high;
+ } else {
+ f32Value = nearbyint((x - input_low) / (input_high - input_low) * (levels - 1)) /
+ (levels - 1) * (output_high - output_low) + output_low;
+ }
+ }
+
+ f32Value = f32Value * scale_factor;
if (f32Value > std::numeric_limits<T>::max()) {
precValue = std::numeric_limits<T>::max();
} else if (f32Value < std::numeric_limits<T>::min()) {
@@ -190,20 +226,21 @@ inline InferenceEngine::Blob::Ptr fp32_to_precision_blob(InferenceEngine::Blob::
return static_cast<InferenceEngine::Blob::Ptr>(prec_blob);
}
-inline InferenceEngine::Blob::Ptr fp32_to_precision_blob(InferenceEngine::Blob::Ptr fp32_blob, InferenceEngine::Precision precision, float scale_factor) {
+inline InferenceEngine::Blob::Ptr fp32_to_precision_blob(InferenceEngine::Blob::Ptr fp32_blob, InferenceEngine::Precision precision,
+ float scale_factor, const FakeQuantizeParams &fqParams) {
InferenceEngine::Blob::Ptr result_ptr = nullptr;
switch (precision) {
case InferenceEngine::Precision::FP32:
- result_ptr = fp32_to_precision_blob<float>(fp32_blob, precision, scale_factor);
+ result_ptr = fp32_to_precision_blob<float>(fp32_blob, precision, scale_factor, fqParams);
break;
case InferenceEngine::Precision::I32:
- result_ptr = fp32_to_precision_blob<int32_t>(fp32_blob, precision, scale_factor);
+ result_ptr = fp32_to_precision_blob<int32_t>(fp32_blob, precision, scale_factor, fqParams);
break;
case InferenceEngine::Precision::I16:
- result_ptr = fp32_to_precision_blob<int16_t>(fp32_blob, precision, scale_factor);
+ result_ptr = fp32_to_precision_blob<int16_t>(fp32_blob, precision, scale_factor, fqParams);
break;
case InferenceEngine::Precision::I8:
- result_ptr = fp32_to_precision_blob<int8_t>(fp32_blob, precision, scale_factor);
+ result_ptr = fp32_to_precision_blob<int8_t>(fp32_blob, precision, scale_factor, fqParams);
break;
default:
THROW_GNA_EXCEPTION << "FP32 to " << precision << " not supported";
@@ -304,13 +341,15 @@ inline void quantizeWeightsBiases(const QuantDesc & quantDesc,
auto quantData = InferenceEngine::getInjectedData(*wl);
{
- auto per_channel_weights = !quantData->_weights_quant.GetMinValues().empty();
+ auto weightsStats = !quantData->_weights_quant.GetMinValues().empty();
auto weightsScale = quantData->_weights_quant.GetScale();
auto dstScale = quantData->_dst_quant.GetScale();
- fnc(wl->_weights->buffer().as(),
- wl->_biases ? wl->_biases->buffer().as() : nullptr,
+ auto blob_precision = wl->_weights->getTensorDesc().getPrecision();
+ auto quantizedWeights = blob_precision != InferenceEngine::Precision::FP32 && blob_precision != InferenceEngine::Precision::FP16;
+ fnc(wl->_weights->buffer().as(),
+ wl->_biases ? wl->_biases->buffer().as() : nullptr,
intWeights->buffer(),
- intBiases ? intBiases->buffer() : static_cast(nullptr),
+ intBiases ? intBiases->buffer() : static_cast(nullptr),
input_scale_factor,
&weightsScale,
&dstScale,
@@ -318,12 +357,13 @@ inline void quantizeWeightsBiases(const QuantDesc & quantDesc,
num_columns,
num_rows_padded,
num_columns_padded,
+ quantizedWeights,
quantData->_weights_quant.GetLevels(),
- nullptr,
- nullptr,
- per_channel_weights ? &quantData->_weights_quant.GetMinValues().front(): nullptr,
- per_channel_weights ? &quantData->_weights_quant.GetMaxValues().front(): nullptr,
- &quantData->_weights_quantized);
+ quantData->_weights_quant.GetMinValues().size(),
+ weightsStats ? &quantData->_weights_quant.GetMinValues(true).front() : nullptr,
+ weightsStats ? &quantData->_weights_quant.GetMaxValues(true).front() : nullptr,
+ weightsStats ? &quantData->_weights_quant.GetMinValues(false).front() : nullptr,
+ weightsStats ? &quantData->_weights_quant.GetMaxValues(false).front() : nullptr);
}
wl->_weights = intWeights;
wl->_biases = intBiases;
@@ -410,19 +450,29 @@ inline void quantizeWeightsBiasesConv(const QuantDesc & quantDesc,
auto quantData = InferenceEngine::getInjectedData(*conv);
{
+ auto weightsStats = !quantData->_weights_quant.GetMinValues().empty();
auto weightsScale = quantData->_weights_quant.GetScale();
auto dstScale = quantData->_dst_quant.GetScale();
- fnc(conv->_weights->buffer().as(),
- conv->_biases ? conv->_biases->buffer().as() : nullptr,
+ auto blob_precision = conv->_weights->getTensorDesc().getPrecision();
+ auto quantizedWeights = blob_precision != InferenceEngine::Precision::FP32 && blob_precision != InferenceEngine::Precision::FP16;
+ fnc(conv->_weights->buffer().as(),
+ conv->_biases ? conv->_biases->buffer().as() : nullptr,
intWeights->buffer(),
- intBiases ? intBiases->buffer() : static_cast(nullptr),
+ intBiases ? intBiases->buffer() : static_cast(nullptr),
input_scale_factor,
&weightsScale,
&dstScale,
num_rows,
num_columns,
num_rows_padded,
- num_columns_padded);
+ num_columns_padded,
+ quantizedWeights,
+ quantData->_weights_quant.GetLevels(),
+ quantData->_weights_quant.GetMinValues().size(),
+ weightsStats ? &quantData->_weights_quant.GetMinValues(true).front() : nullptr,
+ weightsStats ? &quantData->_weights_quant.GetMaxValues(true).front() : nullptr,
+ weightsStats ? &quantData->_weights_quant.GetMinValues(false).front() : nullptr,
+ weightsStats ? &quantData->_weights_quant.GetMaxValues(false).front() : nullptr);
}
conv->_weights = intWeights;
conv->_biases = intBiases;
@@ -494,11 +544,22 @@ class DataQuantizer : public DataQuantizerBas
if (initial_precision == InferenceEngine::Precision::FP16) {
cnnLayer->blobs["custom"] = make_fp32_blob(cnnLayer->blobs["custom"]);
}
- auto const_scale_factor = InferenceEngine::getInjectedData<QuantizedLayerParams>(*cnnLayer)->_dst_quant.GetScale();
+ auto quantParams = InferenceEngine::getInjectedData<QuantizedLayerParams>(*cnnLayer);
auto new_const_blob = InferenceEngine::Blob::CreateFromData(cnnLayer->outData[0]);
auto const_blob = cnnLayer->blobs["custom"];
if (const_blob->getTensorDesc().getPrecision() == InferenceEngine::Precision::FP32) {
- cnnLayer->blobs["custom"] = fp32_to_precision_blob(const_blob, cnnLayer->outData[0]->getPrecision(), const_scale_factor);
+ auto fqParams = FakeQuantizeParams{};
+ if (quantParams->_dst_quant.IsStatsSet()) {
+ fqParams.paramsSet = true;
+ fqParams.levelsNum = quantParams->_dst_quant.GetLevels();
+ fqParams.inputMinValue = quantParams->_dst_quant.GetMinValues(true).front();
+ fqParams.inputMaxValue = quantParams->_dst_quant.GetMaxValues(true).front();
+ fqParams.outputMinValue = quantParams->_dst_quant.GetMinValues(false).front();
+ fqParams.outputMaxValue = quantParams->_dst_quant.GetMaxValues(false).front();
+ }
+
+ cnnLayer->blobs["custom"] = fp32_to_precision_blob(const_blob, cnnLayer->outData[0]->getPrecision(),
+ quantParams->_dst_quant.GetScale(), fqParams);
}
}
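
The same saturating fake-quantize formula is applied here to constant blobs and again in runFakeQuantize further down. A standalone restatement of that formula for a single value may help when reviewing both places; the ranges and level count in main are arbitrary examples:

    #include <algorithm>
    #include <cmath>
    #include <cstdio>

    // x below the input range clamps to output_low, above it clamps to output_high,
    // and values in between snap to one of `levels` evenly spaced output steps.
    float fakeQuantize(float x, float input_low, float input_high,
                       float output_low, float output_high, int levels) {
        if (x <= std::min(input_low, input_high)) return output_low;
        if (x >  std::max(input_low, input_high)) return output_high;
        return std::nearbyint((x - input_low) / (input_high - input_low) * (levels - 1)) /
               (levels - 1) * (output_high - output_low) + output_low;
    }

    int main() {
        // 256 levels over [-1, 1]: 0.30 snaps to the nearest representable step (~0.30196).
        std::printf("%f\n", fakeQuantize(0.30f, -1.0f, 1.0f, -1.0f, 1.0f, 256));
        return 0;
    }
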
diff --git a/inference-engine/src/gna_plugin/frontend/model_quantizer.hpp b/inference-engine/src/gna_plugin/frontend/model_quantizer.hpp
index 523fdb3d47a..dc867be0a9a 100644
--- a/inference-engine/src/gna_plugin/frontend/model_quantizer.hpp
+++ b/inference-engine/src/gna_plugin/frontend/model_quantizer.hpp
@@ -7,6 +7,7 @@
#include
#include
#include
+#include <type_traits>
#include
#include "gna_graph_tools.hpp"
@@ -77,7 +78,8 @@ class ModelQuantizer {
scaleIndex++;
}
- propagateScaleFactor(sortedNewNet, T::mandatory().getWeightsPrecision().size());
+ bool isFakeQuantize = std::is_same() || std::is_same();
+ propagateScaleFactor(sortedNewNet, T::mandatory().getWeightsPrecision().size(), isFakeQuantize);
// sorted order gives possibility for propagate quantisation along depended layers
for (auto &&layer : sortedNewNet) {
@@ -88,8 +90,8 @@ class ModelQuantizer {
}
private :
- void propagateScaleFactor(std::vector & net, int weightsBytesSize) const {
- ScaleFactorCalculator sf(net, weightsBytesSize);
+ void propagateScaleFactor(std::vector & net, int weightsBytesSize, bool fakeQuantize) const {
+ ScaleFactorCalculator sf(net, weightsBytesSize, fakeQuantize);
while (!sf.allLayersProcessed()) {
for (auto &&layer : sf.getStartLayers()) {
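
The fake-quantize path is selected at compile time from the quantizer descriptor type. A self-contained illustration of that std::is_same dispatch follows; QuantI16 and FakeQuantI16 here are stand-in tag types, not the plugin's real descriptors:

    #include <iostream>
    #include <type_traits>

    struct QuantI16 {};      // stand-in for a plain quantizer descriptor
    struct FakeQuantI16 {};  // stand-in for a fake-quantize aware descriptor

    template <class T>
    void quantizeModel() {
        bool isFakeQuantize = std::is_same<T, FakeQuantI16>();
        std::cout << "fake-quantize path: " << std::boolalpha << isFakeQuantize << "\n";
    }

    int main() {
        quantizeModel<QuantI16>();      // fake-quantize path: false
        quantizeModel<FakeQuantI16>();  // fake-quantize path: true
        return 0;
    }
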
diff --git a/inference-engine/src/gna_plugin/frontend/quantization.cpp b/inference-engine/src/gna_plugin/frontend/quantization.cpp
index 33999cffe3e..d8b5f9d4da3 100644
--- a/inference-engine/src/gna_plugin/frontend/quantization.cpp
+++ b/inference-engine/src/gna_plugin/frontend/quantization.cpp
@@ -9,6 +9,7 @@
#include
#include "backend/gna_types.h"
#include "quantization.h"
+#include
#ifdef DEBUG
#define QUANTWARNING(...) (fprintf(stderr, __VA_ARGS__))
@@ -19,26 +20,44 @@
template<>
void QuantizationCallback<int16_t, int32_t>::runFakeQuantize() const {
+ if (quantizedWeights) {
+ THROW_GNA_EXCEPTION << "Quantized weights are not yet supported in int16 quantization mode";
+ }
+
uint32_t num_saturate = 0;
+ auto input_low = 0.0f;
+ auto input_high = 0.0f;
+ auto output_low = 0.0f;
+ auto output_high = 0.0f;
+ auto levels = 1;
+ if (fq_num_stats > 0) {
+ input_low = *fq_ptr_input_low;
+ input_high = *fq_ptr_input_high;
+ output_low = *fq_ptr_output_low;
+ output_high = *fq_ptr_output_high;
+ levels = fq_levels;
+ }
for (uint32_t row = 0; row < num_rows; row++) {
for (uint32_t col = 0; col < num_columns; col++) {
float rounding_value = (ptr_float_weights[row * num_columns + col] > 0) ? 0.5f : -0.5f;
float value = ptr_float_weights[row * num_columns + col];
- if (!*ptr_quantized_weights) {
- value = value * *ptr_weight_scale_factor + rounding_value;
- } else {
- value -= MAX_VAL_2B_WEIGHT;
+ if (fq_num_stats > 0) {
+ auto x = value;
+ if (x <= std::min(input_low, input_high)) {
+ value = output_low;
+ } else if (x > std::max(input_low, input_high)) {
+ value = output_high;
+ } else {
+ value = nearbyint((x - input_low) / (input_high - input_low) * (levels - 1)) /
+ (levels - 1) * (output_high - output_low) + output_low;
+ }
}
+ value = value * *ptr_weight_scale_factor + rounding_value;
+
int16_t* ptr_weight_16 = ptr_int_weights + (row * num_columns_padded + col);
- if (*ptr_quantized_weights &&
- (value > std::numeric_limits::max() ||
- value < std::numeric_limits::min())) {
- THROW_GNA_EXCEPTION << "unsupported weights range for I16 quantisation: " << value;
- }
-
if (value > std::numeric_limits::max()) {
*ptr_weight_16 = std::numeric_limits::max();
num_saturate++;
@@ -91,37 +110,6 @@ void QuantizationCallback::runFakeQuantize() const {
template<>
void QuantizationCallback<int16_t, int32_t>::runQuantize() const {
uint32_t num_saturate = 0;
-
- if (*ptr_weight_scale_factor == 1.0) {
- // scale factor for weights is not calculated yet
- float mean_weight = 0.0;
- float mean_weight_squared = 0.0;
- float max_weight = -1e20f;
- float var_weight;
- float mean_plus_2stdev;
-
- for (uint32_t i = 0; i < num_rows; i++) {
- for (uint32_t j = 0; j < num_columns; j++) {
- float weight = ptr_float_weights[i * num_columns + j];
- mean_weight += weight;
- mean_weight_squared += weight * weight;
- if (fabs(weight) > max_weight) {
- max_weight = fabs(weight);
- }
- }
- }
-
- mean_weight /= static_cast(num_rows * num_columns);
- mean_weight_squared /= static_cast(num_rows * num_columns);
- var_weight = mean_weight_squared - mean_weight * mean_weight;
- mean_plus_2stdev = mean_weight + 2.0f * static_cast(sqrtf(var_weight));
-
- if (max_weight != 0.0f) {
- *ptr_weight_scale_factor = static_cast(MAX_VAL_2B_WEIGHT) / max_weight;
- }
- *ptr_output_scale_factor = input_scale_factor * *ptr_weight_scale_factor;
- }
-
for (uint32_t row = 0; row < num_rows; row++) {
for (uint32_t col = 0; col < num_columns; col++) {
float rounding_value = (ptr_float_weights[row * num_columns + col] > 0) ? 0.5f : -0.5f;
@@ -176,6 +164,24 @@ void QuantizationCallback::runQuantize() const {
}
}
+std::pair FindMinMaxValues(void* ptr_float_memory, size_t num_elements) {
+ float* ptr_float_feat = reinterpret_cast<float*>(ptr_float_memory);
+ float min = num_elements ? fabs(ptr_float_feat[0]) : 0.0f;
+ float max = num_elements ? fabs(ptr_float_feat[0]) : 0.0f;
+
+ for (size_t i = 1; i < num_elements; i++) {
+ if (fabs(ptr_float_feat[i]) > max) {
+ max = fabs(ptr_float_feat[i]);
+ }
+
+ if (fabs(ptr_float_feat[i]) < min) {
+ min = fabs(ptr_float_feat[i]);
+ }
+ }
+
+ return { min, max };
+}
+
float ScaleFactorForQuantization(void *ptr_float_memory, float target_max, size_t num_elements) {
float *ptr_float_feat = reinterpret_cast<float*>(ptr_float_memory);
float max = 0.0;
@@ -224,17 +230,37 @@ template<>
void QuantizationCallback<int8_t, gna_compound_bias_t>::runFakeQuantize() const {
uint32_t num_saturate = 0;
- if (fq_ptr_output_high == nullptr || fq_ptr_output_low == nullptr) {
- THROW_GNA_EXCEPTION << "Fake quantized output range not set";
- }
- if (fq_levels == 0 || fq_levels == 1) {
- THROW_GNA_EXCEPTION << "Fake quantized levels not set";
- }
-
+ auto input_low = 0.0f;
+ auto input_high = 0.0f;
+ auto output_low = 0.0f;
+ auto output_high = 0.0f;
+ auto levels = 1;
+ float valueAcc = 0.0;
for (uint32_t i = 0; i < num_rows; i++) {
- uint32_t channel_multiplier = ((fq_ptr_output_high[i] - fq_ptr_output_low[i]) *
- *ptr_weight_scale_factor) / (fq_levels - 1) + 0.5f;
- ptr_int_biases[i].multiplier = static_cast (channel_multiplier);
+ uint32_t channel_multiplier = 1;
+ if (fq_num_stats > 0) {
+ auto idx = fq_num_stats == 1 ? 0 : i;
+ input_low = fq_ptr_input_low[idx];
+ input_high = fq_ptr_input_high[idx];
+ output_low = fq_ptr_output_low[idx];
+ output_high = fq_ptr_output_high[idx];
+ levels = fq_levels;
+
+ channel_multiplier = ((input_high - input_low) * *ptr_weight_scale_factor) / (levels - 1);
+ } else {
+ float scaled_row_max = 0;
+ for (uint32_t col = 0; col < num_columns; col++) {
+ float value = ptr_float_weights[i * num_columns + col] * *ptr_weight_scale_factor;
+ valueAcc += value;
+ if (fabs(value) > scaled_row_max) {
+ scaled_row_max = fabs(value);
+ }
+ }
+
+ channel_multiplier = scaled_row_max / static_cast(MAX_VAL_1B_WEIGHT);
+ }
+
+ ptr_int_biases[i].multiplier = static_cast (channel_multiplier + 0.5f);
if (channel_multiplier > MAX_OUT_MULTIPLIER) {
THROW_GNA_EXCEPTION << "invalid channel multiplier: " << channel_multiplier;
}
@@ -243,19 +269,25 @@ void QuantizationCallback::runFakeQuantize() const
auto offset = i * num_columns + j;
auto rounding_value = (ptr_float_weights[i * num_columns + j] > 0) ? 0.5f : -0.5f;
float value = ptr_float_weights[offset];
- if (!*ptr_quantized_weights) {
+ if (!quantizedWeights) {
+ if (fq_num_stats > 0) {
+ auto x = value;
+ if (x <= std::min(input_low, input_high)) {
+ value = output_low;
+ } else if (x > std::max(input_low, input_high)) {
+ value = output_high;
+ } else {
+ value = nearbyint((x - input_low) / (input_high - input_low) * (levels - 1)) /
+ (levels - 1) * (output_high - output_low) + output_low;
+ }
+ }
+
value = value * (*ptr_weight_scale_factor / ptr_int_biases[i].multiplier) + rounding_value;
} else {
value -= MAX_VAL_1B_WEIGHT;
}
auto normalizedWeight = static_cast(value);
- if (*ptr_quantized_weights &&
- (value > std::numeric_limits::max() ||
- value < std::numeric_limits::min())) {
- THROW_GNA_EXCEPTION << "unsupported weights range for I8 quantization: " << value;
- }
-
if (value > std::numeric_limits::max()) {
normalizedWeight = std::numeric_limits::max();
num_saturate++;
@@ -309,40 +341,6 @@ void QuantizationCallback::runQuantize() const {
}
uint32_t num_saturate = 0;
- if (*ptr_weight_scale_factor == 1.0) {
- // scale factor for weights is not calculated yet
- float mean_weight = 0.0;
- float mean_weight_squared = 0.0;
- float max_weight = -1e20f;
- float var_weight;
- float mean_plus_2stdev;
-
- for (uint32_t i = 0; i < num_rows; i++) {
- for (uint32_t j = 0; j < num_columns; j++) {
- float weight = ptr_float_weights[i*num_columns + j];
- mean_weight += weight;
- mean_weight_squared += weight * weight;
- if (fabs(weight) > max_weight) {
- max_weight = fabs(weight);
- }
- }
- }
-
- mean_weight /= static_cast(num_rows * num_columns);
- mean_weight_squared /= static_cast(num_rows * num_columns);
- var_weight = mean_weight_squared - mean_weight * mean_weight;
- mean_plus_2stdev = mean_weight + 2.0f * static_cast(sqrtf(var_weight));
-
- *ptr_weight_scale_factor = static_cast(MAX_VAL_1B_WEIGHT) / max_weight;
-
- // For 8 bit weights quantize as follows:
- // 1. adjust scale factor to increase dynamic range of entire matrix by max multiplier
- // 2. find maximum scaled weight for each row
- // 3. find multiplier such that dividing by the multiplier brings row back within 8-bit dynamic range
- // 4. quantize and store scaled row
- *ptr_weight_scale_factor = MAX_OUT_MULTIPLIER * *ptr_weight_scale_factor; // increase dynamic range by max multiplier
- *ptr_output_scale_factor = input_scale_factor * *ptr_weight_scale_factor;
- }
float valueAcc = 0.0;
for (uint32_t row = 0; row < num_rows; row++) {
float scaled_row_max = 0;
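
When no per-channel statistics are available, the int8 path above derives a per-row multiplier from the largest scaled weight in the row and then quantizes the row with the weight scale divided by that multiplier. A self-contained numeric sketch of that idea follows; the 127 limit stands in for MAX_VAL_1B_WEIGHT and all other numbers are arbitrary:

    #include <algorithm>
    #include <cmath>
    #include <cstdint>
    #include <cstdio>
    #include <vector>

    int main() {
        const float kMax1BWeight = 127.0f;              // stand-in for MAX_VAL_1B_WEIGHT
        std::vector<float> row = { 0.02f, -1.7f, 0.9f };
        float weight_scale = 1024.0f;                   // example weights scale factor

        float scaled_row_max = 0.0f;
        for (auto w : row) {
            scaled_row_max = std::max(scaled_row_max, std::fabs(w * weight_scale));
        }

        // Round the multiplier, mirroring the "+ 0.5f" adjustment in the patch above.
        auto multiplier = static_cast<uint32_t>(scaled_row_max / kMax1BWeight + 0.5f);
        for (auto w : row) {
            float rounding = (w > 0) ? 0.5f : -0.5f;
            auto q = static_cast<int8_t>(w * (weight_scale / multiplier) + rounding);
            std::printf("%f -> %d (multiplier %u)\n", w, static_cast<int>(q),
                        static_cast<unsigned>(multiplier));
        }
        return 0;
    }
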
diff --git a/inference-engine/src/gna_plugin/frontend/quantization.h b/inference-engine/src/gna_plugin/frontend/quantization.h
index 67a72aadadf..1916bba298e 100644
--- a/inference-engine/src/gna_plugin/frontend/quantization.h
+++ b/inference-engine/src/gna_plugin/frontend/quantization.h
@@ -31,12 +31,13 @@ struct QuantizationCallback {
uint32_t num_rows_padded;
uint32_t num_columns_padded;
+ bool quantizedWeights;
int32_t fq_levels;
+ const size_t fq_num_stats;
const float *fq_ptr_input_low;
const float *fq_ptr_input_high;
- const float *fq_ptr_output_low;
- const float *fq_ptr_output_high;
- const bool* ptr_quantized_weights;
+ const float* fq_ptr_output_low;
+ const float* fq_ptr_output_high;
void runQuantize() const;
void runFakeQuantize() const;
@@ -45,5 +46,6 @@ struct QuantizationCallback {
template class QuantizationCallback<int16_t, int32_t>;
template class QuantizationCallback<int8_t, gna_compound_bias_t>;
+std::pair FindMinMaxValues(void* ptr_float_memory, size_t num_elements);
float ScaleFactorForQuantization(void *ptr_float_memory, float target_max, size_t num_elements);
void QuantizeVector16(float *ptr_float_memory, int16_t *ptr_int_memory, uint32_t num_elements, float scale_factor);
diff --git a/inference-engine/src/gna_plugin/frontend/quantized_layer_params.hpp b/inference-engine/src/gna_plugin/frontend/quantized_layer_params.hpp
index 5f6c6a60907..bf510c7bb50 100644
--- a/inference-engine/src/gna_plugin/frontend/quantized_layer_params.hpp
+++ b/inference-engine/src/gna_plugin/frontend/quantized_layer_params.hpp
@@ -24,27 +24,57 @@ public:
int32_t GetLevels() const {
return levels;
}
- void SetMinValues(const std::vector<float> &min) {
- min_values.clear();
- min_values.insert(min_values.end(), min.begin(), min.end());
+ bool IsStatsSet() const {
+ return !input_min_values.empty() && !input_max_values.empty();
}
- const std::vector<float>& GetMinValues() const {
- return min_values;
+ void SetMinValues(const std::vector<float> &min, bool input = true) {
+ if (input) {
+ input_min_values.clear();
+ input_min_values.insert(input_min_values.end(), min.begin(), min.end());
+ } else {
+ output_min_values.clear();
+ output_min_values.insert(output_min_values.end(), min.begin(), min.end());
+ }
}
- void SetMaxValues(const std::vector<float>& max) {
- max_values.clear();
- max_values.insert(max_values.end(), max.begin(), max.end());
+ std::vector<float>& GetMinValues(bool input = true) {
+ if (input) {
+ return input_min_values;
+ }
+
+ return output_min_values;
}
- const std::vector<float>& GetMaxValues() const {
- return max_values;
+ void SetMaxValues(const std::vector<float>& max, bool input = true) {
+ if (input) {
+ input_max_values.clear();
+ input_max_values.insert(input_max_values.end(), max.begin(), max.end());
+ } else {
+ output_max_values.clear();
+ output_max_values.insert(output_max_values.end(), max.begin(), max.end());
+ }
+ }
+ std::vector<float>& GetMaxValues(bool input = true) {
+ if (input) {
+ return input_max_values;
+ }
+
+ return output_max_values;
+ }
+ void CopyStats(Quantization &src) {
+ levels = src.GetLevels();
+ SetMinValues(src.GetMinValues(true), true);
+ SetMaxValues(src.GetMaxValues(true), true);
+ SetMinValues(src.GetMinValues(false), false);
+ SetMaxValues(src.GetMaxValues(false), false);
}
private:
float scale = 1.0f;
bool scale_set = false;
int32_t levels = 0;
- std::vector<float> min_values;
- std::vector<float> max_values;
+ std::vector<float> input_min_values;
+ std::vector<float> input_max_values;
+ std::vector<float> output_min_values;
+ std::vector<float> output_max_values;
};
struct QuantizedLayerParams {
@@ -53,7 +83,6 @@ struct QuantizedLayerParams {
// deprecate this
Quantization _weights_quant;
- bool _weights_quantized = false;
Quantization _bias_quant;
float _o_shift = 0.0f;
float _b_shift = 0.0f;
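
With the statistics split into input and output ranges, callers select the range via the bool flag and can forward everything at once with CopyStats. A hedged sketch of the intended call pattern follows; Q stands for the Quantization class from the hunk above, with namespace qualifiers omitted and the values made up:

    template <class Q>
    bool propagateExampleStats(Q& dst, Q& propagated) {
        dst.SetMinValues({ -1.0f }, true);   // input range
        dst.SetMaxValues({  1.0f }, true);
        dst.SetMinValues({ -0.5f }, false);  // output range
        dst.SetMaxValues({  0.5f }, false);

        propagated.CopyStats(dst);           // copies levels plus both ranges
        return propagated.IsStatsSet();      // true once input min/max are present
    }
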
diff --git a/inference-engine/src/gna_plugin/frontend/scale_factor_calc.hpp b/inference-engine/src/gna_plugin/frontend/scale_factor_calc.hpp
index 6791768e4e9..b6f5912a814 100644
--- a/inference-engine/src/gna_plugin/frontend/scale_factor_calc.hpp
+++ b/inference-engine/src/gna_plugin/frontend/scale_factor_calc.hpp
@@ -16,9 +16,13 @@
#include "layers/gna_layer_info.hpp"
#include "gna_plugin_log.hpp"
#include "gna_slope_scale.h"
+#include "runtime/pwl.h"
namespace GNAPluginNS {
namespace frontend {
+static const float MIN_SEARCH_WEIGHTS_VAL = 1.0f;
+static const float MAX_SEARCH_WEIGHTS_VAL = 1024.0f;
+
struct ScaleFactorUpdateResult {
InferenceEngine::CNNLayer *restartLayer = nullptr;
ScaleFactorUpdateResult() = default;
@@ -29,6 +33,146 @@ struct ScaleFactorUpdateResult {
}
};
+/**
+ * @brief Compares two float values and returns if they are equal
+ * @param p1 First float value
+ * @param p2 Second float value
+ * @return Returns true if two float values are equal
+ */
+static bool fp32eq(float p1, float p2) {
+ return (std::abs(p1 - p2) <= 0.00001f * std::min(std::abs(p1), std::abs(p2)));
+}
+
+/**
+ * @brief Calculates PWL slopes for specified function in a given input range
+ * @param info Layer information
+ * @return Array of slopes for a function
+ */
+static std::vector<float> getPWLSlopes(const LayerInfo& info) {
+ if (info.isIdentity() || info.isFakeQuantize() || info.isRelu() || info.isClamp() || info.isAbs()) {
+ return { 1.0f };
+ }
+
+ return {};
+}
+
+/**
+ * @brief Finds the best output activation scale factor that yields the most precise PWL slope
+ * @param inScale Input activation layer scale factor
+ * @param outScales Array of output activation scale factors
+ * @param slopes Array of slopes for a given function
+ * @return Best output activation scale factor
+ */
+static float selectBestOutputScaleFactors(float inScale, std::vector<float> outScales, const std::vector<float>& slopes) {
+ std::vector<float> scaleErrors;
+ for (size_t i = 0; i < outScales.size(); ++i) {
+ auto outScale = outScales[i];
+
+ auto sd = 0.0;
+ for (size_t j = 0; j < slopes.size(); ++j) {
+ auto s = gna_slope(slopes[j], inScale, outScale);
+ auto slope = static_cast(s.slope * s.slope_scale);
+ if (slope < std::numeric_limits::min() && slope > std::numeric_limits::max()) {
+ sd += std::numeric_limits::max();
+ continue;
+ }
+
+ auto testSlope = static_cast(slope) / s.slope_scale * inScale / outScale;
+ if (fp32eq(testSlope, slopes[j])) {
+ return outScale;
+ }
+
+ sd += pow(testSlope - slopes[j], 2.0);
+ }
+
+ sd /= slopes.size();
+ sd = sqrtf(sd);
+ scaleErrors.push_back(sd);
+ }
+
+ size_t minIndex = 0;
+ auto minError = scaleErrors[0];
+ for (size_t i = 1; i < scaleErrors.size(); ++i) {
+ if (scaleErrors[i] < minError) {
+ minError = scaleErrors[i];
+ minIndex = i;
+ }
+ }
+
+ return outScales[minIndex];
+}
+
+/**
+ * @brief Finds the weights scale factor that yields the most precise PWL slope
+ * @param inScale Input weightable layer scale factor
+ * @param outScale Output activation scale factor
+ * @param weightsScales Array of weights scales to check
+ * @return Best weights scale factor
+ */
+static float selectBestWeightsScaleFactors(float inScale, float outScale, std::vector<float> weightsScales,
+ const std::vector<float>& slopes) {
+ std::vector<float> scaleErrors;
+ for (size_t i = 0; i < weightsScales.size(); ++i) {
+ auto weightScale = weightsScales[i];
+
+ auto sd = 0.0;
+ for (size_t j = 0; j < slopes.size(); ++j) {
+ auto s = gna_slope(slopes[j], inScale * weightScale, outScale);
+ auto slope = static_cast(s.slope * s.slope_scale);
+ if (slope < std::numeric_limits::min() && slope > std::numeric_limits::max()) {
+ sd += std::numeric_limits::max();
+ continue;
+ }
+
+ auto testSlope = static_cast(slope) / s.slope_scale * (inScale * weightScale) / outScale;
+ if (fp32eq(testSlope, slopes[j])) {
+ return weightScale;
+ }
+ sd += pow(testSlope - slopes[j], 2.0);
+ }
+
+ sd /= slopes.size();
+ sd = sqrtf(sd);
+ scaleErrors.push_back(sd);
+ }
+
+ size_t minIndex = 0;
+ auto minError = scaleErrors[0];
+ for (size_t i = 1; i < scaleErrors.size(); ++i) {
+ if (scaleErrors[i] < minError) {
+ minError = scaleErrors[i];
+ minIndex = i;
+ }
+ }
+
+ return weightsScales[minIndex];
+}
+
+/**
+ * @brief Generates specified number of scale factors in a given range.
+ * @param startRange First scale factor
+ * @param endRange Last scale factor
+ * @param numScaleFactors Number of scale factors to generate
+ * @return Array of scale factors
+ */
+static std::vector<float> generateScaleFactors(float startRange, float endRange, size_t numScaleFactors) {
+ if (!numScaleFactors) {
+ return { startRange, endRange };
+ }
+
+ auto scaleFactors = std::vector<float>{};
+ auto domain = endRange - startRange;
+ auto step = domain / numScaleFactors;
+ for (size_t i = 0; i <= numScaleFactors; ++i) {
+ auto scale = startRange + step * i;
+ if (!std::isnan(scale)) {
+ scaleFactors.push_back(scale);
+ }
+ }
+
+ return scaleFactors;
+}
+
/**
* @brief calculates output scale factor per layer
* @tparam T
@@ -44,7 +188,7 @@ class ScaleFactorPerLayer {
* @param result
* @return
*/
- bool operator()(T cnnLayer, int weightsSize, ScaleFactorUpdateResult &result) {
+ bool operator()(T cnnLayer, int weightsSize, ScaleFactorUpdateResult &result, const bool fakeQuantize) {
return false;
}
};
@@ -54,17 +198,15 @@ class ScaleFactorPerLayer {
private :
const float activation_scale_factor = 2048.f;
const float identity_scale_factor = 2049.0f;
+ const float max_activation_scale_factor = 4096.0f;
const float k = 5;
const float k_identity = 6;
const double pow_domain = 16;
protected :
- static bool fp32eq(float p1, float p2) {
- return (std::abs(p1 - p2) <= 0.00001f * std::min(std::abs(p1), std::abs(p2)));
- }
-
float getActivationScale(InferenceEngine::CNNLayer const* cnnLayer,
- GNAPluginNS::LayerInfo const& layer) {
+ GNAPluginNS::LayerInfo const& layer,
+ const bool fakeQuantize) {
auto quantizedParams = InferenceEngine::getInjectedData(*cnnLayer);
// todo: calculate proper scale factor where we need to expand it a bit to be safe to stay in int16 weights
@@ -136,18 +278,140 @@ class ScaleFactorPerLayer {
}
}
- if (!quantizedParams->_dst_quant.GetMaxValues().empty()) {
- auto min_value = quantizedParams->_dst_quant.GetMinValues().front();
- auto max_value = quantizedParams->_dst_quant.GetMaxValues().front();
- auto newScaleFactor = (quantizedParams->_dst_quant.GetLevels() - 1) / (max_value - min_value);
- result = newScaleFactor < result ? newScaleFactor : result;
+ // Identity layer is inserted by GNA passes and requires statistics to correctly set output
+ // scale factor. POT does not produce any statistics for this layer as it does not exist
+ // in the source IR.
+ if (fakeQuantize && !quantizedParams->_dst_quant.IsScaleSet() && layer.isIdentity()) {
+ auto prevLayer = CNNNetPrevLayer(cnnLayer);
+ while (prevLayer != nullptr) {
+ auto prevQuantParams = InferenceEngine::getInjectedData<QuantizedLayerParams>(*prevLayer);
+ if (prevQuantParams->_dst_quant.IsStatsSet()) {
+ quantizedParams->_dst_quant.CopyStats(prevQuantParams->_dst_quant);
+ quantizedParams->_src_quant.CopyStats(prevQuantParams->_dst_quant);
+ break;
+ }
+
+ // Take the input statistics only if layer does not modify input values.
+ if (prevQuantParams->_src_quant.IsStatsSet() &&
+ (LayerInfo(prevLayer).isNonFunctional() || LayerInfo(prevLayer).isMemory() ||
+ LayerInfo(prevLayer).isConst() || LayerInfo(prevLayer).isInput())) {
+ quantizedParams->_dst_quant.CopyStats(prevQuantParams->_src_quant);
+ quantizedParams->_src_quant.CopyStats(prevQuantParams->_src_quant);
+ break;
+ }
+
+ // Stop searching for statistics if the previous layer modifies input values.
+ if ((LayerInfo(prevLayer).isWeightable() && !LayerInfo(prevLayer).isWeightableIdentity())
+ || LayerInfo(prevLayer).isEltwise() || LayerInfo(prevLayer).isActivation()) {
+ break;
+ }
+
+ if (!CNNNetHasPrevLayer(prevLayer.get())) {
+ break;
+ }
+
+ prevLayer = CNNNetPrevLayer(prevLayer);
+ }
+
+ // If no statistics were found in previous layers, check if a next layer has
+ // statistics set.
+ if (!quantizedParams->_dst_quant.IsStatsSet()) {
+ auto donotSkip = [](InferenceEngine::CNNLayerPtr) {
+ return false;
+ };
+
+ auto nextLayers = CNNNetGetAllNextLayersSkipCertain(cnnLayer, -1, donotSkip);
+ for (auto &l : nextLayers) {
+ auto nextQuantParams = InferenceEngine::getInjectedData<QuantizedLayerParams>(*l);
+ if (nextQuantParams->_src_quant.IsStatsSet()) {
+ quantizedParams->_dst_quant.CopyStats(nextQuantParams->_src_quant);
+ quantizedParams->_src_quant.CopyStats(nextQuantParams->_src_quant);
+ break;
+ }
+
+ // Take output statistics only if a next layer does not modify input values
+ if (nextQuantParams->_dst_quant.IsStatsSet() &&
+ (LayerInfo(l).isNonFunctional() || LayerInfo(l).isMemory())) {
+ quantizedParams->_dst_quant.CopyStats(nextQuantParams->_dst_quant);
+ quantizedParams->_src_quant.CopyStats(nextQuantParams->_dst_quant);
+ break;
+ }
+ }
+ }
+ }
+
+ // Adjust output scale factor based on statistics (if present) in the following steps:
+ // 1. calculate scale factor based on output min and max values
+ // 2. (temporary W/A) clamp scale factor to maximum activation scale factor
+ // 3. search previous layers if there was already scale factor set
+ // 4. adjust output scale factor to get the most precise PWL slope
+ if (quantizedParams->_dst_quant.IsStatsSet()) {
+ auto minOutValue = quantizedParams->_dst_quant.GetMinValues().front();
+ auto maxOutValue = quantizedParams->_dst_quant.GetMaxValues().front();
+ auto absMax = std::max(std::abs(minOutValue), std::abs(maxOutValue));
+ auto absMin = std::min(std::abs(minOutValue), std::abs(maxOutValue));
+
+ result = (quantizedParams->_dst_quant.GetLevels() - 1) / (maxOutValue - minOutValue);
+ if (0 && fp32eq(absMin, 0.0f) && !fp32eq(absMax, 0.0f)) {
+ result = (quantizedParams->_dst_quant.GetLevels() - 1) / (2 * absMax);
+ }
+ //
+ //result = MAX_VAL_2B_FEAT / absMax;
+ if (std::isinf(result) || fp32eq(absMax, 0.0f)) {
+ result = max_activation_scale_factor;
+ }
+
+ // TODO: remove clamping maximum scale factor
+ result = result > max_activation_scale_factor ? max_activation_scale_factor : result;
+ if (!layer.isIdentity() && !layer.isFakeQuantize() && !layer.isRelu() && !layer.isClamp()) {
+ result = result > activation_scale_factor ? activation_scale_factor : result;
+ }
+
+ // Take input scale factor from previous layer if previous layer does not modify
+ // input values
+ bool usePrevScaleFactor = false;
+ auto skipNonFunctional = [](InferenceEngine::CNNLayerPtr l) {
+ return LayerInfo(l).isNonFunctional();
+ };
+
+ auto prevLayer = CNNNetPrevLayerSkipCertain(cnnLayer, 0, skipNonFunctional);
+ auto prevLayer2 = prevLayer != nullptr? CNNNetPrevLayerSkipCertain(prevLayer, 0, skipNonFunctional): nullptr;
+ if (prevLayer != nullptr &&
+ (layer.isIdentity() || layer.isFakeQuantize()) && LayerInfo(prevLayer).isWeightableIdentity()) {
+ auto prevLayerQuant = InferenceEngine::getInjectedData<QuantizedLayerParams>(*prevLayer);
+ if (!fp32eq(prevLayerQuant->_src_quant.GetScale(), 1.0f) &&
+ (prevLayer2 == nullptr || LayerInfo(prevLayer2).has16BOutput())) {
+ result = prevLayerQuant->_src_quant.GetScale();
+ usePrevScaleFactor = true;
+ }
+ }
+
+ // Adjust output scale factor to get the most precise PWL slope.
+ // NOTE: Currently it is only implemented for identity, clamp, relu and FQ layers.
+ // For all other layers, it does not improve accuracy.
+ auto slopes = getPWLSlopes(layer);
+ if (!slopes.empty() && !usePrevScaleFactor) {
+ auto div = 10;
+ auto mul = 10;
+ auto startRange = result > 1.0f ? static_cast(result) : result;
+ auto endRange = startRange - startRange / div;
+ endRange = endRange > 1.0f ? static_cast(endRange) : endRange;
+ auto scaleFactors = generateScaleFactors(startRange, endRange, static_cast(startRange - endRange) * mul);
+ auto newScaleFactor = selectBestOutputScaleFactors(quantizedParams->_src_quant.GetScale(), scaleFactors, slopes);
+ if (!fp32eq(result, newScaleFactor) &&
+ !fp32eq(newScaleFactor, 1.0f) && !fp32eq(newScaleFactor, 0.0f) && !std::isinf(newScaleFactor)) {
+ gnalog() << "[INFO] Adjusting scale factor for " << cnnLayer->name
+ << " from: " << result << " to: " << newScaleFactor << "\n";
+ result = newScaleFactor;
+ }
+ }
}
return result;
}
public :
- bool operator()(InferenceEngine::CNNLayer *cnnLayer, int weightsSize, ScaleFactorUpdateResult &result) {
+ bool operator()(InferenceEngine::CNNLayer *cnnLayer, int weightsSize, ScaleFactorUpdateResult &result, const bool fakeQuantize) {
if ( !cnnLayer ) {
THROW_IE_EXCEPTION << "Incorrect Convolutional Layer pointer \n";
}
@@ -156,7 +420,11 @@ class ScaleFactorPerLayer {
auto quant = InferenceEngine::getInjectedData(*cnnLayer);
if (InferenceEngine::details::CaselessEq()(cnnLayer->type, "Memory")) {
- if (!CNNNetHasPrevLayer(cnnLayer) && quant->_dst_quant.IsScaleSet()) {
+ if (CNNNetHasPrevLayer(cnnLayer) && quant->_dst_quant.IsStatsSet() && !quant->_dst_quant.IsScaleSet()) {
+ auto minOutValue = quant->_dst_quant.GetMinValues().front();
+ auto maxOutValue = quant->_dst_quant.GetMaxValues().front();
+ auto scale = (quant->_dst_quant.GetLevels() - 1) / (maxOutValue - minOutValue);
+ quant->_dst_quant.SetScale(scale);
quant->_src_quant = quant->_dst_quant;
}
@@ -180,7 +448,9 @@ class ScaleFactorPerLayer {
return true;
}
- if (quantSibling->_dst_quant.IsScaleSet()) {
+ if ((!fakeQuantize && quantSibling->_dst_quant.IsScaleSet()) ||
+ (fakeQuantize && quantSibling->_dst_quant.IsScaleSet() && !fp32eq(quantSibling->_dst_quant.GetScale(), 1.0) &&
+ quantSibling->_dst_quant.GetScale() < inputQuant->_dst_quant.GetScale())) {
// means we already restarted propagation input memory layer
// need to search for requantiseable layer prior memory output layer
InferenceEngine::CNNLayerPtr restartedLayer;
@@ -230,7 +500,8 @@ class ScaleFactorPerLayer {
<< activation_scale_factor << ", restarting from corresponding memory: " << input->name << std::endl;
// try updating memory input layer scale factor and restart from it
- quantSibling->_src_quant = quantSibling->_dst_quant = inputQuant->_dst_quant;
+ quantSibling->_src_quant.SetScale(inputQuant->_dst_quant.GetScale());
+ quantSibling->_dst_quant.SetScale(inputQuant->_dst_quant.GetScale());
result = ScaleFactorUpdateResult(input.get());
return true;
}
@@ -241,49 +512,55 @@ class ScaleFactorPerLayer {
if (cnnLayer->type == "Const") {
if (quant->_dst_quant.IsScaleSet()) {
quant->_src_quant = quant->_dst_quant;
- return ScaleFactorUpdateResult();
- }
-
- auto blob = cnnLayer->blobs["custom"];
- auto blob_precision = blob->getTensorDesc().getPrecision();
-
- if (blob_precision != InferenceEngine::Precision::FP32 && blob_precision != InferenceEngine::Precision::FP16) {
- quant->_dst_quant.SetScale(1.0f);
return true;
}
- if (blob_precision == InferenceEngine::Precision::FP16) {
- blob = make_fp32_blob(blob);
- }
-
auto max_val = std::numeric_limits::min();
auto min_val = std::numeric_limits::max();
+ if (quant->_dst_quant.IsStatsSet()) {
+ min_val = quant->_dst_quant.GetMinValues().front();
+ max_val = quant->_dst_quant.GetMaxValues().front();
+ } else {
+ auto blob = cnnLayer->blobs["custom"];
+ auto blob_precision = blob->getTensorDesc().getPrecision();
- auto flt_buf = blob->buffer().as();
- auto size = blob->size();
+ if (blob_precision != InferenceEngine::Precision::FP32 && blob_precision != InferenceEngine::Precision::FP16) {
+ quant->_dst_quant.SetScale(1.0f);
+ return true;
+ }
- for (int i=0; i < size; i++) {
- auto val = flt_buf[i];
- if (val > max_val) max_val = val;
- if (val < min_val) min_val = val;
+ if (blob_precision == InferenceEngine::Precision::FP16) {
+ blob = make_fp32_blob(blob);
+ }
+
+ auto flt_buf = blob->buffer().as();
+ auto size = blob->size();
+
+ for (int i = 0; i < size; i++) {
+ auto val = flt_buf[i];
+ if (val > max_val) max_val = val;
+ if (val < min_val) min_val = val;
+ }
}
+ auto levels = fakeQuantize ? MAX_VAL_2B_FEAT : std::numeric_limits::max();
auto abs_val = std::max(std::abs(max_val), std::abs(min_val));
- auto scale_val = static_cast(std::numeric_limits::max()) / abs_val;
+ auto scale_val = static_cast<float>(levels) / abs_val;
+ //TODO: use FQ formula for scale factor calculation
- // TODO: Investigate what should be the scale in such cases (31910)
- if (std::isinf(scale_val)) {
- quant->_dst_quant.SetScale(quant->_src_quant.GetScale());
+ if (std::isinf(scale_val) || fp32eq(abs_val, 0.0f)) {
+ quant->_dst_quant.SetScale(fakeQuantize ? levels : 1.0f);
} else {
quant->_dst_quant.SetScale(scale_val);
}
+ quant->_src_quant.SetScale(quant->_dst_quant.GetScale());
- return ScaleFactorUpdateResult();
+ return true;
}
if (!CNNNetHasPrevLayer(cnnLayer)) {
quant->_dst_quant = quant->_src_quant;
- return ScaleFactorUpdateResult();
+ return true;
}
// by default layer is pass thru its scale factor
@@ -292,17 +569,41 @@ class ScaleFactorPerLayer {
THROW_GNA_EXCEPTION << "layer: " << CNNNetPrevLayer(cnnLayer)->name << "not quantized";
}
- quant->_src_quant = inputQuant->_dst_quant;
- if (layerInfo.isActivation()) {
+ if (layerInfo.isPower() && !layerInfo.isActivation()) {
+ auto quant = InferenceEngine::getInjectedData<QuantizedLayerParams>(*cnnLayer);
+ auto powerLayer = dynamic_cast<InferenceEngine::PowerLayer*>(cnnLayer);
+ if (!powerLayer) {
+ THROW_IE_EXCEPTION << "Incorrect Power Layer pointer \n";
+ }
+
+ auto powerScale = std::abs(powerLayer->scale);
+ if (fp32eq(powerScale, 0.0f)) {
+ powerScale = 1.0f;
+ }
+ auto weightsScaleFactor = MAX_VAL_2B_WEIGHT / powerScale;
+ quant->_src_quant.SetScale(inputQuant->_dst_quant.GetScale());
+ quant->_weights_quant.SetScale(weightsScaleFactor);
+ quant->_dst_quant.SetScale(quant->_weights_quant.GetScale() * quant->_src_quant.GetScale());
+ return true;
+ } else if (layerInfo.isActivation()) {
// todo: calculate proper scale factor where we need to expand it a bit to be safe to stay in int16 weights
// set the initial value
- if (!quant->_dst_quant.IsScaleSet()) {
- auto scale = getActivationScale(cnnLayer, layerInfo);
+ if (!quant->_dst_quant.IsScaleSet() || fp32eq(quant->_dst_quant.GetScale(), 1.0f) ||
+ !fp32eq(quant->_src_quant.GetScale(), inputQuant->_dst_quant.GetScale())) {
+ quant->_src_quant.SetScale(inputQuant->_dst_quant.GetScale());
+ auto scale = getActivationScale(cnnLayer, layerInfo, fakeQuantize);
quant->_dst_quant.SetScale(scale);
}
return true;
+ } else if (layerInfo.isCropAffined()) {
+ auto weightsScaleFactor = 1;
+ quant->_weights_quant.SetScale(weightsScaleFactor);
+ quant->_src_quant.SetScale(inputQuant->_dst_quant.GetScale());
+ quant->_dst_quant.SetScale(quant->_weights_quant.GetScale() * quant->_src_quant.GetScale());
+ return true;
}
- quant->_dst_quant = inputQuant->_dst_quant;
+ quant->_src_quant.SetScale(inputQuant->_dst_quant.GetScale());
+ quant->_dst_quant.SetScale(inputQuant->_dst_quant.GetScale());
return true;
}
@@ -311,7 +612,7 @@ class ScaleFactorPerLayer {
template<>
class ScaleFactorPerLayer {
public:
- bool operator()(InferenceEngine::EltwiseLayer* eltwiseLayer, int weightsSize, ScaleFactorUpdateResult &result) {
+ bool operator()(InferenceEngine::EltwiseLayer* eltwiseLayer, int weightsSize, ScaleFactorUpdateResult &result, const bool fakeQuantize) {
if ( !eltwiseLayer ) {
THROW_GNA_EXCEPTION << "Incorrect Eltwise Layer pointer \n";
}
@@ -325,7 +626,7 @@ class ScaleFactorPerLayer {
switch (eltwiseLayer->_operation) {
case InferenceEngine::EltwiseLayer::Prod: {
- quantData->_weights_quant = quantParams1->_dst_quant;
+ quantData->_weights_quant.SetScale(quantParams1->_dst_quant.GetScale());
quantData->_dst_quant.SetScale(quantParams0->_dst_quant.GetScale() * quantParams1->_dst_quant.GetScale());
break;
}
@@ -344,9 +645,51 @@ class ScaleFactorPerLayer {
std::swap(quantParams0, quantParams1);
}
+ auto prevLayer = in1;
+ while (LayerInfo(prevLayer).isNonFunctional() && CNNNetHasPrevLayer(prevLayer.get(), 0)) {
+ prevLayer = CNNNetPrevLayer(prevLayer);
+ }
+
// this path might result in significant data loss
quantData->_bias_quant.SetScale(quantParams1->_dst_quant.GetScale() / quantParams0->_dst_quant.GetScale());
- quantData->_weights_quant.SetScale(quantParams1->_dst_quant.GetScale() / quantParams0->_dst_quant.GetScale());
+ auto weightsScale = quantParams1->_dst_quant.GetScale() / quantParams0->_dst_quant.GetScale();
+ auto prevLayerIn1 = CNNNetPrevLayer(in1);
+ // If the previous layer is a layer where the weights scale factor can be freely selected,
+ // try to find a scale factor that allows an integer weights scale factor to be used for the
+ // eltwise operation.
+ // If the weights scale factor for eltwise sum/sub is not an integer, it causes accuracy degradation.
+ if (fakeQuantize && LayerInfo(in1).isWeightableIdentity() &&
+ (prevLayerIn1 == nullptr || LayerInfo(prevLayerIn1).has16BOutput())) {
+ auto bestWeightsScale = 0.0f;
+ auto bestError = static_cast(std::numeric_limits::max());
+ auto scaleIn0Dst = quantParams0->_dst_quant.GetScale();
+ auto scaleIn1Src = quantParams1->_src_quant.GetScale();
+ for (size_t i = MAX_VAL_2B_FEAT; i > 0; --i) {
+ auto scaleIn1Dst = i * scaleIn1Src;
+ auto eltwiseWeightsScale = scaleIn1Dst / scaleIn0Dst;
+ if (eltwiseWeightsScale < 1.0 || eltwiseWeightsScale > std::numeric_limits::max() - 1) {
+ continue;
+ }
+
+ auto error = std::abs(eltwiseWeightsScale - static_cast(eltwiseWeightsScale));
+ if (error < bestError) {
+ bestError = error;
+ bestWeightsScale = i;
+ }
+
+ if (fp32eq(error, 0.0f)) {
+ break;
+ }
+ }
+
+ if (!fp32eq(bestWeightsScale, quantParams1->_weights_quant.GetScale())) {
+ quantParams1->_weights_quant.SetScale(bestWeightsScale);
+ quantParams1->_dst_quant.SetScale(quantParams1->_weights_quant.GetScale() * quantParams1->_src_quant.GetScale());
+ result = ScaleFactorUpdateResult(in1.get());
+ return true;
+ }
+ }
+ quantData->_weights_quant.SetScale(weightsScale);
quantData->_dst_quant.SetScale(quantParams1->_dst_quant.GetScale());
// eltwise will always work in int16
@@ -382,6 +725,22 @@ class ScaleFactorPerLayer {
break;
}
+ if (fakeQuantize && info.isWeightableIdentity()) {
+ auto quantDataForInputLayer = InferenceEngine::getInjectedData<QuantizedLayerParams>(*in);
+ if (!fp32eq(quantDataForInputLayer->_weights_quant.GetScale(), 1.0f)) {
+ auto reducer = quantData->_weights_quant.GetScale() / std::numeric_limits::max();
+ reducer = std::max(1.0f, reducer);
+ auto newWeightsScale = quantDataForInputLayer->_weights_quant.GetScale() / reducer;
+ newWeightsScale = std::max(1.0f, newWeightsScale);
+ quantDataForInputLayer->_weights_quant.SetScale(static_cast(newWeightsScale));
+ quantDataForInputLayer->_dst_quant.SetScale(quantDataForInputLayer->_weights_quant.GetScale() *
+ quantDataForInputLayer->_src_quant.GetScale());
+
+ result = ScaleFactorUpdateResult(in.get());
+ return true;
+ }
+ }
+
// if we are here it means that we are in the port 1
if (info.isFullyConnected() || info.isConvolution()) {
auto quantDataForInputLayer = InferenceEngine::getInjectedData(*in);
@@ -408,7 +767,7 @@ class ScaleFactorPerLayer {
template<>
class ScaleFactorPerLayer<InferenceEngine::ConcatLayer*> {
public:
- bool operator()(InferenceEngine::ConcatLayer* concatLayer, int weightsSize, ScaleFactorUpdateResult &result) {
+ bool operator()(InferenceEngine::ConcatLayer* concatLayer, int weightsSize, ScaleFactorUpdateResult &result, const bool fakeQuantize) {
if ( !concatLayer ) {
THROW_GNA_EXCEPTION << "Incorrect Concat Layer pointer \n";
}
@@ -417,10 +776,6 @@ class ScaleFactorPerLayer {
THROW_GNA_EXCEPTION << "Concat layer has unsupported number of incoming layers.";
}
- auto fp32eq = [](float p1, float p2) -> bool {
- return (std::abs(p1 - p2) <= 0.00001f * std::min(std::abs(p1), std::abs(p2)));
- };
-
auto quantData = InferenceEngine::getInjectedData<QuantizedLayerParams>(*concatLayer);
std::vector<InferenceEngine::CNNLayerPtr> inputLayers;
for (auto input_idx = 0; input_idx != concatLayer->insData.size(); input_idx++) {
@@ -435,7 +790,7 @@ class ScaleFactorPerLayer {
auto in0 = inputLayers.front();
auto quantParams0 = InferenceEngine::getInjectedData(in0);
auto scaleFactor = quantParams0->_dst_quant.GetScale();
- auto scaleFactorCheck = [scaleFactor, &fp32eq](InferenceEngine::CNNLayerPtr& inputLayer) {
+ auto scaleFactorCheck = [scaleFactor](InferenceEngine::CNNLayerPtr& inputLayer) {
auto quantParams = InferenceEngine::getInjectedData(inputLayer);
return fp32eq(quantParams->_dst_quant.GetScale(), scaleFactor);
};
@@ -453,14 +808,14 @@ class ScaleFactorPerLayer {
};
GNAPluginNS::QuantizedLayerParams* sourceQuantParams = nullptr;
- auto firstInputIt = std::find_if(inputLayers.begin(), inputLayers.end(), inputLayerCheck);
- if (firstInputIt != inputLayers.end()) {
- auto quantParamsFirst = InferenceEngine::getInjectedData(*firstInputIt);
- auto nextInputIt = firstInputIt + 1;
+ auto sourceLayerIt = std::find_if(inputLayers.begin(), inputLayers.end(), inputLayerCheck);
+ if (sourceLayerIt != inputLayers.end()) {
+ auto quantParamsFirst = InferenceEngine::getInjectedData<QuantizedLayerParams>(*sourceLayerIt);
+ auto nextInputIt = sourceLayerIt + 1;
while ((nextInputIt = std::find_if(nextInputIt, inputLayers.end(), inputLayerCheck)) != inputLayers.end()) {
auto quantParamsSecond = InferenceEngine::getInjectedData(*nextInputIt);
if (!fp32eq(quantParamsSecond->_dst_quant.GetScale(), quantParamsFirst->_dst_quant.GetScale())) {
- THROW_GNA_EXCEPTION << "Two Input layers " << (*firstInputIt)->name
+ THROW_GNA_EXCEPTION << "Two Input layers " << (*sourceLayerIt)->name
<< " and " << (*nextInputIt)->name << " have different scales in concat!!! \n";
}
}
@@ -469,7 +824,6 @@ class ScaleFactorPerLayer {
// find a source quant value
// - 1st candidate - input layer
// - 2nd candidate - non-activation layer with non-1 scale factor
- // - 3rd candidate - 1st layer with non-1 scale factor
static std::map<std::string, size_t> restarted_counter;
auto restartedCountIt = restarted_counter.find(concatLayer->name);
if (restartedCountIt == restarted_counter.end()) {
@@ -477,29 +831,45 @@ class ScaleFactorPerLayer {
restartedCountIt = pos.first;
}
- auto sourceLayerIt = firstInputIt;
if (sourceLayerIt == inputLayers.end()) {
if (((restartedCountIt->second) / 2) % 2 == 1) {
std::reverse(inputLayers.begin(), inputLayers.end());
}
- if (((restartedCountIt->second) / 4) % 2 == 0) {
- auto sourceLayerCheck = [&fp32eq](InferenceEngine::CNNLayerPtr& inputLayer) {
- auto quantParams = InferenceEngine::getInjectedData(inputLayer);
- LayerInfo info(inputLayer);
- return !info.isActivation() && !fp32eq(quantParams->_dst_quant.GetScale(), 1.0f);
- };
- sourceLayerIt = std::find_if(inputLayers.begin(), inputLayers.end(), sourceLayerCheck);
+
+ if (fakeQuantize) {
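+ // In FakeQuantize mode choose the input with the smallest non-unit scale factor as the source,
+ // so the remaining concat inputs are requantized towards the most restrictive scale.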
+ sourceLayerIt = inputLayers.begin();
+ auto quantParamsFirst = InferenceEngine::getInjectedData<QuantizedLayerParams>(*inputLayers.begin());
+ auto minScaleFactor = quantParamsFirst->_dst_quant.GetScale();
+ for (auto it = inputLayers.begin(); it != inputLayers.end(); ++it) {
+ auto quantParams = InferenceEngine::getInjectedData<QuantizedLayerParams>(*it);
+ if ((quantParams->_dst_quant.GetScale() < minScaleFactor &&
+ !fp32eq(quantParams->_dst_quant.GetScale(), 1.0f)) ||
+ fp32eq(minScaleFactor, 1.0f)) {
+ minScaleFactor = quantParams->_dst_quant.GetScale();
+ sourceLayerIt = it;
+ }
+ }
+ } else {
+ if (((restartedCountIt->second) / 4) % 2 == 0) {
+ auto sourceLayerCheck = [](InferenceEngine::CNNLayerPtr& inputLayer) {
+ auto quantParams = InferenceEngine::getInjectedData<QuantizedLayerParams>(inputLayer);
+ LayerInfo info(inputLayer);
+ return !info.isActivation() && !fp32eq(quantParams->_dst_quant.GetScale(), 1.0f);
+ };
+ sourceLayerIt = std::find_if(inputLayers.begin(), inputLayers.end(), sourceLayerCheck);
+ }
+
+ if (sourceLayerIt == inputLayers.end()) {
+ auto nonDefaultScaleFactor = [](InferenceEngine::CNNLayerPtr& inputLayer) {
+ auto quantParams = InferenceEngine::getInjectedData<QuantizedLayerParams>(inputLayer);
+ return !fp32eq(quantParams->_dst_quant.GetScale(), 1.0f);
+ };
+
+ sourceLayerIt = std::find_if(inputLayers.begin(), inputLayers.end(), nonDefaultScaleFactor);
+ }
}
- }
- ++restartedCountIt->second;
- if (sourceLayerIt == inputLayers.end()) {
- auto nonDefaultScaleFactor = [&fp32eq](InferenceEngine::CNNLayerPtr& inputLayer) {
- auto quantParams = InferenceEngine::getInjectedData(inputLayer);
- return !fp32eq(quantParams->_dst_quant.GetScale(), 1.0f);
- };
-
- sourceLayerIt = std::find_if(inputLayers.begin(), inputLayers.end(), nonDefaultScaleFactor);
+ ++restartedCountIt->second;
}
std::set<size_t> concatIdxToUpdate;
@@ -514,24 +884,29 @@ class ScaleFactorPerLayer {
continue;
}
- // possible case when some of the concat inputs are free to select scale ex: const->concat<-affine
- if (!fp32eq(quantParamsIn->_dst_quant.GetScale(), 1.0f) && !LayerInfo(*it).isActivation()) {
+ if (fakeQuantize) {
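+ // In FakeQuantize mode collect every concat input whose scale differs from the source
+ // and requantize it to the chosen source scale factor.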
concatIdxToUpdate.insert(std::distance(inputLayers.begin(), it));
- }
+ quantParamsIn->_dst_quant.SetScale(quantParams->_dst_quant.GetScale());
+ } else {
+ // possible case when some of the concat inputs are free to select scale ex: const->concat<-affine
+ if (!fp32eq(quantParamsIn->_dst_quant.GetScale(), 1.0f) && !LayerInfo(*it).isActivation()) {
+ concatIdxToUpdate.insert(std::distance(inputLayers.begin(), it));
+ }
- quantParamsIn->_dst_quant.SetScale(quantParams->_dst_quant.GetScale());
+ quantParamsIn->_dst_quant.SetScale(quantParams->_dst_quant.GetScale());
+ }
}
}
auto updatedScaleFactor = InferenceEngine::getInjectedData(in0)->_dst_quant.GetScale();
- auto equalScaleFactor = [updatedScaleFactor, &fp32eq](InferenceEngine::CNNLayerPtr& inputLayer) {
+ auto equalScaleFactor = [updatedScaleFactor](InferenceEngine::CNNLayerPtr& inputLayer) {
auto quantParams = InferenceEngine::getInjectedData(inputLayer);
return fp32eq(quantParams->_dst_quant.GetScale(), updatedScaleFactor);
};
auto layerIt = std::find_if_not(inputLayers.begin() + 1, inputLayers.end(), equalScaleFactor);
if (layerIt != inputLayers.end()) {
- THROW_GNA_EXCEPTION << "layers entered into concat have different scale factors" << concatLayer->name;
+ THROW_GNA_EXCEPTION << "layers entered into concat have different scale factors. Layer name: " << concatLayer->name;
}
quantData->_dst_quant.SetScale(sourceQuantParams->_dst_quant.GetScale());
@@ -555,7 +930,7 @@ class ScaleFactorPerLayer {
gnalog() << "[UFS] from : " << concatLayer->name << " reached: " << layer->name;
// found that direct input to concat is an indirect parent of align filter - so no link required
auto info = LayerInfo(layer);
- if (!info.isWeightable() && !info.isActivation() && !info.isConst() && !info.isMemory()) {
+ if (!info.isWeightable() && !info.isActivation() && !info.isConst()) {
gnalog() << "... skipped\n";
return;
}
@@ -575,16 +950,44 @@ class ScaleFactorPerLayer {
auto restarLayerInfo = LayerInfo(restartedLayer);
if (restarLayerInfo.isActivation()) {
// requantize activation by just changing its output scale factor
- quantDataForConCatInput->_dst_quant.SetScale(sourceQuantParams->_dst_quant.GetScale());
- }
- if (restarLayerInfo.isConst()) {
+ auto newScaleFactor = sourceQuantParams->_dst_quant.GetScale();
+ auto skipNonFunctional = [](InferenceEngine::CNNLayerPtr l) {
+ return LayerInfo(l).isNonFunctional();
+ };
+
+ auto prevLayer = CNNNetPrevLayerSkipCertain(restartedLayer, 0, skipNonFunctional);
+ auto prevLayer2 = prevLayer != nullptr ? CNNNetPrevLayerSkipCertain(prevLayer, 0, skipNonFunctional) : nullptr;
+
+ if (fakeQuantize && prevLayer != nullptr && LayerInfo(prevLayer).isWeightableIdentity() &&
+ (prevLayer2 == nullptr || LayerInfo(prevLayer2).has16BOutput())) {
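+ // The activation is fed by a weightable identity; instead of plainly requantizing the activation,
+ // search for a weights scale factor that maps the identity's source scale onto the concat scale.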
+ auto weightsScales = generateScaleFactors(MIN_SEARCH_WEIGHTS_VAL, MAX_SEARCH_WEIGHTS_VAL,
+ MAX_SEARCH_WEIGHTS_VAL - MIN_SEARCH_WEIGHTS_VAL);
+
+ auto prevLayerQuant = InferenceEngine::getInjectedData<QuantizedLayerParams>(*prevLayer);
+ auto bestWeightsScale = 1.0f;
+ auto slopes = getPWLSlopes(restarLayerInfo);
+ if (!slopes.empty() && !fp32eq(prevLayerQuant->_src_quant.GetScale(), newScaleFactor)) {
+ bestWeightsScale = selectBestWeightsScaleFactors(prevLayerQuant->_src_quant.GetScale(),
+ newScaleFactor, weightsScales, { 1.0f });
+ }
+ if (!slopes.empty() && !fp32eq(bestWeightsScale, prevLayerQuant->_weights_quant.GetScale())) {
+ gnalog() << "[INFO][Concat] Optimizing weights scale factor for '" << prevLayer->name << "' layer. Change from "
+ << prevLayerQuant->_weights_quant.GetScale() << " to " << bestWeightsScale << "\n";
+
+ prevLayerQuant->_weights_quant.SetScale(bestWeightsScale);
+ prevLayerQuant->_dst_quant.SetScale(prevLayerQuant->_weights_quant.GetScale() * prevLayerQuant->_src_quant.GetScale());
+ result = ScaleFactorUpdateResult(prevLayer.get());
+ return true;
+ }
+ }
+
+ quantDataForConCatInput->_dst_quant.SetScale(newScaleFactor);
+ } else if (restarLayerInfo.isConst()) {
gnalog() << "... warning const layer will be requantized\n";
- quantDataForConCatInput->_dst_quant.SetScale(sourceQuantParams->_dst_quant.GetScale());
- }
- if (restarLayerInfo.isMemory()) {
- gnalog() << "... warning memory layer will be requantized\n";
quantDataForConCatInput->_src_quant.SetScale(sourceQuantParams->_dst_quant.GetScale());
quantDataForConCatInput->_dst_quant.SetScale(sourceQuantParams->_dst_quant.GetScale());
+ } else {
+ THROW_GNA_EXCEPTION << "cannot requantize '" << restartedLayer->name << "' input to concat: " << concatLayer->name;
}
result = ScaleFactorUpdateResult(restartedLayer.get());
}
@@ -607,7 +1010,7 @@ class ScaleFactorPerLayer {
uint16_t const _scale_change_threshold_200 = 200;
public:
- bool operator()(InferenceEngine::WeightableLayer *wl, int weightsSize, ScaleFactorUpdateResult &result) {
+ bool operator()(InferenceEngine::WeightableLayer *wl, int weightsSize, ScaleFactorUpdateResult &result, const bool fakeQuantize) {
if ( !wl ) {
THROW_GNA_EXCEPTION << "Incorrect Weightable Layer pointer \n";
} else if (!wl->_weights) {
@@ -620,8 +1023,30 @@ class ScaleFactorPerLayer {
auto quant = InferenceEngine::getInjectedData(*wl);
quant->_src_quant = quantDataForInputLayer->_dst_quant;
+ if (quant->_weights_quant.IsStatsSet() && !quant->_weights_quant.IsScaleSet()) {
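+ // Weights statistics from FakeQuantize are available but no scale factor was chosen yet:
+ // derive it from the most restrictive (minimum) per-channel scale.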
+ auto getScale = [&quant](size_t i) {
+ return (quant->_weights_quant.GetLevels() - 1) /
+ (quant->_weights_quant.GetMaxValues(false)[i] - quant->_weights_quant.GetMinValues(false)[i]);
+ };
+
+ float min_channel_scale = getScale(0);
+ for (uint32_t i = 1; i < quant->_weights_quant.GetMinValues().size(); i++) {
+ min_channel_scale = std::min(min_channel_scale, getScale(i));
+ }
+
+ auto multiplier = 1.0f;
+ if (quant->_weights_quant.GetLevels() <= std::numeric_limits<uint8_t>::max()) {
+ // GNA supports an additional multiplier only for 8-bit weights.
+ // The multiplier is used to extend the dynamic range.
+ multiplier = MAX_OUT_MULTIPLIER;
+ }
+
+ // Common weights scale calculation
+ quant->_weights_quant.SetScale(min_channel_scale * multiplier);
+ }
+
// TODO: pass 8 bits somehow
- if (quant->_weights_quant.GetScale() == 1.0f) {
+ if (!quant->_weights_quant.IsScaleSet()) {
size_t scaleRange = 0;
if (weightsSize == 2) {
scaleRange = MAX_VAL_2B_WEIGHT;
@@ -632,7 +1057,7 @@ class ScaleFactorPerLayer {
}
quant->_weights_quant.SetScale(
ScaleFactorForQuantization(wl->_weights->buffer().as<float*>(), scaleRange, wl->_weights->size()));
- if (quant->_weights_quant.GetScale() == -1.0f) {
+ if (quant->_weights_quant.GetScale() == -1.0f || (fakeQuantize && LayerInfo(wl).isConcatAlignFilter())) {
quant->_weights_quant.SetScale(1.0f);
}
@@ -685,6 +1110,39 @@ class ScaleFactorPerLayer {
}
quant->_dst_quant.SetScale(quant->_weights_quant.GetScale() * quant->_src_quant.GetScale());
+ if (quant->_dst_quant.IsStatsSet()) {
+ // Adjust weights scale factor if output values exceed int32 maximum value
+
+ if (wl->_biases && !quant->_bias_quant.IsScaleSet()) {
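+ // Quantize biases into the 4-byte bias range; if the resulting bias scale is smaller than the
+ // current output scale, tighten the weights scale so the biases stay representable.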
+ auto minMax = FindMinMaxValues(wl->_biases->buffer().as<float*>(), wl->_biases->size());
+ quant->_bias_quant.SetMinValues({ minMax.first });
+ quant->_bias_quant.SetMaxValues({ minMax.second });
+
+ auto biasScale = ScaleFactorForQuantization(wl->_biases->buffer().as<float*>(), MAX_VAL_4B_BIAS, wl->_biases->size());
+ quant->_bias_quant.SetScale(biasScale);
+ if (quant->_bias_quant.GetScale() != -1.0f && quant->_bias_quant.GetScale() < quant->_dst_quant.GetScale()) {
+ quant->_weights_quant.SetScale(quant->_bias_quant.GetScale() / quant->_src_quant.GetScale());
+ quant->_dst_quant.SetScale(quant->_weights_quant.GetScale() * quant->_src_quant.GetScale());
+ }
+ }
+
+ auto maxAbsVal = std::max(std::abs(quant->_dst_quant.GetMinValues().front()),
+ std::abs(quant->_dst_quant.GetMaxValues().front()));
+
+ auto maxIntVal = static_cast<int64_t>(maxAbsVal * quant->_dst_quant.GetScale() + 0.5f);
+ auto weightsReducer = static_cast<double>(maxIntVal) / std::numeric_limits<int32_t>::max();
+ weightsReducer = std::max(1.0, weightsReducer);
+ if (!fp32eq(weightsReducer, 1.0f)) {
+ quant->_weights_quant.SetScale(quant->_weights_quant.GetScale() / weightsReducer);
+ }
+
+ if (fp32eq(quant->_weights_quant.GetScale(), 0.0f) || std::isinf(quant->_weights_quant.GetScale())) {
+ quant->_weights_quant.SetScale(1.0f);
+ }
+
+ quant->_dst_quant.SetScale(quant->_weights_quant.GetScale() * quant->_src_quant.GetScale());
+ }
+
return true;
}
};
@@ -692,8 +1150,8 @@ class ScaleFactorPerLayer {
template<>
class ScaleFactorPerLayer<InferenceEngine::ScaleShiftLayer*> : public ScaleFactorPerLayer<InferenceEngine::WeightableLayer*> {
public:
- bool operator()(InferenceEngine::WeightableLayer *wl, int weightsSize, ScaleFactorUpdateResult &result) {
- return ScaleFactorPerLayer::operator()(wl, 2, result);
+ bool operator()(InferenceEngine::WeightableLayer *wl, int weightsSize, ScaleFactorUpdateResult &result, const bool fakeQuantize) {
+ return ScaleFactorPerLayer<InferenceEngine::WeightableLayer*>::operator()(wl, 2, result, fakeQuantize);
}
};
@@ -717,10 +1175,11 @@ class ScaleFactorCalculator {
mutable Cnt::const_iterator idx;
mutable bool needRestart = false;
int weightsBytesSize;
+ bool isFakeQuantize;
public:
- ScaleFactorCalculator(Cnt &net, int weightsBytesSize)
- : net(net), weightsBytesSize(weightsBytesSize) {
+ ScaleFactorCalculator(Cnt &net, int weightsBytesSize, bool fakeQuantize)
+ : net(net), weightsBytesSize(weightsBytesSize), isFakeQuantize(fakeQuantize) {
idx = std::begin(this->net);
}
bool needToRestart() const {
@@ -736,7 +1195,7 @@ class ScaleFactorCalculator {
bool operator()(T ptr) const {
needRestart = false;
frontend::ScaleFactorUpdateResult result;
- if (!frontend::ScaleFactorPerLayer()(ptr, weightsBytesSize, result)) {
+ if (!frontend::ScaleFactorPerLayer<T>()(ptr, weightsBytesSize, result, isFakeQuantize)) {
return false;
}
if (result) {
diff --git a/inference-engine/src/gna_plugin/gna_graph_compiler.cpp b/inference-engine/src/gna_plugin/gna_graph_compiler.cpp
index 19f22520a90..87afd6deb7d 100644
--- a/inference-engine/src/gna_plugin/gna_graph_compiler.cpp
+++ b/inference-engine/src/gna_plugin/gna_graph_compiler.cpp
@@ -740,6 +740,8 @@ void GNAGraphCompiler::PowerPrimitive(InferenceEngine::CNNLayerPtr layer) {
auto orientation = kDnnInterleavedOrientation;
auto activation_type = DnnActivation::fromType(kActPow);
+ activation_type.fqParams.set = false;
+ activation_type.srcFQParams.set = false;
activation_type.args.pow.exponent = power.power;
activation_type.args.pow.scale = power.scale;
activation_type.args.pow.offset = power.offset;
@@ -768,7 +770,8 @@ void GNAGraphCompiler::PowerPrimitive(InferenceEngine::CNNLayerPtr layer) {
PwlDesignOpt16(activation_type,
ptr_pwl_segments,
input_pwl_scale_factor,
- output_pwl_scale_factor);
+ output_pwl_scale_factor,
+ gnaFlags->pwlMaxErrorPercent);
}
}
@@ -1668,14 +1671,6 @@ void GNAGraphCompiler::AffineFilterPrimitive(InferenceEngine::CNNLayerPtr layer)
}
}
-void GNAGraphCompiler::FakeQuantizePrimitive(InferenceEngine::CNNLayerPtr layer) {
- // in FP32 mode lets use special form of activation that satisfies fakeQuantize formula
- if (gnaFlags->sw_fp32) {
- PWLPrimitive(layer);
- return;
- }
-}
-
void GNAGraphCompiler::PWLPrimitive(InferenceEngine::CNNLayerPtr layer) {
auto* generic = dynamic_cast(layer.get());
std::string type;
@@ -1768,6 +1763,24 @@ void GNAGraphCompiler::PWLPrimitive(InferenceEngine::CNNLayerPtr layer) {
THROW_GNA_EXCEPTION << "Activation function type not yet supported: " << type;
}
auto activation_type = DnnActivation::fromType(it->second);
+ activation_type.fqParams.set = false;
+ if (quantized != nullptr && quantized->_dst_quant.IsStatsSet()) {
+ activation_type.fqParams.set = true;
+ activation_type.fqParams.levels = quantized->_dst_quant.GetLevels();
+ activation_type.fqParams.inputPerChannel = false;
+ activation_type.fqParams.input_low = &(quantized->_dst_quant.GetMinValues(true).front());
+ activation_type.fqParams.input_high = &(quantized->_dst_quant.GetMaxValues(true).front());
+ }
+
+ activation_type.srcFQParams.set = false;
+ if (quantized != nullptr && quantized->_src_quant.IsStatsSet()) {
+ activation_type.srcFQParams.set = true;
+ activation_type.srcFQParams.levels = quantized->_src_quant.GetLevels();
+ activation_type.srcFQParams.inputPerChannel = false;
+ activation_type.srcFQParams.input_low = &(quantized->_src_quant.GetMinValues(true).front());
+ activation_type.srcFQParams.input_high = &(quantized->_src_quant.GetMaxValues(true).front());
+ }
+
if (it->second == kActRelu) {
auto reluLayer = dynamic_cast(layer.get());
activation_type.args.lrelu.negative_slope = reluLayer != nullptr ? reluLayer->negative_slope : 0.0f;
@@ -1775,11 +1788,9 @@ void GNAGraphCompiler::PWLPrimitive(InferenceEngine::CNNLayerPtr layer) {
activation_type.args.lrelu.negative_slope = 0.0f;
}
- if (it->second == kActFakeQuantize) {
+ if (quantized == nullptr && it->second == kActFakeQuantize) {
activation_type = GNAFakeQuantizeLayer(layer).parseAsActivation();
- }
-
- if (it->second == kActKaldiLstmClipping) {
+ } else if (it->second == kActKaldiLstmClipping) {
auto clamp_layer = dynamic_cast(layer.get());
if (clamp_layer) {
if (clamp_layer->min_value == 0 && clamp_layer->max_value == 0) {
@@ -1856,7 +1867,8 @@ case name:\
PwlDesignOpt16(activation_type,
ptr_pwl_segments,
input_pwl_scale_factor,
- output_pwl_scale_factor);
+ output_pwl_scale_factor,
+ gnaFlags->pwlMaxErrorPercent);
}
ptr_pwl_segments_target = reinterpret_cast(&ptr_pwl_segments_target);
}
@@ -2001,7 +2013,7 @@ void GNAGraphCompiler::CreateLayerPrimitive(CNNLayerPtr layer) {
{{DelayedCopyLayerName}, CREATE(CopyPrimitive)},
{{"TensorIterator"}, SKIP},
{{"LSTMCell"}, SKIP},
- {{"FakeQuantize"}, CREATE(FakeQuantizePrimitive)} // TODO: fakequantize layer should be properly converted to GNA scale factors for integer case
+ {{"FakeQuantize"}, CREATE(PWLPrimitive)}
};
(void)layersBuilder;
auto it = LayersBuilder::getStorage().find(layer->type);
diff --git a/inference-engine/src/gna_plugin/gna_graph_tools.hpp b/inference-engine/src/gna_plugin/gna_graph_tools.hpp
index 112e6060c30..bd3dfe90a9b 100644
--- a/inference-engine/src/gna_plugin/gna_graph_tools.hpp
+++ b/inference-engine/src/gna_plugin/gna_graph_tools.hpp
@@ -663,10 +663,10 @@ inline void CNNNetworkRemoveLayer(CNNLayerPtr layer, bool checkDims = true) {
}
gnalog() << "Removing " << layer->name << " layer\n";
if (layer->insData.size() != 1) {
- THROW_IE_EXCEPTION << "Cannot remove layer : "<< layer->name <<" that has not 1 input";
+ THROW_IE_EXCEPTION << "Cannot remove layer : "<< layer->name <<" that has different number of inputs than 1";
}
if (layer->outData.size() != 1) {
- THROW_IE_EXCEPTION << "Cannot remove layer : "<< layer->name <<" that has not 1 output";
+ THROW_IE_EXCEPTION << "Cannot remove layer : "<< layer->name <<" that has different number of outputs than 1";
}
auto isp = layer->insData.front().lock();
diff --git a/inference-engine/src/gna_plugin/gna_plugin.cpp b/inference-engine/src/gna_plugin/gna_plugin.cpp
index c8a337c3617..d978bbd46f5 100644
--- a/inference-engine/src/gna_plugin/gna_plugin.cpp
+++ b/inference-engine/src/gna_plugin/gna_plugin.cpp
@@ -24,7 +24,6 @@
#include
#include
#include "gna_plugin_config.hpp"
-#include
#include "gna_plugin.hpp"
#include "optimizer/gna_pass_manager.hpp"
#include "layers/gna_layer_type.hpp"
@@ -50,6 +49,10 @@
#include
#include
#include
+#include
+#include
+#include
+#include
#if GNA_LIB_VER == 2
#include
@@ -394,9 +397,9 @@ void GNAPlugin::UpdateInputScaleFromNetwork(InferenceEngine::CNNNetwork & networ
// search for FQ layers
// only supports cases of int16 or int8
InputsDataMap inputs = network.getInputsInfo();
- for (auto && input : inputs) {
+ size_t inputIdx = 0;
+ for (auto&& input : inputs) {
auto data = input.second->getInputData();
- size_t inputIdx = 0;
for (auto && nextToInputLayer : getInputTo(data)) {
if (!LayerInfo(nextToInputLayer.second).isFakeQuantize()) {
inputIdx++;
@@ -411,7 +414,16 @@ void GNAPlugin::UpdateInputScaleFromNetwork(InferenceEngine::CNNNetwork & networ
THROW_GNA_LAYER_EXCEPTION(nextToInputLayer.second)
<< "unsupported, per-channel quantization for input layer : " << input.second->name();
}
+
+ auto fp32eq = [](float p1, float p2) -> bool {
+ return (std::abs(p1 - p2) <= 0.00001f * std::min(std::abs(p1), std::abs(p2)));
+ };
float scaleInput = (fqLayer.getLevels() - 1) / (inputRange.second[0] - inputRange.first[0]);
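+ // If the FQ input range is one-sided (minimum equal to 0), assume a symmetric range of [-max, max]
+ // when deriving the input scale factor.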
+ auto minAbsVal = std::min(std::abs(inputRange.second[0]), std::abs(inputRange.first[0]));
+ auto maxAbsVal = std::max(std::abs(inputRange.second[0]), std::abs(inputRange.first[0]));
+ if (fp32eq(minAbsVal, 0.0f) && !fp32eq(maxAbsVal, 0.0f)) {
+ scaleInput = (fqLayer.getLevels() - 1) / (2 * maxAbsVal);
+ }
if (!config.inputScaleFactors.empty()) {
gnalog() << "Scale factor calculated during model quantization (" << scaleInput
@@ -676,6 +688,68 @@ void GNAPlugin::ConvertModelLayoutFromNCHWToNHWC(const std::vector
}
}
+#ifdef PLOT
+void GNAPlugin::AddDebugProperties(const InferenceEngine::CNNLayerPtr layer,
+ InferenceEngine::ordered_properties& printed_properties,
+ InferenceEngine::ordered_properties& node_properties) {
+ // printing quantized params
+ auto quantized = InferenceEngine::getInjectedData<QuantizedLayerParams>(layer);
+ if (!quantized) {
+ return;
+ }
+ if (LayerInfo(layer).isWeightable() || LayerInfo(layer).isEltwise()) {
+ printed_properties.emplace_back(
+ "weights scale factor", std::to_string(quantized->_weights_quant.GetScale()));
+ if (quantized->_weights_quant.IsStatsSet()) {
+ for (auto& min : quantized->_weights_quant.GetMinValues()) {
+ printed_properties.emplace_back(
+ "weights min val", std::to_string(min));
+ }
+ for (auto& max : quantized->_weights_quant.GetMaxValues()) {
+ printed_properties.emplace_back(
+ "weights max val", std::to_string(max));
+ }
+ }
+
+ if (quantized->_bias_quant.IsStatsSet()) {
+ for (auto& min : quantized->_bias_quant.GetMinValues()) {
+ printed_properties.emplace_back(
+ "bias min val", std::to_string(min));
+ }
+ for (auto& max : quantized->_bias_quant.GetMaxValues()) {
+ printed_properties.emplace_back(
+ "bias max val", std::to_string(max));
+ }
+ }
+ }
+ printed_properties.emplace_back(
+ "src scale factor", std::to_string(quantized->_src_quant.GetScale()));
+ if (quantized->_src_quant.IsStatsSet()) {
+ for (auto& min : quantized->_src_quant.GetMinValues()) {
+ printed_properties.emplace_back(
+ "src min val", std::to_string(min));
+ }
+ for (auto& max : quantized->_src_quant.GetMaxValues()) {
+ printed_properties.emplace_back(
+ "src max val", std::to_string(max));
+ }
+ }
+
+ printed_properties.emplace_back(
+ "dst scale factor", std::to_string(quantized->_dst_quant.GetScale()));
+ if (quantized->_dst_quant.IsStatsSet()) {
+ for (auto& min : quantized->_dst_quant.GetMinValues()) {
+ printed_properties.emplace_back(
+ "dst min val", std::to_string(min));
+ }
+ for (auto& max : quantized->_dst_quant.GetMaxValues()) {
+ printed_properties.emplace_back(
+ "dst max val", std::to_string(max));
+ }
+ }
+}
+#endif
+
void GNAPlugin::LoadNetwork(CNNNetwork & _network) {
std::shared_ptr convertedNetwork;
if (_network.getFunction()) {
@@ -698,6 +772,10 @@ void GNAPlugin::LoadNetwork(CNNNetwork & _network) {
// UnrollTI transformation is disabled by default, is turned on by LowLatency transformation
return node->get_rt_info().count("UNROLL_TI") == 0;
});
+ pass_config->disable();
+ pass_config->disable();
+ pass_config->disable();
+ pass_config->disable();
manager.run_passes(graph);
convertedNetwork = InferenceEngine::details::convertFunctionToICNNNetwork(graph, clonedNetwork);
}
@@ -809,17 +887,11 @@ void GNAPlugin::LoadNetwork(CNNNetwork & _network) {
#ifdef PLOT
std::ofstream file("gna_passes.dot");
- saveGraphToDot(newNet, file, [](const CNNLayerPtr layer,
- ordered_properties &printed_properties,
- ordered_properties &node_properties) {
- // printing quantized params
- auto quantized = InferenceEngine::getInjectedData(layer);
- if (!quantized) {
- return;
- }
- printed_properties.emplace_back(
- "scale factor", std::to_string(quantized->_dst_quant.GetScale()));
- });
+ saveGraphToDot(newNet, file, [this](const CNNLayerPtr layer,
+ ordered_properties& printed_properties,
+ ordered_properties& node_properties) {
+ AddDebugProperties(layer, printed_properties, node_properties);
+ });
#endif
auto sortedNet = CNNNetSortTopologicallyEx(newNet, make_fuzed_order);
diff --git a/inference-engine/src/gna_plugin/gna_plugin.hpp b/inference-engine/src/gna_plugin/gna_plugin.hpp
index 1a6e20d558c..0af27ba6572 100644
--- a/inference-engine/src/gna_plugin/gna_plugin.hpp
+++ b/inference-engine/src/gna_plugin/gna_plugin.hpp
@@ -23,6 +23,7 @@
#include "gna_plugin_policy.hpp"
#include "gna_plugin_log.hpp"
#include "gna_plugin_config.hpp"
+#include
#if GNA_LIB_VER == 2
#include
@@ -237,6 +238,11 @@ class GNAPlugin : public InferenceEngine::IInferencePlugin {
* @param layers model sorted layers
*/
void ConvertModelLayoutFromNCHWToNHWC(const std::vector &layers);
+#ifdef PLOT
+ void AddDebugProperties(const InferenceEngine::CNNLayerPtr layer,
+ InferenceEngine::ordered_properties& printed_properties,
+ InferenceEngine::ordered_properties& node_properties);
+#endif
};
} // namespace GNAPluginNS
diff --git a/inference-engine/src/gna_plugin/gna_plugin_config.cpp b/inference-engine/src/gna_plugin/gna_plugin_config.cpp
index 60d4d854214..b7d20534733 100644
--- a/inference-engine/src/gna_plugin/gna_plugin_config.cpp
+++ b/inference-engine/src/gna_plugin/gna_plugin_config.cpp
@@ -156,6 +156,24 @@ void Config::UpdateFromMap(const std::map& config) {
THROW_GNA_EXCEPTION << "GNA pwl uniform algorithm parameter "
<< "should be equal to YES/NO, but not" << value;
}
+ } else if (key == GNA_CONFIG_KEY(PWL_MAX_ERROR_PERCENT)) {
+ float max_error;
+ try {
+ max_error = InferenceEngine::CNNLayer::ie_parse_float(value);
+ if (max_error < 0.0f || max_error > 100.0f) {
+ throw std::out_of_range("");
+ }
+ }
+ catch (std::invalid_argument&) {
+ THROW_GNA_EXCEPTION << "Invalid value of PWL max error percent";
+ }
+ catch (std::out_of_range&) {
+ log << "Unsupported PWL error percent value: " << value
+ << ", should be greater than 0 and less than 100";
+ THROW_GNA_EXCEPTION << "Unsupported PWL error percent value: " << value
+ << ", should be greater than 0 and less than 100";
+ }
+ gnaFlags.pwlMaxErrorPercent = max_error;
} else if (key == CONFIG_KEY(PERF_COUNT)) {
if (value == PluginConfigParams::YES) {
gnaFlags.performance_counting = true;
@@ -252,6 +270,7 @@ void Config::AdjustKeyMapValues() {
keyConfigMap[GNA_CONFIG_KEY(PRECISION)] = gnaPrecision.name();
keyConfigMap[GNA_CONFIG_KEY(PWL_UNIFORM_DESIGN)] =
gnaFlags.uniformPwlDesign ? PluginConfigParams::YES: PluginConfigParams::NO;
+ keyConfigMap[GNA_CONFIG_KEY(PWL_MAX_ERROR_PERCENT)] = std::to_string(gnaFlags.pwlMaxErrorPercent);
keyConfigMap[CONFIG_KEY(PERF_COUNT)] =
gnaFlags.performance_counting ? PluginConfigParams::YES: PluginConfigParams::NO;
keyConfigMap[GNA_CONFIG_KEY(LIB_N_THREADS)] = std::to_string(gnaFlags.gna_lib_async_threads_num);
diff --git a/inference-engine/src/gna_plugin/layers/gna_fake_quantize_layer.hpp b/inference-engine/src/gna_plugin/layers/gna_fake_quantize_layer.hpp
index c80cb62d6e5..9d30126a1ce 100644
--- a/inference-engine/src/gna_plugin/layers/gna_fake_quantize_layer.hpp
+++ b/inference-engine/src/gna_plugin/layers/gna_fake_quantize_layer.hpp
@@ -29,7 +29,7 @@ class GNAFakeQuantizeLayer {
DnnActivation parseAsActivation() const {
DnnActivation fqActivation;
- fqActivation.args.fakeQuantize.levels = fqLayer->GetParamAsInt("levels");
+ fqActivation.fqParams.levels = fqLayer->GetParamAsInt("levels");
auto inputShape = getShapeForRange(fqLayer, 1);
auto outputShape = getShapeForRange(fqLayer, 3);
@@ -37,13 +37,15 @@ class GNAFakeQuantizeLayer {
auto inputRangeSize = InferenceEngine::details::product(inputShape.begin(), inputShape.end());
auto outputRangeSize = InferenceEngine::details::product(outputShape.begin(), outputShape.end());
- fqActivation.args.fakeQuantize.inputPerChannel = inputRangeSize != 1;
- fqActivation.args.fakeQuantize.input_low = getParamFromInputAsFloats(fqLayer, 1);
- fqActivation.args.fakeQuantize.input_high = getParamFromInputAsFloats(fqLayer, 2);
+ fqActivation.fqParams.set = true;
- fqActivation.args.fakeQuantize.outputPerChannel = outputRangeSize != 1;
- fqActivation.args.fakeQuantize.output_low = getParamFromInputAsFloats(fqLayer, 3);
- fqActivation.args.fakeQuantize.output_high = getParamFromInputAsFloats(fqLayer, 4);
+ fqActivation.fqParams.inputPerChannel = inputRangeSize != 1;
+ fqActivation.fqParams.input_low = getParamFromInputAsFloats(fqLayer, 1);
+ fqActivation.fqParams.input_high = getParamFromInputAsFloats(fqLayer, 2);
+
+ fqActivation.fqParams.outputPerChannel = outputRangeSize != 1;
+ fqActivation.fqParams.output_low = getParamFromInputAsFloats(fqLayer, 3);
+ fqActivation.fqParams.output_high = getParamFromInputAsFloats(fqLayer, 4);
fqActivation.type = kActFakeQuantize;
return fqActivation;
diff --git a/inference-engine/src/gna_plugin/layers/gna_layer_info.hpp b/inference-engine/src/gna_plugin/layers/gna_layer_info.hpp
index 6c1bf161e28..1112160974b 100644
--- a/inference-engine/src/gna_plugin/layers/gna_layer_info.hpp
+++ b/inference-engine/src/gna_plugin/layers/gna_layer_info.hpp
@@ -103,7 +103,8 @@ class LayerInfo {
"neglog",
"neghalflog",
"softsign",
- "power"};
+ "power",
+ "fakequantize"};
if (isPower()) {
auto powerLayer = as();
@@ -157,7 +158,10 @@ class LayerInfo {
IS_VALID();
return nullptr != as();
}
-
+ bool isSyntheticScaleShift() const noexcept {
+ IS_VALID();
+ return layer->name.find("SyntheticScaleShift") != std::string::npos;
+ }
bool isEltwise() const noexcept {
IS_VALID();
return nullptr != as();
@@ -193,6 +197,18 @@ class LayerInfo {
bool isIdentity() const noexcept {
return isOfType("identity");
}
+ bool isTanh() const noexcept {
+ return isOfType("tanh");
+ }
+ bool isSigmoid() const noexcept {
+ return isOfType("sigmoid");
+ }
+ bool isSoftSign() const noexcept {
+ return isOfType("softsign");
+ }
+ bool isClamp() const noexcept {
+ return isOfType("clamp");
+ }
bool isFullyConnected() const noexcept {
return isOfType("FullyConnected") || isOfType("InnerProduct");
}
@@ -283,6 +299,9 @@ class LayerInfo {
bool isCopyDelayed() const noexcept {
return isOfType(DelayedCopyLayerName);
}
+ bool isWeightableIdentity() const noexcept {
+ return isConcatAlignFilter() || isSyntheticScaleShift() || isCropAffined();
+ }
size_t paddingSize() const {
static InferenceEngine::details::caseless_set layersWithPossiblePadding = {"FullyConnected",
diff --git a/inference-engine/src/gna_plugin/optimizer/gna_pass_manager.cpp b/inference-engine/src/gna_plugin/optimizer/gna_pass_manager.cpp
index c6233547677..d32b49c42c7 100644
--- a/inference-engine/src/gna_plugin/optimizer/gna_pass_manager.cpp
+++ b/inference-engine/src/gna_plugin/optimizer/gna_pass_manager.cpp
@@ -39,6 +39,7 @@
#include "frontend/quantization.h"
#include "gna_groups.hpp"
#include "gna_graph_patterns.hpp"
+#include "gna_data_types.hpp"
using namespace InferenceEngine;
using namespace InferenceEngine::details;
@@ -54,6 +55,10 @@ std::shared_ptr BasePass::getPassManager() {
return sharedMgr;
}
+
+static bool fp32eq(float p1, float p2) {
+ return (std::abs(p1 - p2) <= 0.00001f * std::min(std::abs(p1), std::abs(p2)));
+}
// indexes stored in pass manager
static const char identityLayersCounterName[] = "identityLayerCounter";
static const char diagonalLayersCounterName[] = "diagonalLayerCounter";
@@ -1836,9 +1841,6 @@ void FuseFQIntoWeightsPass::run() {
weightableLayer->insData.resize(1);
// 2. running FQ function for given layer
- if (weightDims.size() != 2) {
- THROW_GNA_LAYER_EXCEPTION(fqLayer) << " layout of weigths not equal to NC not yet supported";
- }
auto outputSize = details::product(weightDims.begin(), weightDims.end());
// depending on compute precision weights will be recreated
@@ -1874,61 +1876,42 @@ void FuseFQIntoWeightsPass::run() {
// check if
// - weights were float values and need to be quantized,
// - weights are integer values and quantization can be skipped
- for (size_t i = 0; i < outputRange.first.size(); ++i) {
- if (inputRange.first[i] > outputRange.first[i] ||
- inputRange.second[i] > outputRange.second[i]) {
- quantized->_weights_quantized = true;
- break;
- }
- }
-
- quantized->_weights_quant.SetMinValues(outputRange.first);
- quantized->_weights_quant.SetMaxValues(outputRange.second);
+ quantized->_weights_quant.SetMinValues(inputRange.first, true);
+ quantized->_weights_quant.SetMaxValues(inputRange.second, true);
+ quantized->_weights_quant.SetMinValues(outputRange.first, false);
+ quantized->_weights_quant.SetMaxValues(outputRange.second, false);
quantized->_weights_quant.SetLevels(levels);
// lets find out minimum scale factor among channels
- if (quantized->_weights_quant.GetMinValues().empty()) {
+ if (!quantized->_weights_quant.IsStatsSet()) {
THROW_GNA_LAYER_EXCEPTION(fqLayer) << " per-channel/tensor weights scales are missing";
}
- auto getScale = [&quantized](size_t i) {
- return (quantized->_weights_quant.GetLevels() - 1) /
- (quantized->_weights_quant.GetMaxValues()[i] - quantized->_weights_quant.GetMinValues()[i]);
- };
-
- float min_channel_scale = getScale(0);
- for (uint32_t i = 1; i < quantized->_weights_quant.GetMinValues().size(); i++) {
- min_channel_scale = std::min(min_channel_scale, getScale(i));
- }
-
- auto multiplier = 1.0f;
- if (quantized->_weights_quant.GetLevels() <= std::numeric_limits::max()) {
- // GNA supports additional multiplier for only 8bit weights.
- // The multipler is used to extend dynamic range.
- multiplier = MAX_OUT_MULTIPLIER;
- }
-
- // Common weights scale calculation
- quantized->_weights_quant.SetScale(min_channel_scale * multiplier);
continue;
}
+ size_t depth = 1;
intel_dnn_component_t component;
component.num_columns_in = weightDims[1];
component.num_rows_in = weightDims[0];
+ if (LayerInfo(weightableLayer).isConvolution()) {
+ depth = (weightDims.size() == 4)? weightDims[3]: 1;
+ }
+
intel_piecewiselinear_t *transform = reinterpret_cast<intel_piecewiselinear_t *>(&component.op.pwl);
transform->func_id = gnaFakeQuantizeLayer.parseAsActivation();
auto quantizedWeightsData = quantizedWeights->buffer();
- component.ptr_inputs = quantizedWeightsData.as();
-
auto dequantizedWeights = make_shared_blob<float>(TensorDesc(Precision::FP32, {outputSize}, Layout::C));
dequantizedWeights->allocate();
auto resultBuffer = dequantizedWeights->buffer();
- component.ptr_outputs = resultBuffer.as();
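+ // Apply the FakeQuantize transfer function slice by slice so each depth slice of the
+ // convolution weights is dequantized with its own input/output offsets.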
+ for (size_t i = 0; i < depth; ++i) {
+ component.ptr_inputs = quantizedWeightsData.as<float*>() + i * component.num_columns_in * component.num_rows_in;
+ component.ptr_outputs = resultBuffer.as<float*>() + i * component.num_columns_in * component.num_rows_in;
- PwlApply32(&component, 0, component.num_rows_in - 1, 0, component.num_columns_in - 1);
+ PwlApply32(&component, 0, component.num_rows_in - 1, 0, component.num_columns_in - 1);
+ }
// 3. assign dequantized const blob to weightable layer
assignWeightsAndBiases(weightableLayer, dequantizedWeights, biases);
@@ -1944,6 +1927,97 @@ void MoveFakeQuantizeLayerIntoQuantParamsPass :: run() {
auto donotSkip = [](CNNLayerPtr) {
return false;
};
+
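+ // An FQ layer is allowed to fuse into quantization params only when it has consumers and either
+ // follows an activation/const layer or feeds activation layers exclusively.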
+ auto allowFQFuse = [](CNNLayerPtr layer) -> bool {
+ auto doNotSkip = [](CNNLayerPtr layer) {
+ return false;
+ };
+
+ if (CNNNetGetAllNextLayersSkipCertain(layer, -1, doNotSkip).empty()) {
+ return false;
+ }
+
+ auto skipNonFunctional = [](CNNLayerPtr layer) {
+ return LayerInfo(layer).isNonFunctional();
+ };
+
+ auto prevLayer = CNNNetPrevLayerSkipCertain(layer, 0, skipNonFunctional);
+ if (LayerInfo(prevLayer).isActivation() || LayerInfo(prevLayer).isConst()) {
+ return true;
+ }
+
+ auto nextLayers = CNNNetGetAllNextLayersSkipCertain(layer, -1, skipNonFunctional);
+ for (auto& l : nextLayers) {
+ if (!LayerInfo(l).isActivation()) {
+ return false;
+ }
+ }
+
+ return true;
+ };
+
+ std::function