[GNA] POT enabling (#4358)

* POT enabling for GNA

* POT enabling for GNA

* POT enabling for GNA - accuracy improvements
Bartosz Sochacki 2021-03-04 14:10:01 +01:00 committed by GitHub
parent ed8d3b72d3
commit 3daacb5e09
25 changed files with 1396 additions and 381 deletions

View File

@ -92,6 +92,13 @@ DECLARE_GNA_CONFIG_KEY(COMPACT_MODE);
*/ */
DECLARE_GNA_CONFIG_KEY(PWL_UNIFORM_DESIGN); DECLARE_GNA_CONFIG_KEY(PWL_UNIFORM_DESIGN);
/**
* @brief The option to specify the maximum error percent that the algorithm searching for the optimal
* piecewise-linear (PWL) approximation is allowed to introduce.
* By default (if no value is set), a value of 1.0 is used.
*/
DECLARE_GNA_CONFIG_KEY(PWL_MAX_ERROR_PERCENT);
/** /**
* @brief By default, the GNA plugin uses one worker thread for inference computations. * @brief By default, the GNA plugin uses one worker thread for inference computations.
* This parameter allows you to create up to 127 threads for software modes. * This parameter allows you to create up to 127 threads for software modes.
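The new key is consumed through the regular plugin configuration map, exactly as the speech sample does further down with FLAGS_pwl_me. A minimal sketch (not part of the commit), assuming a hypothetical model path and the GNA_SW_EXACT execution mode:

// Sketch only: pass the new key through a GNA plugin config map.
#include <map>
#include <string>
#include <inference_engine.hpp>
#include <gna/gna_config.hpp>

int main() {
    InferenceEngine::Core ie;
    auto network = ie.ReadNetwork("model.xml");                        // hypothetical model path
    std::map<std::string, std::string> gnaPluginConfig;
    gnaPluginConfig[GNA_CONFIG_KEY(DEVICE_MODE)] = "GNA_SW_EXACT";     // assumed execution mode
    gnaPluginConfig[GNA_CONFIG_KEY(PWL_MAX_ERROR_PERCENT)] = "0.025";  // tighter than the 1.0 default
    auto executableNetwork = ie.LoadNetwork(network, "GNA", gnaPluginConfig);
    return 0;
}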

View File

@ -519,6 +519,10 @@ bool ParseAndCheckCommandLine(int argc, char *argv[]) {
throw std::logic_error("Invalid value for 'cw_l' argument. It must be greater than or equal to 0"); throw std::logic_error("Invalid value for 'cw_l' argument. It must be greater than or equal to 0");
} }
if (FLAGS_pwl_me < 0.0 || FLAGS_pwl_me > 100.0) {
throw std::logic_error("Invalid value for 'pwl_me' argument. It must be greater than 0.0 and less than 100.0");
}
return true; return true;
} }
@ -671,6 +675,7 @@ int main(int argc, char *argv[]) {
gnaPluginConfig[GNAConfigParams::KEY_GNA_LIB_N_THREADS] = std::to_string((FLAGS_cw_r > 0 || FLAGS_cw_l > 0) ? 1 : FLAGS_nthreads); gnaPluginConfig[GNAConfigParams::KEY_GNA_LIB_N_THREADS] = std::to_string((FLAGS_cw_r > 0 || FLAGS_cw_l > 0) ? 1 : FLAGS_nthreads);
gnaPluginConfig[GNA_CONFIG_KEY(COMPACT_MODE)] = CONFIG_VALUE(NO); gnaPluginConfig[GNA_CONFIG_KEY(COMPACT_MODE)] = CONFIG_VALUE(NO);
gnaPluginConfig[GNA_CONFIG_KEY(PWL_MAX_ERROR_PERCENT)] = std::to_string(FLAGS_pwl_me);
// ----------------------------------------------------------------------------------------------------- // -----------------------------------------------------------------------------------------------------
// --------------------------- 5. Write model to file -------------------------------------------------- // --------------------------- 5. Write model to file --------------------------------------------------

View File

@ -91,6 +91,10 @@ static const char input_layer_names_message[] = "Optional. Layer names for input
"The names are separated with \",\" " \ "The names are separated with \",\" " \
"Example: Input1,Input2 "; "Example: Input1,Input2 ";
/// @brief message for PWL max error percent
static const char pwl_max_error_percent_message[] = "Optional. The maximum percent of error for the PWL function. " \
"The value must be in the [0, 100] range. The default value is 1.0.";
/// \brief Define flag for showing help message <br> /// \brief Define flag for showing help message <br>
DEFINE_bool(h, false, help_message); DEFINE_bool(h, false, help_message);
@ -161,6 +165,9 @@ DEFINE_string(oname, "", output_layer_names_message);
/// @brief Input layer name /// @brief Input layer name
DEFINE_string(iname, "", input_layer_names_message); DEFINE_string(iname, "", input_layer_names_message);
/// @brief PWL max error percent
DEFINE_double(pwl_me, 1.0, pwl_max_error_percent_message);
/** /**
* \brief This function show a help message * \brief This function show a help message
*/ */
@ -191,5 +198,6 @@ static void showUsage() {
std::cout << " -cw_r \"<integer>\" " << context_window_message_r << std::endl; std::cout << " -cw_r \"<integer>\" " << context_window_message_r << std::endl;
std::cout << " -oname \"<string>\" " << output_layer_names_message << std::endl; std::cout << " -oname \"<string>\" " << output_layer_names_message << std::endl;
std::cout << " -iname \"<string>\" " << input_layer_names_message << std::endl; std::cout << " -iname \"<string>\" " << input_layer_names_message << std::endl;
std::cout << " -pwl_me \"<double>\" " << pwl_max_error_percent_message << std::endl;
} }

View File

@ -1243,15 +1243,15 @@ void GNAPluginNS::backend::AMIntelDNN::WriteDnnText(const char *filename, intel_
break; break;
case kActFakeQuantize : case kActFakeQuantize :
out_file << "<fakeQuantize.levels> " << out_file << "<fakeQuantize.levels> " <<
std::dec << component[i].op.pwl.func_id.args.fakeQuantize.levels << "\n"; std::dec << component[i].op.pwl.func_id.fqParams.levels << "\n";
out_file << "<fakeQuantize.input_low> " << out_file << "<fakeQuantize.input_low> " <<
std::setprecision(12) << std::scientific << component[i].op.pwl.func_id.args.fakeQuantize.input_low << "\n"; std::setprecision(12) << std::scientific << component[i].op.pwl.func_id.fqParams.input_low << "\n";
out_file << "<fakeQuantize.input_high> " << out_file << "<fakeQuantize.input_high> " <<
std::setprecision(12) << std::scientific << component[i].op.pwl.func_id.args.fakeQuantize.input_high << "\n"; std::setprecision(12) << std::scientific << component[i].op.pwl.func_id.fqParams.input_high << "\n";
out_file << "<fakeQuantize.output_low> " << out_file << "<fakeQuantize.output_low> " <<
std::setprecision(12) << std::scientific << component[i].op.pwl.func_id.args.fakeQuantize.output_low << "\n"; std::setprecision(12) << std::scientific << component[i].op.pwl.func_id.fqParams.output_low << "\n";
out_file << "<fakeQuantize.output_high> " << out_file << "<fakeQuantize.output_high> " <<
std::setprecision(12) << std::scientific << component[i].op.pwl.func_id.args.fakeQuantize.output_high << "\n"; std::setprecision(12) << std::scientific << component[i].op.pwl.func_id.fqParams.output_high << "\n";
break; break;
default: default:
break; break;

View File

@ -34,9 +34,25 @@ enum DnnActivationType : uint8_t {
kActNumType kActNumType
}; };
struct FakeQuantizeParams {
int8_t set;
int32_t levels;
// if input is per-channel quantization - input pointers contains per-channel ranges
int8_t inputPerChannel;
float* input_low;
float* input_high;
// if output is per-channel quantization - output pointers contains per-channel ranges
int8_t outputPerChannel;
float* output_low;
float* output_high;
};
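The fqParams/srcFQParams ranges added here feed the standard FakeQuantize transfer function that this commit applies in fp32_to_precision_blob and in the runFakeQuantize callbacks. A standalone sketch of that function, assuming a single (non per-channel) range:

#include <algorithm>
#include <cmath>

// Reference FakeQuantize transfer function; mirrors the expression used later in this
// commit (fp32_to_precision_blob, runFakeQuantize). Single-range sketch, no per-channel handling.
static float fakeQuantize(float x, float input_low, float input_high,
                          float output_low, float output_high, int levels) {
    if (x <= std::min(input_low, input_high)) return output_low;
    if (x > std::max(input_low, input_high)) return output_high;
    return std::nearbyint((x - input_low) / (input_high - input_low) * (levels - 1)) /
           (levels - 1) * (output_high - output_low) + output_low;
}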
struct DnnActivation { struct DnnActivation {
// for prelu // for prelu
DnnActivationType type; DnnActivationType type;
FakeQuantizeParams fqParams;
FakeQuantizeParams srcFQParams;
union { union {
struct { struct {
float negative_slope; float negative_slope;
@ -50,17 +66,6 @@ struct DnnActivation {
float low; float low;
float high; float high;
} clamp; } clamp;
struct {
int32_t levels;
// if input is per-channel quantization - input pointers contains per-channel ranges
int8_t inputPerChannel;
float *input_low;
float *input_high;
// if output is per-channel quantization - output pointers contains per-channel ranges
int8_t outputPerChannel;
float *output_low;
float *output_high;
} fakeQuantize;
} args; } args;
operator DnnActivationType () const noexcept { operator DnnActivationType () const noexcept {
return type; return type;

View File

@ -34,15 +34,20 @@ void make_gna_pwl(const DnnActivation fun,
gna_pwl[0].xBase = static_cast<int32_t> (INT32_MIN & XBASEMASK); // zero out the 2 lsb gna_pwl[0].xBase = static_cast<int32_t> (INT32_MIN & XBASEMASK); // zero out the 2 lsb
if (fun == kActSigmoid) { if (fun == kActSigmoid) {
gnalog() << "=========================== Sigmoid Segments ===========================\n"; gnalog() << "=========================== Sigmoid Segments ===========================\n";
gna_pwl[0].yBase = gna_pwl[1].yBase = 0; auto minVal = fun.fqParams.set? FLOAT_TO_INT16(*fun.fqParams.input_low * out_scale): 0;
gna_pwl[0].yBase = gna_pwl[1].yBase = minVal;
gna_pwl[1].xBase = (static_cast<int32_t> (in_scale * (-pwl[0].b / pwl[0].m))) & XBASEMASK; gna_pwl[1].xBase = (static_cast<int32_t> (in_scale * (-pwl[0].b / pwl[0].m))) & XBASEMASK;
} else if (fun == kActTanh) { } else if (fun == kActTanh) {
gnalog() << "=========================== Tanh Segments ===========================\n"; gnalog() << "=========================== Tanh Segments ===========================\n";
gna_pwl[0].yBase = gna_pwl[1].yBase = static_cast<int16_t>(-1.0 * out_scale); auto minVal = fun.fqParams.set ? FLOAT_TO_INT16(*fun.fqParams.input_low * out_scale) :
static_cast<int16_t>(-1.0 * out_scale);
gna_pwl[0].yBase = gna_pwl[1].yBase = minVal;
gna_pwl[1].xBase = (static_cast<int32_t> (in_scale * (-1.0 - pwl[0].b) / pwl[0].m)) & XBASEMASK; gna_pwl[1].xBase = (static_cast<int32_t> (in_scale * (-1.0 - pwl[0].b) / pwl[0].m)) & XBASEMASK;
} else { } else {
gnalog() << "=========================== SoftSign Segments ===========================\n"; gnalog() << "=========================== SoftSign Segments ===========================\n";
gna_pwl[0].yBase = gna_pwl[1].yBase = static_cast<int16_t>(-1.0 * out_scale); auto minVal = fun.fqParams.set ? FLOAT_TO_INT16(*fun.fqParams.input_low * out_scale) :
static_cast<int16_t>(-1.0 * out_scale);
gna_pwl[0].yBase = gna_pwl[1].yBase = minVal;
gna_pwl[1].xBase = (static_cast<int32_t> (in_scale * (-1.0 - pwl[0].b) / pwl[0].m)) & XBASEMASK; gna_pwl[1].xBase = (static_cast<int32_t> (in_scale * (-1.0 - pwl[0].b) / pwl[0].m)) & XBASEMASK;
} }
gna_pwl[0].slope = 0; gna_pwl[0].slope = 0;
@ -74,9 +79,10 @@ void make_gna_pwl(const DnnActivation fun,
<< "\n"; << "\n";
} }
// insert extra segment for xvalues > u_bound // insert extra segment for xvalues > u_bound
auto maxVal = fun.fqParams.set ? *fun.fqParams.input_high : 1.0;
gna_pwl[n_segments - 1].xBase = gna_pwl[n_segments - 1].xBase =
((uint32_t) (in_scale * (1.0 - pwl[pwl_size - 2].b) / pwl[pwl_size - 2].m)) & XBASEMASK; ((uint32_t) (in_scale * (1.0 - pwl[pwl_size - 2].b) / pwl[pwl_size - 2].m)) & XBASEMASK;
gna_pwl[n_segments - 1].yBase = FLOAT_TO_INT16(1.0 * out_scale); gna_pwl[n_segments - 1].yBase = FLOAT_TO_INT16(maxVal * out_scale);
gna_pwl[n_segments - 1].slope = 0; gna_pwl[n_segments - 1].slope = 0;
gnalog() << (gna_pwl[n_segments - 1].xBase / in_scale) gnalog() << (gna_pwl[n_segments - 1].xBase / in_scale)
@ -223,9 +229,19 @@ void make_gna_pwl(const DnnActivation fun,
else else
gnalog() << "=========================== LeakyReLU Segments ======================\n"; gnalog() << "=========================== LeakyReLU Segments ======================\n";
int32_t x_lower = INT32_MIN; int32_t x_lower = INT32_MIN;
int32_t x_upper = INT32_MAX;
int16_t y_lower = INT16_MIN; int16_t y_lower = INT16_MIN;
int16_t y_upper = INT16_MAX;
if (fun.fqParams.set) {
x_lower = FLOAT_TO_INT32(*fun.fqParams.input_low * 1.25 * in_scale);
x_upper = FLOAT_TO_INT32(*fun.fqParams.input_high * 1.25 * in_scale);
y_lower = FLOAT_TO_INT16(*fun.fqParams.input_low * 1.25 * out_scale);
y_upper = FLOAT_TO_INT16(*fun.fqParams.input_high * 1.25 * out_scale);
} else {
if (x_lower < y_lower * in_scale / out_scale) x_lower = FLOAT_TO_INT32(y_lower * in_scale / out_scale); if (x_lower < y_lower * in_scale / out_scale) x_lower = FLOAT_TO_INT32(y_lower * in_scale / out_scale);
if (y_lower < x_lower * out_scale / in_scale) y_lower = FLOAT_TO_INT16(x_lower * out_scale / in_scale); if (y_lower < x_lower * out_scale / in_scale) y_lower = FLOAT_TO_INT16(x_lower * out_scale / in_scale);
}
gna_pwl[0].yBase = y_lower * fun.args.lrelu.negative_slope; gna_pwl[0].yBase = y_lower * fun.args.lrelu.negative_slope;
s = gna_slope(fun.args.lrelu.negative_slope, in_scale, out_scale); s = gna_slope(fun.args.lrelu.negative_slope, in_scale, out_scale);
gna_pwl[0].xBase = (x_lower & XBASEMASK) | s.slope_scale_index; // zero out the 2 lsb gna_pwl[0].xBase = (x_lower & XBASEMASK) | s.slope_scale_index; // zero out the 2 lsb
@ -244,6 +260,18 @@ void make_gna_pwl(const DnnActivation fun,
<< " " << 0.0 << " " << 0.0
<< " " << (gna_pwl[1].slope * in_scale) / (out_scale*s.slope_scale) << " " << (gna_pwl[1].slope * in_scale) / (out_scale*s.slope_scale)
<< "\n"; << "\n";
if (fun.fqParams.set) { // need a right segment
gna_pwl.push_back({
static_cast<int32_t>(x_upper & XBASEMASK), // zero out the 2 lsb
y_upper,
0 });
gnalog() << (x_upper & XBASEMASK) / in_scale
<< " " << gna_pwl[n_segments].yBase / out_scale
<< " " << 0
<< "\n";
}
break; break;
} }
case kActSign: { case kActSign: {
@ -281,11 +309,18 @@ void make_gna_pwl(const DnnActivation fun,
break; break;
} }
case kActIdentity: case kActIdentity:
case kActKaldiLstmClipping: { case kActKaldiLstmClipping:
case kActFakeQuantize: {
int32_t x_lower = INT32_MIN; int32_t x_lower = INT32_MIN;
int32_t x_upper = INT32_MAX; int32_t x_upper = INT32_MAX;
int16_t y_lower = INT16_MIN; int16_t y_lower = INT16_MIN;
int16_t y_upper = INT16_MAX; int16_t y_upper = INT16_MAX;
if (fun == kActFakeQuantize && fun.fqParams.set) {
x_lower = *fun.fqParams.input_low * in_scale;
x_upper = *fun.fqParams.input_high * in_scale;
y_lower = *fun.fqParams.input_low * out_scale;
y_upper = *fun.fqParams.input_high * out_scale;
}
auto n_segments = 2; auto n_segments = 2;
if (fun == kActKaldiLstmClipping) { if (fun == kActKaldiLstmClipping) {
gnalog() << "=========================== Clipping Segments ===========================\n"; gnalog() << "=========================== Clipping Segments ===========================\n";
@ -311,6 +346,8 @@ void make_gna_pwl(const DnnActivation fun,
if (x_upper > y_upper * in_scale / out_scale) x_upper = FLOAT_TO_INT32(y_upper * in_scale / out_scale); if (x_upper > y_upper * in_scale / out_scale) x_upper = FLOAT_TO_INT32(y_upper * in_scale / out_scale);
if (y_lower < x_lower * out_scale / in_scale) y_lower = FLOAT_TO_INT16(x_lower * out_scale / in_scale); if (y_lower < x_lower * out_scale / in_scale) y_lower = FLOAT_TO_INT16(x_lower * out_scale / in_scale);
if (y_upper > x_upper * out_scale / in_scale) y_upper = FLOAT_TO_INT16(x_upper * out_scale / in_scale); if (y_upper > x_upper * out_scale / in_scale) y_upper = FLOAT_TO_INT16(x_upper * out_scale / in_scale);
} else if (fun == kActFakeQuantize) {
gnalog() << "=========================== Fake Quantize Segments ===========================\n";
} }
gna_pwl.resize(n_segments); gna_pwl.resize(n_segments);
gna_pwl[0].xBase = INT32_MIN & XBASEMASK; // zero out the 2 lsb gna_pwl[0].xBase = INT32_MIN & XBASEMASK; // zero out the 2 lsb
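For the new kActFakeQuantize branch above, the PWL segment bounds follow directly from the FQ input range scaled by the layer scale factors. A worked instance with assumed values (not from the commit):

// Assumed: *fun.fqParams.input_low = -1.0f, *fun.fqParams.input_high = 1.0f,
// in_scale = 16384, out_scale = 2048
int32_t x_lower = static_cast<int32_t>(-1.0f * 16384);  // -16384
int32_t x_upper = static_cast<int32_t>( 1.0f * 16384);  //  16384
int16_t y_lower = static_cast<int16_t>(-1.0f * 2048);   //  -2048
int16_t y_upper = static_cast<int16_t>( 1.0f * 2048);   //   2048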

View File

@ -13,6 +13,7 @@ struct GNAFlags {
bool compact_mode = false; bool compact_mode = false;
bool exclusive_async_requests = false; bool exclusive_async_requests = false;
bool uniformPwlDesign = false; bool uniformPwlDesign = false;
float pwlMaxErrorPercent = 1.0f;
bool gna_openmp_multithreading = false; bool gna_openmp_multithreading = false;
bool sw_fp32 = false; bool sw_fp32 = false;
bool fake_quantized = false; bool fake_quantized = false;

View File

@ -95,6 +95,15 @@ struct QuantPair {
static B optional () { return B();} static B optional () { return B();}
}; };
struct FakeQuantizeParams {
bool paramsSet = false;
uint32_t levelsNum = 1;
float inputMinValue = 1.0f;
float inputMaxValue = 1.0f;
float outputMinValue = 1.0f;
float outputMaxValue = 1.0f;
};
/** /**
* @brief should allocated blob for specific data type, in case of src blob is nullptr * @brief should allocated blob for specific data type, in case of src blob is nullptr
* @tparam T * @tparam T
@ -170,14 +179,41 @@ class Quant<FakeQuantI8> {
template <typename T> template <typename T>
inline InferenceEngine::Blob::Ptr fp32_to_precision_blob(InferenceEngine::Blob::Ptr fp32_blob, InferenceEngine::Precision precision, float scale_factor) { inline InferenceEngine::Blob::Ptr fp32_to_precision_blob(InferenceEngine::Blob::Ptr fp32_blob, InferenceEngine::Precision precision,
float scale_factor, const FakeQuantizeParams& fqParams) {
auto prec_blob = InferenceEngine::make_shared_blob<T>({ precision, auto prec_blob = InferenceEngine::make_shared_blob<T>({ precision,
fp32_blob->getTensorDesc().getDims(), fp32_blob->getTensorDesc().getLayout() }); fp32_blob->getTensorDesc().getDims(), fp32_blob->getTensorDesc().getLayout() });
prec_blob->allocate(); prec_blob->allocate();
auto input_low = 0.0f;
auto input_high = 0.0f;
auto output_low = 0.0f;
auto output_high = 0.0f;
auto levels = 1;
if (fqParams.paramsSet) {
input_low = fqParams.inputMinValue;
input_high = fqParams.inputMaxValue;
output_low = fqParams.outputMinValue;
output_high = fqParams.outputMaxValue;
levels = fqParams.levelsNum;
}
int i = 0; int i = 0;
for (auto& precValue : *prec_blob) { for (auto& precValue : *prec_blob) {
auto f32Value = fp32_blob->buffer().template as<InferenceEngine::PrecisionTrait<InferenceEngine::Precision::FP32>::value_type*>()[i++] * scale_factor; auto f32Value = fp32_blob->buffer().template as<InferenceEngine::PrecisionTrait<InferenceEngine::Precision::FP32>::value_type*>()[i++];
if (fqParams.paramsSet) {
auto x = f32Value;
if (x <= std::min(input_low, input_high)) {
f32Value = output_low;
} else if (x > std::max(input_low, input_high)) {
f32Value = output_high;
} else {
f32Value = nearbyint((x - input_low) / (input_high - input_low) * (levels - 1)) /
(levels - 1) * (output_high - output_low) + output_low;
}
}
f32Value = f32Value * scale_factor;
if (f32Value > std::numeric_limits<T>::max()) { if (f32Value > std::numeric_limits<T>::max()) {
precValue = std::numeric_limits<T>::max(); precValue = std::numeric_limits<T>::max();
} else if (f32Value < std::numeric_limits<T>::min()) { } else if (f32Value < std::numeric_limits<T>::min()) {
@ -190,20 +226,21 @@ inline InferenceEngine::Blob::Ptr fp32_to_precision_blob(InferenceEngine::Blob::
return static_cast<InferenceEngine::Blob::Ptr>(prec_blob); return static_cast<InferenceEngine::Blob::Ptr>(prec_blob);
} }
inline InferenceEngine::Blob::Ptr fp32_to_precision_blob(InferenceEngine::Blob::Ptr fp32_blob, InferenceEngine::Precision precision, float scale_factor) { inline InferenceEngine::Blob::Ptr fp32_to_precision_blob(InferenceEngine::Blob::Ptr fp32_blob, InferenceEngine::Precision precision,
float scale_factor, const FakeQuantizeParams &fqParams) {
InferenceEngine::Blob::Ptr result_ptr = nullptr; InferenceEngine::Blob::Ptr result_ptr = nullptr;
switch (precision) { switch (precision) {
case InferenceEngine::Precision::FP32: case InferenceEngine::Precision::FP32:
result_ptr = fp32_to_precision_blob<float>(fp32_blob, precision, scale_factor); result_ptr = fp32_to_precision_blob<float>(fp32_blob, precision, scale_factor, fqParams);
break; break;
case InferenceEngine::Precision::I32: case InferenceEngine::Precision::I32:
result_ptr = fp32_to_precision_blob<int32_t>(fp32_blob, precision, scale_factor); result_ptr = fp32_to_precision_blob<int32_t>(fp32_blob, precision, scale_factor, fqParams);
break; break;
case InferenceEngine::Precision::I16: case InferenceEngine::Precision::I16:
result_ptr = fp32_to_precision_blob<int16_t>(fp32_blob, precision, scale_factor); result_ptr = fp32_to_precision_blob<int16_t>(fp32_blob, precision, scale_factor, fqParams);
break; break;
case InferenceEngine::Precision::I8: case InferenceEngine::Precision::I8:
result_ptr = fp32_to_precision_blob<int8_t>(fp32_blob, precision, scale_factor); result_ptr = fp32_to_precision_blob<int8_t>(fp32_blob, precision, scale_factor, fqParams);
break; break;
default: default:
THROW_GNA_EXCEPTION << "FP32 to " << precision << " not supported"; THROW_GNA_EXCEPTION << "FP32 to " << precision << " not supported";
@ -304,9 +341,11 @@ inline void quantizeWeightsBiases(const QuantDesc & quantDesc,
auto quantData = InferenceEngine::getInjectedData<QuantizedLayerParams>(*wl); auto quantData = InferenceEngine::getInjectedData<QuantizedLayerParams>(*wl);
{ {
auto per_channel_weights = !quantData->_weights_quant.GetMinValues().empty(); auto weightsStats = !quantData->_weights_quant.GetMinValues().empty();
auto weightsScale = quantData->_weights_quant.GetScale(); auto weightsScale = quantData->_weights_quant.GetScale();
auto dstScale = quantData->_dst_quant.GetScale(); auto dstScale = quantData->_dst_quant.GetScale();
auto blob_precision = wl->_weights->getTensorDesc().getPrecision();
auto quantizedWeights = blob_precision != InferenceEngine::Precision::FP32 && blob_precision != InferenceEngine::Precision::FP16;
fnc(wl->_weights->buffer().as<float*>(), fnc(wl->_weights->buffer().as<float*>(),
wl->_biases ? wl->_biases->buffer().as<float*>() : nullptr, wl->_biases ? wl->_biases->buffer().as<float*>() : nullptr,
intWeights->buffer(), intWeights->buffer(),
@ -318,12 +357,13 @@ inline void quantizeWeightsBiases(const QuantDesc & quantDesc,
num_columns, num_columns,
num_rows_padded, num_rows_padded,
num_columns_padded, num_columns_padded,
quantizedWeights,
quantData->_weights_quant.GetLevels(), quantData->_weights_quant.GetLevels(),
nullptr, quantData->_weights_quant.GetMinValues().size(),
nullptr, weightsStats ? &quantData->_weights_quant.GetMinValues(true).front() : nullptr,
per_channel_weights ? &quantData->_weights_quant.GetMinValues().front(): nullptr, weightsStats ? &quantData->_weights_quant.GetMaxValues(true).front() : nullptr,
per_channel_weights ? &quantData->_weights_quant.GetMaxValues().front(): nullptr, weightsStats ? &quantData->_weights_quant.GetMinValues(false).front() : nullptr,
&quantData->_weights_quantized); weightsStats ? &quantData->_weights_quant.GetMaxValues(false).front() : nullptr);
} }
wl->_weights = intWeights; wl->_weights = intWeights;
wl->_biases = intBiases; wl->_biases = intBiases;
@ -410,8 +450,11 @@ inline void quantizeWeightsBiasesConv(const QuantDesc & quantDesc,
auto quantData = InferenceEngine::getInjectedData<QuantizedLayerParams>(*conv); auto quantData = InferenceEngine::getInjectedData<QuantizedLayerParams>(*conv);
{ {
auto weightsStats = !quantData->_weights_quant.GetMinValues().empty();
auto weightsScale = quantData->_weights_quant.GetScale(); auto weightsScale = quantData->_weights_quant.GetScale();
auto dstScale = quantData->_dst_quant.GetScale(); auto dstScale = quantData->_dst_quant.GetScale();
auto blob_precision = conv->_weights->getTensorDesc().getPrecision();
auto quantizedWeights = blob_precision != InferenceEngine::Precision::FP32 && blob_precision != InferenceEngine::Precision::FP16;
fnc(conv->_weights->buffer().as<float*>(), fnc(conv->_weights->buffer().as<float*>(),
conv->_biases ? conv->_biases->buffer().as<float*>() : nullptr, conv->_biases ? conv->_biases->buffer().as<float*>() : nullptr,
intWeights->buffer(), intWeights->buffer(),
@ -422,7 +465,14 @@ inline void quantizeWeightsBiasesConv(const QuantDesc & quantDesc,
num_rows, num_rows,
num_columns, num_columns,
num_rows_padded, num_rows_padded,
num_columns_padded); num_columns_padded,
quantizedWeights,
quantData->_weights_quant.GetLevels(),
quantData->_weights_quant.GetMinValues().size(),
weightsStats ? &quantData->_weights_quant.GetMinValues(true).front() : nullptr,
weightsStats ? &quantData->_weights_quant.GetMaxValues(true).front() : nullptr,
weightsStats ? &quantData->_weights_quant.GetMinValues(false).front() : nullptr,
weightsStats ? &quantData->_weights_quant.GetMaxValues(false).front() : nullptr);
} }
conv->_weights = intWeights; conv->_weights = intWeights;
conv->_biases = intBiases; conv->_biases = intBiases;
@ -494,11 +544,22 @@ class DataQuantizer<Desc, InferenceEngine::CNNLayer *> : public DataQuantizerBas
if (initial_precision == InferenceEngine::Precision::FP16) { if (initial_precision == InferenceEngine::Precision::FP16) {
cnnLayer->blobs["custom"] = make_fp32_blob(cnnLayer->blobs["custom"]); cnnLayer->blobs["custom"] = make_fp32_blob(cnnLayer->blobs["custom"]);
} }
auto const_scale_factor = InferenceEngine::getInjectedData<QuantizedLayerParams>(*cnnLayer)->_dst_quant.GetScale(); auto quantParams = InferenceEngine::getInjectedData<QuantizedLayerParams>(*cnnLayer);
auto new_const_blob = InferenceEngine::Blob::CreateFromData(cnnLayer->outData[0]); auto new_const_blob = InferenceEngine::Blob::CreateFromData(cnnLayer->outData[0]);
auto const_blob = cnnLayer->blobs["custom"]; auto const_blob = cnnLayer->blobs["custom"];
if (const_blob->getTensorDesc().getPrecision() == InferenceEngine::Precision::FP32) { if (const_blob->getTensorDesc().getPrecision() == InferenceEngine::Precision::FP32) {
cnnLayer->blobs["custom"] = fp32_to_precision_blob(const_blob, cnnLayer->outData[0]->getPrecision(), const_scale_factor); auto fqParams = FakeQuantizeParams{};
if (quantParams->_dst_quant.IsStatsSet()) {
fqParams.paramsSet = true;
fqParams.levelsNum = quantParams->_dst_quant.GetLevels();
fqParams.inputMinValue = quantParams->_dst_quant.GetMinValues(true).front();
fqParams.inputMaxValue = quantParams->_dst_quant.GetMaxValues(true).front();
fqParams.outputMinValue = quantParams->_dst_quant.GetMinValues(false).front();
fqParams.outputMaxValue = quantParams->_dst_quant.GetMaxValues(false).front();
}
cnnLayer->blobs["custom"] = fp32_to_precision_blob(const_blob, cnnLayer->outData[0]->getPrecision(),
quantParams->_dst_quant.GetScale(), fqParams);
} }
} }

View File

@ -7,6 +7,7 @@
#include <vector> #include <vector>
#include <utility> #include <utility>
#include <string> #include <string>
#include <type_traits>
#include <legacy/layer_transform.hpp> #include <legacy/layer_transform.hpp>
#include "gna_graph_tools.hpp" #include "gna_graph_tools.hpp"
@ -77,7 +78,8 @@ class ModelQuantizer {
scaleIndex++; scaleIndex++;
} }
propagateScaleFactor(sortedNewNet, T::mandatory().getWeightsPrecision().size()); bool isFakeQuantize = std::is_same<T, FakeQuantI8>() || std::is_same<T, FakeQuantI16>();
propagateScaleFactor(sortedNewNet, T::mandatory().getWeightsPrecision().size(), isFakeQuantize);
// sorted order gives possibility for propagate quantisation along depended layers // sorted order gives possibility for propagate quantisation along depended layers
for (auto &&layer : sortedNewNet) { for (auto &&layer : sortedNewNet) {
@ -88,8 +90,8 @@ class ModelQuantizer {
} }
private : private :
void propagateScaleFactor(std::vector<InferenceEngine::CNNLayerPtr> & net, int weightsBytesSize) const { void propagateScaleFactor(std::vector<InferenceEngine::CNNLayerPtr> & net, int weightsBytesSize, bool fakeQuantize) const {
ScaleFactorCalculator sf(net, weightsBytesSize); ScaleFactorCalculator sf(net, weightsBytesSize, fakeQuantize);
while (!sf.allLayersProcessed()) { while (!sf.allLayersProcessed()) {
for (auto &&layer : sf.getStartLayers()) { for (auto &&layer : sf.getStartLayers()) {

View File

@ -9,6 +9,7 @@
#include <limits> #include <limits>
#include "backend/gna_types.h" #include "backend/gna_types.h"
#include "quantization.h" #include "quantization.h"
#include <algorithm>
#ifdef DEBUG #ifdef DEBUG
#define QUANTWARNING(...) (fprintf(stderr, __VA_ARGS__)) #define QUANTWARNING(...) (fprintf(stderr, __VA_ARGS__))
@ -19,26 +20,44 @@
template<> template<>
void QuantizationCallback<int16_t, int32_t>::runFakeQuantize() const { void QuantizationCallback<int16_t, int32_t>::runFakeQuantize() const {
if (quantizedWeights) {
THROW_GNA_EXCEPTION << "Quantized weights are not yet supported in int16 quantization mode";
}
uint32_t num_saturate = 0; uint32_t num_saturate = 0;
auto input_low = 0.0f;
auto input_high = 0.0f;
auto output_low = 0.0f;
auto output_high = 0.0f;
auto levels = 1;
if (fq_num_stats > 0) {
input_low = *fq_ptr_input_low;
input_high = *fq_ptr_input_high;
output_low = *fq_ptr_output_low;
output_high = *fq_ptr_output_high;
levels = fq_levels;
}
for (uint32_t row = 0; row < num_rows; row++) { for (uint32_t row = 0; row < num_rows; row++) {
for (uint32_t col = 0; col < num_columns; col++) { for (uint32_t col = 0; col < num_columns; col++) {
float rounding_value = (ptr_float_weights[row * num_columns + col] > 0) ? 0.5f : -0.5f; float rounding_value = (ptr_float_weights[row * num_columns + col] > 0) ? 0.5f : -0.5f;
float value = ptr_float_weights[row * num_columns + col]; float value = ptr_float_weights[row * num_columns + col];
if (!*ptr_quantized_weights) { if (fq_num_stats > 0) {
value = value * *ptr_weight_scale_factor + rounding_value; auto x = value;
if (x <= std::min(input_low, input_high)) {
value = output_low;
} else if (x > std::max(input_low, input_high)) {
value = output_high;
} else { } else {
value -= MAX_VAL_2B_WEIGHT; value = nearbyint((x - input_low) / (input_high - input_low) * (levels - 1)) /
(levels - 1) * (output_high - output_low) + output_low;
} }
}
value = value * *ptr_weight_scale_factor + rounding_value;
int16_t* ptr_weight_16 = ptr_int_weights + (row * num_columns_padded + col); int16_t* ptr_weight_16 = ptr_int_weights + (row * num_columns_padded + col);
if (*ptr_quantized_weights &&
(value > std::numeric_limits<int16_t>::max() ||
value < std::numeric_limits<int16_t>::min())) {
THROW_GNA_EXCEPTION << "unsupported weights range for I16 quantisation: " << value;
}
if (value > std::numeric_limits<int16_t>::max()) { if (value > std::numeric_limits<int16_t>::max()) {
*ptr_weight_16 = std::numeric_limits<int16_t>::max(); *ptr_weight_16 = std::numeric_limits<int16_t>::max();
num_saturate++; num_saturate++;
@ -91,37 +110,6 @@ void QuantizationCallback<int16_t, int32_t>::runFakeQuantize() const {
template<> template<>
void QuantizationCallback<int16_t, int32_t>::runQuantize() const { void QuantizationCallback<int16_t, int32_t>::runQuantize() const {
uint32_t num_saturate = 0; uint32_t num_saturate = 0;
if (*ptr_weight_scale_factor == 1.0) {
// scale factor for weights is not calculated yet
float mean_weight = 0.0;
float mean_weight_squared = 0.0;
float max_weight = -1e20f;
float var_weight;
float mean_plus_2stdev;
for (uint32_t i = 0; i < num_rows; i++) {
for (uint32_t j = 0; j < num_columns; j++) {
float weight = ptr_float_weights[i * num_columns + j];
mean_weight += weight;
mean_weight_squared += weight * weight;
if (fabs(weight) > max_weight) {
max_weight = fabs(weight);
}
}
}
mean_weight /= static_cast<float>(num_rows * num_columns);
mean_weight_squared /= static_cast<float>(num_rows * num_columns);
var_weight = mean_weight_squared - mean_weight * mean_weight;
mean_plus_2stdev = mean_weight + 2.0f * static_cast<float>(sqrtf(var_weight));
if (max_weight != 0.0f) {
*ptr_weight_scale_factor = static_cast<float>(MAX_VAL_2B_WEIGHT) / max_weight;
}
*ptr_output_scale_factor = input_scale_factor * *ptr_weight_scale_factor;
}
for (uint32_t row = 0; row < num_rows; row++) { for (uint32_t row = 0; row < num_rows; row++) {
for (uint32_t col = 0; col < num_columns; col++) { for (uint32_t col = 0; col < num_columns; col++) {
float rounding_value = (ptr_float_weights[row * num_columns + col] > 0) ? 0.5f : -0.5f; float rounding_value = (ptr_float_weights[row * num_columns + col] > 0) ? 0.5f : -0.5f;
@ -176,6 +164,24 @@ void QuantizationCallback<int16_t, int32_t>::runQuantize() const {
} }
} }
std::pair<float, float> FindMinMaxValues(void* ptr_float_memory, size_t num_elements) {
float* ptr_float_feat = reinterpret_cast<float*>(ptr_float_memory);
float min = num_elements ? ptr_float_feat[0] : 0.0;
float max = num_elements ? ptr_float_feat[0] : 0.0;
for (size_t i = 1; i < num_elements; i++) {
if (fabs(ptr_float_feat[i]) > max) {
max = fabs(ptr_float_feat[i]);
}
if (fabs(ptr_float_feat[i]) < min) {
min = fabs(ptr_float_feat[i]);
}
}
return { min, max };
}
float ScaleFactorForQuantization(void *ptr_float_memory, float target_max, size_t num_elements) { float ScaleFactorForQuantization(void *ptr_float_memory, float target_max, size_t num_elements) {
float *ptr_float_feat = reinterpret_cast<float *>(ptr_float_memory); float *ptr_float_feat = reinterpret_cast<float *>(ptr_float_memory);
float max = 0.0; float max = 0.0;
@ -224,17 +230,37 @@ template<>
void QuantizationCallback<int8_t, gna_compound_bias_t>::runFakeQuantize() const { void QuantizationCallback<int8_t, gna_compound_bias_t>::runFakeQuantize() const {
uint32_t num_saturate = 0; uint32_t num_saturate = 0;
if (fq_ptr_output_high == nullptr || fq_ptr_output_low == nullptr) { auto input_low = 0.0f;
THROW_GNA_EXCEPTION << "Fake quantized output range not set"; auto input_high = 0.0f;
auto output_low = 0.0f;
auto output_high = 0.0f;
auto levels = 1;
float valueAcc = 0.0;
for (uint32_t i = 0; i < num_rows; i++) {
uint32_t channel_multiplier = 1;
if (fq_num_stats > 0) {
auto idx = fq_num_stats == 1 ? 0 : i;
input_low = fq_ptr_input_low[idx];
input_high = fq_ptr_input_high[idx];
output_low = fq_ptr_output_low[idx];
output_high = fq_ptr_output_high[idx];
levels = fq_levels;
channel_multiplier = ((input_high - input_low) * *ptr_weight_scale_factor) / (levels - 1);
} else {
float scaled_row_max = 0;
for (uint32_t col = 0; col < num_columns; col++) {
float value = ptr_float_weights[i * num_columns + col] * *ptr_weight_scale_factor;
valueAcc += value;
if (fabs(value) > scaled_row_max) {
scaled_row_max = fabs(value);
} }
if (fq_levels == 0 || fq_levels == 1) {
THROW_GNA_EXCEPTION << "Fake quantized levels not set";
} }
for (uint32_t i = 0; i < num_rows; i++) { channel_multiplier = scaled_row_max / static_cast<float>(MAX_VAL_1B_WEIGHT);
uint32_t channel_multiplier = ((fq_ptr_output_high[i] - fq_ptr_output_low[i]) * }
*ptr_weight_scale_factor) / (fq_levels - 1) + 0.5f;
ptr_int_biases[i].multiplier = static_cast<uint8_t> (channel_multiplier); ptr_int_biases[i].multiplier = static_cast<uint8_t> (channel_multiplier + 0.5f);
if (channel_multiplier > MAX_OUT_MULTIPLIER) { if (channel_multiplier > MAX_OUT_MULTIPLIER) {
THROW_GNA_EXCEPTION << "invalid channel multiplier: " << channel_multiplier; THROW_GNA_EXCEPTION << "invalid channel multiplier: " << channel_multiplier;
} }
@ -243,19 +269,25 @@ void QuantizationCallback<int8_t, gna_compound_bias_t>::runFakeQuantize() const
auto offset = i * num_columns + j; auto offset = i * num_columns + j;
auto rounding_value = (ptr_float_weights[i * num_columns + j] > 0) ? 0.5f : -0.5f; auto rounding_value = (ptr_float_weights[i * num_columns + j] > 0) ? 0.5f : -0.5f;
float value = ptr_float_weights[offset]; float value = ptr_float_weights[offset];
if (!*ptr_quantized_weights) { if (!quantizedWeights) {
if (fq_num_stats > 0) {
auto x = value;
if (x <= std::min(input_low, input_high)) {
value = output_low;
} else if (x > std::max(input_low, input_high)) {
value = output_high;
} else {
value = nearbyint((x - input_low) / (input_high - input_low) * (levels - 1)) /
(levels - 1) * (output_high - output_low) + output_low;
}
}
value = value * (*ptr_weight_scale_factor / ptr_int_biases[i].multiplier) + rounding_value; value = value * (*ptr_weight_scale_factor / ptr_int_biases[i].multiplier) + rounding_value;
} else { } else {
value -= MAX_VAL_1B_WEIGHT; value -= MAX_VAL_1B_WEIGHT;
} }
auto normalizedWeight = static_cast<int32_t>(value); auto normalizedWeight = static_cast<int32_t>(value);
if (*ptr_quantized_weights &&
(value > std::numeric_limits<int8_t>::max() ||
value < std::numeric_limits<int8_t>::min())) {
THROW_GNA_EXCEPTION << "unsupported weights range for I8 quantization: " << value;
}
if (value > std::numeric_limits<int8_t>::max()) { if (value > std::numeric_limits<int8_t>::max()) {
normalizedWeight = std::numeric_limits<int8_t>::max(); normalizedWeight = std::numeric_limits<int8_t>::max();
num_saturate++; num_saturate++;
@ -309,40 +341,6 @@ void QuantizationCallback<int8_t, gna_compound_bias_t>::runQuantize() const {
} }
uint32_t num_saturate = 0; uint32_t num_saturate = 0;
if (*ptr_weight_scale_factor == 1.0) {
// scale factor for weights is not calculated yet
float mean_weight = 0.0;
float mean_weight_squared = 0.0;
float max_weight = -1e20f;
float var_weight;
float mean_plus_2stdev;
for (uint32_t i = 0; i < num_rows; i++) {
for (uint32_t j = 0; j < num_columns; j++) {
float weight = ptr_float_weights[i*num_columns + j];
mean_weight += weight;
mean_weight_squared += weight * weight;
if (fabs(weight) > max_weight) {
max_weight = fabs(weight);
}
}
}
mean_weight /= static_cast<float>(num_rows * num_columns);
mean_weight_squared /= static_cast<float>(num_rows * num_columns);
var_weight = mean_weight_squared - mean_weight * mean_weight;
mean_plus_2stdev = mean_weight + 2.0f * static_cast<float>(sqrtf(var_weight));
*ptr_weight_scale_factor = static_cast<float>(MAX_VAL_1B_WEIGHT) / max_weight;
// For 8 bit weights quantize as follows:
// 1. adjust scale factor to increase dynamic range of entire matrix by max multiplier
// 2. find maximum scaled weight for each row
// 3. find multiplier such that dividing by the multiplier brings row back within 8-bit dynamic range
// 4. quantize and store scaled row
*ptr_weight_scale_factor = MAX_OUT_MULTIPLIER * *ptr_weight_scale_factor; // increase dynamic range by max multiplier
*ptr_output_scale_factor = input_scale_factor * *ptr_weight_scale_factor;
}
float valueAcc = 0.0; float valueAcc = 0.0;
for (uint32_t row = 0; row < num_rows; row++) { for (uint32_t row = 0; row < num_rows; row++) {
float scaled_row_max = 0; float scaled_row_max = 0;

View File

@ -31,12 +31,13 @@ struct QuantizationCallback {
uint32_t num_rows_padded; uint32_t num_rows_padded;
uint32_t num_columns_padded; uint32_t num_columns_padded;
bool quantizedWeights;
int32_t fq_levels; int32_t fq_levels;
const size_t fq_num_stats;
const float *fq_ptr_input_low; const float *fq_ptr_input_low;
const float *fq_ptr_input_high; const float *fq_ptr_input_high;
const float* fq_ptr_output_low; const float* fq_ptr_output_low;
const float* fq_ptr_output_high; const float* fq_ptr_output_high;
const bool* ptr_quantized_weights;
void runQuantize() const; void runQuantize() const;
void runFakeQuantize() const; void runFakeQuantize() const;
@ -45,5 +46,6 @@ struct QuantizationCallback {
template class QuantizationCallback<int16_t, int32_t>; template class QuantizationCallback<int16_t, int32_t>;
template class QuantizationCallback<int8_t, gna_compound_bias_t>; template class QuantizationCallback<int8_t, gna_compound_bias_t>;
std::pair<float, float> FindMinMaxValues(void* ptr_float_memory, size_t num_elements);
float ScaleFactorForQuantization(void *ptr_float_memory, float target_max, size_t num_elements); float ScaleFactorForQuantization(void *ptr_float_memory, float target_max, size_t num_elements);
void QuantizeVector16(float *ptr_float_memory, int16_t *ptr_int_memory, uint32_t num_elements, float scale_factor); void QuantizeVector16(float *ptr_float_memory, int16_t *ptr_int_memory, uint32_t num_elements, float scale_factor);

View File

@ -24,27 +24,57 @@ public:
int32_t GetLevels() const { int32_t GetLevels() const {
return levels; return levels;
} }
void SetMinValues(const std::vector<float> &min) { bool IsStatsSet() const {
min_values.clear(); return !input_min_values.empty() && !input_max_values.empty();
min_values.insert(min_values.end(), min.begin(), min.end());
} }
const std::vector<float>& GetMinValues() const { void SetMinValues(const std::vector<float> &min, bool input = true) {
return min_values; if (input) {
input_min_values.clear();
input_min_values.insert(input_min_values.end(), min.begin(), min.end());
} else {
output_min_values.clear();
output_min_values.insert(output_min_values.end(), min.begin(), min.end());
} }
void SetMaxValues(const std::vector<float>& max) {
max_values.clear();
max_values.insert(max_values.end(), max.begin(), max.end());
} }
const std::vector<float>& GetMaxValues() const { std::vector<float>& GetMinValues(bool input = true) {
return max_values; if (input) {
return input_min_values;
}
return output_min_values;
}
void SetMaxValues(const std::vector<float>& max, bool input = true) {
if (input) {
input_max_values.clear();
input_max_values.insert(input_max_values.end(), max.begin(), max.end());
} else {
output_max_values.clear();
output_max_values.insert(output_max_values.end(), max.begin(), max.end());
}
}
std::vector<float>& GetMaxValues(bool input = true) {
if (input) {
return input_max_values;
}
return output_max_values;
}
void CopyStats(Quantization &src) {
levels = src.GetLevels();
SetMinValues(src.GetMinValues(true), true);
SetMaxValues(src.GetMaxValues(true), true);
SetMinValues(src.GetMinValues(false), false);
SetMaxValues(src.GetMaxValues(false), false);
} }
private: private:
float scale = 1.0f; float scale = 1.0f;
bool scale_set = false; bool scale_set = false;
int32_t levels = 0; int32_t levels = 0;
std::vector<float> min_values; std::vector<float> input_min_values;
std::vector<float> max_values; std::vector<float> input_max_values;
std::vector<float> output_min_values;
std::vector<float> output_max_values;
}; };
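A short usage sketch (not part of the commit) of the extended statistics API above: FQ ranges are now kept separately for input and output and can be propagated between layers with CopyStats. The levels value is assumed to come from the FakeQuantize layer attributes.

// Sketch only; relies on the Quantization class declared above.
static void propagateStatsSketch(Quantization &src, Quantization &dst) {
    const int levels = 256;                          // assumed FakeQuantize 'levels' attribute
    src.SetMinValues({-1.0f}, /*input=*/true);
    src.SetMaxValues({ 1.0f}, /*input=*/true);
    src.SetMinValues({-1.0f}, /*input=*/false);
    src.SetMaxValues({ 1.0f}, /*input=*/false);
    if (src.IsStatsSet()) {
        dst.CopyStats(src);                          // copies levels plus all four ranges
        auto scale = (levels - 1) /
                     (dst.GetMaxValues(false).front() - dst.GetMinValues(false).front());
        dst.SetScale(scale);                         // same (levels-1)/(max-min) formula used below
    }
}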
struct QuantizedLayerParams { struct QuantizedLayerParams {
@ -53,7 +83,6 @@ struct QuantizedLayerParams {
// deprecate this // deprecate this
Quantization _weights_quant; Quantization _weights_quant;
bool _weights_quantized = false;
Quantization _bias_quant; Quantization _bias_quant;
float _o_shift = 0.0f; float _o_shift = 0.0f;
float _b_shift = 0.0f; float _b_shift = 0.0f;

View File

@ -16,9 +16,13 @@
#include "layers/gna_layer_info.hpp" #include "layers/gna_layer_info.hpp"
#include "gna_plugin_log.hpp" #include "gna_plugin_log.hpp"
#include "gna_slope_scale.h" #include "gna_slope_scale.h"
#include "runtime/pwl.h"
namespace GNAPluginNS { namespace GNAPluginNS {
namespace frontend { namespace frontend {
static const float MIN_SEARCH_WEIGHTS_VAL = 1.0f;
static const float MAX_SEARCH_WEIGHTS_VAL = 1024.0f;
struct ScaleFactorUpdateResult { struct ScaleFactorUpdateResult {
InferenceEngine::CNNLayer *restartLayer = nullptr; InferenceEngine::CNNLayer *restartLayer = nullptr;
ScaleFactorUpdateResult() = default; ScaleFactorUpdateResult() = default;
@ -29,6 +33,146 @@ struct ScaleFactorUpdateResult {
} }
}; };
/**
* @brief Compares two float values and returns whether they are approximately equal
* @param p1 First float value
* @param p2 Second float value
* @return True if the two values are considered equal
*/
static bool fp32eq(float p1, float p2) {
return (std::abs(p1 - p2) <= 0.00001f * std::min(std::abs(p1), std::abs(p2)));
}
/**
* @brief Calculates PWL slopes for specified function in a given input range
* @param info Layer information
* @return Array of slopes for a function
*/
static std::vector<double> getPWLSlopes(const LayerInfo& info) {
if (info.isIdentity() || info.isFakeQuantize() || info.isRelu() || info.isClamp() || info.isAbs()) {
return { 1.0f };
}
return {};
}
/**
* @brief Finds the output activation scale factor that yields the most precise PWL slope
* @param inScale Input activation layer scale factor
* @param outScales Array of output activation scale factors
* @param slopes Array of slopes for a given function
* @return Best output activation scale factor
*/
static float selectBestOutputScaleFactors(float inScale, std::vector<float> outScales, const std::vector<double>& slopes) {
std::vector<float> scaleErrors;
for (size_t i = 0; i < outScales.size(); ++i) {
auto outScale = outScales[i];
auto sd = 0.0;
for (size_t j = 0; j < slopes.size(); ++j) {
auto s = gna_slope(slopes[j], inScale, outScale);
auto slope = static_cast<uint32_t>(s.slope * s.slope_scale);
if (slope < std::numeric_limits<int16_t>::min() && slope > std::numeric_limits<int16_t>::max()) {
sd += std::numeric_limits<int8_t>::max();
continue;
}
auto testSlope = static_cast<double>(slope) / s.slope_scale * inScale / outScale;
if (fp32eq(testSlope, slopes[j])) {
return outScale;
}
sd += pow(testSlope - slopes[j], 2.0);
}
sd /= slopes.size();
sd = sqrtf(sd);
scaleErrors.push_back(sd);
}
size_t minIndex = 0;
auto minError = scaleErrors[0];
for (size_t i = 1; i < scaleErrors.size(); ++i) {
if (scaleErrors[i] < minError) {
minError = scaleErrors[i];
minIndex = i;
}
}
return outScales[minIndex];
}
/**
* @brief Finds the weights scale factor that yields the most precise PWL slope
* @param inScale Input weightable layer scale factor
* @param outScale Output activation scale factor
* @param weightsScales Array of weights scale factors to check
* @param slopes Array of slopes for a given function
* @return Best weights scale factor
*/
static float selectBestWeightsScaleFactors(float inScale, float outScale, std::vector<float> weightsScales,
const std::vector<double>& slopes) {
std::vector<float> scaleErrors;
for (size_t i = 0; i < weightsScales.size(); ++i) {
auto weightScale = weightsScales[i];
auto sd = 0.0;
for (size_t j = 0; j < slopes.size(); ++j) {
auto s = gna_slope(slopes[j], inScale * weightScale, outScale);
auto slope = static_cast<uint32_t>(s.slope * s.slope_scale);
if (slope < std::numeric_limits<int16_t>::min() && slope > std::numeric_limits<int16_t>::max()) {
sd += std::numeric_limits<int8_t>::max();
continue;
}
auto testSlope = static_cast<double>(slope) / s.slope_scale * (inScale * weightScale) / outScale;
if (fp32eq(testSlope, slopes[j])) {
return outScale;
}
sd += pow(testSlope - slopes[j], 2.0);
}
sd /= slopes.size();
sd = sqrtf(sd);
scaleErrors.push_back(sd);
}
size_t minIndex = 0;
auto minError = scaleErrors[0];
for (size_t i = 1; i < scaleErrors.size(); ++i) {
if (scaleErrors[i] < minError) {
minError = scaleErrors[i];
minIndex = i;
}
}
return weightsScales[minIndex];
}
/**
* @brief Generates the specified number of scale factors in a given range.
* @param startRange First scale factor
* @param endRange Last scale factor
* @param numScaleFactors Number of scale factors to generate
* @return Array of scale factors
*/
static std::vector<float> generateScaleFactors(float startRange, float endRange, size_t numScaleFactors) {
if (!numScaleFactors) {
return { startRange, endRange };
}
auto scaleFactors = std::vector<float>{};
auto domain = endRange - startRange;
auto step = domain / numScaleFactors;
for (size_t i = 0; i <= numScaleFactors; ++i) {
auto scale = startRange + step * i;
if (!std::isnan(scale)) {
scaleFactors.push_back(scale);
}
}
return scaleFactors;
}
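Together, these helpers implement the search used later in getActivationScale: generate candidate scale factors slightly below the initial one, then keep the candidate whose quantized PWL slope is closest to the ideal slope. A compact sketch with assumed values (not from the commit):

// Mirrors the identity/ReLU/clamp/FQ adjustment performed in getActivationScale below.
static float sketchAdjustOutputScale(float initialScale /*e.g. 2049.0f*/, float inScale /*e.g. 512.0f*/) {
    auto start = initialScale > 1.0f ? static_cast<int32_t>(initialScale) : initialScale;
    auto end = start - start / 10;                               // search ~10% below the start
    end = end > 1.0f ? static_cast<int32_t>(end) : end;
    auto candidates = generateScaleFactors(start, end, static_cast<int32_t>(start - end) * 10);
    // getPWLSlopes returns {1.0} for identity, FakeQuantize, ReLU, clamp and abs
    return selectBestOutputScaleFactors(inScale, candidates, {1.0});
}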
/** /**
* @brief calculates output scale factor per layer * @brief calculates output scale factor per layer
* @tparam T * @tparam T
@ -44,7 +188,7 @@ class ScaleFactorPerLayer {
* @param result * @param result
* @return * @return
*/ */
bool operator()(T cnnLayer, int weightsSize, ScaleFactorUpdateResult &result) { bool operator()(T cnnLayer, int weightsSize, ScaleFactorUpdateResult &result, const bool fakeQuantize) {
return false; return false;
} }
}; };
@ -54,17 +198,15 @@ class ScaleFactorPerLayer<InferenceEngine::CNNLayer *> {
private : private :
const float activation_scale_factor = 2048.f; const float activation_scale_factor = 2048.f;
const float identity_scale_factor = 2049.0f; const float identity_scale_factor = 2049.0f;
const float max_activation_scale_factor = 4096.0f;
const float k = 5; const float k = 5;
const float k_identity = 6; const float k_identity = 6;
const double pow_domain = 16; const double pow_domain = 16;
protected : protected :
static bool fp32eq(float p1, float p2) {
return (std::abs(p1 - p2) <= 0.00001f * std::min(std::abs(p1), std::abs(p2)));
}
float getActivationScale(InferenceEngine::CNNLayer const* cnnLayer, float getActivationScale(InferenceEngine::CNNLayer const* cnnLayer,
GNAPluginNS::LayerInfo const& layer) { GNAPluginNS::LayerInfo const& layer,
const bool fakeQuantize) {
auto quantizedParams = InferenceEngine::getInjectedData<QuantizedLayerParams>(*cnnLayer); auto quantizedParams = InferenceEngine::getInjectedData<QuantizedLayerParams>(*cnnLayer);
// todo: calculate proper scale factor where we need to expand it a bit to be safe to stay in int16 weights // todo: calculate proper scale factor where we need to expand it a bit to be safe to stay in int16 weights
@ -136,18 +278,140 @@ class ScaleFactorPerLayer<InferenceEngine::CNNLayer *> {
} }
} }
if (!quantizedParams->_dst_quant.GetMaxValues().empty()) { // Identity layer is inserted by GNA passes and requires statistics to correctly set output
auto min_value = quantizedParams->_dst_quant.GetMinValues().front(); // scale factor. POT does not produce any statistics for this layer as it does not exist
auto max_value = quantizedParams->_dst_quant.GetMaxValues().front(); // in the source IR.
auto newScaleFactor = (quantizedParams->_dst_quant.GetLevels() - 1) / (max_value - min_value); if (fakeQuantize && !quantizedParams->_dst_quant.IsScaleSet() && layer.isIdentity()) {
result = newScaleFactor < result ? newScaleFactor : result; auto prevLayer = CNNNetPrevLayer(cnnLayer);
while (prevLayer != nullptr) {
auto prevQuantParams = InferenceEngine::getInjectedData<QuantizedLayerParams>(*prevLayer);
if (prevQuantParams->_dst_quant.IsStatsSet()) {
quantizedParams->_dst_quant.CopyStats(prevQuantParams->_dst_quant);
quantizedParams->_src_quant.CopyStats(prevQuantParams->_dst_quant);
break;
}
// Take the input statistics only if layer does not modify input values.
if (prevQuantParams->_src_quant.IsStatsSet() &&
(LayerInfo(prevLayer).isNonFunctional() || LayerInfo(prevLayer).isMemory() ||
LayerInfo(prevLayer).isConst() || LayerInfo(prevLayer).isInput())) {
quantizedParams->_dst_quant.CopyStats(prevQuantParams->_src_quant);
quantizedParams->_src_quant.CopyStats(prevQuantParams->_src_quant);
break;
}
// Stop searching for statistics if previous layer does not modify input values.
if ((LayerInfo(prevLayer).isWeightable() && !LayerInfo(prevLayer).isWeightableIdentity())
|| LayerInfo(prevLayer).isEltwise() || LayerInfo(prevLayer).isActivation()) {
break;
}
if (!CNNNetHasPrevLayer(prevLayer.get())) {
break;
}
prevLayer = CNNNetPrevLayer(prevLayer);
}
// If did not find statistics by searching previous layers, check if a next layer has
// statistics set.
if (!quantizedParams->_dst_quant.IsStatsSet()) {
auto donotSkip = [](InferenceEngine::CNNLayerPtr) {
return false;
};
auto nextLayers = CNNNetGetAllNextLayersSkipCertain(cnnLayer, -1, donotSkip);
for (auto &l : nextLayers) {
auto nextQuantParams = InferenceEngine::getInjectedData<QuantizedLayerParams>(*l);
if (nextQuantParams->_src_quant.IsStatsSet()) {
quantizedParams->_dst_quant.CopyStats(nextQuantParams->_src_quant);
quantizedParams->_src_quant.CopyStats(nextQuantParams->_src_quant);
break;
}
// Take output statistics only if a next layer does not modify input values
if (nextQuantParams->_dst_quant.IsStatsSet() &&
(LayerInfo(l).isNonFunctional() || LayerInfo(l).isMemory())) {
quantizedParams->_dst_quant.CopyStats(nextQuantParams->_dst_quant);
quantizedParams->_src_quant.CopyStats(nextQuantParams->_dst_quant);
break;
}
}
}
}
// Adjust output scale factor based on statistics (if present) in the following steps:
// 1. calculate scale factor based on output min and max values
// 2. (temporary W/A) clamp scale factor to maximum activation scale factor
// 3. search previous layers if there was already scale factor set
// 4. adjust output scale factor to get the most precise PWL slope
if (quantizedParams->_dst_quant.IsStatsSet()) {
auto minOutValue = quantizedParams->_dst_quant.GetMinValues().front();
auto maxOutValue = quantizedParams->_dst_quant.GetMaxValues().front();
auto absMax = std::max(std::abs(minOutValue), std::abs(maxOutValue));
auto absMin = std::min(std::abs(minOutValue), std::abs(maxOutValue));
result = (quantizedParams->_dst_quant.GetLevels() - 1) / (maxOutValue - minOutValue);
if (0 && fp32eq(absMin, 0.0f) && !fp32eq(absMax, 0.0f)) {
result = (quantizedParams->_dst_quant.GetLevels() - 1) / (2 * absMax);
}
//
//result = MAX_VAL_2B_FEAT / absMax;
if (std::isinf(result) || fp32eq(absMax, 0.0f)) {
result = max_activation_scale_factor;
}
// TODO: remove clamping maximum scale factor
result = result > max_activation_scale_factor ? max_activation_scale_factor : result;
if (!layer.isIdentity() && !layer.isFakeQuantize() && !layer.isRelu() && !layer.isClamp()) {
result = result > activation_scale_factor ? activation_scale_factor : result;
}
// Take input scale factor from previous layer if previous layer does not modify
// input values
bool usePrevScaleFactor = false;
auto skipNonFunctional = [](InferenceEngine::CNNLayerPtr l) {
return LayerInfo(l).isNonFunctional();
};
auto prevLayer = CNNNetPrevLayerSkipCertain(cnnLayer, 0, skipNonFunctional);
auto prevLayer2 = prevLayer != nullptr? CNNNetPrevLayerSkipCertain(prevLayer, 0, skipNonFunctional): nullptr;
if (prevLayer != nullptr &&
(layer.isIdentity() || layer.isFakeQuantize()) && LayerInfo(prevLayer).isWeightableIdentity()) {
auto prevLayerQuant = InferenceEngine::getInjectedData<QuantizedLayerParams>(*prevLayer);
if (!fp32eq(prevLayerQuant->_src_quant.GetScale(), 1.0f) &&
(prevLayer2 == nullptr || LayerInfo(prevLayer2).has16BOutput())) {
result = prevLayerQuant->_src_quant.GetScale();
usePrevScaleFactor = true;
}
}
// Adjust output scale factor to get the most precise PWL slope.
// NOTE: Currently it is only implemented for identity, clamp, relu and FQ layers.
// For all other layers, it does not improve accuracy.
auto slopes = getPWLSlopes(layer);
if (!slopes.empty() && !usePrevScaleFactor) {
auto div = 10;
auto mul = 10;
auto startRange = result > 1.0f ? static_cast<int32_t>(result) : result;
auto endRange = startRange - startRange / div;
endRange = endRange > 1.0f ? static_cast<int32_t>(endRange) : endRange;
auto scaleFactors = generateScaleFactors(startRange, endRange, static_cast<int32_t>(startRange - endRange) * mul);
auto newScaleFactor = selectBestOutputScaleFactors(quantizedParams->_src_quant.GetScale(), scaleFactors, slopes);
if (!fp32eq(result, newScaleFactor) &&
!fp32eq(newScaleFactor, 1.0f) && !fp32eq(newScaleFactor, 0.0f) && !std::isinf(newScaleFactor)) {
gnalog() << "[INFO] Adjusting scale factor for " << cnnLayer->name
<< " from: " << result << " to: " << newScaleFactor << "\n";
result = newScaleFactor;
}
}
} }
return result; return result;
} }
public : public :
bool operator()(InferenceEngine::CNNLayer *cnnLayer, int weightsSize, ScaleFactorUpdateResult &result) { bool operator()(InferenceEngine::CNNLayer *cnnLayer, int weightsSize, ScaleFactorUpdateResult &result, const bool fakeQuantize) {
if ( !cnnLayer ) { if ( !cnnLayer ) {
THROW_IE_EXCEPTION << "Incorrect Convolutional Layer pointer \n"; THROW_IE_EXCEPTION << "Incorrect Convolutional Layer pointer \n";
} }
@ -156,7 +420,11 @@ class ScaleFactorPerLayer<InferenceEngine::CNNLayer *> {
auto quant = InferenceEngine::getInjectedData<QuantizedLayerParams>(*cnnLayer); auto quant = InferenceEngine::getInjectedData<QuantizedLayerParams>(*cnnLayer);
if (InferenceEngine::details::CaselessEq<std::string>()(cnnLayer->type, "Memory")) { if (InferenceEngine::details::CaselessEq<std::string>()(cnnLayer->type, "Memory")) {
if (!CNNNetHasPrevLayer(cnnLayer) && quant->_dst_quant.IsScaleSet()) { if (CNNNetHasPrevLayer(cnnLayer) && quant->_dst_quant.IsStatsSet() && !quant->_dst_quant.IsScaleSet()) {
auto minOutValue = quant->_dst_quant.GetMinValues().front();
auto maxOutValue = quant->_dst_quant.GetMaxValues().front();
auto scale = (quant->_dst_quant.GetLevels() - 1) / (maxOutValue - minOutValue);
quant->_dst_quant.SetScale(scale);
quant->_src_quant = quant->_dst_quant; quant->_src_quant = quant->_dst_quant;
} }
@ -180,7 +448,9 @@ class ScaleFactorPerLayer<InferenceEngine::CNNLayer *> {
return true; return true;
} }
if (quantSibling->_dst_quant.IsScaleSet()) { if ((!fakeQuantize && quantSibling->_dst_quant.IsScaleSet()) ||
(fakeQuantize && quantSibling->_dst_quant.IsScaleSet() && !fp32eq(quantSibling->_dst_quant.GetScale(), 1.0) &&
quantSibling->_dst_quant.GetScale() < inputQuant->_dst_quant.GetScale())) {
// means we already restarted propagation input memory layer // means we already restarted propagation input memory layer
// need to search for requantiseable layer prior memory output layer // need to search for requantiseable layer prior memory output layer
InferenceEngine::CNNLayerPtr restartedLayer; InferenceEngine::CNNLayerPtr restartedLayer;
@ -230,7 +500,8 @@ class ScaleFactorPerLayer<InferenceEngine::CNNLayer *> {
<< activation_scale_factor << ", restarting from corresponding memory: " << input->name << std::endl; << activation_scale_factor << ", restarting from corresponding memory: " << input->name << std::endl;
// try updating memory input layer scale factor and restart from it // try updating memory input layer scale factor and restart from it
quantSibling->_src_quant = quantSibling->_dst_quant = inputQuant->_dst_quant; quantSibling->_src_quant.SetScale(inputQuant->_dst_quant.GetScale());
quantSibling->_dst_quant.SetScale(inputQuant->_dst_quant.GetScale());
result = ScaleFactorUpdateResult(input.get()); result = ScaleFactorUpdateResult(input.get());
return true; return true;
} }
@ -241,9 +512,15 @@ class ScaleFactorPerLayer<InferenceEngine::CNNLayer *> {
if (cnnLayer->type == "Const") { if (cnnLayer->type == "Const") {
if (quant->_dst_quant.IsScaleSet()) { if (quant->_dst_quant.IsScaleSet()) {
quant->_src_quant = quant->_dst_quant; quant->_src_quant = quant->_dst_quant;
return ScaleFactorUpdateResult(); return true;
} }
auto max_val = std::numeric_limits<float>::min();
auto min_val = std::numeric_limits<float>::max();
if (quant->_dst_quant.IsStatsSet()) {
min_val = quant->_dst_quant.GetMinValues().front();
max_val = quant->_dst_quant.GetMaxValues().front();
} else {
auto blob = cnnLayer->blobs["custom"]; auto blob = cnnLayer->blobs["custom"];
auto blob_precision = blob->getTensorDesc().getPrecision(); auto blob_precision = blob->getTensorDesc().getPrecision();
@ -256,9 +533,6 @@ class ScaleFactorPerLayer<InferenceEngine::CNNLayer *> {
blob = make_fp32_blob(blob); blob = make_fp32_blob(blob);
} }
auto max_val = std::numeric_limits<float>::min();
auto min_val = std::numeric_limits<float>::max();
auto flt_buf = blob->buffer().as<float*>(); auto flt_buf = blob->buffer().as<float*>();
auto size = blob->size(); auto size = blob->size();
@ -267,23 +541,26 @@ class ScaleFactorPerLayer<InferenceEngine::CNNLayer *> {
if (val > max_val) max_val = val; if (val > max_val) max_val = val;
if (val < min_val) min_val = val; if (val < min_val) min_val = val;
} }
}
auto levels = fakeQuantize ? MAX_VAL_2B_FEAT : std::numeric_limits<int16_t>::max();
auto abs_val = std::max(std::abs(max_val), std::abs(min_val)); auto abs_val = std::max(std::abs(max_val), std::abs(min_val));
auto scale_val = static_cast<float>(std::numeric_limits<int16_t>::max()) / abs_val; auto scale_val = static_cast<float>(levels) / abs_val;
//TODO: use FQ formula for scale factor calculation
// TODO: Investigate what should be the scale in such cases (31910) if (std::isinf(scale_val) || fp32eq(abs_val, 0.0f)) {
if (std::isinf(scale_val)) { quant->_dst_quant.SetScale(fakeQuantize ? levels : 1.0f);
quant->_dst_quant.SetScale(quant->_src_quant.GetScale());
} else { } else {
quant->_dst_quant.SetScale(scale_val); quant->_dst_quant.SetScale(scale_val);
} }
quant->_src_quant.SetScale(quant->_dst_quant.GetScale());
return ScaleFactorUpdateResult(); return true;
} }
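The Const branch above boils down to scale = levels / max(|min|, |max|), with a fallback when the blob is all zeros. A hedged sketch of the same calculation (names and the fallback choice are illustrative):

#include <algorithm>
#include <cmath>

// Illustrative only: scale factor for a constant blob given its value range,
// mirroring the Const branch above (levels over the maximum absolute value,
// with a fallback when the blob is all zeros and the scale would be infinite).
static float ConstScale(float minVal, float maxVal, float levels) {
    const float absVal = std::max(std::abs(maxVal), std::abs(minVal));
    const float scale = levels / absVal;
    if (std::isinf(scale) || absVal == 0.0f) {
        return levels;  // fallback, matching the FakeQuantize path above
    }
    return scale;
}
// Example: levels = 16384, blob range [-0.5, 2.0] -> scale = 8192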
if (!CNNNetHasPrevLayer(cnnLayer)) { if (!CNNNetHasPrevLayer(cnnLayer)) {
quant->_dst_quant = quant->_src_quant; quant->_dst_quant = quant->_src_quant;
return ScaleFactorUpdateResult(); return true;
} }
// by default layer is pass thru its scale factor // by default layer is pass thru its scale factor
@ -292,17 +569,41 @@ class ScaleFactorPerLayer<InferenceEngine::CNNLayer *> {
THROW_GNA_EXCEPTION << "layer: " << CNNNetPrevLayer(cnnLayer)->name << "not quantized"; THROW_GNA_EXCEPTION << "layer: " << CNNNetPrevLayer(cnnLayer)->name << "not quantized";
} }
quant->_src_quant = inputQuant->_dst_quant; if (layerInfo.isPower() && !layerInfo.isActivation()) {
if (layerInfo.isActivation()) { auto quant = InferenceEngine::getInjectedData<QuantizedLayerParams>(*cnnLayer);
auto powerLayer = dynamic_cast<InferenceEngine::PowerLayer const*>(cnnLayer);
if (!powerLayer) {
THROW_IE_EXCEPTION << "Incorrect Power Layer pointer \n";
}
auto powerScale = std::abs(powerLayer->scale);
if (fp32eq(powerScale, 0.0f)) {
powerScale = 1.0f;
}
auto weightsScaleFactor = MAX_VAL_2B_WEIGHT / powerScale;
quant->_src_quant.SetScale(inputQuant->_dst_quant.GetScale());
quant->_weights_quant.SetScale(weightsScaleFactor);
quant->_dst_quant.SetScale(quant->_weights_quant.GetScale() * quant->_src_quant.GetScale());
return true;
} else if (layerInfo.isActivation()) {
// todo: calculate proper scale factor where we need to expand it a bit to be safe to stay in int16 weights // todo: calculate proper scale factor where we need to expand it a bit to be safe to stay in int16 weights
// set the initial value // set the initial value
if (!quant->_dst_quant.IsScaleSet()) { if (!quant->_dst_quant.IsScaleSet() || fp32eq(quant->_dst_quant.GetScale(), 1.0f) ||
auto scale = getActivationScale(cnnLayer, layerInfo); !fp32eq(quant->_src_quant.GetScale(), inputQuant->_dst_quant.GetScale())) {
quant->_src_quant.SetScale(inputQuant->_dst_quant.GetScale());
auto scale = getActivationScale(cnnLayer, layerInfo, fakeQuantize);
quant->_dst_quant.SetScale(scale); quant->_dst_quant.SetScale(scale);
} }
return true; return true;
} else if (layerInfo.isCropAffined()) {
auto weightsScaleFactor = 1;
quant->_weights_quant.SetScale(weightsScaleFactor);
quant->_src_quant.SetScale(inputQuant->_dst_quant.GetScale());
quant->_dst_quant.SetScale(quant->_weights_quant.GetScale() * quant->_src_quant.GetScale());
return true;
} }
quant->_dst_quant = inputQuant->_dst_quant; quant->_src_quant.SetScale(inputQuant->_dst_quant.GetScale());
quant->_dst_quant.SetScale(inputQuant->_dst_quant.GetScale());
return true; return true;
} }
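For the non-activation Power branch above, the weights scale is MAX_VAL_2B_WEIGHT divided by the absolute power scale (falling back to 1 for a zero scale), and the destination scale is that times the source scale. A small numeric sketch (the constant value is an assumption for the example):

#include <cmath>

// Illustrative: scale factors for a Power layer treated as an affine op,
// mirroring the branch above. MAX_VAL_2B_WEIGHT is assumed to be 16384 here.
static float PowerDstScale(float srcScale, float powerScale) {
    const float kMaxVal2BWeight = 16384.0f;  // assumption for this example
    float absScale = std::abs(powerScale);
    if (absScale == 0.0f) {
        absScale = 1.0f;
    }
    const float weightsScale = kMaxVal2BWeight / absScale;
    return weightsScale * srcScale;
}
// Example: srcScale = 2048, power scale = 0.5 -> weights scale 32768, dst scale ~ 6.7e7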
@ -311,7 +612,7 @@ class ScaleFactorPerLayer<InferenceEngine::CNNLayer *> {
template<> template<>
class ScaleFactorPerLayer<InferenceEngine::EltwiseLayer*> { class ScaleFactorPerLayer<InferenceEngine::EltwiseLayer*> {
public: public:
bool operator()(InferenceEngine::EltwiseLayer* eltwiseLayer, int weightsSize, ScaleFactorUpdateResult &result) { bool operator()(InferenceEngine::EltwiseLayer* eltwiseLayer, int weightsSize, ScaleFactorUpdateResult &result, const bool fakeQuantize) {
if ( !eltwiseLayer ) { if ( !eltwiseLayer ) {
THROW_GNA_EXCEPTION << "Incorrect Eltwise Layer pointer \n"; THROW_GNA_EXCEPTION << "Incorrect Eltwise Layer pointer \n";
} }
@ -325,7 +626,7 @@ class ScaleFactorPerLayer<InferenceEngine::EltwiseLayer*> {
switch (eltwiseLayer->_operation) { switch (eltwiseLayer->_operation) {
case InferenceEngine::EltwiseLayer::Prod: { case InferenceEngine::EltwiseLayer::Prod: {
quantData->_weights_quant = quantParams1->_dst_quant; quantData->_weights_quant.SetScale(quantParams1->_dst_quant.GetScale());
quantData->_dst_quant.SetScale(quantParams0->_dst_quant.GetScale() * quantParams1->_dst_quant.GetScale()); quantData->_dst_quant.SetScale(quantParams0->_dst_quant.GetScale() * quantParams1->_dst_quant.GetScale());
break; break;
} }
@ -344,9 +645,51 @@ class ScaleFactorPerLayer<InferenceEngine::EltwiseLayer*> {
std::swap(quantParams0, quantParams1); std::swap(quantParams0, quantParams1);
} }
auto prevLayer = in1;
while (LayerInfo(prevLayer).isNonFunctional() && CNNNetHasPrevLayer(prevLayer.get(), 0)) {
prevLayer = CNNNetPrevLayer(prevLayer);
}
// this path might result in significant data loss // this path might result in significant data loss
quantData->_bias_quant.SetScale(quantParams1->_dst_quant.GetScale() / quantParams0->_dst_quant.GetScale()); quantData->_bias_quant.SetScale(quantParams1->_dst_quant.GetScale() / quantParams0->_dst_quant.GetScale());
quantData->_weights_quant.SetScale(quantParams1->_dst_quant.GetScale() / quantParams0->_dst_quant.GetScale()); auto weightsScale = quantParams1->_dst_quant.GetScale() / quantParams0->_dst_quant.GetScale();
auto prevLayerIn1 = CNNNetPrevLayer(in1);
// If the previous layer is one whose weights scale factor can be chosen freely,
// try to find a scale factor that lets the eltwise operation use an integer
// weights scale factor.
// A non-integer weights scale factor for eltwise sum/sub causes accuracy degradation.
if (fakeQuantize && LayerInfo(in1).isWeightableIdentity() &&
(prevLayerIn1 == nullptr || LayerInfo(prevLayerIn1).has16BOutput())) {
auto bestWeightsScale = 0.0f;
auto bestError = static_cast<float>(std::numeric_limits<int16_t>::max());
auto scaleIn0Dst = quantParams0->_dst_quant.GetScale();
auto scaleIn1Src = quantParams1->_src_quant.GetScale();
for (size_t i = MAX_VAL_2B_FEAT; i > 0; --i) {
auto scaleIn1Dst = i * scaleIn1Src;
auto eltwiseWeightsScale = scaleIn1Dst / scaleIn0Dst;
if (eltwiseWeightsScale < 1.0 || eltwiseWeightsScale > std::numeric_limits<int16_t>::max() - 1) {
continue;
}
auto error = std::abs(eltwiseWeightsScale - static_cast<int16_t>(eltwiseWeightsScale));
if (error < bestError) {
bestError = error;
bestWeightsScale = i;
}
if (fp32eq(error, 0.0f)) {
break;
}
}
if (!fp32eq(bestWeightsScale, quantParams1->_weights_quant.GetScale())) {
quantParams1->_weights_quant.SetScale(bestWeightsScale);
quantParams1->_dst_quant.SetScale(quantParams1->_weights_quant.GetScale() * quantParams1->_src_quant.GetScale());
result = ScaleFactorUpdateResult(in1.get());
return true;
}
}
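The loop above scans candidate multipliers of the source scale and keeps the one whose resulting eltwise weights scale is closest to an integer. A simplified, self-contained sketch of that search (hypothetical helper, rounding used in place of truncation):

#include <cmath>
#include <cstdint>
#include <limits>

// Illustrative sketch of the search above: pick an integer multiplier i so that
// (i * scaleIn1Src) / scaleIn0Dst lands as close to an integer as possible,
// which keeps the eltwise sum/sub weights scale free of rounding error.
static float PickEltwiseMultiplier(float scaleIn0Dst, float scaleIn1Src, int maxMultiplier) {
    float best = 0.0f;
    float bestError = static_cast<float>(std::numeric_limits<int16_t>::max());
    for (int i = maxMultiplier; i > 0; --i) {
        const float weightsScale = (i * scaleIn1Src) / scaleIn0Dst;
        if (weightsScale < 1.0f || weightsScale > std::numeric_limits<int16_t>::max() - 1) {
            continue;
        }
        const float error = std::abs(weightsScale - std::round(weightsScale));
        if (error < bestError) {
            bestError = error;
            best = static_cast<float>(i);
        }
        if (error == 0.0f) {
            break;
        }
    }
    return best;
}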
quantData->_weights_quant.SetScale(weightsScale);
quantData->_dst_quant.SetScale(quantParams1->_dst_quant.GetScale()); quantData->_dst_quant.SetScale(quantParams1->_dst_quant.GetScale());
// eltwise will always work in int16 // eltwise will always work in int16
@ -382,6 +725,22 @@ class ScaleFactorPerLayer<InferenceEngine::EltwiseLayer*> {
break; break;
} }
if (fakeQuantize && info.isWeightableIdentity()) {
auto quantDataForInputLayer = InferenceEngine::getInjectedData<QuantizedLayerParams>(*in);
if (!fp32eq(quantDataForInputLayer->_weights_quant.GetScale(), 1.0f)) {
auto reducer = quantData->_weights_quant.GetScale() / std::numeric_limits<int16_t>::max();
reducer = std::max(1.0f, reducer);
auto newWeightsScale = quantDataForInputLayer->_weights_quant.GetScale() / reducer;
newWeightsScale = std::max(1.0f, newWeightsScale);
quantDataForInputLayer->_weights_quant.SetScale(static_cast<int32_t>(newWeightsScale));
quantDataForInputLayer->_dst_quant.SetScale(quantDataForInputLayer->_weights_quant.GetScale() *
quantDataForInputLayer->_src_quant.GetScale());
result = ScaleFactorUpdateResult(in.get());
return true;
}
}
// if we are here it means that we are in the port 1 // if we are here it means that we are in the port 1
if (info.isFullyConnected() || info.isConvolution()) { if (info.isFullyConnected() || info.isConvolution()) {
auto quantDataForInputLayer = InferenceEngine::getInjectedData<QuantizedLayerParams>(*in); auto quantDataForInputLayer = InferenceEngine::getInjectedData<QuantizedLayerParams>(*in);
@ -408,7 +767,7 @@ class ScaleFactorPerLayer<InferenceEngine::EltwiseLayer*> {
template<> template<>
class ScaleFactorPerLayer<InferenceEngine::ConcatLayer*> { class ScaleFactorPerLayer<InferenceEngine::ConcatLayer*> {
public: public:
bool operator()(InferenceEngine::ConcatLayer* concatLayer, int weightsSize, ScaleFactorUpdateResult &result) { bool operator()(InferenceEngine::ConcatLayer* concatLayer, int weightsSize, ScaleFactorUpdateResult &result, const bool fakeQuantize) {
if ( !concatLayer ) { if ( !concatLayer ) {
THROW_GNA_EXCEPTION << "Incorrect Concat Layer pointer \n"; THROW_GNA_EXCEPTION << "Incorrect Concat Layer pointer \n";
} }
@ -417,10 +776,6 @@ class ScaleFactorPerLayer<InferenceEngine::ConcatLayer*> {
THROW_GNA_EXCEPTION << "Concat layer has unsupported number of incoming layers."; THROW_GNA_EXCEPTION << "Concat layer has unsupported number of incoming layers.";
} }
auto fp32eq = [](float p1, float p2) -> bool {
return (std::abs(p1 - p2) <= 0.00001f * std::min(std::abs(p1), std::abs(p2)));
};
auto quantData = InferenceEngine::getInjectedData<QuantizedLayerParams>(*concatLayer); auto quantData = InferenceEngine::getInjectedData<QuantizedLayerParams>(*concatLayer);
std::vector<InferenceEngine::CNNLayerPtr> inputLayers; std::vector<InferenceEngine::CNNLayerPtr> inputLayers;
for (auto input_idx = 0; input_idx != concatLayer->insData.size(); input_idx++) { for (auto input_idx = 0; input_idx != concatLayer->insData.size(); input_idx++) {
@ -435,7 +790,7 @@ class ScaleFactorPerLayer<InferenceEngine::ConcatLayer*> {
auto in0 = inputLayers.front(); auto in0 = inputLayers.front();
auto quantParams0 = InferenceEngine::getInjectedData<QuantizedLayerParams>(in0); auto quantParams0 = InferenceEngine::getInjectedData<QuantizedLayerParams>(in0);
auto scaleFactor = quantParams0->_dst_quant.GetScale(); auto scaleFactor = quantParams0->_dst_quant.GetScale();
auto scaleFactorCheck = [scaleFactor, &fp32eq](InferenceEngine::CNNLayerPtr& inputLayer) { auto scaleFactorCheck = [scaleFactor](InferenceEngine::CNNLayerPtr& inputLayer) {
auto quantParams = InferenceEngine::getInjectedData<QuantizedLayerParams>(inputLayer); auto quantParams = InferenceEngine::getInjectedData<QuantizedLayerParams>(inputLayer);
return fp32eq(quantParams->_dst_quant.GetScale(), scaleFactor); return fp32eq(quantParams->_dst_quant.GetScale(), scaleFactor);
}; };
@ -453,14 +808,14 @@ class ScaleFactorPerLayer<InferenceEngine::ConcatLayer*> {
}; };
GNAPluginNS::QuantizedLayerParams* sourceQuantParams = nullptr; GNAPluginNS::QuantizedLayerParams* sourceQuantParams = nullptr;
auto firstInputIt = std::find_if(inputLayers.begin(), inputLayers.end(), inputLayerCheck); auto sourceLayerIt = std::find_if(inputLayers.begin(), inputLayers.end(), inputLayerCheck);
if (firstInputIt != inputLayers.end()) { if (sourceLayerIt != inputLayers.end()) {
auto quantParamsFirst = InferenceEngine::getInjectedData<QuantizedLayerParams>(*firstInputIt); auto quantParamsFirst = InferenceEngine::getInjectedData<QuantizedLayerParams>(*sourceLayerIt);
auto nextInputIt = firstInputIt + 1; auto nextInputIt = sourceLayerIt + 1;
while ((nextInputIt = std::find_if(nextInputIt, inputLayers.end(), inputLayerCheck)) != inputLayers.end()) { while ((nextInputIt = std::find_if(nextInputIt, inputLayers.end(), inputLayerCheck)) != inputLayers.end()) {
auto quantParamsSecond = InferenceEngine::getInjectedData<QuantizedLayerParams>(*nextInputIt); auto quantParamsSecond = InferenceEngine::getInjectedData<QuantizedLayerParams>(*nextInputIt);
if (!fp32eq(quantParamsSecond->_dst_quant.GetScale(), quantParamsFirst->_dst_quant.GetScale())) { if (!fp32eq(quantParamsSecond->_dst_quant.GetScale(), quantParamsFirst->_dst_quant.GetScale())) {
THROW_GNA_EXCEPTION << "Two Input layers " << (*firstInputIt)->name THROW_GNA_EXCEPTION << "Two Input layers " << (*sourceLayerIt)->name
<< " and " << (*nextInputIt)->name << " have different scales in concat!!! \n"; << " and " << (*nextInputIt)->name << " have different scales in concat!!! \n";
} }
} }
@ -469,7 +824,6 @@ class ScaleFactorPerLayer<InferenceEngine::ConcatLayer*> {
// find a source quant value // find a source quant value
// - 1st candidate - input layer // - 1st candidate - input layer
// - 2nd candidate - non-activation layer with non-1 scale factor // - 2nd candidate - non-activation layer with non-1 scale factor
// - 3rd candidate - 1st layer with non-1 scale factor
static std::map<std::string, size_t> restarted_counter; static std::map<std::string, size_t> restarted_counter;
auto restartedCountIt = restarted_counter.find(concatLayer->name); auto restartedCountIt = restarted_counter.find(concatLayer->name);
if (restartedCountIt == restarted_counter.end()) { if (restartedCountIt == restarted_counter.end()) {
@ -477,30 +831,46 @@ class ScaleFactorPerLayer<InferenceEngine::ConcatLayer*> {
restartedCountIt = pos.first; restartedCountIt = pos.first;
} }
auto sourceLayerIt = firstInputIt;
if (sourceLayerIt == inputLayers.end()) { if (sourceLayerIt == inputLayers.end()) {
if (((restartedCountIt->second) / 2) % 2 == 1) { if (((restartedCountIt->second) / 2) % 2 == 1) {
std::reverse(inputLayers.begin(), inputLayers.end()); std::reverse(inputLayers.begin(), inputLayers.end());
} }
if (fakeQuantize) {
sourceLayerIt = inputLayers.begin();
auto quantParamsFirst = InferenceEngine::getInjectedData<QuantizedLayerParams>(*inputLayers.begin());
auto minScaleFactor = quantParamsFirst->_dst_quant.GetScale();
for (auto it = inputLayers.begin(); it != inputLayers.end(); ++it) {
auto quantParams = InferenceEngine::getInjectedData<QuantizedLayerParams>(*it);
if (quantParams->_dst_quant.GetScale() < minScaleFactor &&
!fp32eq(quantParams->_dst_quant.GetScale(), 1.0f) ||
fp32eq(minScaleFactor, 1.0f)) {
minScaleFactor = quantParams->_dst_quant.GetScale();
sourceLayerIt = it;
}
}
} else {
if (((restartedCountIt->second) / 4) % 2 == 0) { if (((restartedCountIt->second) / 4) % 2 == 0) {
auto sourceLayerCheck = [&fp32eq](InferenceEngine::CNNLayerPtr& inputLayer) { auto sourceLayerCheck = [](InferenceEngine::CNNLayerPtr& inputLayer) {
auto quantParams = InferenceEngine::getInjectedData<QuantizedLayerParams>(inputLayer); auto quantParams = InferenceEngine::getInjectedData<QuantizedLayerParams>(inputLayer);
LayerInfo info(inputLayer); LayerInfo info(inputLayer);
return !info.isActivation() && !fp32eq(quantParams->_dst_quant.GetScale(), 1.0f); return !info.isActivation() && !fp32eq(quantParams->_dst_quant.GetScale(), 1.0f);
}; };
sourceLayerIt = std::find_if(inputLayers.begin(), inputLayers.end(), sourceLayerCheck); sourceLayerIt = std::find_if(inputLayers.begin(), inputLayers.end(), sourceLayerCheck);
} }
}
++restartedCountIt->second;
if (sourceLayerIt == inputLayers.end()) { if (sourceLayerIt == inputLayers.end()) {
auto nonDefaultScaleFactor = [&fp32eq](InferenceEngine::CNNLayerPtr& inputLayer) { auto nonDefaultScaleFactor = [](InferenceEngine::CNNLayerPtr& inputLayer) {
auto quantParams = InferenceEngine::getInjectedData<QuantizedLayerParams>(inputLayer); auto quantParams = InferenceEngine::getInjectedData<QuantizedLayerParams>(inputLayer);
return !fp32eq(quantParams->_dst_quant.GetScale(), 1.0f); return !fp32eq(quantParams->_dst_quant.GetScale(), 1.0f);
}; };
sourceLayerIt = std::find_if(inputLayers.begin(), inputLayers.end(), nonDefaultScaleFactor); sourceLayerIt = std::find_if(inputLayers.begin(), inputLayers.end(), nonDefaultScaleFactor);
} }
}
++restartedCountIt->second;
}
std::set<size_t> concatIdxToUpdate; std::set<size_t> concatIdxToUpdate;
if (sourceLayerIt != inputLayers.end()) { if (sourceLayerIt != inputLayers.end()) {
@ -514,6 +884,10 @@ class ScaleFactorPerLayer<InferenceEngine::ConcatLayer*> {
continue; continue;
} }
if (fakeQuantize) {
concatIdxToUpdate.insert(std::distance(inputLayers.begin(), it));
quantParamsIn->_dst_quant.SetScale(quantParams->_dst_quant.GetScale());
} else {
// possible case when some of the concat inputs are free to select scale ex: const->concat<-affine // possible case when some of the concat inputs are free to select scale ex: const->concat<-affine
if (!fp32eq(quantParamsIn->_dst_quant.GetScale(), 1.0f) && !LayerInfo(*it).isActivation()) { if (!fp32eq(quantParamsIn->_dst_quant.GetScale(), 1.0f) && !LayerInfo(*it).isActivation()) {
concatIdxToUpdate.insert(std::distance(inputLayers.begin(), it)); concatIdxToUpdate.insert(std::distance(inputLayers.begin(), it));
@ -522,16 +896,17 @@ class ScaleFactorPerLayer<InferenceEngine::ConcatLayer*> {
quantParamsIn->_dst_quant.SetScale(quantParams->_dst_quant.GetScale()); quantParamsIn->_dst_quant.SetScale(quantParams->_dst_quant.GetScale());
} }
} }
}
auto updatedScaleFactor = InferenceEngine::getInjectedData<QuantizedLayerParams>(in0)->_dst_quant.GetScale(); auto updatedScaleFactor = InferenceEngine::getInjectedData<QuantizedLayerParams>(in0)->_dst_quant.GetScale();
auto equalScaleFactor = [updatedScaleFactor, &fp32eq](InferenceEngine::CNNLayerPtr& inputLayer) { auto equalScaleFactor = [updatedScaleFactor](InferenceEngine::CNNLayerPtr& inputLayer) {
auto quantParams = InferenceEngine::getInjectedData<QuantizedLayerParams>(inputLayer); auto quantParams = InferenceEngine::getInjectedData<QuantizedLayerParams>(inputLayer);
return fp32eq(quantParams->_dst_quant.GetScale(), updatedScaleFactor); return fp32eq(quantParams->_dst_quant.GetScale(), updatedScaleFactor);
}; };
auto layerIt = std::find_if_not(inputLayers.begin() + 1, inputLayers.end(), equalScaleFactor); auto layerIt = std::find_if_not(inputLayers.begin() + 1, inputLayers.end(), equalScaleFactor);
if (layerIt != inputLayers.end()) { if (layerIt != inputLayers.end()) {
THROW_GNA_EXCEPTION << "layers entered into concat have different scale factors" << concatLayer->name; THROW_GNA_EXCEPTION << "layers entered into concat have different scale factors. Layer name: " << concatLayer->name;
} }
quantData->_dst_quant.SetScale(sourceQuantParams->_dst_quant.GetScale()); quantData->_dst_quant.SetScale(sourceQuantParams->_dst_quant.GetScale());
@ -555,7 +930,7 @@ class ScaleFactorPerLayer<InferenceEngine::ConcatLayer*> {
gnalog() << "[UFS] from : " << concatLayer->name << " reached: " << layer->name; gnalog() << "[UFS] from : " << concatLayer->name << " reached: " << layer->name;
// found that direct input to concat is an indirect parent of align filter - so no link required // found that direct input to concat is an indirect parent of align filter - so no link required
auto info = LayerInfo(layer); auto info = LayerInfo(layer);
if (!info.isWeightable() && !info.isActivation() && !info.isConst() && !info.isMemory()) { if (!info.isWeightable() && !info.isActivation() && !info.isConst()) {
gnalog() << "... skipped\n"; gnalog() << "... skipped\n";
return; return;
} }
@ -575,16 +950,44 @@ class ScaleFactorPerLayer<InferenceEngine::ConcatLayer*> {
auto restarLayerInfo = LayerInfo(restartedLayer); auto restarLayerInfo = LayerInfo(restartedLayer);
if (restarLayerInfo.isActivation()) { if (restarLayerInfo.isActivation()) {
// requantize activation by just changing its output scale factor // requantize activation by just changing its output scale factor
quantDataForConCatInput->_dst_quant.SetScale(sourceQuantParams->_dst_quant.GetScale()); auto newScaleFactor = sourceQuantParams->_dst_quant.GetScale();
auto skipNonFunctional = [](InferenceEngine::CNNLayerPtr l) {
return LayerInfo(l).isNonFunctional();
};
auto prevLayer = CNNNetPrevLayerSkipCertain(restartedLayer, 0, skipNonFunctional);
auto prevLayer2 = prevLayer != nullptr ? CNNNetPrevLayerSkipCertain(prevLayer, 0, skipNonFunctional) : nullptr;
if (fakeQuantize && prevLayer != nullptr && LayerInfo(prevLayer).isWeightableIdentity() &&
(prevLayer2 == nullptr || LayerInfo(prevLayer2).has16BOutput())) {
auto weightsScales = generateScaleFactors(MIN_SEARCH_WEIGHTS_VAL, MAX_SEARCH_WEIGHTS_VAL,
MAX_SEARCH_WEIGHTS_VAL - MIN_SEARCH_WEIGHTS_VAL);
auto prevLayerQuant = InferenceEngine::getInjectedData<QuantizedLayerParams>(*prevLayer);
auto bestWeightsScale = 1.0f;
auto slopes = getPWLSlopes(restarLayerInfo);
if (!slopes.empty() && !fp32eq(prevLayerQuant->_src_quant.GetScale(), newScaleFactor)) {
bestWeightsScale = selectBestWeightsScaleFactors(prevLayerQuant->_src_quant.GetScale(),
newScaleFactor, weightsScales, { 1.0f });
} }
if (restarLayerInfo.isConst()) { if (!slopes.empty() && !fp32eq(bestWeightsScale, prevLayerQuant->_weights_quant.GetScale())) {
gnalog() << "[INFO][Concat] Optimizing weights scale factor for '" << prevLayer->name << "' layer. Change from "
<< prevLayerQuant->_weights_quant.GetScale() << " to " << bestWeightsScale << "\n";
prevLayerQuant->_weights_quant.SetScale(bestWeightsScale);
prevLayerQuant->_dst_quant.SetScale(prevLayerQuant->_weights_quant.GetScale() * prevLayerQuant->_src_quant.GetScale());
result = ScaleFactorUpdateResult(prevLayer.get());
return true;
}
}
quantDataForConCatInput->_dst_quant.SetScale(newScaleFactor);
} else if (restarLayerInfo.isConst()) {
gnalog() << "... warning const layer will be requantized\n"; gnalog() << "... warning const layer will be requantized\n";
quantDataForConCatInput->_dst_quant.SetScale(sourceQuantParams->_dst_quant.GetScale());
}
if (restarLayerInfo.isMemory()) {
gnalog() << "... warning memory layer will be requantized\n";
quantDataForConCatInput->_src_quant.SetScale(sourceQuantParams->_dst_quant.GetScale()); quantDataForConCatInput->_src_quant.SetScale(sourceQuantParams->_dst_quant.GetScale());
quantDataForConCatInput->_dst_quant.SetScale(sourceQuantParams->_dst_quant.GetScale()); quantDataForConCatInput->_dst_quant.SetScale(sourceQuantParams->_dst_quant.GetScale());
} else {
THROW_GNA_EXCEPTION << "cannot requantize '" << restartedLayer->name << "' input to concat: " << concatLayer->name;
} }
result = ScaleFactorUpdateResult(restartedLayer.get()); result = ScaleFactorUpdateResult(restartedLayer.get());
} }
@ -607,7 +1010,7 @@ class ScaleFactorPerLayer<InferenceEngine::WeightableLayer*> {
uint16_t const _scale_change_threshold_200 = 200; uint16_t const _scale_change_threshold_200 = 200;
public: public:
bool operator()(InferenceEngine::WeightableLayer *wl, int weightsSize, ScaleFactorUpdateResult &result) { bool operator()(InferenceEngine::WeightableLayer *wl, int weightsSize, ScaleFactorUpdateResult &result, const bool fakeQuantize) {
if ( !wl ) { if ( !wl ) {
THROW_GNA_EXCEPTION << "Incorrect Weightable Layer pointer \n"; THROW_GNA_EXCEPTION << "Incorrect Weightable Layer pointer \n";
} else if (!wl->_weights) { } else if (!wl->_weights) {
@ -620,8 +1023,30 @@ class ScaleFactorPerLayer<InferenceEngine::WeightableLayer*> {
auto quant = InferenceEngine::getInjectedData<QuantizedLayerParams>(*wl); auto quant = InferenceEngine::getInjectedData<QuantizedLayerParams>(*wl);
quant->_src_quant = quantDataForInputLayer->_dst_quant; quant->_src_quant = quantDataForInputLayer->_dst_quant;
if (quant->_weights_quant.IsStatsSet() && !quant->_weights_quant.IsScaleSet()) {
auto getScale = [&quant](size_t i) {
return (quant->_weights_quant.GetLevels() - 1) /
(quant->_weights_quant.GetMaxValues(false)[i] - quant->_weights_quant.GetMinValues(false)[i]);
};
float min_channel_scale = getScale(0);
for (uint32_t i = 1; i < quant->_weights_quant.GetMinValues().size(); i++) {
min_channel_scale = std::min(min_channel_scale, getScale(i));
}
auto multiplier = 1.0f;
if (quant->_weights_quant.GetLevels() <= std::numeric_limits<uint8_t>::max()) {
// GNA supports an additional multiplier only for 8-bit weights.
// The multiplier is used to extend the dynamic range.
multiplier = MAX_OUT_MULTIPLIER;
}
// Common weights scale calculation
quant->_weights_quant.SetScale(min_channel_scale * multiplier);
}
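The statistics branch above takes the smallest per-channel scale (levels - 1) / (max_i - min_i) and, for 8-bit weights, applies the extra output multiplier. A minimal sketch of that reduction (the multiplier constant is passed in rather than taken from the plugin):

#include <algorithm>
#include <cstdint>
#include <limits>
#include <vector>

// Illustrative: the common weights scale is the minimum per-channel scale, so
// every channel fits into the quantized range; 8-bit weights additionally get
// an output multiplier to extend dynamic range (constant supplied by caller).
static float CommonWeightsScale(const std::vector<float>& mins,
                                const std::vector<float>& maxs,
                                int levels,
                                float maxOutMultiplier) {
    float minChannelScale = std::numeric_limits<float>::max();
    for (size_t i = 0; i < mins.size(); ++i) {
        minChannelScale = std::min(minChannelScale, (levels - 1) / (maxs[i] - mins[i]));
    }
    const float multiplier =
        (levels <= std::numeric_limits<uint8_t>::max()) ? maxOutMultiplier : 1.0f;
    return minChannelScale * multiplier;
}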
// TODO: pass 8 bits somehow // TODO: pass 8 bits somehow
if (quant->_weights_quant.GetScale() == 1.0f) { if (!quant->_weights_quant.IsScaleSet()) {
size_t scaleRange = 0; size_t scaleRange = 0;
if (weightsSize == 2) { if (weightsSize == 2) {
scaleRange = MAX_VAL_2B_WEIGHT; scaleRange = MAX_VAL_2B_WEIGHT;
@ -632,7 +1057,7 @@ class ScaleFactorPerLayer<InferenceEngine::WeightableLayer*> {
} }
quant->_weights_quant.SetScale( quant->_weights_quant.SetScale(
ScaleFactorForQuantization(wl->_weights->buffer().as<float *>(), scaleRange, wl->_weights->size())); ScaleFactorForQuantization(wl->_weights->buffer().as<float *>(), scaleRange, wl->_weights->size()));
if (quant->_weights_quant.GetScale() == -1.0f) { if (quant->_weights_quant.GetScale() == -1.0f || (fakeQuantize && LayerInfo(wl).isConcatAlignFilter())) {
quant->_weights_quant.SetScale(1.0f); quant->_weights_quant.SetScale(1.0f);
} }
@ -685,6 +1110,39 @@ class ScaleFactorPerLayer<InferenceEngine::WeightableLayer*> {
} }
quant->_dst_quant.SetScale(quant->_weights_quant.GetScale() * quant->_src_quant.GetScale()); quant->_dst_quant.SetScale(quant->_weights_quant.GetScale() * quant->_src_quant.GetScale());
if (quant->_dst_quant.IsStatsSet()) {
// Adjust weights scale factor if output values exceed int32 maximum value
if (wl->_biases && !quant->_bias_quant.IsScaleSet()) {
auto minMax = FindMinMaxValues(wl->_biases->buffer().as<float*>(), wl->_biases->size());
quant->_bias_quant.SetMinValues({ minMax.first });
quant->_bias_quant.SetMaxValues({ minMax.second });
auto biasScale = ScaleFactorForQuantization(wl->_biases->buffer().as<float*>(), MAX_VAL_4B_BIAS, wl->_biases->size());
quant->_bias_quant.SetScale(biasScale);
if (quant->_bias_quant.GetScale() != -1.0f && quant->_bias_quant.GetScale() < quant->_dst_quant.GetScale()) {
quant->_weights_quant.SetScale(quant->_bias_quant.GetScale() / quant->_src_quant.GetScale());
quant->_dst_quant.SetScale(quant->_weights_quant.GetScale() * quant->_src_quant.GetScale());
}
}
auto maxAbsVal = std::max(std::abs(quant->_dst_quant.GetMinValues().front()),
std::abs(quant->_dst_quant.GetMaxValues().front()));
auto maxIntVal = static_cast<int64_t>(maxAbsVal * quant->_dst_quant.GetScale() + 0.5f);
auto weightsReducer = static_cast<double>(maxIntVal) / std::numeric_limits<int32_t>::max();
weightsReducer = std::max(1.0, weightsReducer);
if (!fp32eq(weightsReducer, 1.0f)) {
quant->_weights_quant.SetScale(quant->_weights_quant.GetScale() / weightsReducer);
}
if (fp32eq(quant->_weights_quant.GetScale(), 0.0f) || std::isinf(quant->_weights_quant.GetScale())) {
quant->_weights_quant.SetScale(1.0f);
}
quant->_dst_quant.SetScale(quant->_weights_quant.GetScale() * quant->_src_quant.GetScale());
}
return true; return true;
} }
}; };
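The adjustment above shrinks the weights scale whenever the predicted integer output would overflow int32. A worked sketch of the reducer arithmetic (illustrative only, simplified to plain floats):

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <limits>

// Illustrative: if |output| * dstScale would exceed int32, shrink the weights
// scale by the overshoot ratio so the accumulator stays in range.
static float ReduceWeightsScale(float weightsScale, float srcScale, float maxAbsOutput) {
    const double maxIntVal = std::floor(maxAbsOutput * weightsScale * srcScale + 0.5);
    double reducer = maxIntVal / std::numeric_limits<int32_t>::max();
    reducer = std::max(1.0, reducer);
    return static_cast<float>(weightsScale / reducer);
}
// Example: srcScale = 16384, weightsScale = 2048, maxAbsOutput = 100
// -> maxIntVal ~ 3.36e9 > 2^31 - 1, reducer ~ 1.56, new weightsScale ~ 1310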
@ -692,8 +1150,8 @@ class ScaleFactorPerLayer<InferenceEngine::WeightableLayer*> {
template<> template<>
class ScaleFactorPerLayer<InferenceEngine::ScaleShiftLayer*> : public ScaleFactorPerLayer<InferenceEngine::WeightableLayer*> { class ScaleFactorPerLayer<InferenceEngine::ScaleShiftLayer*> : public ScaleFactorPerLayer<InferenceEngine::WeightableLayer*> {
public: public:
bool operator()(InferenceEngine::WeightableLayer *wl, int weightsSize, ScaleFactorUpdateResult &result) { bool operator()(InferenceEngine::WeightableLayer *wl, int weightsSize, ScaleFactorUpdateResult &result, const bool fakeQuantize) {
return ScaleFactorPerLayer<InferenceEngine::WeightableLayer*>::operator()(wl, 2, result); return ScaleFactorPerLayer<InferenceEngine::WeightableLayer*>::operator()(wl, 2, result, fakeQuantize);
} }
}; };
@ -717,10 +1175,11 @@ class ScaleFactorCalculator {
mutable Cnt::const_iterator idx; mutable Cnt::const_iterator idx;
mutable bool needRestart = false; mutable bool needRestart = false;
int weightsBytesSize; int weightsBytesSize;
bool isFakeQuantize;
public: public:
ScaleFactorCalculator(Cnt &net, int weightsBytesSize) ScaleFactorCalculator(Cnt &net, int weightsBytesSize, bool fakeQuantize)
: net(net), weightsBytesSize(weightsBytesSize) { : net(net), weightsBytesSize(weightsBytesSize), isFakeQuantize(fakeQuantize) {
idx = std::begin(this->net); idx = std::begin(this->net);
} }
bool needToRestart() const { bool needToRestart() const {
@ -736,7 +1195,7 @@ class ScaleFactorCalculator {
bool operator()(T ptr) const { bool operator()(T ptr) const {
needRestart = false; needRestart = false;
frontend::ScaleFactorUpdateResult result; frontend::ScaleFactorUpdateResult result;
if (!frontend::ScaleFactorPerLayer<T>()(ptr, weightsBytesSize, result)) { if (!frontend::ScaleFactorPerLayer<T>()(ptr, weightsBytesSize, result, isFakeQuantize)) {
return false; return false;
} }
if (result) { if (result) {

View File

@ -740,6 +740,8 @@ void GNAGraphCompiler::PowerPrimitive(InferenceEngine::CNNLayerPtr layer) {
auto orientation = kDnnInterleavedOrientation; auto orientation = kDnnInterleavedOrientation;
auto activation_type = DnnActivation::fromType(kActPow); auto activation_type = DnnActivation::fromType(kActPow);
activation_type.fqParams.set = false;
activation_type.srcFQParams.set = false;
activation_type.args.pow.exponent = power.power; activation_type.args.pow.exponent = power.power;
activation_type.args.pow.scale = power.scale; activation_type.args.pow.scale = power.scale;
activation_type.args.pow.offset = power.offset; activation_type.args.pow.offset = power.offset;
@ -768,7 +770,8 @@ void GNAGraphCompiler::PowerPrimitive(InferenceEngine::CNNLayerPtr layer) {
PwlDesignOpt16(activation_type, PwlDesignOpt16(activation_type,
ptr_pwl_segments, ptr_pwl_segments,
input_pwl_scale_factor, input_pwl_scale_factor,
output_pwl_scale_factor); output_pwl_scale_factor,
gnaFlags->pwlMaxErrorPercent);
} }
} }
@ -1668,14 +1671,6 @@ void GNAGraphCompiler::AffineFilterPrimitive(InferenceEngine::CNNLayerPtr layer)
} }
} }
void GNAGraphCompiler::FakeQuantizePrimitive(InferenceEngine::CNNLayerPtr layer) {
// in FP32 mode lets use special form of activation that satisfies fakeQuantize formula
if (gnaFlags->sw_fp32) {
PWLPrimitive(layer);
return;
}
}
void GNAGraphCompiler::PWLPrimitive(InferenceEngine::CNNLayerPtr layer) { void GNAGraphCompiler::PWLPrimitive(InferenceEngine::CNNLayerPtr layer) {
auto* generic = dynamic_cast<GenericLayer*>(layer.get()); auto* generic = dynamic_cast<GenericLayer*>(layer.get());
std::string type; std::string type;
@ -1768,6 +1763,24 @@ void GNAGraphCompiler::PWLPrimitive(InferenceEngine::CNNLayerPtr layer) {
THROW_GNA_EXCEPTION << "Activation function type not yet supported: " << type; THROW_GNA_EXCEPTION << "Activation function type not yet supported: " << type;
} }
auto activation_type = DnnActivation::fromType(it->second); auto activation_type = DnnActivation::fromType(it->second);
activation_type.fqParams.set = false;
if (quantized != nullptr && quantized->_dst_quant.IsStatsSet()) {
activation_type.fqParams.set = true;
activation_type.fqParams.levels = quantized->_dst_quant.GetLevels();
activation_type.fqParams.inputPerChannel = false;
activation_type.fqParams.input_low = &(quantized->_dst_quant.GetMinValues(true).front());
activation_type.fqParams.input_high = &(quantized->_dst_quant.GetMaxValues(true).front());
}
activation_type.srcFQParams.set = false;
if (quantized != nullptr && quantized->_src_quant.IsStatsSet()) {
activation_type.srcFQParams.set = true;
activation_type.srcFQParams.levels = quantized->_src_quant.GetLevels();
activation_type.srcFQParams.inputPerChannel = false;
activation_type.srcFQParams.input_low = &(quantized->_src_quant.GetMinValues(true).front());
activation_type.srcFQParams.input_high = &(quantized->_src_quant.GetMaxValues(true).front());
}
if (it->second == kActRelu) { if (it->second == kActRelu) {
auto reluLayer = dynamic_cast<ReLULayer*>(layer.get()); auto reluLayer = dynamic_cast<ReLULayer*>(layer.get());
activation_type.args.lrelu.negative_slope = reluLayer != nullptr ? reluLayer->negative_slope : 0.0f; activation_type.args.lrelu.negative_slope = reluLayer != nullptr ? reluLayer->negative_slope : 0.0f;
@ -1775,11 +1788,9 @@ void GNAGraphCompiler::PWLPrimitive(InferenceEngine::CNNLayerPtr layer) {
activation_type.args.lrelu.negative_slope = 0.0f; activation_type.args.lrelu.negative_slope = 0.0f;
} }
if (it->second == kActFakeQuantize) { if (quantized == nullptr && it->second == kActFakeQuantize) {
activation_type = GNAFakeQuantizeLayer(layer).parseAsActivation(); activation_type = GNAFakeQuantizeLayer(layer).parseAsActivation();
} } else if (it->second == kActKaldiLstmClipping) {
if (it->second == kActKaldiLstmClipping) {
auto clamp_layer = dynamic_cast<ClampLayer*>(layer.get()); auto clamp_layer = dynamic_cast<ClampLayer*>(layer.get());
if (clamp_layer) { if (clamp_layer) {
if (clamp_layer->min_value == 0 && clamp_layer->max_value == 0) { if (clamp_layer->min_value == 0 && clamp_layer->max_value == 0) {
@ -1856,7 +1867,8 @@ case name:\
PwlDesignOpt16(activation_type, PwlDesignOpt16(activation_type,
ptr_pwl_segments, ptr_pwl_segments,
input_pwl_scale_factor, input_pwl_scale_factor,
output_pwl_scale_factor); output_pwl_scale_factor,
gnaFlags->pwlMaxErrorPercent);
} }
ptr_pwl_segments_target = reinterpret_cast<gna_pwl_segment_t*>(&ptr_pwl_segments_target); ptr_pwl_segments_target = reinterpret_cast<gna_pwl_segment_t*>(&ptr_pwl_segments_target);
} }
@ -2001,7 +2013,7 @@ void GNAGraphCompiler::CreateLayerPrimitive(CNNLayerPtr layer) {
{{DelayedCopyLayerName}, CREATE(CopyPrimitive)}, {{DelayedCopyLayerName}, CREATE(CopyPrimitive)},
{{"TensorIterator"}, SKIP}, {{"TensorIterator"}, SKIP},
{{"LSTMCell"}, SKIP}, {{"LSTMCell"}, SKIP},
{{"FakeQuantize"}, CREATE(FakeQuantizePrimitive)} // TODO: fakequantize layer should be properly converted to GNA scale factors for integer case {{"FakeQuantize"}, CREATE(PWLPrimitive)}
}; };
(void)layersBuilder; (void)layersBuilder;
auto it = LayersBuilder::getStorage().find(layer->type); auto it = LayersBuilder::getStorage().find(layer->type);

View File

@ -663,10 +663,10 @@ inline void CNNNetworkRemoveLayer(CNNLayerPtr layer, bool checkDims = true) {
} }
gnalog() << "Removing " << layer->name << " layer\n"; gnalog() << "Removing " << layer->name << " layer\n";
if (layer->insData.size() != 1) { if (layer->insData.size() != 1) {
THROW_IE_EXCEPTION << "Cannot remove layer : "<< layer->name <<" that has not 1 input"; THROW_IE_EXCEPTION << "Cannot remove layer : "<< layer->name <<" that has different number of inputs than 1";
} }
if (layer->outData.size() != 1) { if (layer->outData.size() != 1) {
THROW_IE_EXCEPTION << "Cannot remove layer : "<< layer->name <<" that has not 1 output"; THROW_IE_EXCEPTION << "Cannot remove layer : "<< layer->name <<" that has different number of outputs than 1";
} }
auto isp = layer->insData.front().lock(); auto isp = layer->insData.front().lock();

View File

@ -24,7 +24,6 @@
#include <debug.h> #include <debug.h>
#include <gna/gna_config.hpp> #include <gna/gna_config.hpp>
#include "gna_plugin_config.hpp" #include "gna_plugin_config.hpp"
#include <legacy/ie_util_internal.hpp>
#include "gna_plugin.hpp" #include "gna_plugin.hpp"
#include "optimizer/gna_pass_manager.hpp" #include "optimizer/gna_pass_manager.hpp"
#include "layers/gna_layer_type.hpp" #include "layers/gna_layer_type.hpp"
@ -50,6 +49,10 @@
#include <transformations/init_node_info.hpp> #include <transformations/init_node_info.hpp>
#include <transformations/opset_conversions/convert_opset3_to_opset2.hpp> #include <transformations/opset_conversions/convert_opset3_to_opset2.hpp>
#include <transformations/opset_conversions/convert_opset2_to_opset1.hpp> #include <transformations/opset_conversions/convert_opset2_to_opset1.hpp>
#include <transformations/common_optimizations/fq_mul_fusion.hpp>
#include <transformations/common_optimizations/fq_reshape_fusion.hpp>
#include <transformations/common_optimizations/pull_transpose_through_fq.hpp>
#include <transformations/common_optimizations/relu_fake_quantize_fusion.hpp>
#if GNA_LIB_VER == 2 #if GNA_LIB_VER == 2
#include <gna2-model-api.h> #include <gna2-model-api.h>
@ -394,9 +397,9 @@ void GNAPlugin::UpdateInputScaleFromNetwork(InferenceEngine::CNNNetwork & networ
// search for FQ layers // search for FQ layers
// only supports cases of int16 or int8 // only supports cases of int16 or int8
InputsDataMap inputs = network.getInputsInfo(); InputsDataMap inputs = network.getInputsInfo();
size_t inputIdx = 0;
for (auto&& input : inputs) { for (auto&& input : inputs) {
auto data = input.second->getInputData(); auto data = input.second->getInputData();
size_t inputIdx = 0;
for (auto && nextToInputLayer : getInputTo(data)) { for (auto && nextToInputLayer : getInputTo(data)) {
if (!LayerInfo(nextToInputLayer.second).isFakeQuantize()) { if (!LayerInfo(nextToInputLayer.second).isFakeQuantize()) {
inputIdx++; inputIdx++;
@ -411,7 +414,16 @@ void GNAPlugin::UpdateInputScaleFromNetwork(InferenceEngine::CNNNetwork & networ
THROW_GNA_LAYER_EXCEPTION(nextToInputLayer.second) THROW_GNA_LAYER_EXCEPTION(nextToInputLayer.second)
<< "unsupported, per-channel quantization for input layer : " << input.second->name(); << "unsupported, per-channel quantization for input layer : " << input.second->name();
} }
auto fp32eq = [](float p1, float p2) -> bool {
return (std::abs(p1 - p2) <= 0.00001f * std::min(std::abs(p1), std::abs(p2)));
};
float scaleInput = (fqLayer.getLevels() - 1) / (inputRange.second[0] - inputRange.first[0]); float scaleInput = (fqLayer.getLevels() - 1) / (inputRange.second[0] - inputRange.first[0]);
auto minAbsVal = std::min(std::abs(inputRange.second[0]), std::abs(inputRange.first[0]));
auto maxAbsVal = std::max(std::abs(inputRange.second[0]), std::abs(inputRange.first[0]));
if (fp32eq(minAbsVal, 0.0f) && !fp32eq(maxAbsVal, 0.0f)) {
scaleInput = (fqLayer.getLevels() - 1) / (2 * maxAbsVal);
}
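The adjustment above treats a FakeQuantize range anchored at zero as if it were symmetric, spreading the levels across [-max, max]. A short numeric sketch of that rule (illustrative helper):

#include <algorithm>
#include <cmath>

// Illustrative: input scale factor derived from a FakeQuantize range.
// A range anchored at zero is spread over [-max, max], matching the code above.
static float InputScaleFromFQ(float rangeMin, float rangeMax, float levels) {
    float scale = (levels - 1) / (rangeMax - rangeMin);
    const float minAbs = std::min(std::abs(rangeMin), std::abs(rangeMax));
    const float maxAbs = std::max(std::abs(rangeMin), std::abs(rangeMax));
    if (minAbs == 0.0f && maxAbs != 0.0f) {
        scale = (levels - 1) / (2 * maxAbs);
    }
    return scale;
}
// Example: levels = 65536, range [0, 8] -> scale = 65535 / 16 ~ 4096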
if (!config.inputScaleFactors.empty()) { if (!config.inputScaleFactors.empty()) {
gnalog() << "Scale factor calculated during model quantization (" << scaleInput gnalog() << "Scale factor calculated during model quantization (" << scaleInput
@ -676,6 +688,68 @@ void GNAPlugin::ConvertModelLayoutFromNCHWToNHWC(const std::vector<CNNLayerPtr>
} }
} }
#ifdef PLOT
void GNAPlugin::AddDebugProperties(const InferenceEngine::CNNLayerPtr layer,
InferenceEngine::ordered_properties& printed_properties,
InferenceEngine::ordered_properties& node_properties) {
// printing quantized params
auto quantized = InferenceEngine::getInjectedData<QuantizedLayerParams>(layer);
if (!quantized) {
return;
}
if (LayerInfo(layer).isWeightable() || LayerInfo(layer).isEltwise()) {
printed_properties.emplace_back(
"weights scale factor", std::to_string(quantized->_weights_quant.GetScale()));
if (quantized->_weights_quant.IsStatsSet()) {
for (auto& min : quantized->_weights_quant.GetMinValues()) {
printed_properties.emplace_back(
"weights min val", std::to_string(min));
}
for (auto& max : quantized->_weights_quant.GetMaxValues()) {
printed_properties.emplace_back(
"weights max val", std::to_string(max));
}
}
if (quantized->_bias_quant.IsStatsSet()) {
for (auto& min : quantized->_bias_quant.GetMinValues()) {
printed_properties.emplace_back(
"bias min val", std::to_string(min));
}
for (auto& max : quantized->_bias_quant.GetMaxValues()) {
printed_properties.emplace_back(
"bias max val", std::to_string(max));
}
}
}
printed_properties.emplace_back(
"src scale factor", std::to_string(quantized->_src_quant.GetScale()));
if (quantized->_src_quant.IsStatsSet()) {
for (auto& min : quantized->_src_quant.GetMinValues()) {
printed_properties.emplace_back(
"src min val", std::to_string(min));
}
for (auto& max : quantized->_src_quant.GetMaxValues()) {
printed_properties.emplace_back(
"src max val", std::to_string(max));
}
}
printed_properties.emplace_back(
"dst scale factor", std::to_string(quantized->_dst_quant.GetScale()));
if (quantized->_dst_quant.IsStatsSet()) {
for (auto& min : quantized->_dst_quant.GetMinValues()) {
printed_properties.emplace_back(
"dst min val", std::to_string(min));
}
for (auto& max : quantized->_dst_quant.GetMaxValues()) {
printed_properties.emplace_back(
"dst max val", std::to_string(max));
}
}
}
#endif
void GNAPlugin::LoadNetwork(CNNNetwork & _network) { void GNAPlugin::LoadNetwork(CNNNetwork & _network) {
std::shared_ptr<InferenceEngine::details::CNNNetworkImpl> convertedNetwork; std::shared_ptr<InferenceEngine::details::CNNNetworkImpl> convertedNetwork;
if (_network.getFunction()) { if (_network.getFunction()) {
@ -698,6 +772,10 @@ void GNAPlugin::LoadNetwork(CNNNetwork & _network) {
// UnrollTI transformation is disabled by default, is turned on by LowLatency transformation // UnrollTI transformation is disabled by default, is turned on by LowLatency transformation
return node->get_rt_info().count("UNROLL_TI") == 0; return node->get_rt_info().count("UNROLL_TI") == 0;
}); });
pass_config->disable<ngraph::pass::FakeQuantizeMulFusion>();
pass_config->disable<ngraph::pass::FakeQuantizeReshapeFusion>();
pass_config->disable<ngraph::pass::PullTransposeThroughFQUp>();
pass_config->disable<ngraph::pass::ReluFakeQuantizeFusion>();
manager.run_passes(graph); manager.run_passes(graph);
convertedNetwork = InferenceEngine::details::convertFunctionToICNNNetwork(graph, clonedNetwork); convertedNetwork = InferenceEngine::details::convertFunctionToICNNNetwork(graph, clonedNetwork);
} }
@ -809,16 +887,10 @@ void GNAPlugin::LoadNetwork(CNNNetwork & _network) {
#ifdef PLOT #ifdef PLOT
std::ofstream file("gna_passes.dot"); std::ofstream file("gna_passes.dot");
saveGraphToDot(newNet, file, [](const CNNLayerPtr layer, saveGraphToDot(newNet, file, [this](const CNNLayerPtr layer,
ordered_properties& printed_properties, ordered_properties& printed_properties,
ordered_properties& node_properties) { ordered_properties& node_properties) {
// printing quantized params AddDebugProperties(layer, printed_properties, node_properties);
auto quantized = InferenceEngine::getInjectedData<QuantizedLayerParams>(layer);
if (!quantized) {
return;
}
printed_properties.emplace_back(
"scale factor", std::to_string(quantized->_dst_quant.GetScale()));
}); });
#endif #endif

View File

@ -23,6 +23,7 @@
#include "gna_plugin_policy.hpp" #include "gna_plugin_policy.hpp"
#include "gna_plugin_log.hpp" #include "gna_plugin_log.hpp"
#include "gna_plugin_config.hpp" #include "gna_plugin_config.hpp"
#include <legacy/ie_util_internal.hpp>
#if GNA_LIB_VER == 2 #if GNA_LIB_VER == 2
#include <gna2-model-api.h> #include <gna2-model-api.h>
@ -237,6 +238,11 @@ class GNAPlugin : public InferenceEngine::IInferencePlugin {
* @param layers model sorted layers * @param layers model sorted layers
*/ */
void ConvertModelLayoutFromNCHWToNHWC(const std::vector<InferenceEngine::CNNLayerPtr> &layers); void ConvertModelLayoutFromNCHWToNHWC(const std::vector<InferenceEngine::CNNLayerPtr> &layers);
#ifdef PLOT
void AddDebugProperties(const InferenceEngine::CNNLayerPtr layer,
InferenceEngine::ordered_properties& printed_properties,
InferenceEngine::ordered_properties& node_properties);
#endif
}; };
} // namespace GNAPluginNS } // namespace GNAPluginNS

View File

@ -156,6 +156,24 @@ void Config::UpdateFromMap(const std::map<std::string, std::string>& config) {
THROW_GNA_EXCEPTION << "GNA pwl uniform algorithm parameter " THROW_GNA_EXCEPTION << "GNA pwl uniform algorithm parameter "
<< "should be equal to YES/NO, but not" << value; << "should be equal to YES/NO, but not" << value;
} }
} else if (key == GNA_CONFIG_KEY(PWL_MAX_ERROR_PERCENT)) {
float max_error;
try {
max_error = InferenceEngine::CNNLayer::ie_parse_float(value);
if (max_error < 0.0f || max_error > 100.0f) {
throw std::out_of_range("");
}
}
catch (std::invalid_argument&) {
THROW_GNA_EXCEPTION << "Invalid value of PWL max error percent";
}
catch (std::out_of_range&) {
log << "Unsupported PWL error percent value: " << value
<< ", should be greater than 0 and less than 100";
THROW_GNA_EXCEPTION << "Unsupported PWL error percent value: " << value
<< ", should be greater than 0 and less than 100";
}
gnaFlags.pwlMaxErrorPercent = max_error;
} else if (key == CONFIG_KEY(PERF_COUNT)) { } else if (key == CONFIG_KEY(PERF_COUNT)) {
if (value == PluginConfigParams::YES) { if (value == PluginConfigParams::YES) {
gnaFlags.performance_counting = true; gnaFlags.performance_counting = true;
@ -252,6 +270,7 @@ void Config::AdjustKeyMapValues() {
keyConfigMap[GNA_CONFIG_KEY(PRECISION)] = gnaPrecision.name(); keyConfigMap[GNA_CONFIG_KEY(PRECISION)] = gnaPrecision.name();
keyConfigMap[GNA_CONFIG_KEY(PWL_UNIFORM_DESIGN)] = keyConfigMap[GNA_CONFIG_KEY(PWL_UNIFORM_DESIGN)] =
gnaFlags.uniformPwlDesign ? PluginConfigParams::YES: PluginConfigParams::NO; gnaFlags.uniformPwlDesign ? PluginConfigParams::YES: PluginConfigParams::NO;
keyConfigMap[GNA_CONFIG_KEY(PWL_MAX_ERROR_PERCENT)] = std::to_string(gnaFlags.pwlMaxErrorPercent);
keyConfigMap[CONFIG_KEY(PERF_COUNT)] = keyConfigMap[CONFIG_KEY(PERF_COUNT)] =
gnaFlags.performance_counting ? PluginConfigParams::YES: PluginConfigParams::NO; gnaFlags.performance_counting ? PluginConfigParams::YES: PluginConfigParams::NO;
keyConfigMap[GNA_CONFIG_KEY(LIB_N_THREADS)] = std::to_string(gnaFlags.gna_lib_async_threads_num); keyConfigMap[GNA_CONFIG_KEY(LIB_N_THREADS)] = std::to_string(gnaFlags.gna_lib_async_threads_num);
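As a usage note, the new key travels through the same string config map as the other GNA options. A hedged sketch of supplying it when loading a network; the calls follow the standard Inference Engine API, though the exact headers may differ between releases:

#include <map>
#include <string>

#include <ie_core.hpp>
#include <gna/gna_config.hpp>

// Hedged usage sketch: pass the PWL error budget alongside other GNA options
// when loading a network (the value must stay within [0, 100]).
void LoadWithPwlErrorBudget(InferenceEngine::Core& core,
                            const InferenceEngine::CNNNetwork& network) {
    std::map<std::string, std::string> config;
    config[GNA_CONFIG_KEY(PWL_MAX_ERROR_PERCENT)] = "1.0";
    auto executable = core.LoadNetwork(network, "GNA", config);
    (void)executable;
}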

View File

@ -29,7 +29,7 @@ class GNAFakeQuantizeLayer {
DnnActivation parseAsActivation() const { DnnActivation parseAsActivation() const {
DnnActivation fqActivation; DnnActivation fqActivation;
fqActivation.args.fakeQuantize.levels = fqLayer->GetParamAsInt("levels"); fqActivation.fqParams.levels = fqLayer->GetParamAsInt("levels");
auto inputShape = getShapeForRange(fqLayer, 1); auto inputShape = getShapeForRange(fqLayer, 1);
auto outputShape = getShapeForRange(fqLayer, 3); auto outputShape = getShapeForRange(fqLayer, 3);
@ -37,13 +37,15 @@ class GNAFakeQuantizeLayer {
auto inputRangeSize = InferenceEngine::details::product(inputShape.begin(), inputShape.end()); auto inputRangeSize = InferenceEngine::details::product(inputShape.begin(), inputShape.end());
auto outputRangeSize = InferenceEngine::details::product(outputShape.begin(), outputShape.end()); auto outputRangeSize = InferenceEngine::details::product(outputShape.begin(), outputShape.end());
fqActivation.args.fakeQuantize.inputPerChannel = inputRangeSize != 1; fqActivation.fqParams.set = true;
fqActivation.args.fakeQuantize.input_low = getParamFromInputAsFloats(fqLayer, 1);
fqActivation.args.fakeQuantize.input_high = getParamFromInputAsFloats(fqLayer, 2);
fqActivation.args.fakeQuantize.outputPerChannel = outputRangeSize != 1; fqActivation.fqParams.inputPerChannel = inputRangeSize != 1;
fqActivation.args.fakeQuantize.output_low = getParamFromInputAsFloats(fqLayer, 3); fqActivation.fqParams.input_low = getParamFromInputAsFloats(fqLayer, 1);
fqActivation.args.fakeQuantize.output_high = getParamFromInputAsFloats(fqLayer, 4); fqActivation.fqParams.input_high = getParamFromInputAsFloats(fqLayer, 2);
fqActivation.fqParams.outputPerChannel = outputRangeSize != 1;
fqActivation.fqParams.output_low = getParamFromInputAsFloats(fqLayer, 3);
fqActivation.fqParams.output_high = getParamFromInputAsFloats(fqLayer, 4);
fqActivation.type = kActFakeQuantize; fqActivation.type = kActFakeQuantize;
return fqActivation; return fqActivation;

View File

@ -103,7 +103,8 @@ class LayerInfo {
"neglog", "neglog",
"neghalflog", "neghalflog",
"softsign", "softsign",
"power"}; "power",
"fakequantize"};
if (isPower()) { if (isPower()) {
auto powerLayer = as<const InferenceEngine::PowerLayer*>(); auto powerLayer = as<const InferenceEngine::PowerLayer*>();
@ -157,7 +158,10 @@ class LayerInfo {
IS_VALID(); IS_VALID();
return nullptr != as<const InferenceEngine::ScaleShiftLayer*>(); return nullptr != as<const InferenceEngine::ScaleShiftLayer*>();
} }
bool isSyntheticScaleShift() const noexcept {
IS_VALID();
return layer->name.find("SyntheticScaleShift") != std::string::npos;
}
bool isEltwise() const noexcept { bool isEltwise() const noexcept {
IS_VALID(); IS_VALID();
return nullptr != as<const InferenceEngine::EltwiseLayer*>(); return nullptr != as<const InferenceEngine::EltwiseLayer*>();
@ -193,6 +197,18 @@ class LayerInfo {
bool isIdentity() const noexcept { bool isIdentity() const noexcept {
return isOfType("identity"); return isOfType("identity");
} }
bool isTanh() const noexcept {
return isOfType("tanh");
}
bool isSigmoid() const noexcept {
return isOfType("sigmoid");
}
bool isSoftSign() const noexcept {
return isOfType("softsign");
}
bool isClamp() const noexcept {
return isOfType("clamp");
}
bool isFullyConnected() const noexcept { bool isFullyConnected() const noexcept {
return isOfType("FullyConnected") || isOfType("InnerProduct"); return isOfType("FullyConnected") || isOfType("InnerProduct");
} }
@ -283,6 +299,9 @@ class LayerInfo {
bool isCopyDelayed() const noexcept { bool isCopyDelayed() const noexcept {
return isOfType(DelayedCopyLayerName); return isOfType(DelayedCopyLayerName);
} }
bool isWeightableIdentity() const noexcept {
return isConcatAlignFilter() || isSyntheticScaleShift() || isCropAffined();
}
size_t paddingSize() const { size_t paddingSize() const {
static InferenceEngine::details::caseless_set<std::string> layersWithPossiblePadding = {"FullyConnected", static InferenceEngine::details::caseless_set<std::string> layersWithPossiblePadding = {"FullyConnected",

View File

@ -39,6 +39,7 @@
#include "frontend/quantization.h" #include "frontend/quantization.h"
#include "gna_groups.hpp" #include "gna_groups.hpp"
#include "gna_graph_patterns.hpp" #include "gna_graph_patterns.hpp"
#include "gna_data_types.hpp"
using namespace InferenceEngine; using namespace InferenceEngine;
using namespace InferenceEngine::details; using namespace InferenceEngine::details;
@ -54,6 +55,10 @@ std::shared_ptr<IPassManager> BasePass::getPassManager() {
return sharedMgr; return sharedMgr;
} }
static bool fp32eq(float p1, float p2) {
return (std::abs(p1 - p2) <= 0.00001f * std::min(std::abs(p1), std::abs(p2)));
}
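A brief note on the helper above: the tolerance is relative to the smaller magnitude, so two values near zero only compare equal when they are exactly equal. An illustrative copy with example results:

#include <algorithm>
#include <cmath>

// Same comparison as the fp32eq helper above, reproduced for illustration.
static bool fp32eq_demo(float p1, float p2) {
    return (std::abs(p1 - p2) <= 0.00001f * std::min(std::abs(p1), std::abs(p2)));
}
// fp32eq_demo(10000.0f, 10000.05f) -> true  (relative difference ~5e-6)
// fp32eq_demo(0.0f, 1e-7f)         -> false (tolerance collapses to zero)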
// indexes stored in pass manager // indexes stored in pass manager
static const char identityLayersCounterName[] = "identityLayerCounter"; static const char identityLayersCounterName[] = "identityLayerCounter";
static const char diagonalLayersCounterName[] = "diagonalLayerCounter"; static const char diagonalLayersCounterName[] = "diagonalLayerCounter";
@ -1836,9 +1841,6 @@ void FuseFQIntoWeightsPass::run() {
weightableLayer->insData.resize(1); weightableLayer->insData.resize(1);
// 2. running FQ function for given layer // 2. running FQ function for given layer
if (weightDims.size() != 2) {
THROW_GNA_LAYER_EXCEPTION(fqLayer) << " layout of weigths not equal to NC not yet supported";
}
auto outputSize = details::product(weightDims.begin(), weightDims.end()); auto outputSize = details::product(weightDims.begin(), weightDims.end());
// depending on compute precision weights will be recreated // depending on compute precision weights will be recreated
@ -1874,61 +1876,42 @@ void FuseFQIntoWeightsPass::run() {
// check if // check if
// - weights were float values and need to be quantized, // - weights were float values and need to be quantized,
// - weights are integer values and quantization can be skipped // - weights are integer values and quantization can be skipped
for (size_t i = 0; i < outputRange.first.size(); ++i) { quantized->_weights_quant.SetMinValues(inputRange.first, true);
if (inputRange.first[i] > outputRange.first[i] || quantized->_weights_quant.SetMaxValues(inputRange.second, true);
inputRange.second[i] > outputRange.second[i]) { quantized->_weights_quant.SetMinValues(outputRange.first, false);
quantized->_weights_quantized = true; quantized->_weights_quant.SetMaxValues(outputRange.second, false);
break;
}
}
quantized->_weights_quant.SetMinValues(outputRange.first);
quantized->_weights_quant.SetMaxValues(outputRange.second);
quantized->_weights_quant.SetLevels(levels); quantized->_weights_quant.SetLevels(levels);
// let's find out the minimum scale factor among channels // let's find out the minimum scale factor among channels
if (quantized->_weights_quant.GetMinValues().empty()) { if (!quantized->_weights_quant.IsStatsSet()) {
THROW_GNA_LAYER_EXCEPTION(fqLayer) << " per channel/tensor weigths scales are missed"; THROW_GNA_LAYER_EXCEPTION(fqLayer) << " per channel/tensor weigths scales are missed";
} }
auto getScale = [&quantized](size_t i) {
return (quantized->_weights_quant.GetLevels() - 1) /
(quantized->_weights_quant.GetMaxValues()[i] - quantized->_weights_quant.GetMinValues()[i]);
};
float min_channel_scale = getScale(0);
for (uint32_t i = 1; i < quantized->_weights_quant.GetMinValues().size(); i++) {
min_channel_scale = std::min(min_channel_scale, getScale(i));
}
auto multiplier = 1.0f;
if (quantized->_weights_quant.GetLevels() <= std::numeric_limits<uint8_t>::max()) {
// GNA supports additional multiplier for only 8bit weights.
// The multipler is used to extend dynamic range.
multiplier = MAX_OUT_MULTIPLIER;
}
// Common weights scale calculation
quantized->_weights_quant.SetScale(min_channel_scale * multiplier);
continue; continue;
} }
size_t depth = 1;
intel_dnn_component_t component; intel_dnn_component_t component;
component.num_columns_in = weightDims[1]; component.num_columns_in = weightDims[1];
component.num_rows_in = weightDims[0]; component.num_rows_in = weightDims[0];
if (LayerInfo(weightableLayer).isConvolution()) {
depth = (weightDims.size() == 4)? weightDims[3]: 1;
}
intel_piecewiselinear_t *transform = reinterpret_cast<intel_piecewiselinear_t *>(&component.op.pwl); intel_piecewiselinear_t *transform = reinterpret_cast<intel_piecewiselinear_t *>(&component.op.pwl);
transform->func_id = gnaFakeQuantizeLayer.parseAsActivation(); transform->func_id = gnaFakeQuantizeLayer.parseAsActivation();
auto quantizedWeightsData = quantizedWeights->buffer(); auto quantizedWeightsData = quantizedWeights->buffer();
component.ptr_inputs = quantizedWeightsData.as<float*>();
auto dequantizedWeights = make_shared_blob<float>(TensorDesc(Precision::FP32, {outputSize}, Layout::C)); auto dequantizedWeights = make_shared_blob<float>(TensorDesc(Precision::FP32, {outputSize}, Layout::C));
dequantizedWeights->allocate(); dequantizedWeights->allocate();
auto resultBuffer = dequantizedWeights->buffer(); auto resultBuffer = dequantizedWeights->buffer();
component.ptr_outputs = resultBuffer.as<float*>(); for (size_t i = 0; i < depth; ++i) {
component.ptr_inputs = quantizedWeightsData.as<float*>() + i * component.num_columns_in * component.num_rows_in;
component.ptr_outputs = resultBuffer.as<float*>() + i * component.num_columns_in * component.num_rows_in;
PwlApply32(&component, 0, component.num_rows_in - 1, 0, component.num_columns_in - 1); PwlApply32(&component, 0, component.num_rows_in - 1, 0, component.num_columns_in - 1);
}
// 3. assign dequantized const blob to weightable layer // 3. assign dequantized const blob to weightable layer
assignWeightsAndBiases(weightableLayer, dequantizedWeights, biases); assignWeightsAndBiases(weightableLayer, dequantizedWeights, biases);
@ -1944,6 +1927,97 @@ void MoveFakeQuantizeLayerIntoQuantParamsPass :: run() {
auto donotSkip = [](CNNLayerPtr) { auto donotSkip = [](CNNLayerPtr) {
return false; return false;
}; };
auto allowFQFuse = [](CNNLayerPtr layer) -> bool {
auto doNotSkip = [](CNNLayerPtr layer) {
return false;
};
if (CNNNetGetAllNextLayersSkipCertain(layer, -1, doNotSkip).empty()) {
return false;
}
auto skipNonFunctional = [](CNNLayerPtr layer) {
return LayerInfo(layer).isNonFunctional();
};
auto prevLayer = CNNNetPrevLayerSkipCertain(layer, 0, skipNonFunctional);
if (LayerInfo(prevLayer).isActivation() || LayerInfo(prevLayer).isConst()) {
return true;
}
auto nextLayers = CNNNetGetAllNextLayersSkipCertain(layer, -1, skipNonFunctional);
for (auto& l : nextLayers) {
if (!LayerInfo(l).isActivation()) {
return false;
}
}
return true;
};
std::function<void(QuantizedLayerParams*, CNNLayerPtr)> propagateStatistics =
[&propagateStatistics](QuantizedLayerParams* srcQuantParams, CNNLayerPtr layer) {
if (LayerInfo(layer).isFakeQuantize()) {
return;
}
auto donotSkip = [](CNNLayerPtr) {
return false;
};
auto quantParams = InferenceEngine::getInjectedData<QuantizedLayerParams>(layer);
// Find all output layers connected to FQ
auto nextLayers = CNNNetGetAllNextLayersSkipCertain(layer.get(), -1, donotSkip);
if (nextLayers.empty()) {
quantParams->_src_quant.CopyStats(srcQuantParams->_dst_quant);
if (LayerInfo(layer).isNonFunctional()) {
quantParams->_dst_quant.CopyStats(srcQuantParams->_dst_quant);
}
return;
}
auto srcMinVals = srcQuantParams->_dst_quant.GetMinValues().front();
auto srcMaxVals = srcQuantParams->_dst_quant.GetMaxValues().front();
// If the next layer is a concat, find the minimum and maximum statistics
if (LayerInfo(layer).isConcat() && quantParams->_src_quant.IsStatsSet()) {
auto concatMinVal = quantParams->_src_quant.GetMinValues().front();
auto concatMaxVal = quantParams->_src_quant.GetMaxValues().front();
quantParams->_src_quant.SetMinValues({ std::min(srcMinVals, concatMinVal) });
quantParams->_src_quant.SetMaxValues({ std::max(srcMaxVals, concatMaxVal) });
} else if (quantParams->_src_quant.IsStatsSet()) {
return;
} else {
quantParams->_src_quant.CopyStats(srcQuantParams->_dst_quant);
}
if (!LayerInfo(layer).isWeightable() && !LayerInfo(layer).isEltwise() &&
!LayerInfo(layer).isActivation() && !LayerInfo(layer).isFakeQuantize()) {
auto doNotSetDstStats = false;
for (auto& l : nextLayers) {
if (LayerInfo(l).isFakeQuantize()) {
doNotSetDstStats = true;
continue;
}
}
if (doNotSetDstStats) {
return;
}
quantParams->_dst_quant.CopyStats(quantParams->_src_quant);
for (auto& l : nextLayers) {
if (LayerInfo(l).isFakeQuantize()) {
continue;
}
propagateStatistics(quantParams, l);
}
}
};
for (auto &&l : *pLayers) {
if (!LayerInfo(l).isFakeQuantize()) {
continue;
@ -1956,28 +2030,56 @@ void MoveFakeQuantizeLayerIntoQuantParamsPass :: run() {
auto inputRange = fqLayer.getInputRange();
auto outputRange = fqLayer.getOutputRange();
if (inputRange.first.size() != 1 || inputRange.second.size() != 1 ||
outputRange.first.size() != 1 || outputRange.second.size() != 1) {
THROW_GNA_LAYER_EXCEPTION(fqLayer) << " unsupported per-channel quantisation";
}
if (!LayerInfo(prevLayer).isConst() &&
!fp32eq(inputRange.first.front(), outputRange.first.front()) &&
!fp32eq(inputRange.second.front(), outputRange.second.front())) {
THROW_GNA_LAYER_EXCEPTION(fqLayer) << " unsupported data range conversion. Input: (" <<
inputRange.first.front() << "," << inputRange.second.front() << "), output: (" <<
outputRange.first.front() << "," << outputRange.second.front() << ")";
}
float fqLevels = fqLayer.getLevels();
// Before FQ layer is removed, the previous layer has to be updated with its quantization data
auto quantParamsPrevLayer = InferenceEngine::getInjectedData<QuantizedLayerParams>(prevLayer);
quantParamsPrevLayer->_dst_quant.SetLevels(fqLevels);
quantParamsPrevLayer->_dst_quant.SetMinValues({ inputRange.first[0] }, true);
quantParamsPrevLayer->_dst_quant.SetMaxValues({ inputRange.second[0] }, true);
quantParamsPrevLayer->_dst_quant.SetMinValues({ outputRange.first[0] }, false);
quantParamsPrevLayer->_dst_quant.SetMaxValues({ outputRange.second[0] }, false);
auto fqQuantParams = InferenceEngine::getInjectedData<QuantizedLayerParams>(l);
fqQuantParams->_dst_quant.SetLevels(fqLevels);
fqQuantParams->_dst_quant.SetMinValues({ inputRange.first[0] }, true);
fqQuantParams->_dst_quant.SetMaxValues({ inputRange.second[0] }, true);
fqQuantParams->_dst_quant.SetMinValues({ outputRange.first[0] }, false);
fqQuantParams->_dst_quant.SetMaxValues({ outputRange.second[0] }, false);
fqQuantParams->_src_quant = fqQuantParams->_dst_quant;
l->insData.resize(1);
if (!CNNNetHasPrevLayer(prevLayer.get())) {
quantParamsPrevLayer->_src_quant = quantParamsPrevLayer->_dst_quant;
}
// allowFQFuse checks whether the FQ layer can be fused with a layer before or after it.
// The FQ layer is fused only when the previous layer is a Const or activation layer,
// or when all of its next layers are activation layers.
bool isFQFuseAllowed = allowFQFuse(l);
auto prevData = prevLayer->outData.front();
// Find all output layers connected to FQ
auto nextLayers = CNNNetGetAllNextLayersSkipCertain(*fqLayer, -1, donotSkip);
if (nextLayers.empty()) {
return;
}
if (isFQFuseAllowed) {
getInputTo(prevLayer->outData.front()).clear();
} }
// Connect all next layers after FQ to the layer that is before FQ
@ -1989,16 +2091,12 @@ void MoveFakeQuantizeLayerIntoQuantParamsPass :: run() {
<< LAYER_NAME(nextLayers[i]) << " is not correct";
}
if (isFQFuseAllowed) {
nextLayers[i]->insData[insDatas.front()] = prevData;
getInputTo(prevLayer->outData.front())[nextLayers[i]->name] = nextLayers[i];
}
propagateStatistics(quantParamsPrevLayer, nextLayers[i]);
}
}
}
@ -2013,7 +2111,9 @@ int PassManager::run(int index) {
ordered_properties &printed_properties,
ordered_properties &node_properties) {});
#endif
#ifdef ENABLE_V7_SERIALIZE
network.serialize(name + ".xml", name + ".bin");
#endif
};
#else
auto dumpNetworkAfterPass = [] (std::shared_ptr<Pass> ) {};

View File

@ -499,22 +499,41 @@ std::vector<pwl_t> pwl_search(const DnnActivation& activation_type,
void PwlDesignOpt16(const DnnActivation activation_type,
std::vector<gna_pwl_segment_t> &ptr_segment,
const float scale_in,
const float scale_out,
const float pwlMaxErrorPercent) {
std::vector<pwl_t> pwl;
double err_pct = 0.0;
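// When FakeQuantize statistics are available, they are used below to narrow the PWL approximation domain (widened by a 25% margin).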
auto minInputStats = 0.0f;
auto maxInputStats = 0.0f;
if (activation_type.srcFQParams.set) {
minInputStats = std::min(*activation_type.srcFQParams.input_low, *activation_type.srcFQParams.input_high) * 1.25f;
maxInputStats = std::max(*activation_type.srcFQParams.input_low, *activation_type.srcFQParams.input_high) * 1.25f;
}
switch (activation_type) {
case kActSigmoid: {
auto absMax = std::max(std::abs(minInputStats), std::abs(maxInputStats));
auto minInput = (activation_type.srcFQParams.set && absMax < SIGMOID_DOMAIN) ? -absMax : -SIGMOID_DOMAIN;
auto maxInput = (activation_type.srcFQParams.set && absMax < SIGMOID_DOMAIN) ? absMax : SIGMOID_DOMAIN;
pwl = pwl_search(activation_type, minInput, maxInput, PWL_DESIGN_THRESHOLD, pwlMaxErrorPercent, PWL_DESIGN_SAMPLES, err_pct);
make_gna_pwl(activation_type, pwl, minInput, maxInput, scale_in, scale_out, ptr_segment);
break;
}
case kActTanh: {
auto absMax = std::max(std::abs(minInputStats), std::abs(maxInputStats));
auto minInput = (activation_type.srcFQParams.set && absMax < TANH_DOMAIN) ? -absMax : -TANH_DOMAIN;
auto maxInput = (activation_type.srcFQParams.set && absMax < TANH_DOMAIN) ? absMax : TANH_DOMAIN;
pwl = pwl_search(activation_type, minInput, maxInput, PWL_DESIGN_THRESHOLD, pwlMaxErrorPercent, PWL_DESIGN_SAMPLES, err_pct);
make_gna_pwl(activation_type, pwl, minInput, maxInput, scale_in, scale_out, ptr_segment);
break;
}
case kActSoftSign: {
auto absMax = std::max(std::abs(minInputStats), std::abs(maxInputStats));
auto minInput = (activation_type.srcFQParams.set && absMax < SOFTSIGN_DOMAIN) ? -absMax : -SOFTSIGN_DOMAIN;
auto maxInput = (activation_type.srcFQParams.set && absMax < SOFTSIGN_DOMAIN) ? absMax : SOFTSIGN_DOMAIN;
pwl = pwl_search(activation_type, minInput, maxInput, PWL_DESIGN_THRESHOLD, pwlMaxErrorPercent, PWL_DESIGN_SAMPLES, err_pct);
make_gna_pwl(activation_type, pwl, minInput, maxInput, scale_in, scale_out, ptr_segment);
break;
}
case kActRelu:
make_gna_pwl(activation_type, pwl, -1.0, 1.0, scale_in, scale_out, ptr_segment);
break;
@ -522,6 +541,7 @@ void PwlDesignOpt16(const DnnActivation activation_type,
make_gna_pwl(activation_type, pwl, -1.0, 1.0, scale_in, scale_out, ptr_segment);
break;
case kActIdentity:
case kActFakeQuantize:
make_gna_pwl(activation_type, pwl, -1.0, 1.0, scale_in, scale_out, ptr_segment);
break;
case kActKaldiLstmClipping:
@ -530,28 +550,28 @@ void PwlDesignOpt16(const DnnActivation activation_type,
case kActLog: {
double x_min = (1 + ~XBASEMASK) / scale_in;
double x_max = ((INT32_MAX / scale_in) < LOG_DOMAIN) ? (INT32_MAX / scale_in) : LOG_DOMAIN;
pwl = pwl_search(activation_type, x_min, x_max, PWL_DESIGN_THRESHOLD, pwlMaxErrorPercent, PWL_DESIGN_SAMPLES, err_pct);
make_gna_pwl(activation_type, pwl, x_min, x_max, scale_in, scale_out, ptr_segment);
break;
}
case kActNegLog: {
double x_min = (1 + ~XBASEMASK) / scale_in;
double x_max = ((INT32_MAX / scale_in) < LOG_DOMAIN) ? (INT32_MAX / scale_in) : LOG_DOMAIN;
pwl = pwl_search(activation_type, x_min, x_max, PWL_DESIGN_THRESHOLD, pwlMaxErrorPercent, PWL_DESIGN_SAMPLES, err_pct);
make_gna_pwl(activation_type, pwl, x_min, x_max, scale_in, scale_out, ptr_segment);
break;
}
case kActNegHalfLog: {
double x_min = (1 + ~XBASEMASK) / scale_in;
double x_max = ((INT32_MAX / scale_in) < LOG_DOMAIN) ? (INT32_MAX / scale_in) : LOG_DOMAIN;
pwl = pwl_search(activation_type, x_min, x_max, PWL_DESIGN_THRESHOLD, pwlMaxErrorPercent, PWL_DESIGN_SAMPLES, err_pct);
make_gna_pwl(activation_type, pwl, x_min, x_max, scale_in, scale_out, ptr_segment);
break;
}
case kActExp: {
double x_min = -log(scale_out);
double x_max = x_min + log(INT16_MAX);
pwl = pwl_search(activation_type, x_min, x_max, PWL_DESIGN_THRESHOLD, pwlMaxErrorPercent, PWL_DESIGN_SAMPLES, err_pct);
make_gna_pwl(activation_type, pwl, x_min, x_max, scale_in, scale_out, ptr_segment);
break;
}
@ -576,7 +596,8 @@ void PwlDesignOpt16(const DnnActivation activation_type,
x_max = std::min(x_max, POW_DOMAIN);
if (activation_type.args.pow.exponent != 0.0f && activation_type.args.pow.exponent != 1.0f) {
auto maxError = pwlMaxErrorPercent > 0.015f ? 0.015f : pwlMaxErrorPercent;
pwl = pwl_search(activation_type, x_min, x_max, PWL_DESIGN_THRESHOLD, maxError, PWL_DESIGN_SAMPLES, err_pct);
}
make_gna_pwl(activation_type, pwl, x_min, x_max, scale_in, scale_out, ptr_segment);
@ -980,15 +1001,14 @@ void PwlApply32(intel_dnn_component_t *component,
break;
case kActKaldiLstmClipping: {
float upper_limit = component->op.pwl.func_id.args.clamp.high;
float lower_limit = component->op.pwl.func_id.args.clamp.low;
for (uint32_t i = num_row_start; i <= num_row_end; i++) {
for (uint32_t j = num_col_start; j <= num_col_end; j++) {
float val = ptr_in[i * num_columns + j];
if (val > upper_limit) {
ptr_out[i * num_columns + j] = upper_limit;
} else if (val < lower_limit) {
ptr_out[i * num_columns + j] = lower_limit;
} else {
ptr_out[i * num_columns + j] = val;
}
@ -1050,32 +1070,36 @@ void PwlApply32(intel_dnn_component_t *component,
}
break;
case kActFakeQuantize: {
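// With clamping enabled (the default), the full FakeQuantize formula with saturation is applied below;
// otherwise values are only rescaled by the ratio of the output and input ranges.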
bool clamping = true;
double levels = transform->func_id.fqParams.levels;
for (uint32_t i = num_row_start; i <= num_row_end; i++) {
auto inputChannel = transform->func_id.fqParams.inputPerChannel ? i : 0;
auto outputChannel = transform->func_id.fqParams.outputPerChannel ? i : 0;
double input_low = transform->func_id.fqParams.input_low[inputChannel];
double input_high = transform->func_id.fqParams.input_high[inputChannel];
double output_low = transform->func_id.fqParams.output_low[outputChannel];
double output_high = transform->func_id.fqParams.output_high[outputChannel];
auto scaleInput = (levels - 1) / (input_high - input_low);
auto scaleOutput = (levels - 1) / (output_high - output_low);
for (uint32_t j = num_col_start; j <= num_col_end; j++) {
auto offset = i * num_columns + j;
auto x = ptr_in[offset];
if (!clamping) {
ptr_out[offset] = ptr_in[offset] * scaleInput / scaleOutput;
continue;
}
if (x <= std::min(input_low, input_high)) {
ptr_out[offset] = output_low;
} else if (x > std::max(input_low, input_high)) {
ptr_out[offset] = output_high;
} else {
ptr_out[offset] = nearbyint((x - input_low) / (input_high - input_low) * (levels - 1)) /
(levels - 1) * (output_high - output_low) + output_low;
}
}
}
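For reference, the rewritten kActFakeQuantize branch above follows the standard FakeQuantize formula. A minimal standalone C++ sketch of that formula (a hypothetical helper, not part of the plugin sources) could look like this:

#include <algorithm>
#include <cmath>

// Reference FakeQuantize: saturate to the input range, snap to one of `levels`
// grid points, then rescale into the output range.
static float fake_quantize(float x, float in_lo, float in_hi,
                           float out_lo, float out_hi, float levels) {
    if (x <= std::min(in_lo, in_hi)) return out_lo;
    if (x > std::max(in_lo, in_hi)) return out_hi;
    return std::nearbyint((x - in_lo) / (in_hi - in_lo) * (levels - 1)) /
           (levels - 1) * (out_hi - out_lo) + out_lo;
}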

View File

@ -103,4 +103,5 @@ void PwlDesign16(const DnnActivation activation_type,
void PwlDesignOpt16(const DnnActivation activation_type,
std::vector<gna_pwl_segment_t> &ptr_segment,
const float scale_in,
const float scale_out,
const float pwlMaxErrorPercent);

View File

@ -0,0 +1,134 @@
// Copyright (C) 2020 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include <vector>
#include <memory>
#include <tuple>
#include <vector>
#include <string>
#include <ie_core.hpp>
#include "common_test_utils/common_utils.hpp"
#include "functional_test_utils/plugin_cache.hpp"
#include "shared_test_classes/base/layer_test_utils.hpp"
#include "functional_test_utils/blob_utils.hpp"
#include "ngraph_functions/utils/ngraph_helpers.hpp"
#include "ngraph_functions/builders.hpp"
#include "ngraph_functions/pass/convert_prc.hpp"
typedef std::tuple<
InferenceEngine::Precision, // Network Precision
std::string, // Target Device
std::map<std::string, std::string>, // Configuration
std::vector<size_t>, // Input Shape
std::pair<float, float>, // Input Min and Max
size_t // Levels
> fqActivationParams;
namespace LayerTestsDefinitions {
class FQActivation : public testing::WithParamInterface<fqActivationParams>,
public LayerTestsUtils::LayerTestsCommon {
float inputDataMin = 0.0f;
float inputDataMax = 0.0f;
float inputDataResolution = 1.0f;
public:
static std::string getTestCaseName(testing::TestParamInfo<fqActivationParams> obj) {
InferenceEngine::Precision netPrecision;
std::string targetDevice;
std::map<std::string, std::string> configuration;
std::vector<size_t> inputShape;
std::pair<float, float> inputMinMax;
size_t levels = 0;
std::tie(netPrecision, targetDevice, configuration, inputShape, inputMinMax, levels) = obj.param;
std::ostringstream result;
result << "netPRC=" << netPrecision.name() << "_";
result << "targetDevice=" << targetDevice << "_";
for (auto const& configItem : configuration) {
result << "_configItem=" << configItem.first << "_" << configItem.second;
}
result << "_inputShape=" << CommonTestUtils::vec2str(inputShape);
result << "_inputMinMax=(" << inputMinMax.first << ".." << inputMinMax.second << ")";
result << "_levels=" << levels;
return result.str();
}
InferenceEngine::Blob::Ptr GenerateInput(const InferenceEngine::InputInfo& info) const {
return FuncTestUtils::createAndFillBlob(info.getTensorDesc(), inputDataMax - inputDataMin, inputDataMin, 1 / inputDataResolution);
}
protected:
void SetUp() override {
InferenceEngine::Precision netPrecision;
std::vector<size_t> inputShape;
std::pair<float, float> inputMinMax;
size_t levels = 0;
std::tie(netPrecision, targetDevice, configuration, inputShape, inputMinMax, levels) = this->GetParam();
auto ngPrc = FuncTestUtils::PrecisionUtils::convertIE2nGraphPrc(netPrecision);
auto inputLowNode = ngraph::builder::makeConstant<float>(ngPrc, { 1 }, { inputMinMax.first });
auto inputHighNode = ngraph::builder::makeConstant<float>(ngPrc, { 1 }, { inputMinMax.second });
auto inputVector = ngraph::builder::makeParams(ngPrc, { inputShape });
auto inputFQNode = std::make_shared<ngraph::opset1::FakeQuantize>(inputVector[0],
inputLowNode, inputHighNode, inputLowNode, inputHighNode, levels);
auto relu = ngraph::builder::makeActivation(inputFQNode, ngraph::element::f32, ngraph::helpers::ActivationTypes::Relu);
auto reluFQNode = std::make_shared<ngraph::opset1::FakeQuantize>(relu,
inputLowNode, inputHighNode, inputLowNode, inputHighNode, levels);
ngraph::ResultVector results{ std::make_shared<ngraph::opset1::Result>(reluFQNode) };
function = std::make_shared<ngraph::Function>(results, inputVector, "FQActivation");
}
};
TEST_P(FQActivation, CompareWithRefImpl) {
Run();
};
const std::vector<InferenceEngine::Precision> netPrecisions = {
InferenceEngine::Precision::FP32,
InferenceEngine::Precision::FP16
};
const std::vector<std::map<std::string, std::string>> configs = {
{
{"GNA_DEVICE_MODE", "GNA_SW_EXACT"},
}
};
const std::vector<std::vector<size_t>> inputShape = {
{1, 1024},
};
const std::vector<std::pair<float, float>> inputMinMax = {
{-0.5, 0.5},
{-2, 2},
{-8, 8},
{-16, 16},
{-50, 50},
{-100, 100},
};
const std::vector<size_t> levels = {
65535,
};
INSTANTIATE_TEST_CASE_P(smoke_fq_activation, FQActivation,
::testing::Combine(
::testing::ValuesIn(netPrecisions),
::testing::Values(CommonTestUtils::DEVICE_GNA),
::testing::ValuesIn(configs),
::testing::ValuesIn(inputShape),
::testing::ValuesIn(inputMinMax),
::testing::ValuesIn(levels)),
FQActivation::getTestCaseName);
} // namespace LayerTestsDefinitions

View File

@ -20,6 +20,7 @@ const std::map<std::string, std::string> supportedConfigKeysWithDefaults = {
{CONFIG_KEY(EXCLUSIVE_ASYNC_REQUESTS), CONFIG_VALUE(NO)},
{GNA_CONFIG_KEY(PRECISION), Precision(Precision::I16).name()},
{GNA_CONFIG_KEY(PWL_UNIFORM_DESIGN), CONFIG_VALUE(NO)},
{GNA_CONFIG_KEY(PWL_MAX_ERROR_PERCENT), "1.000000"},
{CONFIG_KEY(PERF_COUNT), CONFIG_VALUE(NO)},
{GNA_CONFIG_KEY(LIB_N_THREADS), "1"},
{CONFIG_KEY(SINGLE_THREAD), CONFIG_VALUE(YES)}
@ -153,6 +154,17 @@ TEST_F(GNAPluginConfigTest, GnaConfigPwlUniformDesignTest) {
config.gnaFlags.uniformPwlDesign);
}
TEST_F(GNAPluginConfigTest, GnaConfigPwlMaxErrorPercentTest) {
SetAndCompare(GNA_CONFIG_KEY(PWL_MAX_ERROR_PERCENT), std::string("0.100000"));
EXPECT_FLOAT_EQ(config.gnaFlags.pwlMaxErrorPercent, 0.1f);
SetAndCompare(GNA_CONFIG_KEY(PWL_MAX_ERROR_PERCENT), std::string("1.000000"));
EXPECT_FLOAT_EQ(config.gnaFlags.pwlMaxErrorPercent, 1);
SetAndCompare(GNA_CONFIG_KEY(PWL_MAX_ERROR_PERCENT), std::string("5.000000"));
EXPECT_FLOAT_EQ(config.gnaFlags.pwlMaxErrorPercent, 5);
ExpectThrow(GNA_CONFIG_KEY(PWL_MAX_ERROR_PERCENT), "-1");
ExpectThrow(GNA_CONFIG_KEY(PWL_MAX_ERROR_PERCENT), "100.1");
}
TEST_F(GNAPluginConfigTest, GnaConfigPerfCountTest) {
SetAndCheckFlag(CONFIG_KEY(PERF_COUNT),
config.gnaFlags.performance_counting);
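As a usage note, an application could pass the new PWL_MAX_ERROR_PERCENT key through the regular plugin configuration when loading a network on GNA. A minimal sketch (assuming an already-built CNNNetwork named `network`; the 2.0 value is only an illustrative choice):

#include <gna/gna_config.hpp>
#include <ie_core.hpp>

// Sketch only: relax the maximum PWL approximation error to 2%,
// which typically trades accuracy for fewer PWL segments.
InferenceEngine::Core ie;
auto execNet = ie.LoadNetwork(network, "GNA", {
    {GNA_CONFIG_KEY(PWL_MAX_ERROR_PERCENT), "2.0"},
    {"GNA_DEVICE_MODE", "GNA_SW_EXACT"},
});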