diff --git a/inference-engine/include/gna/gna_config.hpp b/inference-engine/include/gna/gna_config.hpp
index fd7dd701bdd..41e287832a9 100644
--- a/inference-engine/include/gna/gna_config.hpp
+++ b/inference-engine/include/gna/gna_config.hpp
@@ -92,6 +92,13 @@ DECLARE_GNA_CONFIG_KEY(COMPACT_MODE);
 */
 DECLARE_GNA_CONFIG_KEY(PWL_UNIFORM_DESIGN);
 
+/**
+* @brief The option allows specifying the maximum error percent that the algorithm
+* searching for optimal PWL functions may introduce.
+* If no value is set, 1.0 is used by default.
+*/
+DECLARE_GNA_CONFIG_KEY(PWL_MAX_ERROR_PERCENT);
+
 /**
 * @brief By default, the GNA plugin uses one worker thread for inference computations.
 * This parameter allows you to create up to 127 threads for software modes.
diff --git a/inference-engine/samples/speech_sample/main.cpp b/inference-engine/samples/speech_sample/main.cpp
index 8f3b3b5cd15..e117ca79da9 100644
--- a/inference-engine/samples/speech_sample/main.cpp
+++ b/inference-engine/samples/speech_sample/main.cpp
@@ -519,6 +519,10 @@ bool ParseAndCheckCommandLine(int argc, char *argv[]) {
         throw std::logic_error("Invalid value for 'cw_l' argument. It must be greater than or equal to 0");
     }
 
+    if (FLAGS_pwl_me < 0.0 || FLAGS_pwl_me > 100.0) {
+        throw std::logic_error("Invalid value for 'pwl_me' argument. It must be greater than or equal to 0.0 and less than or equal to 100.0");
+    }
+
     return true;
 }
 
@@ -671,6 +675,7 @@ int main(int argc, char *argv[]) {
     gnaPluginConfig[GNAConfigParams::KEY_GNA_LIB_N_THREADS] = std::to_string((FLAGS_cw_r > 0 || FLAGS_cw_l > 0) ? 1 : FLAGS_nthreads);
     gnaPluginConfig[GNA_CONFIG_KEY(COMPACT_MODE)] = CONFIG_VALUE(NO);
+    gnaPluginConfig[GNA_CONFIG_KEY(PWL_MAX_ERROR_PERCENT)] = std::to_string(FLAGS_pwl_me);
     // -----------------------------------------------------------------------------------------------------
 
     // --------------------------- 5. Write model to file --------------------------------------------------
diff --git a/inference-engine/samples/speech_sample/speech_sample.hpp b/inference-engine/samples/speech_sample/speech_sample.hpp
index 5fc905ae689..1409d557d60 100644
--- a/inference-engine/samples/speech_sample/speech_sample.hpp
+++ b/inference-engine/samples/speech_sample/speech_sample.hpp
@@ -91,6 +91,10 @@ static const char input_layer_names_message[] = "Optional. Layer names for input
                                                 "The names are separated with \",\" " \
                                                 "Example: Input1,Input2 ";
 
+/// @brief message for PWL max error percent
+static const char pwl_max_error_percent_message[] = "Optional. The maximum percent of error for PWL function. " \
+                                                    "The value must be in <0, 100> range. The default value is 1.0.";
+
 /// \brief Define flag for showing help message
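For reference, the new option is not limited to the speech sample: a minimal sketch of enabling it from any Inference Engine application is shown below. The model path "model.xml" and the 2.0 error value are illustrative placeholders, not part of this patch.

    #include <inference_engine.hpp>
    #include <gna/gna_config.hpp>

    int main() {
        InferenceEngine::Core ie;
        auto network = ie.ReadNetwork("model.xml");  // placeholder model path

        // Allow the PWL approximation to deviate by up to 2% instead of the default 1%
        std::map<std::string, std::string> config = {
            { GNA_CONFIG_KEY(PWL_MAX_ERROR_PERCENT), "2.0" }
        };
        auto executableNetwork = ie.LoadNetwork(network, "GNA", config);
        return 0;
    }

The value is passed as a string, matching how the sample forwards FLAGS_pwl_me via std::to_string.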
DEFINE_bool(h, false, help_message); @@ -161,6 +165,9 @@ DEFINE_string(oname, "", output_layer_names_message); /// @brief Input layer name DEFINE_string(iname, "", input_layer_names_message); +/// @brief PWL max error percent +DEFINE_double(pwl_me, 1.0, pwl_max_error_percent_message); + /** * \brief This function show a help message */ @@ -191,5 +198,6 @@ static void showUsage() { std::cout << " -cw_r \"\" " << context_window_message_r << std::endl; std::cout << " -oname \"\" " << output_layer_names_message << std::endl; std::cout << " -iname \"\" " << input_layer_names_message << std::endl; + std::cout << " -pwl_me \"\" " << pwl_max_error_percent_message << std::endl; } diff --git a/inference-engine/src/gna_plugin/backend/am_intel_dnn.cpp b/inference-engine/src/gna_plugin/backend/am_intel_dnn.cpp index 0f641e5473f..4a758649e94 100644 --- a/inference-engine/src/gna_plugin/backend/am_intel_dnn.cpp +++ b/inference-engine/src/gna_plugin/backend/am_intel_dnn.cpp @@ -1243,15 +1243,15 @@ void GNAPluginNS::backend::AMIntelDNN::WriteDnnText(const char *filename, intel_ break; case kActFakeQuantize : out_file << " " << - std::dec << component[i].op.pwl.func_id.args.fakeQuantize.levels << "\n"; + std::dec << component[i].op.pwl.func_id.fqParams.levels << "\n"; out_file << " " << - std::setprecision(12) << std::scientific << component[i].op.pwl.func_id.args.fakeQuantize.input_low << "\n"; + std::setprecision(12) << std::scientific << component[i].op.pwl.func_id.fqParams.input_low << "\n"; out_file << " " << - std::setprecision(12) << std::scientific << component[i].op.pwl.func_id.args.fakeQuantize.input_high << "\n"; + std::setprecision(12) << std::scientific << component[i].op.pwl.func_id.fqParams.input_high << "\n"; out_file << " " << - std::setprecision(12) << std::scientific << component[i].op.pwl.func_id.args.fakeQuantize.output_low << "\n"; + std::setprecision(12) << std::scientific << component[i].op.pwl.func_id.fqParams.output_low << "\n"; out_file << " " << - std::setprecision(12) << std::scientific << component[i].op.pwl.func_id.args.fakeQuantize.output_high << "\n"; + std::setprecision(12) << std::scientific << component[i].op.pwl.func_id.fqParams.output_high << "\n"; break; default: break; diff --git a/inference-engine/src/gna_plugin/backend/dnn_types.h b/inference-engine/src/gna_plugin/backend/dnn_types.h index 341f890490e..ea0b5a1e399 100644 --- a/inference-engine/src/gna_plugin/backend/dnn_types.h +++ b/inference-engine/src/gna_plugin/backend/dnn_types.h @@ -34,9 +34,25 @@ enum DnnActivationType : uint8_t { kActNumType }; +struct FakeQuantizeParams { + int8_t set; + int32_t levels; + // if input is per-channel quantization - input pointers contains per-channel ranges + int8_t inputPerChannel; + float* input_low; + float* input_high; + // if output is per-channel quantization - output pointers contains per-channel ranges + int8_t outputPerChannel; + float* output_low; + float* output_high; +}; + struct DnnActivation { // for prelu DnnActivationType type; + FakeQuantizeParams fqParams; + FakeQuantizeParams srcFQParams; + union { struct { float negative_slope; @@ -50,17 +66,6 @@ struct DnnActivation { float low; float high; } clamp; - struct { - int32_t levels; - // if input is per-channel quantization - input pointers contains per-channel ranges - int8_t inputPerChannel; - float *input_low; - float *input_high; - // if output is per-channel quantization - output pointers contains per-channel ranges - int8_t outputPerChannel; - float *output_low; - float *output_high; - } fakeQuantize; } 
args; operator DnnActivationType () const noexcept { return type; diff --git a/inference-engine/src/gna_plugin/backend/make_pwl.cpp b/inference-engine/src/gna_plugin/backend/make_pwl.cpp index f7f34d33270..ddf73975b88 100644 --- a/inference-engine/src/gna_plugin/backend/make_pwl.cpp +++ b/inference-engine/src/gna_plugin/backend/make_pwl.cpp @@ -34,15 +34,20 @@ void make_gna_pwl(const DnnActivation fun, gna_pwl[0].xBase = static_cast (INT32_MIN & XBASEMASK); // zero out the 2 lsb if (fun == kActSigmoid) { gnalog() << "=========================== Sigmoid Segments ===========================\n"; - gna_pwl[0].yBase = gna_pwl[1].yBase = 0; + auto minVal = fun.fqParams.set? FLOAT_TO_INT16(*fun.fqParams.input_low * out_scale): 0; + gna_pwl[0].yBase = gna_pwl[1].yBase = minVal; gna_pwl[1].xBase = (static_cast (in_scale * (-pwl[0].b / pwl[0].m))) & XBASEMASK; } else if (fun == kActTanh) { gnalog() << "=========================== Tanh Segments ===========================\n"; - gna_pwl[0].yBase = gna_pwl[1].yBase = static_cast(-1.0 * out_scale); + auto minVal = fun.fqParams.set ? FLOAT_TO_INT16(*fun.fqParams.input_low * out_scale) : + static_cast(-1.0 * out_scale); + gna_pwl[0].yBase = gna_pwl[1].yBase = minVal; gna_pwl[1].xBase = (static_cast (in_scale * (-1.0 - pwl[0].b) / pwl[0].m)) & XBASEMASK; } else { gnalog() << "=========================== SoftSign Segments ===========================\n"; - gna_pwl[0].yBase = gna_pwl[1].yBase = static_cast(-1.0 * out_scale); + auto minVal = fun.fqParams.set ? FLOAT_TO_INT16(*fun.fqParams.input_low * out_scale) : + static_cast(-1.0 * out_scale); + gna_pwl[0].yBase = gna_pwl[1].yBase = minVal; gna_pwl[1].xBase = (static_cast (in_scale * (-1.0 - pwl[0].b) / pwl[0].m)) & XBASEMASK; } gna_pwl[0].slope = 0; @@ -74,9 +79,10 @@ void make_gna_pwl(const DnnActivation fun, << "\n"; } // insert extra segment for xvalues > u_bound + auto maxVal = fun.fqParams.set ? 
*fun.fqParams.input_high : 1.0; gna_pwl[n_segments - 1].xBase = ((uint32_t) (in_scale * (1.0 - pwl[pwl_size - 2].b) / pwl[pwl_size - 2].m)) & XBASEMASK; - gna_pwl[n_segments - 1].yBase = FLOAT_TO_INT16(1.0 * out_scale); + gna_pwl[n_segments - 1].yBase = FLOAT_TO_INT16(maxVal * out_scale); gna_pwl[n_segments - 1].slope = 0; gnalog() << (gna_pwl[n_segments - 1].xBase / in_scale) @@ -223,9 +229,19 @@ void make_gna_pwl(const DnnActivation fun, else gnalog() << "=========================== LeakyReLU Segments ======================\n"; int32_t x_lower = INT32_MIN; + int32_t x_upper = INT32_MAX; int16_t y_lower = INT16_MIN; - if (x_lower < y_lower * in_scale / out_scale) x_lower = FLOAT_TO_INT32(y_lower * in_scale / out_scale); - if (y_lower < x_lower * out_scale / in_scale) y_lower = FLOAT_TO_INT16(x_lower * out_scale / in_scale); + int16_t y_upper = INT16_MAX; + if (fun.fqParams.set) { + x_lower = FLOAT_TO_INT32(*fun.fqParams.input_low * 1.25 * in_scale); + x_upper = FLOAT_TO_INT32(*fun.fqParams.input_high * 1.25 * in_scale); + y_lower = FLOAT_TO_INT16(*fun.fqParams.input_low * 1.25 * out_scale); + y_upper = FLOAT_TO_INT16(*fun.fqParams.input_high * 1.25 * out_scale); + } else { + if (x_lower < y_lower * in_scale / out_scale) x_lower = FLOAT_TO_INT32(y_lower * in_scale / out_scale); + if (y_lower < x_lower * out_scale / in_scale) y_lower = FLOAT_TO_INT16(x_lower * out_scale / in_scale); + } + gna_pwl[0].yBase = y_lower * fun.args.lrelu.negative_slope; s = gna_slope(fun.args.lrelu.negative_slope, in_scale, out_scale); gna_pwl[0].xBase = (x_lower & XBASEMASK) | s.slope_scale_index; // zero out the 2 lsb @@ -244,6 +260,18 @@ void make_gna_pwl(const DnnActivation fun, << " " << 0.0 << " " << (gna_pwl[1].slope * in_scale) / (out_scale*s.slope_scale) << "\n"; + + if (fun.fqParams.set) { // need a right segment + gna_pwl.push_back({ + static_cast(x_upper & XBASEMASK), // zero out the 2 lsb + y_upper, + 0 }); + + gnalog() << (x_upper & XBASEMASK) / in_scale + << " " << gna_pwl[n_segments].yBase / out_scale + << " " << 0 + << "\n"; + } break; } case kActSign: { @@ -281,11 +309,18 @@ void make_gna_pwl(const DnnActivation fun, break; } case kActIdentity: - case kActKaldiLstmClipping: { + case kActKaldiLstmClipping: + case kActFakeQuantize: { int32_t x_lower = INT32_MIN; int32_t x_upper = INT32_MAX; int16_t y_lower = INT16_MIN; int16_t y_upper = INT16_MAX; + if (fun == kActFakeQuantize && fun.fqParams.set) { + x_lower = *fun.fqParams.input_low * in_scale; + x_upper = *fun.fqParams.input_high * in_scale; + y_lower = *fun.fqParams.input_low * out_scale; + y_upper = *fun.fqParams.input_high * out_scale; + } auto n_segments = 2; if (fun == kActKaldiLstmClipping) { gnalog() << "=========================== Clipping Segments ===========================\n"; @@ -311,6 +346,8 @@ void make_gna_pwl(const DnnActivation fun, if (x_upper > y_upper * in_scale / out_scale) x_upper = FLOAT_TO_INT32(y_upper * in_scale / out_scale); if (y_lower < x_lower * out_scale / in_scale) y_lower = FLOAT_TO_INT16(x_lower * out_scale / in_scale); if (y_upper > x_upper * out_scale / in_scale) y_upper = FLOAT_TO_INT16(x_upper * out_scale / in_scale); + } else if (fun == kActFakeQuantize) { + gnalog() << "=========================== Fake Quantize Segments ===========================\n"; } gna_pwl.resize(n_segments); gna_pwl[0].xBase = INT32_MIN & XBASEMASK; // zero out the 2 lsb diff --git a/inference-engine/src/gna_plugin/descriptions/gna_flags.hpp b/inference-engine/src/gna_plugin/descriptions/gna_flags.hpp index 34af49e5586..9bb0169183a 
100644 --- a/inference-engine/src/gna_plugin/descriptions/gna_flags.hpp +++ b/inference-engine/src/gna_plugin/descriptions/gna_flags.hpp @@ -13,6 +13,7 @@ struct GNAFlags { bool compact_mode = false; bool exclusive_async_requests = false; bool uniformPwlDesign = false; + float pwlMaxErrorPercent = 1.0f; bool gna_openmp_multithreading = false; bool sw_fp32 = false; bool fake_quantized = false; diff --git a/inference-engine/src/gna_plugin/frontend/layer_quantizer.hpp b/inference-engine/src/gna_plugin/frontend/layer_quantizer.hpp index dba694b8055..6f38366f6e5 100644 --- a/inference-engine/src/gna_plugin/frontend/layer_quantizer.hpp +++ b/inference-engine/src/gna_plugin/frontend/layer_quantizer.hpp @@ -95,6 +95,15 @@ struct QuantPair { static B optional () { return B();} }; +struct FakeQuantizeParams { + bool paramsSet = false; + uint32_t levelsNum = 1; + float inputMinValue = 1.0f; + float inputMaxValue = 1.0f; + float outputMinValue = 1.0f; + float outputMaxValue = 1.0f; +}; + /** * @brief should allocated blob for specific data type, in case of src blob is nullptr * @tparam T @@ -170,14 +179,41 @@ class Quant { template -inline InferenceEngine::Blob::Ptr fp32_to_precision_blob(InferenceEngine::Blob::Ptr fp32_blob, InferenceEngine::Precision precision, float scale_factor) { +inline InferenceEngine::Blob::Ptr fp32_to_precision_blob(InferenceEngine::Blob::Ptr fp32_blob, InferenceEngine::Precision precision, + float scale_factor, const FakeQuantizeParams& fqParams) { auto prec_blob = InferenceEngine::make_shared_blob({ precision, fp32_blob->getTensorDesc().getDims(), fp32_blob->getTensorDesc().getLayout() }); prec_blob->allocate(); + auto input_low = 0.0f; + auto input_high = 0.0f; + auto output_low = 0.0f; + auto output_high = 0.0f; + auto levels = 1; + if (fqParams.paramsSet) { + input_low = fqParams.inputMinValue; + input_high = fqParams.inputMaxValue; + output_low = fqParams.outputMinValue; + output_high = fqParams.outputMaxValue; + levels = fqParams.levelsNum; + } + int i = 0; for (auto& precValue : *prec_blob) { - auto f32Value = fp32_blob->buffer().template as::value_type*>()[i++] * scale_factor; + auto f32Value = fp32_blob->buffer().template as::value_type*>()[i++]; + if (fqParams.paramsSet) { + auto x = f32Value; + if (x <= std::min(input_low, input_high)) { + f32Value = output_low; + } else if (x > std::max(input_low, input_high)) { + f32Value = output_high; + } else { + f32Value = nearbyint((x - input_low) / (input_high - input_low) * (levels - 1)) / + (levels - 1) * (output_high - output_low) + output_low; + } + } + + f32Value = f32Value * scale_factor; if (f32Value > std::numeric_limits::max()) { precValue = std::numeric_limits::max(); } else if (f32Value < std::numeric_limits::min()) { @@ -190,20 +226,21 @@ inline InferenceEngine::Blob::Ptr fp32_to_precision_blob(InferenceEngine::Blob:: return static_cast(prec_blob); } -inline InferenceEngine::Blob::Ptr fp32_to_precision_blob(InferenceEngine::Blob::Ptr fp32_blob, InferenceEngine::Precision precision, float scale_factor) { +inline InferenceEngine::Blob::Ptr fp32_to_precision_blob(InferenceEngine::Blob::Ptr fp32_blob, InferenceEngine::Precision precision, + float scale_factor, const FakeQuantizeParams &fqParams) { InferenceEngine::Blob::Ptr result_ptr = nullptr; switch (precision) { case InferenceEngine::Precision::FP32: - result_ptr = fp32_to_precision_blob(fp32_blob, precision, scale_factor); + result_ptr = fp32_to_precision_blob(fp32_blob, precision, scale_factor, fqParams); break; case InferenceEngine::Precision::I32: - 
result_ptr = fp32_to_precision_blob(fp32_blob, precision, scale_factor); + result_ptr = fp32_to_precision_blob(fp32_blob, precision, scale_factor, fqParams); break; case InferenceEngine::Precision::I16: - result_ptr = fp32_to_precision_blob(fp32_blob, precision, scale_factor); + result_ptr = fp32_to_precision_blob(fp32_blob, precision, scale_factor, fqParams); break; case InferenceEngine::Precision::I8: - result_ptr = fp32_to_precision_blob(fp32_blob, precision, scale_factor); + result_ptr = fp32_to_precision_blob(fp32_blob, precision, scale_factor, fqParams); break; default: THROW_GNA_EXCEPTION << "FP32 to " << precision << " not supported"; @@ -304,13 +341,15 @@ inline void quantizeWeightsBiases(const QuantDesc & quantDesc, auto quantData = InferenceEngine::getInjectedData(*wl); { - auto per_channel_weights = !quantData->_weights_quant.GetMinValues().empty(); + auto weightsStats = !quantData->_weights_quant.GetMinValues().empty(); auto weightsScale = quantData->_weights_quant.GetScale(); auto dstScale = quantData->_dst_quant.GetScale(); - fnc(wl->_weights->buffer().as(), - wl->_biases ? wl->_biases->buffer().as() : nullptr, + auto blob_precision = wl->_weights->getTensorDesc().getPrecision(); + auto quantizedWeights = blob_precision != InferenceEngine::Precision::FP32 && blob_precision != InferenceEngine::Precision::FP16; + fnc(wl->_weights->buffer().as(), + wl->_biases ? wl->_biases->buffer().as() : nullptr, intWeights->buffer(), - intBiases ? intBiases->buffer() : static_cast(nullptr), + intBiases ? intBiases->buffer() : static_cast(nullptr), input_scale_factor, &weightsScale, &dstScale, @@ -318,12 +357,13 @@ inline void quantizeWeightsBiases(const QuantDesc & quantDesc, num_columns, num_rows_padded, num_columns_padded, + quantizedWeights, quantData->_weights_quant.GetLevels(), - nullptr, - nullptr, - per_channel_weights ? &quantData->_weights_quant.GetMinValues().front(): nullptr, - per_channel_weights ? &quantData->_weights_quant.GetMaxValues().front(): nullptr, - &quantData->_weights_quantized); + quantData->_weights_quant.GetMinValues().size(), + weightsStats ? &quantData->_weights_quant.GetMinValues(true).front() : nullptr, + weightsStats ? &quantData->_weights_quant.GetMaxValues(true).front() : nullptr, + weightsStats ? &quantData->_weights_quant.GetMinValues(false).front() : nullptr, + weightsStats ? &quantData->_weights_quant.GetMaxValues(false).front() : nullptr); } wl->_weights = intWeights; wl->_biases = intBiases; @@ -410,19 +450,29 @@ inline void quantizeWeightsBiasesConv(const QuantDesc & quantDesc, auto quantData = InferenceEngine::getInjectedData(*conv); { + auto weightsStats = !quantData->_weights_quant.GetMinValues().empty(); auto weightsScale = quantData->_weights_quant.GetScale(); auto dstScale = quantData->_dst_quant.GetScale(); - fnc(conv->_weights->buffer().as(), - conv->_biases ? conv->_biases->buffer().as() : nullptr, + auto blob_precision = conv->_weights->getTensorDesc().getPrecision(); + auto quantizedWeights = blob_precision != InferenceEngine::Precision::FP32 && blob_precision != InferenceEngine::Precision::FP16; + fnc(conv->_weights->buffer().as(), + conv->_biases ? conv->_biases->buffer().as() : nullptr, intWeights->buffer(), - intBiases ? intBiases->buffer() : static_cast(nullptr), + intBiases ? 
intBiases->buffer() : static_cast(nullptr), input_scale_factor, &weightsScale, &dstScale, num_rows, num_columns, num_rows_padded, - num_columns_padded); + num_columns_padded, + quantizedWeights, + quantData->_weights_quant.GetLevels(), + quantData->_weights_quant.GetMinValues().size(), + weightsStats ? &quantData->_weights_quant.GetMinValues(true).front() : nullptr, + weightsStats ? &quantData->_weights_quant.GetMaxValues(true).front() : nullptr, + weightsStats ? &quantData->_weights_quant.GetMinValues(false).front() : nullptr, + weightsStats ? &quantData->_weights_quant.GetMaxValues(false).front() : nullptr); } conv->_weights = intWeights; conv->_biases = intBiases; @@ -494,11 +544,22 @@ class DataQuantizer : public DataQuantizerBas if (initial_precision == InferenceEngine::Precision::FP16) { cnnLayer->blobs["custom"] = make_fp32_blob(cnnLayer->blobs["custom"]); } - auto const_scale_factor = InferenceEngine::getInjectedData(*cnnLayer)->_dst_quant.GetScale(); + auto quantParams = InferenceEngine::getInjectedData(*cnnLayer); auto new_const_blob = InferenceEngine::Blob::CreateFromData(cnnLayer->outData[0]); auto const_blob = cnnLayer->blobs["custom"]; if (const_blob->getTensorDesc().getPrecision() == InferenceEngine::Precision::FP32) { - cnnLayer->blobs["custom"] = fp32_to_precision_blob(const_blob, cnnLayer->outData[0]->getPrecision(), const_scale_factor); + auto fqParams = FakeQuantizeParams{}; + if (quantParams->_dst_quant.IsStatsSet()) { + fqParams.paramsSet = true; + fqParams.levelsNum = quantParams->_dst_quant.GetLevels(); + fqParams.inputMinValue = quantParams->_dst_quant.GetMinValues(true).front(); + fqParams.inputMaxValue = quantParams->_dst_quant.GetMaxValues(true).front(); + fqParams.outputMinValue = quantParams->_dst_quant.GetMinValues(false).front(); + fqParams.outputMaxValue = quantParams->_dst_quant.GetMaxValues(false).front(); + } + + cnnLayer->blobs["custom"] = fp32_to_precision_blob(const_blob, cnnLayer->outData[0]->getPrecision(), + quantParams->_dst_quant.GetScale(), fqParams); } } diff --git a/inference-engine/src/gna_plugin/frontend/model_quantizer.hpp b/inference-engine/src/gna_plugin/frontend/model_quantizer.hpp index 523fdb3d47a..dc867be0a9a 100644 --- a/inference-engine/src/gna_plugin/frontend/model_quantizer.hpp +++ b/inference-engine/src/gna_plugin/frontend/model_quantizer.hpp @@ -7,6 +7,7 @@ #include #include #include +#include #include #include "gna_graph_tools.hpp" @@ -77,7 +78,8 @@ class ModelQuantizer { scaleIndex++; } - propagateScaleFactor(sortedNewNet, T::mandatory().getWeightsPrecision().size()); + bool isFakeQuantize = std::is_same() || std::is_same(); + propagateScaleFactor(sortedNewNet, T::mandatory().getWeightsPrecision().size(), isFakeQuantize); // sorted order gives possibility for propagate quantisation along depended layers for (auto &&layer : sortedNewNet) { @@ -88,8 +90,8 @@ class ModelQuantizer { } private : - void propagateScaleFactor(std::vector & net, int weightsBytesSize) const { - ScaleFactorCalculator sf(net, weightsBytesSize); + void propagateScaleFactor(std::vector & net, int weightsBytesSize, bool fakeQuantize) const { + ScaleFactorCalculator sf(net, weightsBytesSize, fakeQuantize); while (!sf.allLayersProcessed()) { for (auto &&layer : sf.getStartLayers()) { diff --git a/inference-engine/src/gna_plugin/frontend/quantization.cpp b/inference-engine/src/gna_plugin/frontend/quantization.cpp index 33999cffe3e..d8b5f9d4da3 100644 --- a/inference-engine/src/gna_plugin/frontend/quantization.cpp +++ 
b/inference-engine/src/gna_plugin/frontend/quantization.cpp @@ -9,6 +9,7 @@ #include #include "backend/gna_types.h" #include "quantization.h" +#include #ifdef DEBUG #define QUANTWARNING(...) (fprintf(stderr, __VA_ARGS__)) @@ -19,26 +20,44 @@ template<> void QuantizationCallback::runFakeQuantize() const { + if (quantizedWeights) { + THROW_GNA_EXCEPTION << "Quantized weights are not yet supported in int16 quantization mode"; + } + uint32_t num_saturate = 0; + auto input_low = 0.0f; + auto input_high = 0.0f; + auto output_low = 0.0f; + auto output_high = 0.0f; + auto levels = 1; + if (fq_num_stats > 0) { + input_low = *fq_ptr_input_low; + input_high = *fq_ptr_input_high; + output_low = *fq_ptr_output_low; + output_high = *fq_ptr_output_high; + levels = fq_levels; + } for (uint32_t row = 0; row < num_rows; row++) { for (uint32_t col = 0; col < num_columns; col++) { float rounding_value = (ptr_float_weights[row * num_columns + col] > 0) ? 0.5f : -0.5f; float value = ptr_float_weights[row * num_columns + col]; - if (!*ptr_quantized_weights) { - value = value * *ptr_weight_scale_factor + rounding_value; - } else { - value -= MAX_VAL_2B_WEIGHT; + if (fq_num_stats > 0) { + auto x = value; + if (x <= std::min(input_low, input_high)) { + value = output_low; + } else if (x > std::max(input_low, input_high)) { + value = output_high; + } else { + value = nearbyint((x - input_low) / (input_high - input_low) * (levels - 1)) / + (levels - 1) * (output_high - output_low) + output_low; + } } + value = value * *ptr_weight_scale_factor + rounding_value; + int16_t* ptr_weight_16 = ptr_int_weights + (row * num_columns_padded + col); - if (*ptr_quantized_weights && - (value > std::numeric_limits::max() || - value < std::numeric_limits::min())) { - THROW_GNA_EXCEPTION << "unsupported weights range for I16 quantisation: " << value; - } - if (value > std::numeric_limits::max()) { *ptr_weight_16 = std::numeric_limits::max(); num_saturate++; @@ -91,37 +110,6 @@ void QuantizationCallback::runFakeQuantize() const { template<> void QuantizationCallback::runQuantize() const { uint32_t num_saturate = 0; - - if (*ptr_weight_scale_factor == 1.0) { - // scale factor for weights is not calculated yet - float mean_weight = 0.0; - float mean_weight_squared = 0.0; - float max_weight = -1e20f; - float var_weight; - float mean_plus_2stdev; - - for (uint32_t i = 0; i < num_rows; i++) { - for (uint32_t j = 0; j < num_columns; j++) { - float weight = ptr_float_weights[i * num_columns + j]; - mean_weight += weight; - mean_weight_squared += weight * weight; - if (fabs(weight) > max_weight) { - max_weight = fabs(weight); - } - } - } - - mean_weight /= static_cast(num_rows * num_columns); - mean_weight_squared /= static_cast(num_rows * num_columns); - var_weight = mean_weight_squared - mean_weight * mean_weight; - mean_plus_2stdev = mean_weight + 2.0f * static_cast(sqrtf(var_weight)); - - if (max_weight != 0.0f) { - *ptr_weight_scale_factor = static_cast(MAX_VAL_2B_WEIGHT) / max_weight; - } - *ptr_output_scale_factor = input_scale_factor * *ptr_weight_scale_factor; - } - for (uint32_t row = 0; row < num_rows; row++) { for (uint32_t col = 0; col < num_columns; col++) { float rounding_value = (ptr_float_weights[row * num_columns + col] > 0) ? 0.5f : -0.5f; @@ -176,6 +164,24 @@ void QuantizationCallback::runQuantize() const { } } +std::pair FindMinMaxValues(void* ptr_float_memory, size_t num_elements) { + float* ptr_float_feat = reinterpret_cast(ptr_float_memory); + float min = num_elements ? 
ptr_float_feat[0] : 0.0; + float max = num_elements ? ptr_float_feat[0] : 0.0; + + for (size_t i = 1; i < num_elements; i++) { + if (fabs(ptr_float_feat[i]) > max) { + max = fabs(ptr_float_feat[i]); + } + + if (fabs(ptr_float_feat[i]) < min) { + min = fabs(ptr_float_feat[i]); + } + } + + return { min, max }; +} + float ScaleFactorForQuantization(void *ptr_float_memory, float target_max, size_t num_elements) { float *ptr_float_feat = reinterpret_cast(ptr_float_memory); float max = 0.0; @@ -224,17 +230,37 @@ template<> void QuantizationCallback::runFakeQuantize() const { uint32_t num_saturate = 0; - if (fq_ptr_output_high == nullptr || fq_ptr_output_low == nullptr) { - THROW_GNA_EXCEPTION << "Fake quantized output range not set"; - } - if (fq_levels == 0 || fq_levels == 1) { - THROW_GNA_EXCEPTION << "Fake quantized levels not set"; - } - + auto input_low = 0.0f; + auto input_high = 0.0f; + auto output_low = 0.0f; + auto output_high = 0.0f; + auto levels = 1; + float valueAcc = 0.0; for (uint32_t i = 0; i < num_rows; i++) { - uint32_t channel_multiplier = ((fq_ptr_output_high[i] - fq_ptr_output_low[i]) * - *ptr_weight_scale_factor) / (fq_levels - 1) + 0.5f; - ptr_int_biases[i].multiplier = static_cast (channel_multiplier); + uint32_t channel_multiplier = 1; + if (fq_num_stats > 0) { + auto idx = fq_num_stats == 1 ? 0 : i; + input_low = fq_ptr_input_low[idx]; + input_high = fq_ptr_input_high[idx]; + output_low = fq_ptr_output_low[idx]; + output_high = fq_ptr_output_high[idx]; + levels = fq_levels; + + channel_multiplier = ((input_high - input_low) * *ptr_weight_scale_factor) / (levels - 1); + } else { + float scaled_row_max = 0; + for (uint32_t col = 0; col < num_columns; col++) { + float value = ptr_float_weights[i * num_columns + col] * *ptr_weight_scale_factor; + valueAcc += value; + if (fabs(value) > scaled_row_max) { + scaled_row_max = fabs(value); + } + } + + channel_multiplier = scaled_row_max / static_cast(MAX_VAL_1B_WEIGHT); + } + + ptr_int_biases[i].multiplier = static_cast (channel_multiplier + 0.5f); if (channel_multiplier > MAX_OUT_MULTIPLIER) { THROW_GNA_EXCEPTION << "invalid channel multiplier: " << channel_multiplier; } @@ -243,19 +269,25 @@ void QuantizationCallback::runFakeQuantize() const auto offset = i * num_columns + j; auto rounding_value = (ptr_float_weights[i * num_columns + j] > 0) ? 
0.5f : -0.5f; float value = ptr_float_weights[offset]; - if (!*ptr_quantized_weights) { + if (!quantizedWeights) { + if (fq_num_stats > 0) { + auto x = value; + if (x <= std::min(input_low, input_high)) { + value = output_low; + } else if (x > std::max(input_low, input_high)) { + value = output_high; + } else { + value = nearbyint((x - input_low) / (input_high - input_low) * (levels - 1)) / + (levels - 1) * (output_high - output_low) + output_low; + } + } + value = value * (*ptr_weight_scale_factor / ptr_int_biases[i].multiplier) + rounding_value; } else { value -= MAX_VAL_1B_WEIGHT; } auto normalizedWeight = static_cast(value); - if (*ptr_quantized_weights && - (value > std::numeric_limits::max() || - value < std::numeric_limits::min())) { - THROW_GNA_EXCEPTION << "unsupported weights range for I8 quantization: " << value; - } - if (value > std::numeric_limits::max()) { normalizedWeight = std::numeric_limits::max(); num_saturate++; @@ -309,40 +341,6 @@ void QuantizationCallback::runQuantize() const { } uint32_t num_saturate = 0; - if (*ptr_weight_scale_factor == 1.0) { - // scale factor for weights is not calculated yet - float mean_weight = 0.0; - float mean_weight_squared = 0.0; - float max_weight = -1e20f; - float var_weight; - float mean_plus_2stdev; - - for (uint32_t i = 0; i < num_rows; i++) { - for (uint32_t j = 0; j < num_columns; j++) { - float weight = ptr_float_weights[i*num_columns + j]; - mean_weight += weight; - mean_weight_squared += weight * weight; - if (fabs(weight) > max_weight) { - max_weight = fabs(weight); - } - } - } - - mean_weight /= static_cast(num_rows * num_columns); - mean_weight_squared /= static_cast(num_rows * num_columns); - var_weight = mean_weight_squared - mean_weight * mean_weight; - mean_plus_2stdev = mean_weight + 2.0f * static_cast(sqrtf(var_weight)); - - *ptr_weight_scale_factor = static_cast(MAX_VAL_1B_WEIGHT) / max_weight; - - // For 8 bit weights quantize as follows: - // 1. adjust scale factor to increase dynamic range of entire matrix by max multiplier - // 2. find maximum scaled weight for each row - // 3. find multiplier such that dividing by the multiplier brings row back within 8-bit dynamic range - // 4. 
quantize and store scaled row - *ptr_weight_scale_factor = MAX_OUT_MULTIPLIER * *ptr_weight_scale_factor; // increase dynamic range by max multiplier - *ptr_output_scale_factor = input_scale_factor * *ptr_weight_scale_factor; - } float valueAcc = 0.0; for (uint32_t row = 0; row < num_rows; row++) { float scaled_row_max = 0; diff --git a/inference-engine/src/gna_plugin/frontend/quantization.h b/inference-engine/src/gna_plugin/frontend/quantization.h index 67a72aadadf..1916bba298e 100644 --- a/inference-engine/src/gna_plugin/frontend/quantization.h +++ b/inference-engine/src/gna_plugin/frontend/quantization.h @@ -31,12 +31,13 @@ struct QuantizationCallback { uint32_t num_rows_padded; uint32_t num_columns_padded; + bool quantizedWeights; int32_t fq_levels; + const size_t fq_num_stats; const float *fq_ptr_input_low; const float *fq_ptr_input_high; - const float *fq_ptr_output_low; - const float *fq_ptr_output_high; - const bool* ptr_quantized_weights; + const float* fq_ptr_output_low; + const float* fq_ptr_output_high; void runQuantize() const; void runFakeQuantize() const; @@ -45,5 +46,6 @@ struct QuantizationCallback { template class QuantizationCallback; template class QuantizationCallback; +std::pair FindMinMaxValues(void* ptr_float_memory, size_t num_elements); float ScaleFactorForQuantization(void *ptr_float_memory, float target_max, size_t num_elements); void QuantizeVector16(float *ptr_float_memory, int16_t *ptr_int_memory, uint32_t num_elements, float scale_factor); diff --git a/inference-engine/src/gna_plugin/frontend/quantized_layer_params.hpp b/inference-engine/src/gna_plugin/frontend/quantized_layer_params.hpp index 5f6c6a60907..bf510c7bb50 100644 --- a/inference-engine/src/gna_plugin/frontend/quantized_layer_params.hpp +++ b/inference-engine/src/gna_plugin/frontend/quantized_layer_params.hpp @@ -24,27 +24,57 @@ public: int32_t GetLevels() const { return levels; } - void SetMinValues(const std::vector &min) { - min_values.clear(); - min_values.insert(min_values.end(), min.begin(), min.end()); + bool IsStatsSet() const { + return !input_min_values.empty() && !input_max_values.empty(); } - const std::vector& GetMinValues() const { - return min_values; + void SetMinValues(const std::vector &min, bool input = true) { + if (input) { + input_min_values.clear(); + input_min_values.insert(input_min_values.end(), min.begin(), min.end()); + } else { + output_min_values.clear(); + output_min_values.insert(output_min_values.end(), min.begin(), min.end()); + } } - void SetMaxValues(const std::vector& max) { - max_values.clear(); - max_values.insert(max_values.end(), max.begin(), max.end()); + std::vector& GetMinValues(bool input = true) { + if (input) { + return input_min_values; + } + + return output_min_values; } - const std::vector& GetMaxValues() const { - return max_values; + void SetMaxValues(const std::vector& max, bool input = true) { + if (input) { + input_max_values.clear(); + input_max_values.insert(input_max_values.end(), max.begin(), max.end()); + } else { + output_max_values.clear(); + output_max_values.insert(output_max_values.end(), max.begin(), max.end()); + } + } + std::vector& GetMaxValues(bool input = true) { + if (input) { + return input_max_values; + } + + return output_max_values; + } + void CopyStats(Quantization &src) { + levels = src.GetLevels(); + SetMinValues(src.GetMinValues(true), true); + SetMaxValues(src.GetMaxValues(true), true); + SetMinValues(src.GetMinValues(false), false); + SetMaxValues(src.GetMaxValues(false), false); } private: float scale = 1.0f; bool 
scale_set = false; int32_t levels = 0; - std::vector min_values; - std::vector max_values; + std::vector input_min_values; + std::vector input_max_values; + std::vector output_min_values; + std::vector output_max_values; }; struct QuantizedLayerParams { @@ -53,7 +83,6 @@ struct QuantizedLayerParams { // deprecate this Quantization _weights_quant; - bool _weights_quantized = false; Quantization _bias_quant; float _o_shift = 0.0f; float _b_shift = 0.0f; diff --git a/inference-engine/src/gna_plugin/frontend/scale_factor_calc.hpp b/inference-engine/src/gna_plugin/frontend/scale_factor_calc.hpp index 6791768e4e9..b6f5912a814 100644 --- a/inference-engine/src/gna_plugin/frontend/scale_factor_calc.hpp +++ b/inference-engine/src/gna_plugin/frontend/scale_factor_calc.hpp @@ -16,9 +16,13 @@ #include "layers/gna_layer_info.hpp" #include "gna_plugin_log.hpp" #include "gna_slope_scale.h" +#include "runtime/pwl.h" namespace GNAPluginNS { namespace frontend { +static const float MIN_SEARCH_WEIGHTS_VAL = 1.0f; +static const float MAX_SEARCH_WEIGHTS_VAL = 1024.0f; + struct ScaleFactorUpdateResult { InferenceEngine::CNNLayer *restartLayer = nullptr; ScaleFactorUpdateResult() = default; @@ -29,6 +33,146 @@ struct ScaleFactorUpdateResult { } }; +/** + * @brief Compares two float values and returns if they are equal + * @param p1 First float value + * @param p2 Second float value + * @return Returns true if two float values are equal + */ +static bool fp32eq(float p1, float p2) { + return (std::abs(p1 - p2) <= 0.00001f * std::min(std::abs(p1), std::abs(p2))); +} + +/** + * @brief Calculates PWL slopes for specified function in a given input range + * @param info Layer information + * @return Array of slopes for a function + */ +static std::vector getPWLSlopes(const LayerInfo& info) { + if (info.isIdentity() || info.isFakeQuantize() || info.isRelu() || info.isClamp() || info.isAbs()) { + return { 1.0f }; + } + + return {}; +} + +/** + * @brief Finds the best output activation scale factor that allows to get the most precise PWL slope + * @param inScale Input activation layer scale factor + * @param outScales Array of output activation scale factors + * @param slopes Array of slopes for a given function + * @return Best output activation scale factor + */ +static float selectBestOutputScaleFactors(float inScale, std::vector outScales, const std::vector& slopes) { + std::vector scaleErrors; + for (size_t i = 0; i < outScales.size(); ++i) { + auto outScale = outScales[i]; + + auto sd = 0.0; + for (size_t j = 0; j < slopes.size(); ++j) { + auto s = gna_slope(slopes[j], inScale, outScale); + auto slope = static_cast(s.slope * s.slope_scale); + if (slope < std::numeric_limits::min() && slope > std::numeric_limits::max()) { + sd += std::numeric_limits::max(); + continue; + } + + auto testSlope = static_cast(slope) / s.slope_scale * inScale / outScale; + if (fp32eq(testSlope, slopes[j])) { + return outScale; + } + + sd += pow(testSlope - slopes[j], 2.0); + } + + sd /= slopes.size(); + sd = sqrtf(sd); + scaleErrors.push_back(sd); + } + + size_t minIndex = 0; + auto minError = scaleErrors[0]; + for (size_t i = 1; i < scaleErrors.size(); ++i) { + if (scaleErrors[i] < minError) { + minError = scaleErrors[i]; + minIndex = i; + } + } + + return outScales[minIndex]; +} + +/** + * @brief Finds the weights scale factor that allows to get the most precise PWL slope + * @param inScale Input weightable layer scale factor + * @param outScale Output activation scale factor + * @param weightsScales Array of weights scales to check + 
* @return Best weights scale factor + */ +static float selectBestWeightsScaleFactors(float inScale, float outScale, std::vector weightsScales, + const std::vector& slopes) { + std::vector scaleErrors; + for (size_t i = 0; i < weightsScales.size(); ++i) { + auto weightScale = weightsScales[i]; + + auto sd = 0.0; + for (size_t j = 0; j < slopes.size(); ++j) { + auto s = gna_slope(slopes[j], inScale * weightScale, outScale); + auto slope = static_cast(s.slope * s.slope_scale); + if (slope < std::numeric_limits::min() && slope > std::numeric_limits::max()) { + sd += std::numeric_limits::max(); + continue; + } + + auto testSlope = static_cast(slope) / s.slope_scale * (inScale * weightScale) / outScale; + if (fp32eq(testSlope, slopes[j])) { + return outScale; + } + sd += pow(testSlope - slopes[j], 2.0); + } + + sd /= slopes.size(); + sd = sqrtf(sd); + scaleErrors.push_back(sd); + } + + size_t minIndex = 0; + auto minError = scaleErrors[0]; + for (size_t i = 1; i < scaleErrors.size(); ++i) { + if (scaleErrors[i] < minError) { + minError = scaleErrors[i]; + minIndex = i; + } + } + + return weightsScales[minIndex]; +} + +/** + * @brief Generates specified number of scale factors in a given range. + * @param startRange First scale factor + * @param endRange Last scale factor + * @param numIterations number of scale factors to generate + * @return Array of scale factors + */ +static std::vector generateScaleFactors(float startRange, float endRange, size_t numScaleFactors) { + if (!numScaleFactors) { + return { startRange, endRange }; + } + + auto scaleFactors = std::vector{}; + auto domain = endRange - startRange; + auto step = domain / numScaleFactors; + for (size_t i = 0; i <= numScaleFactors; ++i) { + auto scale = startRange + step * i; + if (!std::isnan(scale)) { + scaleFactors.push_back(scale); + } + } + + return scaleFactors; +} + /** * @brief calculates output scale factor per layer * @tparam T @@ -44,7 +188,7 @@ class ScaleFactorPerLayer { * @param result * @return */ - bool operator()(T cnnLayer, int weightsSize, ScaleFactorUpdateResult &result) { + bool operator()(T cnnLayer, int weightsSize, ScaleFactorUpdateResult &result, const bool fakeQuantize) { return false; } }; @@ -54,17 +198,15 @@ class ScaleFactorPerLayer { private : const float activation_scale_factor = 2048.f; const float identity_scale_factor = 2049.0f; + const float max_activation_scale_factor = 4096.0f; const float k = 5; const float k_identity = 6; const double pow_domain = 16; protected : - static bool fp32eq(float p1, float p2) { - return (std::abs(p1 - p2) <= 0.00001f * std::min(std::abs(p1), std::abs(p2))); - } - float getActivationScale(InferenceEngine::CNNLayer const* cnnLayer, - GNAPluginNS::LayerInfo const& layer) { + GNAPluginNS::LayerInfo const& layer, + const bool fakeQuantize) { auto quantizedParams = InferenceEngine::getInjectedData(*cnnLayer); // todo: calculate proper scale factor where we need to expand it a bit to be safe to stay in int16 weights @@ -136,18 +278,140 @@ class ScaleFactorPerLayer { } } - if (!quantizedParams->_dst_quant.GetMaxValues().empty()) { - auto min_value = quantizedParams->_dst_quant.GetMinValues().front(); - auto max_value = quantizedParams->_dst_quant.GetMaxValues().front(); - auto newScaleFactor = (quantizedParams->_dst_quant.GetLevels() - 1) / (max_value - min_value); - result = newScaleFactor < result ? newScaleFactor : result; + // Identity layer is inserted by GNA passes and requires statistics to correctly set output + // scale factor. 
POT does not produce any statistics for this layer as it does not exist + // in the source IR. + if (fakeQuantize && !quantizedParams->_dst_quant.IsScaleSet() && layer.isIdentity()) { + auto prevLayer = CNNNetPrevLayer(cnnLayer); + while (prevLayer != nullptr) { + auto prevQuantParams = InferenceEngine::getInjectedData(*prevLayer); + if (prevQuantParams->_dst_quant.IsStatsSet()) { + quantizedParams->_dst_quant.CopyStats(prevQuantParams->_dst_quant); + quantizedParams->_src_quant.CopyStats(prevQuantParams->_dst_quant); + break; + } + + // Take the input statistics only if layer does not modify input values. + if (prevQuantParams->_src_quant.IsStatsSet() && + (LayerInfo(prevLayer).isNonFunctional() || LayerInfo(prevLayer).isMemory() || + LayerInfo(prevLayer).isConst() || LayerInfo(prevLayer).isInput())) { + quantizedParams->_dst_quant.CopyStats(prevQuantParams->_src_quant); + quantizedParams->_src_quant.CopyStats(prevQuantParams->_src_quant); + break; + } + + // Stop searching for statistics if previous layer does not modify input values. + if ((LayerInfo(prevLayer).isWeightable() && !LayerInfo(prevLayer).isWeightableIdentity()) + || LayerInfo(prevLayer).isEltwise() || LayerInfo(prevLayer).isActivation()) { + break; + } + + if (!CNNNetHasPrevLayer(prevLayer.get())) { + break; + } + + prevLayer = CNNNetPrevLayer(prevLayer); + } + + // If did not find statistics by searching previous layers, check if a next layer has + // statistics set. + if (!quantizedParams->_dst_quant.IsStatsSet()) { + auto donotSkip = [](InferenceEngine::CNNLayerPtr) { + return false; + }; + + auto nextLayers = CNNNetGetAllNextLayersSkipCertain(cnnLayer, -1, donotSkip); + for (auto &l : nextLayers) { + auto nextQuantParams = InferenceEngine::getInjectedData(*l); + if (nextQuantParams->_src_quant.IsStatsSet()) { + quantizedParams->_dst_quant.CopyStats(nextQuantParams->_src_quant); + quantizedParams->_src_quant.CopyStats(nextQuantParams->_src_quant); + break; + } + + // Take output statistics only if a next layer does not modify input values + if (nextQuantParams->_dst_quant.IsStatsSet() && + (LayerInfo(l).isNonFunctional() || LayerInfo(l).isMemory())) { + quantizedParams->_dst_quant.CopyStats(nextQuantParams->_dst_quant); + quantizedParams->_src_quant.CopyStats(nextQuantParams->_dst_quant); + break; + } + } + } + } + + // Adjust output scale factor based on statistics (if present) in the following steps: + // 1. calculate scale factor based on output min and max values + // 2. (temporary W/A) clamp scale factor to maximum activation scale factor + // 3. search previous layers if there was already scale factor set + // 4. adjust output scale factor to get the most precise PWL slope + if (quantizedParams->_dst_quant.IsStatsSet()) { + auto minOutValue = quantizedParams->_dst_quant.GetMinValues().front(); + auto maxOutValue = quantizedParams->_dst_quant.GetMaxValues().front(); + auto absMax = std::max(std::abs(minOutValue), std::abs(maxOutValue)); + auto absMin = std::min(std::abs(minOutValue), std::abs(maxOutValue)); + + result = (quantizedParams->_dst_quant.GetLevels() - 1) / (maxOutValue - minOutValue); + if (0 && fp32eq(absMin, 0.0f) && !fp32eq(absMax, 0.0f)) { + result = (quantizedParams->_dst_quant.GetLevels() - 1) / (2 * absMax); + } + // + //result = MAX_VAL_2B_FEAT / absMax; + if (std::isinf(result) || fp32eq(absMax, 0.0f)) { + result = max_activation_scale_factor; + } + + // TODO: remove clamping maximum scale factor + result = result > max_activation_scale_factor ? 
max_activation_scale_factor : result; + if (!layer.isIdentity() && !layer.isFakeQuantize() && !layer.isRelu() && !layer.isClamp()) { + result = result > activation_scale_factor ? activation_scale_factor : result; + } + + // Take input scale factor from previous layer if previous layer does not modify + // input values + bool usePrevScaleFactor = false; + auto skipNonFunctional = [](InferenceEngine::CNNLayerPtr l) { + return LayerInfo(l).isNonFunctional(); + }; + + auto prevLayer = CNNNetPrevLayerSkipCertain(cnnLayer, 0, skipNonFunctional); + auto prevLayer2 = prevLayer != nullptr? CNNNetPrevLayerSkipCertain(prevLayer, 0, skipNonFunctional): nullptr; + if (prevLayer != nullptr && + (layer.isIdentity() || layer.isFakeQuantize()) && LayerInfo(prevLayer).isWeightableIdentity()) { + auto prevLayerQuant = InferenceEngine::getInjectedData(*prevLayer); + if (!fp32eq(prevLayerQuant->_src_quant.GetScale(), 1.0f) && + (prevLayer2 == nullptr || LayerInfo(prevLayer2).has16BOutput())) { + result = prevLayerQuant->_src_quant.GetScale(); + usePrevScaleFactor = true; + } + } + + // Adjust output scale factor to get the most precise PWL slope. + // NOTE: Currently it is only implemented for identity, clamp, relu and FQ layers. + // For all other layers, it does not improve accuracy. + auto slopes = getPWLSlopes(layer); + if (!slopes.empty() && !usePrevScaleFactor) { + auto div = 10; + auto mul = 10; + auto startRange = result > 1.0f ? static_cast(result) : result; + auto endRange = startRange - startRange / div; + endRange = endRange > 1.0f ? static_cast(endRange) : endRange; + auto scaleFactors = generateScaleFactors(startRange, endRange, static_cast(startRange - endRange) * mul); + auto newScaleFactor = selectBestOutputScaleFactors(quantizedParams->_src_quant.GetScale(), scaleFactors, slopes); + if (!fp32eq(result, newScaleFactor) && + !fp32eq(newScaleFactor, 1.0f) && !fp32eq(newScaleFactor, 0.0f) && !std::isinf(newScaleFactor)) { + gnalog() << "[INFO] Adjusting scale factor for " << cnnLayer->name + << " from: " << result << " to: " << newScaleFactor << "\n"; + result = newScaleFactor; + } + } } return result; } public : - bool operator()(InferenceEngine::CNNLayer *cnnLayer, int weightsSize, ScaleFactorUpdateResult &result) { + bool operator()(InferenceEngine::CNNLayer *cnnLayer, int weightsSize, ScaleFactorUpdateResult &result, const bool fakeQuantize) { if ( !cnnLayer ) { THROW_IE_EXCEPTION << "Incorrect Convolutional Layer pointer \n"; } @@ -156,7 +420,11 @@ class ScaleFactorPerLayer { auto quant = InferenceEngine::getInjectedData(*cnnLayer); if (InferenceEngine::details::CaselessEq()(cnnLayer->type, "Memory")) { - if (!CNNNetHasPrevLayer(cnnLayer) && quant->_dst_quant.IsScaleSet()) { + if (CNNNetHasPrevLayer(cnnLayer) && quant->_dst_quant.IsStatsSet() && !quant->_dst_quant.IsScaleSet()) { + auto minOutValue = quant->_dst_quant.GetMinValues().front(); + auto maxOutValue = quant->_dst_quant.GetMaxValues().front(); + auto scale = (quant->_dst_quant.GetLevels() - 1) / (maxOutValue - minOutValue); + quant->_dst_quant.SetScale(scale); quant->_src_quant = quant->_dst_quant; } @@ -180,7 +448,9 @@ class ScaleFactorPerLayer { return true; } - if (quantSibling->_dst_quant.IsScaleSet()) { + if ((!fakeQuantize && quantSibling->_dst_quant.IsScaleSet()) || + (fakeQuantize && quantSibling->_dst_quant.IsScaleSet() && !fp32eq(quantSibling->_dst_quant.GetScale(), 1.0) && + quantSibling->_dst_quant.GetScale() < inputQuant->_dst_quant.GetScale())) { // means we already restarted propagation input memory layer // need to 
search for requantiseable layer prior memory output layer InferenceEngine::CNNLayerPtr restartedLayer; @@ -230,7 +500,8 @@ class ScaleFactorPerLayer { << activation_scale_factor << ", restarting from corresponding memory: " << input->name << std::endl; // try updating memory input layer scale factor and restart from it - quantSibling->_src_quant = quantSibling->_dst_quant = inputQuant->_dst_quant; + quantSibling->_src_quant.SetScale(inputQuant->_dst_quant.GetScale()); + quantSibling->_dst_quant.SetScale(inputQuant->_dst_quant.GetScale()); result = ScaleFactorUpdateResult(input.get()); return true; } @@ -241,49 +512,55 @@ class ScaleFactorPerLayer { if (cnnLayer->type == "Const") { if (quant->_dst_quant.IsScaleSet()) { quant->_src_quant = quant->_dst_quant; - return ScaleFactorUpdateResult(); - } - - auto blob = cnnLayer->blobs["custom"]; - auto blob_precision = blob->getTensorDesc().getPrecision(); - - if (blob_precision != InferenceEngine::Precision::FP32 && blob_precision != InferenceEngine::Precision::FP16) { - quant->_dst_quant.SetScale(1.0f); return true; } - if (blob_precision == InferenceEngine::Precision::FP16) { - blob = make_fp32_blob(blob); - } - auto max_val = std::numeric_limits::min(); auto min_val = std::numeric_limits::max(); + if (quant->_dst_quant.IsStatsSet()) { + min_val = quant->_dst_quant.GetMinValues().front(); + max_val = quant->_dst_quant.GetMaxValues().front(); + } else { + auto blob = cnnLayer->blobs["custom"]; + auto blob_precision = blob->getTensorDesc().getPrecision(); - auto flt_buf = blob->buffer().as(); - auto size = blob->size(); + if (blob_precision != InferenceEngine::Precision::FP32 && blob_precision != InferenceEngine::Precision::FP16) { + quant->_dst_quant.SetScale(1.0f); + return true; + } - for (int i=0; i < size; i++) { - auto val = flt_buf[i]; - if (val > max_val) max_val = val; - if (val < min_val) min_val = val; + if (blob_precision == InferenceEngine::Precision::FP16) { + blob = make_fp32_blob(blob); + } + + auto flt_buf = blob->buffer().as(); + auto size = blob->size(); + + for (int i = 0; i < size; i++) { + auto val = flt_buf[i]; + if (val > max_val) max_val = val; + if (val < min_val) min_val = val; + } } + auto levels = fakeQuantize ? MAX_VAL_2B_FEAT : std::numeric_limits::max(); auto abs_val = std::max(std::abs(max_val), std::abs(min_val)); - auto scale_val = static_cast(std::numeric_limits::max()) / abs_val; + auto scale_val = static_cast(levels) / abs_val; + //TODO: use FQ formula for scale factor calculation - // TODO: Investigate what should be the scale in such cases (31910) - if (std::isinf(scale_val)) { - quant->_dst_quant.SetScale(quant->_src_quant.GetScale()); + if (std::isinf(scale_val) || fp32eq(abs_val, 0.0f)) { + quant->_dst_quant.SetScale(fakeQuantize ? 
levels : 1.0f); } else { quant->_dst_quant.SetScale(scale_val); } + quant->_src_quant.SetScale(quant->_dst_quant.GetScale()); - return ScaleFactorUpdateResult(); + return true; } if (!CNNNetHasPrevLayer(cnnLayer)) { quant->_dst_quant = quant->_src_quant; - return ScaleFactorUpdateResult(); + return true; } // by default layer is pass thru its scale factor @@ -292,17 +569,41 @@ class ScaleFactorPerLayer { THROW_GNA_EXCEPTION << "layer: " << CNNNetPrevLayer(cnnLayer)->name << "not quantized"; } - quant->_src_quant = inputQuant->_dst_quant; - if (layerInfo.isActivation()) { + if (layerInfo.isPower() && !layerInfo.isActivation()) { + auto quant = InferenceEngine::getInjectedData(*cnnLayer); + auto powerLayer = dynamic_cast(cnnLayer); + if (!powerLayer) { + THROW_IE_EXCEPTION << "Incorrect Power Layer pointer \n"; + } + + auto powerScale = std::abs(powerLayer->scale); + if (fp32eq(powerScale, 0.0f)) { + powerScale = 1.0f; + } + auto weightsScaleFactor = MAX_VAL_2B_WEIGHT / powerScale; + quant->_src_quant.SetScale(inputQuant->_dst_quant.GetScale()); + quant->_weights_quant.SetScale(weightsScaleFactor); + quant->_dst_quant.SetScale(quant->_weights_quant.GetScale() * quant->_src_quant.GetScale()); + return true; + } else if (layerInfo.isActivation()) { // todo: calculate proper scale factor where we need to expand it a bit to be safe to stay in int16 weights // set the initial value - if (!quant->_dst_quant.IsScaleSet()) { - auto scale = getActivationScale(cnnLayer, layerInfo); + if (!quant->_dst_quant.IsScaleSet() || fp32eq(quant->_dst_quant.GetScale(), 1.0f) || + !fp32eq(quant->_src_quant.GetScale(), inputQuant->_dst_quant.GetScale())) { + quant->_src_quant.SetScale(inputQuant->_dst_quant.GetScale()); + auto scale = getActivationScale(cnnLayer, layerInfo, fakeQuantize); quant->_dst_quant.SetScale(scale); } return true; + } else if (layerInfo.isCropAffined()) { + auto weightsScaleFactor = 1; + quant->_weights_quant.SetScale(weightsScaleFactor); + quant->_src_quant.SetScale(inputQuant->_dst_quant.GetScale()); + quant->_dst_quant.SetScale(quant->_weights_quant.GetScale() * quant->_src_quant.GetScale()); + return true; } - quant->_dst_quant = inputQuant->_dst_quant; + quant->_src_quant.SetScale(inputQuant->_dst_quant.GetScale()); + quant->_dst_quant.SetScale(inputQuant->_dst_quant.GetScale()); return true; } @@ -311,7 +612,7 @@ class ScaleFactorPerLayer { template<> class ScaleFactorPerLayer { public: - bool operator()(InferenceEngine::EltwiseLayer* eltwiseLayer, int weightsSize, ScaleFactorUpdateResult &result) { + bool operator()(InferenceEngine::EltwiseLayer* eltwiseLayer, int weightsSize, ScaleFactorUpdateResult &result, const bool fakeQuantize) { if ( !eltwiseLayer ) { THROW_GNA_EXCEPTION << "Incorrect Eltwise Layer pointer \n"; } @@ -325,7 +626,7 @@ class ScaleFactorPerLayer { switch (eltwiseLayer->_operation) { case InferenceEngine::EltwiseLayer::Prod: { - quantData->_weights_quant = quantParams1->_dst_quant; + quantData->_weights_quant.SetScale(quantParams1->_dst_quant.GetScale()); quantData->_dst_quant.SetScale(quantParams0->_dst_quant.GetScale() * quantParams1->_dst_quant.GetScale()); break; } @@ -344,9 +645,51 @@ class ScaleFactorPerLayer { std::swap(quantParams0, quantParams1); } + auto prevLayer = in1; + while (LayerInfo(prevLayer).isNonFunctional() && CNNNetHasPrevLayer(prevLayer.get(), 0)) { + prevLayer = CNNNetPrevLayer(prevLayer); + } + // this path might result in significant data loss quantData->_bias_quant.SetScale(quantParams1->_dst_quant.GetScale() / 
quantParams0->_dst_quant.GetScale()); - quantData->_weights_quant.SetScale(quantParams1->_dst_quant.GetScale() / quantParams0->_dst_quant.GetScale()); + auto weightsScale = quantParams1->_dst_quant.GetScale() / quantParams0->_dst_quant.GetScale(); + auto prevLayerIn1 = CNNNetPrevLayer(in1); + // If a previous layer is a layer where freely weights scale factor can be selected, + // try to find the scale factor that will allow to use integer as weights scale factor for eltwise + // operation. + // If the weights scale factor for eltwise sum/sub is not integer, it will cause accuracy degradation. + if (fakeQuantize && LayerInfo(in1).isWeightableIdentity() && + (prevLayerIn1 == nullptr || LayerInfo(prevLayerIn1).has16BOutput())) { + auto bestWeightsScale = 0.0f; + auto bestError = static_cast(std::numeric_limits::max()); + auto scaleIn0Dst = quantParams0->_dst_quant.GetScale(); + auto scaleIn1Src = quantParams1->_src_quant.GetScale(); + for (size_t i = MAX_VAL_2B_FEAT; i > 0; --i) { + auto scaleIn1Dst = i * scaleIn1Src; + auto eltwiseWeightsScale = scaleIn1Dst / scaleIn0Dst; + if (eltwiseWeightsScale < 1.0 || eltwiseWeightsScale > std::numeric_limits::max() - 1) { + continue; + } + + auto error = std::abs(eltwiseWeightsScale - static_cast(eltwiseWeightsScale)); + if (error < bestError) { + bestError = error; + bestWeightsScale = i; + } + + if (fp32eq(error, 0.0f)) { + break; + } + } + + if (!fp32eq(bestWeightsScale, quantParams1->_weights_quant.GetScale())) { + quantParams1->_weights_quant.SetScale(bestWeightsScale); + quantParams1->_dst_quant.SetScale(quantParams1->_weights_quant.GetScale() * quantParams1->_src_quant.GetScale()); + result = ScaleFactorUpdateResult(in1.get()); + return true; + } + } + quantData->_weights_quant.SetScale(weightsScale); quantData->_dst_quant.SetScale(quantParams1->_dst_quant.GetScale()); // eltwise will always work in int16 @@ -382,6 +725,22 @@ class ScaleFactorPerLayer { break; } + if (fakeQuantize && info.isWeightableIdentity()) { + auto quantDataForInputLayer = InferenceEngine::getInjectedData(*in); + if (!fp32eq(quantDataForInputLayer->_weights_quant.GetScale(), 1.0f)) { + auto reducer = quantData->_weights_quant.GetScale() / std::numeric_limits::max(); + reducer = std::max(1.0f, reducer); + auto newWeightsScale = quantDataForInputLayer->_weights_quant.GetScale() / reducer; + newWeightsScale = std::max(1.0f, newWeightsScale); + quantDataForInputLayer->_weights_quant.SetScale(static_cast(newWeightsScale)); + quantDataForInputLayer->_dst_quant.SetScale(quantDataForInputLayer->_weights_quant.GetScale() * + quantDataForInputLayer->_src_quant.GetScale()); + + result = ScaleFactorUpdateResult(in.get()); + return true; + } + } + // if we are here it means that we are in the port 1 if (info.isFullyConnected() || info.isConvolution()) { auto quantDataForInputLayer = InferenceEngine::getInjectedData(*in); @@ -408,7 +767,7 @@ class ScaleFactorPerLayer { template<> class ScaleFactorPerLayer { public: - bool operator()(InferenceEngine::ConcatLayer* concatLayer, int weightsSize, ScaleFactorUpdateResult &result) { + bool operator()(InferenceEngine::ConcatLayer* concatLayer, int weightsSize, ScaleFactorUpdateResult &result, const bool fakeQuantize) { if ( !concatLayer ) { THROW_GNA_EXCEPTION << "Incorrect Concat Layer pointer \n"; } @@ -417,10 +776,6 @@ class ScaleFactorPerLayer { THROW_GNA_EXCEPTION << "Concat layer has unsupported number of incoming layers."; } - auto fp32eq = [](float p1, float p2) -> bool { - return (std::abs(p1 - p2) <= 0.00001f * std::min(std::abs(p1), 
std::abs(p2))); - }; - auto quantData = InferenceEngine::getInjectedData(*concatLayer); std::vector inputLayers; for (auto input_idx = 0; input_idx != concatLayer->insData.size(); input_idx++) { @@ -435,7 +790,7 @@ class ScaleFactorPerLayer { auto in0 = inputLayers.front(); auto quantParams0 = InferenceEngine::getInjectedData(in0); auto scaleFactor = quantParams0->_dst_quant.GetScale(); - auto scaleFactorCheck = [scaleFactor, &fp32eq](InferenceEngine::CNNLayerPtr& inputLayer) { + auto scaleFactorCheck = [scaleFactor](InferenceEngine::CNNLayerPtr& inputLayer) { auto quantParams = InferenceEngine::getInjectedData(inputLayer); return fp32eq(quantParams->_dst_quant.GetScale(), scaleFactor); }; @@ -453,14 +808,14 @@ class ScaleFactorPerLayer { }; GNAPluginNS::QuantizedLayerParams* sourceQuantParams = nullptr; - auto firstInputIt = std::find_if(inputLayers.begin(), inputLayers.end(), inputLayerCheck); - if (firstInputIt != inputLayers.end()) { - auto quantParamsFirst = InferenceEngine::getInjectedData(*firstInputIt); - auto nextInputIt = firstInputIt + 1; + auto sourceLayerIt = std::find_if(inputLayers.begin(), inputLayers.end(), inputLayerCheck); + if (sourceLayerIt != inputLayers.end()) { + auto quantParamsFirst = InferenceEngine::getInjectedData(*sourceLayerIt); + auto nextInputIt = sourceLayerIt + 1; while ((nextInputIt = std::find_if(nextInputIt, inputLayers.end(), inputLayerCheck)) != inputLayers.end()) { auto quantParamsSecond = InferenceEngine::getInjectedData(*nextInputIt); if (!fp32eq(quantParamsSecond->_dst_quant.GetScale(), quantParamsFirst->_dst_quant.GetScale())) { - THROW_GNA_EXCEPTION << "Two Input layers " << (*firstInputIt)->name + THROW_GNA_EXCEPTION << "Two Input layers " << (*sourceLayerIt)->name << " and " << (*nextInputIt)->name << " have different scales in concat!!! 
\n"; } } @@ -469,7 +824,6 @@ class ScaleFactorPerLayer { // find a source quant value // - 1st candidate - input layer // - 2nd candidate - non-activation layer with non-1 scale factor - // - 3rd candidate - 1st layer with non-1 scale factor static std::map restarted_counter; auto restartedCountIt = restarted_counter.find(concatLayer->name); if (restartedCountIt == restarted_counter.end()) { @@ -477,29 +831,45 @@ class ScaleFactorPerLayer { restartedCountIt = pos.first; } - auto sourceLayerIt = firstInputIt; if (sourceLayerIt == inputLayers.end()) { if (((restartedCountIt->second) / 2) % 2 == 1) { std::reverse(inputLayers.begin(), inputLayers.end()); } - if (((restartedCountIt->second) / 4) % 2 == 0) { - auto sourceLayerCheck = [&fp32eq](InferenceEngine::CNNLayerPtr& inputLayer) { - auto quantParams = InferenceEngine::getInjectedData(inputLayer); - LayerInfo info(inputLayer); - return !info.isActivation() && !fp32eq(quantParams->_dst_quant.GetScale(), 1.0f); - }; - sourceLayerIt = std::find_if(inputLayers.begin(), inputLayers.end(), sourceLayerCheck); + + if (fakeQuantize) { + sourceLayerIt = inputLayers.begin(); + auto quantParamsFirst = InferenceEngine::getInjectedData(*inputLayers.begin()); + auto minScaleFactor = quantParamsFirst->_dst_quant.GetScale(); + for (auto it = inputLayers.begin(); it != inputLayers.end(); ++it) { + auto quantParams = InferenceEngine::getInjectedData(*it); + if (quantParams->_dst_quant.GetScale() < minScaleFactor && + !fp32eq(quantParams->_dst_quant.GetScale(), 1.0f) || + fp32eq(minScaleFactor, 1.0f)) { + minScaleFactor = quantParams->_dst_quant.GetScale(); + sourceLayerIt = it; + } + } + } else { + if (((restartedCountIt->second) / 4) % 2 == 0) { + auto sourceLayerCheck = [](InferenceEngine::CNNLayerPtr& inputLayer) { + auto quantParams = InferenceEngine::getInjectedData(inputLayer); + LayerInfo info(inputLayer); + return !info.isActivation() && !fp32eq(quantParams->_dst_quant.GetScale(), 1.0f); + }; + sourceLayerIt = std::find_if(inputLayers.begin(), inputLayers.end(), sourceLayerCheck); + } + + if (sourceLayerIt == inputLayers.end()) { + auto nonDefaultScaleFactor = [](InferenceEngine::CNNLayerPtr& inputLayer) { + auto quantParams = InferenceEngine::getInjectedData(inputLayer); + return !fp32eq(quantParams->_dst_quant.GetScale(), 1.0f); + }; + + sourceLayerIt = std::find_if(inputLayers.begin(), inputLayers.end(), nonDefaultScaleFactor); + } } - } - ++restartedCountIt->second; - if (sourceLayerIt == inputLayers.end()) { - auto nonDefaultScaleFactor = [&fp32eq](InferenceEngine::CNNLayerPtr& inputLayer) { - auto quantParams = InferenceEngine::getInjectedData(inputLayer); - return !fp32eq(quantParams->_dst_quant.GetScale(), 1.0f); - }; - - sourceLayerIt = std::find_if(inputLayers.begin(), inputLayers.end(), nonDefaultScaleFactor); + ++restartedCountIt->second; } std::set concatIdxToUpdate; @@ -514,24 +884,29 @@ class ScaleFactorPerLayer { continue; } - // possible case when some of the concat inputs are free to select scale ex: const->concat<-affine - if (!fp32eq(quantParamsIn->_dst_quant.GetScale(), 1.0f) && !LayerInfo(*it).isActivation()) { + if (fakeQuantize) { concatIdxToUpdate.insert(std::distance(inputLayers.begin(), it)); - } + quantParamsIn->_dst_quant.SetScale(quantParams->_dst_quant.GetScale()); + } else { + // possible case when some of the concat inputs are free to select scale ex: const->concat<-affine + if (!fp32eq(quantParamsIn->_dst_quant.GetScale(), 1.0f) && !LayerInfo(*it).isActivation()) { + 
concatIdxToUpdate.insert(std::distance(inputLayers.begin(), it)); + } - quantParamsIn->_dst_quant.SetScale(quantParams->_dst_quant.GetScale()); + quantParamsIn->_dst_quant.SetScale(quantParams->_dst_quant.GetScale()); + } } } auto updatedScaleFactor = InferenceEngine::getInjectedData(in0)->_dst_quant.GetScale(); - auto equalScaleFactor = [updatedScaleFactor, &fp32eq](InferenceEngine::CNNLayerPtr& inputLayer) { + auto equalScaleFactor = [updatedScaleFactor](InferenceEngine::CNNLayerPtr& inputLayer) { auto quantParams = InferenceEngine::getInjectedData(inputLayer); return fp32eq(quantParams->_dst_quant.GetScale(), updatedScaleFactor); }; auto layerIt = std::find_if_not(inputLayers.begin() + 1, inputLayers.end(), equalScaleFactor); if (layerIt != inputLayers.end()) { - THROW_GNA_EXCEPTION << "layers entered into concat have different scale factors" << concatLayer->name; + THROW_GNA_EXCEPTION << "layers entered into concat have different scale factors. Layer name: " << concatLayer->name; } quantData->_dst_quant.SetScale(sourceQuantParams->_dst_quant.GetScale()); @@ -555,7 +930,7 @@ class ScaleFactorPerLayer { gnalog() << "[UFS] from : " << concatLayer->name << " reached: " << layer->name; // found that direct input to concat is a indirect parent of align filter - so no link required auto info = LayerInfo(layer); - if (!info.isWeightable() && !info.isActivation() && !info.isConst() && !info.isMemory()) { + if (!info.isWeightable() && !info.isActivation() && !info.isConst()) { gnalog() << "... skipped\n"; return; } @@ -575,16 +950,44 @@ class ScaleFactorPerLayer { auto restarLayerInfo = LayerInfo(restartedLayer); if (restarLayerInfo.isActivation()) { // requantize activation by just changing it's output scale factor - quantDataForConCatInput->_dst_quant.SetScale(sourceQuantParams->_dst_quant.GetScale()); - } - if (restarLayerInfo.isConst()) { + auto newScaleFactor = sourceQuantParams->_dst_quant.GetScale(); + auto skipNonFunctional = [](InferenceEngine::CNNLayerPtr l) { + return LayerInfo(l).isNonFunctional(); + }; + + auto prevLayer = CNNNetPrevLayerSkipCertain(restartedLayer, 0, skipNonFunctional); + auto prevLayer2 = prevLayer != nullptr ? CNNNetPrevLayerSkipCertain(prevLayer, 0, skipNonFunctional) : nullptr; + + if (fakeQuantize && prevLayer != nullptr && LayerInfo(prevLayer).isWeightableIdentity() && + (prevLayer2 == nullptr || LayerInfo(prevLayer2).has16BOutput())) { + auto weightsScales = generateScaleFactors(MIN_SEARCH_WEIGHTS_VAL, MAX_SEARCH_WEIGHTS_VAL, + MAX_SEARCH_WEIGHTS_VAL - MIN_SEARCH_WEIGHTS_VAL); + + auto prevLayerQuant = InferenceEngine::getInjectedData(*prevLayer); + auto bestWeightsScale = 1.0f; + auto slopes = getPWLSlopes(restarLayerInfo); + if (!slopes.empty() && !fp32eq(prevLayerQuant->_src_quant.GetScale(), newScaleFactor)) { + bestWeightsScale = selectBestWeightsScaleFactors(prevLayerQuant->_src_quant.GetScale(), + newScaleFactor, weightsScales, { 1.0f }); + } + if (!slopes.empty() && !fp32eq(bestWeightsScale, prevLayerQuant->_weights_quant.GetScale())) { + gnalog() << "[INFO][Concat] Optimizing weights scale factor for '" << prevLayer->name << "' layer. 
Change from " + << prevLayerQuant->_weights_quant.GetScale() << " to " << bestWeightsScale << "\n"; + + prevLayerQuant->_weights_quant.SetScale(bestWeightsScale); + prevLayerQuant->_dst_quant.SetScale(prevLayerQuant->_weights_quant.GetScale() * prevLayerQuant->_src_quant.GetScale()); + result = ScaleFactorUpdateResult(prevLayer.get()); + return true; + } + } + + quantDataForConCatInput->_dst_quant.SetScale(newScaleFactor); + } else if (restarLayerInfo.isConst()) { gnalog() << "... warning const layer will be requantized\n"; - quantDataForConCatInput->_dst_quant.SetScale(sourceQuantParams->_dst_quant.GetScale()); - } - if (restarLayerInfo.isMemory()) { - gnalog() << "... warning memory layer will be requantized\n"; quantDataForConCatInput->_src_quant.SetScale(sourceQuantParams->_dst_quant.GetScale()); quantDataForConCatInput->_dst_quant.SetScale(sourceQuantParams->_dst_quant.GetScale()); + } else { + THROW_GNA_EXCEPTION << "cannot requantize '" << restartedLayer->name << "' input to concat: " << concatLayer->name; } result = ScaleFactorUpdateResult(restartedLayer.get()); } @@ -607,7 +1010,7 @@ class ScaleFactorPerLayer { uint16_t const _scale_change_threshold_200 = 200; public: - bool operator()(InferenceEngine::WeightableLayer *wl, int weightsSize, ScaleFactorUpdateResult &result) { + bool operator()(InferenceEngine::WeightableLayer *wl, int weightsSize, ScaleFactorUpdateResult &result, const bool fakeQuantize) { if ( !wl ) { THROW_GNA_EXCEPTION << "Incorrect Weightable Layer pointer \n"; } else if (!wl->_weights) { @@ -620,8 +1023,30 @@ class ScaleFactorPerLayer { auto quant = InferenceEngine::getInjectedData(*wl); quant->_src_quant = quantDataForInputLayer->_dst_quant; + if (quant->_weights_quant.IsStatsSet() && !quant->_weights_quant.IsScaleSet()) { + auto getScale = [&quant](size_t i) { + return (quant->_weights_quant.GetLevels() - 1) / + (quant->_weights_quant.GetMaxValues(false)[i] - quant->_weights_quant.GetMinValues(false)[i]); + }; + + float min_channel_scale = getScale(0); + for (uint32_t i = 1; i < quant->_weights_quant.GetMinValues().size(); i++) { + min_channel_scale = std::min(min_channel_scale, getScale(i)); + } + + auto multiplier = 1.0f; + if (quant->_weights_quant.GetLevels() <= std::numeric_limits::max()) { + // GNA supports additional multiplier for only 8bit weights. + // The multipler is used to extend dynamic range. 
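// Editor's note (illustrative, not part of the patch): the per-channel scale computed above is
// (levels - 1) / (max - min). For example, with levels = 255 and a channel range of [-1.0, 1.0],
// getScale(i) = 254 / 2 = 127; the smallest such value across channels becomes the common weights
// scale, and for 8-bit weights it is additionally widened by MAX_OUT_MULTIPLIER to use more of
// the available dynamic range.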
+ multiplier = MAX_OUT_MULTIPLIER; + } + + // Common weights scale calculation + quant->_weights_quant.SetScale(min_channel_scale * multiplier); + } + // TODO: pass 8 bits somehow - if (quant->_weights_quant.GetScale() == 1.0f) { + if (!quant->_weights_quant.IsScaleSet()) { size_t scaleRange = 0; if (weightsSize == 2) { scaleRange = MAX_VAL_2B_WEIGHT; @@ -632,7 +1057,7 @@ class ScaleFactorPerLayer { } quant->_weights_quant.SetScale( ScaleFactorForQuantization(wl->_weights->buffer().as(), scaleRange, wl->_weights->size())); - if (quant->_weights_quant.GetScale() == -1.0f) { + if (quant->_weights_quant.GetScale() == -1.0f || (fakeQuantize && LayerInfo(wl).isConcatAlignFilter())) { quant->_weights_quant.SetScale(1.0f); } @@ -685,6 +1110,39 @@ class ScaleFactorPerLayer { } quant->_dst_quant.SetScale(quant->_weights_quant.GetScale() * quant->_src_quant.GetScale()); + if (quant->_dst_quant.IsStatsSet()) { + // Adjust weights scale factor if output values exceed int32 maximum value + + if (wl->_biases && !quant->_bias_quant.IsScaleSet()) { + auto minMax = FindMinMaxValues(wl->_biases->buffer().as(), wl->_biases->size()); + quant->_bias_quant.SetMinValues({ minMax.first }); + quant->_bias_quant.SetMaxValues({ minMax.second }); + + auto biasScale = ScaleFactorForQuantization(wl->_biases->buffer().as(), MAX_VAL_4B_BIAS, wl->_biases->size()); + quant->_bias_quant.SetScale(biasScale); + if (quant->_bias_quant.GetScale() != -1.0f && quant->_bias_quant.GetScale() < quant->_dst_quant.GetScale()) { + quant->_weights_quant.SetScale(quant->_bias_quant.GetScale() / quant->_src_quant.GetScale()); + quant->_dst_quant.SetScale(quant->_weights_quant.GetScale() * quant->_src_quant.GetScale()); + } + } + + auto maxAbsVal = std::max(std::abs(quant->_dst_quant.GetMinValues().front()), + std::abs(quant->_dst_quant.GetMaxValues().front())); + + auto maxIntVal = static_cast(maxAbsVal * quant->_dst_quant.GetScale() + 0.5f); + auto weightsReducer = static_cast(maxIntVal) / std::numeric_limits::max(); + weightsReducer = std::max(1.0, weightsReducer); + if (!fp32eq(weightsReducer, 1.0f)) { + quant->_weights_quant.SetScale(quant->_weights_quant.GetScale() / weightsReducer); + } + + if (fp32eq(quant->_weights_quant.GetScale(), 0.0f) || std::isinf(quant->_weights_quant.GetScale())) { + quant->_weights_quant.SetScale(1.0f); + } + + quant->_dst_quant.SetScale(quant->_weights_quant.GetScale() * quant->_src_quant.GetScale()); + } + return true; } }; @@ -692,8 +1150,8 @@ class ScaleFactorPerLayer { template<> class ScaleFactorPerLayer : public ScaleFactorPerLayer { public: - bool operator()(InferenceEngine::WeightableLayer *wl, int weightsSize, ScaleFactorUpdateResult &result) { - return ScaleFactorPerLayer::operator()(wl, 2, result); + bool operator()(InferenceEngine::WeightableLayer *wl, int weightsSize, ScaleFactorUpdateResult &result, const bool fakeQuantize) { + return ScaleFactorPerLayer::operator()(wl, 2, result, fakeQuantize); } }; @@ -717,10 +1175,11 @@ class ScaleFactorCalculator { mutable Cnt::const_iterator idx; mutable bool needRestart = false; int weightsBytesSize; + bool isFakeQuantize; public: - ScaleFactorCalculator(Cnt &net, int weightsBytesSize) - : net(net), weightsBytesSize(weightsBytesSize) { + ScaleFactorCalculator(Cnt &net, int weightsBytesSize, bool fakeQuantize) + : net(net), weightsBytesSize(weightsBytesSize), isFakeQuantize(fakeQuantize) { idx = std::begin(this->net); } bool needToRestart() const { @@ -736,7 +1195,7 @@ class ScaleFactorCalculator { bool operator()(T ptr) const { needRestart = false; 
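// Editor's note: `result` is filled by ScaleFactorPerLayer when an upstream layer's scale factor
// had to be adjusted (for example when an eltwise or concat input is re-quantized); in that case
// the calculator flags a restart (needRestart) so scale factors are propagated again with the
// update taken into account.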
frontend::ScaleFactorUpdateResult result; - if (!frontend::ScaleFactorPerLayer()(ptr, weightsBytesSize, result)) { + if (!frontend::ScaleFactorPerLayer()(ptr, weightsBytesSize, result, isFakeQuantize)) { return false; } if (result) { diff --git a/inference-engine/src/gna_plugin/gna_graph_compiler.cpp b/inference-engine/src/gna_plugin/gna_graph_compiler.cpp index 19f22520a90..87afd6deb7d 100644 --- a/inference-engine/src/gna_plugin/gna_graph_compiler.cpp +++ b/inference-engine/src/gna_plugin/gna_graph_compiler.cpp @@ -740,6 +740,8 @@ void GNAGraphCompiler::PowerPrimitive(InferenceEngine::CNNLayerPtr layer) { auto orientation = kDnnInterleavedOrientation; auto activation_type = DnnActivation::fromType(kActPow); + activation_type.fqParams.set = false; + activation_type.srcFQParams.set = false; activation_type.args.pow.exponent = power.power; activation_type.args.pow.scale = power.scale; activation_type.args.pow.offset = power.offset; @@ -768,7 +770,8 @@ void GNAGraphCompiler::PowerPrimitive(InferenceEngine::CNNLayerPtr layer) { PwlDesignOpt16(activation_type, ptr_pwl_segments, input_pwl_scale_factor, - output_pwl_scale_factor); + output_pwl_scale_factor, + gnaFlags->pwlMaxErrorPercent); } } @@ -1668,14 +1671,6 @@ void GNAGraphCompiler::AffineFilterPrimitive(InferenceEngine::CNNLayerPtr layer) } } -void GNAGraphCompiler::FakeQuantizePrimitive(InferenceEngine::CNNLayerPtr layer) { - // in FP32 mode lets use special form of activation that satisfies fakeQuantize formula - if (gnaFlags->sw_fp32) { - PWLPrimitive(layer); - return; - } -} - void GNAGraphCompiler::PWLPrimitive(InferenceEngine::CNNLayerPtr layer) { auto* generic = dynamic_cast(layer.get()); std::string type; @@ -1768,6 +1763,24 @@ void GNAGraphCompiler::PWLPrimitive(InferenceEngine::CNNLayerPtr layer) { THROW_GNA_EXCEPTION << "Activation function type not yet supported: " << type; } auto activation_type = DnnActivation::fromType(it->second); + activation_type.fqParams.set = false; + if (quantized != nullptr && quantized->_dst_quant.IsStatsSet()) { + activation_type.fqParams.set = true; + activation_type.fqParams.levels = quantized->_dst_quant.GetLevels(); + activation_type.fqParams.inputPerChannel = false; + activation_type.fqParams.input_low = &(quantized->_dst_quant.GetMinValues(true).front()); + activation_type.fqParams.input_high = &(quantized->_dst_quant.GetMaxValues(true).front()); + } + + activation_type.srcFQParams.set = false; + if (quantized != nullptr && quantized->_src_quant.IsStatsSet()) { + activation_type.srcFQParams.set = true; + activation_type.srcFQParams.levels = quantized->_src_quant.GetLevels(); + activation_type.srcFQParams.inputPerChannel = false; + activation_type.srcFQParams.input_low = &(quantized->_src_quant.GetMinValues(true).front()); + activation_type.srcFQParams.input_high = &(quantized->_src_quant.GetMaxValues(true).front()); + } + if (it->second == kActRelu) { auto reluLayer = dynamic_cast(layer.get()); activation_type.args.lrelu.negative_slope = reluLayer != nullptr ? 
reluLayer->negative_slope : 0.0f; @@ -1775,11 +1788,9 @@ void GNAGraphCompiler::PWLPrimitive(InferenceEngine::CNNLayerPtr layer) { activation_type.args.lrelu.negative_slope = 0.0f; } - if (it->second == kActFakeQuantize) { + if (quantized == nullptr && it->second == kActFakeQuantize) { activation_type = GNAFakeQuantizeLayer(layer).parseAsActivation(); - } - - if (it->second == kActKaldiLstmClipping) { + } else if (it->second == kActKaldiLstmClipping) { auto clamp_layer = dynamic_cast(layer.get()); if (clamp_layer) { if (clamp_layer->min_value == 0 && clamp_layer->max_value == 0) { @@ -1856,7 +1867,8 @@ case name:\ PwlDesignOpt16(activation_type, ptr_pwl_segments, input_pwl_scale_factor, - output_pwl_scale_factor); + output_pwl_scale_factor, + gnaFlags->pwlMaxErrorPercent); } ptr_pwl_segments_target = reinterpret_cast(&ptr_pwl_segments_target); } @@ -2001,7 +2013,7 @@ void GNAGraphCompiler::CreateLayerPrimitive(CNNLayerPtr layer) { {{DelayedCopyLayerName}, CREATE(CopyPrimitive)}, {{"TensorIterator"}, SKIP}, {{"LSTMCell"}, SKIP}, - {{"FakeQuantize"}, CREATE(FakeQuantizePrimitive)} // TODO: fakequantize layer should be properly converted to GNA scale factors for integer case + {{"FakeQuantize"}, CREATE(PWLPrimitive)} }; (void)layersBuilder; auto it = LayersBuilder::getStorage().find(layer->type); diff --git a/inference-engine/src/gna_plugin/gna_graph_tools.hpp b/inference-engine/src/gna_plugin/gna_graph_tools.hpp index 112e6060c30..bd3dfe90a9b 100644 --- a/inference-engine/src/gna_plugin/gna_graph_tools.hpp +++ b/inference-engine/src/gna_plugin/gna_graph_tools.hpp @@ -663,10 +663,10 @@ inline void CNNNetworkRemoveLayer(CNNLayerPtr layer, bool checkDims = true) { } gnalog() << "Removing " << layer->name << " layer\n"; if (layer->insData.size() != 1) { - THROW_IE_EXCEPTION << "Cannot remove layer : "<< layer->name <<" that has not 1 input"; + THROW_IE_EXCEPTION << "Cannot remove layer : "<< layer->name <<" that has different number of inputs than 1"; } if (layer->outData.size() != 1) { - THROW_IE_EXCEPTION << "Cannot remove layer : "<< layer->name <<" that has not 1 output"; + THROW_IE_EXCEPTION << "Cannot remove layer : "<< layer->name <<" that has different number of outputs than 1"; } auto isp = layer->insData.front().lock(); diff --git a/inference-engine/src/gna_plugin/gna_plugin.cpp b/inference-engine/src/gna_plugin/gna_plugin.cpp index c8a337c3617..d978bbd46f5 100644 --- a/inference-engine/src/gna_plugin/gna_plugin.cpp +++ b/inference-engine/src/gna_plugin/gna_plugin.cpp @@ -24,7 +24,6 @@ #include #include #include "gna_plugin_config.hpp" -#include #include "gna_plugin.hpp" #include "optimizer/gna_pass_manager.hpp" #include "layers/gna_layer_type.hpp" @@ -50,6 +49,10 @@ #include #include #include +#include +#include +#include +#include #if GNA_LIB_VER == 2 #include @@ -394,9 +397,9 @@ void GNAPlugin::UpdateInputScaleFromNetwork(InferenceEngine::CNNNetwork & networ // search for FQ layers // only supports cases of int16 or int8 InputsDataMap inputs = network.getInputsInfo(); - for (auto && input : inputs) { + size_t inputIdx = 0; + for (auto&& input : inputs) { auto data = input.second->getInputData(); - size_t inputIdx = 0; for (auto && nextToInputLayer : getInputTo(data)) { if (!LayerInfo(nextToInputLayer.second).isFakeQuantize()) { inputIdx++; @@ -411,7 +414,16 @@ void GNAPlugin::UpdateInputScaleFromNetwork(InferenceEngine::CNNNetwork & networ THROW_GNA_LAYER_EXCEPTION(nextToInputLayer.second) << "unsupported, per-channel quantization for input layer : " << input.second->name(); } + + 
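// Editor's sketch (illustrative only, not part of the patch): how the input scale factor is
// derived from a FakeQuantize input range, mirroring the logic just below. The helper name is
// hypothetical; the real code compares against zero with a small-epsilon check (fp32eq).
#include <algorithm>
#include <cmath>
static float ExampleInputScaleFromFqRange(float rangeMin, float rangeMax, float levels) {
    float scale = (levels - 1) / (rangeMax - rangeMin);   // e.g. (65535 - 1) / (8 - (-8)) = 4095.875
    const float minAbs = std::min(std::abs(rangeMin), std::abs(rangeMax));
    const float maxAbs = std::max(std::abs(rangeMin), std::abs(rangeMax));
    if (minAbs == 0.0f && maxAbs != 0.0f) {               // one-sided range such as [0, 8]
        scale = (levels - 1) / (2 * maxAbs);              // treat it as symmetric [-8, 8]
    }
    return scale;
}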
auto fp32eq = [](float p1, float p2) -> bool { + return (std::abs(p1 - p2) <= 0.00001f * std::min(std::abs(p1), std::abs(p2))); + }; float scaleInput = (fqLayer.getLevels() - 1) / (inputRange.second[0] - inputRange.first[0]); + auto minAbsVal = std::min(std::abs(inputRange.second[0]), std::abs(inputRange.first[0])); + auto maxAbsVal = std::max(std::abs(inputRange.second[0]), std::abs(inputRange.first[0])); + if (fp32eq(minAbsVal, 0.0f) && !fp32eq(maxAbsVal, 0.0f)) { + scaleInput = (fqLayer.getLevels() - 1) / (2 * maxAbsVal); + } if (!config.inputScaleFactors.empty()) { gnalog() << "Scale factor calculated during model quantization (" << scaleInput @@ -676,6 +688,68 @@ void GNAPlugin::ConvertModelLayoutFromNCHWToNHWC(const std::vector } } +#ifdef PLOT +void GNAPlugin::AddDebugProperties(const InferenceEngine::CNNLayerPtr layer, + InferenceEngine::ordered_properties& printed_properties, + InferenceEngine::ordered_properties& node_properties) { + // printing quantized params + auto quantized = InferenceEngine::getInjectedData(layer); + if (!quantized) { + return; + } + if (LayerInfo(layer).isWeightable() || LayerInfo(layer).isEltwise()) { + printed_properties.emplace_back( + "weights scale factor", std::to_string(quantized->_weights_quant.GetScale())); + if (quantized->_weights_quant.IsStatsSet()) { + for (auto& min : quantized->_weights_quant.GetMinValues()) { + printed_properties.emplace_back( + "weights min val", std::to_string(min)); + } + for (auto& max : quantized->_weights_quant.GetMaxValues()) { + printed_properties.emplace_back( + "weights max val", std::to_string(max)); + } + } + + if (quantized->_bias_quant.IsStatsSet()) { + for (auto& min : quantized->_bias_quant.GetMinValues()) { + printed_properties.emplace_back( + "bias min val", std::to_string(min)); + } + for (auto& max : quantized->_bias_quant.GetMaxValues()) { + printed_properties.emplace_back( + "bias max val", std::to_string(max)); + } + } + } + printed_properties.emplace_back( + "src scale factor", std::to_string(quantized->_src_quant.GetScale())); + if (quantized->_src_quant.IsStatsSet()) { + for (auto& min : quantized->_src_quant.GetMinValues()) { + printed_properties.emplace_back( + "src min val", std::to_string(min)); + } + for (auto& max : quantized->_src_quant.GetMaxValues()) { + printed_properties.emplace_back( + "src max val", std::to_string(max)); + } + } + + printed_properties.emplace_back( + "dst scale factor", std::to_string(quantized->_dst_quant.GetScale())); + if (quantized->_dst_quant.IsStatsSet()) { + for (auto& min : quantized->_dst_quant.GetMinValues()) { + printed_properties.emplace_back( + "dst min val", std::to_string(min)); + } + for (auto& max : quantized->_dst_quant.GetMaxValues()) { + printed_properties.emplace_back( + "dst max val", std::to_string(max)); + } + } +} +#endif + void GNAPlugin::LoadNetwork(CNNNetwork & _network) { std::shared_ptr convertedNetwork; if (_network.getFunction()) { @@ -698,6 +772,10 @@ void GNAPlugin::LoadNetwork(CNNNetwork & _network) { // UnrollTI transformation is disabled by default, is turned on by LowLatency transformation return node->get_rt_info().count("UNROLL_TI") == 0; }); + pass_config->disable(); + pass_config->disable(); + pass_config->disable(); + pass_config->disable(); manager.run_passes(graph); convertedNetwork = InferenceEngine::details::convertFunctionToICNNNetwork(graph, clonedNetwork); } @@ -809,17 +887,11 @@ void GNAPlugin::LoadNetwork(CNNNetwork & _network) { #ifdef PLOT std::ofstream file("gna_passes.dot"); - saveGraphToDot(newNet, file, [](const 
CNNLayerPtr layer, - ordered_properties &printed_properties, - ordered_properties &node_properties) { - // printing quantized params - auto quantized = InferenceEngine::getInjectedData(layer); - if (!quantized) { - return; - } - printed_properties.emplace_back( - "scale factor", std::to_string(quantized->_dst_quant.GetScale())); - }); + saveGraphToDot(newNet, file, [this](const CNNLayerPtr layer, + ordered_properties& printed_properties, + ordered_properties& node_properties) { + AddDebugProperties(layer, printed_properties, node_properties); + }); #endif auto sortedNet = CNNNetSortTopologicallyEx(newNet, make_fuzed_order); diff --git a/inference-engine/src/gna_plugin/gna_plugin.hpp b/inference-engine/src/gna_plugin/gna_plugin.hpp index 1a6e20d558c..0af27ba6572 100644 --- a/inference-engine/src/gna_plugin/gna_plugin.hpp +++ b/inference-engine/src/gna_plugin/gna_plugin.hpp @@ -23,6 +23,7 @@ #include "gna_plugin_policy.hpp" #include "gna_plugin_log.hpp" #include "gna_plugin_config.hpp" +#include #if GNA_LIB_VER == 2 #include @@ -237,6 +238,11 @@ class GNAPlugin : public InferenceEngine::IInferencePlugin { * @param layers model sorted layers */ void ConvertModelLayoutFromNCHWToNHWC(const std::vector &layers); +#ifdef PLOT + void AddDebugProperties(const InferenceEngine::CNNLayerPtr layer, + InferenceEngine::ordered_properties& printed_properties, + InferenceEngine::ordered_properties& node_properties); +#endif }; } // namespace GNAPluginNS diff --git a/inference-engine/src/gna_plugin/gna_plugin_config.cpp b/inference-engine/src/gna_plugin/gna_plugin_config.cpp index 60d4d854214..b7d20534733 100644 --- a/inference-engine/src/gna_plugin/gna_plugin_config.cpp +++ b/inference-engine/src/gna_plugin/gna_plugin_config.cpp @@ -156,6 +156,24 @@ void Config::UpdateFromMap(const std::map& config) { THROW_GNA_EXCEPTION << "GNA pwl uniform algorithm parameter " << "should be equal to YES/NO, but not" << value; } + } else if (key == GNA_CONFIG_KEY(PWL_MAX_ERROR_PERCENT)) { + float max_error; + try { + max_error = InferenceEngine::CNNLayer::ie_parse_float(value); + if (max_error < 0.0f || max_error > 100.0f) { + throw std::out_of_range(""); + } + } + catch (std::invalid_argument&) { + THROW_GNA_EXCEPTION << "Invalid value of PWL max error percent"; + } + catch (std::out_of_range&) { + log << "Unsupported PWL error percent value: " << value + << ", should be greater than 0 and less than 100"; + THROW_GNA_EXCEPTION << "Unsupported PWL error percent value: " << value + << ", should be greater than 0 and less than 100"; + } + gnaFlags.pwlMaxErrorPercent = max_error; } else if (key == CONFIG_KEY(PERF_COUNT)) { if (value == PluginConfigParams::YES) { gnaFlags.performance_counting = true; @@ -252,6 +270,7 @@ void Config::AdjustKeyMapValues() { keyConfigMap[GNA_CONFIG_KEY(PRECISION)] = gnaPrecision.name(); keyConfigMap[GNA_CONFIG_KEY(PWL_UNIFORM_DESIGN)] = gnaFlags.uniformPwlDesign ? PluginConfigParams::YES: PluginConfigParams::NO; + keyConfigMap[GNA_CONFIG_KEY(PWL_MAX_ERROR_PERCENT)] = std::to_string(gnaFlags.pwlMaxErrorPercent); keyConfigMap[CONFIG_KEY(PERF_COUNT)] = gnaFlags.performance_counting ? 
PluginConfigParams::YES: PluginConfigParams::NO; keyConfigMap[GNA_CONFIG_KEY(LIB_N_THREADS)] = std::to_string(gnaFlags.gna_lib_async_threads_num); diff --git a/inference-engine/src/gna_plugin/layers/gna_fake_quantize_layer.hpp b/inference-engine/src/gna_plugin/layers/gna_fake_quantize_layer.hpp index c80cb62d6e5..9d30126a1ce 100644 --- a/inference-engine/src/gna_plugin/layers/gna_fake_quantize_layer.hpp +++ b/inference-engine/src/gna_plugin/layers/gna_fake_quantize_layer.hpp @@ -29,7 +29,7 @@ class GNAFakeQuantizeLayer { DnnActivation parseAsActivation() const { DnnActivation fqActivation; - fqActivation.args.fakeQuantize.levels = fqLayer->GetParamAsInt("levels"); + fqActivation.fqParams.levels = fqLayer->GetParamAsInt("levels"); auto inputShape = getShapeForRange(fqLayer, 1); auto outputShape = getShapeForRange(fqLayer, 3); @@ -37,13 +37,15 @@ class GNAFakeQuantizeLayer { auto inputRangeSize = InferenceEngine::details::product(inputShape.begin(), inputShape.end()); auto outputRangeSize = InferenceEngine::details::product(outputShape.begin(), outputShape.end()); - fqActivation.args.fakeQuantize.inputPerChannel = inputRangeSize != 1; - fqActivation.args.fakeQuantize.input_low = getParamFromInputAsFloats(fqLayer, 1); - fqActivation.args.fakeQuantize.input_high = getParamFromInputAsFloats(fqLayer, 2); + fqActivation.fqParams.set = true; - fqActivation.args.fakeQuantize.outputPerChannel = outputRangeSize != 1; - fqActivation.args.fakeQuantize.output_low = getParamFromInputAsFloats(fqLayer, 3); - fqActivation.args.fakeQuantize.output_high = getParamFromInputAsFloats(fqLayer, 4); + fqActivation.fqParams.inputPerChannel = inputRangeSize != 1; + fqActivation.fqParams.input_low = getParamFromInputAsFloats(fqLayer, 1); + fqActivation.fqParams.input_high = getParamFromInputAsFloats(fqLayer, 2); + + fqActivation.fqParams.outputPerChannel = outputRangeSize != 1; + fqActivation.fqParams.output_low = getParamFromInputAsFloats(fqLayer, 3); + fqActivation.fqParams.output_high = getParamFromInputAsFloats(fqLayer, 4); fqActivation.type = kActFakeQuantize; return fqActivation; diff --git a/inference-engine/src/gna_plugin/layers/gna_layer_info.hpp b/inference-engine/src/gna_plugin/layers/gna_layer_info.hpp index 6c1bf161e28..1112160974b 100644 --- a/inference-engine/src/gna_plugin/layers/gna_layer_info.hpp +++ b/inference-engine/src/gna_plugin/layers/gna_layer_info.hpp @@ -103,7 +103,8 @@ class LayerInfo { "neglog", "neghalflog", "softsign", - "power"}; + "power", + "fakequantize"}; if (isPower()) { auto powerLayer = as(); @@ -157,7 +158,10 @@ class LayerInfo { IS_VALID(); return nullptr != as(); } - + bool isSyntheticScaleShift() const noexcept { + IS_VALID(); + return layer->name.find("SyntheticScaleShift") != std::string::npos; + } bool isEltwise() const noexcept { IS_VALID(); return nullptr != as(); @@ -193,6 +197,18 @@ class LayerInfo { bool isIdentity() const noexcept { return isOfType("identity"); } + bool isTanh() const noexcept { + return isOfType("tanh"); + } + bool isSigmoid() const noexcept { + return isOfType("sigmoid"); + } + bool isSoftSign() const noexcept { + return isOfType("softsign"); + } + bool isClamp() const noexcept { + return isOfType("clamp"); + } bool isFullyConnected() const noexcept { return isOfType("FullyConnected") || isOfType("InnerProduct"); } @@ -283,6 +299,9 @@ class LayerInfo { bool isCopyDelayed() const noexcept { return isOfType(DelayedCopyLayerName); } + bool isWeightableIdentity() const noexcept { + return isConcatAlignFilter() || isSyntheticScaleShift() || 
isCropAffined(); + } size_t paddingSize() const { static InferenceEngine::details::caseless_set layersWithPossiblePadding = {"FullyConnected", diff --git a/inference-engine/src/gna_plugin/optimizer/gna_pass_manager.cpp b/inference-engine/src/gna_plugin/optimizer/gna_pass_manager.cpp index c6233547677..d32b49c42c7 100644 --- a/inference-engine/src/gna_plugin/optimizer/gna_pass_manager.cpp +++ b/inference-engine/src/gna_plugin/optimizer/gna_pass_manager.cpp @@ -39,6 +39,7 @@ #include "frontend/quantization.h" #include "gna_groups.hpp" #include "gna_graph_patterns.hpp" +#include "gna_data_types.hpp" using namespace InferenceEngine; using namespace InferenceEngine::details; @@ -54,6 +55,10 @@ std::shared_ptr BasePass::getPassManager() { return sharedMgr; } + +static bool fp32eq(float p1, float p2) { + return (std::abs(p1 - p2) <= 0.00001f * std::min(std::abs(p1), std::abs(p2))); +} // indexes stored in pass manager static const char identityLayersCounterName[] = "identityLayerCounter"; static const char diagonalLayersCounterName[] = "diagonalLayerCounter"; @@ -1836,9 +1841,6 @@ void FuseFQIntoWeightsPass::run() { weightableLayer->insData.resize(1); // 2. running FQ function for given layer - if (weightDims.size() != 2) { - THROW_GNA_LAYER_EXCEPTION(fqLayer) << " layout of weigths not equal to NC not yet supported"; - } auto outputSize = details::product(weightDims.begin(), weightDims.end()); // depending on compute precision weights will be recreated @@ -1874,61 +1876,42 @@ void FuseFQIntoWeightsPass::run() { // check if // - weights were float values and need to be quantized, // - weights are integer values and quantization can be skipped - for (size_t i = 0; i < outputRange.first.size(); ++i) { - if (inputRange.first[i] > outputRange.first[i] || - inputRange.second[i] > outputRange.second[i]) { - quantized->_weights_quantized = true; - break; - } - } - - quantized->_weights_quant.SetMinValues(outputRange.first); - quantized->_weights_quant.SetMaxValues(outputRange.second); + quantized->_weights_quant.SetMinValues(inputRange.first, true); + quantized->_weights_quant.SetMaxValues(inputRange.second, true); + quantized->_weights_quant.SetMinValues(outputRange.first, false); + quantized->_weights_quant.SetMaxValues(outputRange.second, false); quantized->_weights_quant.SetLevels(levels); // lets find out minimum scale factor among channels - if (quantized->_weights_quant.GetMinValues().empty()) { + if (!quantized->_weights_quant.IsStatsSet()) { THROW_GNA_LAYER_EXCEPTION(fqLayer) << " per channel/tensor weigths scales are missed"; } - auto getScale = [&quantized](size_t i) { - return (quantized->_weights_quant.GetLevels() - 1) / - (quantized->_weights_quant.GetMaxValues()[i] - quantized->_weights_quant.GetMinValues()[i]); - }; - - float min_channel_scale = getScale(0); - for (uint32_t i = 1; i < quantized->_weights_quant.GetMinValues().size(); i++) { - min_channel_scale = std::min(min_channel_scale, getScale(i)); - } - - auto multiplier = 1.0f; - if (quantized->_weights_quant.GetLevels() <= std::numeric_limits::max()) { - // GNA supports additional multiplier for only 8bit weights. - // The multipler is used to extend dynamic range. 
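// Editor's note: the weights scale computation removed below is not dropped; it is relocated into
// ScaleFactorPerLayer for weightable layers (see the frontend scale-factor changes earlier in this
// patch), so the scale is now derived during scale-factor propagation rather than at FQ-fusion time.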
- multiplier = MAX_OUT_MULTIPLIER; - } - - // Common weights scale calculation - quantized->_weights_quant.SetScale(min_channel_scale * multiplier); continue; } + size_t depth = 1; intel_dnn_component_t component; component.num_columns_in = weightDims[1]; component.num_rows_in = weightDims[0]; + if (LayerInfo(weightableLayer).isConvolution()) { + depth = (weightDims.size() == 4)? weightDims[3]: 1; + } + intel_piecewiselinear_t *transform = reinterpret_cast(&component.op.pwl); transform->func_id = gnaFakeQuantizeLayer.parseAsActivation(); auto quantizedWeightsData = quantizedWeights->buffer(); - component.ptr_inputs = quantizedWeightsData.as(); - auto dequantizedWeights = make_shared_blob(TensorDesc(Precision::FP32, {outputSize}, Layout::C)); dequantizedWeights->allocate(); auto resultBuffer = dequantizedWeights->buffer(); - component.ptr_outputs = resultBuffer.as(); + for (size_t i = 0; i < depth; ++i) { + component.ptr_inputs = quantizedWeightsData.as() + i * component.num_columns_in * component.num_rows_in; + component.ptr_outputs = resultBuffer.as() + i * component.num_columns_in * component.num_rows_in; - PwlApply32(&component, 0, component.num_rows_in - 1, 0, component.num_columns_in - 1); + PwlApply32(&component, 0, component.num_rows_in - 1, 0, component.num_columns_in - 1); + } // 3. assign dequantized const blob to weightable layer assignWeightsAndBiases(weightableLayer, dequantizedWeights, biases); @@ -1944,6 +1927,97 @@ void MoveFakeQuantizeLayerIntoQuantParamsPass :: run() { auto donotSkip = [](CNNLayerPtr) { return false; }; + + auto allowFQFuse = [](CNNLayerPtr layer) -> bool { + auto doNotSkup = [](CNNLayerPtr layer) { + return false; + }; + + if (CNNNetGetAllNextLayersSkipCertain(layer, -1, doNotSkup).empty()) { + return false; + } + + auto skipNonFunctional = [](CNNLayerPtr layer) { + return LayerInfo(layer).isNonFunctional(); + }; + + auto prevLayer = CNNNetPrevLayerSkipCertain(layer, 0, skipNonFunctional); + if (LayerInfo(prevLayer).isActivation() || LayerInfo(prevLayer).isConst()) { + return true; + } + + auto nextLayers = CNNNetGetAllNextLayersSkipCertain(layer, -1, skipNonFunctional); + for (auto& l : nextLayers) { + if (!LayerInfo(l).isActivation()) { + return false; + } + } + + return true; + }; + + std::function propagateStatistics = + [&propagateStatistics](QuantizedLayerParams* srcQuantParams, CNNLayerPtr layer) { + if (LayerInfo(layer).isFakeQuantize()) { + return; + } + + auto donotSkip = [](CNNLayerPtr) { + return false; + }; + + auto quantParams = InferenceEngine::getInjectedData(layer); + + // Find all output layers connected to FQ + auto nextLayers = CNNNetGetAllNextLayersSkipCertain(layer.get(), -1, donotSkip); + if (nextLayers.empty()) { + quantParams->_src_quant.CopyStats(srcQuantParams->_dst_quant); + if (LayerInfo(layer).isNonFunctional()) { + quantParams->_dst_quant.CopyStats(srcQuantParams->_dst_quant); + } + return; + } + + auto srcMinVals = srcQuantParams->_dst_quant.GetMinValues().front(); + auto srcMaxVals = srcQuantParams->_dst_quant.GetMaxValues().front(); + // If a next layer is concat, find minimum nad maximum statistics + if (LayerInfo(layer).isConcat() && quantParams->_src_quant.IsStatsSet()) { + auto concatMinVal = quantParams->_src_quant.GetMinValues().front(); + auto concatMaxVal = quantParams->_src_quant.GetMaxValues().front(); + quantParams->_src_quant.SetMinValues({ std::min(srcMinVals, concatMinVal) }); + quantParams->_src_quant.SetMaxValues({ std::max(srcMaxVals, concatMaxVal) }); + } else if (quantParams->_src_quant.IsStatsSet()) { 
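// Editor's note: statistics were already propagated to this layer from another FakeQuantize path;
// keep the existing range and stop here instead of overwriting it.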
+ return; + } else { + quantParams->_src_quant.CopyStats(srcQuantParams->_dst_quant); + } + + if (!LayerInfo(layer).isWeightable() && !LayerInfo(layer).isEltwise() && + !LayerInfo(layer).isActivation() && !LayerInfo(layer).isFakeQuantize()) { + auto doNotSetDstStats = false; + for (auto& l : nextLayers) { + if (LayerInfo(l).isFakeQuantize()) { + doNotSetDstStats = true; + continue; + } + } + + if (doNotSetDstStats) { + return; + } + + quantParams->_dst_quant.CopyStats(quantParams->_src_quant); + + for (auto& l : nextLayers) { + if (LayerInfo(l).isFakeQuantize()) { + continue; + } + + propagateStatistics(quantParams, l); + } + } + }; + for (auto &&l : *pLayers) { if (!LayerInfo(l).isFakeQuantize()) { continue; @@ -1956,28 +2030,56 @@ void MoveFakeQuantizeLayerIntoQuantParamsPass :: run() { auto inputRange = fqLayer.getInputRange(); auto outputRange = fqLayer.getOutputRange(); - if (inputRange.second.size() != 1 || inputRange.second.size() != 1 || - outputRange.second.size() != 1 || outputRange.second.size() != 1) { + if (inputRange.first.size() != 1 || inputRange.second.size() != 1 || + outputRange.first.size() != 1 || outputRange.second.size() != 1) { THROW_GNA_LAYER_EXCEPTION(fqLayer) << " unsupported per-channel quantisation"; } + if (!LayerInfo(prevLayer).isConst() && + !fp32eq(inputRange.first.front(), outputRange.first.front()) && + !fp32eq(inputRange.second.front(), outputRange.second.front())) { + THROW_GNA_LAYER_EXCEPTION(fqLayer) << " unsupported data range conversion. Input: (" << + inputRange.first.front() << "," << inputRange.second.front() << "), output: (" << + outputRange.first.front() << "," << outputRange.second.front() << ")"; + } + float fqLevels = fqLayer.getLevels(); - float scaleOutputs = (fqLevels - 1) / (outputRange.second[0] - outputRange.first[0]); // Before FQ layer is removed, the previous layer has to be updated with its quantization data auto quantParamsPrevLayer = InferenceEngine::getInjectedData(prevLayer); - quantParamsPrevLayer->_dst_quant.SetScale(scaleOutputs); quantParamsPrevLayer->_dst_quant.SetLevels(fqLevels); - quantParamsPrevLayer->_dst_quant.SetMinValues({ inputRange.first[0] }); - quantParamsPrevLayer->_dst_quant.SetMaxValues({ inputRange.second[0] }); + quantParamsPrevLayer->_dst_quant.SetMinValues({ inputRange.first[0] }, true); + quantParamsPrevLayer->_dst_quant.SetMaxValues({ inputRange.second[0] }, true); + quantParamsPrevLayer->_dst_quant.SetMinValues({ outputRange.first[0] }, false); + quantParamsPrevLayer->_dst_quant.SetMaxValues({ outputRange.second[0] }, false); + auto fqQauntParams = InferenceEngine::getInjectedData(l); + fqQauntParams->_dst_quant.SetLevels(fqLevels); + fqQauntParams->_dst_quant.SetMinValues({ inputRange.first[0] }, true); + fqQauntParams->_dst_quant.SetMaxValues({ inputRange.second[0] }, true); + fqQauntParams->_dst_quant.SetMinValues({ outputRange.first[0] }, false); + fqQauntParams->_dst_quant.SetMaxValues({ outputRange.second[0] }, false); + fqQauntParams->_src_quant = fqQauntParams->_dst_quant; + + l->insData.resize(1); + if (!CNNNetHasPrevLayer(prevLayer.get())) { + quantParamsPrevLayer->_src_quant = quantParamsPrevLayer->_dst_quant; + } + + // Allow FQ Fuse checks if FQ layer can be fused to a layer before or after. + // FQ Layer is fused only when previous layer is const or activation layer + // or a next layer is activation layer. 
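// Editor's note: when fusing is not allowed, the FakeQuantize layer stays in the graph and is later
// lowered as a PWL activation (CreateLayerPrimitive now maps "FakeQuantize" to PWLPrimitive); its
// range statistics are still propagated to the downstream layers below.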
+ bool isFQFuseAllowed = allowFQFuse(l); auto prevData = prevLayer->outData.front(); - getInputTo(prevLayer->outData.front()).clear(); // Find all output layers connected to FQ auto nextLayers = CNNNetGetAllNextLayersSkipCertain(*fqLayer, -1, donotSkip); if (nextLayers.empty()) { - THROW_GNA_LAYER_EXCEPTION(fqLayer) << " fake quantize does not have any output layers connected"; + return; + } + + if (isFQFuseAllowed) { + getInputTo(prevLayer->outData.front()).clear(); } // Connect all next layers after FQ to the layer that is before FQ @@ -1989,16 +2091,12 @@ void MoveFakeQuantizeLayerIntoQuantParamsPass :: run() { << LAYER_NAME(nextLayers[i]) << " is not correct"; } - nextLayers[i]->insData[insDatas.front()] = prevData; - getInputTo(prevLayer->outData.front())[nextLayers[i]->name] = nextLayers[i]; + if (isFQFuseAllowed) { + nextLayers[i]->insData[insDatas.front()] = prevData; + getInputTo(prevLayer->outData.front())[nextLayers[i]->name] = nextLayers[i]; + } - // After layer gets removed lets absorb its params in QuantParams structure - // replacing scale factor from this fq layer - auto quantParamsNextLayer = InferenceEngine::getInjectedData(nextLayers[i]); - quantParamsNextLayer->_src_quant.SetScale(scaleOutputs); - quantParamsNextLayer->_src_quant.SetLevels(fqLevels); - quantParamsNextLayer->_src_quant.SetMinValues({ outputRange.first[0] }); - quantParamsNextLayer->_src_quant.SetMaxValues({ outputRange.second[0] }); + propagateStatistics(quantParamsPrevLayer, nextLayers[i]); } } } @@ -2013,7 +2111,9 @@ int PassManager::run(int index) { ordered_properties &printed_properties, ordered_properties &node_properties) {}); #endif +#ifdef ENABLE_V7_SERIALIZE network.serialize(name + ".xml", name + ".bin"); +#endif }; #else auto dumpNetworkAfterPass = [] (std::shared_ptr ) {}; diff --git a/inference-engine/src/gna_plugin/runtime/pwl.cpp b/inference-engine/src/gna_plugin/runtime/pwl.cpp index 4c2a07aa954..8d8528a0b11 100644 --- a/inference-engine/src/gna_plugin/runtime/pwl.cpp +++ b/inference-engine/src/gna_plugin/runtime/pwl.cpp @@ -499,22 +499,41 @@ std::vector pwl_search(const DnnActivation& activation_type, void PwlDesignOpt16(const DnnActivation activation_type, std::vector &ptr_segment, const float scale_in, - const float scale_out) { + const float scale_out, + const float pwlMaxErrorPercent) { std::vector pwl; double err_pct = 0.0; + auto minInputStats = 0.0f; + auto maxInputStats = 0.0f; + if (activation_type.srcFQParams.set) { + minInputStats = std::min(*activation_type.srcFQParams.input_low, *activation_type.srcFQParams.input_high) * 1.25f; + maxInputStats = std::max(*activation_type.srcFQParams.input_low, *activation_type.srcFQParams.input_high) * 1.25f; + } switch (activation_type) { - case kActSigmoid: - pwl = pwl_search(activation_type, -SIGMOID_DOMAIN, SIGMOID_DOMAIN, PWL_DESIGN_THRESHOLD, PWL_MAX_ERR_PERCENT, PWL_DESIGN_SAMPLES, err_pct); - make_gna_pwl(activation_type, pwl, -SIGMOID_DOMAIN, SIGMOID_DOMAIN, scale_in, scale_out, ptr_segment); + case kActSigmoid: { + auto absMax = std::max(std::abs(minInputStats), std::abs(maxInputStats)); + auto minInput = (activation_type.srcFQParams.set && absMax < SIGMOID_DOMAIN) ? -absMax : -SIGMOID_DOMAIN; + auto maxInput = (activation_type.srcFQParams.set && absMax < SIGMOID_DOMAIN) ? 
absMax : SIGMOID_DOMAIN; + pwl = pwl_search(activation_type, minInput, maxInput, PWL_DESIGN_THRESHOLD, pwlMaxErrorPercent, PWL_DESIGN_SAMPLES, err_pct); + make_gna_pwl(activation_type, pwl, minInput, maxInput, scale_in, scale_out, ptr_segment); break; - case kActTanh: - pwl = pwl_search(activation_type, -TANH_DOMAIN, TANH_DOMAIN, PWL_DESIGN_THRESHOLD, PWL_MAX_ERR_PERCENT, PWL_DESIGN_SAMPLES, err_pct); - make_gna_pwl(activation_type, pwl, -TANH_DOMAIN, TANH_DOMAIN, scale_in, scale_out, ptr_segment); + } + case kActTanh: { + auto absMax = std::max(std::abs(minInputStats), std::abs(maxInputStats)); + auto minInput = (activation_type.srcFQParams.set && absMax < TANH_DOMAIN) ? -absMax : -TANH_DOMAIN; + auto maxInput = (activation_type.srcFQParams.set && absMax < TANH_DOMAIN) ? absMax : TANH_DOMAIN; + pwl = pwl_search(activation_type, minInput, maxInput, PWL_DESIGN_THRESHOLD, pwlMaxErrorPercent, PWL_DESIGN_SAMPLES, err_pct); + make_gna_pwl(activation_type, pwl, minInput, maxInput, scale_in, scale_out, ptr_segment); break; - case kActSoftSign: - pwl = pwl_search(activation_type, -SOFTSIGN_DOMAIN, SOFTSIGN_DOMAIN, PWL_DESIGN_THRESHOLD, PWL_MAX_ERR_PERCENT, PWL_DESIGN_SAMPLES, err_pct); - make_gna_pwl(activation_type, pwl, -SOFTSIGN_DOMAIN, SOFTSIGN_DOMAIN, scale_in, scale_out, ptr_segment); + } + case kActSoftSign: { + auto absMax = std::max(std::abs(minInputStats), std::abs(maxInputStats)); + auto minInput = (activation_type.srcFQParams.set && absMax < SOFTSIGN_DOMAIN) ? -absMax : -SOFTSIGN_DOMAIN; + auto maxInput = (activation_type.srcFQParams.set && absMax < SOFTSIGN_DOMAIN) ? absMax : SOFTSIGN_DOMAIN; + pwl = pwl_search(activation_type, minInput, maxInput, PWL_DESIGN_THRESHOLD, pwlMaxErrorPercent, PWL_DESIGN_SAMPLES, err_pct); + make_gna_pwl(activation_type, pwl, minInput, maxInput, scale_in, scale_out, ptr_segment); break; + } case kActRelu: make_gna_pwl(activation_type, pwl, -1.0, 1.0, scale_in, scale_out, ptr_segment); break; @@ -522,6 +541,7 @@ void PwlDesignOpt16(const DnnActivation activation_type, make_gna_pwl(activation_type, pwl, -1.0, 1.0, scale_in, scale_out, ptr_segment); break; case kActIdentity: + case kActFakeQuantize: make_gna_pwl(activation_type, pwl, -1.0, 1.0, scale_in, scale_out, ptr_segment); break; case kActKaldiLstmClipping: @@ -530,28 +550,28 @@ void PwlDesignOpt16(const DnnActivation activation_type, case kActLog: { double x_min = (1 + ~XBASEMASK) / scale_in; double x_max = ((INT32_MAX / scale_in) < LOG_DOMAIN) ? (INT32_MAX / scale_in) : LOG_DOMAIN; - pwl = pwl_search(activation_type, x_min, x_max, PWL_DESIGN_THRESHOLD, 0.066*PWL_MAX_ERR_PERCENT, PWL_DESIGN_SAMPLES, err_pct); + pwl = pwl_search(activation_type, x_min, x_max, PWL_DESIGN_THRESHOLD, pwlMaxErrorPercent, PWL_DESIGN_SAMPLES, err_pct); make_gna_pwl(activation_type, pwl, x_min, x_max, scale_in, scale_out, ptr_segment); break; } case kActNegLog: { double x_min = (1 + ~XBASEMASK) / scale_in; double x_max = ((INT32_MAX / scale_in) < LOG_DOMAIN) ? (INT32_MAX / scale_in) : LOG_DOMAIN; - pwl = pwl_search(activation_type, x_min, x_max, PWL_DESIGN_THRESHOLD, 0.066*PWL_MAX_ERR_PERCENT, PWL_DESIGN_SAMPLES, err_pct); + pwl = pwl_search(activation_type, x_min, x_max, PWL_DESIGN_THRESHOLD, pwlMaxErrorPercent, PWL_DESIGN_SAMPLES, err_pct); make_gna_pwl(activation_type, pwl, x_min, x_max, scale_in, scale_out, ptr_segment); break; } case kActNegHalfLog: { double x_min = (1 + ~XBASEMASK) / scale_in; double x_max = ((INT32_MAX / scale_in) < LOG_DOMAIN) ? 
(INT32_MAX / scale_in) : LOG_DOMAIN; - pwl = pwl_search(activation_type, x_min, x_max, PWL_DESIGN_THRESHOLD, 0.066*PWL_MAX_ERR_PERCENT, PWL_DESIGN_SAMPLES, err_pct); + pwl = pwl_search(activation_type, x_min, x_max, PWL_DESIGN_THRESHOLD, pwlMaxErrorPercent, PWL_DESIGN_SAMPLES, err_pct); make_gna_pwl(activation_type, pwl, x_min, x_max, scale_in, scale_out, ptr_segment); break; } case kActExp: { double x_min = -log(scale_out); double x_max = x_min + log(INT16_MAX); - pwl = pwl_search(activation_type, x_min, x_max, PWL_DESIGN_THRESHOLD, 0.5*PWL_MAX_ERR_PERCENT, PWL_DESIGN_SAMPLES, err_pct); + pwl = pwl_search(activation_type, x_min, x_max, PWL_DESIGN_THRESHOLD, pwlMaxErrorPercent, PWL_DESIGN_SAMPLES, err_pct); make_gna_pwl(activation_type, pwl, x_min, x_max, scale_in, scale_out, ptr_segment); break; } @@ -576,7 +596,8 @@ void PwlDesignOpt16(const DnnActivation activation_type, x_max = std::min(x_max, POW_DOMAIN); if (activation_type.args.pow.exponent != 0.0f && activation_type.args.pow.exponent != 1.0f) { - pwl = pwl_search(activation_type, x_min, x_max, PWL_DESIGN_THRESHOLD, 0.015 * PWL_MAX_ERR_PERCENT, PWL_DESIGN_SAMPLES, err_pct); + auto maxError = pwlMaxErrorPercent > 0.015f? 0.015f: pwlMaxErrorPercent; + pwl = pwl_search(activation_type, x_min, x_max, PWL_DESIGN_THRESHOLD, maxError, PWL_DESIGN_SAMPLES, err_pct); } make_gna_pwl(activation_type, pwl, x_min, x_max, scale_in, scale_out, ptr_segment); @@ -980,15 +1001,14 @@ void PwlApply32(intel_dnn_component_t *component, break; case kActKaldiLstmClipping: { float upper_limit = component->op.pwl.func_id.args.clamp.high; - float lowwer_limit = component->op.pwl.func_id.args.clamp.low; + float lower_limit = component->op.pwl.func_id.args.clamp.low; for (uint32_t i = num_row_start; i <= num_row_end; i++) { for (uint32_t j = num_col_start; j <= num_col_end; j++) { float val = ptr_in[i * num_columns + j]; - if (val > upper_limit) { ptr_out[i * num_columns + j] = upper_limit; - } else if (val < lowwer_limit) { - ptr_out[i * num_columns + j] = lowwer_limit; + } else if (val < lower_limit) { + ptr_out[i * num_columns + j] = lower_limit; } else { ptr_out[i * num_columns + j] = val; } @@ -1050,32 +1070,36 @@ void PwlApply32(intel_dnn_component_t *component, } break; case kActFakeQuantize: { - auto levels = transform->func_id.args.fakeQuantize.levels; + bool clamping = true; + double levels = transform->func_id.fqParams.levels; for (uint32_t i = num_row_start; i <= num_row_end; i++) { - auto inputChannel = transform->func_id.args.fakeQuantize.inputPerChannel ? i : 0; - auto outputChannel = transform->func_id.args.fakeQuantize.outputPerChannel ? i : 0; + auto inputChannel = transform->func_id.fqParams.inputPerChannel ? i : 0; + auto outputChannel = transform->func_id.fqParams.outputPerChannel ? 
i : 0; - auto input_low = transform->func_id.args.fakeQuantize.input_low[inputChannel]; - auto input_high = transform->func_id.args.fakeQuantize.input_high[inputChannel]; - auto output_low = transform->func_id.args.fakeQuantize.output_low[outputChannel]; - auto output_high = transform->func_id.args.fakeQuantize.output_high[outputChannel]; + double input_low = transform->func_id.fqParams.input_low[inputChannel]; + double input_high = transform->func_id.fqParams.input_high[inputChannel]; + double output_low = transform->func_id.fqParams.output_low[outputChannel]; + double output_high = transform->func_id.fqParams.output_high[outputChannel]; - // TODO: this special modification for spedup-compute give different result with straight FQ formulae - // but this used in reference graph FakeQuantize implementations so we need to honor it for a while - float scaleInput = (input_high - input_low) / (levels-1); - float scaleOutputs = (output_high - output_low) / (levels-1); + auto scaleInput = (levels - 1) / (input_high - input_low); + auto scaleOutput = (levels - 1) / (output_high - output_low); for (uint32_t j = num_col_start; j <= num_col_end; j++) { auto offset = i * num_columns + j; auto x = ptr_in[offset]; + if (!clamping) { + ptr_out[offset] = ptr_in[offset] * scaleInput / scaleOutput; + continue; + } - if (x < std::min(input_low, input_high)) { + if (x <= std::min(input_low, input_high)) { ptr_out[offset] = output_low; } else if (x > std::max(input_low, input_high)) { ptr_out[offset] = output_high; } else { - ptr_out[offset] = nearbyint((x - input_low) / scaleInput) * scaleOutputs + output_low; + ptr_out[offset] = nearbyint((x - input_low) / (input_high - input_low) * (levels - 1)) / + (levels - 1) * (output_high - output_low) + output_low; } } } diff --git a/inference-engine/src/gna_plugin/runtime/pwl.h b/inference-engine/src/gna_plugin/runtime/pwl.h index 86b3cfb93e7..7e8fbbb5a69 100644 --- a/inference-engine/src/gna_plugin/runtime/pwl.h +++ b/inference-engine/src/gna_plugin/runtime/pwl.h @@ -103,4 +103,5 @@ void PwlDesign16(const DnnActivation activation_type, void PwlDesignOpt16(const DnnActivation activation_type, std::vector &ptr_segment, const float scale_in, - const float scale_out); + const float scale_out, + const float pwlMaxErrorPercent); diff --git a/inference-engine/tests/functional/plugin/gna/pass_tests/fq_activation.cpp b/inference-engine/tests/functional/plugin/gna/pass_tests/fq_activation.cpp new file mode 100644 index 00000000000..e0e34949cf7 --- /dev/null +++ b/inference-engine/tests/functional/plugin/gna/pass_tests/fq_activation.cpp @@ -0,0 +1,134 @@ +// Copyright (C) 2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include +#include +#include +#include +#include + +#include + +#include "common_test_utils/common_utils.hpp" +#include "functional_test_utils/plugin_cache.hpp" +#include "shared_test_classes/base/layer_test_utils.hpp" +#include "functional_test_utils/blob_utils.hpp" +#include "ngraph_functions/utils/ngraph_helpers.hpp" +#include "ngraph_functions/builders.hpp" + +#include "ngraph_functions/pass/convert_prc.hpp" + +typedef std::tuple< + InferenceEngine::Precision, // Network Precision + std::string, // Target Device + std::map, // Configuration + std::vector, // Input Shape + std::pair, // Input Min and Max + size_t // Levels +> fqActivationParams; + +namespace LayerTestsDefinitions { + +class FQActivation : public testing::WithParamInterface, + public LayerTestsUtils::LayerTestsCommon { + float inputDataMin = 0.0f; + float inputDataMax = 
0.0f; + float inputDataResolution = 1.0f; + +public: + static std::string getTestCaseName(testing::TestParamInfo obj) { + InferenceEngine::Precision netPrecision; + std::string targetDevice; + std::map configuration; + std::vector inputShape; + std::pair inputMinMax; + size_t levels = 0; + std::tie(netPrecision, targetDevice, configuration, inputShape, inputMinMax, levels) = obj.param; + + std::ostringstream result; + result << "netPRC=" << netPrecision.name() << "_"; + result << "targetDevice=" << targetDevice << "_"; + for (auto const& configItem : configuration) { + result << "_configItem=" << configItem.first << "_" << configItem.second; + } + result << "_inputShape=" << CommonTestUtils::vec2str(inputShape); + result << "_inputMinMax=(" << inputMinMax.first << ".." << inputMinMax.second << ")"; + result << "_levels=" << levels; + + return result.str(); + } + + InferenceEngine::Blob::Ptr GenerateInput(const InferenceEngine::InputInfo& info) const { + return FuncTestUtils::createAndFillBlob(info.getTensorDesc(), inputDataMax - inputDataMin, inputDataMin, 1 / inputDataResolution); + } + +protected: + void SetUp() override { + InferenceEngine::Precision netPrecision; + + std::vector inputShape; + std::pair inputMinMax; + size_t levels = 0; + std::tie(netPrecision, targetDevice, configuration, inputShape, inputMinMax, levels) = this->GetParam(); + auto ngPrc = FuncTestUtils::PrecisionUtils::convertIE2nGraphPrc(netPrecision); + + auto inputLowNode = ngraph::builder::makeConstant(ngPrc, { 1 }, { inputMinMax.first }); + auto inputHighNode = ngraph::builder::makeConstant(ngPrc, { 1 }, { inputMinMax.second }); + + auto inputVector = ngraph::builder::makeParams(ngPrc, { inputShape }); + auto inputFQNode = std::make_shared(inputVector[0], + inputLowNode, inputHighNode, inputLowNode, inputHighNode, levels); + + auto relu = ngraph::builder::makeActivation(inputFQNode, ngraph::element::f32, ngraph::helpers::ActivationTypes::Relu); + auto reluFQNode = std::make_shared(relu, + inputLowNode, inputHighNode, inputLowNode, inputHighNode, levels); + + ngraph::ResultVector results{ std::make_shared(reluFQNode) }; + function = std::make_shared(results, inputVector, "FQActivation"); + } +}; + + +TEST_P(FQActivation, CompareWithRefImpl) { + Run(); +}; + +const std::vector netPrecisions = { + InferenceEngine::Precision::FP32, + InferenceEngine::Precision::FP16 +}; + +const std::vector> configs = { + { + {"GNA_DEVICE_MODE", "GNA_SW_EXACT"}, + } +}; + +const std::vector> inputShape = { + {1, 1024}, +}; + +const std::vector> inputMinMax = { + {-0.5, 0.5}, + {-2, 2}, + {-8, 8}, + {-16, 16}, + {-50, 50}, + {-100, 100}, +}; + +const std::vector levels = { + 65535, +}; + +INSTANTIATE_TEST_CASE_P(smoke_fq_activation, FQActivation, + ::testing::Combine( + ::testing::ValuesIn(netPrecisions), + ::testing::Values(CommonTestUtils::DEVICE_GNA), + ::testing::ValuesIn(configs), + ::testing::ValuesIn(inputShape), + ::testing::ValuesIn(inputMinMax), + ::testing::ValuesIn(levels)), + FQActivation::getTestCaseName); +} // namespace LayerTestsDefinitions diff --git a/inference-engine/tests/unit/gna/gna_plugin_config_test.cpp b/inference-engine/tests/unit/gna/gna_plugin_config_test.cpp index 394504414b3..7b9337a10fb 100644 --- a/inference-engine/tests/unit/gna/gna_plugin_config_test.cpp +++ b/inference-engine/tests/unit/gna/gna_plugin_config_test.cpp @@ -20,6 +20,7 @@ const std::map supportedConfigKeysWithDefaults = { {CONFIG_KEY(EXCLUSIVE_ASYNC_REQUESTS), CONFIG_VALUE(NO)}, {GNA_CONFIG_KEY(PRECISION), 
Precision(Precision::I16).name()}, {GNA_CONFIG_KEY(PWL_UNIFORM_DESIGN), CONFIG_VALUE(NO)}, + {GNA_CONFIG_KEY(PWL_MAX_ERROR_PERCENT), "1.000000"}, {CONFIG_KEY(PERF_COUNT), CONFIG_VALUE(NO)}, {GNA_CONFIG_KEY(LIB_N_THREADS), "1"}, {CONFIG_KEY(SINGLE_THREAD), CONFIG_VALUE(YES)} @@ -153,6 +154,17 @@ TEST_F(GNAPluginConfigTest, GnaConfigPwlUniformDesignTest) { config.gnaFlags.uniformPwlDesign); } +TEST_F(GNAPluginConfigTest, GnaConfigPwlMaxErrorPercentTest) { + SetAndCompare(GNA_CONFIG_KEY(PWL_MAX_ERROR_PERCENT), std::string("0.100000")); + EXPECT_FLOAT_EQ(config.gnaFlags.pwlMaxErrorPercent, 0.1f); + SetAndCompare(GNA_CONFIG_KEY(PWL_MAX_ERROR_PERCENT), std::string("1.000000")); + EXPECT_FLOAT_EQ(config.gnaFlags.pwlMaxErrorPercent, 1); + SetAndCompare(GNA_CONFIG_KEY(PWL_MAX_ERROR_PERCENT), std::string("5.000000")); + EXPECT_FLOAT_EQ(config.gnaFlags.pwlMaxErrorPercent, 5); + ExpectThrow(GNA_CONFIG_KEY(PWL_MAX_ERROR_PERCENT), "-1"); + ExpectThrow(GNA_CONFIG_KEY(PWL_MAX_ERROR_PERCENT), "100.1"); +} + TEST_F(GNAPluginConfigTest, GnaConfigPerfCountTest) { SetAndCheckFlag(CONFIG_KEY(PERF_COUNT), config.gnaFlags.performance_counting);
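// Editor's sketch (illustrative only, not part of the patch): setting the new key from application
// code. Assumes the InferenceEngine::Core API of this OpenVINO generation; "model.xml" is a
// placeholder path and the 5% value is only an example within the accepted (0, 100] range.
#include <map>
#include <string>
#include <gna/gna_config.hpp>
#include <ie_core.hpp>

int main() {
    InferenceEngine::Core ie;
    InferenceEngine::CNNNetwork network = ie.ReadNetwork("model.xml");
    // Allow up to 5% PWL approximation error instead of the default 1.0.
    std::map<std::string, std::string> config = {
        {"GNA_DEVICE_MODE", "GNA_SW_EXACT"},
        {GNA_CONFIG_KEY(PWL_MAX_ERROR_PERCENT), "5.000000"}
    };
    auto executableNetwork = ie.LoadNetwork(network, "GNA", config);
    return 0;
}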