diff --git a/inference-engine/include/gna/gna_config.hpp b/inference-engine/include/gna/gna_config.hpp index 8b91de3b066..7273f5a0332 100644 --- a/inference-engine/include/gna/gna_config.hpp +++ b/inference-engine/include/gna/gna_config.hpp @@ -43,12 +43,11 @@ namespace GNAConfigParams { DECLARE_GNA_CONFIG_KEY(SCALE_FACTOR); /** -* @brief By default gna api work in Int16 precision, however this can be adjusted if necessary, +* @brief By default gna api works with Int16 weights precision, however this can be adjusted if necessary, * currently supported values are I16, I8 */ DECLARE_GNA_CONFIG_KEY(PRECISION); - /** * @brief if turned on, dump GNA firmware model into specified file */ diff --git a/inference-engine/src/gna_plugin/backend/am_intel_dnn.cpp b/inference-engine/src/gna_plugin/backend/am_intel_dnn.cpp index e3b0f417ff0..6257d8da47d 100644 --- a/inference-engine/src/gna_plugin/backend/am_intel_dnn.cpp +++ b/inference-engine/src/gna_plugin/backend/am_intel_dnn.cpp @@ -824,20 +824,38 @@ void GNAPluginNS::backend::AMIntelDNN::WriteDnnText(const char *filename, intel_ std::ofstream out_bfile((out_file_name.str() + "_biases.txt").c_str(), std::ios::out); if (num_bytes_per_weight == 1) { - int8_t *ptr_weight = reinterpret_cast(component[i].op.affine.ptr_weights); - gna_compound_bias_t *ptr_bias = reinterpret_cast(component[i].op.affine.ptr_biases); + if (num_bytes_per_bias != 1) { + int8_t* ptr_weight = reinterpret_cast(component[i].op.affine.ptr_weights); + gna_compound_bias_t* ptr_bias = reinterpret_cast(component[i].op.affine.ptr_biases); #ifdef DUMP_WB - for (uint32_t row = 0; row < num_weight_rows; row++) { - for (uint32_t col = 0; col < num_weight_columns; col++) { - if (logging_precision == kDnnFloat) { - float val = - static_cast(ptr_weight[row * num_weight_columns + col]) * ptr_bias[row].multiplier + for (uint32_t row = 0; row < num_weight_rows; row++) { + for (uint32_t col = 0; col < num_weight_columns; col++) { + if (logging_precision == kDnnFloat) { + float val = + static_cast(ptr_weight[row * num_weight_columns + col]) * ptr_bias[row].multiplier / weight_scale_factor; - out_wfile << std::setprecision(4) << val << " "; - } else { - out_wfile << int((int8_t) ptr_weight[row * num_weight_columns + col]) << " "; + out_wfile << std::setprecision(4) << val << " "; + } else { + out_wfile << int((int8_t)ptr_weight[row * num_weight_columns + col]) << " "; + } + out_wfile << "\n"; + } + } +#endif + } else { + int8_t* ptr_weight = reinterpret_cast(component[i].op.affine.ptr_weights); +#ifdef DUMP_WB + for (uint32_t row = 0; row < num_weight_rows; row++) { + for (uint32_t col = 0; col < num_weight_columns; col++) { + if (logging_precision == kDnnFloat) { + float val = + static_cast(ptr_weight[row * num_weight_columns + col]) / weight_scale_factor; + out_wfile << std::setprecision(4) << val << " "; + } else { + out_wfile << int((int8_t)ptr_weight[row * num_weight_columns + col]) << " "; + } + out_wfile << "\n"; } - out_wfile << "\n"; } } #endif @@ -873,18 +891,31 @@ void GNAPluginNS::backend::AMIntelDNN::WriteDnnText(const char *filename, intel_ } if (compute_precision_ == kDnnInt) { if (num_bytes_per_weight == 1) { - gna_compound_bias_t - *ptr_biases = reinterpret_cast(component[i].op.affine.ptr_biases); + if (num_bytes_per_bias != 1) { + gna_compound_bias_t + * ptr_biases = reinterpret_cast(component[i].op.affine.ptr_biases); #ifdef DUMP_WB - for (uint32_t row = 0; row < num_rows_out; row++) { - if (logging_precision == kDnnInt) { - out_bfile << std::setw(8) << ptr_biases[row].bias << ", 
"; - out_bfile << std::setw(8) << int(ptr_biases[row].multiplier) << "\n"; - } else { - out_bfile << std::setw(8) << ptr_biases[row].bias / output_scale_factor << "\n"; + for (uint32_t row = 0; row < num_rows_out; row++) { + if (logging_precision == kDnnInt) { + out_bfile << std::setw(8) << ptr_biases[row].bias << ", "; + out_bfile << std::setw(8) << int(ptr_biases[row].multiplier) << "\n"; + } else { + out_bfile << std::setw(8) << ptr_biases[row].bias / output_scale_factor << "\n"; + } } - } #endif + } else { + int8_t *ptr_biases = reinterpret_cast(component[i].op.affine.ptr_biases); +#ifdef DUMP_WB + for (uint32_t row = 0; row < num_rows_out; row++) { + if (logging_precision == kDnnInt) { + out_bfile << std::setw(8) << ptr_biases[row] << "\n"; + } else { + out_bfile << std::setw(8) << ptr_biases[row] / output_scale_factor << "\n"; + } + } +#endif + } } else { int32_t *ptr_biases = reinterpret_cast(component[i].op.affine.ptr_biases); #ifdef DUMP_WB @@ -2102,9 +2133,12 @@ void GNAPluginNS::backend::AMIntelDNN::WriteInputAndOutputText() { } else { floatValue = reinterpret_cast(component[i].ptr_outputs)[k * component[i].num_columns_out+ j]; } - } else { + } else if (component[i].num_bytes_per_output == 2) { auto value = reinterpret_cast(component[i].ptr_outputs)[k * component[i].num_columns_out+ j]; floatValue = static_cast(value); + } else { + auto value = reinterpret_cast(component[i].ptr_outputs)[k * component[i].num_columns_out + j]; + floatValue = static_cast(value); } floatValue /= component[i].output_scale_factor; out_file << std::setw(8) << floatValue << "\n"; @@ -2142,10 +2176,14 @@ void GNAPluginNS::backend::AMIntelDNN::WriteInputAndOutputText() { } else { floatValue = reinterpret_cast(component[i].ptr_inputs)[k * component[i].num_columns_in + j]; } - } else { + } else if (component[i].num_bytes_per_input == 2) { auto value = reinterpret_cast(component[i].ptr_inputs)[k * component[i].num_columns_in+ j]; floatValue = static_cast(value); + } else { + auto value = reinterpret_cast(component[i].ptr_inputs)[k * component[i].num_columns_in + j]; + floatValue = static_cast(value); } + in_file << std::setw(8) << floatValue / input_scale_factor << "\n"; } } diff --git a/inference-engine/src/gna_plugin/backend/gna_limitations.hpp b/inference-engine/src/gna_plugin/backend/gna_limitations.hpp index f293b7110cf..97539736283 100644 --- a/inference-engine/src/gna_plugin/backend/gna_limitations.hpp +++ b/inference-engine/src/gna_plugin/backend/gna_limitations.hpp @@ -13,6 +13,8 @@ constexpr uint32_t convMinFiltersNum = 4; constexpr uint32_t convMaxFiltersNum = 65532; constexpr uint32_t convFiltersNumDivider = 4; constexpr uint32_t convEachKernelByteAlignment = 16; +constexpr uint32_t noOfInputsDivisor = 8; +constexpr uint32_t noOfInputsLowPrecDivisor = 16; } } // namespace GNAPluginNS diff --git a/inference-engine/src/gna_plugin/backend/make_pwl.cpp b/inference-engine/src/gna_plugin/backend/make_pwl.cpp index c63ab7a3145..6190a89540f 100644 --- a/inference-engine/src/gna_plugin/backend/make_pwl.cpp +++ b/inference-engine/src/gna_plugin/backend/make_pwl.cpp @@ -18,6 +18,7 @@ void make_gna_pwl(const DnnActivation fun, const double u_bound, const double in_scale, const double out_scale, + const bool low_precision, std::vector &gna_pwl) { pwl_gna_slope_scale_t s; uint32_t pwl_size = static_cast(pwl.size()); @@ -230,7 +231,7 @@ void make_gna_pwl(const DnnActivation fun, gnalog() << "=========================== LeakyReLU Segments ======================\n"; int32_t x_lower = INT32_MIN; int32_t x_upper = 
INT32_MAX; - int16_t y_lower = INT16_MIN; + int16_t y_lower = low_precision ? INT8_MIN : INT16_MIN; int16_t y_upper = INT16_MAX; if (fun.fqParams.set) { x_lower = FLOAT_TO_INT32(*fun.fqParams.input_low * 1.25 * in_scale); diff --git a/inference-engine/src/gna_plugin/backend/make_pwl.hpp b/inference-engine/src/gna_plugin/backend/make_pwl.hpp index eef981034ed..62d95210906 100644 --- a/inference-engine/src/gna_plugin/backend/make_pwl.hpp +++ b/inference-engine/src/gna_plugin/backend/make_pwl.hpp @@ -15,4 +15,5 @@ void make_gna_pwl(const DnnActivation fun, const double u_bound, const double in_scale, const double out_scale, + const bool low_precision, std::vector &gna_pwl); diff --git a/inference-engine/src/gna_plugin/descriptions/gna_flags.hpp b/inference-engine/src/gna_plugin/descriptions/gna_flags.hpp index 1669fe050fc..e55e36a5f1a 100644 --- a/inference-engine/src/gna_plugin/descriptions/gna_flags.hpp +++ b/inference-engine/src/gna_plugin/descriptions/gna_flags.hpp @@ -18,5 +18,6 @@ struct GNAFlags { bool sw_fp32 = false; bool fake_quantized = false; bool performance_counting = false; + bool input_low_precision = false; }; } // namespace GNAPluginNS diff --git a/inference-engine/src/gna_plugin/descriptions/gna_input_desc.cpp b/inference-engine/src/gna_plugin/descriptions/gna_input_desc.cpp index 3fc2d49afb6..8095d9cf4dd 100644 --- a/inference-engine/src/gna_plugin/descriptions/gna_input_desc.cpp +++ b/inference-engine/src/gna_plugin/descriptions/gna_input_desc.cpp @@ -18,7 +18,11 @@ size_t InputDesc::minBytesRequiredForStoreInput(CNNLayerPtr layer) { auto quantized = getInjectedData(layer); size_t precision_bytes; if (quantized) { - precision_bytes = 2; + if (quantized->lowPrecision) { + precision_bytes = 1; + } else { + precision_bytes = 2; + } } else { precision_bytes = 4; } diff --git a/inference-engine/src/gna_plugin/frontend/layer_quantizer.hpp b/inference-engine/src/gna_plugin/frontend/layer_quantizer.hpp index 5f78135a1b0..ac1c6bdf47a 100644 --- a/inference-engine/src/gna_plugin/frontend/layer_quantizer.hpp +++ b/inference-engine/src/gna_plugin/frontend/layer_quantizer.hpp @@ -25,6 +25,7 @@ namespace frontend { /** * @brief description of quantisation precision * @tparam Ip - input precision + * @tparam Op - output precision * @tparam Wp - weights precision * @tparam Bp - biases precision * @tparam Np - network precision - can be auto generated in future @@ -82,6 +83,12 @@ struct QuantI8 : public QuantDescTmpl { + QuantI8_I8() { + _Np = InferenceEngine::Precision::MIXED; + } +}; // for support proper trait instantiation for quantization function callback struct FakeQuantI16 : public QuantI16 {}; @@ -155,6 +162,17 @@ class Quant { } }; +template<> +class Quant { +public: + template + void operator()(Args && ... args) const { + QuantizationCallback { + std::forward(args)... 
+ }.runQuantize(); + } +}; + template<> class Quant { public: @@ -650,8 +668,8 @@ template class DataQuantizer : public DataQuantizerBase { public: explicit DataQuantizer(float scaleFactor) : DataQuantizerBase(scaleFactor) {} - bool operator()(InferenceEngine::WeightableLayer *wl) const { - quantizeWeightsBiasesConv(Desc::optional(), wl, Quant()); + bool operator()(InferenceEngine::ConvolutionLayer *cl) const { + quantizeWeightsBiasesConv(Desc::optional(), cl, Quant()); return true; } }; @@ -660,8 +678,8 @@ template class DataQuantizer : public DataQuantizerBase { public: explicit DataQuantizer(float scaleFactor) : DataQuantizerBase(scaleFactor) {} - bool operator()(InferenceEngine::ScaleShiftLayer *wl) const { - quantizeWeightsBiases(Desc::optional(), wl, Quant(), true); + bool operator()(InferenceEngine::ScaleShiftLayer *ssl) const { + quantizeWeightsBiases(Desc::optional(), ssl, Quant(), true); return true; } }; @@ -680,6 +698,7 @@ class LayersQuantizer : public frontend::DataQuantizerBase { using QuantI16 = frontend::QuantPair; using QuantI8 = frontend::QuantPair; +using QuantI8_I8 = frontend::QuantPair; using FakeQuantI16 = frontend::QuantPair; diff --git a/inference-engine/src/gna_plugin/frontend/model_quantizer.hpp b/inference-engine/src/gna_plugin/frontend/model_quantizer.hpp index 46b000e35df..1f3f125a029 100644 --- a/inference-engine/src/gna_plugin/frontend/model_quantizer.hpp +++ b/inference-engine/src/gna_plugin/frontend/model_quantizer.hpp @@ -26,7 +26,7 @@ template class ModelQuantizer { public: InferenceEngine::CNNNetwork quantize(const InferenceEngine::CNNNetwork &model, float scaleFactor) const { - return quantize(model, [](const InferenceEngine::CNNNetwork &, bool runBeforeCopy){}, std::vector({scaleFactor})); + return quantize(model, [](const InferenceEngine::CNNNetwork &, bool runBeforeCopy, bool lowPrecision){}, std::vector({scaleFactor})); } template @@ -35,7 +35,7 @@ class ModelQuantizer { } InferenceEngine::CNNNetwork quantize(const InferenceEngine::CNNNetwork &model, std::vector scaleFactor) const { - return quantize(model, [](InferenceEngine::CNNNetwork &, bool runBeforeCopy){}, scaleFactor); + return quantize(model, [](InferenceEngine::CNNNetwork &, bool runBeforeCopy, bool lowPrecision){}, scaleFactor); } template @@ -45,14 +45,15 @@ class ModelQuantizer { transformLayer(newLayer, WeightsConverter()); return newLayer; }; + bool lowPrecision = (T::mandatory().getInputPrecision().size() == sizeof(uint8_t)); InferenceEngine::CNNNetwork copiedNet = InferenceEngine::CNNNetCopy(model); - cb(copiedNet, true); + cb(copiedNet, true, lowPrecision); copiedNet = InferenceEngine::CNNNetCopy(copiedNet, visitor); // allow client code to access copied topology, to avoid copies if user would like to chain quantisation with // another preprocessing - cb(copiedNet, false); + cb(copiedNet, false, lowPrecision); if (scaleFactor.empty()) { THROW_GNA_EXCEPTION << "Scale factor is empty"; @@ -62,6 +63,8 @@ class ModelQuantizer { auto sortedNewNet = InferenceEngine::details::CNNNetSortTopologically(copiedNet); gnalog() << "Sorted layers: " << std::endl; for (auto &&layer : sortedNewNet) { + auto quantData = InferenceEngine::getInjectedData(layer); + quantData->lowPrecision = lowPrecision; gnalog() << layer->name << std::endl; } /// filling scale factors for input layers, memory layers will have scaleFactor of 1.0 by default @@ -79,7 +82,8 @@ class ModelQuantizer { } bool isFakeQuantize = std::is_same() || std::is_same(); - propagateScaleFactor(sortedNewNet, 
T::mandatory().getWeightsPrecision().size(), isFakeQuantize); + propagateScaleFactor(sortedNewNet, T::mandatory().getWeightsPrecision().size(), T::optional().getWeightsPrecision().size(), + T::mandatory().getInputPrecision().size(), isFakeQuantize); // sorted order gives possibility for propagate quantisation along depended layers for (auto &&layer : sortedNewNet) { @@ -90,8 +94,9 @@ class ModelQuantizer { } private : - void propagateScaleFactor(std::vector & net, int weightsBytesSize, bool fakeQuantize) const { - ScaleFactorCalculator sf(net, weightsBytesSize, fakeQuantize); + void propagateScaleFactor(std::vector & net, int mandWeightsBytesSize, + int optWeightsBytesSize, int inputsBytesSize, bool fakeQuantize) const { + ScaleFactorCalculator sf(net, mandWeightsBytesSize, optWeightsBytesSize, inputsBytesSize, fakeQuantize); while (!sf.allLayersProcessed()) { for (auto &&layer : sf.getStartLayers()) { diff --git a/inference-engine/src/gna_plugin/frontend/quantization.cpp b/inference-engine/src/gna_plugin/frontend/quantization.cpp index df060354f09..69dcc1ccb58 100644 --- a/inference-engine/src/gna_plugin/frontend/quantization.cpp +++ b/inference-engine/src/gna_plugin/frontend/quantization.cpp @@ -358,7 +358,6 @@ void QuantizationCallback::runQuantize() const { int8_t *ptr_weight_8 = ptr_int_weights + (row * num_columns_padded + col); rounding_value = (ptr_float_weights[row * num_columns + col] > 0) ? 0.5f : -0.5f; - value = ptr_float_weights[row * num_columns + col] * (*ptr_weight_scale_factor / ptr_int_biases[row].multiplier) + rounding_value; if (value > 127.0) { *ptr_weight_8 = 127; @@ -404,3 +403,57 @@ void QuantizationCallback::runQuantize() const { QUANTWARNING("Warning: %d / %d saturations in QuantizeAffine8()\n", num_saturate, num_rows * num_columns + num_rows); } } + +template<> +void QuantizationCallback::runQuantize() const { + uint32_t num_saturate = 0; + for (uint32_t row = 0; row < num_rows; row++) { + for (uint32_t col = 0; col < num_columns; col++) { + float rounding_value = (ptr_float_weights[row * num_columns + col] > 0) ? 0.5f : -0.5f; + float value = ptr_float_weights[row * num_columns + col] * *ptr_weight_scale_factor + rounding_value; + int8_t* ptr_weight_8 = ptr_int_weights + (row * num_columns_padded + col); + if (value > 127.0) { + *ptr_weight_8 = 127; + num_saturate++; + } else if (value < -128.0) { + *ptr_weight_8 = -128; + num_saturate++; + } else { + *ptr_weight_8 = (int8_t)value; + } + } + for (uint32_t col = num_columns; col < num_columns_padded; col++) { + int8_t* ptr_weight_8 = ptr_int_weights + (row * num_columns_padded + col); + *ptr_weight_8 = 0; + } + } + for (uint32_t row = num_rows; row < num_rows_padded; row++) { + for (uint32_t col = 0; col < num_columns_padded; col++) { + int8_t* ptr_weight_8 = ptr_int_weights + (row * num_columns_padded + col); + *ptr_weight_8 = 0; + } + } + + if (ptr_float_biases != nullptr && ptr_int_biases != nullptr) { + for (uint32_t j = 0; j < num_rows; j++) { + float rounding_value = (ptr_float_biases[j] > 0) ? 
0.5f : -0.5f; + float value = ptr_float_biases[j] * *ptr_output_scale_factor + rounding_value; + if (value > 127.0) { + ptr_int_biases[j] = 127; + num_saturate++; + } else if (value < -128.0) { + ptr_int_biases[j] = -128; + num_saturate++; + } else { + ptr_int_biases[j] = (int8_t)value; + } + } + for (uint32_t j = num_rows; j < num_rows_padded; j++) { + ptr_int_biases[j] = 0; + } + } + + if (num_saturate > 0) { + QUANTWARNING("Warning: %d / %d saturations in QuantizeAffine8_8()\n", num_saturate, num_rows * num_columns + num_rows); + } +} diff --git a/inference-engine/src/gna_plugin/frontend/quantization.h b/inference-engine/src/gna_plugin/frontend/quantization.h index 7817b66da29..4aaebebe8f6 100644 --- a/inference-engine/src/gna_plugin/frontend/quantization.h +++ b/inference-engine/src/gna_plugin/frontend/quantization.h @@ -13,6 +13,8 @@ #define MAX_OUT_MULTIPLIER 230 #define MAX_VAL_1B_WEIGHT 127 +#define MAX_VAL_1B_FEAT 64 +#define MAX_VAL_1B_BIAS 127 #define MAX_VAL_2B_WEIGHT 16384 #define MAX_VAL_2B_FEAT 16384 #define MAX_VAL_4B_BIAS 1073741824 @@ -45,6 +47,7 @@ struct QuantizationCallback { template class QuantizationCallback; template class QuantizationCallback; +template class QuantizationCallback; std::pair FindMinMaxValues(void* ptr_float_memory, size_t num_elements); float ScaleFactorForQuantization(void *ptr_float_memory, float target_max, size_t num_elements); diff --git a/inference-engine/src/gna_plugin/frontend/quantized_layer_params.hpp b/inference-engine/src/gna_plugin/frontend/quantized_layer_params.hpp index 4de70f711e8..918ac8ee3d3 100644 --- a/inference-engine/src/gna_plugin/frontend/quantized_layer_params.hpp +++ b/inference-engine/src/gna_plugin/frontend/quantized_layer_params.hpp @@ -84,8 +84,8 @@ struct QuantizedLayerParams { // deprecate this Quantization _weights_quant; Quantization _bias_quant; - float _o_shift = 0.0f; - float _b_shift = 0.0f; + + bool lowPrecision = false; }; } // namespace GNAPluginNS diff --git a/inference-engine/src/gna_plugin/frontend/scale_factor_calc.hpp b/inference-engine/src/gna_plugin/frontend/scale_factor_calc.hpp index f5a5942397c..a2bfaccc00f 100644 --- a/inference-engine/src/gna_plugin/frontend/scale_factor_calc.hpp +++ b/inference-engine/src/gna_plugin/frontend/scale_factor_calc.hpp @@ -182,14 +182,14 @@ template class ScaleFactorPerLayer { public: /** - * @brief calculates weights scale factor for fit dynamic range into target bitsize, + * @brief calculates weights scale factor to fit dynamic range into target bitsize, * also calculates output scale factor for the given layer * @param cnnLayer * @param weightsSize * @param result * @return */ - bool operator()(T cnnLayer, int weightsSize, ScaleFactorUpdateResult &result, const bool fakeQuantize) { + bool operator()(T cnnLayer, int weightsSize, int inputsSize, ScaleFactorUpdateResult &result, const bool fakeQuantize) { return false; } }; @@ -198,6 +198,7 @@ template<> class ScaleFactorPerLayer { private : const float activation_scale_factor = 2048.f; + const float low_prec_activation_scale_factor = 4.f; const float identity_scale_factor = 2049.0f; const float max_activation_scale_factor = 4096.0f; const float k = 5; @@ -207,12 +208,13 @@ class ScaleFactorPerLayer { protected : float getActivationScale(InferenceEngine::CNNLayer const* cnnLayer, GNAPluginNS::LayerInfo const& layer, + int inputsSize, const bool fakeQuantize) { auto quantizedParams = InferenceEngine::getInjectedData(*cnnLayer); // todo: calculate proper scale factor where we need to expand it a bit to be safe to stay 
in int16 weights // set the initial value - float result = activation_scale_factor; + float result = (inputsSize == 2 ? activation_scale_factor : low_prec_activation_scale_factor); if (layer.isIdentity()) { // #define accurate_identity_scale_factor #ifdef accurate_identity_scale_factor @@ -247,11 +249,13 @@ class ScaleFactorPerLayer { result = fabs(scale_extra) > fabs(scale_default) ? identity_scale_factor / 2 : identity_scale_factor; #endif - } else if (layer.isRelu() && - static_cast(activation_scale_factor * quantizedParams->_src_quant.GetScale()) - > std::numeric_limits::max()-1) { + } else if (layer.isRelu()) { // if activation is one from relu family, we need to apply heuristic to avoid activation output overflow - result = (activation_scale_factor * 0.5); + auto limit = (inputsSize == 1 ? std::numeric_limits::max() : std::numeric_limits::max()) - 1; + + if (static_cast(result * quantizedParams->_src_quant.GetScale()) > limit) { + result *= 0.5; + } } else if (layer.isPower()) { auto powerLayer = dynamic_cast(cnnLayer); if (!powerLayer) { @@ -381,7 +385,7 @@ class ScaleFactorPerLayer { (layer.isIdentity() || layer.isFakeQuantize()) && LayerInfo(prevLayer).isWeightableIdentity()) { auto prevLayerQuant = InferenceEngine::getInjectedData(*prevLayer); if (!fp32eq(prevLayerQuant->_src_quant.GetScale(), 1.0f) && - (prevLayer2 == nullptr || LayerInfo(prevLayer2).has16BOutput())) { + (prevLayer2 == nullptr || LayerInfo(prevLayer2).has8BOr16BOutput())) { result = prevLayerQuant->_src_quant.GetScale(); usePrevScaleFactor = true; } @@ -412,7 +416,7 @@ class ScaleFactorPerLayer { } public : - bool operator()(InferenceEngine::CNNLayer *cnnLayer, int weightsSize, ScaleFactorUpdateResult &result, const bool fakeQuantize) { + bool operator()(InferenceEngine::CNNLayer *cnnLayer, int weightsSize, int inputsSize, ScaleFactorUpdateResult &result, const bool fakeQuantize) { if ( !cnnLayer ) { IE_THROW() << "Incorrect Convolutional Layer pointer \n"; } @@ -544,7 +548,13 @@ class ScaleFactorPerLayer { } } - auto levels = fakeQuantize ? MAX_VAL_2B_FEAT : std::numeric_limits::max(); + auto levels = 0; + if (fakeQuantize) { + levels = (inputsSize == 2) ? MAX_VAL_2B_FEAT : MAX_VAL_1B_FEAT; + } else { + levels = (inputsSize == 2) ? 
std::numeric_limits::max() : std::numeric_limits::max(); + } + auto abs_val = std::max(std::abs(max_val), std::abs(min_val)); auto scale_val = static_cast(levels) / abs_val; //TODO: use FQ formula for scale factor calculation @@ -592,7 +602,7 @@ class ScaleFactorPerLayer { if (!quant->_dst_quant.IsScaleSet() || fp32eq(quant->_dst_quant.GetScale(), 1.0f) || !fp32eq(quant->_src_quant.GetScale(), inputQuant->_dst_quant.GetScale())) { quant->_src_quant.SetScale(inputQuant->_dst_quant.GetScale()); - auto scale = getActivationScale(cnnLayer, layerInfo, fakeQuantize); + auto scale = getActivationScale(cnnLayer, layerInfo, inputsSize, fakeQuantize); quant->_dst_quant.SetScale(scale); } return true; @@ -613,10 +623,12 @@ class ScaleFactorPerLayer { template<> class ScaleFactorPerLayer { public: - bool operator()(InferenceEngine::EltwiseLayer* eltwiseLayer, int weightsSize, ScaleFactorUpdateResult &result, const bool fakeQuantize) { + bool operator()(InferenceEngine::EltwiseLayer* eltwiseLayer, int weightsSize, int inputsSize, ScaleFactorUpdateResult &result, const bool fakeQuantize) { if ( !eltwiseLayer ) { THROW_GNA_EXCEPTION << "Incorrect Eltwise Layer pointer \n"; } + bool lowPrecision = (inputsSize == sizeof(int8_t)); + auto in0 = InferenceEngine::CNNNetPrevLayer(eltwiseLayer, 0); auto in1 = InferenceEngine::CNNNetPrevLayer(eltwiseLayer, 1); @@ -641,7 +653,7 @@ class ScaleFactorPerLayer { }) : in0; if (LayerInfo(in0).has32BOutput() || - (LayerInfo(in0).isNonFunctional() && (LayerInfo(eltwiseFunctionalPrev).has32BOutput()))) { + (LayerInfo(in0).isNonFunctional() && LayerInfo(eltwiseFunctionalPrev).has32BOutput())) { std::swap(in0, in1); std::swap(quantParams0, quantParams1); } @@ -654,47 +666,50 @@ class ScaleFactorPerLayer { // this path might result in significant data loss quantData->_bias_quant.SetScale(quantParams1->_dst_quant.GetScale() / quantParams0->_dst_quant.GetScale()); auto weightsScale = quantParams1->_dst_quant.GetScale() / quantParams0->_dst_quant.GetScale(); - auto prevLayerIn1 = CNNNetPrevLayer(in1); + // If a previous layer is a layer where freely weights scale factor can be selected, // try to find the scale factor that will allow to use integer as weights scale factor for eltwise // operation. // If the weights scale factor for eltwise sum/sub is not integer, it will cause accuracy degradation. 
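
A minimal standalone sketch of that integer-weights-scale search, using hypothetical input scale factors (in the plugin the two scales come from the quantization data of the eltwise inputs and the search range is MAX_VAL_2B_FEAT candidates):

#include <cmath>
#include <cstdint>
#include <cstdio>
#include <limits>

int main() {
    // Hypothetical values for illustration only; the real ones are taken from
    // quantParams0->_dst_quant and quantParams1->_src_quant of the eltwise inputs.
    const double scaleIn0Dst = 1310.72;  // dst scale of input 0
    const double scaleIn1Src = 1.0;      // src scale of the weightable identity feeding input 1

    double bestWeightsScale = 0.0;
    double bestError = std::numeric_limits<double>::max();
    for (int i = 16384; i > 0; --i) {   // 16384 == MAX_VAL_2B_FEAT
        const double scaleIn1Dst = i * scaleIn1Src;
        const double eltwiseWeightsScale = scaleIn1Dst / scaleIn0Dst;
        if (eltwiseWeightsScale < 1.0 ||
            eltwiseWeightsScale > std::numeric_limits<int16_t>::max() - 1) {
            continue;
        }
        // distance of the candidate weights scale from the nearest lower integer
        const double error = std::abs(eltwiseWeightsScale - static_cast<int32_t>(eltwiseWeightsScale));
        if (error < bestError) {
            bestError = error;
            bestWeightsScale = i;
        }
        if (error == 0.0) {
            break;   // exact integer ratio found
        }
    }
    std::printf("best dst multiplier for input 1: %g (residual error %g)\n",
                bestWeightsScale, bestError);
    return 0;
}

If the multiplier found this way differs from the current weights scale of the identity on input 1, the identity's destination scale is updated and scale-factor propagation is restarted from that layer, as the code below does via ScaleFactorUpdateResult.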
- if (fakeQuantize && LayerInfo(in1).isWeightableIdentity() && - (prevLayerIn1 == nullptr || LayerInfo(prevLayerIn1).has16BOutput())) { - auto bestWeightsScale = 0.0f; - auto bestError = static_cast(std::numeric_limits::max()); - auto scaleIn0Dst = quantParams0->_dst_quant.GetScale(); - auto scaleIn1Src = quantParams1->_src_quant.GetScale(); - for (size_t i = MAX_VAL_2B_FEAT; i > 0; --i) { - auto scaleIn1Dst = i * scaleIn1Src; - auto eltwiseWeightsScale = scaleIn1Dst / scaleIn0Dst; - if (eltwiseWeightsScale < 1.0 || eltwiseWeightsScale > std::numeric_limits::max() - 1) { - continue; + if (fakeQuantize) { + auto prevLayerIn1 = CNNNetPrevLayer(in1); + if (LayerInfo(in1).isWeightableIdentity() && + (prevLayerIn1 == nullptr || LayerInfo(prevLayerIn1).has8BOr16BOutput())) { + auto bestWeightsScale = 0.0f; + auto bestError = static_cast(std::numeric_limits::max()); + auto scaleIn0Dst = quantParams0->_dst_quant.GetScale(); + auto scaleIn1Src = quantParams1->_src_quant.GetScale(); + for (size_t i = MAX_VAL_2B_FEAT; i > 0; --i) { + auto scaleIn1Dst = i * scaleIn1Src; + auto eltwiseWeightsScale = scaleIn1Dst / scaleIn0Dst; + if (eltwiseWeightsScale < 1.0 || eltwiseWeightsScale > std::numeric_limits::max() - 1) { + continue; + } + + auto error = std::abs(eltwiseWeightsScale - static_cast(eltwiseWeightsScale)); + if (error < bestError) { + bestError = error; + bestWeightsScale = i; + } + + if (fp32eq(error, 0.0f)) { + break; + } } - auto error = std::abs(eltwiseWeightsScale - static_cast(eltwiseWeightsScale)); - if (error < bestError) { - bestError = error; - bestWeightsScale = i; + if (!fp32eq(bestWeightsScale, quantParams1->_weights_quant.GetScale())) { + quantParams1->_weights_quant.SetScale(bestWeightsScale); + quantParams1->_dst_quant.SetScale(quantParams1->_weights_quant.GetScale() * quantParams1->_src_quant.GetScale()); + result = ScaleFactorUpdateResult(in1.get()); + return true; } - - if (fp32eq(error, 0.0f)) { - break; - } - } - - if (!fp32eq(bestWeightsScale, quantParams1->_weights_quant.GetScale())) { - quantParams1->_weights_quant.SetScale(bestWeightsScale); - quantParams1->_dst_quant.SetScale(quantParams1->_weights_quant.GetScale() * quantParams1->_src_quant.GetScale()); - result = ScaleFactorUpdateResult(in1.get()); - return true; } } quantData->_weights_quant.SetScale(weightsScale); quantData->_dst_quant.SetScale(quantParams1->_dst_quant.GetScale()); - // eltwise will always work in int16 - auto maxValue = std::numeric_limits::max() - 1; + // eltwise will work in int16 or int8 if low precision inputs are used + auto maxValue = lowPrecision ? 
(std::numeric_limits::max() - 1) : (std::numeric_limits::max() - 1); if (quantData->_weights_quant.GetScale() > maxValue + 1) { // rescaling it's activation input // iterating thru previous layers of eltwise @@ -710,7 +725,7 @@ class ScaleFactorPerLayer { // this case for input from port 0 if (info.isSplit() || info.isSlice()) { continue; - } else if (info.has16BOutput() && info.isActivation()) { + } else if (info.has8BOr16BOutput() && info.isActivation()) { auto newOutputScale = quantParams->_dst_quant.GetScale() / maxValue; if (newOutputScale > static_cast(std::numeric_limits::max()) / 2) { break; @@ -722,7 +737,7 @@ class ScaleFactorPerLayer { quantDataForActivation->_dst_quant.SetScale(newOutputScale); result = ScaleFactorUpdateResult(in.get()); return true; - } else if (info.has16BOutput()) { + } else if (info.has8BOr16BOutput()) { break; } @@ -768,7 +783,7 @@ class ScaleFactorPerLayer { template<> class ScaleFactorPerLayer { public: - bool operator()(InferenceEngine::ConcatLayer* concatLayer, int weightsSize, ScaleFactorUpdateResult &result, const bool fakeQuantize) { + bool operator()(InferenceEngine::ConcatLayer* concatLayer, int weightsSize, int inputsSize, ScaleFactorUpdateResult &result, const bool fakeQuantize) { if ( !concatLayer ) { THROW_GNA_EXCEPTION << "Incorrect Concat Layer pointer \n"; } @@ -960,7 +975,7 @@ class ScaleFactorPerLayer { auto prevLayer2 = prevLayer != nullptr ? CNNNetPrevLayerSkipCertain(prevLayer, 0, skipNonFunctional) : nullptr; if (fakeQuantize && prevLayer != nullptr && LayerInfo(prevLayer).isWeightableIdentity() && - (prevLayer2 == nullptr || LayerInfo(prevLayer2).has16BOutput())) { + (prevLayer2 == nullptr || LayerInfo(prevLayer2).has8BOr16BOutput())) { auto weightsScales = generateScaleFactors(MIN_SEARCH_WEIGHTS_VAL, MAX_SEARCH_WEIGHTS_VAL, MAX_SEARCH_WEIGHTS_VAL - MIN_SEARCH_WEIGHTS_VAL); @@ -1000,18 +1015,17 @@ class ScaleFactorPerLayer { template<> class ScaleFactorPerLayer { private: - float const _scale_reduction_50 = 0.50; - float const _scale_reduction_45 = 0.45; - float const _scale_reduction_40 = 0.40; - float const _scale_reduction_35 = 0.35; - - uint16_t const _scale_change_req_threshold = 30; - uint16_t const _scale_change_threshold_100 = 100; - uint16_t const _scale_change_threshold_150 = 150; - uint16_t const _scale_change_threshold_200 = 200; + std::vector> thresholds { + // tuple values: scale factor threshold, scale factor reduction factor for I16 precision, for I8 precision + std::make_tuple(30, 0.50f, 0.50f), // entry check value + std::make_tuple(100, 0.50f, 0.50f), // if below this threshold, then use this factor + std::make_tuple(150, 0.45f, 0.45f), + std::make_tuple(200, 0.40f, 0.40f), + std::make_tuple(200, 0.35f, 0.35f) // max level -> if above, then use this factor + }; public: - bool operator()(InferenceEngine::WeightableLayer *wl, int weightsSize, ScaleFactorUpdateResult &result, const bool fakeQuantize) { + bool operator()(InferenceEngine::WeightableLayer *wl, int weightsSize, int inputsSize, ScaleFactorUpdateResult &result, const bool fakeQuantize) { if ( !wl ) { THROW_GNA_EXCEPTION << "Incorrect Weightable Layer pointer \n"; } else if (!wl->_weights) { @@ -1063,18 +1077,30 @@ class ScaleFactorPerLayer { } if (wl->_biases) { - quant->_bias_quant.SetScale(ScaleFactorForQuantization(wl->_biases->buffer().as(), - MAX_VAL_4B_BIAS, - wl->_biases->size())); + // for now the only case of INT8 bias we support comes with INT8 inputs and weights as well + if (inputsSize == 1 && weightsSize == 1) { + 
quant->_bias_quant.SetScale(ScaleFactorForQuantization(wl->_biases->buffer().as(), + MAX_VAL_1B_BIAS, + wl->_biases->size())); + } else { + quant->_bias_quant.SetScale(ScaleFactorForQuantization(wl->_biases->buffer().as(), + MAX_VAL_4B_BIAS, + wl->_biases->size())); + } if (quant->_bias_quant.GetScale() != -1.0f) { - quant->_bias_quant.SetScale( - std::min(quant->_weights_quant.GetScale() * quant->_src_quant.GetScale(), quant->_bias_quant.GetScale())); + // for low precision we don't change bias scale factor based on source and weights scale factors + // in order not to loose too much precision + if (inputsSize != 1 || weightsSize != 1) { + quant->_bias_quant.SetScale( + std::min(quant->_weights_quant.GetScale() * quant->_src_quant.GetScale(), quant->_bias_quant.GetScale())); + } quant->_weights_quant.SetScale(quant->_bias_quant.GetScale() / quant->_src_quant.GetScale()); } } - // TODO: findout why ??? - if (weightsSize == 1) { + // use the MAX_OUT_MULTIPLIER only for int8_t weigths with compound bias (for now handled here only with int16_t inputs) + // it gives the possibility to exetend the output dynamic range + if (weightsSize == 1 && inputsSize == 2) { quant->_weights_quant.SetScale(quant->_weights_quant.GetScale() * MAX_OUT_MULTIPLIER); } @@ -1089,23 +1115,22 @@ class ScaleFactorPerLayer { } double tmp_dst_quant_scale = quant->_weights_quant.GetScale() * quant->_src_quant.GetScale(); - if (weightsSize == 1 && - static_cast(tmp_dst_quant_scale * quant->_src_quant.GetScale()) > - static_cast(std::numeric_limits::max() - 1) * _scale_change_req_threshold) { - gnawarn() << "Output scale for " << wl->name - << " too large and are being reduced. Else saturations likely will happen \n"; - // reduce weight scale according experimental heuristic - if (quant->_dst_quant.GetScale() * quant->_src_quant.GetScale() / - static_cast(std::numeric_limits::max()) < _scale_change_threshold_100) { - quant->_weights_quant.SetScale(quant->_weights_quant.GetScale() * _scale_reduction_50); - } else if (quant->_dst_quant.GetScale() * quant->_src_quant.GetScale() / - static_cast(std::numeric_limits::max()) < _scale_change_threshold_150) { - quant->_weights_quant.SetScale(quant->_weights_quant.GetScale() * _scale_reduction_45); - } else if (quant->_dst_quant.GetScale() * quant->_src_quant.GetScale() / - static_cast(std::numeric_limits::max()) < _scale_change_threshold_200) { - quant->_weights_quant.SetScale(quant->_weights_quant.GetScale() * _scale_reduction_40); - } else { - quant->_weights_quant.SetScale(quant->_weights_quant.GetScale() * _scale_reduction_35); + if (weightsSize == 1) { + auto itt = thresholds.begin(); + auto limit = std::numeric_limits::max(); + + if (inputsSize == 1) { + limit = std::numeric_limits::max(); + } + + if (static_cast(tmp_dst_quant_scale * quant->_src_quant.GetScale()) > + static_cast(limit - 1) * std::get<0>(*itt)) { + gnawarn() << "Output scale for " << wl->name + << " too large and are being reduced. Else saturations likely will happen \n"; + // reduce weight scale according experimental heuristic + while ((itt + 1) != thresholds.end() && quant->_dst_quant.GetScale() * quant->_src_quant.GetScale() / + static_cast(limit) >= std::get<0>(*(++itt))) {} + quant->_weights_quant.SetScale(quant->_weights_quant.GetScale() * (inputsSize == 2 ? 
std::get<1>(*itt) : std::get<2>(*itt))); } } @@ -1149,17 +1174,10 @@ class ScaleFactorPerLayer { template<> class ScaleFactorPerLayer : public ScaleFactorPerLayer { - public: - bool operator()(InferenceEngine::WeightableLayer *wl, int weightsSize, ScaleFactorUpdateResult &result, const bool fakeQuantize) { - return ScaleFactorPerLayer::operator()(wl, 2, result, fakeQuantize); - } }; -/** - * GNA convolutions cannot be quantized in int8, remove when library starts support that - */ template<> -class ScaleFactorPerLayer : public ScaleFactorPerLayer { +class ScaleFactorPerLayer : public ScaleFactorPerLayer { }; @@ -1174,12 +1192,15 @@ class ScaleFactorCalculator { Cnt net; mutable Cnt::const_iterator idx; mutable bool needRestart = false; - int weightsBytesSize; + int mandWeightsBytesSize; + int optWeightsBytesSize; bool isFakeQuantize; + int inputsBytesSize; public: - ScaleFactorCalculator(Cnt &net, int weightsBytesSize, bool fakeQuantize) - : net(net), weightsBytesSize(weightsBytesSize), isFakeQuantize(fakeQuantize) { + ScaleFactorCalculator(Cnt &net, int mandWeightsBytesSize, int optWeightsBytesSize, int inputsBytesSize, bool fakeQuantize) + : net(net), mandWeightsBytesSize(mandWeightsBytesSize), optWeightsBytesSize(optWeightsBytesSize), + inputsBytesSize(inputsBytesSize), isFakeQuantize(fakeQuantize) { idx = std::begin(this->net); } bool needToRestart() const { @@ -1195,7 +1216,13 @@ class ScaleFactorCalculator { bool operator()(T ptr) const { needRestart = false; frontend::ScaleFactorUpdateResult result; - if (!frontend::ScaleFactorPerLayer()(ptr, weightsBytesSize, result, isFakeQuantize)) { + auto weightsBytesSize = mandWeightsBytesSize; + + if (LayerInfo(ptr).isConvolution() || LayerInfo(ptr).isScaleShift()) { + weightsBytesSize = optWeightsBytesSize; + } + + if (!frontend::ScaleFactorPerLayer()(ptr, weightsBytesSize, inputsBytesSize, result, isFakeQuantize)) { return false; } if (result) { diff --git a/inference-engine/src/gna_plugin/gna_device.cpp b/inference-engine/src/gna_plugin/gna_device.cpp index 9d14d647587..95899982ca3 100644 --- a/inference-engine/src/gna_plugin/gna_device.cpp +++ b/inference-engine/src/gna_plugin/gna_device.cpp @@ -235,7 +235,7 @@ void GNADeviceHelper::checkGna2Status(Gna2Status status, const Gna2Model& gnaMod ? 
errorReasons.at(reason) : "Unknown Error Reason"; ss << " Reason (" << std::to_string(reason) << "): " << errorReason << "\n"; - ss << " Value (0x" << std::hex << std::to_string(error.Value) << ")"; + ss << " Value (0x" << std::hex << error.Value << ")"; THROW_GNA_EXCEPTION << "\nUnsuccessful Gna2Status: (" << status << ") " << gna2StatusBuffer.data() << ss.str() << diff --git a/inference-engine/src/gna_plugin/gna_graph_compiler.cpp b/inference-engine/src/gna_plugin/gna_graph_compiler.cpp index 0f086bc8207..dbc1c9f3166 100644 --- a/inference-engine/src/gna_plugin/gna_graph_compiler.cpp +++ b/inference-engine/src/gna_plugin/gna_graph_compiler.cpp @@ -35,6 +35,7 @@ #include "round_float_define.hpp" #include "gna_plugin_policy.hpp" #include "gna_groups.hpp" +#include "backend/gna_limitations.hpp" using namespace InferenceEngine; using namespace std; @@ -773,17 +774,19 @@ void GNAGraphCompiler::PowerPrimitive(InferenceEngine::CNNLayerPtr layer) { } ptr_pwl_segments.resize(num_segments); - PwlDesign16(activation_type, + PwlDesign(activation_type, &*ptr_pwl_segments.begin(), static_cast(ptr_pwl_segments.size()), input_pwl_scale_factor, - output_pwl_scale_factor); + output_pwl_scale_factor, + gnaFlags->input_low_precision); } else { - PwlDesignOpt16(activation_type, + PwlDesignOpt(activation_type, ptr_pwl_segments, input_pwl_scale_factor, output_pwl_scale_factor, - gnaFlags->pwlMaxErrorPercent); + gnaFlags->pwlMaxErrorPercent, + gnaFlags->input_low_precision); } } @@ -1139,8 +1142,11 @@ void GNAGraphCompiler::SlicePrimitive(InferenceEngine::CNNLayerPtr layer) { void GNAGraphCompiler::EltwisePrimitive(InferenceEngine::CNNLayerPtr layer) { auto& eltwise = dynamic_cast(*layer.get()); auto quantized = InferenceEngine::getInjectedData(layer); + uint32_t noOfInputsDivisor = gnaFlags->input_low_precision ? 
+ GNALimitations::noOfInputsLowPrecDivisor : GNALimitations::noOfInputsDivisor; - // for eltwise should be one input of 4 bytes and one of 2 bytes - detecting that + // for eltwise sum/sub in 16-bit precision one input should be 4 bytes and one 2 bytes - detecting that below + // the names of variables are left for clarity although not always reflecting the real precision/size auto inputs2Bytes = layer->insData[0].lock(); auto inputs4Bytes = layer->insData[1].lock(); @@ -1151,19 +1157,32 @@ void GNAGraphCompiler::EltwisePrimitive(InferenceEngine::CNNLayerPtr layer) { case InferenceEngine::EltwiseLayer::Sum: case InferenceEngine::EltwiseLayer::Sub: { - if (inputs4Bytes->getPrecision().size() != 4) { - std::swap(inputs4Bytes, inputs2Bytes); - biasesLayerIdx = 0; + if (gnaFlags->input_low_precision == false) { + if (inputs4Bytes->getPrecision().size() != 4) { + std::swap(inputs4Bytes, inputs2Bytes); + biasesLayerIdx = 0; + } + GNA_LAYER_ASSERT(layer, inputs2Bytes->getPrecision().size() == 2); + GNA_LAYER_ASSERT(layer, inputs4Bytes->getPrecision().size() == 4); + } else { + // for low precision both inputs should be 1 bytes in size + GNA_LAYER_ASSERT(layer, inputs2Bytes->getPrecision().size() == 1); + GNA_LAYER_ASSERT(layer, inputs4Bytes->getPrecision().size() == 1); } - GNA_LAYER_ASSERT(layer, inputs2Bytes->getPrecision().size() == 2); - GNA_LAYER_ASSERT(layer, inputs4Bytes->getPrecision().size() == 4); break; } case InferenceEngine::EltwiseLayer::Prod: { - // for mul both inputs should be 2 bytes precision - GNA_LAYER_ASSERT(layer, inputs2Bytes->getPrecision().size() == 2); - GNA_LAYER_ASSERT(layer, inputs4Bytes->getPrecision().size() == 2); + if (gnaFlags->input_low_precision == false) { + // for mul both inputs should be 2 bytes precision + GNA_LAYER_ASSERT(layer, inputs2Bytes->getPrecision().size() == 2); + GNA_LAYER_ASSERT(layer, inputs4Bytes->getPrecision().size() == 2); + } else { + // for mul both inputs should be 1 byte precision + GNA_LAYER_ASSERT(layer, inputs2Bytes->getPrecision().size() == 1); + GNA_LAYER_ASSERT(layer, inputs4Bytes->getPrecision().size() == 1); + } + break; } default: @@ -1196,7 +1215,7 @@ void GNAGraphCompiler::EltwisePrimitive(InferenceEngine::CNNLayerPtr layer) { uint32_t num_rows_in = in_4b_channels * in_4b_height * in_4b_width; uint32_t num_columns_in = in_4b_batch; uint32_t num_rows_out = num_rows_in; - uint32_t num_padding = ALIGN(num_rows_in, 8) - num_rows_in; + uint32_t num_padding = ALIGN(num_rows_in, noOfInputsDivisor) - num_rows_in; void* ptr_inputs = nullptr; void* ptr_outputs = nullptr; @@ -1211,8 +1230,8 @@ void GNAGraphCompiler::EltwisePrimitive(InferenceEngine::CNNLayerPtr layer) { inputs2Bytes->getPrecision().size(), outputs->getPrecision().size(), // TODO: only fp32 and Int16 tested - quantized == nullptr ? inputs2Bytes->getPrecision().size() : 2, - quantized == nullptr ? inputs4Bytes->getPrecision().size() : 4, + quantized == nullptr ? inputs2Bytes->getPrecision().size() : (!gnaFlags->input_low_precision ? 2 : 1), + quantized == nullptr ? inputs4Bytes->getPrecision().size() : (!gnaFlags->input_low_precision ? 4 : 1), quantized == nullptr ? 1 : quantized->_weights_quant.GetScale(), quantized == nullptr ? 
1 : quantized->_dst_quant.GetScale(), ptr_inputs, @@ -1237,9 +1256,15 @@ void GNAGraphCompiler::EltwisePrimitive(InferenceEngine::CNNLayerPtr layer) { } else { auto scaledIdentity = -quantized->_weights_quant.GetScale(); - auto quantizedIdentity = FLOAT_TO_INT16(std::min(scaledIdentity, static_cast(INT16_MAX))); + if (gnaFlags->input_low_precision == false) { + auto quantizedIdentity = FLOAT_TO_INT16(std::min(scaledIdentity, static_cast(INT16_MAX))); - gnamem->readonly().push_value(ptr_weights, quantizedIdentity, num_rows_out, 64); + gnamem->readonly().push_value(ptr_weights, quantizedIdentity, num_rows_out, 64); + } else { + auto quantizedIdentity = FLOAT_TO_INT8(std::min(scaledIdentity, static_cast(INT8_MAX))); + + gnamem->readonly().push_value(ptr_weights, quantizedIdentity, num_rows_out, 64); + } } connectInput(layer, ptr_biases, num_data_bytes_in, 0, biasesLayerIdx); break; @@ -1249,9 +1274,15 @@ void GNAGraphCompiler::EltwisePrimitive(InferenceEngine::CNNLayerPtr layer) { } else { auto scaledIdentity = quantized->_weights_quant.GetScale(); - auto quantizedIdentity = FLOAT_TO_INT16(std::min(scaledIdentity, static_cast(INT16_MAX))); + if (gnaFlags->input_low_precision == false) { + auto quantizedIdentity = FLOAT_TO_INT16(std::min(scaledIdentity, static_cast(INT16_MAX))); - gnamem->readonly().push_value(ptr_weights, quantizedIdentity, num_rows_out, 64); + gnamem->readonly().push_value(ptr_weights, quantizedIdentity, num_rows_out, 64); + } else { + auto quantizedIdentity = FLOAT_TO_INT8(std::min(scaledIdentity, static_cast(INT8_MAX))); + + gnamem->readonly().push_value(ptr_weights, quantizedIdentity, num_rows_out, 64); + } } connectInput(layer, ptr_biases, num_data_bytes_in, 0, biasesLayerIdx); break; @@ -1260,7 +1291,11 @@ void GNAGraphCompiler::EltwisePrimitive(InferenceEngine::CNNLayerPtr layer) { if (quantized == nullptr) { gnamem->readonly().push_value(ptr_biases, 0.0f, num_rows_out, 64); } else { - gnamem->readonly().push_value(ptr_biases, 0, num_rows_out, 64); + if (gnaFlags->input_low_precision == false) { + gnamem->readonly().push_value(ptr_biases, 0, num_rows_out, 64); + } else { + gnamem->readonly().push_value(ptr_biases, 0, num_rows_out, 64); + } } connectInput(layer, ptr_weights, num_data_bytes_in, 0, biasesLayerIdx); break; @@ -1278,7 +1313,17 @@ void GNAGraphCompiler::AffinePrimitive(InferenceEngine::CNNLayerPtr layer, bool IE_ASSERT(!layer->outData.empty()); auto inputs = layer->insData.begin()->lock(); auto outputs = *layer->outData.begin(); - auto inputPrecision = quantized ? Precision(Precision::I16) : inputs->getPrecision(); + Precision inputPrecision; + uint32_t noOfInputsDivisor = GNALimitations::noOfInputsDivisor; + + if (!quantized) { + inputPrecision = inputs->getPrecision(); + } else if (gnaFlags->input_low_precision == false) { + inputPrecision = Precision(Precision::I16); + } else { + inputPrecision = Precision(Precision::I8); + noOfInputsDivisor = GNALimitations::noOfInputsLowPrecDivisor; + } auto input_data = HasTo2DReshapeData(layer) ? Get2DReshapedData(inputs, 8) : inputs; auto in_dims = input_data->getDims(); @@ -1286,7 +1331,7 @@ void GNAGraphCompiler::AffinePrimitive(InferenceEngine::CNNLayerPtr layer, bool uint32_t num_rows_in = InferenceEngine::details::product(in_dims) / batch_size; uint32_t num_columns_in = batch_size; uint32_t num_rows_out = isDiag ? 
num_rows_in : GetDataDimSize(outputs, 1); - uint32_t num_padding = ALIGN(num_rows_in, 8) - num_rows_in; + uint32_t num_padding = ALIGN(num_rows_in, noOfInputsDivisor) - num_rows_in; uint32_t num_padding_out = isDiag ? num_padding : 0; void* ptr_inputs = nullptr; @@ -1860,17 +1905,19 @@ case name:\ default: THROW_GNA_EXCEPTION << "Activation function type not yet supported " << activation_type; } - PwlDesign16(activation_type, + PwlDesign(activation_type, &*ptr_pwl_segments.begin(), static_cast(ptr_pwl_segments.size()), input_pwl_scale_factor, - output_pwl_scale_factor); + output_pwl_scale_factor, + gnaFlags->input_low_precision); } else { - PwlDesignOpt16(activation_type, + PwlDesignOpt(activation_type, ptr_pwl_segments, input_pwl_scale_factor, output_pwl_scale_factor, - gnaFlags->pwlMaxErrorPercent); + gnaFlags->pwlMaxErrorPercent, + gnaFlags->input_low_precision); } ptr_pwl_segments_target = reinterpret_cast(&ptr_pwl_segments_target); } @@ -2229,8 +2276,9 @@ GNAPluginNS::ConnectionDetails GNAGraphCompiler::connectInput(CNNLayerPtr layer, // if request for allocation less that realTensorInput - we need to extend request auto minInput = inputDesc->minBytesRequiredForStoreInput(prevLayer); if (num_data_bytes_in < minInput) { - gnalog() << "[INPUT] : requested bytes: " << num_data_bytes_in << ", extended to" << ALIGN(minInput, 8); - num_data_bytes_in = ALIGN(minInput, 8); + uint32_t noOfInputsDivisor = gnaFlags->input_low_precision ? GNALimitations::noOfInputsLowPrecDivisor : GNALimitations::noOfInputsDivisor; + gnalog() << "[INPUT] : requested bytes: " << num_data_bytes_in << ", extended to" << ALIGN(minInput, noOfInputsDivisor); + num_data_bytes_in = ALIGN(minInput, noOfInputsDivisor); } // real allocation pointer will be kept in ptr not in ptr_inputs_global diff --git a/inference-engine/src/gna_plugin/gna_plugin.cpp b/inference-engine/src/gna_plugin/gna_plugin.cpp index 3f93ec30551..ba365a2f5a3 100644 --- a/inference-engine/src/gna_plugin/gna_plugin.cpp +++ b/inference-engine/src/gna_plugin/gna_plugin.cpp @@ -107,7 +107,11 @@ void GNAPlugin::copyInputData(T *dst, for (uint32_t i = 0; i < num_frames; i++) { for (uint32_t j = 0; j < num_vector_elements; j++) { if (!std::is_same::value) { - dst[j * num_group + i] = GNAPluginNS::ConvertFloatToInt16(src[i * num_vector_elements + j] * scaleFactor); + if (!gnaFlags->input_low_precision) { + dst[j * num_group + i] = GNAPluginNS::ConvertFloatToInt16(src[i * num_vector_elements + j] * scaleFactor); + } else { + dst[j * num_group + i] = GNAPluginNS::ConvertFloatToInt8(src[i * num_vector_elements + j] * scaleFactor); + } } else { dst[j * num_group + i] = src[i * num_vector_elements + j]; } @@ -129,8 +133,14 @@ void GNAPlugin::copyInputData(T *dst, T *ptr_dst_vec = reinterpret_cast(dst) + i * num_vector_stride; const U *ptr_src_vec = reinterpret_cast(src) + i * num_vector_elements; std::memset(ptr_dst_vec, 0, num_vector_stride * sizeof(T)); - for (uint32_t j=0; j < num_vector_elements; j++) { - ptr_dst_vec[j] = GNAPluginNS::ConvertFloatToInt16(ptr_src_vec[j] * scaleFactor); + if (!gnaFlags->input_low_precision) { + for (uint32_t j = 0; j < num_vector_elements; j++) { + ptr_dst_vec[j] = GNAPluginNS::ConvertFloatToInt16(ptr_src_vec[j] * scaleFactor); + } + } else { + for (uint32_t j = 0; j < num_vector_elements; j++) { + ptr_dst_vec[j] = GNAPluginNS::ConvertFloatToInt8(ptr_src_vec[j] * scaleFactor); + } } } @@ -218,6 +228,10 @@ void GNAPlugin::ExportScores(void *ptr_dst, auto dst_ptr = dst + (i * num_vector_elements + j); switch 
(num_bytes_per_element_input) { + case 1: { + *dst_ptr = static_cast(*reinterpret_cast(input_ptr)); + break; + } case 2 : { *dst_ptr = static_cast(*reinterpret_cast(input_ptr)); break; @@ -284,21 +298,36 @@ void GNAPlugin::ImportFrames( // TODO : fix that as well if (input_precision == Precision::U8) { auto src = reinterpret_cast(ptr_src); - auto dst = reinterpret_cast(ptr_dst); - copyInputData(dst, src, num_frames, num_group, num_vector_elements, num_vector_stride, orientation, scaleFactor); + if (!gnaFlags->input_low_precision) { + auto dst = reinterpret_cast(ptr_dst); + copyInputData(dst, src, num_frames, num_group, num_vector_elements, num_vector_stride, orientation, scaleFactor); + } else { + auto dst = reinterpret_cast(ptr_dst); + copyInputData(dst, src, num_frames, num_group, num_vector_elements, num_vector_stride, orientation, scaleFactor); + } } else if (input_precision.size() == 2) { - auto dst = reinterpret_cast(ptr_dst); auto src = reinterpret_cast(ptr_src); - copyInputData(dst, src, num_frames, num_group, num_vector_elements, num_vector_stride, orientation, scaleFactor); + if (!gnaFlags->input_low_precision) { + auto dst = reinterpret_cast(ptr_dst); + copyInputData(dst, src, num_frames, num_group, num_vector_elements, num_vector_stride, orientation, scaleFactor); + } else { + auto dst = reinterpret_cast(ptr_dst); + copyInputData(dst, src, num_frames, num_group, num_vector_elements, num_vector_stride, orientation, scaleFactor); + } } else if (input_precision.size() == 4) { if (!gnadevice) { auto dst = reinterpret_cast(ptr_dst); auto src = reinterpret_cast(ptr_src); copyInputData(dst, src, num_frames, num_group, num_vector_elements, num_vector_stride, orientation, scaleFactor); } else { - auto dst = reinterpret_cast(ptr_dst); auto src = reinterpret_cast(ptr_src); - copyInputData(dst, src, num_frames, num_group, num_vector_elements, num_vector_stride, orientation, scaleFactor); + if (!gnaFlags->input_low_precision) { + auto dst = reinterpret_cast(ptr_dst); + copyInputData(dst, src, num_frames, num_group, num_vector_elements, num_vector_stride, orientation, scaleFactor); + } else { + auto dst = reinterpret_cast(ptr_dst); + copyInputData(dst, src, num_frames, num_group, num_vector_elements, num_vector_stride, orientation, scaleFactor); + } } } } else { @@ -307,24 +336,36 @@ void GNAPlugin::ImportFrames( if (!gnadevice) { auto dst = reinterpret_cast(ptr_dst); copyInputData(dst, src, num_frames, num_group, num_vector_elements, num_vector_stride, orientation, scaleFactor); + } else if (!gnaFlags->input_low_precision) { + auto dst = reinterpret_cast(ptr_dst); + copyInputData(dst, src, num_frames, num_group, num_vector_elements, num_vector_stride, orientation, scaleFactor); } else { - auto dst = reinterpret_cast(ptr_dst); + auto dst = reinterpret_cast(ptr_dst); copyInputData(dst, src, num_frames, num_group, num_vector_elements, num_vector_stride, orientation, scaleFactor); } - } else if (input_precision.size()== 2) { - auto dst = reinterpret_cast(ptr_dst); auto src = reinterpret_cast(ptr_src); - copyInputData(dst, src, num_frames, num_group, num_vector_elements, num_vector_stride, orientation, scaleFactor); + if (!gnaFlags->input_low_precision) { + auto dst = reinterpret_cast(ptr_dst); + copyInputData(dst, src, num_frames, num_group, num_vector_elements, num_vector_stride, orientation, scaleFactor); + } else { + auto dst = reinterpret_cast(ptr_dst); + copyInputData(dst, src, num_frames, num_group, num_vector_elements, num_vector_stride, orientation, scaleFactor); + } } else if 
(input_precision.size() == 4) { if (!gnadevice) { auto dst = reinterpret_cast(ptr_dst); auto src = reinterpret_cast(ptr_src); copyInputData(dst, src, num_frames, num_group, num_vector_elements, num_vector_stride, orientation, scaleFactor); } else { - auto dst = reinterpret_cast(ptr_dst); auto src = reinterpret_cast(ptr_src); - copyInputData(dst, src, num_frames, num_group, num_vector_elements, num_vector_stride, orientation, scaleFactor); + if (!gnaFlags->input_low_precision) { + auto dst = reinterpret_cast(ptr_dst); + copyInputData(dst, src, num_frames, num_group, num_vector_elements, num_vector_stride, orientation, scaleFactor); + } else { + auto dst = reinterpret_cast(ptr_dst); + copyInputData(dst, src, num_frames, num_group, num_vector_elements, num_vector_stride, orientation, scaleFactor); + } } } } @@ -663,8 +704,8 @@ void GNAPlugin::LoadNetwork(CNNNetwork & _network) { // network optimisation phases int passIdx = 0; - auto run_passes = [&] (const CNNNetwork& network, bool runBeforeCopy) { - auto passes = make_shared(PassManagerSettings{policy, runBeforeCopy}, network); + auto run_passes = [&] (const CNNNetwork& network, bool runBeforeCopy, bool lowPrecision) { + auto passes = make_shared(PassManagerSettings{policy, runBeforeCopy, lowPrecision}, network); passes->registerPass(); passes->registerPass(); passes->registerPass(); @@ -716,8 +757,8 @@ void GNAPlugin::LoadNetwork(CNNNetwork & _network) { }; newNet = InferenceEngine::CNNNetCopy(network, visitor); // to run all passes need to have two calls to pass manager - run_passes(newNet, true); - run_passes(newNet, false); + run_passes(newNet, true, gnaFlags->input_low_precision); + run_passes(newNet, false, gnaFlags->input_low_precision); } else if (gnaFlags->fake_quantized) { switch (config.gnaPrecision) { case Precision::I16: @@ -738,8 +779,13 @@ void GNAPlugin::LoadNetwork(CNNNetwork & _network) { newNet = q16.quantize(network, run_passes, inputsDesc->inputScaleFactors); break; case Precision::I8: - ModelQuantizer q8; - newNet = q8.quantize(network, run_passes, inputsDesc->inputScaleFactors); + if (gnaFlags->input_low_precision == false) { + ModelQuantizer q8; + newNet = q8.quantize(network, run_passes, inputsDesc->inputScaleFactors); + } else { + ModelQuantizer q8_8; + newNet = q8_8.quantize(network, run_passes, inputsDesc->inputScaleFactors); + } break; default: THROW_GNA_EXCEPTION << "unsupported GNA precision for quantisation: " << config.gnaPrecision; @@ -1164,7 +1210,7 @@ uint32_t GNAPlugin::QueueInference(const InferenceEngine::BlobMap &inputs, Infer auto importedFrames = (is3D || is1D) ? 1 : dims[0]; auto targetGroups = is1D ? 1 : dims[0]; // TODO: no proper support for groups yet - auto importedElementSizeBytes = gnaFlags->sw_fp32 ? 4 : 2; + auto importedElementSizeBytes = gnaFlags->sw_fp32 ? 4 : (gnaFlags->input_low_precision ? 
1 : 2); auto importedBytes = importedElements * importedFrames * importedElementSizeBytes; if (inputsDesc->bytes_allocated_for_input[input.first] < importedBytes) { diff --git a/inference-engine/src/gna_plugin/gna_plugin_config.hpp b/inference-engine/src/gna_plugin/gna_plugin_config.hpp index f6e48fb04b2..502c2cbe1b8 100644 --- a/inference-engine/src/gna_plugin/gna_plugin_config.hpp +++ b/inference-engine/src/gna_plugin/gna_plugin_config.hpp @@ -49,7 +49,7 @@ struct Config { std::string GetParameter(const std::string& name) const; std::vector GetSupportedKeys() const; - // precision of GNA hardware model + // default precision of GNA hardware model (see QuantI16 quantizer struct) InferenceEngine::Precision gnaPrecision = InferenceEngine::Precision::I16; std::string dumpXNNPath; diff --git a/inference-engine/src/gna_plugin/layers/gna_layer_info.hpp b/inference-engine/src/gna_plugin/layers/gna_layer_info.hpp index 732ef1384f0..21ac4b2c47c 100644 --- a/inference-engine/src/gna_plugin/layers/gna_layer_info.hpp +++ b/inference-engine/src/gna_plugin/layers/gna_layer_info.hpp @@ -54,10 +54,13 @@ class LayerInfo { IS_VALID(); return layer->insData.size() > 1; } - bool has16BOutput() const noexcept { + // The name of the funciton may be somehwat misleading + // Explanation: when in low precision mode the listed layers have 8-bit outputs + // and when in 16-bit input mode, they have 16-bit outputs + bool has8BOr16BOutput() const noexcept { IS_VALID(); - static InferenceEngine::details::caseless_set layersWith16BOutputs = {"memory", "input", "split", "slice", "concat", "copy", "const"}; - return layersWith16BOutputs.find(layer->type) != layersWith16BOutputs.end() || + static InferenceEngine::details::caseless_set layersWith8BOr16BOutputs = {"memory", "input", "split", "slice", "concat", "copy", "const"}; + return layersWith8BOr16BOutputs.find(layer->type) != layersWith8BOr16BOutputs.end() || isActivation() || (isCrop() && !isCropAffined()); } diff --git a/inference-engine/src/gna_plugin/optimizer/gna_pass_manager.cpp b/inference-engine/src/gna_plugin/optimizer/gna_pass_manager.cpp index c1d15f68c8d..02e0d7434b0 100644 --- a/inference-engine/src/gna_plugin/optimizer/gna_pass_manager.cpp +++ b/inference-engine/src/gna_plugin/optimizer/gna_pass_manager.cpp @@ -126,7 +126,7 @@ static CNNLayerPtr InsertCopyLayer(CNNLayerPtr prevLayer, CNNLayerPtr nextLayer, return copyWithQuant; } -static std::vector getCandidatesForIdentityInsertion(const CNNLayerPtr l) { +static std::vector getCandidatesForIdentityInsertion(const CNNLayerPtr l, std::shared_ptr passmanager) { std::vector prevLayers; // skipping memory inputs and true inputs layers @@ -148,15 +148,24 @@ static std::vector getCandidatesForIdentityInsertion(const CNNLayer if (eltwise != nullptr) { // eltwise layer has 2 inputs, so depends on situation identity should or should not be inserted - // for sum if we have 4-4 inputs we will handle that by inserting identity activation case (1) - // for sum if we have 4-2 - OK - // for sum if we have 2-2 inputs we need to insert diagonal + // for sum with 16-bit input precision + // if we have 4-4 inputs - we will handle that by inserting identity activation case (1) + // if we have 4-2 inputs - OK + // if we have 2-2 inputs - we need to insert diagonal - // for mul if we have 2-2 - OK - // for mul if we have 2-4 - inputs we need to insert identity activation to make 2 bytes input - // for mul if we have 4-4 - there 2 options - // option 1 both inputs came from single outdata - we will insert 1 identity to just 
diff --git a/inference-engine/src/gna_plugin/optimizer/gna_pass_manager.cpp b/inference-engine/src/gna_plugin/optimizer/gna_pass_manager.cpp
index c1d15f68c8d..02e0d7434b0 100644
--- a/inference-engine/src/gna_plugin/optimizer/gna_pass_manager.cpp
+++ b/inference-engine/src/gna_plugin/optimizer/gna_pass_manager.cpp
@@ -126,7 +126,7 @@ static CNNLayerPtr InsertCopyLayer(CNNLayerPtr prevLayer, CNNLayerPtr nextLayer,
     return copyWithQuant;
 }
-static std::vector<CNNLayerPtr> getCandidatesForIdentityInsertion(const CNNLayerPtr l) {
+static std::vector<CNNLayerPtr> getCandidatesForIdentityInsertion(const CNNLayerPtr l, std::shared_ptr<IPassManager> passmanager) {
     std::vector<CNNLayerPtr> prevLayers;
     // skipping memory inputs and true inputs layers
@@ -148,15 +148,24 @@ static std::vector<CNNLayerPtr> getCandidatesForIdentityInsertion(const CNNLayer
     if (eltwise != nullptr) {
         // eltwise layer has 2 inputs, so depends on situation identity should or should not be inserted
-        // for sum if we have 4-4 inputs we will handle that by inserting identity activation case (1)
-        // for sum if we have 4-2 - OK
-        // for sum if we have 2-2 inputs we need to insert diagonal
+        // for sum with 16-bit input precision
+        //   if we have 4-4 inputs - we will handle that by inserting identity activation case (1)
+        //   if we have 4-2 inputs - OK
+        //   if we have 2-2 inputs - we need to insert diagonal
-        // for mul if we have 2-2 - OK
-        // for mul if we have 2-4 - inputs we need to insert identity activation to make 2 bytes input
-        // for mul if we have 4-4 - there 2 options
-        //   option 1 both inputs came from single outdata - we will insert 1 identity to just convert single input into 2 bytes
-        //   option 2 each input came from it's own outdata - we need to insert 2 identities activations to convert both and feed weights and inputs
+        // for sum with 8-bit input precision
+        //   if we have 1-1 inputs - OK
+        //   if we have 4-4 inputs - there are 2 options
+        //     option 1: both inputs came from a single outData - we need to insert 1 identity activation to just convert the single input into 1 byte
+        //     option 2: each input came from its own outData - we need to insert 2 identity activations to convert both and feed weights and inputs
+
+        // for mul if we have 2-2 or 1-1 (low precision case) inputs - OK
+        // for mul if we have 2-4 or 1-4 (low precision case) inputs - we need to insert identity activation to make a 2 byte input
+        //   or a 1 byte input (low precision case)
+        // for mul if we have 4-4 inputs - there are 2 options
+        //   option 1: both inputs came from a single outData - we need to insert 1 identity activation to just convert the single input into 2 bytes
+        //     or 1 byte (low precision case)
+        //   option 2: each input came from its own outData - we need to insert 2 identity activations to convert both and feed weights and inputs
         auto prev0 = PrevFunctionalLayer(l, 0);
         auto prev1 = PrevFunctionalLayer(l, 1);
@@ -164,14 +173,32 @@ static std::vector<CNNLayerPtr> getCandidatesForIdentityInsertion(const CNNLayer
         switch (eltwise->_operation) {
         case EltwiseLayer::Sub:
         case EltwiseLayer::Sum:
-            if (!LayerInfo(prev0).has32BOutput() || !LayerInfo(prev1).has32BOutput()) {
-                return prevLayers;
+            if (!passmanager->isLowPrecision()) {
+                if (!LayerInfo(prev0).has32BOutput() || !LayerInfo(prev1).has32BOutput()) {
+                    return prevLayers;
+                }
+                // TODO: check whether there is a possibility to select after which layer the identity gets inserted
+                prevLayers.push_back(CNNNetPrevLayer(l, 0));
+            } else {
+                if (LayerInfo(prev0).has8BOr16BOutput() && LayerInfo(prev1).has8BOr16BOutput()) {
+                    return prevLayers;
+                }
+
+                if (LayerInfo(prev0).has32BOutput()) {
+                    prevLayers.push_back(CNNNetPrevLayer(l, 0));
+                }
+
+                // if layers of outdata are different
+                auto prevData0 = l->insData[0].lock();
+                auto prevData1 = l->insData[1].lock();
+
+                if ((prev0 != prev1 || prevData0 != prevData1) && LayerInfo(prev1).has32BOutput()) {
+                    prevLayers.push_back(CNNNetPrevLayer(l, 1));
+                }
             }
-            // TODO: whether there are possibility to select after what layer identity gets inserted
-            prevLayers.push_back(CNNNetPrevLayer(l, 0));
             break;
         case EltwiseLayer::Prod: {
-            if (LayerInfo(prev0).has16BOutput() && LayerInfo(prev1).has16BOutput()) {
+            if (LayerInfo(prev0).has8BOr16BOutput() && LayerInfo(prev1).has8BOr16BOutput()) {
                 return prevLayers;
             }
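To restate the low-precision branch of the Sum/Sub case above (a sketch, not the plugin's code): identities are requested only for eltwise inputs that still carry 32-bit outputs, and at most one identity is inserted when both inputs come from the same outData.

    #include <vector>

    // Illustrative decision helper: returns indices of eltwise-sum inputs that need an
    // identity activation in low-precision mode. Widths are in bytes (1, 2 or 4).
    static std::vector<int> InputsNeedingIdentity(int width0, int width1, bool sameOutData) {
        std::vector<int> result;
        if (width0 != 4 && width1 != 4) {
            return result;           // both inputs already 8- or 16-bit: nothing to insert
        }
        if (width0 == 4) {
            result.push_back(0);     // convert the first 32-bit input
        }
        if (width1 == 4 && !sameOutData) {
            result.push_back(1);     // second input only if it comes from a different outData
        }
        return result;
    }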
@@ -227,6 +254,8 @@ static std::vector<CNNLayerPtr> getCandidatesForIdentityInsertion(const CNNLayer
 }
 void InsertDiagonalLayerPass::run() {
+    bool lowPrecision = getPassManager()->isLowPrecision();
+
     for (auto & l : *pLayers) {
         if (l->insData.empty()) continue;
         auto prevLayer = CNNNetPrevLayerSkipCertain(l, 0, [](CNNLayerPtr ptr) {
@@ -241,12 +270,16 @@ void InsertDiagonalLayerPass::run() {
         if (!eltwise) {
             continue;
         }
-        // in case of eltwise sum one of input would be 4 bytes one - 2
-        // in case of eltwise mull one of input would be 2 bytes one - 2
+        // in case of eltwise sum in 16-bit input precision one of the inputs would be 4 bytes, the other - 2
+        // in case of eltwise mul in 16-bit input precision one of the inputs would be 2 bytes, the other - 2
+        // in case of eltwise sum in low (8-bit) input precision both inputs are 1 byte
+        // in case of eltwise mul in low (8-bit) input precision both inputs are 1 byte
         // for e sum if we have 4-4 inputs we will handle that by inserting identity activation
         // for e sum if we have 4-2 - OK
         // for e sum if we have 2-2 inputs we need to insert diagonal -- handling here
+        // for e sum if we have 1-1 inputs in low precision mode - OK
         // for e mul if we have 2-2 - OK
+        // for e mul if we have 1-1 in low precision mode - OK
         // for e mul if we have 2-4 - inputs we need to insert identity to put 4 bytes input into weights
         // for e mul if we have 4-4 - inputs we need to insert 2 identities to put both 4 bytes input into weights
@@ -256,7 +289,10 @@ void InsertDiagonalLayerPass::run() {
             auto prevLayer1 = CNNNetPrevLayerSkipCertain(l, 1, [](CNNLayerPtr ptr) {
                 return LayerInfo(ptr).isNonFunctional();
             });
-            if (!LayerInfo(prevLayer).has16BOutput() || !LayerInfo(prevLayer1).has16BOutput())
+            if (!LayerInfo(prevLayer).has8BOr16BOutput() || !LayerInfo(prevLayer1).has8BOr16BOutput())
+                continue;
+
+            if (lowPrecision && LayerInfo(prevLayer).has8BOr16BOutput() && LayerInfo(prevLayer1).has8BOr16BOutput())
                 continue;
         }
         auto prevDirectLayer = CNNNetPrevLayer(l, 0);
@@ -736,7 +772,7 @@ void RemovePermutationsNHWCToNCHWPass::run() {
 void InsertIdentityLayerPass::run() {
     auto quantized = InferenceEngine::getInjectedData(pLayers->front());
     for (auto & l : *pLayers) {
-        for (auto && prev : getCandidatesForIdentityInsertion(l)) {
+        for (auto && prev : getCandidatesForIdentityInsertion(l, getPassManager())) {
             // Do an upstream search until Functional layer is found
             auto original_prev_layer = prev;
             auto true_layer = l;
@@ -811,7 +847,7 @@ void InsertIdentityLayerPass::run() {
                 for (auto && nextLayer : getInputTo(nextData)) {
                     if (nextLayer.second.get() == l.get())
                         continue;
-                    if (getCandidatesForIdentityInsertion(nextLayer.second).empty()) {
+                    if (getCandidatesForIdentityInsertion(nextLayer.second, getPassManager()).empty()) {
                         notAll = true;
                     }
                 }
diff --git a/inference-engine/src/gna_plugin/optimizer/gna_pass_manager.hpp b/inference-engine/src/gna_plugin/optimizer/gna_pass_manager.hpp
index 2ee84584e9d..8f0157ce478 100644
--- a/inference-engine/src/gna_plugin/optimizer/gna_pass_manager.hpp
+++ b/inference-engine/src/gna_plugin/optimizer/gna_pass_manager.hpp
@@ -30,6 +30,7 @@ public:
     virtual ~IPassManager() = default;
     virtual int &getIntVar(std::string name) = 0;
     virtual const Policy &getPolicy() const = 0;
+    virtual const bool& isLowPrecision() const = 0;
     virtual InferenceEngine::CNNNetwork &getNetwork() = 0;
 };
@@ -221,6 +222,7 @@ struct PassManagerSettings {
     Policy policy;
     /// @brief whether to run passes before copy
     bool runBeforeCopy;
+    bool lowPrecision;
 };
@@ -245,6 +247,9 @@ public:
     const Policy & getPolicy() const override {
         return settings.policy;
     }
+    const bool& isLowPrecision() const override {
+        return settings.lowPrecision;
+    }
     InferenceEngine::CNNNetwork& getNetwork() override {
         return network;
     }
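The new lowPrecision flag travels from LoadNetwork through PassManagerSettings into the passes, which query it via IPassManager::isLowPrecision(). A minimal standalone mock of that plumbing (names mirror the patch, but these are not the plugin's classes):

    #include <iostream>

    // Sketch of the settings plumbing: passes consult the pass manager for the
    // low-precision mode instead of a global flag.
    struct PassManagerSettingsSketch {
        bool runBeforeCopy = false;
        bool lowPrecision = false;     // new field added by this patch
    };

    class PassManagerSketch {
    public:
        explicit PassManagerSketch(PassManagerSettingsSketch s) : settings_(s) {}
        const bool& isLowPrecision() const { return settings_.lowPrecision; }
    private:
        PassManagerSettingsSketch settings_;
    };

    int main() {
        PassManagerSketch pm(PassManagerSettingsSketch{/*runBeforeCopy=*/true, /*lowPrecision=*/true});
        std::cout << "low precision passes: " << std::boolalpha << pm.isLowPrecision() << "\n";
    }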
diff --git a/inference-engine/src/gna_plugin/preprocessing.cpp b/inference-engine/src/gna_plugin/preprocessing.cpp
index 33924b51b9b..3c316c419e9 100644
--- a/inference-engine/src/gna_plugin/preprocessing.cpp
+++ b/inference-engine/src/gna_plugin/preprocessing.cpp
@@ -15,6 +15,17 @@ int16_t GNAPluginNS::ConvertFloatToInt16(float src) {
     return (int16_t)value;
 }
+int8_t GNAPluginNS::ConvertFloatToInt8(float src) {
+    float rounding_value = (src > 0) ? 0.5f : -0.5f;
+    float value = src + rounding_value;
+    if (value > 127.0) {
+        return 127;
+    } else if (value < -128.0) {
+        return -128;
+    }
+    return (int8_t)value;
+}
+
 void GNAPluginNS::ConvertToInt16(int16_t *ptr_dst,
                 const float *ptr_src,
                 const uint32_t num_rows,
diff --git a/inference-engine/src/gna_plugin/preprocessing.hpp b/inference-engine/src/gna_plugin/preprocessing.hpp
index a09cfde2982..aac61f2887b 100644
--- a/inference-engine/src/gna_plugin/preprocessing.hpp
+++ b/inference-engine/src/gna_plugin/preprocessing.hpp
@@ -21,4 +21,5 @@ void ConvertToFloat(float *ptr_dst,
                 const float scale_factor);
 int16_t ConvertFloatToInt16(float src);
+int8_t ConvertFloatToInt8(float src);
 }  // namespace GNAPluginNS
diff --git a/inference-engine/src/gna_plugin/round_float_define.hpp b/inference-engine/src/gna_plugin/round_float_define.hpp
index 1bcbb2a4a29..584d14ecc1a 100644
--- a/inference-engine/src/gna_plugin/round_float_define.hpp
+++ b/inference-engine/src/gna_plugin/round_float_define.hpp
@@ -7,5 +7,6 @@
 #include
+#define FLOAT_TO_INT8(a)  static_cast<int8_t>(((a) < 0)?((a) - 0.5f):((a) + 0.5f))
 #define FLOAT_TO_INT16(a)  static_cast<int16_t>(((a) < 0)?((a) - 0.5f):((a) + 0.5f))
 #define FLOAT_TO_INT32(a)  static_cast<int32_t>(((a) < 0)?((a)-0.5f):((a)+0.5f))
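Both FLOAT_TO_INT8 and ConvertFloatToInt8 round half away from zero, but only the function saturates to [-128, 127]; the macro presumably relies on the caller keeping values in range. A standalone comparison (the macro and function bodies are taken from this patch):

    #include <cstdint>
    #include <iostream>

    #define FLOAT_TO_INT8(a) static_cast<int8_t>(((a) < 0) ? ((a) - 0.5f) : ((a) + 0.5f))

    static int8_t ConvertFloatToInt8(float src) {
        float rounding_value = (src > 0) ? 0.5f : -0.5f;
        float value = src + rounding_value;
        if (value > 127.0) return 127;
        if (value < -128.0) return -128;
        return static_cast<int8_t>(value);
    }

    int main() {
        // In range both behave the same; out of range only the function saturates.
        std::cout << int(ConvertFloatToInt8(3.4f)) << " " << int(FLOAT_TO_INT8(3.4f)) << "\n";  // 3 3
        std::cout << int(ConvertFloatToInt8(200.0f)) << "\n";                                   // 127
        // FLOAT_TO_INT8(200.0f) would be an out-of-range float-to-int8_t conversion,
        // so values must be pre-scaled into range before using the macro.
    }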
diff --git a/inference-engine/src/gna_plugin/runtime/pwl.cpp b/inference-engine/src/gna_plugin/runtime/pwl.cpp
index 8d8528a0b11..3cd5238eba6 100644
--- a/inference-engine/src/gna_plugin/runtime/pwl.cpp
+++ b/inference-engine/src/gna_plugin/runtime/pwl.cpp
@@ -496,11 +496,12 @@ std::vector pwl_search(const DnnActivation& activation_type,
 }
-void PwlDesignOpt16(const DnnActivation activation_type,
+void PwlDesignOpt(const DnnActivation activation_type,
                     std::vector<gna_pwl_segment_t> &ptr_segment,
                     const float scale_in,
                     const float scale_out,
-                    const float pwlMaxErrorPercent) {
+                    const float pwlMaxErrorPercent,
+                    const bool low_precision) {
     std::vector pwl;
     double err_pct = 0.0;
     auto minInputStats = 0.0f;
@@ -515,7 +516,7 @@ void PwlDesignOpt16(const DnnActivation activation_type,
             auto minInput = (activation_type.srcFQParams.set && absMax < SIGMOID_DOMAIN) ? -absMax : -SIGMOID_DOMAIN;
             auto maxInput = (activation_type.srcFQParams.set && absMax < SIGMOID_DOMAIN) ? absMax : SIGMOID_DOMAIN;
             pwl = pwl_search(activation_type, minInput, maxInput, PWL_DESIGN_THRESHOLD, pwlMaxErrorPercent, PWL_DESIGN_SAMPLES, err_pct);
-            make_gna_pwl(activation_type, pwl, minInput, maxInput, scale_in, scale_out, ptr_segment);
+            make_gna_pwl(activation_type, pwl, minInput, maxInput, scale_in, scale_out, low_precision, ptr_segment);
             break;
         }
         case kActTanh: {
@@ -523,7 +524,7 @@ void PwlDesignOpt16(const DnnActivation activation_type,
             auto minInput = (activation_type.srcFQParams.set && absMax < TANH_DOMAIN) ? -absMax : -TANH_DOMAIN;
             auto maxInput = (activation_type.srcFQParams.set && absMax < TANH_DOMAIN) ? absMax : TANH_DOMAIN;
             pwl = pwl_search(activation_type, minInput, maxInput, PWL_DESIGN_THRESHOLD, pwlMaxErrorPercent, PWL_DESIGN_SAMPLES, err_pct);
-            make_gna_pwl(activation_type, pwl, minInput, maxInput, scale_in, scale_out, ptr_segment);
+            make_gna_pwl(activation_type, pwl, minInput, maxInput, scale_in, scale_out, low_precision, ptr_segment);
             break;
         }
         case kActSoftSign: {
@@ -531,55 +532,56 @@ void PwlDesignOpt16(const DnnActivation activation_type,
             auto minInput = (activation_type.srcFQParams.set && absMax < SOFTSIGN_DOMAIN) ? -absMax : -SOFTSIGN_DOMAIN;
             auto maxInput = (activation_type.srcFQParams.set && absMax < SOFTSIGN_DOMAIN) ? absMax : SOFTSIGN_DOMAIN;
             pwl = pwl_search(activation_type, minInput, maxInput, PWL_DESIGN_THRESHOLD, pwlMaxErrorPercent, PWL_DESIGN_SAMPLES, err_pct);
-            make_gna_pwl(activation_type, pwl, minInput, maxInput, scale_in, scale_out, ptr_segment);
+            make_gna_pwl(activation_type, pwl, minInput, maxInput, scale_in, scale_out, low_precision, ptr_segment);
             break;
         }
         case kActRelu:
-            make_gna_pwl(activation_type, pwl, -1.0, 1.0, scale_in, scale_out, ptr_segment);
+            make_gna_pwl(activation_type, pwl, -1.0, 1.0, scale_in, scale_out, low_precision, ptr_segment);
             break;
         case kActLeakyRelu:
-            make_gna_pwl(activation_type, pwl, -1.0, 1.0, scale_in, scale_out, ptr_segment);
+            make_gna_pwl(activation_type, pwl, -1.0, 1.0, scale_in, scale_out, low_precision, ptr_segment);
             break;
         case kActIdentity:
         case kActFakeQuantize:
-            make_gna_pwl(activation_type, pwl, -1.0, 1.0, scale_in, scale_out, ptr_segment);
+            make_gna_pwl(activation_type, pwl, -1.0, 1.0, scale_in, scale_out, low_precision, ptr_segment);
             break;
         case kActKaldiLstmClipping:
-            make_gna_pwl(activation_type, pwl, activation_type.args.clamp.low, activation_type.args.clamp.high, scale_in, scale_out, ptr_segment);
+            make_gna_pwl(activation_type, pwl, activation_type.args.clamp.low, activation_type.args.clamp.high,
+                         scale_in, scale_out, low_precision, ptr_segment);
             break;
         case kActLog: {
             double x_min = (1 + ~XBASEMASK) / scale_in;
             double x_max = ((INT32_MAX / scale_in) < LOG_DOMAIN) ? (INT32_MAX / scale_in) : LOG_DOMAIN;
             pwl = pwl_search(activation_type, x_min, x_max, PWL_DESIGN_THRESHOLD, pwlMaxErrorPercent, PWL_DESIGN_SAMPLES, err_pct);
-            make_gna_pwl(activation_type, pwl, x_min, x_max, scale_in, scale_out, ptr_segment);
+            make_gna_pwl(activation_type, pwl, x_min, x_max, scale_in, scale_out, low_precision, ptr_segment);
             break;
         }
         case kActNegLog: {
             double x_min = (1 + ~XBASEMASK) / scale_in;
             double x_max = ((INT32_MAX / scale_in) < LOG_DOMAIN) ? (INT32_MAX / scale_in) : LOG_DOMAIN;
             pwl = pwl_search(activation_type, x_min, x_max, PWL_DESIGN_THRESHOLD, pwlMaxErrorPercent, PWL_DESIGN_SAMPLES, err_pct);
-            make_gna_pwl(activation_type, pwl, x_min, x_max, scale_in, scale_out, ptr_segment);
+            make_gna_pwl(activation_type, pwl, x_min, x_max, scale_in, scale_out, low_precision, ptr_segment);
             break;
         }
         case kActNegHalfLog: {
             double x_min = (1 + ~XBASEMASK) / scale_in;
             double x_max = ((INT32_MAX / scale_in) < LOG_DOMAIN) ? (INT32_MAX / scale_in) : LOG_DOMAIN;
            pwl = pwl_search(activation_type, x_min, x_max, PWL_DESIGN_THRESHOLD, pwlMaxErrorPercent, PWL_DESIGN_SAMPLES, err_pct);
-            make_gna_pwl(activation_type, pwl, x_min, x_max, scale_in, scale_out, ptr_segment);
+            make_gna_pwl(activation_type, pwl, x_min, x_max, scale_in, scale_out, low_precision, ptr_segment);
             break;
         }
         case kActExp: {
             double x_min = -log(scale_out);
             double x_max = x_min + log(INT16_MAX);
             pwl = pwl_search(activation_type, x_min, x_max, PWL_DESIGN_THRESHOLD, pwlMaxErrorPercent, PWL_DESIGN_SAMPLES, err_pct);
-            make_gna_pwl(activation_type, pwl, x_min, x_max, scale_in, scale_out, ptr_segment);
+            make_gna_pwl(activation_type, pwl, x_min, x_max, scale_in, scale_out, low_precision, ptr_segment);
             break;
         }
         case kActSign:
-            make_gna_pwl(activation_type, pwl, -1.0, 1.0, scale_in, scale_out, ptr_segment);
+            make_gna_pwl(activation_type, pwl, -1.0, 1.0, scale_in, scale_out, low_precision, ptr_segment);
             break;
         case kActAbs:
-            make_gna_pwl(activation_type, pwl, -1.0, 1.0, scale_in, scale_out, ptr_segment);
+            make_gna_pwl(activation_type, pwl, -1.0, 1.0, scale_in, scale_out, low_precision, ptr_segment);
             break;
         case kActPow: {
             auto fp32eq = [](float p1, float p2) -> bool {
@@ -600,7 +602,7 @@ void PwlDesignOpt16(const DnnActivation activation_type,
                 pwl = pwl_search(activation_type, x_min, x_max, PWL_DESIGN_THRESHOLD, maxError, PWL_DESIGN_SAMPLES, err_pct);
             }
-            make_gna_pwl(activation_type, pwl, x_min, x_max, scale_in, scale_out, ptr_segment);
+            make_gna_pwl(activation_type, pwl, x_min, x_max, scale_in, scale_out, low_precision, ptr_segment);
             break;
         }
         default:
@@ -608,11 +610,12 @@
 }
-void PwlDesign16(const DnnActivation activation_type,
+void PwlDesign(const DnnActivation activation_type,
                  gna_pwl_segment_t *ptr_segment,
                  const uint32_t num_segments,
                  const float scale_in,
-                 const float scale_out) {
+                 const float scale_out,
+                 const bool low_precision) {
     switch (activation_type) {
         case kActSigmoid:
            {
@@ -767,12 +770,12 @@ void PwlDesign16(const DnnActivation activation_type,
             else
                 gnalog() <<  "=========================== Identity Segments ===========================\n";
             if (x_lower_limit < INT32_MIN) {
-                std::cerr << "Warning: saturation in PwlDesign16! " << x_lower_limit  << " < INT32_MIN"<< std::endl;
+                std::cerr << "Warning: saturation in PwlDesign! " << x_lower_limit  << " < INT32_MIN"<< std::endl;
                 x_lower_limit = INT32_MIN;
                 y_lower_limit = static_cast((scale_out / scale_in)*static_cast(INT32_MIN) - 0.5);
             }
             if (x_upper_limit > INT32_MAX) {
-                std::cerr << "Warning: saturation in PwlDesign16! " << x_upper_limit  << " > INT32_MAX"<< std::endl;
+                std::cerr << "Warning: saturation in PwlDesign! " << x_upper_limit  << " > INT32_MAX"<< std::endl;
                 x_upper_limit = INT32_MAX;
                 y_upper_limit = static_cast((scale_out / scale_in)*static_cast(INT32_MAX) + 0.5);
             }
diff --git a/inference-engine/src/gna_plugin/runtime/pwl.h b/inference-engine/src/gna_plugin/runtime/pwl.h
index b4ab2dc30d0..cf908f9e765 100644
--- a/inference-engine/src/gna_plugin/runtime/pwl.h
+++ b/inference-engine/src/gna_plugin/runtime/pwl.h
@@ -95,13 +95,15 @@ void PwlApply32(intel_dnn_component_t *component,
                 const uint32_t num_row_end,
                 const uint32_t num_col_start,
                 const uint32_t num_col_end);
-void PwlDesign16(const DnnActivation activation_type,
+void PwlDesign(const DnnActivation activation_type,
                  gna_pwl_segment_t *ptr_segment,
                  const uint32_t num_segments,
                  const float scale_in,
-                 const float scale_out);
-void PwlDesignOpt16(const DnnActivation activation_type,
+                 const float scale_out,
+                 const bool low_precision);
+void PwlDesignOpt(const DnnActivation activation_type,
                     std::vector<gna_pwl_segment_t> &ptr_segment,
                     const float scale_in,
                     const float scale_out,
-                    const float pwlMaxErrorPercent);
+                    const float pwlMaxErrorPercent,
+                    const bool low_precision);
diff --git a/inference-engine/tests/functional/plugin/gna/shared_tests_instances/single_layer_tests/convolution.cpp b/inference-engine/tests/functional/plugin/gna/shared_tests_instances/single_layer_tests/convolution.cpp
index 8f99a0d6778..307b6a1271b 100644
--- a/inference-engine/tests/functional/plugin/gna/shared_tests_instances/single_layer_tests/convolution.cpp
+++ b/inference-engine/tests/functional/plugin/gna/shared_tests_instances/single_layer_tests/convolution.cpp
@@ -22,7 +22,7 @@ protected:
         }
     }
-    void SetUp() {
+    void SetUp() override {
         ConvolutionLayerTest::SetUp();
     }
 };
diff --git a/inference-engine/tests/functional/plugin/gna/shared_tests_instances/subgraph_tests/convolution_relu_sequence.cpp b/inference-engine/tests/functional/plugin/gna/shared_tests_instances/subgraph_tests/convolution_relu_sequence.cpp
index e8b67616944..5c0b1afd882 100644
--- a/inference-engine/tests/functional/plugin/gna/shared_tests_instances/subgraph_tests/convolution_relu_sequence.cpp
+++ b/inference-engine/tests/functional/plugin/gna/shared_tests_instances/subgraph_tests/convolution_relu_sequence.cpp
@@ -24,7 +24,7 @@ protected:
         }
     }
-    void SetUp() {
+    void SetUp() override {
         ConvolutionReluSequenceTest::SetUp();
     }
 };