[GNA] Fake quantization layer support for int-8 mode for GNA plugin (#2937)

* [GNA] Added support for per-channel FakeQuantize layers

* [GNA] Added detection of quantization types in FQ-enabled networks and detection of input scale factors from FakeQuantize layers connected to input layers

* Added a FakeQuantize callback that is used to cast integer values stored as float in FakeQuantize layers

* Fixed per-channel multiplier calculation for the int8 case

* Precision improvements for int8 fake quantization and support for propagating scale factors to activation layers

* Added initial int16 support

* Added support for FakeQuantize layers with multiple connected output layers and for FQ data encoded as FP16

* Added support for already quantized weights

* Added shared single-layer test

* Added subgraph test

* Fixed comment

* int8

* Enabled FQ tests on GNA

Co-authored-by: Eugene Smirnov <eugene.smirnov@intel.com>
Co-authored-by: Andrey Dmitriev <andrey.dmitriev@intel.com>
Bartosz Sochacki, 2020-11-20 14:40:19 +01:00 (committed by GitHub)
parent 27be33ba53
commit fc1a3ce2f1
25 changed files with 1430 additions and 268 deletions


@@ -45,14 +45,15 @@ struct DnnActivation {
         } pow;
         struct {
             int32_t levels;
-            float input_low;
-            float input_high;
-            float output_low;
-            float output_high;
+            // if input is per-channel quantization - input pointers contains per-channel ranges
+            int8_t inputPerChannel;
+            float *input_low;
+            float *input_high;
+            // if output is per-channel quantization - output pointers contains per-channel ranges
+            int8_t outputPerChannel;
+            float *output_low;
+            float *output_high;
         } fakeQuantize;
-        struct {
-            float reserved[5];
-        };
     } args;
     operator DnnActivationType () const noexcept {
         return type;


@ -15,6 +15,7 @@ struct GNAFlags {
bool uniformPwlDesign = false; bool uniformPwlDesign = false;
bool gna_openmp_multithreading = false; bool gna_openmp_multithreading = false;
bool sw_fp32 = false; bool sw_fp32 = false;
bool fake_quantized = false;
bool performance_counting = false; bool performance_counting = false;
}; };
} // namespace GNAPluginNS } // namespace GNAPluginNS


@@ -83,6 +83,10 @@ struct QuantI8 : public QuantDescTmpl<P_TYPE(I16), P_TYPE(I32), P_TYPE(I8), gna
     }
 };
 
+// for support proper trait instantiation for quantization function callback
+struct FakeQuantI16 : public QuantI16 {};
+struct FakeQuantI8 : public QuantI8 {};
+
 template <class A, class B>
 struct QuantPair {
     using MandatoryType = A;

@@ -115,7 +119,7 @@ inline bool shouldAlwaysAllocate<gna_compound_bias_t>() {
  */
 template <class T>
 class Quant {
  public:
     template<class ...Args>
     void operator()(Args && ... args) const { }
 };

@@ -125,7 +129,9 @@ class Quant<QuantI16> {
  public:
     template<class ...Args>
     void operator()(Args && ... args) const {
-        QuantizeAffine16(std::forward<Args>(args)...);
+        QuantizationCallback<int16_t, int32_t> {
+            std::forward<Args>(args)...
+        }.runQuantize();
     }
 };

@@ -134,10 +140,35 @@ class Quant<QuantI8> {
  public:
     template<class ...Args>
     void operator()(Args && ... args) const {
-        QuantizeAffine8(std::forward<Args>(args)...);
+        QuantizationCallback<int8_t, gna_compound_bias_t> {
+            std::forward<Args>(args)...
+        }.runQuantize();
     }
 };
 
+template<>
+class Quant<FakeQuantI16> {
+ public:
+    template<class ...Args>
+    void operator()(Args && ... args) const {
+        QuantizationCallback<int16_t, int32_t> {
+            std::forward<Args>(args)...
+        }.runFakeQuantize();
+    }
+};
+
+template<>
+class Quant<FakeQuantI8> {
+ public:
+    template<class ...Args>
+    void operator()(Args && ... args) const {
+        QuantizationCallback<int8_t, gna_compound_bias_t>{
+            std::forward<Args>(args)...
+        }.runFakeQuantize();
+    }
+};
+
 template <typename T>
 inline InferenceEngine::Blob::Ptr fp32_to_precision_blob(InferenceEngine::Blob::Ptr fp32_blob, InferenceEngine::Precision precision, float scale_factor) {
     auto prec_blob = InferenceEngine::make_shared_blob<T>({ precision,
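The two new FakeQuant trait tags reuse the int16/int8 descriptors but reroute calls into runFakeQuantize() instead of runQuantize(), purely through template specialization. Below is a minimal, self-contained sketch of that tag-dispatch pattern; Quantizer, Callback, PlainTag and FakeTag are illustrative stand-ins, not plugin types.

#include <cstdio>
#include <utility>

// Stand-in for a quantization callback: one aggregate, two entry points.
struct Callback {
    int levels;
    void runQuantize() const     { std::printf("plain quantization, levels=%d\n", levels); }
    void runFakeQuantize() const { std::printf("fake quantization, levels=%d\n", levels); }
};

struct PlainTag {};             // plays the role of QuantI8/QuantI16
struct FakeTag : PlainTag {};   // plays the role of FakeQuantI8/FakeQuantI16

template <class Tag>
struct Quantizer {              // primary template: no-op, like the generic Quant<T>
    template <class... Args> void operator()(Args&&...) const {}
};

template <>
struct Quantizer<PlainTag> {
    template <class... Args> void operator()(Args&&... args) const {
        Callback{std::forward<Args>(args)...}.runQuantize();
    }
};

template <>
struct Quantizer<FakeTag> {
    template <class... Args> void operator()(Args&&... args) const {
        Callback{std::forward<Args>(args)...}.runFakeQuantize();
    }
};

int main() {
    Quantizer<PlainTag>{}(16);   // routed to runQuantize
    Quantizer<FakeTag>{}(256);   // routed to runFakeQuantize
}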
@@ -242,7 +273,7 @@ inline void quantizeWeightsBiases(const QuantDesc & quantDesc,
     if (InferenceEngine::CNNNetHasPrevLayer(wl)) {
         auto quantDataForInputLayer =
             InferenceEngine::getInjectedData<QuantizedLayerParams>(*InferenceEngine::CNNNetPrevLayer(wl).get());
-        input_scale_factor = quantDataForInputLayer->_dst_quant.scale;
+        input_scale_factor = quantDataForInputLayer->_dst_quant.GetScale();
         if (std::isnan(input_scale_factor) ||
             std::isinf(input_scale_factor)) {
             THROW_IE_EXCEPTION << "Unsupported input scale factor value " << input_scale_factor;

@@ -273,17 +304,26 @@
     auto quantData = InferenceEngine::getInjectedData<QuantizedLayerParams>(*wl);
     {
+        auto per_channel_weights = !quantData->_weights_quant.GetMinValues().empty();
+        auto weightsScale = quantData->_weights_quant.GetScale();
+        auto dstScale = quantData->_dst_quant.GetScale();
         fnc(wl->_weights->buffer().as<float *>(),
             wl->_biases ? wl->_biases->buffer().as<float *>() : nullptr,
             intWeights->buffer(),
             intBiases ? intBiases->buffer() : static_cast<BiasesPrecision *>(nullptr),
             input_scale_factor,
-            &quantData->_weights_quant.scale,
-            &quantData->_dst_quant.scale,
+            &weightsScale,
+            &dstScale,
             num_rows,
             num_columns,
             num_rows_padded,
-            num_columns_padded);
+            num_columns_padded,
+            quantData->_weights_quant.GetLevels(),
+            nullptr,
+            nullptr,
+            per_channel_weights ? &quantData->_weights_quant.GetMinValues().front(): nullptr,
+            per_channel_weights ? &quantData->_weights_quant.GetMaxValues().front(): nullptr,
+            &quantData->_weights_quantized);
     }
     wl->_weights = intWeights;
     wl->_biases = intBiases;

@@ -343,7 +383,7 @@ inline void quantizeWeightsBiasesConv(const QuantDesc & quantDesc,
     if (InferenceEngine::CNNNetHasPrevLayer(conv)) {
         auto quantDataForInputLayer =
             InferenceEngine::getInjectedData<QuantizedLayerParams>(*InferenceEngine::CNNNetPrevLayer(conv).get());
-        input_scale_factor = quantDataForInputLayer->_dst_quant.scale;
+        input_scale_factor = quantDataForInputLayer->_dst_quant.GetScale();
         if (std::isnan(input_scale_factor) ||
             std::isinf(input_scale_factor)) {
             THROW_IE_EXCEPTION << "Unsupported input scale factor value " << input_scale_factor;

@@ -370,13 +410,15 @@
     auto quantData = InferenceEngine::getInjectedData<QuantizedLayerParams>(*conv);
     {
+        auto weightsScale = quantData->_weights_quant.GetScale();
+        auto dstScale = quantData->_dst_quant.GetScale();
         fnc(conv->_weights->buffer().as<float *>(),
             conv->_biases ? conv->_biases->buffer().as<float *>() : nullptr,
             intWeights->buffer(),
             intBiases ? intBiases->buffer() : static_cast<BiasesPrecision *>(nullptr),
             input_scale_factor,
-            &quantData->_weights_quant.scale,
-            &quantData->_dst_quant.scale,
+            &weightsScale,
+            &dstScale,
             num_rows,
             num_columns,
             num_rows_padded,

@@ -447,7 +489,7 @@ class DataQuantizer<Desc, InferenceEngine::CNNLayer *> : public DataQuantizerBase {
         if (cnnLayer->blobs["custom"]->getTensorDesc().getPrecision() == InferenceEngine::Precision::FP16) {
             cnnLayer->blobs["custom"] = make_fp32_blob(cnnLayer->blobs["custom"]);
         }
-        auto const_scale_factor = InferenceEngine::getInjectedData<QuantizedLayerParams>(*cnnLayer)->_dst_quant.scale;
+        auto const_scale_factor = InferenceEngine::getInjectedData<QuantizedLayerParams>(*cnnLayer)->_dst_quant.GetScale();
         auto new_const_blob = InferenceEngine::Blob::CreateFromData(cnnLayer->outData[0]);
         auto const_blob = cnnLayer->blobs["custom"];
         if (const_blob->getTensorDesc().getPrecision() == InferenceEngine::Precision::FP32) {

@@ -563,4 +605,9 @@ class LayersQuantizer : public frontend::DataQuantizerBase {
 using QuantI16 = frontend::QuantPair<frontend::QuantI16, frontend::QuantI16>;
 using QuantI8 = frontend::QuantPair<frontend::QuantI8, frontend::QuantI16>;
+
+using FakeQuantI16 = frontend::QuantPair<frontend::FakeQuantI16, frontend::FakeQuantI16>;
+using FakeQuantI8 = frontend::QuantPair<frontend::FakeQuantI8, frontend::FakeQuantI16>;
+
 }  // namespace GNAPluginNS


@ -80,7 +80,7 @@ class ModelQuantizer {
THROW_GNA_EXCEPTION << "Scale factors are not set for some of the inputs"; THROW_GNA_EXCEPTION << "Scale factors are not set for some of the inputs";
} }
IE_ASSERT(quantData != nullptr); IE_ASSERT(quantData != nullptr);
quantData->_src_quant.scale = scaleFactor[scaleIndex]; quantData->_src_quant.SetScale(scaleFactor[scaleIndex]);
scaleIndex++; scaleIndex++;
} }


@@ -5,20 +5,91 @@
 #include <cstring>
 #include <iostream>
 #include <details/ie_exception.hpp>
+#include <gna_plugin_log.hpp>
+#include <limits>
 #include "backend/gna_types.h"
 #include "quantization.h"
 
-void QuantizeAffine16(float *ptr_float_weights,
-                      float *ptr_float_biases,
-                      int16_t *ptr_int_weights,
-                      int32_t *ptr_int_biases,
-                      float input_scale_factor,
-                      float *ptr_weight_scale_factor,
-                      float *ptr_output_scale_factor,
-                      uint32_t num_rows,
-                      uint32_t num_columns,
-                      uint32_t num_rows_padded,
-                      uint32_t num_columns_padded) {
+#ifdef DEBUG
+#define QUANTWARNING(...) (fprintf(stderr, __VA_ARGS__))
+#else
+#define QUANTWARNING(...)
+#endif
+
+template<>
+void QuantizationCallback<int16_t, int32_t>::runFakeQuantize() const {
+    uint32_t num_saturate = 0;
+
+    for (uint32_t row = 0; row < num_rows; row++) {
+        for (uint32_t col = 0; col < num_columns; col++) {
+            float rounding_value = (ptr_float_weights[row * num_columns + col] > 0) ? 0.5f : -0.5f;
+            float value = ptr_float_weights[row * num_columns + col];
+            if (!*ptr_quantized_weights) {
+                value = value * *ptr_weight_scale_factor + rounding_value;
+            } else {
+                value -= MAX_VAL_2B_WEIGHT;
+            }
+
+            int16_t* ptr_weight_16 = ptr_int_weights + (row * num_columns_padded + col);
+
+            if (*ptr_quantized_weights &&
+                (value > std::numeric_limits<int16_t>::max() ||
+                 value < std::numeric_limits<int16_t>::min())) {
+                THROW_GNA_EXCEPTION << "unsupported weights range for I16 quantisation: " << value;
+            }
+
+            if (value > std::numeric_limits<int16_t>::max()) {
+                *ptr_weight_16 = std::numeric_limits<int16_t>::max();
+                num_saturate++;
+            } else if (value < std::numeric_limits<int16_t>::min()) {
+                *ptr_weight_16 = std::numeric_limits<int16_t>::min();
+                num_saturate++;
+            } else {
+                *ptr_weight_16 = (int16_t)value;
+            }
+        }
+        for (uint32_t col = num_columns; col < num_columns_padded; col++) {
+            int16_t* ptr_weight_16 = ptr_int_weights + (row * num_columns_padded + col);
+            *ptr_weight_16 = 0;
+        }
+    }
+
+    for (uint32_t row = num_rows; row < num_rows_padded; row++) {
+        for (uint32_t col = 0; col < num_columns_padded; col++) {
+            int16_t* ptr_weight_16 = ptr_int_weights + (row * num_columns_padded + col);
+            *ptr_weight_16 = 0;
+        }
+    }
+
+    // case for element wise layer
+    if (ptr_float_biases != nullptr && ptr_int_biases != nullptr) {
+        for (uint32_t j = 0; j < num_rows; j++) {
+            float rounding_value = (ptr_float_biases[j] > 0) ? 0.5f : -0.5f;
+            float value = ptr_float_biases[j] * *ptr_output_scale_factor + rounding_value;
+            if (value > 2147483647.0) {
+                ptr_int_biases[j] = 2147483647L;
+                num_saturate++;
+            } else if (value < -2147483648.0) {
+                ptr_int_biases[j] = -2147483648LL;
+                num_saturate++;
+            } else {
+                ptr_int_biases[j] = (int32_t)value;
+            }
+        }
+        for (uint32_t j = num_rows; j < num_rows_padded; j++) {
+            ptr_int_biases[j] = 0;
+        }
+    }
+
+    if (num_saturate > 0) {
+        QUANTWARNING("Warning: %d / %d saturations in QuantizeAffine16()\n",
+                     num_saturate,
+                     num_rows * num_columns + num_rows);
+    }
+}
+
+template<>
+void QuantizationCallback<int16_t, int32_t>::runQuantize() const {
     uint32_t num_saturate = 0;
 
     if (*ptr_weight_scale_factor == 1.0) {
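In the fake-quantize path above, weights that are not already quantized are scaled, rounded half away from zero (the ±0.5 term), and saturated to the int16 range, with saturations counted for the closing warning. A small standalone sketch of just that arithmetic, using made-up input values:

#include <cstdint>
#include <cstdio>
#include <limits>

// Round half away from zero, then clamp to int16 - the same per-weight
// arithmetic as above (saturations are counted so a warning can be issued).
static int16_t quantize_to_int16(float value, float scale, uint32_t* saturations) {
    float rounding = (value > 0) ? 0.5f : -0.5f;
    float scaled = value * scale + rounding;
    if (scaled > std::numeric_limits<int16_t>::max()) { ++*saturations; return std::numeric_limits<int16_t>::max(); }
    if (scaled < std::numeric_limits<int16_t>::min()) { ++*saturations; return std::numeric_limits<int16_t>::min(); }
    return static_cast<int16_t>(scaled);
}

int main() {
    uint32_t saturations = 0;
    std::printf("%d\n", quantize_to_int16(0.37f, 16384.0f, &saturations));  // 6062
    std::printf("%d\n", quantize_to_int16(2.5f, 16384.0f, &saturations));   // clamps to 32767
    std::printf("saturations: %u\n", saturations);                          // 1
}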
@@ -149,11 +220,90 @@ void QuantizeVector16(float *ptr_float_memory, int16_t *ptr_int_memory, uint32_t
     }
 }
 
-void QuantizeAffine8(float *ptr_float_weights, float *ptr_float_biases,
-                     int8_t *ptr_int_weights, gna_compound_bias_t *ptr_int_biases,
-                     float input_scale_factor, float *ptr_weight_scale_factor,
-                     float *ptr_output_scale_factor, uint32_t num_rows, uint32_t num_columns,
-                     uint32_t num_rows_padded, uint32_t num_columns_padded) {
+template<>
+void QuantizationCallback<int8_t, gna_compound_bias_t>::runFakeQuantize() const {
+    uint32_t num_saturate = 0;
+
+    if (fq_ptr_output_high == nullptr || fq_ptr_output_low == nullptr) {
+        THROW_GNA_EXCEPTION << "Fake quantized output range not set";
+    }
+    if (fq_levels == 0 || fq_levels == 1) {
+        THROW_GNA_EXCEPTION << "Fake quantized levels not set";
+    }
+
+    for (uint32_t i = 0; i < num_rows; i++) {
+        uint32_t channel_multiplier = ((fq_ptr_output_high[i] - fq_ptr_output_low[i]) *
+            *ptr_weight_scale_factor) / (fq_levels - 1) + 0.5f;
+        ptr_int_biases[i].multiplier = static_cast<uint8_t> (channel_multiplier);
+        if (channel_multiplier > MAX_OUT_MULTIPLIER) {
+            THROW_GNA_EXCEPTION << "invalid channel multiplier: " << channel_multiplier;
+        }
+
+        for (uint32_t j = 0; j < num_columns; j++) {
+            auto offset = i * num_columns + j;
+            auto rounding_value = (ptr_float_weights[i * num_columns + j] > 0) ? 0.5f : -0.5f;
+            float value = ptr_float_weights[offset];
+            if (!*ptr_quantized_weights) {
+                value = value * (*ptr_weight_scale_factor / ptr_int_biases[i].multiplier) + rounding_value;
+            } else {
+                value -= MAX_VAL_1B_WEIGHT;
+            }
+
+            auto normalizedWeight = static_cast<int32_t>(value);
+
+            if (*ptr_quantized_weights &&
+                (value > std::numeric_limits<int8_t>::max() ||
+                 value < std::numeric_limits<int8_t>::min())) {
+                THROW_GNA_EXCEPTION << "unsupported weights range for I8 quantization: " << value;
+            }
+
+            if (value > std::numeric_limits<int8_t>::max()) {
+                normalizedWeight = std::numeric_limits<int8_t>::max();
+                num_saturate++;
+            } else if (value < std::numeric_limits<int8_t>::min()) {
+                normalizedWeight = std::numeric_limits<int8_t>::min();
+                num_saturate++;
+            } else {
+                normalizedWeight = (int8_t)value;
+            }
+
+            // range checking
+            ptr_int_weights[offset] = static_cast<int8_t>(normalizedWeight);
+        }
+
+        for (uint32_t j = num_columns; j < num_columns_padded; j++) {
+            ptr_int_weights[i * num_columns + j] = 0;
+        }
+    }
+
+    for (uint32_t i = num_rows; i < num_rows_padded; i++) {
+        for (uint32_t j = 0; j < num_columns_padded; j++) {
+            ptr_int_weights[i * num_columns + j] = 0;
+        }
+        ptr_int_biases[i].multiplier = 0;
+    }
+
+    if (ptr_float_biases != nullptr) {
+        for (uint32_t j = 0; j < num_rows; j++) {
+            float rounding_value = (ptr_float_biases[j] > 0) ? 0.5f : -0.5f;
+            float value = ptr_float_biases[j] * *ptr_output_scale_factor + rounding_value;
+            if (value > 2147483647.0) {
+                ptr_int_biases[j].bias = 2147483647L;
+                num_saturate++;
+            } else if (value < -2147483648.0) {
+                ptr_int_biases[j].bias = -2147483648LL;
+                num_saturate++;
+            } else {
+                ptr_int_biases[j].bias = (int32_t) value;
+            }
+        }
+    }
+
+    if (num_saturate > 0) {
+        QUANTWARNING("Warning: %d / %d saturations in QuantizeAffine8()\n", num_saturate, num_rows * num_columns + num_rows);
+    }
+}
+
+template<>
+void QuantizationCallback<int8_t, gna_compound_bias_t>::runQuantize() const {
     if (ptr_int_biases == nullptr) {
         THROW_IE_EXCEPTION << "Int biases are empty";
     }


@@ -16,25 +16,34 @@
 #define MAX_VAL_2B_WEIGHT 16384
 #define MAX_VAL_2B_FEAT 16384
 #define MAX_VAL_4B_BIAS 1073741824
-#ifdef DEBUG
-#define QUANTWARNING(...) (fprintf(stderr, __VA_ARGS__))
-#else
-#define QUANTWARNING(...)
-#endif
 
-void QuantizeAffine16(float *ptr_float_weights,
-                      float *ptr_float_biases,
-                      int16_t *ptr_int_weights,
-                      int32_t *ptr_int_biases,
-                      float input_scale_factor,
-                      float *ptr_weight_scale_factor,
-                      float *ptr_output_scale_factor,
-                      uint32_t num_rows,
-                      uint32_t num_columns,
-                      uint32_t num_rows_padded,
-                      uint32_t num_columns_padded);
+template <class WeightsType, class BiasType>
+struct QuantizationCallback {
+    float *ptr_float_weights;
+    float *ptr_float_biases;
+    WeightsType* ptr_int_weights;
+    BiasType* ptr_int_biases;
+    float input_scale_factor;
+    float *ptr_weight_scale_factor;
+    float *ptr_output_scale_factor;
+    uint32_t num_rows;
+    uint32_t num_columns;
+    uint32_t num_rows_padded;
+    uint32_t num_columns_padded;
+
+    int32_t fq_levels;
+    const float *fq_ptr_input_low;
+    const float *fq_ptr_input_high;
+    const float *fq_ptr_output_low;
+    const float *fq_ptr_output_high;
+    const bool* ptr_quantized_weights;
+
+    void runQuantize() const;
+    void runFakeQuantize() const;
+};
+
+template class QuantizationCallback<int16_t, int32_t>;
+template class QuantizationCallback<int8_t, gna_compound_bias_t>;
+
 float ScaleFactorForQuantization(void *ptr_float_memory, float target_max, size_t num_elements);
 void QuantizeVector16(float *ptr_float_memory, int16_t *ptr_int_memory, uint32_t num_elements, float scale_factor);
-void QuantizeAffine8(float *ptr_float_weights, float *ptr_float_biases, int8_t *ptr_int_weights, gna_compound_bias_t *ptr_int_biases,
-                     float input_scale_factor, float *ptr_weight_scale_factor, float *ptr_output_scale_factor,
-                     uint32_t num_rows, uint32_t num_columns, uint32_t num_rows_padded, uint32_t num_columns_padded);
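QuantizationCallback is a plain aggregate, so call sites brace-initialize it in member declaration order; reaching the per-channel range pointers therefore requires placeholders (such as the nullptr arguments passed at the weight-quantization call site earlier) for any preceding fake-quantize fields a caller does not use. A reduced illustration with a shortened, hypothetical member list:

#include <cstdint>
#include <cstdio>

// Reduced aggregate in the spirit of QuantizationCallback: brace-initialization
// fills members strictly in declaration order.
struct Callback {
    float scale;
    uint32_t rows;
    int32_t fq_levels;
    const float* fq_input_low;
    const float* fq_input_high;
    const float* fq_output_low;
    const float* fq_output_high;
};

int main() {
    float lows[] = {-1.0f}, highs[] = {1.0f};
    // The two nullptr placeholders stand in for unused input-range pointers,
    // mirroring how the plugin's call site skips fields it does not need.
    Callback cb{2048.0f, 8, 255, nullptr, nullptr, lows, highs};
    std::printf("levels=%d out_range=[%g, %g]\n",
                cb.fq_levels, cb.fq_output_low[0], cb.fq_output_high[0]);
}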


@@ -6,19 +6,57 @@
 namespace GNAPluginNS {
 
-struct Quantization {
+class Quantization {
+public:
+    void SetScale(float s) {
+        scale = s;
+        scale_set = true;
+    }
+    float GetScale() const {
+        return scale;
+    }
+    bool IsScaleSet() const {
+        return scale_set;
+    }
+    void SetLevels(int32_t l) {
+        levels = l;
+    }
+    int32_t GetLevels() const {
+        return levels;
+    }
+    void SetMinValues(const std::vector<float> &min) {
+        min_values.clear();
+        min_values.insert(min_values.end(), min.begin(), min.end());
+    }
+    const std::vector<float>& GetMinValues() const {
+        return min_values;
+    }
+    void SetMaxValues(const std::vector<float>& max) {
+        max_values.clear();
+        max_values.insert(max_values.end(), max.begin(), max.end());
+    }
+    const std::vector<float>& GetMaxValues() const {
+        return max_values;
+    }
+
+private:
     float scale = 1.0f;
-    float offset = 0.0f;
-    int shift = 0.0f;
+    bool scale_set = false;
+    int32_t levels = 0;
+    std::vector<float> min_values;
+    std::vector<float> max_values;
 };
 
 struct QuantizedLayerParams {
     Quantization _src_quant;
     Quantization _dst_quant;
 
-    // deprecate this
     Quantization _weights_quant;
+    bool _weights_quantized = false;
     Quantization _bias_quant;
 
     float _o_shift = 0.0f;
     float _b_shift = 0.0f;
 };
 }  // namespace GNAPluginNS
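One reason for wrapping the bare struct in a class: a float alone cannot distinguish a scale that was never assigned from one explicitly set to 1.0, which is why checks such as !fp32eq(scale, 1) in the scale factor calculation below are replaced with IsScaleSet(). A reduced sketch of that distinction:

#include <cstdio>

class Quantization {
public:
    void SetScale(float s) { scale = s; scale_set = true; }
    float GetScale() const { return scale; }
    bool IsScaleSet() const { return scale_set; }
private:
    float scale = 1.0f;      // default value
    bool scale_set = false;  // remembers whether SetScale was ever called
};

int main() {
    Quantization a, b;
    b.SetScale(1.0f);  // explicitly chosen scale of 1.0
    // A float-only check against 1.0 could not tell these two apart.
    std::printf("a set: %d, b set: %d\n", a.IsScaleSet(), b.IsScaleSet());  // a set: 0, b set: 1
}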


@@ -64,8 +64,9 @@ class ScaleFactorPerLayer<InferenceEngine::CNNLayer *> {
     }
 
     float getActivationScale(InferenceEngine::CNNLayer const* cnnLayer,
-                             GNAPluginNS::LayerInfo const& layer,
-                             QuantizedLayerParams const* quantizedParams) {
+                             GNAPluginNS::LayerInfo const& layer) {
+        auto quantizedParams = InferenceEngine::getInjectedData<QuantizedLayerParams>(*cnnLayer);
+
         // todo: calculate proper scale factor where we need to expand it a bit to be safe to stay in int16 weights
         // set the initial value
         float result = activation_scale_factor;

@@ -82,29 +83,29 @@ class ScaleFactorPerLayer<InferenceEngine::CNNLayer *> {
             for (int slope_scale_index = 1; slope_scale_index != 5; slope_scale_index ++) {
                 auto slope_scale = static_cast<double>(static_cast<uint64_t>(1) << (8 * slope_scale_index));
-                auto mink = min_range * slope_scale / quantizedParams->_src_quant.scale;
-                auto maxk = max_range * slope_scale / quantizedParams->_src_quant.scale;
+                auto mink = min_range * slope_scale / quantizedParams->_src_quant.GetScale();
+                auto maxk = max_range * slope_scale / quantizedParams->_src_quant.GetScale();
                 if (mink < std::numeric_limits<int16_t>::max()) {
                     auto localMaxK = std::min(static_cast<double>(std::numeric_limits<int16_t>::max()), maxk);
                     if (localMaxK > optimalK) {
-                        result = localMaxK / slope_scale * quantizedParams->_src_quant.scale;
+                        result = localMaxK / slope_scale * quantizedParams->_src_quant.GetScale();
                         optimalK = localMaxK;
                     }
                 }
             }
 #else
             // GNA scale factor encoding might poor represent target slop scale, we are probing 2 values
-            auto s = gna_slope(1.0, quantizedParams->_src_quant.scale, identity_scale_factor);
+            auto s = gna_slope(1.0, quantizedParams->_src_quant.GetScale(), identity_scale_factor);
             auto scale_default = s.slope * s.slope_scale;
             // probing one more quite good approximation for identity
-            s = gna_slope(1.0, quantizedParams->_src_quant.scale, identity_scale_factor / 2);
+            s = gna_slope(1.0, quantizedParams->_src_quant.GetScale(), identity_scale_factor / 2);
             auto scale_extra = s.slope * s.slope_scale;
             result = fabs(scale_extra) > fabs(scale_default) ? identity_scale_factor / 2 : identity_scale_factor;
 #endif
         } else if (layer.isRelu() &&
-                   static_cast<uint64_t>(activation_scale_factor * quantizedParams->_src_quant.scale)
+                   static_cast<uint64_t>(activation_scale_factor * quantizedParams->_src_quant.GetScale())
                    > std::numeric_limits<int32_t>::max()-1) {
             // if activation is one from relu family, we need to apply heuristic to avoid activation output overflow
             result = (activation_scale_factor * 0.5);

@@ -118,10 +119,10 @@ class ScaleFactorPerLayer<InferenceEngine::CNNLayer *> {
             auto input_max_value = static_cast<double>(std::numeric_limits<int32_t>::max());
             auto output_max_value = static_cast<double>(std::numeric_limits<int16_t>::max());
 
-            auto x_min = fp32eq(fmod(powerLayer->power, 1.0), 0) ? input_min_value / quantizedParams->_src_quant.scale : 0.0;
+            auto x_min = fp32eq(fmod(powerLayer->power, 1.0), 0) ? input_min_value / quantizedParams->_src_quant.GetScale() : 0.0;
             x_min = std::max(x_min, -pow_domain);
 
-            auto x_max = input_max_value / quantizedParams->_src_quant.scale;
+            auto x_max = input_max_value / quantizedParams->_src_quant.GetScale();
             x_max = std::min(x_max, pow_domain);
 
             auto val1 = pow(x_min * powerLayer->scale + powerLayer->offset, powerLayer->power);

@@ -134,6 +135,14 @@ class ScaleFactorPerLayer<InferenceEngine::CNNLayer *> {
                 result = scale_val;
             }
         }
+
+        if (!quantizedParams->_dst_quant.GetMaxValues().empty()) {
+            auto min_value = quantizedParams->_dst_quant.GetMinValues().front();
+            auto max_value = quantizedParams->_dst_quant.GetMaxValues().front();
+            auto newScaleFactor = (quantizedParams->_dst_quant.GetLevels() - 1) / (max_value - min_value);
+            result = newScaleFactor < result ? newScaleFactor : result;
+        }
+
         return result;
     }
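When a FakeQuantize layer has fixed the destination range, the hunk above caps the activation scale factor at (levels - 1) / (max - min), and only ever lowers the heuristic value. A tiny numeric sketch with assumed numbers:

#include <algorithm>
#include <cstdio>

int main() {
    float result = 2048.0f;                 // scale chosen by the existing heuristics (assumed value)
    int levels = 256;                       // hypothetical FQ levels
    float min_value = -1.0f, max_value = 1.0f;
    float newScaleFactor = (levels - 1) / (max_value - min_value);  // 127.5
    result = std::min(newScaleFactor, result);                      // FQ range can only lower the scale
    std::printf("activation scale factor: %.1f\n", result);         // 127.5
}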
@@ -147,12 +156,16 @@ class ScaleFactorPerLayer<InferenceEngine::CNNLayer *> {
         auto quant = InferenceEngine::getInjectedData<QuantizedLayerParams>(*cnnLayer);
 
         if (InferenceEngine::details::CaselessEq<std::string>()(cnnLayer->type, "Memory")) {
-            if (CNNNetHasPrevLayer(cnnLayer)) {
+            if (!CNNNetHasPrevLayer(cnnLayer) && quant->_dst_quant.IsScaleSet()) {
+                quant->_src_quant = quant->_dst_quant;
+            }
+
+            if (CNNNetHasPrevLayer(cnnLayer)) {
                 auto prevLayer = CNNNetPrevLayer(cnnLayer);
                 auto prevInfo = LayerInfo(prevLayer);
                 auto inputQuant = InferenceEngine::getInjectedData<QuantizedLayerParams>(prevLayer);
                 // locating corresponding memory layers with same ID
-                for (auto && input : CNNNetGetAllInputLayers(cnnLayer)) {
+                for (auto&& input : CNNNetGetAllInputLayers(cnnLayer)) {
                     LayerInfo ll(input);
                     if (!ll.isMemory() ||
                         !InferenceEngine::details::CaselessEq<std::string>()(input->params["id"], cnnLayer->params["id"])) {

@@ -162,35 +175,36 @@ class ScaleFactorPerLayer<InferenceEngine::CNNLayer *> {
                     auto quantSibling = InferenceEngine::getInjectedData<QuantizedLayerParams>(input);
 
                     // after restarting from memory input - quant is fine
-                    if (fp32eq(quantSibling->_dst_quant.scale, inputQuant->_dst_quant.scale)) {
-                        quant->_src_quant.scale = quant->_dst_quant.scale = inputQuant->_dst_quant.scale;
+                    if (fp32eq(quantSibling->_dst_quant.GetScale(), inputQuant->_dst_quant.GetScale())) {
+                        quant->_src_quant.SetScale(inputQuant->_dst_quant.GetScale());
+                        quant->_dst_quant.SetScale(inputQuant->_dst_quant.GetScale());
                         return true;
                     }
 
-                    if (!fp32eq(quantSibling->_dst_quant.scale, 1)) {
+                    if (quantSibling->_dst_quant.IsScaleSet()) {
                         // means we already restarted propagation input memory layer
                         // need to search for requantiseable layer prior memory output layer
                         InferenceEngine::CNNLayerPtr restartedLayer;
 
-                        gnalog() << "Memory layer :"<< input->name << " scale factor: " << quantSibling->_dst_quant.scale
-                                 << " doesn't match its outputs counterpart: " << cnnLayer->name << " scale factor: " << inputQuant->_dst_quant.scale << "\n";
+                        gnalog() << "Memory layer :" << input->name << " scale factor: " << quantSibling->_dst_quant.GetScale()
+                                 << " doesn't match its outputs counterpart: " << cnnLayer->name << " scale factor: " << inputQuant->_dst_quant.GetScale() << "\n";
 
                         gnalog() << "[UFS] searching for quantizeable input layer for: " << cnnLayer->name << "\n";
 
                         CNNNetDFS(InferenceEngine::CNNLayerPtr(cnnLayer, [](InferenceEngine::CNNLayer*) {}),
                                   [&restartedLayer, cnnLayer](InferenceEngine::CNNLayerPtr layer) {
                                       gnalog() << "[UFS] from : " << cnnLayer->name << " reached: " << layer->name;
                                       // found that direct input to concat is a indirect parent of align filter - so no link required
                                       auto info = LayerInfo(layer);
                                       if (!info.isWeightable() && !info.isActivation()) {
                                           gnalog() << "... skipped\n";
                                           return;
                                       }
                                       restartedLayer = layer;
                                       gnalog() << "... OK, need requantize\n";
                                   }, true, [&restartedLayer, &cnnLayer](InferenceEngine::CNNLayer* from) {
                                       // aborting UFS once found suitable layer
                                       return make_upstream_order(restartedLayer == nullptr ? from : nullptr);
                                   });
 
                         if (restartedLayer == nullptr) {
                             THROW_GNA_EXCEPTION << "cannot requantize input to " << cnnLayer->name;

@@ -201,23 +215,23 @@ class ScaleFactorPerLayer<InferenceEngine::CNNLayer *> {
                         auto restarLayerInfo = LayerInfo(restartedLayer);
                         if (restarLayerInfo.isActivation()) {
                             // requantize activation by just changing it's output scale factor
-                            quantDataForMemoryOutput->_dst_quant.scale = quantSibling->_dst_quant.scale;
+                            quantDataForMemoryOutput->_dst_quant.SetScale(quantSibling->_dst_quant.GetScale());
                         } else {
-                            THROW_GNA_EXCEPTION << "quantization error : input scale factor ( " << inputQuant->_dst_quant.scale <<") "
-                                                << " for " << cnnLayer->name << ", that is child of " << prevLayer->name <<" doesnt match : "
+                            THROW_GNA_EXCEPTION << "quantization error : input scale factor ( " << inputQuant->_dst_quant.GetScale() << ") "
+                                                << " for " << cnnLayer->name << ", that is child of " << prevLayer->name << " doesnt match : "
                                                 << activation_scale_factor;
                         }
 
                         result = ScaleFactorUpdateResult(restartedLayer.get());
                         return true;
                     }
 
-                    gnawarn() << "[INFO] quantization : input scale factor (" << inputQuant->_dst_quant.scale <<")"
-                              << " for " << cnnLayer->name << ", that is child of " << prevLayer->name <<" doesnt match : "
-                              << activation_scale_factor << ", restarting from corresponding memory: "<< input->name << std::endl;
+                    gnawarn() << "[INFO] quantization : input scale factor (" << inputQuant->_dst_quant.GetScale() << ")"
+                              << " for " << cnnLayer->name << ", that is child of " << prevLayer->name << " doesnt match : "
+                              << activation_scale_factor << ", restarting from corresponding memory: " << input->name << std::endl;
 
                     // try updating memory input layer scale factor and restart from it
-                    quantSibling->_src_quant.scale = quantSibling->_dst_quant.scale = inputQuant->_dst_quant.scale;
+                    quantSibling->_src_quant = quantSibling->_dst_quant = inputQuant->_dst_quant;
                     result = ScaleFactorUpdateResult(input.get());
                     return true;
                 }
@ -226,11 +240,16 @@ class ScaleFactorPerLayer<InferenceEngine::CNNLayer *> {
} }
if (cnnLayer->type == "Const") { if (cnnLayer->type == "Const") {
if (quant->_dst_quant.IsScaleSet()) {
quant->_src_quant = quant->_dst_quant;
return ScaleFactorUpdateResult();
}
auto blob = cnnLayer->blobs["custom"]; auto blob = cnnLayer->blobs["custom"];
auto blob_precision = blob->getTensorDesc().getPrecision(); auto blob_precision = blob->getTensorDesc().getPrecision();
if (blob_precision != InferenceEngine::Precision::FP32 && blob_precision != InferenceEngine::Precision::FP16) { if (blob_precision != InferenceEngine::Precision::FP32 && blob_precision != InferenceEngine::Precision::FP16) {
quant->_dst_quant.scale = 1.0f; quant->_dst_quant.SetScale(1.0f);
return true; return true;
} }
@ -255,16 +274,16 @@ class ScaleFactorPerLayer<InferenceEngine::CNNLayer *> {
// TODO: Investigate what should be the scale in such cases (31910) // TODO: Investigate what should be the scale in such cases (31910)
if (std::isinf(scale_val)) { if (std::isinf(scale_val)) {
quant->_dst_quant.scale = quant->_src_quant.scale; quant->_dst_quant.SetScale(quant->_src_quant.GetScale());
} else { } else {
quant->_dst_quant.scale = scale_val; quant->_dst_quant.SetScale(scale_val);
} }
return ScaleFactorUpdateResult(); return ScaleFactorUpdateResult();
} }
if (!CNNNetHasPrevLayer(cnnLayer)) { if (!CNNNetHasPrevLayer(cnnLayer)) {
quant->_dst_quant.scale = quant->_src_quant.scale; quant->_dst_quant = quant->_src_quant;
return ScaleFactorUpdateResult(); return ScaleFactorUpdateResult();
} }
@@ -273,14 +292,17 @@ class ScaleFactorPerLayer<InferenceEngine::CNNLayer *> {
         if (!inputQuant) {
             THROW_GNA_EXCEPTION << "layer: " << CNNNetPrevLayer(cnnLayer)->name << "not quantized";
         }
-        quant->_dst_quant.scale = inputQuant->_dst_quant.scale;
-        quant->_src_quant.scale = inputQuant->_dst_quant.scale;
+
+        quant->_src_quant = inputQuant->_dst_quant;
         if (layerInfo.isActivation()) {
             // todo: calculate proper scale factor where we need to expand it a bit to be safe to stay in int16 weights
             // set the initial value
-            quant->_dst_quant.scale = getActivationScale(cnnLayer, layerInfo, quant);
+            auto scale = getActivationScale(cnnLayer, layerInfo);
+            quant->_dst_quant.SetScale(scale);
+            return true;
         }
+        quant->_dst_quant = inputQuant->_dst_quant;
 
         return true;
     }
 };
@@ -302,8 +324,8 @@ class ScaleFactorPerLayer<InferenceEngine::EltwiseLayer*> {
         switch (eltwiseLayer->_operation) {
             case InferenceEngine::EltwiseLayer::Prod: {
-                quantData->_weights_quant.scale = quantParams1->_dst_quant.scale;
-                quantData->_dst_quant.scale = quantParams0->_dst_quant.scale * quantParams1->_dst_quant.scale;
+                quantData->_weights_quant = quantParams1->_dst_quant;
+                quantData->_dst_quant.SetScale(quantParams0->_dst_quant.GetScale() * quantParams1->_dst_quant.GetScale());
                 break;
             }
             case InferenceEngine::EltwiseLayer::Sub:
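For an element-wise product the output scale is simply the product of the two input scales, while the second input's quantization parameters also serve as the "weight" quantization. A one-line numeric illustration with assumed values:

#include <cstdio>

int main() {
    float scale0 = 1024.0f;  // hypothetical scale of eltwise input 0
    float scale1 = 512.0f;   // hypothetical scale of eltwise input 1 (also used as the weight scale)
    std::printf("prod output scale = %.0f\n", scale0 * scale1);  // 524288
}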
@@ -325,13 +347,13 @@ class ScaleFactorPerLayer<InferenceEngine::EltwiseLayer*> {
                 }
 
                 // this path might result in significant data loss
-                quantData->_bias_quant.scale = quantParams1->_dst_quant.scale / quantParams0->_dst_quant.scale;
-                quantData->_weights_quant.scale = quantParams1->_dst_quant.scale / quantParams0->_dst_quant.scale;
-                quantData->_dst_quant.scale = quantParams1->_dst_quant.scale;
+                quantData->_bias_quant.SetScale(quantParams1->_dst_quant.GetScale() / quantParams0->_dst_quant.GetScale());
+                quantData->_weights_quant.SetScale(quantParams1->_dst_quant.GetScale() / quantParams0->_dst_quant.GetScale());
+                quantData->_dst_quant.SetScale(quantParams1->_dst_quant.GetScale());
 
                 // eltwise will always work in int16
                 auto maxValue = std::numeric_limits<int16_t>::max() - 1;
-                if (quantData->_weights_quant.scale > maxValue + 1) {
+                if (quantData->_weights_quant.GetScale() > maxValue + 1) {
                     // rescaling it's activation input
                     // iterating thru previous layers of eltwise
                     for (uint8_t i = 0; i < 2; ++i) {

@@ -347,15 +369,15 @@ class ScaleFactorPerLayer<InferenceEngine::EltwiseLayer*> {
                         if (info.isSplit() || info.isSlice()) {
                             continue;
                         } else if (info.has16BOutput() && info.isActivation()) {
-                            auto newOutputScale = quantParams->_dst_quant.scale / maxValue;
+                            auto newOutputScale = quantParams->_dst_quant.GetScale() / maxValue;
                             if (newOutputScale > static_cast<float>(std::numeric_limits<int16_t>::max()) / 2) {
                                 break;
                             }
                             auto quantDataForActivation = InferenceEngine::getInjectedData<QuantizedLayerParams>(*in);
                             gnawarn() << "[WARNING] saturated weights for " << eltwiseLayer->name
                                       << ". Layer new output scale: " << in->name << ", output_scale=" << newOutputScale
-                                      << ", was " << quantDataForActivation->_dst_quant.scale <<"\n" << std::flush;
-                            quantDataForActivation->_dst_quant.scale = newOutputScale;
+                                      << ", was " << quantDataForActivation->_dst_quant.GetScale() <<"\n" << std::flush;
+                            quantDataForActivation->_dst_quant.SetScale(newOutputScale);
                             result = ScaleFactorUpdateResult(in.get());
                             return true;
                         } else if (info.has16BOutput()) {

@@ -365,10 +387,10 @@ class ScaleFactorPerLayer<InferenceEngine::EltwiseLayer*> {
                     // if we are here it means that we are in the port 1
                     if (info.isFullyConnected() || info.isConvolution()) {
                         auto quantDataForInputLayer = InferenceEngine::getInjectedData<QuantizedLayerParams>(*in);
-                        auto newOutputScale = quantParams->_dst_quant.scale * maxValue;
-                        auto newWeightScale = newOutputScale / quantDataForInputLayer->_src_quant.scale;
-                        quantDataForInputLayer->_dst_quant.scale = newOutputScale;
-                        quantDataForInputLayer->_weights_quant.scale = newWeightScale;
+                        auto newOutputScale = quantParams->_dst_quant.GetScale() * maxValue;
+                        auto newWeightScale = newOutputScale / quantDataForInputLayer->_src_quant.GetScale();
+                        quantDataForInputLayer->_dst_quant.SetScale(newOutputScale);
+                        quantDataForInputLayer->_weights_quant.SetScale(newWeightScale);
                         result = ScaleFactorUpdateResult(in.get());
                         return true;
                     }
@@ -410,15 +432,15 @@ class ScaleFactorPerLayer<InferenceEngine::ConcatLayer*> {
         // if all inputs have same quant value - trivial propagation
         auto in0 = inputLayers.front();
         auto quantParams0 = InferenceEngine::getInjectedData<QuantizedLayerParams>(in0);
-        auto scaleFactor = quantParams0->_dst_quant.scale;
+        auto scaleFactor = quantParams0->_dst_quant.GetScale();
         auto scaleFactorCheck = [scaleFactor, &fp32eq](InferenceEngine::CNNLayerPtr& inputLayer) {
             auto quantParams = InferenceEngine::getInjectedData<QuantizedLayerParams>(inputLayer);
-            return fp32eq(quantParams->_dst_quant.scale, scaleFactor);
+            return fp32eq(quantParams->_dst_quant.GetScale(), scaleFactor);
         };
 
         if (std::find_if_not(inputLayers.begin() + 1, inputLayers.end(), scaleFactorCheck) == inputLayers.end()) {
-            quantData->_dst_quant.scale = quantParams0->_dst_quant.scale;
-            quantData->_src_quant.scale = quantParams0->_dst_quant.scale;
+            quantData->_dst_quant.SetScale(quantParams0->_dst_quant.GetScale());
+            quantData->_src_quant.SetScale(quantParams0->_dst_quant.GetScale());
             return true;
         }

@@ -435,7 +457,7 @@ class ScaleFactorPerLayer<InferenceEngine::ConcatLayer*> {
         auto nextInputIt = firstInputIt + 1;
         while ((nextInputIt = std::find_if(nextInputIt, inputLayers.end(), inputLayerCheck)) != inputLayers.end()) {
             auto quantParamsSecond = InferenceEngine::getInjectedData<QuantizedLayerParams>(*nextInputIt);
-            if (!fp32eq(quantParamsSecond->_dst_quant.scale, quantParamsFirst->_dst_quant.scale)) {
+            if (!fp32eq(quantParamsSecond->_dst_quant.GetScale(), quantParamsFirst->_dst_quant.GetScale())) {
                 THROW_GNA_EXCEPTION << "Two Input layers " << (*firstInputIt)->name
                                     << " and " << (*nextInputIt)->name << " have different scales in concat!!! \n";
             }

@@ -449,7 +471,7 @@ class ScaleFactorPerLayer<InferenceEngine::ConcatLayer*> {
         auto sourceLayerCheck = [&fp32eq](InferenceEngine::CNNLayerPtr& inputLayer) {
             auto quantParams = InferenceEngine::getInjectedData<QuantizedLayerParams>(inputLayer);
             LayerInfo info(inputLayer);
-            return !info.isActivation() && !fp32eq(quantParams->_dst_quant.scale, 1.0f);
+            return !info.isActivation() && !fp32eq(quantParams->_dst_quant.GetScale(), 1.0f);
         };
 
         static std::map<std::string, size_t> restarted_counter;

@@ -469,7 +491,7 @@ class ScaleFactorPerLayer<InferenceEngine::ConcatLayer*> {
         if (sourceLayerIt == inputLayers.end()) {
             auto nonDefaultScaleFactor = [&fp32eq](InferenceEngine::CNNLayerPtr& inputLayer) {
                 auto quantParams = InferenceEngine::getInjectedData<QuantizedLayerParams>(inputLayer);
-                return !fp32eq(quantParams->_dst_quant.scale, 1.0f);
+                return !fp32eq(quantParams->_dst_quant.GetScale(), 1.0f);
             };
 
             sourceLayerIt = std::find_if(inputLayers.begin(), inputLayers.end(), nonDefaultScaleFactor);
@@ -478,29 +500,28 @@ class ScaleFactorPerLayer<InferenceEngine::ConcatLayer*> {
         std::set<size_t> concatIdxToUpdate;
         if (sourceLayerIt != inputLayers.end()) {
             auto quantParams = InferenceEngine::getInjectedData<QuantizedLayerParams>(*sourceLayerIt);
-            auto scaleFactor = quantParams->_dst_quant.scale;
+            auto scaleFactor = quantParams->_dst_quant.GetScale();
             sourceQuantParams = quantParams;
 
             for (auto it = inputLayers.begin(); it != inputLayers.end(); ++it) {
                 auto quantParamsIn = InferenceEngine::getInjectedData<QuantizedLayerParams>(*it);
-                if (fp32eq(quantParamsIn->_dst_quant.scale, scaleFactor)) {
+                if (fp32eq(quantParamsIn->_dst_quant.GetScale(), scaleFactor)) {
                     continue;
                 }
 
                 // possible case when some of the concat inputs are free to select scale ex: const->concat<-affine
-                if (!fp32eq(quantParamsIn->_dst_quant.scale, 1.0f) && !LayerInfo(*it).isActivation()) {
+                if (!fp32eq(quantParamsIn->_dst_quant.GetScale(), 1.0f) && !LayerInfo(*it).isActivation()) {
                     concatIdxToUpdate.insert(std::distance(inputLayers.begin(), it));
                 }
 
-                quantParamsIn->_weights_quant = quantParams->_dst_quant;
-                quantParamsIn->_dst_quant = quantParams->_dst_quant;
+                quantParamsIn->_dst_quant.SetScale(quantParams->_dst_quant.GetScale());
             }
         }
 
-        auto updatedScaleFactor = InferenceEngine::getInjectedData<QuantizedLayerParams>(in0)->_dst_quant.scale;
+        auto updatedScaleFactor = InferenceEngine::getInjectedData<QuantizedLayerParams>(in0)->_dst_quant.GetScale();
         auto equalScaleFactor = [updatedScaleFactor, &fp32eq](InferenceEngine::CNNLayerPtr& inputLayer) {
             auto quantParams = InferenceEngine::getInjectedData<QuantizedLayerParams>(inputLayer);
-            return fp32eq(quantParams->_dst_quant.scale, updatedScaleFactor);
+            return fp32eq(quantParams->_dst_quant.GetScale(), updatedScaleFactor);
         };
 
         auto layerIt = std::find_if_not(inputLayers.begin() + 1, inputLayers.end(), equalScaleFactor);

@@ -508,8 +529,8 @@ class ScaleFactorPerLayer<InferenceEngine::ConcatLayer*> {
             THROW_GNA_EXCEPTION << "layers entered into concat have different scale factors" << concatLayer->name;
         }
 
-        quantData->_dst_quant.scale = sourceQuantParams->_dst_quant.scale;
-        quantData->_src_quant.scale = sourceQuantParams->_dst_quant.scale;
+        quantData->_dst_quant.SetScale(sourceQuantParams->_dst_quant.GetScale());
+        quantData->_src_quant.SetScale(sourceQuantParams->_dst_quant.GetScale());
 
         if (layerIt == inputLayers.end() && concatIdxToUpdate.empty()) {
             return true;

@@ -517,7 +538,7 @@ class ScaleFactorPerLayer<InferenceEngine::ConcatLayer*> {
         for (auto& layerIdToUpdate : concatIdxToUpdate) {
             auto destinationQuantParams = InferenceEngine::getInjectedData<QuantizedLayerParams>(*concatLayer);
-            destinationQuantParams->_dst_quant.scale = sourceQuantParams->_dst_quant.scale;
+            destinationQuantParams->_dst_quant.SetScale(sourceQuantParams->_dst_quant.GetScale());
 
             InferenceEngine::CNNLayerPtr restartedLayer;
             // making a link activation possible without extra layer if first input to concat not a parent / indirect parent of second input

@@ -542,18 +563,18 @@ class ScaleFactorPerLayer<InferenceEngine::ConcatLayer*> {
             });
 
             if (restartedLayer == nullptr) {
-                THROW_GNA_EXCEPTION << "cannot requantize " << layerIdToUpdate << "input to concat: " << concatLayer->name;
+                THROW_GNA_EXCEPTION << "cannot requantize " << layerIdToUpdate << " input to concat: " << concatLayer->name;
             }
 
             auto quantDataForConCatInput = InferenceEngine::getInjectedData<QuantizedLayerParams>(*restartedLayer);
 
             auto restarLayerInfo = LayerInfo(restartedLayer);
             if (restarLayerInfo.isActivation()) {
                 // requantize activation by just changing it's output scale factor
-                quantDataForConCatInput->_dst_quant.scale = sourceQuantParams->_dst_quant.scale;
+                quantDataForConCatInput->_dst_quant.SetScale(sourceQuantParams->_dst_quant.GetScale());
             }
             if (restarLayerInfo.isConst()) {
                 gnalog() << "... warning const layer will be requantized\n";
-                quantDataForConCatInput->_dst_quant.scale = sourceQuantParams->_dst_quant.scale;
+                quantDataForConCatInput->_dst_quant.SetScale(sourceQuantParams->_dst_quant.GetScale());
             }
 
             result = ScaleFactorUpdateResult(restartedLayer.get());
         }
@@ -588,9 +609,9 @@ class ScaleFactorPerLayer<InferenceEngine::WeightableLayer*> {
             InferenceEngine::getInjectedData<QuantizedLayerParams>(*InferenceEngine::CNNNetPrevLayer(wl).get());
 
         auto quant = InferenceEngine::getInjectedData<QuantizedLayerParams>(*wl);
-        quant->_src_quant.scale = quantDataForInputLayer->_dst_quant.scale;
+        quant->_src_quant = quantDataForInputLayer->_dst_quant;
         // TODO: pass 8 bits somehow
-        if (quant->_weights_quant.scale == 1.0f) {
+        if (quant->_weights_quant.GetScale() == 1.0f) {
             size_t scaleRange = 0;
             if (weightsSize == 2) {
                 scaleRange = MAX_VAL_2B_WEIGHT;

@@ -599,67 +620,61 @@ class ScaleFactorPerLayer<InferenceEngine::WeightableLayer*> {
             } else {
                 THROW_GNA_EXCEPTION << "Unsupported weights size of: " << weightsSize;
             }
-            quant->_weights_quant.scale =
-                ScaleFactorForQuantization(wl->_weights->buffer().as<float *>(), scaleRange, wl->_weights->size());
-            if (quant->_weights_quant.scale == -1.0f) {
-                quant->_weights_quant.scale = 1.0f;
+            quant->_weights_quant.SetScale(
+                ScaleFactorForQuantization(wl->_weights->buffer().as<float *>(), scaleRange, wl->_weights->size()));
+            if (quant->_weights_quant.GetScale() == -1.0f) {
+                quant->_weights_quant.SetScale(1.0f);
             }
 
             if (wl->_biases) {
-                quant->_bias_quant.scale = ScaleFactorForQuantization(wl->_biases->buffer().as<float *>(),
-                                                                      MAX_VAL_4B_BIAS,
-                                                                      wl->_biases->size());
-                if (quant->_bias_quant.scale != -1.0f) {
-                    quant->_bias_quant.scale = std::min(quant->_weights_quant.scale * quant->_src_quant.scale, quant->_bias_quant.scale);
-                    quant->_weights_quant.scale = quant->_bias_quant.scale / quant->_src_quant.scale;
+                quant->_bias_quant.SetScale(ScaleFactorForQuantization(wl->_biases->buffer().as<float *>(),
+                                                                       MAX_VAL_4B_BIAS,
+                                                                       wl->_biases->size()));
+                if (quant->_bias_quant.GetScale() != -1.0f) {
+                    quant->_bias_quant.SetScale(
+                        std::min(quant->_weights_quant.GetScale() * quant->_src_quant.GetScale(), quant->_bias_quant.GetScale()));
+                    quant->_weights_quant.SetScale(quant->_bias_quant.GetScale() / quant->_src_quant.GetScale());
                 }
             }
 
             // TODO: findout why ???
             if (weightsSize == 1) {
-                quant->_weights_quant.scale *= MAX_OUT_MULTIPLIER;
+                quant->_weights_quant.SetScale(quant->_weights_quant.GetScale() * MAX_OUT_MULTIPLIER);
             }
 
             double weights_reducer = 1.0;
-            auto conv = dynamic_cast<InferenceEngine::ConvolutionLayer*>(wl);
+            auto conv = dynamic_cast<InferenceEngine::ConvolutionLayer *>(wl);
             if (conv) {
                 auto dims = conv->insData.front().lock()->getDims();
 
                 weights_reducer = MAX_VAL_2B_FEAT * scaleRange * dims[1] / std::numeric_limits<int32_t>::max();
                 weights_reducer = std::max(1.0, weights_reducer);
             }
-            quant->_weights_quant.scale /= weights_reducer;
+            quant->_weights_quant.SetScale(quant->_weights_quant.GetScale() / weights_reducer);
         }
 
-        double tmp_dst_quant_scale = quant->_weights_quant.scale * quantDataForInputLayer->_dst_quant.scale;
+        double tmp_dst_quant_scale = quant->_weights_quant.GetScale() * quant->_src_quant.GetScale();
 
         if (weightsSize == 1 &&
-            static_cast<uint64_t>(tmp_dst_quant_scale * quant->_src_quant.scale) >
-            static_cast<uint64_t>(std::numeric_limits<int32_t>::max()-1) * _scale_change_req_threshold) {
+            static_cast<uint64_t>(tmp_dst_quant_scale * quant->_src_quant.GetScale()) >
+            static_cast<uint64_t>(std::numeric_limits<int32_t>::max() - 1) * _scale_change_req_threshold) {
             gnawarn() << "Output scale for " << wl->name
                       << " too large and are being reduced. Else saturations likely will happen \n";
             // reduce weight scale according experimental heuristic
-            if (quant->_dst_quant.scale * quant->_src_quant.scale /
-                static_cast<float>(std::numeric_limits<int32_t>::max()) < _scale_change_threshold_100) {
-                quant->_weights_quant.scale *= _scale_reduction_50;
-                tmp_dst_quant_scale *= _scale_reduction_50;
-            } else if (quant->_dst_quant.scale * quant->_src_quant.scale /
-                       static_cast<float>(std::numeric_limits<int32_t>::max()) < _scale_change_threshold_150) {
-                quant->_weights_quant.scale *= _scale_reduction_45;
-                tmp_dst_quant_scale *= _scale_reduction_45;
-            } else if (quant->_dst_quant.scale * quant->_src_quant.scale /
-                       static_cast<float>(std::numeric_limits<int32_t>::max()) < _scale_change_threshold_200) {
-                quant->_weights_quant.scale *= _scale_reduction_40;
-                tmp_dst_quant_scale *= _scale_reduction_40;
+            if (quant->_dst_quant.GetScale() * quant->_src_quant.GetScale() /
+                static_cast<float>(std::numeric_limits<int32_t>::max()) < _scale_change_threshold_100) {
+                quant->_weights_quant.SetScale(quant->_weights_quant.GetScale() * _scale_reduction_50);
+            } else if (quant->_dst_quant.GetScale() * quant->_src_quant.GetScale() /
+                       static_cast<float>(std::numeric_limits<int32_t>::max()) < _scale_change_threshold_150) {
+                quant->_weights_quant.SetScale(quant->_weights_quant.GetScale() * _scale_reduction_45);
+            } else if (quant->_dst_quant.GetScale() * quant->_src_quant.GetScale() /
+                       static_cast<float>(std::numeric_limits<int32_t>::max()) < _scale_change_threshold_200) {
+                quant->_weights_quant.SetScale(quant->_weights_quant.GetScale() * _scale_reduction_40);
             } else {
-                quant->_weights_quant.scale *= _scale_reduction_35;
-                tmp_dst_quant_scale *= _scale_reduction_35;
+                quant->_weights_quant.SetScale(quant->_weights_quant.GetScale() * _scale_reduction_35);
             }
         }
 
-        quant->_dst_quant.scale = tmp_dst_quant_scale;
+        quant->_dst_quant.SetScale(quant->_weights_quant.GetScale() * quant->_src_quant.GetScale());
 
         return true;
     }
 };
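For weightable layers the destination scale ends up as the weight scale times the source scale, and the staged reductions above only trigger for int8 weights when that product, scaled once more by the input, would exceed the int32 accumulator range (the _scale_change_req_threshold factor is omitted in this sketch). A numeric illustration with made-up scales:

#include <cstdint>
#include <cstdio>
#include <limits>

int main() {
    float src_scale = 16384.0f;      // example input scale factor
    float weights_scale = 64.0f;     // example weight scale factor
    float dst_scale = weights_scale * src_scale;   // 1048576: scale of the int32 accumulator output
    // Simplified overflow check in the spirit of the heuristic above.
    bool risky = static_cast<uint64_t>(dst_scale * src_scale) >
                 static_cast<uint64_t>(std::numeric_limits<int32_t>::max() - 1);
    std::printf("dst scale = %.0f, needs reduction: %s\n", dst_scale, risky ? "yes" : "no");
}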


@@ -31,6 +31,7 @@
 #include "layers/layers_builder.hpp"
 #include "layers/gna_concat_layer.hpp"
 #include "layers/gna_crop_layer.hpp"
+#include "layers/gna_fake_quantize_layer.hpp"
 #include "round_float_define.hpp"
 #include "gna_plugin_policy.hpp"

@@ -377,8 +378,8 @@ void GNAGraphCompiler::ConvolutionPrimitive(InferenceEngine::CNNLayerPtr layer)
     float output_scale_factor = 1.0f;
     auto quantized = InferenceEngine::getInjectedData<QuantizedLayerParams>(layer);
     if (quantized != nullptr) {
-        weight_scale_factor = quantized->_weights_quant.scale;
-        output_scale_factor = quantized->_dst_quant.scale;
+        weight_scale_factor = quantized->_weights_quant.GetScale();
+        output_scale_factor = quantized->_dst_quant.GetScale();
     }
 
     auto& currentComponent = dnnComponents.addComponent(layer->name, "convolution");

@@ -541,8 +542,8 @@ void GNAGraphCompiler::PowerPrimitive(InferenceEngine::CNNLayerPtr layer) {
         // TODO: only fp32 and Int16 tested
         quantized == nullptr ? input->getPrecision().size() : 2,
         quantized == nullptr ? input->getPrecision().size() : 4,
-        quantized == nullptr ? 1 : quantized->_weights_quant.scale,
-        quantized == nullptr ? 1 : quantized->_dst_quant.scale,
+        quantized == nullptr ? 1 : quantized->_weights_quant.GetScale(),
+        quantized == nullptr ? 1 : quantized->_dst_quant.GetScale(),
         ptr_inputs,
         ptr_outputs,
         ptr_weights,

@@ -558,9 +559,9 @@ void GNAGraphCompiler::PowerPrimitive(InferenceEngine::CNNLayerPtr layer) {
         gnamem->readonly().push_value(ptr_biases, power.offset, num_rows_out, 64);
     } else {
         IE_ASSERT(quantized != nullptr);
-        auto quantizedScale = FLOAT_TO_INT16(std::min(quantized->_weights_quant.scale * power.scale,
+        auto quantizedScale = FLOAT_TO_INT16(std::min(quantized->_weights_quant.GetScale() * power.scale,
             static_cast<float>(INT16_MAX)));
-        auto quantizedOffset = FLOAT_TO_INT32(std::min(quantized->_dst_quant.scale * power.offset,
+        auto quantizedOffset = FLOAT_TO_INT32(std::min(quantized->_dst_quant.GetScale() * power.offset,
             static_cast<float>(INT32_MAX)));
         gnamem->readonly().push_value<int16_t>(ptr_weights, quantizedScale, num_rows_out, 64);
         gnamem->readonly().push_value<int32_t>(ptr_biases, quantizedOffset, num_rows_out, 64);

@@ -580,8 +581,8 @@ void GNAGraphCompiler::PowerPrimitive(InferenceEngine::CNNLayerPtr layer) {
     gna_pwl_segment_t* ptr_pwl_segments_target = nullptr;
 
-    float output_pwl_scale_factor = quantized != nullptr ? quantized->_dst_quant.scale : 1.0f;
-    float input_pwl_scale_factor = quantized != nullptr ? quantized->_src_quant.scale : 1.0f;
+    float output_pwl_scale_factor = quantized != nullptr ? quantized->_dst_quant.GetScale() : 1.0f;
+    float input_pwl_scale_factor = quantized != nullptr ? quantized->_src_quant.GetScale() : 1.0f;
 
     if (!gnaFlags->sw_fp32) {
         if (gnaFlags->uniformPwlDesign) {
@ -687,7 +688,7 @@ void GNAGraphCompiler::PoolingPrimitive(InferenceEngine::CNNLayerPtr layer) {
pooling._kernel[X_AXIS], pooling._kernel[X_AXIS],
num_columns_in, num_columns_in,
false, false,
quantized == nullptr ? 1 : quantized->_dst_quant.scale, quantized == nullptr ? 1 : quantized->_dst_quant.GetScale(),
ptr_inputs, ptr_inputs,
ptr_outputs); ptr_outputs);
@ -727,7 +728,7 @@ void GNAGraphCompiler::CopyPrimitive(InferenceEngine::CNNLayerPtr layer) {
num_columns_out, num_columns_out,
inputs->getPrecision().size(), inputs->getPrecision().size(),
outputs->getPrecision().size(), outputs->getPrecision().size(),
quantized == nullptr ? 1 : quantized->_dst_quant.scale, quantized == nullptr ? 1 : quantized->_dst_quant.GetScale(),
num_rows_out + num_padding_out, num_rows_out + num_padding_out,
num_columns_out, num_columns_out,
ptr_inputs, ptr_inputs,
@ -915,8 +916,8 @@ void GNAGraphCompiler::CropPrimitive(InferenceEngine::CNNLayerPtr layer) {
4, 4,
quantized == nullptr ? inputs->getPrecision().size() : 2, quantized == nullptr ? inputs->getPrecision().size() : 2,
4, 4,
quantized == nullptr ? 1 : quantized->_weights_quant.scale, quantized == nullptr ? 1 : quantized->_weights_quant.GetScale(),
quantized == nullptr ? 1 : quantized->_dst_quant.scale, quantized == nullptr ? 1 : quantized->_dst_quant.GetScale(),
ptr_inputs, ptr_inputs,
ptr_outputs, ptr_outputs,
ptr_weights, ptr_weights,
@ -1028,8 +1029,8 @@ void GNAGraphCompiler::EltwisePrimitive(InferenceEngine::CNNLayerPtr layer) {
// TODO: only fp32 and Int16 tested // TODO: only fp32 and Int16 tested
quantized == nullptr ? inputs2Bytes->getPrecision().size() : 2, quantized == nullptr ? inputs2Bytes->getPrecision().size() : 2,
quantized == nullptr ? inputs4Bytes->getPrecision().size() : 4, quantized == nullptr ? inputs4Bytes->getPrecision().size() : 4,
quantized == nullptr ? 1 : quantized->_weights_quant.scale, quantized == nullptr ? 1 : quantized->_weights_quant.GetScale(),
quantized == nullptr ? 1 : quantized->_dst_quant.scale, quantized == nullptr ? 1 : quantized->_dst_quant.GetScale(),
ptr_inputs, ptr_inputs,
ptr_outputs, ptr_outputs,
ptr_weights, ptr_weights,
@ -1050,7 +1051,7 @@ void GNAGraphCompiler::EltwisePrimitive(InferenceEngine::CNNLayerPtr layer) {
if (quantized == nullptr) { if (quantized == nullptr) {
gnamem->readonly().push_value(ptr_weights, -1.0f, num_rows_out, 64); gnamem->readonly().push_value(ptr_weights, -1.0f, num_rows_out, 64);
} else { } else {
auto scaledIdentity = -quantized->_weights_quant.scale; auto scaledIdentity = -quantized->_weights_quant.GetScale();
auto quantizedIdentity = FLOAT_TO_INT16(std::min(scaledIdentity, static_cast<float>(INT16_MAX))); auto quantizedIdentity = FLOAT_TO_INT16(std::min(scaledIdentity, static_cast<float>(INT16_MAX)));
@ -1062,7 +1063,7 @@ void GNAGraphCompiler::EltwisePrimitive(InferenceEngine::CNNLayerPtr layer) {
if (quantized == nullptr) { if (quantized == nullptr) {
gnamem->readonly().push_value(ptr_weights, 1.0f, num_rows_out, 64); gnamem->readonly().push_value(ptr_weights, 1.0f, num_rows_out, 64);
} else { } else {
auto scaledIdentity = quantized->_weights_quant.scale; auto scaledIdentity = quantized->_weights_quant.GetScale();
auto quantizedIdentity = FLOAT_TO_INT16(std::min(scaledIdentity, static_cast<float>(INT16_MAX))); auto quantizedIdentity = FLOAT_TO_INT16(std::min(scaledIdentity, static_cast<float>(INT16_MAX)));
@ -1132,8 +1133,8 @@ void GNAGraphCompiler::AffinePrimitive(InferenceEngine::CNNLayerPtr layer, bool
outputs->getPrecision().size(), outputs->getPrecision().size(),
weightable._weights->getTensorDesc().getPrecision().size(), weightable._weights->getTensorDesc().getPrecision().size(),
biasPrecision.size(), biasPrecision.size(),
quantized == nullptr ? 1 : quantized->_weights_quant.scale, quantized == nullptr ? 1 : quantized->_weights_quant.GetScale(),
quantized == nullptr ? 1 : quantized->_dst_quant.scale, quantized == nullptr ? 1 : quantized->_dst_quant.GetScale(),
ptr_inputs, ptr_inputs,
ptr_outputs, ptr_outputs,
ptr_weights, ptr_weights,
@ -1310,7 +1311,7 @@ void GNAGraphCompiler::ConcatAlignFilterPrimitive(InferenceEngine::CNNLayerPtr l
num_columns_in, num_columns_in,
inputs->getPrecision().size(), inputs->getPrecision().size(),
inputs->getPrecision().size(), inputs->getPrecision().size(),
quantized == nullptr ? 1 : quantized->_dst_quant.scale, quantized == nullptr ? 1 : quantized->_dst_quant.GetScale(),
num_rows_copied, num_rows_copied,
num_columns_in, num_columns_in,
ptr_inputs, ptr_inputs,
@ -1346,8 +1347,8 @@ void GNAGraphCompiler::ConcatAlignFilterPrimitive(InferenceEngine::CNNLayerPtr l
outputs->getPrecision().size(), outputs->getPrecision().size(),
filterLayer->_weights->getTensorDesc().getPrecision().size(), filterLayer->_weights->getTensorDesc().getPrecision().size(),
biasPrecision.size(), biasPrecision.size(),
quantized == nullptr ? 1 : quantized->_weights_quant.scale, quantized == nullptr ? 1 : quantized->_weights_quant.GetScale(),
quantized == nullptr ? 1 : quantized->_dst_quant.scale, quantized == nullptr ? 1 : quantized->_dst_quant.GetScale(),
ptr_inputs, ptr_inputs,
ptr_outputs, ptr_outputs,
ptr_weights, ptr_weights,
@ -1436,8 +1437,8 @@ void GNAGraphCompiler::AffineFilterPrimitive(InferenceEngine::CNNLayerPtr layer)
outputs->getPrecision().size(), outputs->getPrecision().size(),
filterLayer->_weights->getTensorDesc().getPrecision().size(), filterLayer->_weights->getTensorDesc().getPrecision().size(),
biasPrecision.size(), biasPrecision.size(),
quantized == nullptr ? 1 : quantized->_weights_quant.scale, quantized == nullptr ? 1 : quantized->_weights_quant.GetScale(),
quantized == nullptr ? 1 : quantized->_dst_quant.scale, quantized == nullptr ? 1 : quantized->_dst_quant.GetScale(),
ptr_inputs, ptr_inputs,
ptr_outputs, ptr_outputs,
ptr_weights, ptr_weights,
@ -1517,13 +1518,14 @@ void GNAGraphCompiler::PWLPrimitive(InferenceEngine::CNNLayerPtr layer) {
} }
} while (false); } while (false);
IE_ASSERT(!layer->insData.empty()); GNA_LAYER_ASSERT(layer, !layer->insData.empty());
IE_ASSERT(!layer->outData.empty()); GNA_LAYER_ASSERT(layer, !layer->outData.empty());
auto inputs = layer->insData.begin()->lock(); auto inputs = layer->insData.begin()->lock();
auto outputs = *layer->outData.begin(); auto outputs = *layer->outData.begin();
auto quantized = InferenceEngine::getInjectedData<QuantizedLayerParams>(layer); auto quantized = InferenceEngine::getInjectedData<QuantizedLayerParams>(layer);
float output_pwl_scale_factor = quantized != nullptr ? quantized->_dst_quant.scale : 1.0f; float output_pwl_scale_factor = quantized != nullptr ? quantized->_dst_quant.GetScale() : 1.0f;
float input_pwl_scale_factor = quantized != nullptr ? quantized->_src_quant.scale : 1.0f; float input_pwl_scale_factor = quantized != nullptr ? quantized->_src_quant.GetScale() : 1.0f;
auto orientation = kDnnInterleavedOrientation; auto orientation = kDnnInterleavedOrientation;
@ -1588,39 +1590,7 @@ void GNAGraphCompiler::PWLPrimitive(InferenceEngine::CNNLayerPtr layer) {
} }
if (it->second == kActFakeQuantize) { if (it->second == kActFakeQuantize) {
// get params from const input activation_type = GNAFakeQuantizeLayer(layer).parseAsActivation();
auto GetParamFromInputAsFloat = [](CNNLayerPtr input, size_t idx) {
if (input->insData.size() <= idx) {
THROW_GNA_LAYER_EXCEPTION(input) << "cannot get data from " << idx << "input";
}
auto iLayerData = input->insData[idx].lock();
if (!iLayerData) {
THROW_GNA_LAYER_EXCEPTION(input) << "cannot get data from " << idx << ", input: cannot dereference data weak-pointer";
}
auto iLayer = getCreatorLayer(iLayerData).lock();
if (!iLayer) {
THROW_GNA_LAYER_EXCEPTION(input) << "cannot get data from " << idx << ", input: cannot dereference creator layer weak-pointer";
}
if (!LayerInfo(iLayer).isConst()) {
THROW_GNA_LAYER_EXCEPTION(input) << "cannot get data from " << idx << ", input: expected to be of type const, but was: " << iLayer->type;
}
if (!iLayer->blobs.count("custom")) {
THROW_GNA_LAYER_EXCEPTION(iLayer) << "cannot get custom blob";
}
auto data = iLayer->blobs["custom"];
if (data->getTensorDesc().getPrecision() != Precision::FP32) {
THROW_GNA_LAYER_EXCEPTION(iLayer) << "cannot cast custom blob to type FP32, since it is of type: " << data->getTensorDesc().getPrecision();
}
return data->cbuffer().as<float*>()[0];
};
activation_type.args.fakeQuantize.levels = layer->GetParamAsInt("levels");
activation_type.args.fakeQuantize.input_low = GetParamFromInputAsFloat(layer, 1);
activation_type.args.fakeQuantize.input_high = GetParamFromInputAsFloat(layer, 2);
activation_type.args.fakeQuantize.output_low = GetParamFromInputAsFloat(layer, 3);
activation_type.args.fakeQuantize.output_high = GetParamFromInputAsFloat(layer, 4);
} }
string actName = "unknown"; string actName = "unknown";
@ -1759,7 +1729,7 @@ void GNAGraphCompiler::PermutePrimitive(InferenceEngine::CNNLayerPtr layer) {
squeezedInputOrder[1], squeezedInputOrder[1],
inputs->getPrecision().size(), inputs->getPrecision().size(),
outputs->getPrecision().size(), outputs->getPrecision().size(),
(quantized == nullptr) ? 1.0f : quantized->_dst_quant.scale, (quantized == nullptr) ? 1.0f : quantized->_dst_quant.GetScale(),
ptr_inputs, ptr_inputs,
ptr_outputs); ptr_outputs);
} }
@ -1774,7 +1744,7 @@ void GNAGraphCompiler::PermutePrimitive(InferenceEngine::CNNLayerPtr layer) {
squeezedInputOrder[1], squeezedInputOrder[1],
inputs->getPrecision().size(), inputs->getPrecision().size(),
outputs->getPrecision().size(), outputs->getPrecision().size(),
quantized == nullptr ? 1 : quantized->_dst_quant.scale, quantized == nullptr ? 1 : quantized->_dst_quant.GetScale(),
ptr_inputs, ptr_inputs,
ptr_outputs); ptr_outputs);
} }


@ -37,6 +37,7 @@
#include "memory/gna_memory_state.hpp" #include "memory/gna_memory_state.hpp"
#include "gna_model_serial.hpp" #include "gna_model_serial.hpp"
#include "runtime/gna_float_runtime.hpp" #include "runtime/gna_float_runtime.hpp"
#include <layers/gna_fake_quantize_layer.hpp>
#include <generic_ie.hpp> #include <generic_ie.hpp>
#include <ngraph/pass/manager.hpp> #include <ngraph/pass/manager.hpp>
@ -351,6 +352,87 @@ void GNAPlugin::InitGNADevice() {
graphCompiler.setGNAMemoryPtr(gnamem); graphCompiler.setGNAMemoryPtr(gnamem);
} }
void GNAPlugin::UpdateGnaQuantModeFromNetwork(InferenceEngine::ICNNNetwork & network) {
// fp32 emulation mode doesn't need any modifications to the configuration
if (config.gnaFlags.sw_fp32) return;
// search for FQ layers
// only supports cases of int16 or int8
auto it = details::CNNNetworkIterator(&network);
auto end = details::CNNNetworkIterator();
for (; it != end; it++) {
if (!LayerInfo(*it).isFakeQuantize()) {
continue;
}
GNAFakeQuantizeLayer fqLayer(*it);
auto inputLayer = fqLayer.getInputLayer();
// this fake quantize represents data quantization - not weights
if (!LayerInfo(inputLayer).isConst()) {
continue;
}
// also in mixed mode i8 should be stated as target precision
if (fqLayer.getLevels() <= std::numeric_limits<uint8_t>::max()) {
config.gnaPrecision = InferenceEngine::Precision::I8;
} else if (fqLayer.getLevels() <= std::numeric_limits<uint16_t>::max()) {
config.gnaPrecision = InferenceEngine::Precision::I16;
} else {
THROW_GNA_LAYER_EXCEPTION(*it)
<< "unsupported quantisation scheme: number of levels is " << fqLayer.getLevels() << " while only up to "
<< std::numeric_limits<uint16_t>::max() << " is supported";
}
gnaFlags->fake_quantized = true;
config.gnaFlags.fake_quantized = true;
}
}
void GNAPlugin::UpdateInputScaleFromNetwork(InferenceEngine::ICNNNetwork & network) {
// fp32 emulation mode doesn't need any modifications to the configuration
if (config.gnaFlags.sw_fp32) return;
// search for FQ layers
// only supports cases of int16 or int8
InputsDataMap inputs;
network.getInputsInfo(inputs);
for (auto && input : inputs) {
auto data = input.second->getInputData();
size_t inputIdx = 0;
for (auto && nextToInputLayer : getInputTo(data)) {
if (!LayerInfo(nextToInputLayer.second).isFakeQuantize()) {
inputIdx++;
continue;
}
// replacing scale factor from this fq layer
GNAFakeQuantizeLayer fqLayer(nextToInputLayer.second);
auto inputRange = fqLayer.getInputRange();
auto outputRange = fqLayer.getOutputRange();
if (inputRange.first.size() != 1 || inputRange.second.size() != 1 ||
    outputRange.first.size() != 1 || outputRange.second.size() != 1) {
    THROW_GNA_LAYER_EXCEPTION(nextToInputLayer.second)
        << "unsupported per-channel quantization for input layer: " << input.second->name();
}
float scaleInput = (fqLayer.getLevels() - 1) / (inputRange.second[0] - inputRange.first[0]);
if (!config.inputScaleFactors.empty()) {
gnalog() << "Scale factor calculated during model quantization (" << scaleInput
<< ") will be used instead of user input (" << inputsDesc->inputScaleFactors[inputIdx] << ").\n";
if (inputsDesc->inputScaleFactors[inputIdx] < scaleInput) {
gnawarn() << "WARNING: Scale factor calculated based on input values (" << inputsDesc->inputScaleFactors[inputIdx]
<< ") is smaller than scale factor used to quantize model (" << scaleInput << "). "
<< "Input values will be clamped.\n";
}
}
config.inputScaleFactors[inputIdx] = scaleInput;
inputsDesc->inputScaleFactors[inputIdx] = scaleInput;
inputIdx++;
}
}
}
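A worked example of the scale-factor formula used in UpdateInputScaleFromNetwork above; the numbers are assumed (levels = 256, input range [-1.0, 1.0]) and are only for illustration.

    // Worked example of scale = (levels - 1) / (max - min); all values are assumed.
    inline float exampleInputScale() {
        const float levels = 256.0f;
        const float inputLow = -1.0f;
        const float inputHigh = 1.0f;
        return (levels - 1) / (inputHigh - inputLow);  // 255 / 2 = 127.5
    }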
void GNAPlugin::LoadNetwork(ICNNNetwork & _network) { void GNAPlugin::LoadNetwork(ICNNNetwork & _network) {
std::shared_ptr<InferenceEngine::details::CNNNetworkImpl> convertedNetwork; std::shared_ptr<InferenceEngine::details::CNNNetworkImpl> convertedNetwork;
if (_network.getFunction()) { if (_network.getFunction()) {
@ -390,6 +472,10 @@ void GNAPlugin::LoadNetwork(ICNNNetwork & _network) {
THROW_GNA_EXCEPTION << error.c_str(); THROW_GNA_EXCEPTION << error.c_str();
} }
// FQ networks now replace certain flags in the plugin - these flags will be overwritten
UpdateGnaQuantModeFromNetwork(network);
UpdateInputScaleFromNetwork(network);
// network optimisation phases // network optimisation phases
int passIdx = 0; int passIdx = 0;
auto run_passes = [&] (const CNNNetPtr& network, bool runBeforeCopy) { auto run_passes = [&] (const CNNNetPtr& network, bool runBeforeCopy) {
@ -401,6 +487,10 @@ void GNAPlugin::LoadNetwork(ICNNNetwork & _network) {
passes->registerPass<UnrollLSTMCellPass>(); passes->registerPass<UnrollLSTMCellPass>();
passes->registerPass<RemoveSingleInputConcatPass>(); passes->registerPass<RemoveSingleInputConcatPass>();
// fake quantisation aware passes
passes->registerPass<FuseFQIntoWeightsPass>();
passes->registerPass<MoveFakeQuantizeLayerIntoQuantParamsPass>();
passes->registerPass<SubstitutePReluPass>(); passes->registerPass<SubstitutePReluPass>();
passes->registerPass<SubstituteSoftSignPass>(); passes->registerPass<SubstituteSoftSignPass>();
@ -441,6 +531,19 @@ void GNAPlugin::LoadNetwork(ICNNNetwork & _network) {
// to run all passes need to have two calls to pass manager // to run all passes need to have two calls to pass manager
run_passes(newNet, true); run_passes(newNet, true);
run_passes(newNet, false); run_passes(newNet, false);
} else if (gnaFlags->fake_quantized) {
switch (config.gnaPrecision) {
case Precision::I16:
ModelQuantizer<FakeQuantI16> q16;
newNet = q16.quantize(network, run_passes, inputsDesc->inputScaleFactors);
break;
case Precision::I8:
ModelQuantizer<FakeQuantI8> q8;
newNet = q8.quantize(network, run_passes, inputsDesc->inputScaleFactors);
break;
default:
THROW_GNA_EXCEPTION << "unsupported GNA precision for quantisation: " << config.gnaPrecision;
}
} else { } else {
switch (config.gnaPrecision) { switch (config.gnaPrecision) {
case Precision::I16: case Precision::I16:
@ -452,8 +555,7 @@ void GNAPlugin::LoadNetwork(ICNNNetwork & _network) {
newNet = q8.quantize(network, run_passes, inputsDesc->inputScaleFactors); newNet = q8.quantize(network, run_passes, inputsDesc->inputScaleFactors);
break; break;
default: default:
THROW_GNA_EXCEPTION << "no mans land for GNA precision"; THROW_GNA_EXCEPTION << "unsupported GNA precision for quantisation: " << config.gnaPrecision;
break;
} }
} }
@ -470,7 +572,7 @@ void GNAPlugin::LoadNetwork(ICNNNetwork & _network) {
return; return;
} }
printed_properties.emplace_back( printed_properties.emplace_back(
"scale factor", std::to_string(quantized->_dst_quant.scale)); "scale factor", std::to_string(quantized->_dst_quant.GetScale()));
}); });
#endif #endif
@ -564,7 +666,7 @@ void GNAPlugin::LoadNetwork(ICNNNetwork & _network) {
desc.ptrs.resize(gnaFlags->gna_lib_async_threads_num); desc.ptrs.resize(gnaFlags->gna_lib_async_threads_num);
desc.orientation = component.orientation_out; desc.orientation = component.orientation_out;
desc.num_bytes_per_element = component.num_bytes_per_output; desc.num_bytes_per_element = component.num_bytes_per_output;
desc.scale_factor = quantized != nullptr ? quantized->_dst_quant.scale : 1.0f; desc.scale_factor = quantized != nullptr ? quantized->_dst_quant.GetScale() : 1.0f;
// TODO: this need to be fixed // TODO: this need to be fixed
desc.num_elements = component.num_rows_out; desc.num_elements = component.num_rows_out;
@ -623,7 +725,7 @@ void GNAPlugin::LoadNetwork(ICNNNetwork & _network) {
// TODO: what is orientation for concat // TODO: what is orientation for concat
desc.orientation = kDnnInterleavedOrientation; desc.orientation = kDnnInterleavedOrientation;
desc.num_bytes_per_element = layer->outData.front()->getPrecision().size(); desc.num_bytes_per_element = layer->outData.front()->getPrecision().size();
desc.scale_factor = quantized != nullptr ? quantized->_dst_quant.scale : 1.0f; desc.scale_factor = quantized != nullptr ? quantized->_dst_quant.GetScale() : 1.0f;
desc.num_elements = concatConnection->second.reserved_size / desc.num_bytes_per_element; desc.num_elements = concatConnection->second.reserved_size / desc.num_bytes_per_element;
// binding ptr for first infer request - then others will be setup during relocation // binding ptr for first infer request - then others will be setup during relocation


@ -219,6 +219,8 @@ class GNAPlugin : public InferenceEngine::IInferencePlugin {
int idx = 0); int idx = 0);
void UpdateFieldsFromConfig(); void UpdateFieldsFromConfig();
void UpdateGnaQuantModeFromNetwork(InferenceEngine::ICNNNetwork &);
void UpdateInputScaleFromNetwork(InferenceEngine::ICNNNetwork &);
}; };
} // namespace GNAPluginNS } // namespace GNAPluginNS


@ -72,5 +72,5 @@ if (!(expr)) { \
} }
#define THROW_GNA_EXCEPTION THROW_IE_EXCEPTION << "[GNAPlugin] in function " << __PRETTY_FUNCTION__<< ": " #define THROW_GNA_EXCEPTION THROW_IE_EXCEPTION << "[GNAPlugin] in function " << __PRETTY_FUNCTION__<< ": "
#define THROW_GNA_LAYER_EXCEPTION(layer) THROW_GNA_EXCEPTION << LAYER_NAME(layer) #define THROW_GNA_LAYER_EXCEPTION(layer) THROW_GNA_EXCEPTION << LAYER_NAME(layer)
#define LAYER_NAME(layer) layer->type << " layer : \"" << layer->name << "\" " #define LAYER_NAME(layer) (layer)->type << " layer : \"" << (layer)->name << "\" "


@ -0,0 +1,164 @@
// Copyright (C) 2018-2020 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#pragma once
#include "gna_layer_info.hpp"
#include "gna_plugin_log.hpp"
#include "gna_layer_helpers.hpp"
#include "frontend/weights_converter.hpp"
#include <ie_algorithm.hpp>
namespace GNAPluginNS {
class GNAFakeQuantizeLayer {
InferenceEngine::CNNLayerPtr fqLayer;
public :
GNAFakeQuantizeLayer(InferenceEngine::CNNLayerPtr fqLayer)
: fqLayer(fqLayer) {
if (!LayerInfo(fqLayer).isFakeQuantize()) {
THROW_GNA_LAYER_EXCEPTION(fqLayer) << "cannot parse as fake quantize";
}
}
/**
* @brief convert FQ layer directly to gna-pwl activation layer
*/
DnnActivation parseAsActivation() const {
DnnActivation fqActivation;
fqActivation.args.fakeQuantize.levels = fqLayer->GetParamAsInt("levels");
auto inputShape = getShapeForRange(fqLayer, 1);
auto outputShape = getShapeForRange(fqLayer, 3);
// TODO: check shapes broadcasting to shape of input at 0
auto inputRangeSize = InferenceEngine::details::product(inputShape.begin(), inputShape.end());
auto outputRangeSize = InferenceEngine::details::product(outputShape.begin(), outputShape.end());
fqActivation.args.fakeQuantize.inputPerChannel = inputRangeSize != 1;
fqActivation.args.fakeQuantize.input_low = getParamFromInputAsFloats(fqLayer, 1);
fqActivation.args.fakeQuantize.input_high = getParamFromInputAsFloats(fqLayer, 2);
fqActivation.args.fakeQuantize.outputPerChannel = outputRangeSize != 1;
fqActivation.args.fakeQuantize.output_low = getParamFromInputAsFloats(fqLayer, 3);
fqActivation.args.fakeQuantize.output_high = getParamFromInputAsFloats(fqLayer, 4);
fqActivation.type = kActFakeQuantize;
return fqActivation;
}
/**
* retrieves the input blob of the FQ layer that is connected to a const layer
*/
InferenceEngine::Blob::Ptr getConstInputData() const {
return LayerUtils::getParamFromInputAsBlob(fqLayer, 0);
}
/**
* fake quantize has 5 inputs; 4 of them are always constant layers, and 1 might be a tensor connection
*/
InferenceEngine::CNNLayerPtr getInputLayer() const {
return getInputLayerAt(fqLayer, 0);
}
int32_t getLevels() {
return fqLayer->GetParamAsInt("levels");
}
std::pair<std::vector<float>, std::vector<float>> getInputRange() {
return getRange(fqLayer, 1);
}
std::pair<std::vector<float>, std::vector<float>> getOutputRange() {
return getRange(fqLayer, 3);
}
operator InferenceEngine::CNNLayerPtr () const {
return fqLayer;
}
InferenceEngine::CNNLayerPtr operator -> () const {
return fqLayer;
}
InferenceEngine::CNNLayerPtr operator * () const {
return fqLayer;
}
protected :
static std::pair<std::vector<float>, std::vector<float>> getRange(InferenceEngine::CNNLayerPtr input, size_t idx) {
auto shape = getShapeForRange(input, idx);
auto rangeSize = InferenceEngine::details::product(shape.begin(), shape.end());
auto dataMin = LayerUtils::getParamFromInputAsBlob(input, idx);
auto dataMax = LayerUtils::getParamFromInputAsBlob(input, idx + 1);
std::vector<float> minValues(rangeSize), maxValues(rangeSize);
switch (dataMin->getTensorDesc().getPrecision()) {
case InferenceEngine::Precision::FP32: {
memcpy(&minValues[0], dataMin->buffer().as<float*>(), rangeSize * sizeof(float));
memcpy(&maxValues[0], dataMax->buffer().as<float*>(), rangeSize * sizeof(float));
break;
}
case InferenceEngine::Precision::FP16: {
auto dataMinFP32 = make_fp32_blob(dataMin);
memcpy(&minValues[0], dataMinFP32->buffer().as<float*>(), rangeSize * sizeof(float));
auto dataMaxFP32 = make_fp32_blob(dataMax);
memcpy(&maxValues[0], dataMaxFP32->buffer().as<float*>(), rangeSize * sizeof(float));
break;
}
default:
THROW_GNA_LAYER_EXCEPTION(input) << "cannot cast custom blob to type FP32, since it is of type: "
<< dataMin->getTensorDesc().getPrecision();
break;
}
return {minValues, maxValues};
}
static float* getParamFromInputAsFloats(InferenceEngine::CNNLayerPtr input, size_t idx) {
auto data = LayerUtils::getParamFromInputAsBlob(input, idx);
if (data->getTensorDesc().getPrecision() != InferenceEngine::Precision::FP32) {
THROW_GNA_LAYER_EXCEPTION(input) << "cannot cast custom blob to type FP32, since it is of type: "
<< data->getTensorDesc().getPrecision();
}
return data->buffer().as<float*>();
}
static InferenceEngine::SizeVector getShapeFromInput(InferenceEngine::CNNLayerPtr input, size_t idx) {
auto data = LayerUtils::getParamFromInputAsBlob(input, idx);
return data->getTensorDesc().getDims();
}
static InferenceEngine::CNNLayerPtr getInputLayerAt(InferenceEngine::CNNLayerPtr input, size_t idx) {
if (input->insData.size() <= idx) {
THROW_GNA_LAYER_EXCEPTION(input) << "cannot get data from " << idx << "input";
}
auto iLayerData = input->insData[idx].lock();
if (!iLayerData) {
THROW_GNA_LAYER_EXCEPTION(input) << "cannot get data from " << idx
<< ", input: cannot dereference data weak-pointer";
}
auto iLayer = getCreatorLayer(iLayerData).lock();
if (!iLayer) {
THROW_GNA_LAYER_EXCEPTION(input) << "cannot get data from " << idx
<< ", input: cannot dereference creator layer weak-pointer";
}
return iLayer;
}
static InferenceEngine::SizeVector getShapeForRange(InferenceEngine::CNNLayerPtr input, size_t idx) {
auto lowShape = getShapeFromInput(input, idx);
auto highShape = getShapeFromInput(input, idx + 1);
if (lowShape.size() != highShape.size()) {
THROW_GNA_LAYER_EXCEPTION(input) << "shapes mismatch for " << idx << " and " << idx + 1 << " inputs";
}
for (size_t i = 0; i != lowShape.size(); i++) {
if (lowShape[i] != highShape[i]) {
THROW_GNA_LAYER_EXCEPTION(input) << "shapes mismatch for " << idx << " and " << idx + 1 << " inputs";
}
}
return lowShape;
}
};
} // namespace GNAPluginNS
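A short usage sketch of the helper above; the wrapper function and the `layer` argument are hypothetical, and only the API defined in this header is used.

    // Hypothetical usage, assuming the plugin include paths are set up:
    #include <layers/gna_fake_quantize_layer.hpp>

    void dumpFqInfo(InferenceEngine::CNNLayerPtr layer) {
        GNAPluginNS::GNAFakeQuantizeLayer fq(layer);     // throws if `layer` is not a FakeQuantize
        auto activation = fq.parseAsActivation();        // DnnActivation of type kActFakeQuantize
        auto levels     = fq.getLevels();                // e.g. 255 for int8-style quantization
        auto inRange    = fq.getInputRange();            // {min values, max values} from inputs 1/2
        auto outRange   = fq.getOutputRange();           // {min values, max values} from inputs 3/4
        (void)activation; (void)levels; (void)inRange; (void)outRange;
    }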


@ -0,0 +1,44 @@
// Copyright (C) 2018-2020 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#pragma once
#include "gna_layer_info.hpp"
#include "gna_plugin_log.hpp"
namespace GNAPluginNS {
namespace LayerUtils {
/**
* @brief retrieves a blob from the const layer connected to a given layer
* @param input
* @param idx
*/
inline InferenceEngine::Blob::Ptr getParamFromInputAsBlob(InferenceEngine::CNNLayerPtr input, size_t idx) {
if (input->insData.size() <= idx) {
THROW_GNA_LAYER_EXCEPTION(input) << "cannot get data from " << idx << "input";
}
auto iLayerData = input->insData[idx].lock();
if (!iLayerData) {
THROW_GNA_LAYER_EXCEPTION(input) << "cannot get data from " << idx
<< ", input: cannot dereference data weak-pointer";
}
auto iLayer = getCreatorLayer(iLayerData).lock();
if (!iLayer) {
THROW_GNA_LAYER_EXCEPTION(input) << "cannot get data from " << idx
<< ", input: cannot dereference creator layer weak-pointer";
}
if (!LayerInfo(iLayer).isConst()) {
THROW_GNA_LAYER_EXCEPTION(input) << "cannot get data from " << idx
<< ", input: expected to be of type const, but was: " << iLayer->type;
}
if (!iLayer->blobs.count("custom")) {
THROW_GNA_LAYER_EXCEPTION(iLayer) << "cannot get custom blob";
}
return iLayer->blobs["custom"];
}
} // namespace LayerUtils
} // namespace GNAPluginNS
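A minimal usage sketch of this helper; the wrapper is hypothetical and mirrors how the fusion pass later fetches biases from the third input of a weightable layer.

    // Hypothetical call site: fetch the bias blob attached as the third input of a weightable layer.
    InferenceEngine::Blob::Ptr getBiasBlob(InferenceEngine::CNNLayerPtr weightableLayer) {
        return GNAPluginNS::LayerUtils::getParamFromInputAsBlob(weightableLayer, 2);
    }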


@ -205,8 +205,8 @@ class LayerInfo {
bool isConcat() const noexcept { bool isConcat() const noexcept {
return isOfType("concat"); return isOfType("concat");
} }
bool isFakeQnatize() const noexcept { bool isFakeQuantize() const noexcept {
return isOfType("FakeQnatize"); return isOfType("FakeQuantize");
} }
bool isNonFunctional() const noexcept { bool isNonFunctional() const noexcept {
return isOfType("reshape") || isOfType("squeeze") || isOfType("unsqueeze") || isTrivialPermute(); return isOfType("reshape") || isOfType("squeeze") || isOfType("unsqueeze") || isTrivialPermute();


@ -71,7 +71,7 @@ namespace memory {
case InferenceEngine::Precision::I16: { case InferenceEngine::Precision::I16: {
if (new_state_precision == InferenceEngine::Precision::FP32) { if (new_state_precision == InferenceEngine::Precision::FP32) {
auto quantized = InferenceEngine::getInjectedData<QuantizedLayerParams>(state->getInput()); auto quantized = InferenceEngine::getInjectedData<QuantizedLayerParams>(state->getInput());
auto scale_factor = quantized != nullptr ? quantized->_dst_quant.scale : 1.0f; auto scale_factor = quantized != nullptr ? quantized->_dst_quant.GetScale() : 1.0f;
GNAPluginNS::ConvertToInt16(static_cast<int16_t*>(state->gna_ptr), GNAPluginNS::ConvertToInt16(static_cast<int16_t*>(state->gna_ptr),
newState->buffer().as<float*>(), newState->buffer().as<float*>(),
1, 1,
@ -97,7 +97,7 @@ namespace memory {
if (state->getInput() && state_precision == InferenceEngine::Precision::I16) { if (state->getInput() && state_precision == InferenceEngine::Precision::I16) {
auto quantized = InferenceEngine::getInjectedData<QuantizedLayerParams>(state->getInput()); auto quantized = InferenceEngine::getInjectedData<QuantizedLayerParams>(state->getInput());
auto scale_factor = quantized != nullptr ? quantized->_dst_quant.scale : 1.0f; auto scale_factor = quantized != nullptr ? quantized->_dst_quant.GetScale() : 1.0f;
auto result_blob = make_blob_with_precision(InferenceEngine::TensorDesc(InferenceEngine::Precision::FP32, auto result_blob = make_blob_with_precision(InferenceEngine::TensorDesc(InferenceEngine::Precision::FP32,
InferenceEngine::SizeVector({ 1, elements }), InferenceEngine::SizeVector({ 1, elements }),


@ -25,21 +25,25 @@
#include <legacy/net_pass.h> #include <legacy/net_pass.h>
#include <layers/gna_copy_layer.hpp> #include <layers/gna_copy_layer.hpp>
#include "backend/dnn_types.h"
#include "gna_plugin_log.hpp" #include "gna_plugin_log.hpp"
#include "frontend/quantization.h" #include "frontend/quantization.h"
#include "frontend/quantized_layer_params.hpp" #include "frontend/quantized_layer_params.hpp"
#include <layers/gna_copy_layer.hpp> #include <layers/gna_copy_layer.hpp>
#include <layers/gna_fake_quantize_layer.hpp>
#include <runtime/pwl.h>
#include "gna_graph_tools.hpp" #include "gna_graph_tools.hpp"
#include "gna_pass_manager.hpp" #include "gna_pass_manager.hpp"
#include "layers/gna_layer_info.hpp" #include "layers/gna_layer_info.hpp"
#include "gna_upstream_iterator.hpp" #include "gna_upstream_iterator.hpp"
#include "frontend/quantization.h"
using namespace InferenceEngine; using namespace InferenceEngine;
using namespace InferenceEngine::details; using namespace InferenceEngine::details;
using namespace GNAPluginNS; using namespace GNAPluginNS;
#define pass_trace() gnalog() << "[" << getName() << "]" #define pass_trace() gnalog() << "[" << getName() << "] "
std::shared_ptr<IPassManager> BasePass::getPassManager() { std::shared_ptr<IPassManager> BasePass::getPassManager() {
auto sharedMgr = mgr.lock(); auto sharedMgr = mgr.lock();
@ -1672,6 +1676,232 @@ void FuseMultipleIdentitiesPass::run() {
} }
} }
void FuseFQIntoWeightsPass::run() {
auto isNonFunctional = [](CNNLayerPtr ptr) {
return LayerInfo(ptr).isNonFunctional();
};
auto assignWeightsAndBiases = [](CNNLayerPtr layer, Blob::Ptr weights, Blob::Ptr biases) {
auto weightableLayer = std::dynamic_pointer_cast<WeightableLayer>(layer);
if (nullptr == weightableLayer) {
    THROW_GNA_LAYER_EXCEPTION(layer) << " not a weightable layer";
}
weightableLayer->_weights = weights;
weightableLayer->_biases = biases;
weightableLayer->blobs["weights"] = weights;
weightableLayer->blobs["biases"] = biases;
};
for (auto &l : *pLayers) {
if (!LayerInfo(l).isFakeQuantize()) {
continue;
}
// determine whether this FQ actually feeds into a weightable layer
auto fqLayer = l;
if (!CNNNetHasNextLayerSkipCertain(fqLayer, 0, 0, isNonFunctional)) {
continue;
}
auto weightableLayer = CNNNetGetNextLayerSkipCertain(fqLayer, 0, 0, isNonFunctional).first;
if (!LayerInfo(weightableLayer).isWeightable()) {
continue;
}
if (weightableLayer->insData.size() != 3) {
continue;
}
// check whether this FQ represents weights - it needs to be at index 1 of the weightable layer
auto prevLayerAt1 = CNNNetPrevLayerSkipCertain(weightableLayer, 1, isNonFunctional);
if (prevLayerAt1 != fqLayer) {
continue;
}
// now this FQ layer represents weights - let's apply it and fuse it into the given weightable layer.
pass_trace() << "found " << LAYER_NAME(fqLayer) << " that will be converted to weights of "
<< LAYER_NAME(weightableLayer) << "\n";
GNAFakeQuantizeLayer gnaFakeQuantizeLayer(fqLayer);
auto biases = LayerUtils::getParamFromInputAsBlob(weightableLayer, 2);
auto quantizedWeights = gnaFakeQuantizeLayer.getConstInputData();
// 1. break existing connections by detaching the FQ subgraph from the rest of the graph
auto prevData = weightableLayer->insData[1].lock();
auto prevLayer = getCreatorLayer(prevData).lock();
auto weightDims = prevLayer->outData.front()->getDims();
prevLayer->outData.clear();
weightableLayer->insData.resize(1);
// 2. running FQ function for given layer
if (weightDims.size() != 2) {
THROW_GNA_LAYER_EXCEPTION(fqLayer) << " layout of weigths not equal to NC not yet supported";
}
auto outputSize = details::product(weightDims.begin(), weightDims.end());
// depending on the compute precision, weights will be recreated;
// for integer mode weights might simply be copied to avoid further quantisation overhead
auto quantized = InferenceEngine::getInjectedData<QuantizedLayerParams>(weightableLayer);
if (quantized) {
// assign already quantized Weights
assignWeightsAndBiases(weightableLayer, quantizedWeights, biases);
// modify scale factors for quantized component
auto levels = gnaFakeQuantizeLayer.getLevels();
auto inputRange = gnaFakeQuantizeLayer.getInputRange();
auto outputRange = gnaFakeQuantizeLayer.getOutputRange();
if (outputRange.first.size() != outputRange.second.size()) {
THROW_GNA_LAYER_EXCEPTION(fqLayer) << " number of min and max data must be equal, min size: "
<< outputRange.first.size() << ", max size: " << outputRange.second.size();
}
if (inputRange.first.size() != outputRange.first.size() ||
inputRange.second.size() != outputRange.second.size()) {
THROW_GNA_LAYER_EXCEPTION(fqLayer) << " size of input and output range differs. "
<< "input min size: " << inputRange.first.size() << ", "
<< "output min size: " << outputRange.first.size() << ", "
<< "input max size: " << inputRange.second.size() << ", "
<< "output max size: " << outputRange.second.size();
}
if (levels > std::numeric_limits<uint8_t>::max() && outputRange.first.size() > 1) {
    THROW_GNA_LAYER_EXCEPTION(fqLayer) << " unsupported per-channel quantization for int16 weights."
        << " Per-channel quantization is only supported for int8 weights.";
}
// check if
// - weights were float values and need to be quantized,
// - weights are integer values and quantization can be skipped
for (size_t i = 0; i < outputRange.first.size(); ++i) {
if (inputRange.first[i] > outputRange.first[i] ||
inputRange.second[i] > outputRange.second[i]) {
quantized->_weights_quantized = true;
break;
}
}
quantized->_weights_quant.SetMinValues(outputRange.first);
quantized->_weights_quant.SetMaxValues(outputRange.second);
quantized->_weights_quant.SetLevels(levels);
// let's find the minimum scale factor among channels
if (quantized->_weights_quant.GetMinValues().empty()) {
    THROW_GNA_LAYER_EXCEPTION(fqLayer) << " per-channel/per-tensor weights scales are missing";
}
auto getScale = [&quantized](size_t i) {
return (quantized->_weights_quant.GetLevels() - 1) /
(quantized->_weights_quant.GetMaxValues()[i] - quantized->_weights_quant.GetMinValues()[i]);
};
float min_channel_scale = getScale(0);
for (uint32_t i = 1; i < quantized->_weights_quant.GetMinValues().size(); i++) {
min_channel_scale = std::min(min_channel_scale, getScale(i));
}
auto multiplier = 1.0f;
if (quantized->_weights_quant.GetLevels() <= std::numeric_limits<uint8_t>::max()) {
// GNA supports an additional multiplier only for 8-bit weights.
// The multiplier is used to extend the dynamic range.
multiplier = MAX_OUT_MULTIPLIER;
}
// Common weights scale calculation
quantized->_weights_quant.SetScale(min_channel_scale * multiplier);
continue;
}
intel_dnn_component_t component;
component.num_columns_in = weightDims[1];
component.num_rows_in = weightDims[0];
intel_piecewiselinear_t *transform = reinterpret_cast<intel_piecewiselinear_t *>(&component.op.pwl);
transform->func_id = gnaFakeQuantizeLayer.parseAsActivation();
auto quantizedWeightsData = quantizedWeights->buffer();
component.ptr_inputs = quantizedWeightsData.as<float*>();
auto dequantizedWeights = make_shared_blob<float>(TensorDesc(Precision::FP32, {outputSize}, Layout::C));
dequantizedWeights->allocate();
auto resultBuffer = dequantizedWeights->buffer();
component.ptr_outputs = resultBuffer.as<float*>();
PwlApply32(&component, 0, component.num_rows_in - 1, 0, component.num_columns_in - 1);
// 3. assign dequantized const blob to weightable layer
assignWeightsAndBiases(weightableLayer, dequantizedWeights, biases);
}
}
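For reference, a standalone sketch of the per-channel weight scale selection performed in the pass above: the smallest per-channel scale is chosen so no channel saturates, then the dynamic range is extended for int8 weights. The multiplier value below is an assumed placeholder for MAX_OUT_MULTIPLIER.

    #include <algorithm>
    #include <cstddef>
    #include <vector>

    // Illustrative only: common weights scale from per-channel min/max ranges.
    inline float commonWeightsScale(const std::vector<float>& minVals,
                                    const std::vector<float>& maxVals,
                                    int levels) {
        float minChannelScale = (levels - 1) / (maxVals[0] - minVals[0]);
        for (size_t i = 1; i < minVals.size(); ++i) {
            minChannelScale = std::min(minChannelScale, (levels - 1) / (maxVals[i] - minVals[i]));
        }
        const float multiplier = (levels <= 255) ? 16.0f : 1.0f;  // assumed MAX_OUT_MULTIPLIER for int8
        return minChannelScale * multiplier;
    }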
void MoveFakeQuantizeLayerIntoQuantParamsPass :: run() {
auto quantized = InferenceEngine::getInjectedData<QuantizedLayerParams>(pLayers->front());
if (!quantized) {
return;
}
auto donotSkip = [](CNNLayerPtr) {
return false;
};
for (auto &&l : *pLayers) {
if (!LayerInfo(l).isFakeQuantize()) {
continue;
}
GNAFakeQuantizeLayer fqLayer(l);
auto prevLayer = CNNNetPrevLayerSkipCertain(*fqLayer, 0, donotSkip);
if (prevLayer->outData.size() != 1) {
THROW_GNA_LAYER_EXCEPTION(prevLayer) << " fake quantize input layer with more than one output is not supported";
}
auto inputRange = fqLayer.getInputRange();
auto outputRange = fqLayer.getOutputRange();
if (inputRange.first.size() != 1 || inputRange.second.size() != 1 ||
    outputRange.first.size() != 1 || outputRange.second.size() != 1) {
THROW_GNA_LAYER_EXCEPTION(fqLayer) << " unsupported per-channel quantisation";
}
float fqLevels = fqLayer.getLevels();
float scaleInput = (fqLevels - 1) / (inputRange.second[0] - inputRange.first[0]);
float scaleOutputs = (fqLevels - 1) / (outputRange.second[0] - outputRange.first[0]);
// Before FQ layer is removed, the previous layer has to be updated with its quantization data
auto quantParamsPrevLayer = InferenceEngine::getInjectedData<QuantizedLayerParams>(prevLayer);
quantParamsPrevLayer->_dst_quant.SetScale(scaleOutputs);
quantParamsPrevLayer->_dst_quant.SetLevels(fqLevels);
quantParamsPrevLayer->_dst_quant.SetMinValues({ inputRange.first[0] });
quantParamsPrevLayer->_dst_quant.SetMaxValues({ inputRange.second[0] });
auto prevData = prevLayer->outData.front();
getInputTo(prevLayer->outData.front()).clear();
// Find all output layers connected to FQ
auto nextLayers = CNNNetGetAllNextLayersSkipCertain(*fqLayer, -1, donotSkip);
if (nextLayers.empty()) {
THROW_GNA_LAYER_EXCEPTION(fqLayer) << " fake quantize does not have any output layers connected";
}
// Connect all next layers after FQ to the layer that is before FQ
// and propagate quantization data
for (size_t i = 0; i < nextLayers.size(); ++i) {
auto insDatas = CNNLayerFindInsDataIdxes(fqLayer->outData.front(), nextLayers[i]);
if (insDatas.size() != 1) {
THROW_GNA_LAYER_EXCEPTION(fqLayer) << " fake quantize connection to layer: "
<< LAYER_NAME(nextLayers[i]) << " is not correct";
}
nextLayers[i]->insData[insDatas.front()] = prevData;
getInputTo(prevLayer->outData.front())[nextLayers[i]->name] = nextLayers[i];
// After the layer gets removed, let's absorb its params into the QuantParams structure
// replacing scale factor from this fq layer
auto quantParamsNextLayer = InferenceEngine::getInjectedData<QuantizedLayerParams>(nextLayers[i]);
quantParamsNextLayer->_src_quant.SetScale(scaleOutputs);
quantParamsNextLayer->_src_quant.SetLevels(fqLevels);
quantParamsNextLayer->_src_quant.SetMinValues({ outputRange.first[0] });
quantParamsNextLayer->_src_quant.SetMaxValues({ outputRange.second[0] });
}
}
}
int PassManager::run(int index) { int PassManager::run(int index) {
#ifdef PLOT #ifdef PLOT
auto dumpNetworkAfterPass = [&index, this] (std::shared_ptr<Pass> pass) { auto dumpNetworkAfterPass = [&index, this] (std::shared_ptr<Pass> pass) {


@ -199,6 +199,17 @@ DECL_PASS(FuseMultipleIdentities);
*/ */
DECL_PASS(BroadcastConst); DECL_PASS(BroadcastConst);
/**
* @brief runs static quantisation on the given floating-point weights and replaces FakeQuantize with const blobs
*/
DECL_PASS(FuseFQIntoWeights);
/**
* @brief removes all fake quantize layers while moving their settings into QuantParams of the affected layers
*/
DECL_PASS(MoveFakeQuantizeLayerIntoQuantParams);
struct PassManagerSettings { struct PassManagerSettings {
Policy policy; Policy policy;
/// @brief whether to run passes before copy /// @brief whether to run passes before copy


@ -1047,25 +1047,32 @@ void PwlApply32(intel_dnn_component_t *component,
} }
break; break;
case kActFakeQuantize: { case kActFakeQuantize: {
auto input_low = transform->func_id.args.fakeQuantize.input_low;
auto input_high = transform->func_id.args.fakeQuantize.input_high;
auto output_low = transform->func_id.args.fakeQuantize.output_low;
auto output_high = transform->func_id.args.fakeQuantize.output_high;
auto levels = transform->func_id.args.fakeQuantize.levels; auto levels = transform->func_id.args.fakeQuantize.levels;
// TODO: this special modification for spedup-compute give different result with straight FQ forulae
// but this used in referencen graph FakeQuantize implementations so we need to honor it for a while
float scaleInput = (input_high - input_low) / (levels-1);
float scaleOutputs = (output_high - output_low) / (levels-1);
for (uint32_t i = num_row_start; i <= num_row_end; i++) { for (uint32_t i = num_row_start; i <= num_row_end; i++) {
auto inputChannel = transform->func_id.args.fakeQuantize.inputPerChannel ? i : 0;
auto outputChannel = transform->func_id.args.fakeQuantize.outputPerChannel ? i : 0;
auto input_low = transform->func_id.args.fakeQuantize.input_low[inputChannel];
auto input_high = transform->func_id.args.fakeQuantize.input_high[inputChannel];
auto output_low = transform->func_id.args.fakeQuantize.output_low[outputChannel];
auto output_high = transform->func_id.args.fakeQuantize.output_high[outputChannel];
// TODO: this special modification for sped-up compute gives a different result than the straight FQ formula,
// but it is used in the reference FakeQuantize implementation so we need to honor it for a while
float scaleInput = (input_high - input_low) / (levels-1);
float scaleOutputs = (output_high - output_low) / (levels-1);
for (uint32_t j = num_col_start; j <= num_col_end; j++) { for (uint32_t j = num_col_start; j <= num_col_end; j++) {
auto x = ptr_in[i * num_columns + j]; auto offset = i * num_columns + j;
auto x = ptr_in[offset];
if (x < std::min(input_low, input_high)) { if (x < std::min(input_low, input_high)) {
ptr_out[i * num_columns + j] = output_low; ptr_out[offset] = output_low;
} else if (x > std::max(input_low, input_high)) { } else if (x > std::max(input_low, input_high)) {
ptr_out[i * num_columns + j] = output_high; ptr_out[offset] = output_high;
} else { } else {
ptr_out[i * num_columns + j] = nearbyint((x - input_low) / scaleInput) * scaleOutputs + output_low; ptr_out[offset] = nearbyint((x - input_low) / scaleInput) * scaleOutputs + output_low;
} }
} }
} }
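The per-element transform in the loop above follows the standard FakeQuantize formula; a minimal standalone version is sketched below, assuming per-tensor ranges (the per-channel indexing from the loop is omitted).

    #include <algorithm>
    #include <cmath>

    // Reference per-element FakeQuantize with per-tensor ranges (sketch of the loop body above).
    inline float fakeQuantize(float x, float inLow, float inHigh,
                              float outLow, float outHigh, int levels) {
        const float scaleIn  = (inHigh - inLow) / (levels - 1);
        const float scaleOut = (outHigh - outLow) / (levels - 1);
        if (x < std::min(inLow, inHigh)) return outLow;
        if (x > std::max(inLow, inHigh)) return outHigh;
        return std::nearbyint((x - inLow) / scaleIn) * scaleOut + outLow;
    }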


@ -41,17 +41,45 @@ const std::vector<std::pair<std::string, ConfigType>> gnaQuantModes = {
// {"sw_exact_i8", configInt8}, // {"sw_exact_i8", configInt8},
}; };
const std::vector<std::vector<size_t>> inputShapes = {{1, 1, 1, 1}, {3, 10, 5, 6}}; const std::vector<std::vector<size_t>> inputShapes = {
{3, 10, 5, 6},
{1, 1, 1, 1},
{1, 8, 8, 256},
{1, 2, 2, 2},
{1, 3, 4, 5},
};
const std::vector<std::vector<size_t>> constShapes = {{1}}; const std::vector<std::vector<size_t>> constShapes = {{1}};
const std::vector<size_t> levels = {16, 255, 256}; const std::vector<size_t> levels = {16, 255, 256};
const std::vector<std::vector<float>> fqArgs = {{0, 10, 2, 5}, {}}; const std::vector<std::vector<float>> fqArgs = {{}};
const std::vector<std::vector<float>> inputParams = {{-10, 10, 0.1}, {}}; const std::vector<std::vector<float>> inputParams = {{-10, 10, 0.1}, {}};
const std::vector<float> fqInputMin = {0, 1, 2, 3, 4, 5};
const std::vector<float> fqInputMax = {10, 9, 8, 7, 6};
const std::vector<float> fqOutputMin = {1, 2, 3, 4};
const std::vector<float> fqOutputMax = {8, 7, 6, 5};
std::vector<std::vector<float>> getInputOutputShapes(const std::vector<float> inputsMin,
const std::vector<float> inputsMax,
const std::vector<float> OutputsMin,
const std::vector<float> OutputsMax,
std::vector<std::vector<float>> fqArg) {
for (const auto& inputMin : inputsMin) {
for (const auto& inputMax : inputsMax) {
for (const auto& outputMin : OutputsMin) {
for (const auto& outputMax : OutputsMax) {
fqArg.push_back({inputMin, inputMax, outputMin, outputMax});
}
}
}
}
return fqArg;
}
const auto fqParams = ::testing::Combine( const auto fqParams = ::testing::Combine(
::testing::ValuesIn(levels), ::testing::ValuesIn(levels),
::testing::ValuesIn(constShapes), ::testing::ValuesIn(constShapes),
::testing::ValuesIn(fqArgs), ::testing::ValuesIn(getInputOutputShapes(fqInputMin, fqInputMax, fqOutputMin, fqOutputMax, fqArgs)),
::testing::ValuesIn(inputParams) ::testing::ValuesIn(inputParams)
); );


@ -0,0 +1,125 @@
// Copyright (C) 2020 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include <vector>
#include <gna/gna_config.hpp>
#include "subgraph_tests/two_fake_quantize_to_fullyconnected.hpp"
#include "common_test_utils/test_constants.hpp"
using namespace LayerTestsDefinitions;
namespace {
const std::vector<InferenceEngine::Precision> netPrecisions = {
InferenceEngine::Precision::FP32, InferenceEngine::Precision::FP16,
};
using ConfigType = std::map<std::string, std::string>;
const ConfigType configFP32 = {
{"GNA_DEVICE_MODE", "GNA_SW_FP32"},
};
const ConfigType configSWExact = {
{"GNA_DEVICE_MODE", "GNA_SW_EXACT"},
{"GNA_COMPACT_MODE", "NO"}
};
/**
* @brief specific quantisation mode to be used internally
*/
const std::vector<std::pair<std::string, ConfigType>> gnaQuantModes = {
{"sw_fp32", configFP32},
};
const std::vector<std::pair<std::string, ConfigType>> gnaQuantModes_I8 = {
{"gna_sw_exact", configSWExact},
};
const std::vector<std::vector<size_t>> inputShapes = {
{1, 440}
};
const std::vector<std::vector<std::vector<size_t>>> constShapes = {
{{1}, {2048, 1}}
};
const std::vector<std::vector<std::vector<size_t>>> constShapes_int16 = {
{{1}, {1}}
};
const std::vector<size_t> levels_fp = {255, 65535};
const std::vector<std::vector<size_t>> levels_i16 = {{65535, 65535}, {32767, 32767}, {16383, 16383}};
const std::vector<std::vector<size_t>> levels_i8 = {{255, 255}};
const std::vector<std::vector<float>> fqArgs = {{-2.0f, 2.0f, -2.0f, 2.0f}};
const std::vector<std::vector<float>> inputParams = {{-64, 64, 1}, {-10, 10, 0.1}};
const std::vector<std::vector<float>> inputParams_I8 = {{-2.0f, 2.0f, 0.1f}};
const std::vector<bool> biases = {false, true};
const auto fqParams = ::testing::Combine(
::testing::Values(levels_fp),
::testing::ValuesIn(constShapes),
::testing::ValuesIn(fqArgs),
::testing::ValuesIn(inputParams)
);
const auto fqParams_I8 = ::testing::Combine(
::testing::ValuesIn(levels_i8),
::testing::ValuesIn(constShapes),
::testing::ValuesIn(fqArgs),
::testing::ValuesIn(inputParams_I8)
);
const auto fqParams_I16 = ::testing::Combine(
::testing::ValuesIn(levels_i16),
::testing::ValuesIn(constShapes_int16),
::testing::ValuesIn(fqArgs),
::testing::ValuesIn(inputParams_I8)
);
INSTANTIATE_TEST_CASE_P(smoke_FakeQuantize_subgraph, FakeQuantizeSubgraphTest,
::testing::Combine(
fqParams,
::testing::ValuesIn(netPrecisions),
::testing::Values(InferenceEngine::Precision::UNSPECIFIED),
::testing::Values(InferenceEngine::Precision::UNSPECIFIED),
::testing::Values(InferenceEngine::Layout::ANY),
::testing::Values(InferenceEngine::Layout::ANY),
::testing::ValuesIn(inputShapes),
::testing::Values(CommonTestUtils::DEVICE_GNA),
::testing::ValuesIn(gnaQuantModes),
::testing::ValuesIn(biases)),
FakeQuantizeSubgraphTest::getTestCaseName);
INSTANTIATE_TEST_CASE_P(smoke_FakeQuantize_subgraph_U8, FakeQuantizeSubgraphTest,
::testing::Combine(
fqParams_I8,
::testing::ValuesIn(netPrecisions),
::testing::Values(InferenceEngine::Precision::UNSPECIFIED),
::testing::Values(InferenceEngine::Precision::UNSPECIFIED),
::testing::Values(InferenceEngine::Layout::ANY),
::testing::Values(InferenceEngine::Layout::ANY),
::testing::ValuesIn(inputShapes),
::testing::Values(CommonTestUtils::DEVICE_GNA),
::testing::ValuesIn(gnaQuantModes_I8),
::testing::ValuesIn(biases)),
FakeQuantizeSubgraphTest::getTestCaseName);
INSTANTIATE_TEST_CASE_P(smoke_FakeQuantize_subgraph_I16, FakeQuantizeSubgraphTest,
::testing::Combine(
fqParams_I16,
::testing::ValuesIn(netPrecisions),
::testing::Values(InferenceEngine::Precision::UNSPECIFIED),
::testing::Values(InferenceEngine::Precision::UNSPECIFIED),
::testing::Values(InferenceEngine::Layout::ANY),
::testing::Values(InferenceEngine::Layout::ANY),
::testing::ValuesIn(inputShapes),
::testing::Values(CommonTestUtils::DEVICE_GNA),
::testing::ValuesIn(gnaQuantModes_I8),
::testing::ValuesIn(biases)),
FakeQuantizeSubgraphTest::getTestCaseName);
} // namespace


@ -0,0 +1,52 @@
// Copyright (C) 2020 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#pragma once
#include <tuple>
#include <vector>
#include <string>
#include <memory>
#include "functional_test_utils/layer_test_utils.hpp"
#include "ngraph_functions/builders.hpp"
#include "ngraph_functions/utils/ngraph_helpers.hpp"
typedef std::tuple<
std::vector<size_t>, // levels
std::vector<std::vector<size_t>>, // const inputs shape
std::vector<float>, // fake quantize inputLow, inputHigh, outputLow, outputHigh or empty for random
std::vector<float> // input generator data: low, high, resolution
> fqSpecificParams;
typedef std::tuple<
fqSpecificParams,
InferenceEngine::Precision, // Net precision
InferenceEngine::Precision, // Input precision
InferenceEngine::Precision, // Output precision
InferenceEngine::Layout, // Input layout
InferenceEngine::Layout, // Output layout
InferenceEngine::SizeVector, // Input shapes
LayerTestsUtils::TargetDevice, // Device name
std::pair<std::string, std::map<std::string, std::string>>, // Additional backend configuration and an alias name for it
bool
> fqSubgraphTestParamsSet;
namespace LayerTestsDefinitions {
class FakeQuantizeSubgraphTest : public testing::WithParamInterface<fqSubgraphTestParamsSet>,
virtual public LayerTestsUtils::LayerTestsCommon {
public:
static std::string getTestCaseName(testing::TestParamInfo<fqSubgraphTestParamsSet> obj);
protected:
void SetUp() override;
InferenceEngine::Blob::Ptr GenerateInput(const InferenceEngine::InputInfo &info) const override;
protected:
float inputDataMin = 0.0;
float inputDataMax = 10.0;
float inputDataResolution = 1.0;
int32_t seed = 1;
};
} // namespace LayerTestsDefinitions


@ -111,8 +111,6 @@ void FakeQuantizeLayerTest::SetUp() {
{fqDirectArg[2]}, {fqDirectArg[2]},
{fqDirectArg[3]}); {fqDirectArg[3]});
} }
auto fq = std::dynamic_pointer_cast<ngraph::opset1::FakeQuantize>(fakeQNode); auto fq = std::dynamic_pointer_cast<ngraph::opset1::FakeQuantize>(fakeQNode);
ngraph::ResultVector results{std::make_shared<ngraph::opset1::Result>(fq)}; ngraph::ResultVector results{std::make_shared<ngraph::opset1::Result>(fq)};


@ -0,0 +1,168 @@
// Copyright (C) 2020 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include <tuple>
#include <vector>
#include <string>
#include <memory>
#include <functional>
#include <functional_test_utils/skip_tests_config.hpp>
#include "ie_core.hpp"
#include "common_test_utils/common_utils.hpp"
#include "functional_test_utils/blob_utils.hpp"
#include "functional_test_utils/plugin_cache.hpp"
#include "functional_test_utils/layer_test_utils.hpp"
#include "subgraph_tests/two_fake_quantize_to_fullyconnected.hpp"
namespace LayerTestsDefinitions {
std::string FakeQuantizeSubgraphTest::getTestCaseName(testing::TestParamInfo<fqSubgraphTestParamsSet> obj) {
fqSpecificParams fqParams;
InferenceEngine::Precision netPrecision;
InferenceEngine::Precision inPrc, outPrc;
InferenceEngine::Layout inLayout, outLayout;
InferenceEngine::SizeVector inputShapes;
std::string targetDevice;
std::pair<std::string, std::map<std::string, std::string>> config;
bool biases = false;
std::tie(fqParams, netPrecision, inPrc, outPrc, inLayout, outLayout, inputShapes, targetDevice, config, biases) = obj.param;
std::vector<size_t> levels;
std::vector<std::vector<size_t>> constShape;
std::vector<float> fqDirectArgs;
std::vector<float> inputArg;
std::tie(levels, constShape, fqDirectArgs, inputArg) = fqParams;
std::ostringstream result;
result << "IS=" << CommonTestUtils::vec2str(inputShapes) << "_";
result << "CS=" << CommonTestUtils::vec2str(constShape) << "_";
result << "LEVELS=" << CommonTestUtils::vec2str(levels) << "_";
result << "netPRC=" << netPrecision.name() << "_";
result << "inPRC=" << inPrc.name() << "_";
result << "outPRC=" << outPrc.name() << "_";
result << "inL=" << inLayout << "_";
result << "outL=" << outLayout << "_";
result << "biases=" << biases << "_";
result << "trgDev=" << targetDevice;
if (!config.first.empty()) {
result << "_targetConfig=" << config.first;
}
if (!fqDirectArgs.empty()) {
result << "_fqArgs=" << fqDirectArgs[0] << "_" << fqDirectArgs[1] << "_" << fqDirectArgs[2] << "_" << fqDirectArgs[3];
}
if (inputArg.size() == 3) {
result << "_inputArg=" << inputArg[0] << "_" << inputArg[1] << "_" << inputArg[2];
}
return result.str();
}
void FakeQuantizeSubgraphTest::SetUp() {
fqSpecificParams fqParams;
std::vector<size_t> inputShape;
std::pair<std::string, std::map<std::string, std::string>> config;
auto netPrecision = InferenceEngine::Precision::UNSPECIFIED;
bool biases = false;
std::tie(fqParams, netPrecision, inPrc, outPrc, inLayout, outLayout, inputShape, targetDevice, config, biases) = this->GetParam();
InferenceEngine::SizeVector kernel, stride, dilation;
std::vector<size_t> levels;
std::vector<std::vector<size_t>> constShape;
std::vector<float> fqDirectArg;
std::vector<float> inputArg;
std::tie(levels, constShape, fqDirectArg, inputArg) = fqParams;
if (inputArg.size() == 3) {
inputDataMin = inputArg[0];
inputDataMax = inputArg[1];
inputDataResolution = inputArg[2];
}
auto ngPrc = FuncTestUtils::PrecisionUtils::convertIE2nGraphPrc(netPrecision);
auto params = ngraph::builder::makeParams(ngPrc, {inputShape});
auto paramOuts = ngraph::helpers::convert2OutputVector(ngraph::helpers::castOps2Nodes<ngraph::op::Parameter>(params));
const int seed = 0;
std::mt19937 gen(seed);
auto generateFloatNumbers = [gen](std::size_t vec_len, float min, float max) mutable {
std::vector<float> res;
std::uniform_real_distribution<float> dist(min, max);
for (int i = 0; i < vec_len; i++)
res.emplace_back(static_cast<float>(dist(gen)));
return res;
};
auto weightsRowNum = constShape[1][0];
auto weightsColNum = inputShape[1];
auto weightsData = generateFloatNumbers(weightsRowNum * weightsColNum, inputDataMin, inputDataMax);
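// NOTE: weightsData is only used to derive the FakeQuantize ranges below; the weight constant itself
// is filled with the value 1.0f.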
auto const_param = ngraph::builder::makeConstant<float>(ngPrc, { constShape[1][0], inputShape[1] }, { 1.0f });
auto inputMinRange = std::vector<float>{};
auto inputMaxRange = std::vector<float>{};
auto channelDataSize = constShape[1];
if (channelDataSize[0] == 1) {
// Per-tensor quantization: a single min/max range is provided
inputMinRange.push_back(inputDataMin);
inputMaxRange.push_back(inputDataMax);
} else if (channelDataSize[0] == weightsRowNum) {
// Per-channel quantization: a min/max range is computed for every weights row
for (size_t i = 0; i < weightsRowNum; ++i) {
auto minChannelVal = std::numeric_limits<float>::max();
auto maxChannelVal = std::numeric_limits<float>::lowest();
for (size_t j = 0; j < weightsColNum; ++j) {
minChannelVal = std::min(minChannelVal, weightsData[i * weightsColNum + j]);
maxChannelVal = std::max(maxChannelVal, weightsData[i * weightsColNum + j]);
}
inputMinRange.push_back(minChannelVal);
inputMaxRange.push_back(maxChannelVal);
}
} else {
FAIL() << "Invalid test configuration";
}
auto lowNode = ngraph::builder::makeConstant(ngraph::element::f32, channelDataSize, inputMinRange, false);
auto highNode = ngraph::builder::makeConstant(ngraph::element::f32, channelDataSize, inputMaxRange, false);
auto inputFQNode = ngraph::builder::makeFakeQuantize(paramOuts[0], ngraph::element::f32, levels[0], constShape[0],
{ inputDataMin }, { inputDataMax }, { inputDataMin }, { inputDataMax });
auto weightsFQNode = std::make_shared<ngraph::opset1::FakeQuantize>(const_param,
lowNode, highNode, lowNode, highNode, levels[1]);
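// The first FakeQuantize clamps the activations to the per-tensor [inputDataMin, inputDataMax] range;
// the second one quantizes the weight constant, per channel when constShape[1][0] equals the number of weight rows.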
auto inputFQ = std::dynamic_pointer_cast<ngraph::opset1::FakeQuantize>(inputFQNode);
auto weightsFQ = std::dynamic_pointer_cast<ngraph::opset1::FakeQuantize>(weightsFQNode);
auto matmul = std::make_shared<ngraph::opset1::MatMul>(inputFQ, weightsFQ, false, true);
std::shared_ptr<ngraph::Node> biases_node;
if (biases) {
auto const_bias = ngraph::builder::makeConstant(ngPrc, {1, constShape[1][0]}, std::vector<float>{ -1.0f });
biases_node = std::make_shared<ngraph::opset1::Add>(matmul, const_bias);
} else {
biases_node = matmul;
}
auto sigmoid = std::make_shared<ngraph::opset1::Sigmoid>(biases_node);
ngraph::ResultVector results{std::make_shared<ngraph::opset1::Result>(sigmoid)};
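// With biases enabled, a second Sigmoid fed directly by the input FakeQuantize is exposed as an extra
// network output, so the input FakeQuantize layer has more than one consumer.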
if (biases) {
auto sigmoid_2 = std::make_shared<ngraph::opset1::Sigmoid>(inputFQ);
results.push_back(std::make_shared<ngraph::opset1::Result>(sigmoid_2));
}
function = std::make_shared<ngraph::Function>(results, params, "fakeQuantizeSubgraph");
configuration = config.second;
}
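// Fills the input blob with reproducible values from the [inputDataMin, inputDataMax] range at the
// configured resolution, so the data stays within the range of the input FakeQuantize.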
InferenceEngine::Blob::Ptr FakeQuantizeSubgraphTest::GenerateInput(const InferenceEngine::InputInfo &info) const {
return FuncTestUtils::createAndFillBlob(info.getTensorDesc(), inputDataMax - inputDataMin, inputDataMin, 1 / inputDataResolution,
seed);
}
TEST_P(FakeQuantizeSubgraphTest, CompareWithRefs) {
Run();
}
} // namespace LayerTestsDefinitions
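For context, the GNA-side tests enabled by this change instantiate the shared class above through GoogleTest parameter generators. A minimal sketch of such an instantiation is shown below; the suite name, value sets, input shape, and device configuration pair are illustrative assumptions, not the exact contents of the GNA plugin test file:

// Illustrative instantiation sketch: parameter values below are assumptions.
#include <map>
#include <string>
#include <utility>
#include <vector>

#include "subgraph_tests/two_fake_quantize_to_fullyconnected.hpp"
#include "common_test_utils/test_constants.hpp"

using namespace LayerTestsDefinitions;

namespace {
// levels[0] drives the input FakeQuantize, levels[1] the weights FakeQuantize (255 -> int8 weights).
const std::vector<std::vector<size_t>> levels = {{65535, 255}};
// constShape[0] is the input FQ constant shape; constShape[1] = {5, 1} selects per-channel weights FQ with 5 rows.
const std::vector<std::vector<std::vector<size_t>>> constShapes = {{{1}, {5, 1}}};
const std::vector<std::vector<float>> fqArgs = {{-2.0f, 2.0f, -2.0f, 2.0f}};
const std::vector<std::vector<float>> inputParams = {{-2.0f, 2.0f, 0.01f}};  // min, max, resolution

const std::pair<std::string, std::map<std::string, std::string>> gnaConfig = {
    "sw_exact", {{"GNA_DEVICE_MODE", "GNA_SW_EXACT"}}};

const auto fqParams = ::testing::Combine(
    ::testing::ValuesIn(levels),
    ::testing::ValuesIn(constShapes),
    ::testing::ValuesIn(fqArgs),
    ::testing::ValuesIn(inputParams));

INSTANTIATE_TEST_CASE_P(smoke_FakeQuantizeSubgraph, FakeQuantizeSubgraphTest,
    ::testing::Combine(
        fqParams,
        ::testing::Values(InferenceEngine::Precision::FP32),
        ::testing::Values(InferenceEngine::Precision::UNSPECIFIED),
        ::testing::Values(InferenceEngine::Precision::UNSPECIFIED),
        ::testing::Values(InferenceEngine::Layout::ANY),
        ::testing::Values(InferenceEngine::Layout::ANY),
        ::testing::Values(std::vector<size_t>({1, 128})),
        ::testing::Values(CommonTestUtils::DEVICE_GNA),
        ::testing::Values(gnaConfig),
        ::testing::Values(false)),  // no bias branch
    FakeQuantizeSubgraphTest::getTestCaseName);
}  // namespace

Replacing ::testing::Values(false) with ::testing::Values(false, true) would also cover the two-output variant built when biases are enabled.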


@@ -137,8 +137,8 @@ TEST_F(I16QuantisationTest, DISABLED_outputScaleFactorForAffineIsCorrect){
     auto quantParams = getInjectedData<QuantizedLayerParams>(affineLayerPtr);
-    ASSERT_FLOAT_EQ(quantParams->_dst_quant.scale, 100);
-    ASSERT_FLOAT_EQ(quantParams->_weights_quant.scale, 100);
+    ASSERT_FLOAT_EQ(quantParams->_dst_quant.GetScale(), 100);
+    ASSERT_FLOAT_EQ(quantParams->_weights_quant.GetScale(), 100);
 }
 TEST_F(I16QuantisationTest, OnlyAffine_NoActivationInsertion) {