[GNA] Fake quantization layer support for int-8 mode for GNA plugin (#2937)
* [GNA] added support for per-channel FakeQuantize layers
* [GNA] added quantisation type detection in FQ-enabled networks, and input scale factor detection from FakeQuantize layers connected to the input layer
* added a FakeQuantize callback that will be used to cast integer values stored as float in FakeQuantize layers
* fixed per-channel multiplier calculation for the int8 case
* precision improvements for int8 fake quantization and support for propagating scale factors to activation layers
* added initial int16 support
* added support for FakeQuantize layers with many connected output layers, and support for FQ data encoded as FP16
* added support for already quantized weights
* Shared single layer test
* Added subgraph test
* Fix comment
* int8
* Enabling FQ tests on GNA

Co-authored-by: Eugene Smirnov <eugene.smirnov@intel.com>
Co-authored-by: Andrey Dmitriev <andrey.dmitriev@intel.com>
Parent: 27be33ba53
Commit: fc1a3ce2f1
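Background note (not part of the commit): a FakeQuantize layer is described by a level count plus input and output ranges, and the per-channel variant simply carries one range per channel. A minimal sketch of that transfer function, with illustrative names only:

#include <cmath>
#include <cstdint>

// Sketch of the FakeQuantize mapping that the new levels / input_low / input_high /
// output_low / output_high activation arguments below describe. Illustrative only.
float fake_quantize(float x, int32_t levels,
                    float in_lo, float in_hi,
                    float out_lo, float out_hi) {
    if (x <= in_lo) return out_lo;
    if (x >= in_hi) return out_hi;
    // snap to one of `levels` discrete steps of the input range,
    // then map that step onto the output range
    float step = std::round((x - in_lo) / (in_hi - in_lo) * (levels - 1));
    return out_lo + step * (out_hi - out_lo) / (levels - 1);
}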
@@ -45,14 +45,15 @@ struct DnnActivation {
         } pow;
         struct {
             int32_t levels;
-            float input_low;
-            float input_high;
-            float output_low;
-            float output_high;
+            // if input is per-channel quantization - input pointers contains per-channel ranges
+            int8_t inputPerChannel;
+            float *input_low;
+            float *input_high;
+            // if output is per-channel quantization - output pointers contains per-channel ranges
+            int8_t outputPerChannel;
+            float *output_low;
+            float *output_high;
         } fakeQuantize;
-        struct {
-            float reserved[5];
-        };
     } args;
     operator DnnActivationType () const noexcept {
         return type;
@@ -15,6 +15,7 @@ struct GNAFlags {
     bool uniformPwlDesign = false;
     bool gna_openmp_multithreading = false;
     bool sw_fp32 = false;
+    bool fake_quantized = false;
     bool performance_counting = false;
 };
 } // namespace GNAPluginNS
@@ -83,6 +83,10 @@ struct QuantI8 : public QuantDescTmpl<P_TYPE(I16), P_TYPE(I32), P_TYPE(I8), gna
     }
 };
 
+// for support proper trait instantiation for quantization function callback
+struct FakeQuantI16 : public QuantI16 {};
+struct FakeQuantI8 : public QuantI8 {};
+
 template <class A, class B>
 struct QuantPair {
     using MandatoryType = A;
@@ -115,7 +119,7 @@ inline bool shouldAlwaysAllocate<gna_compound_bias_t>() {
  */
 template <class T>
 class Quant {
 public:
     template<class ...Args>
     void operator()(Args && ... args) const { }
 };
@@ -125,7 +129,9 @@ class Quant<QuantI16> {
 public:
     template<class ...Args>
     void operator()(Args && ... args) const {
-        QuantizeAffine16(std::forward<Args>(args)...);
+        QuantizationCallback<int16_t, int32_t> {
+            std::forward<Args>(args)...
+        }.runQuantize();
     }
 };
 
@@ -134,10 +140,35 @@ class Quant<QuantI8> {
 public:
     template<class ...Args>
     void operator()(Args && ... args) const {
-        QuantizeAffine8(std::forward<Args>(args)...);
+        QuantizationCallback<int8_t, gna_compound_bias_t> {
+            std::forward<Args>(args)...
+        }.runQuantize();
     }
 };
 
+template<>
+class Quant<FakeQuantI16> {
+public:
+    template<class ...Args>
+    void operator()(Args && ... args) const {
+        QuantizationCallback<int16_t, int32_t> {
+            std::forward<Args>(args)...
+        }.runFakeQuantize();
+    }
+};
+
+template<>
+class Quant<FakeQuantI8> {
+public:
+    template<class ...Args>
+    void operator()(Args && ... args) const {
+        QuantizationCallback<int8_t, gna_compound_bias_t>{
+            std::forward<Args>(args)...
+        }.runFakeQuantize();
+    }
+};
+
+
 template <typename T>
 inline InferenceEngine::Blob::Ptr fp32_to_precision_blob(InferenceEngine::Blob::Ptr fp32_blob, InferenceEngine::Precision precision, float scale_factor) {
     auto prec_blob = InferenceEngine::make_shared_blob<T>({ precision,
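The two specializations above keep the existing call sites unchanged while routing them to the fake-quantize entry point. A simplified illustration of the dispatch (hypothetical helper, not in the patch):

#include <utility>

// Hypothetical helper: the same forwarded arguments end up in a QuantizationCallback
// either way; only the chosen entry point differs.
//   Quant<QuantI8>     -> QuantizationCallback<int8_t, gna_compound_bias_t>::runQuantize()
//   Quant<FakeQuantI8> -> QuantizationCallback<int8_t, gna_compound_bias_t>::runFakeQuantize()
template <class QuantDescT, class... Args>
void run_weight_quantization(Args&&... args) {
    Quant<QuantDescT>()(std::forward<Args>(args)...);
}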
@@ -242,7 +273,7 @@ inline void quantizeWeightsBiases(const QuantDesc & quantDesc,
     if (InferenceEngine::CNNNetHasPrevLayer(wl)) {
         auto quantDataForInputLayer =
             InferenceEngine::getInjectedData<QuantizedLayerParams>(*InferenceEngine::CNNNetPrevLayer(wl).get());
-        input_scale_factor = quantDataForInputLayer->_dst_quant.scale;
+        input_scale_factor = quantDataForInputLayer->_dst_quant.GetScale();
         if (std::isnan(input_scale_factor) ||
             std::isinf(input_scale_factor)) {
             THROW_IE_EXCEPTION << "Unsupported input scale factor value " << input_scale_factor;
@@ -273,17 +304,26 @@ inline void quantizeWeightsBiases(const QuantDesc & quantDesc,
 
     auto quantData = InferenceEngine::getInjectedData<QuantizedLayerParams>(*wl);
     {
+        auto per_channel_weights = !quantData->_weights_quant.GetMinValues().empty();
+        auto weightsScale = quantData->_weights_quant.GetScale();
+        auto dstScale = quantData->_dst_quant.GetScale();
         fnc(wl->_weights->buffer().as<float *>(),
             wl->_biases ? wl->_biases->buffer().as<float *>() : nullptr,
             intWeights->buffer(),
             intBiases ? intBiases->buffer() : static_cast<BiasesPrecision *>(nullptr),
             input_scale_factor,
-            &quantData->_weights_quant.scale,
-            &quantData->_dst_quant.scale,
+            &weightsScale,
+            &dstScale,
             num_rows,
             num_columns,
             num_rows_padded,
-            num_columns_padded);
+            num_columns_padded,
+            quantData->_weights_quant.GetLevels(),
+            nullptr,
+            nullptr,
+            per_channel_weights ? &quantData->_weights_quant.GetMinValues().front(): nullptr,
+            per_channel_weights ? &quantData->_weights_quant.GetMaxValues().front(): nullptr,
+            &quantData->_weights_quantized);
     }
     wl->_weights = intWeights;
     wl->_biases = intBiases;
@@ -343,7 +383,7 @@ inline void quantizeWeightsBiasesConv(const QuantDesc & quantDesc,
     if (InferenceEngine::CNNNetHasPrevLayer(conv)) {
         auto quantDataForInputLayer =
             InferenceEngine::getInjectedData<QuantizedLayerParams>(*InferenceEngine::CNNNetPrevLayer(conv).get());
-        input_scale_factor = quantDataForInputLayer->_dst_quant.scale;
+        input_scale_factor = quantDataForInputLayer->_dst_quant.GetScale();
         if (std::isnan(input_scale_factor) ||
             std::isinf(input_scale_factor)) {
             THROW_IE_EXCEPTION << "Unsupported input scale factor value " << input_scale_factor;
@@ -370,13 +410,15 @@ inline void quantizeWeightsBiasesConv(const QuantDesc & quantDesc,
 
     auto quantData = InferenceEngine::getInjectedData<QuantizedLayerParams>(*conv);
     {
+        auto weightsScale = quantData->_weights_quant.GetScale();
+        auto dstScale = quantData->_dst_quant.GetScale();
         fnc(conv->_weights->buffer().as<float *>(),
             conv->_biases ? conv->_biases->buffer().as<float *>() : nullptr,
             intWeights->buffer(),
             intBiases ? intBiases->buffer() : static_cast<BiasesPrecision *>(nullptr),
             input_scale_factor,
-            &quantData->_weights_quant.scale,
-            &quantData->_dst_quant.scale,
+            &weightsScale,
+            &dstScale,
             num_rows,
             num_columns,
             num_rows_padded,
@@ -447,7 +489,7 @@ class DataQuantizer<Desc, InferenceEngine::CNNLayer *> : public DataQuantizerBas
     if (cnnLayer->blobs["custom"]->getTensorDesc().getPrecision() == InferenceEngine::Precision::FP16) {
         cnnLayer->blobs["custom"] = make_fp32_blob(cnnLayer->blobs["custom"]);
     }
-    auto const_scale_factor = InferenceEngine::getInjectedData<QuantizedLayerParams>(*cnnLayer)->_dst_quant.scale;
+    auto const_scale_factor = InferenceEngine::getInjectedData<QuantizedLayerParams>(*cnnLayer)->_dst_quant.GetScale();
     auto new_const_blob = InferenceEngine::Blob::CreateFromData(cnnLayer->outData[0]);
     auto const_blob = cnnLayer->blobs["custom"];
     if (const_blob->getTensorDesc().getPrecision() == InferenceEngine::Precision::FP32) {
@@ -563,4 +605,9 @@ class LayersQuantizer : public frontend::DataQuantizerBase {
 using QuantI16 = frontend::QuantPair<frontend::QuantI16, frontend::QuantI16>;
 using QuantI8 = frontend::QuantPair<frontend::QuantI8, frontend::QuantI16>;
 
+
+using FakeQuantI16 = frontend::QuantPair<frontend::FakeQuantI16, frontend::FakeQuantI16>;
+using FakeQuantI8 = frontend::QuantPair<frontend::FakeQuantI8, frontend::FakeQuantI16>;
+
+
 } // namespace GNAPluginNS
@@ -80,7 +80,7 @@ class ModelQuantizer {
         THROW_GNA_EXCEPTION << "Scale factors are not set for some of the inputs";
     }
     IE_ASSERT(quantData != nullptr);
-    quantData->_src_quant.scale = scaleFactor[scaleIndex];
+    quantData->_src_quant.SetScale(scaleFactor[scaleIndex]);
     scaleIndex++;
 }
 
@@ -5,20 +5,91 @@
 #include <cstring>
 #include <iostream>
 #include <details/ie_exception.hpp>
+#include <gna_plugin_log.hpp>
+#include <limits>
 #include "backend/gna_types.h"
 #include "quantization.h"
 
-void QuantizeAffine16(float *ptr_float_weights,
-                      float *ptr_float_biases,
-                      int16_t *ptr_int_weights,
-                      int32_t *ptr_int_biases,
-                      float input_scale_factor,
-                      float *ptr_weight_scale_factor,
-                      float *ptr_output_scale_factor,
-                      uint32_t num_rows,
-                      uint32_t num_columns,
-                      uint32_t num_rows_padded,
-                      uint32_t num_columns_padded) {
+#ifdef DEBUG
+#define QUANTWARNING(...) (fprintf(stderr, __VA_ARGS__))
+#else
+#define QUANTWARNING(...)
+#endif
+
+
+template<>
+void QuantizationCallback<int16_t, int32_t>::runFakeQuantize() const {
+    uint32_t num_saturate = 0;
+
+    for (uint32_t row = 0; row < num_rows; row++) {
+        for (uint32_t col = 0; col < num_columns; col++) {
+            float rounding_value = (ptr_float_weights[row * num_columns + col] > 0) ? 0.5f : -0.5f;
+            float value = ptr_float_weights[row * num_columns + col];
+            if (!*ptr_quantized_weights) {
+                value = value * *ptr_weight_scale_factor + rounding_value;
+            } else {
+                value -= MAX_VAL_2B_WEIGHT;
+            }
+
+            int16_t* ptr_weight_16 = ptr_int_weights + (row * num_columns_padded + col);
+
+            if (*ptr_quantized_weights &&
+                (value > std::numeric_limits<int16_t>::max() ||
+                 value < std::numeric_limits<int16_t>::min())) {
+                THROW_GNA_EXCEPTION << "unsupported weights range for I16 quantisation: " << value;
+            }
+
+            if (value > std::numeric_limits<int16_t>::max()) {
+                *ptr_weight_16 = std::numeric_limits<int16_t>::max();
+                num_saturate++;
+            } else if (value < std::numeric_limits<int16_t>::min()) {
+                *ptr_weight_16 = std::numeric_limits<int16_t>::min();
+                num_saturate++;
+            } else {
+                *ptr_weight_16 = (int16_t)value;
+            }
+        }
+        for (uint32_t col = num_columns; col < num_columns_padded; col++) {
+            int16_t* ptr_weight_16 = ptr_int_weights + (row * num_columns_padded + col);
+            *ptr_weight_16 = 0;
+        }
+    }
+    for (uint32_t row = num_rows; row < num_rows_padded; row++) {
+        for (uint32_t col = 0; col < num_columns_padded; col++) {
+            int16_t* ptr_weight_16 = ptr_int_weights + (row * num_columns_padded + col);
+            *ptr_weight_16 = 0;
+        }
+    }
+
+    // case for element wise layer
+    if (ptr_float_biases != nullptr && ptr_int_biases != nullptr) {
+        for (uint32_t j = 0; j < num_rows; j++) {
+            float rounding_value = (ptr_float_biases[j] > 0) ? 0.5f : -0.5f;
+            float value = ptr_float_biases[j] * *ptr_output_scale_factor + rounding_value;
+            if (value > 2147483647.0) {
+                ptr_int_biases[j] = 2147483647L;
+                num_saturate++;
+            } else if (value < -2147483648.0) {
+                ptr_int_biases[j] = -2147483648LL;
+                num_saturate++;
+            } else {
+                ptr_int_biases[j] = (int32_t)value;
+            }
+        }
+        for (uint32_t j = num_rows; j < num_rows_padded; j++) {
+            ptr_int_biases[j] = 0;
+        }
+    }
+
+    if (num_saturate > 0) {
+        QUANTWARNING("Warning: %d / %d saturations in QuantizeAffine16()\n",
+                     num_saturate,
+                     num_rows * num_columns + num_rows);
+    }
+}
+
+template<>
+void QuantizationCallback<int16_t, int32_t>::runQuantize() const {
     uint32_t num_saturate = 0;
 
     if (*ptr_weight_scale_factor == 1.0) {
@@ -149,11 +220,90 @@ void QuantizeVector16(float *ptr_float_memory, int16_t *ptr_int_memory, uint32_t
     }
 }
 
-void QuantizeAffine8(float *ptr_float_weights, float *ptr_float_biases,
-                     int8_t *ptr_int_weights, gna_compound_bias_t *ptr_int_biases,
-                     float input_scale_factor, float *ptr_weight_scale_factor,
-                     float *ptr_output_scale_factor, uint32_t num_rows, uint32_t num_columns,
-                     uint32_t num_rows_padded, uint32_t num_columns_padded) {
+template<>
+void QuantizationCallback<int8_t, gna_compound_bias_t>::runFakeQuantize() const {
+    uint32_t num_saturate = 0;
+
+    if (fq_ptr_output_high == nullptr || fq_ptr_output_low == nullptr) {
+        THROW_GNA_EXCEPTION << "Fake quantized output range not set";
+    }
+    if (fq_levels == 0 || fq_levels == 1) {
+        THROW_GNA_EXCEPTION << "Fake quantized levels not set";
+    }
+
+    for (uint32_t i = 0; i < num_rows; i++) {
+        uint32_t channel_multiplier = ((fq_ptr_output_high[i] - fq_ptr_output_low[i]) *
+            *ptr_weight_scale_factor) / (fq_levels - 1) + 0.5f;
+        ptr_int_biases[i].multiplier = static_cast<uint8_t> (channel_multiplier);
+        if (channel_multiplier > MAX_OUT_MULTIPLIER) {
+            THROW_GNA_EXCEPTION << "invalid channel multiplier: " << channel_multiplier;
+        }
+
+        for (uint32_t j = 0; j < num_columns; j++) {
+            auto offset = i * num_columns + j;
+            auto rounding_value = (ptr_float_weights[i * num_columns + j] > 0) ? 0.5f : -0.5f;
+            float value = ptr_float_weights[offset];
+            if (!*ptr_quantized_weights) {
+                value = value * (*ptr_weight_scale_factor / ptr_int_biases[i].multiplier) + rounding_value;
+            } else {
+                value -= MAX_VAL_1B_WEIGHT;
+            }
+            auto normalizedWeight = static_cast<int32_t>(value);
+
+            if (*ptr_quantized_weights &&
+                (value > std::numeric_limits<int8_t>::max() ||
+                 value < std::numeric_limits<int8_t>::min())) {
+                THROW_GNA_EXCEPTION << "unsupported weights range for I8 quantization: " << value;
+            }
+
+            if (value > std::numeric_limits<int8_t>::max()) {
+                normalizedWeight = std::numeric_limits<int8_t>::max();
+                num_saturate++;
+            } else if (value < std::numeric_limits<int8_t>::min()) {
+                normalizedWeight = std::numeric_limits<int8_t>::min();
+                num_saturate++;
+            } else {
+                normalizedWeight = (int8_t)value;
+            }
+
+            // range checking
+            ptr_int_weights[offset] = static_cast<int8_t>(normalizedWeight);
+        }
+
+        for (uint32_t j = num_columns; j < num_columns_padded; j++) {
+            ptr_int_weights[i * num_columns + j] = 0;
+        }
+    }
+
+    for (uint32_t i = num_rows; i < num_rows_padded; i++) {
+        for (uint32_t j = 0; j < num_columns_padded; j++) {
+            ptr_int_weights[i * num_columns + j] = 0;
+        }
+        ptr_int_biases[i].multiplier = 0;
+    }
+
+    if (ptr_float_biases != nullptr) {
+        for (uint32_t j = 0; j < num_rows; j++) {
+            float rounding_value = (ptr_float_biases[j] > 0) ? 0.5f : -0.5f;
+            float value = ptr_float_biases[j] * *ptr_output_scale_factor + rounding_value;
+            if (value > 2147483647.0) {
+                ptr_int_biases[j].bias = 2147483647L;
+                num_saturate++;
+            } else if (value < -2147483648.0) {
+                ptr_int_biases[j].bias = -2147483648LL;
+                num_saturate++;
+            } else {
+                ptr_int_biases[j].bias = (int32_t) value;
+            }
+        }
+    }
+    if (num_saturate > 0) {
+        QUANTWARNING("Warning: %d / %d saturations in QuantizeAffine8()\n", num_saturate, num_rows * num_columns + num_rows);
+    }
+}
+
+template<>
+void QuantizationCallback<int8_t, gna_compound_bias_t>::runQuantize() const {
     if (ptr_int_biases == nullptr) {
         THROW_IE_EXCEPTION << "Int biases are empty";
     }
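To make the per-channel multiplier above concrete, a worked example with made-up numbers: with fq_levels = 255, a channel output range of [-0.5, 0.5] and a weight scale factor of 2540, the multiplier is (0.5 - (-0.5)) * 2540 / (255 - 1) + 0.5 = 10.5, which truncates to 10 and fits the uint8_t multiplier of the compound bias:

#include <cstdint>

// Worked example (illustrative numbers only) of the per-channel multiplier formula above.
uint32_t channel_multiplier =
    static_cast<uint32_t>((0.5f - (-0.5f)) * 2540.0f / (255 - 1) + 0.5f);  // == 10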
@@ -16,25 +16,34 @@
 #define MAX_VAL_2B_WEIGHT 16384
 #define MAX_VAL_2B_FEAT 16384
 #define MAX_VAL_4B_BIAS 1073741824
-#ifdef DEBUG
-#define QUANTWARNING(...) (fprintf(stderr, __VA_ARGS__))
-#else
-#define QUANTWARNING(...)
-#endif
 
-void QuantizeAffine16(float *ptr_float_weights,
-                      float *ptr_float_biases,
-                      int16_t *ptr_int_weights,
-                      int32_t *ptr_int_biases,
-                      float input_scale_factor,
-                      float *ptr_weight_scale_factor,
-                      float *ptr_output_scale_factor,
-                      uint32_t num_rows,
-                      uint32_t num_columns,
-                      uint32_t num_rows_padded,
-                      uint32_t num_columns_padded);
+template <class WeightsType, class BiasType>
+struct QuantizationCallback {
+    float *ptr_float_weights;
+    float *ptr_float_biases;
+    WeightsType* ptr_int_weights;
+    BiasType* ptr_int_biases;
+    float input_scale_factor;
+    float *ptr_weight_scale_factor;
+    float *ptr_output_scale_factor;
+    uint32_t num_rows;
+    uint32_t num_columns;
+    uint32_t num_rows_padded;
+    uint32_t num_columns_padded;
+
+    int32_t fq_levels;
+    const float *fq_ptr_input_low;
+    const float *fq_ptr_input_high;
+    const float *fq_ptr_output_low;
+    const float *fq_ptr_output_high;
+    const bool* ptr_quantized_weights;
+
+    void runQuantize() const;
+    void runFakeQuantize() const;
+};
+
+template class QuantizationCallback<int16_t, int32_t>;
+template class QuantizationCallback<int8_t, gna_compound_bias_t>;
+
 float ScaleFactorForQuantization(void *ptr_float_memory, float target_max, size_t num_elements);
 void QuantizeVector16(float *ptr_float_memory, int16_t *ptr_int_memory, uint32_t num_elements, float scale_factor);
-void QuantizeAffine8(float *ptr_float_weights, float *ptr_float_biases, int8_t *ptr_int_weights, gna_compound_bias_t *ptr_int_biases,
-                     float input_scale_factor, float *ptr_weight_scale_factor, float *ptr_output_scale_factor,
-                     uint32_t num_rows, uint32_t num_columns, uint32_t num_rows_padded, uint32_t num_columns_padded);
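In this header the struct only declares runQuantize and runFakeQuantize; the bodies are supplied in quantization.cpp as explicit specializations for the two supported type pairs (see the hunks above), and the two template class lines name exactly those pairs. A generic sketch of the same declaration/definition split, with hypothetical names:

#include <cstdint>

// Header side (sketch): fields plus member declarations without generic bodies.
template <class W, class B>
struct Callback {
    W *weights;
    void run() const;               // no generic body provided
};

// Source side (sketch): bodies given only for the supported pairs.
template <>
void Callback<int16_t, int32_t>::run() const {
    // int16 weight path would go here
}

template <>
void Callback<int8_t, int64_t>::run() const {  // int64_t stands in for gna_compound_bias_t
    // int8 weight path would go here
}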
@@ -6,19 +6,57 @@
 
 namespace GNAPluginNS {
 
-struct Quantization {
+class Quantization {
+public:
+    void SetScale(float s) {
+        scale = s;
+        scale_set = true;
+    }
+    float GetScale() const {
+        return scale;
+    }
+    bool IsScaleSet() const {
+        return scale_set;
+    }
+    void SetLevels(int32_t l) {
+        levels = l;
+    }
+    int32_t GetLevels() const {
+        return levels;
+    }
+    void SetMinValues(const std::vector<float> &min) {
+        min_values.clear();
+        min_values.insert(min_values.end(), min.begin(), min.end());
+    }
+    const std::vector<float>& GetMinValues() const {
+        return min_values;
+    }
+    void SetMaxValues(const std::vector<float>& max) {
+        max_values.clear();
+        max_values.insert(max_values.end(), max.begin(), max.end());
+    }
+    const std::vector<float>& GetMaxValues() const {
+        return max_values;
+    }
+
+private:
     float scale = 1.0f;
-    float offset = 0.0f;
-    int shift = 0.0f;
+    bool scale_set = false;
+    int32_t levels = 0;
+    std::vector<float> min_values;
+    std::vector<float> max_values;
 };
 
 struct QuantizedLayerParams {
     Quantization _src_quant;
     Quantization _dst_quant;
+
+    // deprecate this
     Quantization _weights_quant;
+    bool _weights_quantized = false;
     Quantization _bias_quant;
     float _o_shift = 0.0f;
     float _b_shift = 0.0f;
 };
 
 } // namespace GNAPluginNS
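The setter/getter form replaces the bare scale field so that the scale-factor pass can tell an explicitly assigned scale (from a user-supplied input scale factor or a FakeQuantize range) apart from the default of 1.0. A small usage sketch, illustrative only:

#include <vector>
// #include "quantized_layer_params.hpp"  // the header shown above

void example(GNAPluginNS::Quantization &q) {
    if (!q.IsScaleSet()) {
        q.SetScale(1024.0f);        // e.g. a user-provided input scale factor
    }
    q.SetLevels(256);               // level count taken from a FakeQuantize layer
    q.SetMinValues({ -1.0f });      // one entry per tensor, or one per channel
    q.SetMaxValues({ 1.0f });
    float sf = q.IsScaleSet() ? q.GetScale() : 1.0f;
    (void)sf;
}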
@@ -64,8 +64,9 @@ class ScaleFactorPerLayer<InferenceEngine::CNNLayer *> {
     }
 
     float getActivationScale(InferenceEngine::CNNLayer const* cnnLayer,
-                             GNAPluginNS::LayerInfo const& layer,
-                             QuantizedLayerParams const* quantizedParams) {
+                             GNAPluginNS::LayerInfo const& layer) {
+        auto quantizedParams = InferenceEngine::getInjectedData<QuantizedLayerParams>(*cnnLayer);
+
         // todo: calculate proper scale factor where we need to expand it a bit to be safe to stay in int16 weights
         // set the initial value
         float result = activation_scale_factor;
@@ -82,29 +83,29 @@ class ScaleFactorPerLayer<InferenceEngine::CNNLayer *> {
 
             for (int slope_scale_index = 1; slope_scale_index != 5; slope_scale_index ++) {
                 auto slope_scale = static_cast<double>(static_cast<uint64_t>(1) << (8 * slope_scale_index));
-                auto mink = min_range * slope_scale / quantizedParams->_src_quant.scale;
-                auto maxk = max_range * slope_scale / quantizedParams->_src_quant.scale;
+                auto mink = min_range * slope_scale / quantizedParams->_src_quant.GetScale();
+                auto maxk = max_range * slope_scale / quantizedParams->_src_quant.GetScale();
 
                 if (mink < std::numeric_limits<int16_t>::max()) {
                     auto localMaxK = std::min(static_cast<double>(std::numeric_limits<int16_t>::max()), maxk);
                     if (localMaxK > optimalK) {
-                        result = localMaxK / slope_scale * quantizedParams->_src_quant.scale;
+                        result = localMaxK / slope_scale * quantizedParams->_src_quant.GetScale();
                         optimalK = localMaxK;
                     }
                 }
             }
 #else
             // GNA scale factor encoding might poor represent target slop scale, we are probing 2 values
-            auto s = gna_slope(1.0, quantizedParams->_src_quant.scale, identity_scale_factor);
+            auto s = gna_slope(1.0, quantizedParams->_src_quant.GetScale(), identity_scale_factor);
             auto scale_default = s.slope * s.slope_scale;
             // probing one more quite good approximation for identity
-            s = gna_slope(1.0, quantizedParams->_src_quant.scale, identity_scale_factor / 2);
+            s = gna_slope(1.0, quantizedParams->_src_quant.GetScale(), identity_scale_factor / 2);
             auto scale_extra = s.slope * s.slope_scale;
             result = fabs(scale_extra) > fabs(scale_default) ? identity_scale_factor / 2 : identity_scale_factor;
 
 #endif
         } else if (layer.isRelu() &&
-                   static_cast<uint64_t>(activation_scale_factor * quantizedParams->_src_quant.scale)
+                   static_cast<uint64_t>(activation_scale_factor * quantizedParams->_src_quant.GetScale())
                    > std::numeric_limits<int32_t>::max()-1) {
             // if activation is one from relu family, we need to apply heuristic to avoid activation output overflow
             result = (activation_scale_factor * 0.5);
@@ -118,10 +119,10 @@ class ScaleFactorPerLayer<InferenceEngine::CNNLayer *> {
             auto input_max_value = static_cast<double>(std::numeric_limits<int32_t>::max());
             auto output_max_value = static_cast<double>(std::numeric_limits<int16_t>::max());
 
-            auto x_min = fp32eq(fmod(powerLayer->power, 1.0), 0) ? input_min_value / quantizedParams->_src_quant.scale : 0.0;
+            auto x_min = fp32eq(fmod(powerLayer->power, 1.0), 0) ? input_min_value / quantizedParams->_src_quant.GetScale() : 0.0;
             x_min = std::max(x_min, -pow_domain);
 
-            auto x_max = input_max_value / quantizedParams->_src_quant.scale;
+            auto x_max = input_max_value / quantizedParams->_src_quant.GetScale();
             x_max = std::min(x_max, pow_domain);
 
             auto val1 = pow(x_min * powerLayer->scale + powerLayer->offset, powerLayer->power);
@@ -134,6 +135,14 @@ class ScaleFactorPerLayer<InferenceEngine::CNNLayer *> {
                 result = scale_val;
             }
         }
+
+        if (!quantizedParams->_dst_quant.GetMaxValues().empty()) {
+            auto min_value = quantizedParams->_dst_quant.GetMinValues().front();
+            auto max_value = quantizedParams->_dst_quant.GetMaxValues().front();
+            auto newScaleFactor = (quantizedParams->_dst_quant.GetLevels() - 1) / (max_value - min_value);
+            result = newScaleFactor < result ? newScaleFactor : result;
+        }
+
         return result;
     }
 
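The new block above lets a FakeQuantize output range cap the activation scale factor, and only ever lowers the previously chosen value. A worked example with illustrative numbers:

// 256 levels over an output range [-4, 4]:
//   (256 - 1) / (4 - (-4)) = 31.875
// so a previously chosen default such as 2048 is replaced, because the smaller candidate wins.
float result = 2048.0f;                               // previously chosen scale factor
float newScaleFactor = (256 - 1) / (4.0f - (-4.0f));  // 31.875
result = newScaleFactor < result ? newScaleFactor : result;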
@@ -147,12 +156,16 @@ class ScaleFactorPerLayer<InferenceEngine::CNNLayer *> {
         auto quant = InferenceEngine::getInjectedData<QuantizedLayerParams>(*cnnLayer);
 
         if (InferenceEngine::details::CaselessEq<std::string>()(cnnLayer->type, "Memory")) {
-            if (CNNNetHasPrevLayer(cnnLayer)) {
+            if (!CNNNetHasPrevLayer(cnnLayer) && quant->_dst_quant.IsScaleSet()) {
+                quant->_src_quant = quant->_dst_quant;
+            }
+
+            if (CNNNetHasPrevLayer(cnnLayer)) {
                 auto prevLayer = CNNNetPrevLayer(cnnLayer);
                 auto prevInfo = LayerInfo(prevLayer);
                 auto inputQuant = InferenceEngine::getInjectedData<QuantizedLayerParams>(prevLayer);
                 // locating corresponding memory layers with same ID
-                for (auto && input : CNNNetGetAllInputLayers(cnnLayer)) {
+                for (auto&& input : CNNNetGetAllInputLayers(cnnLayer)) {
                     LayerInfo ll(input);
                     if (!ll.isMemory() ||
                         !InferenceEngine::details::CaselessEq<std::string>()(input->params["id"], cnnLayer->params["id"])) {
@@ -162,35 +175,36 @@ class ScaleFactorPerLayer<InferenceEngine::CNNLayer *> {
                     auto quantSibling = InferenceEngine::getInjectedData<QuantizedLayerParams>(input);
 
                     // after restarting from memory input - quant is fine
-                    if (fp32eq(quantSibling->_dst_quant.scale, inputQuant->_dst_quant.scale)) {
-                        quant->_src_quant.scale = quant->_dst_quant.scale = inputQuant->_dst_quant.scale;
+                    if (fp32eq(quantSibling->_dst_quant.GetScale(), inputQuant->_dst_quant.GetScale())) {
+                        quant->_src_quant.SetScale(inputQuant->_dst_quant.GetScale());
+                        quant->_dst_quant.SetScale(inputQuant->_dst_quant.GetScale());
                         return true;
                     }
 
-                    if (!fp32eq(quantSibling->_dst_quant.scale, 1)) {
+                    if (quantSibling->_dst_quant.IsScaleSet()) {
                         // means we already restarted propagation input memory layer
                         // need to search for requantiseable layer prior memory output layer
                         InferenceEngine::CNNLayerPtr restartedLayer;
 
-                        gnalog() << "Memory layer :"<< input->name << " scale factor: " << quantSibling->_dst_quant.scale
-                                 << " doesn't match its outputs counterpart: " << cnnLayer->name << " scale factor: " << inputQuant->_dst_quant.scale << "\n";
-                        gnalog() << "[UFS] searching for quantizeable input layer for: "<< cnnLayer->name << "\n";
+                        gnalog() << "Memory layer :" << input->name << " scale factor: " << quantSibling->_dst_quant.GetScale()
                                 << " doesn't match its outputs counterpart: " << cnnLayer->name << " scale factor: " << inputQuant->_dst_quant.GetScale() << "\n";
+                        gnalog() << "[UFS] searching for quantizeable input layer for: " << cnnLayer->name << "\n";
 
-                        CNNNetDFS(InferenceEngine::CNNLayerPtr(cnnLayer, [](InferenceEngine::CNNLayer *) {}),
+                        CNNNetDFS(InferenceEngine::CNNLayerPtr(cnnLayer, [](InferenceEngine::CNNLayer*) {}),
                                   [&restartedLayer, cnnLayer](InferenceEngine::CNNLayerPtr layer) {
                                       gnalog() << "[UFS] from : " << cnnLayer->name << " reached: " << layer->name;
                                       // found that direct input to concat is a indirect parent of align filter - so no link required
                                       auto info = LayerInfo(layer);
                                       if (!info.isWeightable() && !info.isActivation()) {
                                           gnalog() << "... skipped\n";
                                           return;
                                       }
                                       restartedLayer = layer;
                                       gnalog() << "... OK, need requantize\n";
-                                  }, true, [&restartedLayer, &cnnLayer](InferenceEngine::CNNLayer *from) {
+                                  }, true, [&restartedLayer, &cnnLayer](InferenceEngine::CNNLayer* from) {
                                       // aborting UFS once found suitable layer
                                       return make_upstream_order(restartedLayer == nullptr ? from : nullptr);
                                   });
 
                         if (restartedLayer == nullptr) {
                             THROW_GNA_EXCEPTION << "cannot requantize input to " << cnnLayer->name;
@@ -201,23 +215,23 @@ class ScaleFactorPerLayer<InferenceEngine::CNNLayer *> {
                         auto restarLayerInfo = LayerInfo(restartedLayer);
                         if (restarLayerInfo.isActivation()) {
                             // requantize activation by just changing it's output scale factor
-                            quantDataForMemoryOutput->_dst_quant.scale = quantSibling->_dst_quant.scale;
+                            quantDataForMemoryOutput->_dst_quant.SetScale(quantSibling->_dst_quant.GetScale());
                         } else {
-                            THROW_GNA_EXCEPTION << "quantization error : input scale factor ( " << inputQuant->_dst_quant.scale <<") "
-                                                << " for " << cnnLayer->name << ", that is child of " << prevLayer->name <<" doesnt match : "
+                            THROW_GNA_EXCEPTION << "quantization error : input scale factor ( " << inputQuant->_dst_quant.GetScale() << ") "
+                                                << " for " << cnnLayer->name << ", that is child of " << prevLayer->name << " doesnt match : "
                                                 << activation_scale_factor;
                         }
 
                         result = ScaleFactorUpdateResult(restartedLayer.get());
                         return true;
                     }
 
-                    gnawarn() << "[INFO] quantization : input scale factor (" << inputQuant->_dst_quant.scale <<")"
-                              << " for " << cnnLayer->name << ", that is child of " << prevLayer->name <<" doesnt match : "
-                              << activation_scale_factor << ", restarting from corresponding memory: "<< input->name << std::endl;
+                    gnawarn() << "[INFO] quantization : input scale factor (" << inputQuant->_dst_quant.GetScale() << ")"
+                              << " for " << cnnLayer->name << ", that is child of " << prevLayer->name << " doesnt match : "
+                              << activation_scale_factor << ", restarting from corresponding memory: " << input->name << std::endl;
 
                     // try updating memory input layer scale factor and restart from it
-                    quantSibling->_src_quant.scale = quantSibling->_dst_quant.scale = inputQuant->_dst_quant.scale;
+                    quantSibling->_src_quant = quantSibling->_dst_quant = inputQuant->_dst_quant;
                     result = ScaleFactorUpdateResult(input.get());
                     return true;
                 }
@@ -226,11 +240,16 @@ class ScaleFactorPerLayer<InferenceEngine::CNNLayer *> {
         }
 
         if (cnnLayer->type == "Const") {
+            if (quant->_dst_quant.IsScaleSet()) {
+                quant->_src_quant = quant->_dst_quant;
+                return ScaleFactorUpdateResult();
+            }
+
             auto blob = cnnLayer->blobs["custom"];
             auto blob_precision = blob->getTensorDesc().getPrecision();
 
             if (blob_precision != InferenceEngine::Precision::FP32 && blob_precision != InferenceEngine::Precision::FP16) {
-                quant->_dst_quant.scale = 1.0f;
+                quant->_dst_quant.SetScale(1.0f);
                 return true;
             }
 
@@ -255,16 +274,16 @@ class ScaleFactorPerLayer<InferenceEngine::CNNLayer *> {
 
             // TODO: Investigate what should be the scale in such cases (31910)
             if (std::isinf(scale_val)) {
-                quant->_dst_quant.scale = quant->_src_quant.scale;
+                quant->_dst_quant.SetScale(quant->_src_quant.GetScale());
             } else {
-                quant->_dst_quant.scale = scale_val;
+                quant->_dst_quant.SetScale(scale_val);
             }
 
             return ScaleFactorUpdateResult();
         }
 
         if (!CNNNetHasPrevLayer(cnnLayer)) {
-            quant->_dst_quant.scale = quant->_src_quant.scale;
+            quant->_dst_quant = quant->_src_quant;
             return ScaleFactorUpdateResult();
         }
 
@@ -273,14 +292,17 @@ class ScaleFactorPerLayer<InferenceEngine::CNNLayer *> {
         if (!inputQuant) {
             THROW_GNA_EXCEPTION << "layer: " << CNNNetPrevLayer(cnnLayer)->name << "not quantized";
         }
-        quant->_dst_quant.scale = inputQuant->_dst_quant.scale;
-        quant->_src_quant.scale = inputQuant->_dst_quant.scale;
 
+        quant->_src_quant = inputQuant->_dst_quant;
         if (layerInfo.isActivation()) {
             // todo: calculate proper scale factor where we need to expand it a bit to be safe to stay in int16 weights
             // set the initial value
-            quant->_dst_quant.scale = getActivationScale(cnnLayer, layerInfo, quant);
+            auto scale = getActivationScale(cnnLayer, layerInfo);
+            quant->_dst_quant.SetScale(scale);
+            return true;
         }
+        quant->_dst_quant = inputQuant->_dst_quant;
+
         return true;
     }
 };
@@ -302,8 +324,8 @@ class ScaleFactorPerLayer<InferenceEngine::EltwiseLayer*> {
 
         switch (eltwiseLayer->_operation) {
             case InferenceEngine::EltwiseLayer::Prod: {
-                quantData->_weights_quant.scale = quantParams1->_dst_quant.scale;
-                quantData->_dst_quant.scale = quantParams0->_dst_quant.scale * quantParams1->_dst_quant.scale;
+                quantData->_weights_quant = quantParams1->_dst_quant;
+                quantData->_dst_quant.SetScale(quantParams0->_dst_quant.GetScale() * quantParams1->_dst_quant.GetScale());
                 break;
             }
             case InferenceEngine::EltwiseLayer::Sub:
@@ -325,13 +347,13 @@ class ScaleFactorPerLayer<InferenceEngine::EltwiseLayer*> {
                 }
 
                 // this path might result in significant data loss
-                quantData->_bias_quant.scale = quantParams1->_dst_quant.scale / quantParams0->_dst_quant.scale;
-                quantData->_weights_quant.scale = quantParams1->_dst_quant.scale / quantParams0->_dst_quant.scale;
-                quantData->_dst_quant.scale = quantParams1->_dst_quant.scale;
+                quantData->_bias_quant.SetScale(quantParams1->_dst_quant.GetScale() / quantParams0->_dst_quant.GetScale());
+                quantData->_weights_quant.SetScale(quantParams1->_dst_quant.GetScale() / quantParams0->_dst_quant.GetScale());
+                quantData->_dst_quant.SetScale(quantParams1->_dst_quant.GetScale());
 
                 // eltwise will always work in int16
                 auto maxValue = std::numeric_limits<int16_t>::max() - 1;
-                if (quantData->_weights_quant.scale > maxValue + 1) {
+                if (quantData->_weights_quant.GetScale() > maxValue + 1) {
                     // rescaling it's activation input
                     // iterating thru previous layers of eltwise
                     for (uint8_t i = 0; i < 2; ++i) {
@@ -347,15 +369,15 @@ class ScaleFactorPerLayer<InferenceEngine::EltwiseLayer*> {
                         if (info.isSplit() || info.isSlice()) {
                             continue;
                         } else if (info.has16BOutput() && info.isActivation()) {
-                            auto newOutputScale = quantParams->_dst_quant.scale / maxValue;
+                            auto newOutputScale = quantParams->_dst_quant.GetScale() / maxValue;
                             if (newOutputScale > static_cast<float>(std::numeric_limits<int16_t>::max()) / 2) {
                                 break;
                             }
                             auto quantDataForActivation = InferenceEngine::getInjectedData<QuantizedLayerParams>(*in);
                             gnawarn() << "[WARNING] saturated weights for " << eltwiseLayer->name
                                       << ". Layer new output scale: " << in->name << ", output_scale=" << newOutputScale
-                                      << ", was " << quantDataForActivation->_dst_quant.scale <<"\n" << std::flush;
-                            quantDataForActivation->_dst_quant.scale = newOutputScale;
+                                      << ", was " << quantDataForActivation->_dst_quant.GetScale() <<"\n" << std::flush;
+                            quantDataForActivation->_dst_quant.SetScale(newOutputScale);
                             result = ScaleFactorUpdateResult(in.get());
                             return true;
                         } else if (info.has16BOutput()) {
@@ -365,10 +387,10 @@ class ScaleFactorPerLayer<InferenceEngine::EltwiseLayer*> {
                             // if we are here it means that we are in the port 1
                             if (info.isFullyConnected() || info.isConvolution()) {
                                 auto quantDataForInputLayer = InferenceEngine::getInjectedData<QuantizedLayerParams>(*in);
-                                auto newOutputScale = quantParams->_dst_quant.scale * maxValue;
-                                auto newWeightScale = newOutputScale / quantDataForInputLayer->_src_quant.scale;
-                                quantDataForInputLayer->_dst_quant.scale = newOutputScale;
-                                quantDataForInputLayer->_weights_quant.scale = newWeightScale;
+                                auto newOutputScale = quantParams->_dst_quant.GetScale() * maxValue;
+                                auto newWeightScale = newOutputScale / quantDataForInputLayer->_src_quant.GetScale();
+                                quantDataForInputLayer->_dst_quant.SetScale(newOutputScale);
+                                quantDataForInputLayer->_weights_quant.SetScale(newWeightScale);
                                 result = ScaleFactorUpdateResult(in.get());
                                 return true;
                             }
@@ -410,15 +432,15 @@ class ScaleFactorPerLayer<InferenceEngine::ConcatLayer*> {
         // if all inputs have same quant value - trivial propagation
         auto in0 = inputLayers.front();
         auto quantParams0 = InferenceEngine::getInjectedData<QuantizedLayerParams>(in0);
-        auto scaleFactor = quantParams0->_dst_quant.scale;
+        auto scaleFactor = quantParams0->_dst_quant.GetScale();
         auto scaleFactorCheck = [scaleFactor, &fp32eq](InferenceEngine::CNNLayerPtr& inputLayer) {
             auto quantParams = InferenceEngine::getInjectedData<QuantizedLayerParams>(inputLayer);
-            return fp32eq(quantParams->_dst_quant.scale, scaleFactor);
+            return fp32eq(quantParams->_dst_quant.GetScale(), scaleFactor);
         };
 
         if (std::find_if_not(inputLayers.begin() + 1, inputLayers.end(), scaleFactorCheck) == inputLayers.end()) {
-            quantData->_dst_quant.scale = quantParams0->_dst_quant.scale;
-            quantData->_src_quant.scale = quantParams0->_dst_quant.scale;
+            quantData->_dst_quant.SetScale(quantParams0->_dst_quant.GetScale());
+            quantData->_src_quant.SetScale(quantParams0->_dst_quant.GetScale());
             return true;
         }
 
@@ -435,7 +457,7 @@ class ScaleFactorPerLayer<InferenceEngine::ConcatLayer*> {
         auto nextInputIt = firstInputIt + 1;
         while ((nextInputIt = std::find_if(nextInputIt, inputLayers.end(), inputLayerCheck)) != inputLayers.end()) {
             auto quantParamsSecond = InferenceEngine::getInjectedData<QuantizedLayerParams>(*nextInputIt);
-            if (!fp32eq(quantParamsSecond->_dst_quant.scale, quantParamsFirst->_dst_quant.scale)) {
+            if (!fp32eq(quantParamsSecond->_dst_quant.GetScale(), quantParamsFirst->_dst_quant.GetScale())) {
                 THROW_GNA_EXCEPTION << "Two Input layers " << (*firstInputIt)->name
                     << " and " << (*nextInputIt)->name << " have different scales in concat!!! \n";
             }
@@ -449,7 +471,7 @@ class ScaleFactorPerLayer<InferenceEngine::ConcatLayer*> {
         auto sourceLayerCheck = [&fp32eq](InferenceEngine::CNNLayerPtr& inputLayer) {
             auto quantParams = InferenceEngine::getInjectedData<QuantizedLayerParams>(inputLayer);
             LayerInfo info(inputLayer);
-            return !info.isActivation() && !fp32eq(quantParams->_dst_quant.scale, 1.0f);
+            return !info.isActivation() && !fp32eq(quantParams->_dst_quant.GetScale(), 1.0f);
         };
 
         static std::map<std::string, size_t> restarted_counter;
@@ -469,7 +491,7 @@ class ScaleFactorPerLayer<InferenceEngine::ConcatLayer*> {
         if (sourceLayerIt == inputLayers.end()) {
             auto nonDefaultScaleFactor = [&fp32eq](InferenceEngine::CNNLayerPtr& inputLayer) {
                 auto quantParams = InferenceEngine::getInjectedData<QuantizedLayerParams>(inputLayer);
-                return !fp32eq(quantParams->_dst_quant.scale, 1.0f);
+                return !fp32eq(quantParams->_dst_quant.GetScale(), 1.0f);
             };
 
             sourceLayerIt = std::find_if(inputLayers.begin(), inputLayers.end(), nonDefaultScaleFactor);
@@ -478,29 +500,28 @@ class ScaleFactorPerLayer<InferenceEngine::ConcatLayer*> {
         std::set<size_t> concatIdxToUpdate;
         if (sourceLayerIt != inputLayers.end()) {
             auto quantParams = InferenceEngine::getInjectedData<QuantizedLayerParams>(*sourceLayerIt);
-            auto scaleFactor = quantParams->_dst_quant.scale;
+            auto scaleFactor = quantParams->_dst_quant.GetScale();
             sourceQuantParams = quantParams;
 
             for (auto it = inputLayers.begin(); it != inputLayers.end(); ++it) {
                 auto quantParamsIn = InferenceEngine::getInjectedData<QuantizedLayerParams>(*it);
-                if (fp32eq(quantParamsIn->_dst_quant.scale, scaleFactor)) {
+                if (fp32eq(quantParamsIn->_dst_quant.GetScale(), scaleFactor)) {
                     continue;
                 }
 
                 // possible case when some of the concat inputs are free to select scale ex: const->concat<-affine
-                if (!fp32eq(quantParamsIn->_dst_quant.scale, 1.0f) && !LayerInfo(*it).isActivation()) {
+                if (!fp32eq(quantParamsIn->_dst_quant.GetScale(), 1.0f) && !LayerInfo(*it).isActivation()) {
                     concatIdxToUpdate.insert(std::distance(inputLayers.begin(), it));
                 }
 
-                quantParamsIn->_weights_quant = quantParams->_dst_quant;
-                quantParamsIn->_dst_quant = quantParams->_dst_quant;
+                quantParamsIn->_dst_quant.SetScale(quantParams->_dst_quant.GetScale());
             }
         }
 
-        auto updatedScaleFactor = InferenceEngine::getInjectedData<QuantizedLayerParams>(in0)->_dst_quant.scale;
+        auto updatedScaleFactor = InferenceEngine::getInjectedData<QuantizedLayerParams>(in0)->_dst_quant.GetScale();
         auto equalScaleFactor = [updatedScaleFactor, &fp32eq](InferenceEngine::CNNLayerPtr& inputLayer) {
             auto quantParams = InferenceEngine::getInjectedData<QuantizedLayerParams>(inputLayer);
-            return fp32eq(quantParams->_dst_quant.scale, updatedScaleFactor);
+            return fp32eq(quantParams->_dst_quant.GetScale(), updatedScaleFactor);
         };
 
         auto layerIt = std::find_if_not(inputLayers.begin() + 1, inputLayers.end(), equalScaleFactor);
@@ -508,8 +529,8 @@ class ScaleFactorPerLayer<InferenceEngine::ConcatLayer*> {
             THROW_GNA_EXCEPTION << "layers entered into concat have different scale factors" << concatLayer->name;
         }
 
-        quantData->_dst_quant.scale = sourceQuantParams->_dst_quant.scale;
-        quantData->_src_quant.scale = sourceQuantParams->_dst_quant.scale;
+        quantData->_dst_quant.SetScale(sourceQuantParams->_dst_quant.GetScale());
+        quantData->_src_quant.SetScale(sourceQuantParams->_dst_quant.GetScale());
 
         if (layerIt == inputLayers.end() && concatIdxToUpdate.empty()) {
             return true;
@@ -517,7 +538,7 @@ class ScaleFactorPerLayer<InferenceEngine::ConcatLayer*> {
 
         for (auto& layerIdToUpdate : concatIdxToUpdate) {
             auto destinationQuantParams = InferenceEngine::getInjectedData<QuantizedLayerParams>(*concatLayer);
-            destinationQuantParams->_dst_quant.scale = sourceQuantParams->_dst_quant.scale;
+            destinationQuantParams->_dst_quant.SetScale(sourceQuantParams->_dst_quant.GetScale());
 
             InferenceEngine::CNNLayerPtr restartedLayer;
             // making a link activation possible without extra layer if first input to concat not a parent / indirect parent of second input
@@ -542,18 +563,18 @@ class ScaleFactorPerLayer<InferenceEngine::ConcatLayer*> {
             });
 
             if (restartedLayer == nullptr) {
-                THROW_GNA_EXCEPTION << "cannot requantize " << layerIdToUpdate << "input to concat: " << concatLayer->name;
+                THROW_GNA_EXCEPTION << "cannot requantize " << layerIdToUpdate << " input to concat: " << concatLayer->name;
             }
             auto quantDataForConCatInput = InferenceEngine::getInjectedData<QuantizedLayerParams>(*restartedLayer);
 
             auto restarLayerInfo = LayerInfo(restartedLayer);
             if (restarLayerInfo.isActivation()) {
                 // requantize activation by just changing it's output scale factor
-                quantDataForConCatInput->_dst_quant.scale = sourceQuantParams->_dst_quant.scale;
+                quantDataForConCatInput->_dst_quant.SetScale(sourceQuantParams->_dst_quant.GetScale());
             }
             if (restarLayerInfo.isConst()) {
                 gnalog() << "... warning const layer will be requantized\n";
-                quantDataForConCatInput->_dst_quant.scale = sourceQuantParams->_dst_quant.scale;
+                quantDataForConCatInput->_dst_quant.SetScale(sourceQuantParams->_dst_quant.GetScale());
             }
             result = ScaleFactorUpdateResult(restartedLayer.get());
         }
@@ -588,9 +609,9 @@ class ScaleFactorPerLayer<InferenceEngine::WeightableLayer*> {
             InferenceEngine::getInjectedData<QuantizedLayerParams>(*InferenceEngine::CNNNetPrevLayer(wl).get());
 
         auto quant = InferenceEngine::getInjectedData<QuantizedLayerParams>(*wl);
-        quant->_src_quant.scale = quantDataForInputLayer->_dst_quant.scale;
+        quant->_src_quant = quantDataForInputLayer->_dst_quant;
         // TODO: pass 8 bits somehow
-        if (quant->_weights_quant.scale == 1.0f) {
+        if (quant->_weights_quant.GetScale() == 1.0f) {
             size_t scaleRange = 0;
             if (weightsSize == 2) {
                 scaleRange = MAX_VAL_2B_WEIGHT;
@ -599,67 +620,61 @@ class ScaleFactorPerLayer<InferenceEngine::WeightableLayer*> {
|
|||||||
} else {
|
} else {
|
||||||
THROW_GNA_EXCEPTION << "Unsupported weights size of: " << weightsSize;
|
THROW_GNA_EXCEPTION << "Unsupported weights size of: " << weightsSize;
|
||||||
}
|
}
|
||||||
quant->_weights_quant.scale =
|
quant->_weights_quant.SetScale(
|
||||||
ScaleFactorForQuantization(wl->_weights->buffer().as<float *>(), scaleRange, wl->_weights->size());
|
ScaleFactorForQuantization(wl->_weights->buffer().as<float *>(), scaleRange, wl->_weights->size()));
|
||||||
if (quant->_weights_quant.scale == -1.0f) {
|
if (quant->_weights_quant.GetScale() == -1.0f) {
|
||||||
quant->_weights_quant.scale = 1.0f;
|
quant->_weights_quant.SetScale(1.0f);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (wl->_biases) {
|
if (wl->_biases) {
|
||||||
quant->_bias_quant.scale = ScaleFactorForQuantization(wl->_biases->buffer().as<float *>(),
|
quant->_bias_quant.SetScale(ScaleFactorForQuantization(wl->_biases->buffer().as<float *>(),
|
||||||
MAX_VAL_4B_BIAS,
|
MAX_VAL_4B_BIAS,
|
||||||
wl->_biases->size());
|
wl->_biases->size()));
|
||||||
if (quant->_bias_quant.scale != -1.0f) {
|
if (quant->_bias_quant.GetScale() != -1.0f) {
|
||||||
quant->_bias_quant.scale = std::min(quant->_weights_quant.scale * quant->_src_quant.scale, quant->_bias_quant.scale);
|
quant->_bias_quant.SetScale(
|
||||||
quant->_weights_quant.scale = quant->_bias_quant.scale / quant->_src_quant.scale;
|
std::min(quant->_weights_quant.GetScale() * quant->_src_quant.GetScale(), quant->_bias_quant.GetScale()));
|
||||||
|
quant->_weights_quant.SetScale(quant->_bias_quant.GetScale() / quant->_src_quant.GetScale());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// TODO: findout why ???
|
// TODO: findout why ???
|
||||||
if (weightsSize == 1) {
|
if (weightsSize == 1) {
|
||||||
quant->_weights_quant.scale *= MAX_OUT_MULTIPLIER;
|
quant->_weights_quant.SetScale(quant->_weights_quant.GetScale() * MAX_OUT_MULTIPLIER);
|
||||||
}
|
}
|
||||||
|
|
||||||
double weights_reducer = 1.0;
|
double weights_reducer = 1.0;
|
||||||
auto conv = dynamic_cast<InferenceEngine::ConvolutionLayer*>(wl);
|
auto conv = dynamic_cast<InferenceEngine::ConvolutionLayer *>(wl);
|
||||||
if (conv) {
|
if (conv) {
|
||||||
auto dims = conv->insData.front().lock()->getDims();
|
auto dims = conv->insData.front().lock()->getDims();
|
||||||
|
|
||||||
weights_reducer = MAX_VAL_2B_FEAT * scaleRange * dims[1] / std::numeric_limits<int32_t>::max();
|
weights_reducer = MAX_VAL_2B_FEAT * scaleRange * dims[1] / std::numeric_limits<int32_t>::max();
|
||||||
weights_reducer = std::max(1.0, weights_reducer);
|
weights_reducer = std::max(1.0, weights_reducer);
|
||||||
}
|
}
|
||||||
quant->_weights_quant.scale /= weights_reducer;
|
quant->_weights_quant.SetScale(quant->_weights_quant.GetScale() / weights_reducer);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
double tmp_dst_quant_scale = quant->_weights_quant.GetScale() * quant->_src_quant.GetScale();
|
||||||
double tmp_dst_quant_scale = quant->_weights_quant.scale * quantDataForInputLayer->_dst_quant.scale;
|
|
||||||
|
|
||||||
if (weightsSize == 1 &&
|
if (weightsSize == 1 &&
|
||||||
static_cast<uint64_t>(tmp_dst_quant_scale * quant->_src_quant.scale) >
|
static_cast<uint64_t>(tmp_dst_quant_scale * quant->_src_quant.GetScale()) >
|
||||||
static_cast<uint64_t>(std::numeric_limits<int32_t>::max()-1) * _scale_change_req_threshold) {
|
static_cast<uint64_t>(std::numeric_limits<int32_t>::max() - 1) * _scale_change_req_threshold) {
|
||||||
gnawarn() << "Output scale for " << wl->name
|
gnawarn() << "Output scale for " << wl->name
|
||||||
<< " too large and are being reduced. Else saturations likely will happen \n";
|
<< " too large and are being reduced. Else saturations likely will happen \n";
|
||||||
// reduce weight scale according experimental heuristic
|
// reduce weight scale according experimental heuristic
|
||||||
if (quant->_dst_quant.scale * quant->_src_quant.scale /
|
if (quant->_dst_quant.GetScale() * quant->_src_quant.GetScale() /
|
||||||
static_cast<float>(std::numeric_limits<int32_t>::max()) < _scale_change_threshold_100) {
|
static_cast<float>(std::numeric_limits<int32_t>::max()) < _scale_change_threshold_100) {
|
||||||
quant->_weights_quant.scale *= _scale_reduction_50;
|
quant->_weights_quant.SetScale(quant->_weights_quant.GetScale() * _scale_reduction_50);
|
||||||
tmp_dst_quant_scale *= _scale_reduction_50;
|
} else if (quant->_dst_quant.GetScale() * quant->_src_quant.GetScale() /
|
||||||
} else if (quant->_dst_quant.scale * quant->_src_quant.scale /
|
static_cast<float>(std::numeric_limits<int32_t>::max()) < _scale_change_threshold_150) {
|
||||||
static_cast<float>(std::numeric_limits<int32_t>::max()) < _scale_change_threshold_150) {
|
quant->_weights_quant.SetScale(quant->_weights_quant.GetScale() * _scale_reduction_45);
|
||||||
quant->_weights_quant.scale *= _scale_reduction_45;
|
} else if (quant->_dst_quant.GetScale() * quant->_src_quant.GetScale() /
|
||||||
tmp_dst_quant_scale *= _scale_reduction_45;
|
static_cast<float>(std::numeric_limits<int32_t>::max()) < _scale_change_threshold_200) {
|
||||||
} else if (quant->_dst_quant.scale * quant->_src_quant.scale /
|
quant->_weights_quant.SetScale(quant->_weights_quant.GetScale() * _scale_reduction_40);
|
||||||
static_cast<float>(std::numeric_limits<int32_t>::max()) < _scale_change_threshold_200) {
|
|
||||||
quant->_weights_quant.scale *= _scale_reduction_40;
|
|
||||||
tmp_dst_quant_scale *= _scale_reduction_40;
|
|
||||||
} else {
|
} else {
|
||||||
quant->_weights_quant.scale *= _scale_reduction_35;
|
quant->_weights_quant.SetScale(quant->_weights_quant.GetScale() * _scale_reduction_35);
|
||||||
tmp_dst_quant_scale *= _scale_reduction_35;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
quant->_dst_quant.scale = tmp_dst_quant_scale;
|
quant->_dst_quant.SetScale(quant->_weights_quant.GetScale() * quant->_src_quant.GetScale());
|
||||||
|
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
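The weightable-layer branch above reduces to a simple rule: derive the weight scale factor from the dynamic range of the float weights, then take weight scale times input scale as the 32-bit accumulator (output) scale. A standalone sketch of that arithmetic, assuming ScaleFactorForQuantization follows the usual target-range / max|w| convention and returns -1.0f for an empty range (the helper below is hypothetical, not the plugin's implementation):

    #include <algorithm>
    #include <cmath>
    #include <cstddef>
    #include <cstdio>

    // Hypothetical stand-in for ScaleFactorForQuantization: target_range / max|w|, or -1.0f if all weights are zero.
    static float scaleFactorForQuantizationSketch(const float* weights, float targetRange, std::size_t count) {
        float maxAbs = 0.0f;
        for (std::size_t i = 0; i < count; ++i) {
            maxAbs = std::max(maxAbs, std::fabs(weights[i]));
        }
        return maxAbs == 0.0f ? -1.0f : targetRange / maxAbs;
    }

    int main() {
        const float weights[] = {0.25f, -0.5f, 0.125f};
        const float srcScale = 16384.0f;        // scale factor of the already-quantized input
        const float weightsRange = 127.0f;      // 1-byte weights: int8 analogue of MAX_VAL_2B_WEIGHT (assumed value)
        float weightScale = scaleFactorForQuantizationSketch(weights, weightsRange, 3);
        if (weightScale == -1.0f) weightScale = 1.0f;       // same fallback as the code above
        const float dstScale = weightScale * srcScale;      // int32 accumulator scale: 254 * 16384
        std::printf("weight scale %.1f, output scale %.1f\n", weightScale, dstScale);
        return 0;
    }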
@@ -31,6 +31,7 @@
 #include "layers/layers_builder.hpp"
 #include "layers/gna_concat_layer.hpp"
 #include "layers/gna_crop_layer.hpp"
+#include "layers/gna_fake_quantize_layer.hpp"
 #include "round_float_define.hpp"
 #include "gna_plugin_policy.hpp"

@@ -377,8 +378,8 @@ void GNAGraphCompiler::ConvolutionPrimitive(InferenceEngine::CNNLayerPtr layer)
 float output_scale_factor = 1.0f;
 auto quantized = InferenceEngine::getInjectedData<QuantizedLayerParams>(layer);
 if (quantized != nullptr) {
-    weight_scale_factor = quantized->_weights_quant.scale;
-    output_scale_factor = quantized->_dst_quant.scale;
+    weight_scale_factor = quantized->_weights_quant.GetScale();
+    output_scale_factor = quantized->_dst_quant.GetScale();
 }

 auto& currentComponent = dnnComponents.addComponent(layer->name, "convolution");

@@ -541,8 +542,8 @@ void GNAGraphCompiler::PowerPrimitive(InferenceEngine::CNNLayerPtr layer) {
 // TODO: only fp32 and Int16 tested
 quantized == nullptr ? input->getPrecision().size() : 2,
 quantized == nullptr ? input->getPrecision().size() : 4,
-quantized == nullptr ? 1 : quantized->_weights_quant.scale,
-quantized == nullptr ? 1 : quantized->_dst_quant.scale,
+quantized == nullptr ? 1 : quantized->_weights_quant.GetScale(),
+quantized == nullptr ? 1 : quantized->_dst_quant.GetScale(),
 ptr_inputs,
 ptr_outputs,
 ptr_weights,

@@ -558,9 +559,9 @@ void GNAGraphCompiler::PowerPrimitive(InferenceEngine::CNNLayerPtr layer) {
 gnamem->readonly().push_value(ptr_biases, power.offset, num_rows_out, 64);
 } else {
     IE_ASSERT(quantized != nullptr);
-    auto quantizedScale = FLOAT_TO_INT16(std::min(quantized->_weights_quant.scale * power.scale,
+    auto quantizedScale = FLOAT_TO_INT16(std::min(quantized->_weights_quant.GetScale() * power.scale,
         static_cast<float>(INT16_MAX)));
-    auto quantizedOffset = FLOAT_TO_INT32(std::min(quantized->_dst_quant.scale * power.offset,
+    auto quantizedOffset = FLOAT_TO_INT32(std::min(quantized->_dst_quant.GetScale() * power.offset,
         static_cast<float>(INT32_MAX)));
     gnamem->readonly().push_value<int16_t>(ptr_weights, quantizedScale, num_rows_out, 64);
     gnamem->readonly().push_value<int32_t>(ptr_biases, quantizedOffset, num_rows_out, 64);

@@ -580,8 +581,8 @@ void GNAGraphCompiler::PowerPrimitive(InferenceEngine::CNNLayerPtr layer) {

 gna_pwl_segment_t* ptr_pwl_segments_target = nullptr;

-float output_pwl_scale_factor = quantized != nullptr ? quantized->_dst_quant.scale : 1.0f;
-float input_pwl_scale_factor = quantized != nullptr ? quantized->_src_quant.scale : 1.0f;
+float output_pwl_scale_factor = quantized != nullptr ? quantized->_dst_quant.GetScale() : 1.0f;
+float input_pwl_scale_factor = quantized != nullptr ? quantized->_src_quant.GetScale() : 1.0f;

 if (!gnaFlags->sw_fp32) {
     if (gnaFlags->uniformPwlDesign) {

@@ -687,7 +688,7 @@ void GNAGraphCompiler::PoolingPrimitive(InferenceEngine::CNNLayerPtr layer) {
 pooling._kernel[X_AXIS],
 num_columns_in,
 false,
-quantized == nullptr ? 1 : quantized->_dst_quant.scale,
+quantized == nullptr ? 1 : quantized->_dst_quant.GetScale(),
 ptr_inputs,
 ptr_outputs);

@@ -727,7 +728,7 @@ void GNAGraphCompiler::CopyPrimitive(InferenceEngine::CNNLayerPtr layer) {
 num_columns_out,
 inputs->getPrecision().size(),
 outputs->getPrecision().size(),
-quantized == nullptr ? 1 : quantized->_dst_quant.scale,
+quantized == nullptr ? 1 : quantized->_dst_quant.GetScale(),
 num_rows_out + num_padding_out,
 num_columns_out,
 ptr_inputs,

@@ -915,8 +916,8 @@ void GNAGraphCompiler::CropPrimitive(InferenceEngine::CNNLayerPtr layer) {
 4,
 quantized == nullptr ? inputs->getPrecision().size() : 2,
 4,
-quantized == nullptr ? 1 : quantized->_weights_quant.scale,
-quantized == nullptr ? 1 : quantized->_dst_quant.scale,
+quantized == nullptr ? 1 : quantized->_weights_quant.GetScale(),
+quantized == nullptr ? 1 : quantized->_dst_quant.GetScale(),
 ptr_inputs,
 ptr_outputs,
 ptr_weights,

@@ -1028,8 +1029,8 @@ void GNAGraphCompiler::EltwisePrimitive(InferenceEngine::CNNLayerPtr layer) {
 // TODO: only fp32 and Int16 tested
 quantized == nullptr ? inputs2Bytes->getPrecision().size() : 2,
 quantized == nullptr ? inputs4Bytes->getPrecision().size() : 4,
-quantized == nullptr ? 1 : quantized->_weights_quant.scale,
-quantized == nullptr ? 1 : quantized->_dst_quant.scale,
+quantized == nullptr ? 1 : quantized->_weights_quant.GetScale(),
+quantized == nullptr ? 1 : quantized->_dst_quant.GetScale(),
 ptr_inputs,
 ptr_outputs,
 ptr_weights,

@@ -1050,7 +1051,7 @@ void GNAGraphCompiler::EltwisePrimitive(InferenceEngine::CNNLayerPtr layer) {
 if (quantized == nullptr) {
     gnamem->readonly().push_value(ptr_weights, -1.0f, num_rows_out, 64);
 } else {
-    auto scaledIdentity = -quantized->_weights_quant.scale;
+    auto scaledIdentity = -quantized->_weights_quant.GetScale();

     auto quantizedIdentity = FLOAT_TO_INT16(std::min(scaledIdentity, static_cast<float>(INT16_MAX)));

@@ -1062,7 +1063,7 @@ void GNAGraphCompiler::EltwisePrimitive(InferenceEngine::CNNLayerPtr layer) {
 if (quantized == nullptr) {
     gnamem->readonly().push_value(ptr_weights, 1.0f, num_rows_out, 64);
 } else {
-    auto scaledIdentity = quantized->_weights_quant.scale;
+    auto scaledIdentity = quantized->_weights_quant.GetScale();

     auto quantizedIdentity = FLOAT_TO_INT16(std::min(scaledIdentity, static_cast<float>(INT16_MAX)));

@@ -1132,8 +1133,8 @@ void GNAGraphCompiler::AffinePrimitive(InferenceEngine::CNNLayerPtr layer, bool
 outputs->getPrecision().size(),
 weightable._weights->getTensorDesc().getPrecision().size(),
 biasPrecision.size(),
-quantized == nullptr ? 1 : quantized->_weights_quant.scale,
-quantized == nullptr ? 1 : quantized->_dst_quant.scale,
+quantized == nullptr ? 1 : quantized->_weights_quant.GetScale(),
+quantized == nullptr ? 1 : quantized->_dst_quant.GetScale(),
 ptr_inputs,
 ptr_outputs,
 ptr_weights,

@@ -1310,7 +1311,7 @@ void GNAGraphCompiler::ConcatAlignFilterPrimitive(InferenceEngine::CNNLayerPtr l
 num_columns_in,
 inputs->getPrecision().size(),
 inputs->getPrecision().size(),
-quantized == nullptr ? 1 : quantized->_dst_quant.scale,
+quantized == nullptr ? 1 : quantized->_dst_quant.GetScale(),
 num_rows_copied,
 num_columns_in,
 ptr_inputs,

@@ -1346,8 +1347,8 @@ void GNAGraphCompiler::ConcatAlignFilterPrimitive(InferenceEngine::CNNLayerPtr l
 outputs->getPrecision().size(),
 filterLayer->_weights->getTensorDesc().getPrecision().size(),
 biasPrecision.size(),
-quantized == nullptr ? 1 : quantized->_weights_quant.scale,
-quantized == nullptr ? 1 : quantized->_dst_quant.scale,
+quantized == nullptr ? 1 : quantized->_weights_quant.GetScale(),
+quantized == nullptr ? 1 : quantized->_dst_quant.GetScale(),
 ptr_inputs,
 ptr_outputs,
 ptr_weights,

@@ -1436,8 +1437,8 @@ void GNAGraphCompiler::AffineFilterPrimitive(InferenceEngine::CNNLayerPtr layer)
 outputs->getPrecision().size(),
 filterLayer->_weights->getTensorDesc().getPrecision().size(),
 biasPrecision.size(),
-quantized == nullptr ? 1 : quantized->_weights_quant.scale,
-quantized == nullptr ? 1 : quantized->_dst_quant.scale,
+quantized == nullptr ? 1 : quantized->_weights_quant.GetScale(),
+quantized == nullptr ? 1 : quantized->_dst_quant.GetScale(),
 ptr_inputs,
 ptr_outputs,
 ptr_weights,

@@ -1517,13 +1518,14 @@ void GNAGraphCompiler::PWLPrimitive(InferenceEngine::CNNLayerPtr layer) {
 }
 } while (false);

-IE_ASSERT(!layer->insData.empty());
-IE_ASSERT(!layer->outData.empty());
+GNA_LAYER_ASSERT(layer, !layer->insData.empty());
+GNA_LAYER_ASSERT(layer, !layer->outData.empty());

 auto inputs = layer->insData.begin()->lock();
 auto outputs = *layer->outData.begin();
 auto quantized = InferenceEngine::getInjectedData<QuantizedLayerParams>(layer);
-float output_pwl_scale_factor = quantized != nullptr ? quantized->_dst_quant.scale : 1.0f;
-float input_pwl_scale_factor = quantized != nullptr ? quantized->_src_quant.scale : 1.0f;
+float output_pwl_scale_factor = quantized != nullptr ? quantized->_dst_quant.GetScale() : 1.0f;
+float input_pwl_scale_factor = quantized != nullptr ? quantized->_src_quant.GetScale() : 1.0f;

 auto orientation = kDnnInterleavedOrientation;

@@ -1588,39 +1590,7 @@ void GNAGraphCompiler::PWLPrimitive(InferenceEngine::CNNLayerPtr layer) {
 }

 if (it->second == kActFakeQuantize) {
-    // get params from const input
-    auto GetParamFromInputAsFloat = [](CNNLayerPtr input, size_t idx) {
-        if (input->insData.size() <= idx) {
-            THROW_GNA_LAYER_EXCEPTION(input) << "cannot get data from " << idx << "input";
-        }
-        auto iLayerData = input->insData[idx].lock();
-        if (!iLayerData) {
-            THROW_GNA_LAYER_EXCEPTION(input) << "cannot get data from " << idx << ", input: cannot dereference data weak-pointer";
-        }
-        auto iLayer = getCreatorLayer(iLayerData).lock();
-        if (!iLayer) {
-            THROW_GNA_LAYER_EXCEPTION(input) << "cannot get data from " << idx << ", input: cannot dereference creator layer weak-pointer";
-        }
-        if (!LayerInfo(iLayer).isConst()) {
-            THROW_GNA_LAYER_EXCEPTION(input) << "cannot get data from " << idx << ", input: expected to be of type const, but was: " << iLayer->type;
-        }
-
-        if (!iLayer->blobs.count("custom")) {
-            THROW_GNA_LAYER_EXCEPTION(iLayer) << "cannot get custom blob";
-        }
-        auto data = iLayer->blobs["custom"];
-        if (data->getTensorDesc().getPrecision() != Precision::FP32) {
-            THROW_GNA_LAYER_EXCEPTION(iLayer) << "cannot cast custom blob to type FP32, since it is of type: " << data->getTensorDesc().getPrecision();
-        }
-
-        return data->cbuffer().as<float*>()[0];
-    };
-
-    activation_type.args.fakeQuantize.levels = layer->GetParamAsInt("levels");
-    activation_type.args.fakeQuantize.input_low = GetParamFromInputAsFloat(layer, 1);
-    activation_type.args.fakeQuantize.input_high = GetParamFromInputAsFloat(layer, 2);
-    activation_type.args.fakeQuantize.output_low = GetParamFromInputAsFloat(layer, 3);
-    activation_type.args.fakeQuantize.output_high = GetParamFromInputAsFloat(layer, 4);
+    activation_type = GNAFakeQuantizeLayer(layer).parseAsActivation();
 }

 string actName = "unknown";

@@ -1759,7 +1729,7 @@ void GNAGraphCompiler::PermutePrimitive(InferenceEngine::CNNLayerPtr layer) {
 squeezedInputOrder[1],
 inputs->getPrecision().size(),
 outputs->getPrecision().size(),
-(quantized == nullptr) ? 1.0f : quantized->_dst_quant.scale,
+(quantized == nullptr) ? 1.0f : quantized->_dst_quant.GetScale(),
 ptr_inputs,
 ptr_outputs);
 }

@@ -1774,7 +1744,7 @@ void GNAGraphCompiler::PermutePrimitive(InferenceEngine::CNNLayerPtr layer) {
 squeezedInputOrder[1],
 inputs->getPrecision().size(),
 outputs->getPrecision().size(),
-quantized == nullptr ? 1 : quantized->_dst_quant.scale,
+quantized == nullptr ? 1 : quantized->_dst_quant.GetScale(),
 ptr_inputs,
 ptr_outputs);
 }
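The Power and Eltwise hunks above quantize scalar constants by multiplying them with the corresponding scale factor and saturating at the int16 limit before writing them into weight memory. A minimal sketch of that saturating conversion (assumptions: FLOAT_TO_INT16 rounds to nearest; the symmetric lower clamp is added here only for illustration):

    #include <algorithm>
    #include <cmath>
    #include <cstdint>
    #include <cstdio>

    // Saturating float -> int16 conversion, sketching FLOAT_TO_INT16(std::min(scale * value, INT16_MAX)).
    static int16_t quantizeToInt16(float value, float scaleFactor) {
        float scaled = value * scaleFactor;
        scaled = std::min(scaled, 32767.0f);
        scaled = std::max(scaled, -32768.0f);
        return static_cast<int16_t>(std::lround(scaled));
    }

    int main() {
        std::printf("%d\n", quantizeToInt16(1.5f, 16384.0f));   // 24576: fits in int16
        std::printf("%d\n", quantizeToInt16(4.0f, 16384.0f));   // would be 65536, saturates to 32767
        return 0;
    }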
@@ -37,6 +37,7 @@
 #include "memory/gna_memory_state.hpp"
 #include "gna_model_serial.hpp"
 #include "runtime/gna_float_runtime.hpp"
+#include <layers/gna_fake_quantize_layer.hpp>

 #include <generic_ie.hpp>
 #include <ngraph/pass/manager.hpp>

@@ -351,6 +352,87 @@ void GNAPlugin::InitGNADevice() {
     graphCompiler.setGNAMemoryPtr(gnamem);
 }

+void GNAPlugin::UpdateGnaQuantModeFromNetwork(InferenceEngine::ICNNNetwork & network) {
+    // fp32 emulation mode dont need any modifications to configuration
+    if (config.gnaFlags.sw_fp32) return;
+
+    // search for FQ layers
+    // only supports cases of int16 or int8
+    auto it = details::CNNNetworkIterator(&network);
+    auto end = details::CNNNetworkIterator();
+    for (; it != end; it++) {
+        if (!LayerInfo(*it).isFakeQuantize()) {
+            continue;
+        }
+
+        GNAFakeQuantizeLayer fqLayer(*it);
+        auto inputLayer = fqLayer.getInputLayer();
+
+        // this fake quantize represents data quantization - not weights
+        if (!LayerInfo(inputLayer).isConst()) {
+            continue;
+        }
+        // also in mixed mode i8 should be stated as target precision
+        if (fqLayer.getLevels() <= std::numeric_limits<uint8_t>::max()) {
+            config.gnaPrecision = InferenceEngine::Precision::I8;
+        } else if (fqLayer.getLevels() <= std::numeric_limits<uint16_t>::max()) {
+            config.gnaPrecision = InferenceEngine::Precision::I16;
+        } else {
+            THROW_GNA_LAYER_EXCEPTION(*it)
+                << "unsupported quantisation scheme: number of levels is " << fqLayer.getLevels() << " while only up to "
+                << std::numeric_limits<uint16_t>::max() << " is supported";
+        }
+
+        gnaFlags->fake_quantized = true;
+        config.gnaFlags.fake_quantized = true;
+    }
+}
+
+void GNAPlugin::UpdateInputScaleFromNetwork(InferenceEngine::ICNNNetwork & network) {
+    // fp32 emulation mode dont need any modifications to configuration
+    if (config.gnaFlags.sw_fp32) return;
+
+    // search for FQ layers
+    // only supports cases of int16 or int8
+    InputsDataMap inputs;
+    network.getInputsInfo(inputs);
+    for (auto && input : inputs) {
+        auto data = input.second->getInputData();
+        size_t inputIdx = 0;
+        for (auto && nextToInputLayer : getInputTo(data)) {
+            if (!LayerInfo(nextToInputLayer.second).isFakeQuantize()) {
+                inputIdx++;
+                continue;
+            }
+            // replacing scale factor from this fq layer
+            GNAFakeQuantizeLayer fqLayer(nextToInputLayer.second);
+            auto inputRange = fqLayer.getInputRange();
+            auto outputRange = fqLayer.getOutputRange();
+            if (inputRange.second.size() != 1 || inputRange.second.size() != 1 ||
+                outputRange.second.size() != 1 || outputRange.second.size() != 1) {
+                THROW_GNA_LAYER_EXCEPTION(nextToInputLayer.second)
+                    << "unsupported, per-channel quantization for input layer : " << input.second->name();
+            }
+            float scaleInput = (fqLayer.getLevels() - 1) / (inputRange.second[0] - inputRange.first[0]);
+
+            if (!config.inputScaleFactors.empty()) {
+                gnalog() << "Scale factor calculated during model quantization (" << scaleInput
+                    << ") will be used instead of user input (" << inputsDesc->inputScaleFactors[inputIdx] << ").\n";
+                if (inputsDesc->inputScaleFactors[inputIdx] < scaleInput) {
+                    gnawarn() << "WARNING: Scale factor calculated based on input values (" << inputsDesc->inputScaleFactors[inputIdx]
+                        << ") is smaller than scale factor used to quantize model (" << scaleInput << "). "
+                        << "Input values will be clamped.\n";
+                }
+            }
+
+            config.inputScaleFactors[inputIdx] = scaleInput;
+            inputsDesc->inputScaleFactors[inputIdx] = scaleInput;
+
+            inputIdx++;
+        }
+    }
+}
+
 void GNAPlugin::LoadNetwork(ICNNNetwork & _network) {
     std::shared_ptr<InferenceEngine::details::CNNNetworkImpl> convertedNetwork;
     if (_network.getFunction()) {

@@ -390,6 +472,10 @@ void GNAPlugin::LoadNetwork(ICNNNetwork & _network) {
     THROW_GNA_EXCEPTION << error.c_str();
 }

+// FQ networks now replaces certain flags in the plugin - flags will'be owerritten
+UpdateGnaQuantModeFromNetwork(network);
+UpdateInputScaleFromNetwork(network);
+
 // network optimisation phases
 int passIdx = 0;
 auto run_passes = [&] (const CNNNetPtr& network, bool runBeforeCopy) {

@@ -401,6 +487,10 @@ void GNAPlugin::LoadNetwork(ICNNNetwork & _network) {
     passes->registerPass<UnrollLSTMCellPass>();
     passes->registerPass<RemoveSingleInputConcatPass>();

+    // fake quantisation aware passes
+    passes->registerPass<FuseFQIntoWeightsPass>();
+    passes->registerPass<MoveFakeQuantizeLayerIntoQuantParamsPass>();
+
     passes->registerPass<SubstitutePReluPass>();
     passes->registerPass<SubstituteSoftSignPass>();

@@ -441,6 +531,19 @@ void GNAPlugin::LoadNetwork(ICNNNetwork & _network) {
     // to run all passes need to have two calls to pass manager
     run_passes(newNet, true);
     run_passes(newNet, false);
+} else if (gnaFlags->fake_quantized) {
+    switch (config.gnaPrecision) {
+    case Precision::I16:
+        ModelQuantizer<FakeQuantI16> q16;
+        newNet = q16.quantize(network, run_passes, inputsDesc->inputScaleFactors);
+        break;
+    case Precision::I8:
+        ModelQuantizer<FakeQuantI8> q8;
+        newNet = q8.quantize(network, run_passes, inputsDesc->inputScaleFactors);
+        break;
+    default:
+        THROW_GNA_EXCEPTION << "unsupported GNA precision for quantisation: " << config.gnaPrecision;
+    }
 } else {
     switch (config.gnaPrecision) {
     case Precision::I16:

@@ -452,8 +555,7 @@ void GNAPlugin::LoadNetwork(ICNNNetwork & _network) {
     newNet = q8.quantize(network, run_passes, inputsDesc->inputScaleFactors);
     break;
     default:
-        THROW_GNA_EXCEPTION << "no mans land for GNA precision";
-        break;
+        THROW_GNA_EXCEPTION << "unsupported GNA precision for quantisation: " << config.gnaPrecision;
     }
 }

@@ -470,7 +572,7 @@ void GNAPlugin::LoadNetwork(ICNNNetwork & _network) {
     return;
 }
 printed_properties.emplace_back(
-    "scale factor", std::to_string(quantized->_dst_quant.scale));
+    "scale factor", std::to_string(quantized->_dst_quant.GetScale()));
 });
 #endif

@@ -564,7 +666,7 @@ void GNAPlugin::LoadNetwork(ICNNNetwork & _network) {
 desc.ptrs.resize(gnaFlags->gna_lib_async_threads_num);
 desc.orientation = component.orientation_out;
 desc.num_bytes_per_element = component.num_bytes_per_output;
-desc.scale_factor = quantized != nullptr ? quantized->_dst_quant.scale : 1.0f;
+desc.scale_factor = quantized != nullptr ? quantized->_dst_quant.GetScale() : 1.0f;
 // TODO: this need to be fixed
 desc.num_elements = component.num_rows_out;

@@ -623,7 +725,7 @@ void GNAPlugin::LoadNetwork(ICNNNetwork & _network) {
 // TODO: what is orientation for concat
 desc.orientation = kDnnInterleavedOrientation;
 desc.num_bytes_per_element = layer->outData.front()->getPrecision().size();
-desc.scale_factor = quantized != nullptr ? quantized->_dst_quant.scale : 1.0f;
+desc.scale_factor = quantized != nullptr ? quantized->_dst_quant.GetScale() : 1.0f;
 desc.num_elements = concatConnection->second.reserved_size / desc.num_bytes_per_element;

 // binding ptr for first infer request - then others will be setup during relocation

@@ -219,6 +219,8 @@ class GNAPlugin : public InferenceEngine::IInferencePlugin {
     int idx = 0);

     void UpdateFieldsFromConfig();
+    void UpdateGnaQuantModeFromNetwork(InferenceEngine::ICNNNetwork &);
+    void UpdateInputScaleFromNetwork(InferenceEngine::ICNNNetwork &);
 };

 } // namespace GNAPluginNS
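UpdateGnaQuantModeFromNetwork above picks the target precision from the FakeQuantize level count (up to 255 levels selects I8, up to 65535 selects I16), and UpdateInputScaleFromNetwork derives the input scale factor as (levels - 1) / (input_high - input_low). A worked example of that formula (standalone sketch, not plugin code):

    #include <cstdio>

    int main() {
        const int levels = 256;                       // an int8-style FakeQuantize attached to the input
        const float inputLow = -1.28f, inputHigh = 1.27f;
        const float scaleInput = (levels - 1) / (inputHigh - inputLow);   // 255 / 2.55 = 100
        std::printf("input scale factor: %.2f\n", scaleInput);
        return 0;
    }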
@@ -72,5 +72,5 @@ if (!(expr)) { \
 }
 #define THROW_GNA_EXCEPTION THROW_IE_EXCEPTION << "[GNAPlugin] in function " << __PRETTY_FUNCTION__<< ": "
 #define THROW_GNA_LAYER_EXCEPTION(layer) THROW_GNA_EXCEPTION << LAYER_NAME(layer)
-#define LAYER_NAME(layer) layer->type << " layer : \"" << layer->name << "\" "
+#define LAYER_NAME(layer) (layer)->type << " layer : \"" << (layer)->name << "\" "
@@ -0,0 +1,164 @@
// Copyright (C) 2018-2020 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma once

#include "gna_layer_info.hpp"
#include "gna_plugin_log.hpp"
#include "gna_layer_helpers.hpp"
#include "frontend/weights_converter.hpp"

#include <ie_algorithm.hpp>

namespace GNAPluginNS {
class GNAFakeQuantizeLayer {
    InferenceEngine::CNNLayerPtr fqLayer;
 public :
    GNAFakeQuantizeLayer(InferenceEngine::CNNLayerPtr fqLayer)
        : fqLayer(fqLayer) {
        if (!LayerInfo(fqLayer).isFakeQuantize()) {
            THROW_GNA_LAYER_EXCEPTION(fqLayer) << "cannot parse as fake quantize";
        }
    }

    /**
     * @brief convert FQ layer directly to gna-pwl activation layer
     */
    DnnActivation parseAsActivation() const {
        DnnActivation fqActivation;

        fqActivation.args.fakeQuantize.levels = fqLayer->GetParamAsInt("levels");
        auto inputShape  = getShapeForRange(fqLayer, 1);
        auto outputShape = getShapeForRange(fqLayer, 3);

        // TODO: check shapes broadcasting to shape of input at 0
        auto inputRangeSize = InferenceEngine::details::product(inputShape.begin(), inputShape.end());
        auto outputRangeSize = InferenceEngine::details::product(outputShape.begin(), outputShape.end());

        fqActivation.args.fakeQuantize.inputPerChannel = inputRangeSize != 1;
        fqActivation.args.fakeQuantize.input_low   = getParamFromInputAsFloats(fqLayer, 1);
        fqActivation.args.fakeQuantize.input_high  = getParamFromInputAsFloats(fqLayer, 2);

        fqActivation.args.fakeQuantize.outputPerChannel = outputRangeSize != 1;
        fqActivation.args.fakeQuantize.output_low  = getParamFromInputAsFloats(fqLayer, 3);
        fqActivation.args.fakeQuantize.output_high = getParamFromInputAsFloats(fqLayer, 4);
        fqActivation.type = kActFakeQuantize;

        return fqActivation;
    }
    /**
     * retrieves input blob for FQ layer that connected to const layer
     */
    InferenceEngine::Blob::Ptr getConstInputData() const {
        return LayerUtils::getParamFromInputAsBlob(fqLayer, 0);
    }

    /**
     * fake quantize has 5 input layers, while 4 of them always constant layer, and 1 might be a tensor - connection
     */
    InferenceEngine::CNNLayerPtr getInputLayer() const {
        return getInputLayerAt(fqLayer, 0);
    }

    int32_t getLevels() {
        return fqLayer->GetParamAsInt("levels");
    }

    std::pair<std::vector<float>, std::vector<float>> getInputRange() {
        return getRange(fqLayer, 1);
    }

    std::pair<std::vector<float>, std::vector<float>> getOutputRange() {
        return getRange(fqLayer, 3);
    }

    operator InferenceEngine::CNNLayerPtr () const {
        return fqLayer;
    }

    InferenceEngine::CNNLayerPtr operator -> () const {
        return fqLayer;
    }
    InferenceEngine::CNNLayerPtr operator * () const {
        return fqLayer;
    }
 protected :

    static std::pair<std::vector<float>, std::vector<float>> getRange(InferenceEngine::CNNLayerPtr input, size_t idx) {
        auto shape = getShapeForRange(input, idx);
        auto rangeSize = InferenceEngine::details::product(shape.begin(), shape.end());

        auto dataMin = LayerUtils::getParamFromInputAsBlob(input, idx);
        auto dataMax = LayerUtils::getParamFromInputAsBlob(input, idx + 1);
        std::vector<float> minValues(rangeSize), maxValues(rangeSize);
        switch (dataMin->getTensorDesc().getPrecision()) {
        case InferenceEngine::Precision::FP32: {
            memcpy(&minValues[0], dataMin->buffer().as<float*>(), rangeSize * sizeof(float));
            memcpy(&maxValues[0], dataMax->buffer().as<float*>(), rangeSize * sizeof(float));
            break;
        }
        case InferenceEngine::Precision::FP16: {
            auto dataMinFP32 = make_fp32_blob(dataMin);
            memcpy(&minValues[0], dataMinFP32->buffer().as<float*>(), rangeSize * sizeof(float));

            auto dataMaxFP32 = make_fp32_blob(dataMax);
            memcpy(&maxValues[0], dataMaxFP32->buffer().as<float*>(), rangeSize * sizeof(float));
            break;
        }
        default:
            THROW_GNA_LAYER_EXCEPTION(input) << "cannot cast custom blob to type FP32, since it is of type: "
                << dataMin->getTensorDesc().getPrecision();
            break;
        }

        return {minValues, maxValues};
    }

    static float* getParamFromInputAsFloats(InferenceEngine::CNNLayerPtr input, size_t idx) {
        auto data = LayerUtils::getParamFromInputAsBlob(input, idx);
        if (data->getTensorDesc().getPrecision() != InferenceEngine::Precision::FP32) {
            THROW_GNA_LAYER_EXCEPTION(input) << "cannot cast custom blob to type FP32, since it is of type: "
                << data->getTensorDesc().getPrecision();
        }
        return data->buffer().as<float*>();
    }

    static InferenceEngine::SizeVector getShapeFromInput(InferenceEngine::CNNLayerPtr input, size_t idx) {
        auto data = LayerUtils::getParamFromInputAsBlob(input, idx);
        return data->getTensorDesc().getDims();
    }

    static InferenceEngine::CNNLayerPtr getInputLayerAt(InferenceEngine::CNNLayerPtr input, size_t idx) {
        if (input->insData.size() <= idx) {
            THROW_GNA_LAYER_EXCEPTION(input) << "cannot get data from " << idx << "input";
        }
        auto iLayerData = input->insData[idx].lock();
        if (!iLayerData) {
            THROW_GNA_LAYER_EXCEPTION(input) << "cannot get data from " << idx
                << ", input: cannot dereference data weak-pointer";
        }
        auto iLayer = getCreatorLayer(iLayerData).lock();
        if (!iLayer) {
            THROW_GNA_LAYER_EXCEPTION(input) << "cannot get data from " << idx
                << ", input: cannot dereference creator layer weak-pointer";
        }
        return iLayer;
    }

    static InferenceEngine::SizeVector getShapeForRange(InferenceEngine::CNNLayerPtr input, size_t idx) {
        auto lowShape = getShapeFromInput(input, idx);
        auto highShape = getShapeFromInput(input, idx + 1);
        if (lowShape.size() != highShape.size()) {
            THROW_GNA_LAYER_EXCEPTION(input) << "shapes mismatch for " << idx << " and " << idx + 1 << " inputs";
        }
        for (size_t i = 0; i != lowShape.size(); i++) {
            if (lowShape[i] != highShape[i]) {
                THROW_GNA_LAYER_EXCEPTION(input) << "shapes mismatch for " << idx << " and " << idx + 1 << " inputs";
            }
        }
        return lowShape;
    }
};
}  // namespace GNAPluginNS
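parseAsActivation above turns a FakeQuantize layer into a GNA PWL activation whose ranges may be per-tensor or per-channel. For reference, a standalone sketch of the per-tensor FakeQuantize transfer function such an activation represents (clamp to the input range, snap to one of `levels` steps, map into the output range):

    #include <algorithm>
    #include <cmath>
    #include <cstdio>

    static float fakeQuantize(float x, float inLow, float inHigh, float outLow, float outHigh, int levels) {
        x = std::min(std::max(x, inLow), inHigh);                  // clamp to the input range
        const float step = (inHigh - inLow) / (levels - 1);
        const float q = std::round((x - inLow) / step);            // index of the nearest level
        return outLow + q * (outHigh - outLow) / (levels - 1);     // map level index to the output range
    }

    int main() {
        // 256 levels over [-1.28, 1.27]: both 0.300 and 0.304 snap to 0.30
        std::printf("%f\n", fakeQuantize(0.304f, -1.28f, 1.27f, -1.28f, 1.27f, 256));
        return 0;
    }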
inference-engine/src/gna_plugin/layers/gna_layer_helpers.hpp (new file, 44 lines)
@@ -0,0 +1,44 @@
// Copyright (C) 2018-2020 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma once

#include "gna_layer_info.hpp"
#include "gna_plugin_log.hpp"

namespace GNAPluginNS {
namespace LayerUtils {
/**
 * @brief retrievs blob from const layer connected to certain layer
 * @param input
 * @param idx
 */
inline InferenceEngine::Blob::Ptr getParamFromInputAsBlob(InferenceEngine::CNNLayerPtr input, size_t idx) {
    if (input->insData.size() <= idx) {
        THROW_GNA_LAYER_EXCEPTION(input) << "cannot get data from " << idx << "input";
    }
    auto iLayerData = input->insData[idx].lock();
    if (!iLayerData) {
        THROW_GNA_LAYER_EXCEPTION(input) << "cannot get data from " << idx
            << ", input: cannot dereference data weak-pointer";
    }
    auto iLayer = getCreatorLayer(iLayerData).lock();
    if (!iLayer) {
        THROW_GNA_LAYER_EXCEPTION(input) << "cannot get data from " << idx
            << ", input: cannot dereference creator layer weak-pointer";
    }
    if (!LayerInfo(iLayer).isConst()) {
        THROW_GNA_LAYER_EXCEPTION(input) << "cannot get data from " << idx
            << ", input: expected to be of type const, but was: " << iLayer->type;
    }

    if (!iLayer->blobs.count("custom")) {
        THROW_GNA_LAYER_EXCEPTION(iLayer) << "cannot get custom blob";
    }

    return iLayer->blobs["custom"];
}
}  // namespace LayerUtils
}  // namespace GNAPluginNS
@@ -205,8 +205,8 @@ class LayerInfo {
 bool isConcat() const noexcept {
     return isOfType("concat");
 }
-bool isFakeQnatize() const noexcept {
-    return isOfType("FakeQnatize");
+bool isFakeQuantize() const noexcept {
+    return isOfType("FakeQuantize");
 }
 bool isNonFunctional() const noexcept {
     return isOfType("reshape") || isOfType("squeeze") || isOfType("unsqueeze") || isTrivialPermute();
@@ -71,7 +71,7 @@ namespace memory {
 case InferenceEngine::Precision::I16: {
     if (new_state_precision == InferenceEngine::Precision::FP32) {
         auto quantized = InferenceEngine::getInjectedData<QuantizedLayerParams>(state->getInput());
-        auto scale_factor = quantized != nullptr ? quantized->_dst_quant.scale : 1.0f;
+        auto scale_factor = quantized != nullptr ? quantized->_dst_quant.GetScale() : 1.0f;
         GNAPluginNS::ConvertToInt16(static_cast<int16_t*>(state->gna_ptr),
             newState->buffer().as<float*>(),
             1,

@@ -97,7 +97,7 @@ namespace memory {

 if (state->getInput() && state_precision == InferenceEngine::Precision::I16) {
     auto quantized = InferenceEngine::getInjectedData<QuantizedLayerParams>(state->getInput());
-    auto scale_factor = quantized != nullptr ? quantized->_dst_quant.scale : 1.0f;
+    auto scale_factor = quantized != nullptr ? quantized->_dst_quant.GetScale() : 1.0f;

     auto result_blob = make_blob_with_precision(InferenceEngine::TensorDesc(InferenceEngine::Precision::FP32,
         InferenceEngine::SizeVector({ 1, elements }),
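The memory-state hunks above convert between the int16 GNA state buffer and FP32 using the destination quantization scale factor. A minimal sketch of the read-back direction, assuming the usual symmetric convention (fp32 value = int16 value / scale factor):

    #include <cstddef>
    #include <cstdint>
    #include <cstdio>
    #include <vector>

    // Reads an int16 GNA state buffer back as FP32 by dividing by the layer's output scale factor.
    std::vector<float> stateToFp32(const int16_t* gnaState, std::size_t elements, float scaleFactor) {
        std::vector<float> out(elements);
        for (std::size_t i = 0; i < elements; ++i) {
            out[i] = static_cast<float>(gnaState[i]) / scaleFactor;
        }
        return out;
    }

    int main() {
        const int16_t state[] = {16384, -8192};
        auto fp32 = stateToFp32(state, 2, 16384.0f);
        std::printf("%f %f\n", fp32[0], fp32[1]);   // 1.0 -0.5
        return 0;
    }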
@@ -25,21 +25,25 @@
 #include <legacy/net_pass.h>
 #include <layers/gna_copy_layer.hpp>

+#include "backend/dnn_types.h"
 #include "gna_plugin_log.hpp"
 #include "frontend/quantization.h"
 #include "frontend/quantized_layer_params.hpp"
 #include <layers/gna_copy_layer.hpp>
+#include <layers/gna_fake_quantize_layer.hpp>
+#include <runtime/pwl.h>
 #include "gna_graph_tools.hpp"
 #include "gna_pass_manager.hpp"
 #include "layers/gna_layer_info.hpp"
 #include "gna_upstream_iterator.hpp"
+#include "frontend/quantization.h"


 using namespace InferenceEngine;
 using namespace InferenceEngine::details;
 using namespace GNAPluginNS;

-#define pass_trace() gnalog() << "[" << getName() << "]"
+#define pass_trace() gnalog() << "[" << getName() << "] "

 std::shared_ptr<IPassManager> BasePass::getPassManager() {
     auto sharedMgr = mgr.lock();

@@ -1672,6 +1676,232 @@ void FuseMultipleIdentitiesPass::run() {
     }
 }

+void FuseFQIntoWeightsPass::run() {
+    auto isNonFunctional = [](CNNLayerPtr ptr) {
+        return LayerInfo(ptr).isNonFunctional();
+    };
+
+    auto assignWeightsAndBiases = [](CNNLayerPtr layer, Blob::Ptr weights, Blob::Ptr biases) {
+        auto weigtableLayer = std::dynamic_pointer_cast<WeightableLayer>(layer);
+        if (nullptr == weigtableLayer) {
+            THROW_GNA_LAYER_EXCEPTION(layer) << " not a weightable layer";
+        }
+        weigtableLayer->_weights = weights;
+        weigtableLayer->_biases = biases;
+        weigtableLayer->blobs["weights"] = weights;
+        weigtableLayer->blobs["biases"] = biases;
+    };
+
+    for (auto &l : *pLayers) {
+        if (!LayerInfo(l).isFakeQuantize()) {
+            continue;
+        }
+        // determine whether this FQ is actually ends into weigtable layer
+        auto fqLayer = l;
+        if (!CNNNetHasNextLayerSkipCertain(fqLayer, 0, 0, isNonFunctional)) {
+            continue;
+        }
+
+        auto weightableLayer = CNNNetGetNextLayerSkipCertain(fqLayer, 0, 0, isNonFunctional).first;
+        if (!LayerInfo(weightableLayer).isWeightable()) {
+            continue;
+        }
+        if (weightableLayer->insData.size() != 3) {
+            continue;
+        }
+
+        // check whether this FQ represents weights - it need to be at index 1 of weightable layer
+        auto prevLayerAt1 = CNNNetPrevLayerSkipCertain(weightableLayer, 1, isNonFunctional);
+
+        if (prevLayerAt1 != fqLayer) {
+            continue;
+        }
+
+        // now this FQ layer represents weights - lets apply it and fuse to given weightable layer.
+        pass_trace() << "found " << LAYER_NAME(fqLayer) << " that will be converted to weights of "
+            << LAYER_NAME(weightableLayer) << "\n";
+
+        GNAFakeQuantizeLayer gnaFakeQuantizeLayer(fqLayer);
+
+        auto biases = LayerUtils::getParamFromInputAsBlob(weightableLayer, 2);
+        auto quantizedWeights = gnaFakeQuantizeLayer.getConstInputData();
+
+        // 1. broke existing connections - by detaching fq subgraph from rest of graph
+        auto prevData = weightableLayer->insData[1].lock();
+        auto prevLayer = getCreatorLayer(prevData).lock();
+        auto weightDims = prevLayer->outData.front()->getDims();
+        prevLayer->outData.clear();
+        weightableLayer->insData.resize(1);
+
+        // 2. running FQ function for given layer
+        if (weightDims.size() != 2) {
+            THROW_GNA_LAYER_EXCEPTION(fqLayer) << " layout of weigths not equal to NC not yet supported";
+        }
+        auto outputSize = details::product(weightDims.begin(), weightDims.end());
+
+        // depending on compute precision weights will be recreated
+        // for integer mode - weights might be simply copied - to avoid furter quantisations overhead
+        auto quantized = InferenceEngine::getInjectedData<QuantizedLayerParams>(weightableLayer);
+        if (quantized) {
+            // assign already quantized Weights
+            assignWeightsAndBiases(weightableLayer, quantizedWeights, biases);
+
+            // modify scale factors for quantized component
+            auto levels = gnaFakeQuantizeLayer.getLevels();
+            auto inputRange = gnaFakeQuantizeLayer.getInputRange();
+            auto outputRange = gnaFakeQuantizeLayer.getOutputRange();
+            if (outputRange.first.size() != outputRange.second.size()) {
+                THROW_GNA_LAYER_EXCEPTION(fqLayer) << " number of min and max data must be equal, min size: "
+                    << outputRange.first.size() << ", max size: " << outputRange.second.size();
+            }
+
+            if (inputRange.first.size() != outputRange.first.size() ||
+                inputRange.second.size() != outputRange.second.size()) {
+                THROW_GNA_LAYER_EXCEPTION(fqLayer) << " size of input and output range differs. "
+                    << "input min size: " << inputRange.first.size() << ", "
+                    << "output min size: " << outputRange.first.size() << ", "
+                    << "input max size: " << inputRange.second.size() << ", "
+                    << "output max size: " << outputRange.second.size();
+            }
+
+            if (levels > std::numeric_limits<uint8_t>::max() && outputRange.first.size() > 1) {
+                THROW_GNA_LAYER_EXCEPTION(fqLayer) << " unsupported per-channel quantization for int16 weights."
+                    << " Per-channel quantization ";
+            }
+
+            // check if
+            // - weights were float values and need to be quantized,
+            // - weights are integer values and quantization can be skipped
+            for (size_t i = 0; i < outputRange.first.size(); ++i) {
+                if (inputRange.first[i] > outputRange.first[i] ||
+                    inputRange.second[i] > outputRange.second[i]) {
+                    quantized->_weights_quantized = true;
+                    break;
+                }
+            }
+
+            quantized->_weights_quant.SetMinValues(outputRange.first);
+            quantized->_weights_quant.SetMaxValues(outputRange.second);
+            quantized->_weights_quant.SetLevels(levels);
+
+            // lets find out minimum scale factor among channels
+            if (quantized->_weights_quant.GetMinValues().empty()) {
+                THROW_GNA_LAYER_EXCEPTION(fqLayer) << " per channel/tensor weigths scales are missed";
+            }
+            auto getScale = [&quantized](size_t i) {
+                return (quantized->_weights_quant.GetLevels() - 1) /
+                    (quantized->_weights_quant.GetMaxValues()[i] - quantized->_weights_quant.GetMinValues()[i]);
+            };
+
+            float min_channel_scale = getScale(0);
+            for (uint32_t i = 1; i < quantized->_weights_quant.GetMinValues().size(); i++) {
+                min_channel_scale = std::min(min_channel_scale, getScale(i));
+            }
+
+            auto multiplier = 1.0f;
+            if (quantized->_weights_quant.GetLevels() <= std::numeric_limits<uint8_t>::max()) {
+                // GNA supports additional multiplier for only 8bit weights.
+                // The multipler is used to extend dynamic range.
+                multiplier = MAX_OUT_MULTIPLIER;
+            }
+
+            // Common weights scale calculation
+            quantized->_weights_quant.SetScale(min_channel_scale * multiplier);
+            continue;
+        }
+
+        intel_dnn_component_t component;
+        component.num_columns_in = weightDims[1];
+        component.num_rows_in = weightDims[0];
+
+        intel_piecewiselinear_t *transform = reinterpret_cast<intel_piecewiselinear_t *>(&component.op.pwl);
+        transform->func_id = gnaFakeQuantizeLayer.parseAsActivation();
+
+        auto quantizedWeightsData = quantizedWeights->buffer();
+        component.ptr_inputs = quantizedWeightsData.as<float*>();
+
+        auto dequantizedWeights = make_shared_blob<float>(TensorDesc(Precision::FP32, {outputSize}, Layout::C));
+        dequantizedWeights->allocate();
+
+        auto resultBuffer = dequantizedWeights->buffer();
+        component.ptr_outputs = resultBuffer.as<float*>();
+
+        PwlApply32(&component, 0, component.num_rows_in - 1, 0, component.num_columns_in - 1);
+
+        // 3. assign dequantized const blob to weightable layer
+        assignWeightsAndBiases(weightableLayer, dequantizedWeights, biases);
+    }
+}
+
|
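For readers following the pass above: the common weight scale factor is simply the smallest per-channel scale, optionally boosted for 8-bit weights. A minimal free-standing sketch of that selection; the function name, the 255-level threshold spelled out as a literal, and the parameter names are illustrative, not plugin code:

#include <algorithm>
#include <cstddef>
#include <vector>

// scale_i = (levels - 1) / (max_i - min_i); the common scale is the per-channel minimum,
// multiplied by an extra factor only when the weights fit into 8 bits.
float pickWeightScale(const std::vector<float>& mins, const std::vector<float>& maxs,
                      int levels, float maxOutMultiplier) {
    float minChannelScale = (levels - 1) / (maxs[0] - mins[0]);
    for (std::size_t i = 1; i < mins.size(); ++i) {
        minChannelScale = std::min(minChannelScale, (levels - 1) / (maxs[i] - mins[i]));
    }
    return levels <= 255 ? minChannelScale * maxOutMultiplier : minChannelScale;
}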
void MoveFakeQuantizeLayerIntoQuantParamsPass::run() {
    auto quantized = InferenceEngine::getInjectedData<QuantizedLayerParams>(pLayers->front());
    if (!quantized) {
        return;
    }

    auto donotSkip = [](CNNLayerPtr) {
        return false;
    };
    for (auto &&l : *pLayers) {
        if (!LayerInfo(l).isFakeQuantize()) {
            continue;
        }
        GNAFakeQuantizeLayer fqLayer(l);
        auto prevLayer = CNNNetPrevLayerSkipCertain(*fqLayer, 0, donotSkip);
        if (prevLayer->outData.size() != 1) {
            THROW_GNA_LAYER_EXCEPTION(prevLayer) << " fake quantize input connected to something else is not supported";
        }

        auto inputRange = fqLayer.getInputRange();
        auto outputRange = fqLayer.getOutputRange();
        if (inputRange.first.size() != 1 || inputRange.second.size() != 1 ||
            outputRange.first.size() != 1 || outputRange.second.size() != 1) {
            THROW_GNA_LAYER_EXCEPTION(fqLayer) << " unsupported per-channel quantisation";
        }

        float fqLevels = fqLayer.getLevels();
        float scaleInput = (fqLevels - 1) / (inputRange.second[0] - inputRange.first[0]);
        float scaleOutputs = (fqLevels - 1) / (outputRange.second[0] - outputRange.first[0]);

        // Before the FQ layer is removed, the previous layer has to be updated with its quantization data
        auto quantParamsPrevLayer = InferenceEngine::getInjectedData<QuantizedLayerParams>(prevLayer);
        quantParamsPrevLayer->_dst_quant.SetScale(scaleOutputs);
        quantParamsPrevLayer->_dst_quant.SetLevels(fqLevels);
        quantParamsPrevLayer->_dst_quant.SetMinValues({ inputRange.first[0] });
        quantParamsPrevLayer->_dst_quant.SetMaxValues({ inputRange.second[0] });

        auto prevData = prevLayer->outData.front();
        getInputTo(prevLayer->outData.front()).clear();

        // Find all output layers connected to FQ
        auto nextLayers = CNNNetGetAllNextLayersSkipCertain(*fqLayer, -1, donotSkip);
        if (nextLayers.empty()) {
            THROW_GNA_LAYER_EXCEPTION(fqLayer) << " fake quantize does not have any output layers connected";
        }

        // Connect all layers after FQ to the layer that is before FQ
        // and propagate quantization data
        for (size_t i = 0; i < nextLayers.size(); ++i) {
            auto insDatas = CNNLayerFindInsDataIdxes(fqLayer->outData.front(), nextLayers[i]);
            if (insDatas.size() != 1) {
                THROW_GNA_LAYER_EXCEPTION(fqLayer) << " fake quantize connection to layer: "
                    << LAYER_NAME(nextLayers[i]) << " is not correct";
            }

            nextLayers[i]->insData[insDatas.front()] = prevData;
            getInputTo(prevLayer->outData.front())[nextLayers[i]->name] = nextLayers[i];

            // After the layer gets removed, absorb its params into the QuantParams structure,
            // replacing the scale factor with the one from this FQ layer
            auto quantParamsNextLayer = InferenceEngine::getInjectedData<QuantizedLayerParams>(nextLayers[i]);
            quantParamsNextLayer->_src_quant.SetScale(scaleOutputs);
            quantParamsNextLayer->_src_quant.SetLevels(fqLevels);
            quantParamsNextLayer->_src_quant.SetMinValues({ outputRange.first[0] });
            quantParamsNextLayer->_src_quant.SetMaxValues({ outputRange.second[0] });
        }
    }
}

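A small, hypothetical illustration of what this pass extracts from a per-tensor FakeQuantize: both scale factors are (levels - 1) divided by the corresponding range width, and the output scale ends up in _dst_quant of the producer and _src_quant of every consumer. The struct and function names below are made up for the sketch:

struct FqRange {
    float low;
    float high;
    float levels;
};

// scale factor derived from a FakeQuantize range, as in the pass above
inline float fqScale(const FqRange& r) {
    return (r.levels - 1) / (r.high - r.low);
}

// e.g. 256 levels over [-2.0, 2.0] gives 255 / 4 = 63.75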
int PassManager::run(int index) {
#ifdef PLOT
    auto dumpNetworkAfterPass = [&index, this] (std::shared_ptr<Pass> pass) {
@ -199,6 +199,17 @@ DECL_PASS(FuseMultipleIdentities);
*/
DECL_PASS(BroadcastConst);

/**
* @brief runs static quantisation on the given floating-point weights and replaces FakeQuantize with const blobs
*/
DECL_PASS(FuseFQIntoWeights);

/**
* @brief removes all FakeQuantize layers while moving their settings into QuantParams of the affected layers
*/
DECL_PASS(MoveFakeQuantizeLayerIntoQuantParams);


struct PassManagerSettings {
    Policy policy;
    /// @brief whether to run passes before copy
@ -1047,25 +1047,32 @@ void PwlApply32(intel_dnn_component_t *component,
        }
            break;
        case kActFakeQuantize: {
            auto levels = transform->func_id.args.fakeQuantize.levels;
            for (uint32_t i = num_row_start; i <= num_row_end; i++) {
                auto inputChannel = transform->func_id.args.fakeQuantize.inputPerChannel ? i : 0;
                auto outputChannel = transform->func_id.args.fakeQuantize.outputPerChannel ? i : 0;

                auto input_low = transform->func_id.args.fakeQuantize.input_low[inputChannel];
                auto input_high = transform->func_id.args.fakeQuantize.input_high[inputChannel];
                auto output_low = transform->func_id.args.fakeQuantize.output_low[outputChannel];
                auto output_high = transform->func_id.args.fakeQuantize.output_high[outputChannel];

                // TODO: this special modification for speed-up compute gives a different result than the straight FQ formula,
                // but it is used in the reference graph FakeQuantize implementations, so we need to honor it for a while
                float scaleInput = (input_high - input_low) / (levels - 1);
                float scaleOutputs = (output_high - output_low) / (levels - 1);

                for (uint32_t j = num_col_start; j <= num_col_end; j++) {
                    auto offset = i * num_columns + j;
                    auto x = ptr_in[offset];

                    if (x < std::min(input_low, input_high)) {
                        ptr_out[offset] = output_low;
                    } else if (x > std::max(input_low, input_high)) {
                        ptr_out[offset] = output_high;
                    } else {
                        ptr_out[offset] = nearbyint((x - input_low) / scaleInput) * scaleOutputs + output_low;
                    }
                }
            }
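As a sanity check on the kActFakeQuantize branch above, here is a stand-alone version of the per-element formula with made-up ranges (input and output both [0, 10], 256 levels); it only illustrates the rounding behaviour and is not plugin code:

#include <cmath>
#include <cstdio>

int main() {
    const float input_low = 0.0f, input_high = 10.0f;
    const float output_low = 0.0f, output_high = 10.0f;
    const int levels = 256;
    const float scaleInput = (input_high - input_low) / (levels - 1);
    const float scaleOutputs = (output_high - output_low) / (levels - 1);
    const float x = 3.14f;
    // x snaps to the nearest of the 256 representable values: ~3.1373
    const float y = std::nearbyint((x - input_low) / scaleInput) * scaleOutputs + output_low;
    std::printf("%f -> %f\n", x, y);
    return 0;
}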
@ -41,17 +41,45 @@ const std::vector<std::pair<std::string, ConfigType>> gnaQuantModes = {
// {"sw_exact_i8", configInt8},
};

const std::vector<std::vector<size_t>> inputShapes = {
    {3, 10, 5, 6},
    {1, 1, 1, 1},
    {1, 8, 8, 256},
    {1, 2, 2, 2},
    {1, 3, 4, 5},
};
const std::vector<std::vector<size_t>> constShapes = {{1}};
const std::vector<size_t> levels = {16, 255, 256};

const std::vector<std::vector<float>> fqArgs = {{}};
const std::vector<std::vector<float>> inputParams = {{-10, 10, 0.1}, {}};

const std::vector<float> fqInputMin = {0, 1, 2, 3, 4, 5};
const std::vector<float> fqInputMax = {10, 9, 8, 7, 6};
const std::vector<float> fqOutputMin = {1, 2, 3, 4};
const std::vector<float> fqOutputMax = {8, 7, 6, 5};

std::vector<std::vector<float>> getInputOutputShapes(const std::vector<float> inputsMin,
                                                     const std::vector<float> inputsMax,
                                                     const std::vector<float> OutputsMin,
                                                     const std::vector<float> OutputsMax,
                                                     std::vector<std::vector<float>> fqArg) {
    for (const auto& inputMin : inputsMin) {
        for (const auto& inputMax : inputsMax) {
            for (const auto& outputMin : OutputsMin) {
                for (const auto& outputMax : OutputsMax) {
                    fqArg.push_back({inputMin, inputMax, outputMin, outputMax});
                }
            }
        }
    }
    return fqArg;
}

const auto fqParams = ::testing::Combine(
    ::testing::ValuesIn(levels),
    ::testing::ValuesIn(constShapes),
    ::testing::ValuesIn(getInputOutputShapes(fqInputMin, fqInputMax, fqOutputMin, fqOutputMax, fqArgs)),
    ::testing::ValuesIn(inputParams)
);
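Note on the helper above: it appends the full cross product of the candidate boundary values to whatever fqArgs already contains, so with the vectors defined here the suite ends up with 1 + 6 * 5 * 4 * 4 = 481 FakeQuantize range combinations. A usage sketch, with the count derived from the sizes above:

// {} from fqArgs, then {0, 10, 1, 8}, {0, 10, 1, 7}, ... - 481 entries in total
const auto allFqArgs = getInputOutputShapes(fqInputMin, fqInputMax, fqOutputMin, fqOutputMax, fqArgs);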
@ -0,0 +1,125 @@
// Copyright (C) 2020 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#include <vector>
#include <gna/gna_config.hpp>

#include "subgraph_tests/two_fake_quantize_to_fullyconnected.hpp"
#include "common_test_utils/test_constants.hpp"

using namespace LayerTestsDefinitions;

namespace {

const std::vector<InferenceEngine::Precision> netPrecisions = {
    InferenceEngine::Precision::FP32, InferenceEngine::Precision::FP16,
};

using ConfigType = std::map<std::string, std::string>;
const ConfigType configFP32 = {
    {"GNA_DEVICE_MODE", "GNA_SW_FP32"},
};

const ConfigType configSWExact = {
    {"GNA_DEVICE_MODE", "GNA_SW_EXACT"},
    {"GNA_COMPACT_MODE", "NO"}
};


/**
 * @brief specific quantisation mode to be used internally
 */
const std::vector<std::pair<std::string, ConfigType>> gnaQuantModes = {
    {"sw_fp32", configFP32},
};

const std::vector<std::pair<std::string, ConfigType>> gnaQuantModes_I8 = {
    {"gna_sw_exact", configSWExact},
};

const std::vector<std::vector<size_t>> inputShapes = {
    {1, 440}
};
const std::vector<std::vector<std::vector<size_t>>> constShapes = {
    {{1}, {2048, 1}}
};

const std::vector<std::vector<std::vector<size_t>>> constShapes_int16 = {
    {{1}, {1}}
};

const std::vector<size_t> levels_fp = {255, 65535};
const std::vector<std::vector<size_t>> levels_i16 = {{65535, 65535}, {32767, 32767}, {16383, 16383}};
const std::vector<std::vector<size_t>> levels_i8 = {{255, 255}};

const std::vector<std::vector<float>> fqArgs = {{-2.0f, 2.0f, -2.0f, 2.0f}};
const std::vector<std::vector<float>> inputParams = {{-64, 64, 1}, {-10, 10, 0.1}};
const std::vector<std::vector<float>> inputParams_I8 = {{-2.0f, 2.0f, 0.1f}};

const std::vector<bool> biases = {false, true};

const auto fqParams = ::testing::Combine(
    ::testing::Values(levels_fp),
    ::testing::ValuesIn(constShapes),
    ::testing::ValuesIn(fqArgs),
    ::testing::ValuesIn(inputParams)
);

const auto fqParams_I8 = ::testing::Combine(
    ::testing::ValuesIn(levels_i8),
    ::testing::ValuesIn(constShapes),
    ::testing::ValuesIn(fqArgs),
    ::testing::ValuesIn(inputParams_I8)
);

const auto fqParams_I16 = ::testing::Combine(
    ::testing::ValuesIn(levels_i16),
    ::testing::ValuesIn(constShapes_int16),
    ::testing::ValuesIn(fqArgs),
    ::testing::ValuesIn(inputParams_I8)
);

INSTANTIATE_TEST_CASE_P(smoke_FakeQuantize_subgraph, FakeQuantizeSubgraphTest,
                        ::testing::Combine(
                            fqParams,
                            ::testing::ValuesIn(netPrecisions),
                            ::testing::Values(InferenceEngine::Precision::UNSPECIFIED),
                            ::testing::Values(InferenceEngine::Precision::UNSPECIFIED),
                            ::testing::Values(InferenceEngine::Layout::ANY),
                            ::testing::Values(InferenceEngine::Layout::ANY),
                            ::testing::ValuesIn(inputShapes),
                            ::testing::Values(CommonTestUtils::DEVICE_GNA),
                            ::testing::ValuesIn(gnaQuantModes),
                            ::testing::ValuesIn(biases)),
                        FakeQuantizeSubgraphTest::getTestCaseName);

INSTANTIATE_TEST_CASE_P(smoke_FakeQuantize_subgraph_U8, FakeQuantizeSubgraphTest,
                        ::testing::Combine(
                            fqParams_I8,
                            ::testing::ValuesIn(netPrecisions),
                            ::testing::Values(InferenceEngine::Precision::UNSPECIFIED),
                            ::testing::Values(InferenceEngine::Precision::UNSPECIFIED),
                            ::testing::Values(InferenceEngine::Layout::ANY),
                            ::testing::Values(InferenceEngine::Layout::ANY),
                            ::testing::ValuesIn(inputShapes),
                            ::testing::Values(CommonTestUtils::DEVICE_GNA),
                            ::testing::ValuesIn(gnaQuantModes_I8),
                            ::testing::ValuesIn(biases)),
                        FakeQuantizeSubgraphTest::getTestCaseName);

INSTANTIATE_TEST_CASE_P(smoke_FakeQuantize_subgraph_I16, FakeQuantizeSubgraphTest,
                        ::testing::Combine(
                            fqParams_I16,
                            ::testing::ValuesIn(netPrecisions),
                            ::testing::Values(InferenceEngine::Precision::UNSPECIFIED),
                            ::testing::Values(InferenceEngine::Precision::UNSPECIFIED),
                            ::testing::Values(InferenceEngine::Layout::ANY),
                            ::testing::Values(InferenceEngine::Layout::ANY),
                            ::testing::ValuesIn(inputShapes),
                            ::testing::Values(CommonTestUtils::DEVICE_GNA),
                            ::testing::ValuesIn(gnaQuantModes_I8),
                            ::testing::ValuesIn(biases)),
                        FakeQuantizeSubgraphTest::getTestCaseName);

} // namespace

@ -0,0 +1,52 @@
// Copyright (C) 2020 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma once

#include <tuple>
#include <vector>
#include <string>
#include <memory>

#include "functional_test_utils/layer_test_utils.hpp"
#include "ngraph_functions/builders.hpp"
#include "ngraph_functions/utils/ngraph_helpers.hpp"

typedef std::tuple<
    std::vector<size_t>,              // levels
    std::vector<std::vector<size_t>>, // const inputs shape
    std::vector<float>,               // fake quantize inputLow, inputHigh, outputLow, outputHigh or empty for random
    std::vector<float>                // input generator data: low, high, resolution
> fqSpecificParams;
typedef std::tuple<
    fqSpecificParams,
    InferenceEngine::Precision,    // Net precision
    InferenceEngine::Precision,    // Input precision
    InferenceEngine::Precision,    // Output precision
    InferenceEngine::Layout,       // Input layout
    InferenceEngine::Layout,       // Output layout
    InferenceEngine::SizeVector,   // Input shapes
    LayerTestsUtils::TargetDevice, // Device name
    std::pair<std::string, std::map<std::string, std::string>>, // Additional backend configuration and alias name for it
    bool
> fqSubgraphTestParamsSet;
namespace LayerTestsDefinitions {

class FakeQuantizeSubgraphTest : public testing::WithParamInterface<fqSubgraphTestParamsSet>,
                                 virtual public LayerTestsUtils::LayerTestsCommon {
public:
    static std::string getTestCaseName(testing::TestParamInfo<fqSubgraphTestParamsSet> obj);

protected:
    void SetUp() override;
    InferenceEngine::Blob::Ptr GenerateInput(const InferenceEngine::InputInfo &info) const override;

protected:
    float inputDataMin = 0.0;
    float inputDataMax = 10.0;
    float inputDataResolution = 1.0;
    int32_t seed = 1;
};

} // namespace LayerTestsDefinitions

@ -111,8 +111,6 @@ void FakeQuantizeLayerTest::SetUp() {
                                                 {fqDirectArg[2]},
                                                 {fqDirectArg[3]});
    }

    auto fq = std::dynamic_pointer_cast<ngraph::opset1::FakeQuantize>(fakeQNode);

    ngraph::ResultVector results{std::make_shared<ngraph::opset1::Result>(fq)};

@ -0,0 +1,168 @@
// Copyright (C) 2020 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#include <tuple>
#include <vector>
#include <string>
#include <memory>
#include <functional>
#include <functional_test_utils/skip_tests_config.hpp>

#include "ie_core.hpp"

#include "common_test_utils/common_utils.hpp"
#include "functional_test_utils/blob_utils.hpp"
#include "functional_test_utils/plugin_cache.hpp"
#include "functional_test_utils/layer_test_utils.hpp"

#include "subgraph_tests/two_fake_quantize_to_fullyconnected.hpp"


namespace LayerTestsDefinitions {

std::string FakeQuantizeSubgraphTest::getTestCaseName(testing::TestParamInfo<fqSubgraphTestParamsSet> obj) {
    fqSpecificParams fqParams;
    InferenceEngine::Precision netPrecision;
    InferenceEngine::Precision inPrc, outPrc;
    InferenceEngine::Layout inLayout, outLayout;
    InferenceEngine::SizeVector inputShapes;
    std::string targetDevice;
    std::pair<std::string, std::map<std::string, std::string>> config;
    bool biases = false;
    std::tie(fqParams, netPrecision, inPrc, outPrc, inLayout, outLayout, inputShapes, targetDevice, config, biases) = obj.param;
    std::vector<size_t> levels;
    std::vector<std::vector<size_t>> constShape;
    std::vector<float> fqDirectArgs;
    std::vector<float> inputArg;
    std::tie(levels, constShape, fqDirectArgs, inputArg) = fqParams;

    std::ostringstream result;
    result << "IS=" << CommonTestUtils::vec2str(inputShapes) << "_";
    result << "CS=" << CommonTestUtils::vec2str(constShape) << "_";
    result << "LEVELS=" << CommonTestUtils::vec2str(levels) << "_";
    result << "netPRC=" << netPrecision.name() << "_";
    result << "inPRC=" << inPrc.name() << "_";
    result << "outPRC=" << outPrc.name() << "_";
    result << "inL=" << inLayout << "_";
    result << "outL=" << outLayout << "_";
    result << "biases=" << biases << "_";
    result << "trgDev=" << targetDevice;
    if (!config.first.empty()) {
        result << "_targetConfig=" << config.first;
    }
    if (!fqDirectArgs.empty()) {
        result << "_fqArgs=" << fqDirectArgs[0] << "_" << fqDirectArgs[1] << "_" << fqDirectArgs[2] << "_" << fqDirectArgs[3];
    }
    if (inputArg.size() == 3) {
        result << "_inputArg=" << inputArg[0] << "_" << inputArg[1] << "_" << inputArg[2];
    }
    return result.str();
}

void FakeQuantizeSubgraphTest::SetUp() {
    fqSpecificParams fqParams;
    std::vector<size_t> inputShape;
    std::pair<std::string, std::map<std::string, std::string>> config;
    auto netPrecision = InferenceEngine::Precision::UNSPECIFIED;
    bool biases = false;
    std::tie(fqParams, netPrecision, inPrc, outPrc, inLayout, outLayout, inputShape, targetDevice, config, biases) = this->GetParam();
    InferenceEngine::SizeVector kernel, stride, dilation;
    std::vector<size_t> levels;
    std::vector<std::vector<size_t>> constShape;
    std::vector<float> fqDirectArg;
    std::vector<float> inputArg;
    std::tie(levels, constShape, fqDirectArg, inputArg) = fqParams;
    if (inputArg.size() == 3) {
        inputDataMin = inputArg[0];
        inputDataMax = inputArg[1];
        inputDataResolution = inputArg[2];
    }
    auto ngPrc = FuncTestUtils::PrecisionUtils::convertIE2nGraphPrc(netPrecision);
    auto params = ngraph::builder::makeParams(ngPrc, {inputShape});
    auto paramOuts = ngraph::helpers::convert2OutputVector(ngraph::helpers::castOps2Nodes<ngraph::op::Parameter>(params));

    const int seed = 0;
    std::mt19937 gen(static_cast<float>(seed));


    auto generateFloatNumbers = [gen](std::size_t vec_len, float min, float max) mutable {
        std::vector<float> res;

        std::uniform_real_distribution<float> dist(min, max);
        for (int i = 0; i < vec_len; i++)
            res.emplace_back(static_cast<float>(dist(gen)));

        return res;
    };


    auto weightsRowNum = constShape[1][0];
    auto weightsColNum = inputShape[1];
    auto weightsData = generateFloatNumbers(weightsRowNum * weightsColNum, inputDataMin, inputDataMax);
    auto const_param = ngraph::builder::makeConstant<float>(ngPrc, { constShape[1][0], inputShape[1] }, { 1.0f });
    auto inputMinRange = std::vector<float>{};
    auto inputMaxRange = std::vector<float>{};
    auto channelDataSize = constShape[1];

    if (channelDataSize[0] == 1) {
        // If per tensor data needs to be provided
        inputMinRange.push_back(inputDataMin);
        inputMaxRange.push_back(inputDataMax);
    } else if (channelDataSize[0] == weightsRowNum) {
        // If per channel data needs to be provided
        for (size_t i = 0; i < weightsRowNum; ++i) {
            auto minChannelVal = std::numeric_limits<float>::max();
            auto maxChannelVal = std::numeric_limits<float>::lowest();
            for (size_t j = 0; j < weightsColNum; ++j) {
                minChannelVal = std::min(minChannelVal, weightsData[i * weightsColNum + j]);
                maxChannelVal = std::max(maxChannelVal, weightsData[i * weightsColNum + j]);
            }

            inputMinRange.push_back(minChannelVal);
            inputMaxRange.push_back(maxChannelVal);
        }
    } else {
        FAIL() << "Invalid test configuration";
    }

    auto lowNode = ngraph::builder::makeConstant(ngraph::element::f32, channelDataSize, inputMinRange, false);
    auto highNode = ngraph::builder::makeConstant(ngraph::element::f32, channelDataSize, inputMaxRange, false);

    auto inputFQNode = ngraph::builder::makeFakeQuantize(paramOuts[0], ngraph::element::f32, levels[0], constShape[0],
        { inputDataMin }, { inputDataMax }, { inputDataMin }, { inputDataMax });

    auto weightsFQNode = std::make_shared<ngraph::opset1::FakeQuantize>(const_param,
        lowNode, highNode, lowNode, highNode, levels[1]);

    auto inputFQ = std::dynamic_pointer_cast<ngraph::opset1::FakeQuantize>(inputFQNode);
    auto weightsFQ = std::dynamic_pointer_cast<ngraph::opset1::FakeQuantize>(weightsFQNode);
    auto matmul = std::make_shared<ngraph::opset1::MatMul>(inputFQ, weightsFQ, false, true);
    std::shared_ptr<ngraph::Node> biases_node;
    if (biases) {
        auto const_bias = ngraph::builder::makeConstant(ngPrc, {1, constShape[1][0]}, std::vector<float>{ -1.0f });
        biases_node = std::make_shared<ngraph::opset1::Add>(matmul, const_bias);
    } else {
        biases_node = matmul;
    }

    auto sigmoid = std::make_shared<ngraph::opset1::Sigmoid>(biases_node);
    ngraph::ResultVector results{std::make_shared<ngraph::opset1::Result>(sigmoid)};
    if (biases) {
        auto sigmoid_2 = std::make_shared<ngraph::opset1::Sigmoid>(inputFQ);
        results.push_back(std::make_shared<ngraph::opset1::Result>(sigmoid_2));
    }
    function = std::make_shared<ngraph::Function>(results, params, "fakeQuantizeSubgraph");

    configuration = config.second;
}

InferenceEngine::Blob::Ptr FakeQuantizeSubgraphTest::GenerateInput(const InferenceEngine::InputInfo &info) const {
    return FuncTestUtils::createAndFillBlob(info.getTensorDesc(), inputDataMax - inputDataMin, inputDataMin, 1 / inputDataResolution,
        seed);
}

TEST_P(FakeQuantizeSubgraphTest, CompareWithRefs) {
    Run();
}
} // namespace LayerTestsDefinitions

@ -137,8 +137,8 @@ TEST_F(I16QuantisationTest, DISABLED_outputScaleFactorForAffineIsCorrect){
    auto quantParams = getInjectedData<QuantizedLayerParams>(affineLayerPtr);


    ASSERT_FLOAT_EQ(quantParams->_dst_quant.GetScale(), 100);
    ASSERT_FLOAT_EQ(quantParams->_weights_quant.GetScale(), 100);
}

TEST_F(I16QuantisationTest, OnlyAffine_NoActivationInsertion) {