GNA low precision (#4109)
* [GNA] Add low precision support for some layers (affine, eltwise, ReLU, sigmoid)
* [GNA] Fix convolution and scaleshift regression failures
* [GNA] Fix const scale factor calculation
* [GNA] Fix extra whitespace
* [GNA] Eltwise padding and low precision primitive creation support
* [GNA] Changes after review
* [GNA] Remove INPUT_PRECISION GNA flag: remove the flag, all references to it, and the tests that used it. Other minor stylistic changes and cleanup.
Parent: f60c45b788
Commit: 2486c5b90a
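For orientation, a minimal sketch of the int8 quantization scheme this change introduces (illustrative only; quantize_to_int8 is a hypothetical name — the actual implementation is the QuantizationCallback&lt;int8_t, int8_t&gt;::runQuantize added further down in the diff):

    #include <algorithm>
    #include <cstdint>
    #include <vector>

    // Sketch: symmetric int8 quantization with a precomputed scale factor,
    // mirroring the rounding (+/-0.5f) and saturation to [-128, 127] used by
    // the new low-precision callback in this commit.
    std::vector<int8_t> quantize_to_int8(const std::vector<float>& values, float scale_factor) {
        std::vector<int8_t> out(values.size());
        for (size_t i = 0; i < values.size(); ++i) {
            float rounding = (values[i] > 0) ? 0.5f : -0.5f;   // round half away from zero
            float v = values[i] * scale_factor + rounding;
            v = std::min(127.0f, std::max(-128.0f, v));        // saturate instead of wrapping
            out[i] = static_cast<int8_t>(v);
        }
        return out;
    }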
@@ -43,12 +43,11 @@ namespace GNAConfigParams {
 DECLARE_GNA_CONFIG_KEY(SCALE_FACTOR);
 
 /**
-* @brief By default gna api work in Int16 precision, however this can be adjusted if necessary,
+* @brief By default gna api works with Int16 weights precision, however this can be adjusted if necessary,
 * currently supported values are I16, I8
 */
 DECLARE_GNA_CONFIG_KEY(PRECISION);
 
-
 /**
 * @brief if turned on, dump GNA firmware model into specified file
 */
@@ -824,20 +824,38 @@ void GNAPluginNS::backend::AMIntelDNN::WriteDnnText(const char *filename, intel_
 std::ofstream out_bfile((out_file_name.str() + "_biases.txt").c_str(), std::ios::out);
 
 if (num_bytes_per_weight == 1) {
-int8_t *ptr_weight = reinterpret_cast<int8_t *>(component[i].op.affine.ptr_weights);
-gna_compound_bias_t *ptr_bias = reinterpret_cast<gna_compound_bias_t *>(component[i].op.affine.ptr_biases);
+if (num_bytes_per_bias != 1) {
+int8_t* ptr_weight = reinterpret_cast<int8_t*>(component[i].op.affine.ptr_weights);
+gna_compound_bias_t* ptr_bias = reinterpret_cast<gna_compound_bias_t*>(component[i].op.affine.ptr_biases);
 #ifdef DUMP_WB
 for (uint32_t row = 0; row < num_weight_rows; row++) {
 for (uint32_t col = 0; col < num_weight_columns; col++) {
 if (logging_precision == kDnnFloat) {
 float val =
 static_cast<float>(ptr_weight[row * num_weight_columns + col]) * ptr_bias[row].multiplier
 / weight_scale_factor;
 out_wfile << std::setprecision(4) << val << " ";
 } else {
-out_wfile << int((int8_t) ptr_weight[row * num_weight_columns + col]) << " ";
+out_wfile << int((int8_t)ptr_weight[row * num_weight_columns + col]) << " ";
+}
+out_wfile << "\n";
+}
+}
+#endif
+} else {
+int8_t* ptr_weight = reinterpret_cast<int8_t*>(component[i].op.affine.ptr_weights);
+#ifdef DUMP_WB
+for (uint32_t row = 0; row < num_weight_rows; row++) {
+for (uint32_t col = 0; col < num_weight_columns; col++) {
+if (logging_precision == kDnnFloat) {
+float val =
+static_cast<float>(ptr_weight[row * num_weight_columns + col]) / weight_scale_factor;
+out_wfile << std::setprecision(4) << val << " ";
+} else {
+out_wfile << int((int8_t)ptr_weight[row * num_weight_columns + col]) << " ";
+}
+out_wfile << "\n";
 }
-out_wfile << "\n";
 }
 }
 #endif
@@ -873,18 +891,31 @@ void GNAPluginNS::backend::AMIntelDNN::WriteDnnText(const char *filename, intel_
 }
 if (compute_precision_ == kDnnInt) {
 if (num_bytes_per_weight == 1) {
-gna_compound_bias_t
-*ptr_biases = reinterpret_cast<gna_compound_bias_t *>(component[i].op.affine.ptr_biases);
+if (num_bytes_per_bias != 1) {
+gna_compound_bias_t
+* ptr_biases = reinterpret_cast<gna_compound_bias_t*>(component[i].op.affine.ptr_biases);
 #ifdef DUMP_WB
 for (uint32_t row = 0; row < num_rows_out; row++) {
 if (logging_precision == kDnnInt) {
 out_bfile << std::setw(8) << ptr_biases[row].bias << ", ";
 out_bfile << std::setw(8) << int(ptr_biases[row].multiplier) << "\n";
 } else {
 out_bfile << std::setw(8) << ptr_biases[row].bias / output_scale_factor << "\n";
+}
 }
-}
 #endif
+} else {
+int8_t *ptr_biases = reinterpret_cast<int8_t*>(component[i].op.affine.ptr_biases);
+#ifdef DUMP_WB
+for (uint32_t row = 0; row < num_rows_out; row++) {
+if (logging_precision == kDnnInt) {
+out_bfile << std::setw(8) << ptr_biases[row] << "\n";
+} else {
+out_bfile << std::setw(8) << ptr_biases[row] / output_scale_factor << "\n";
+}
+}
+#endif
+}
 } else {
 int32_t *ptr_biases = reinterpret_cast<int32_t *>(component[i].op.affine.ptr_biases);
 #ifdef DUMP_WB
@@ -2102,9 +2133,12 @@ void GNAPluginNS::backend::AMIntelDNN::WriteInputAndOutputText() {
 } else {
 floatValue = reinterpret_cast<float*>(component[i].ptr_outputs)[k * component[i].num_columns_out+ j];
 }
-} else {
+} else if (component[i].num_bytes_per_output == 2) {
 auto value = reinterpret_cast<int16_t *>(component[i].ptr_outputs)[k * component[i].num_columns_out+ j];
 floatValue = static_cast<float>(value);
+} else {
+auto value = reinterpret_cast<int8_t*>(component[i].ptr_outputs)[k * component[i].num_columns_out + j];
+floatValue = static_cast<float>(value);
 }
 floatValue /= component[i].output_scale_factor;
 out_file << std::setw(8) << floatValue << "\n";
@@ -2142,10 +2176,14 @@ void GNAPluginNS::backend::AMIntelDNN::WriteInputAndOutputText() {
 } else {
 floatValue = reinterpret_cast<float *>(component[i].ptr_inputs)[k * component[i].num_columns_in + j];
 }
-} else {
+} else if (component[i].num_bytes_per_input == 2) {
 auto value = reinterpret_cast<int16_t *>(component[i].ptr_inputs)[k * component[i].num_columns_in+ j];
 floatValue = static_cast<float>(value);
+} else {
+auto value = reinterpret_cast<int8_t*>(component[i].ptr_inputs)[k * component[i].num_columns_in + j];
+floatValue = static_cast<float>(value);
 }
 
 in_file << std::setw(8) << floatValue / input_scale_factor << "\n";
 }
 }
@@ -13,6 +13,8 @@ constexpr uint32_t convMinFiltersNum = 4;
 constexpr uint32_t convMaxFiltersNum = 65532;
 constexpr uint32_t convFiltersNumDivider = 4;
 constexpr uint32_t convEachKernelByteAlignment = 16;
+constexpr uint32_t noOfInputsDivisor = 8;
+constexpr uint32_t noOfInputsLowPrecDivisor = 16;
 
 }
 } // namespace GNAPluginNS
@@ -18,6 +18,7 @@ void make_gna_pwl(const DnnActivation fun,
 const double u_bound,
 const double in_scale,
 const double out_scale,
+const bool low_precision,
 std::vector<gna_pwl_segment_t> &gna_pwl) {
 pwl_gna_slope_scale_t s;
 uint32_t pwl_size = static_cast<int32_t>(pwl.size());

@@ -230,7 +231,7 @@ void make_gna_pwl(const DnnActivation fun,
 gnalog() << "=========================== LeakyReLU Segments ======================\n";
 int32_t x_lower = INT32_MIN;
 int32_t x_upper = INT32_MAX;
-int16_t y_lower = INT16_MIN;
+int16_t y_lower = low_precision ? INT8_MIN : INT16_MIN;
 int16_t y_upper = INT16_MAX;
 if (fun.fqParams.set) {
 x_lower = FLOAT_TO_INT32(*fun.fqParams.input_low * 1.25 * in_scale);
@@ -15,4 +15,5 @@ void make_gna_pwl(const DnnActivation fun,
 const double u_bound,
 const double in_scale,
 const double out_scale,
+const bool low_precision,
 std::vector<gna_pwl_segment_t> &gna_pwl);
@@ -18,5 +18,6 @@ struct GNAFlags {
 bool sw_fp32 = false;
 bool fake_quantized = false;
 bool performance_counting = false;
+bool input_low_precision = false;
 };
 } // namespace GNAPluginNS
@@ -18,7 +18,11 @@ size_t InputDesc::minBytesRequiredForStoreInput(CNNLayerPtr layer) {
 auto quantized = getInjectedData<QuantizedLayerParams>(layer);
 size_t precision_bytes;
 if (quantized) {
-precision_bytes = 2;
+if (quantized->lowPrecision) {
+precision_bytes = 1;
+} else {
+precision_bytes = 2;
+}
 } else {
 precision_bytes = 4;
 }
@@ -25,6 +25,7 @@ namespace frontend {
 /**
 * @brief description of quantisation precision
 * @tparam Ip - input precision
+* @tparam Op - output precision
 * @tparam Wp - weights precision
 * @tparam Bp - biases precision
 * @tparam Np - network precision - can be auto generated in future
@@ -82,6 +83,12 @@ struct QuantI8 : public QuantDescTmpl<P_TYPE(I16), P_TYPE(I32), P_TYPE(I8), gna
 _Np = InferenceEngine::Precision::MIXED;
 }
 };
+// Low precision path quantizer (I8 inputs, weights, biases)
+struct QuantI8_I8 : public QuantDescTmpl<PRECISION_TYPE(I8, I32, I8, I8, MIXED)> {
+QuantI8_I8() {
+_Np = InferenceEngine::Precision::MIXED;
+}
+};
 
 // for support proper trait instantiation for quantization function callback
 struct FakeQuantI16 : public QuantI16 {};
@@ -155,6 +162,17 @@ class Quant<QuantI8> {
 }
 };
 
+template<>
+class Quant<QuantI8_I8> {
+ public:
+template<class ...Args>
+void operator()(Args && ... args) const {
+QuantizationCallback<int8_t, int8_t> {
+std::forward<Args>(args)...
+}.runQuantize();
+}
+};
+
 template<>
 class Quant<FakeQuantI16> {
  public:
@@ -650,8 +668,8 @@
 class DataQuantizer<Desc, InferenceEngine::ConvolutionLayer *> : public DataQuantizerBase {
  public:
 explicit DataQuantizer(float scaleFactor) : DataQuantizerBase(scaleFactor) {}
-bool operator()(InferenceEngine::WeightableLayer *wl) const {
-quantizeWeightsBiasesConv<typename Desc::OptionalType>(Desc::optional(), wl, Quant<typename Desc::OptionalType>());
+bool operator()(InferenceEngine::ConvolutionLayer *cl) const {
+quantizeWeightsBiasesConv<typename Desc::OptionalType>(Desc::optional(), cl, Quant<typename Desc::OptionalType>());
 return true;
 }
 };

@@ -660,8 +678,8 @@
 class DataQuantizer<Desc, InferenceEngine::ScaleShiftLayer *> : public DataQuantizerBase {
  public:
 explicit DataQuantizer(float scaleFactor) : DataQuantizerBase(scaleFactor) {}
-bool operator()(InferenceEngine::ScaleShiftLayer *wl) const {
-quantizeWeightsBiases<typename Desc::OptionalType>(Desc::optional(), wl, Quant<typename Desc::OptionalType>(), true);
+bool operator()(InferenceEngine::ScaleShiftLayer *ssl) const {
+quantizeWeightsBiases<typename Desc::OptionalType>(Desc::optional(), ssl, Quant<typename Desc::OptionalType>(), true);
 return true;
 }
 };
@@ -680,6 +698,7 @@ class LayersQuantizer : public frontend::DataQuantizerBase {
 
 using QuantI16 = frontend::QuantPair<frontend::QuantI16, frontend::QuantI16>;
 using QuantI8 = frontend::QuantPair<frontend::QuantI8, frontend::QuantI16>;
+using QuantI8_I8 = frontend::QuantPair<frontend::QuantI8_I8, frontend::QuantI8_I8>;
 
 
 using FakeQuantI16 = frontend::QuantPair<frontend::FakeQuantI16, frontend::FakeQuantI16>;
@@ -26,7 +26,7 @@
 class ModelQuantizer {
  public:
 InferenceEngine::CNNNetwork quantize(const InferenceEngine::CNNNetwork &model, float scaleFactor) const {
-return quantize(model, [](const InferenceEngine::CNNNetwork &, bool runBeforeCopy){}, std::vector<float>({scaleFactor}));
+return quantize(model, [](const InferenceEngine::CNNNetwork &, bool runBeforeCopy, bool lowPrecision){}, std::vector<float>({scaleFactor}));
 }
 
 template <class PreQuantisationCb>

@@ -35,7 +35,7 @@ class ModelQuantizer {
 }
 
 InferenceEngine::CNNNetwork quantize(const InferenceEngine::CNNNetwork &model, std::vector<float> scaleFactor) const {
-return quantize(model, [](InferenceEngine::CNNNetwork &, bool runBeforeCopy){}, scaleFactor);
+return quantize(model, [](InferenceEngine::CNNNetwork &, bool runBeforeCopy, bool lowPrecision){}, scaleFactor);
 }
 
 template <class PreQuantisationCb>
@@ -45,14 +45,15 @@ class ModelQuantizer {
 transformLayer(newLayer, WeightsConverter());
 return newLayer;
 };
+bool lowPrecision = (T::mandatory().getInputPrecision().size() == sizeof(uint8_t));
 InferenceEngine::CNNNetwork copiedNet = InferenceEngine::CNNNetCopy(model);
-cb(copiedNet, true);
+cb(copiedNet, true, lowPrecision);
 
 copiedNet = InferenceEngine::CNNNetCopy(copiedNet, visitor);
 
 // allow client code to access copied topology, to avoid copies if user would like to chain quantisation with
 // another preprocessing
-cb(copiedNet, false);
+cb(copiedNet, false, lowPrecision);
 
 if (scaleFactor.empty()) {
 THROW_GNA_EXCEPTION << "Scale factor is empty";
@@ -62,6 +63,8 @@ class ModelQuantizer {
 auto sortedNewNet = InferenceEngine::details::CNNNetSortTopologically(copiedNet);
 gnalog() << "Sorted layers: " << std::endl;
 for (auto &&layer : sortedNewNet) {
+auto quantData = InferenceEngine::getInjectedData<QuantizedLayerParams>(layer);
+quantData->lowPrecision = lowPrecision;
 gnalog() << layer->name << std::endl;
 }
 /// filling scale factors for input layers, memory layers will have scaleFactor of 1.0 by default

@@ -79,7 +82,8 @@ class ModelQuantizer {
 }
 
 bool isFakeQuantize = std::is_same<T, FakeQuantI8>() || std::is_same<T, FakeQuantI16>();
-propagateScaleFactor(sortedNewNet, T::mandatory().getWeightsPrecision().size(), isFakeQuantize);
+propagateScaleFactor(sortedNewNet, T::mandatory().getWeightsPrecision().size(), T::optional().getWeightsPrecision().size(),
+T::mandatory().getInputPrecision().size(), isFakeQuantize);
 
 // sorted order gives possibility for propagate quantisation along depended layers
 for (auto &&layer : sortedNewNet) {
@@ -90,8 +94,9 @@ class ModelQuantizer {
 }
 
 private :
-void propagateScaleFactor(std::vector<InferenceEngine::CNNLayerPtr> & net, int weightsBytesSize, bool fakeQuantize) const {
-ScaleFactorCalculator sf(net, weightsBytesSize, fakeQuantize);
+void propagateScaleFactor(std::vector<InferenceEngine::CNNLayerPtr> & net, int mandWeightsBytesSize,
+int optWeightsBytesSize, int inputsBytesSize, bool fakeQuantize) const {
+ScaleFactorCalculator sf(net, mandWeightsBytesSize, optWeightsBytesSize, inputsBytesSize, fakeQuantize);
 
 while (!sf.allLayersProcessed()) {
 for (auto &&layer : sf.getStartLayers()) {
@@ -358,7 +358,6 @@ void QuantizationCallback<int8_t, gna_compound_bias_t>::runQuantize() const {
 int8_t *ptr_weight_8 = ptr_int_weights + (row * num_columns_padded + col);
 rounding_value = (ptr_float_weights[row * num_columns + col] > 0) ? 0.5f : -0.5f;
 
-
 value = ptr_float_weights[row * num_columns + col] * (*ptr_weight_scale_factor / ptr_int_biases[row].multiplier) + rounding_value;
 if (value > 127.0) {
 *ptr_weight_8 = 127;
@@ -404,3 +403,57 @@ void QuantizationCallback<int8_t, gna_compound_bias_t>::runQuantize() const {
 QUANTWARNING("Warning: %d / %d saturations in QuantizeAffine8()\n", num_saturate, num_rows * num_columns + num_rows);
 }
 }
+
+template<>
+void QuantizationCallback<int8_t, int8_t>::runQuantize() const {
+uint32_t num_saturate = 0;
+for (uint32_t row = 0; row < num_rows; row++) {
+for (uint32_t col = 0; col < num_columns; col++) {
+float rounding_value = (ptr_float_weights[row * num_columns + col] > 0) ? 0.5f : -0.5f;
+float value = ptr_float_weights[row * num_columns + col] * *ptr_weight_scale_factor + rounding_value;
+int8_t* ptr_weight_8 = ptr_int_weights + (row * num_columns_padded + col);
+if (value > 127.0) {
+*ptr_weight_8 = 127;
+num_saturate++;
+} else if (value < -128.0) {
+*ptr_weight_8 = -128;
+num_saturate++;
+} else {
+*ptr_weight_8 = (int8_t)value;
+}
+}
+for (uint32_t col = num_columns; col < num_columns_padded; col++) {
+int8_t* ptr_weight_8 = ptr_int_weights + (row * num_columns_padded + col);
+*ptr_weight_8 = 0;
+}
+}
+for (uint32_t row = num_rows; row < num_rows_padded; row++) {
+for (uint32_t col = 0; col < num_columns_padded; col++) {
+int8_t* ptr_weight_8 = ptr_int_weights + (row * num_columns_padded + col);
+*ptr_weight_8 = 0;
+}
+}
+
+if (ptr_float_biases != nullptr && ptr_int_biases != nullptr) {
+for (uint32_t j = 0; j < num_rows; j++) {
+float rounding_value = (ptr_float_biases[j] > 0) ? 0.5f : -0.5f;
+float value = ptr_float_biases[j] * *ptr_output_scale_factor + rounding_value;
+if (value > 127.0) {
+ptr_int_biases[j] = 127;
+num_saturate++;
+} else if (value < -128.0) {
+ptr_int_biases[j] = -128;
+num_saturate++;
+} else {
+ptr_int_biases[j] = (int8_t)value;
+}
+}
+for (uint32_t j = num_rows; j < num_rows_padded; j++) {
+ptr_int_biases[j] = 0;
+}
+}
+
+if (num_saturate > 0) {
+QUANTWARNING("Warning: %d / %d saturations in QuantizeAffine8_8()\n", num_saturate, num_rows * num_columns + num_rows);
+}
+}
@@ -13,6 +13,8 @@
 
 #define MAX_OUT_MULTIPLIER 230
 #define MAX_VAL_1B_WEIGHT 127
+#define MAX_VAL_1B_FEAT 64
+#define MAX_VAL_1B_BIAS 127
 #define MAX_VAL_2B_WEIGHT 16384
 #define MAX_VAL_2B_FEAT 16384
 #define MAX_VAL_4B_BIAS 1073741824

@@ -45,6 +47,7 @@ struct QuantizationCallback {
 
 template class QuantizationCallback<int16_t, int32_t>;
 template class QuantizationCallback<int8_t, gna_compound_bias_t>;
+template class QuantizationCallback<int8_t, int8_t>;
 
 std::pair<float, float> FindMinMaxValues(void* ptr_float_memory, size_t num_elements);
 float ScaleFactorForQuantization(void *ptr_float_memory, float target_max, size_t num_elements);
@@ -84,8 +84,8 @@ struct QuantizedLayerParams {
 // deprecate this
 Quantization _weights_quant;
 Quantization _bias_quant;
-float _o_shift = 0.0f;
-float _b_shift = 0.0f;
+bool lowPrecision = false;
 };
 
 } // namespace GNAPluginNS
@@ -182,14 +182,14 @@
 class ScaleFactorPerLayer {
  public:
 /**
-* @brief calculates weights scale factor for fit dynamic range into target bitsize,
+* @brief calculates weights scale factor to fit dynamic range into target bitsize,
 * also calculates output scale factor for the given layer
 * @param cnnLayer
 * @param weightsSize
 * @param result
 * @return
 */
-bool operator()(T cnnLayer, int weightsSize, ScaleFactorUpdateResult &result, const bool fakeQuantize) {
+bool operator()(T cnnLayer, int weightsSize, int inputsSize, ScaleFactorUpdateResult &result, const bool fakeQuantize) {
 return false;
 }
 };
@@ -198,6 +198,7 @@
 class ScaleFactorPerLayer<InferenceEngine::CNNLayer *> {
  private :
 const float activation_scale_factor = 2048.f;
+const float low_prec_activation_scale_factor = 4.f;
 const float identity_scale_factor = 2049.0f;
 const float max_activation_scale_factor = 4096.0f;
 const float k = 5;
@@ -207,12 +208,13 @@ class ScaleFactorPerLayer<InferenceEngine::CNNLayer *> {
  protected :
 float getActivationScale(InferenceEngine::CNNLayer const* cnnLayer,
 GNAPluginNS::LayerInfo const& layer,
+int inputsSize,
 const bool fakeQuantize) {
 auto quantizedParams = InferenceEngine::getInjectedData<QuantizedLayerParams>(*cnnLayer);
 
 // todo: calculate proper scale factor where we need to expand it a bit to be safe to stay in int16 weights
 // set the initial value
-float result = activation_scale_factor;
+float result = (inputsSize == 2 ? activation_scale_factor : low_prec_activation_scale_factor);
 if (layer.isIdentity()) {
 // #define accurate_identity_scale_factor
 #ifdef accurate_identity_scale_factor
@@ -247,11 +249,13 @@ class ScaleFactorPerLayer<InferenceEngine::CNNLayer *> {
 result = fabs(scale_extra) > fabs(scale_default) ? identity_scale_factor / 2 : identity_scale_factor;
 
 #endif
-} else if (layer.isRelu() &&
-static_cast<uint64_t>(activation_scale_factor * quantizedParams->_src_quant.GetScale())
-> std::numeric_limits<int32_t>::max()-1) {
+} else if (layer.isRelu()) {
 // if activation is one from relu family, we need to apply heuristic to avoid activation output overflow
-result = (activation_scale_factor * 0.5);
+auto limit = (inputsSize == 1 ? std::numeric_limits<int8_t>::max() : std::numeric_limits<int32_t>::max()) - 1;
+
+if (static_cast<uint64_t>(result * quantizedParams->_src_quant.GetScale()) > limit) {
+result *= 0.5;
+}
 } else if (layer.isPower()) {
 auto powerLayer = dynamic_cast<InferenceEngine::PowerLayer const*>(cnnLayer);
 if (!powerLayer) {
@@ -381,7 +385,7 @@ class ScaleFactorPerLayer<InferenceEngine::CNNLayer *> {
 (layer.isIdentity() || layer.isFakeQuantize()) && LayerInfo(prevLayer).isWeightableIdentity()) {
 auto prevLayerQuant = InferenceEngine::getInjectedData<QuantizedLayerParams>(*prevLayer);
 if (!fp32eq(prevLayerQuant->_src_quant.GetScale(), 1.0f) &&
-(prevLayer2 == nullptr || LayerInfo(prevLayer2).has16BOutput())) {
+(prevLayer2 == nullptr || LayerInfo(prevLayer2).has8BOr16BOutput())) {
 result = prevLayerQuant->_src_quant.GetScale();
 usePrevScaleFactor = true;
 }

@@ -412,7 +416,7 @@ class ScaleFactorPerLayer<InferenceEngine::CNNLayer *> {
 }
 
 public :
-bool operator()(InferenceEngine::CNNLayer *cnnLayer, int weightsSize, ScaleFactorUpdateResult &result, const bool fakeQuantize) {
+bool operator()(InferenceEngine::CNNLayer *cnnLayer, int weightsSize, int inputsSize, ScaleFactorUpdateResult &result, const bool fakeQuantize) {
 if ( !cnnLayer ) {
 IE_THROW() << "Incorrect Convolutional Layer pointer \n";
 }
@@ -544,7 +548,13 @@ class ScaleFactorPerLayer<InferenceEngine::CNNLayer *> {
 }
 }
 
-auto levels = fakeQuantize ? MAX_VAL_2B_FEAT : std::numeric_limits<int16_t>::max();
+auto levels = 0;
+if (fakeQuantize) {
+levels = (inputsSize == 2) ? MAX_VAL_2B_FEAT : MAX_VAL_1B_FEAT;
+} else {
+levels = (inputsSize == 2) ? std::numeric_limits<int16_t>::max() : std::numeric_limits<int8_t>::max();
+}
+
 auto abs_val = std::max(std::abs(max_val), std::abs(min_val));
 auto scale_val = static_cast<float>(levels) / abs_val;
 //TODO: use FQ formula for scale factor calculation
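As a quick illustration of the scale-factor arithmetic in the hunk above (illustrative only, not from the diff; scale_for_range is a hypothetical helper): the scale is the target integer range divided by the largest absolute value seen, so the 1-byte input path ends up with a much smaller scale than the 2-byte path.

    #include <algorithm>
    #include <cmath>
    #include <cstdint>
    #include <limits>

    // Sketch of the non-fake-quantize branch above: pick the integer range from
    // the input precision and map [-abs_val, abs_val] onto it.
    float scale_for_range(float min_val, float max_val, int inputs_size_bytes) {
        float abs_val = std::max(std::fabs(max_val), std::fabs(min_val));
        int levels = (inputs_size_bytes == 2) ? std::numeric_limits<int16_t>::max()
                                              : std::numeric_limits<int8_t>::max();
        return static_cast<float>(levels) / abs_val;
    }
    // e.g. with max_val = 4.0f: the 16-bit path gives ~8191.75, the 8-bit path ~31.75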
@@ -592,7 +602,7 @@ class ScaleFactorPerLayer<InferenceEngine::CNNLayer *> {
 if (!quant->_dst_quant.IsScaleSet() || fp32eq(quant->_dst_quant.GetScale(), 1.0f) ||
 !fp32eq(quant->_src_quant.GetScale(), inputQuant->_dst_quant.GetScale())) {
 quant->_src_quant.SetScale(inputQuant->_dst_quant.GetScale());
-auto scale = getActivationScale(cnnLayer, layerInfo, fakeQuantize);
+auto scale = getActivationScale(cnnLayer, layerInfo, inputsSize, fakeQuantize);
 quant->_dst_quant.SetScale(scale);
 }
 return true;
@@ -613,10 +623,12 @@ class ScaleFactorPerLayer<InferenceEngine::CNNLayer *> {
 template<>
 class ScaleFactorPerLayer<InferenceEngine::EltwiseLayer*> {
  public:
-bool operator()(InferenceEngine::EltwiseLayer* eltwiseLayer, int weightsSize, ScaleFactorUpdateResult &result, const bool fakeQuantize) {
+bool operator()(InferenceEngine::EltwiseLayer* eltwiseLayer, int weightsSize, int inputsSize, ScaleFactorUpdateResult &result, const bool fakeQuantize) {
 if ( !eltwiseLayer ) {
 THROW_GNA_EXCEPTION << "Incorrect Eltwise Layer pointer \n";
 }
+bool lowPrecision = (inputsSize == sizeof(int8_t));
+
 auto in0 = InferenceEngine::CNNNetPrevLayer(eltwiseLayer, 0);
 auto in1 = InferenceEngine::CNNNetPrevLayer(eltwiseLayer, 1);
 
@@ -641,7 +653,7 @@ class ScaleFactorPerLayer<InferenceEngine::EltwiseLayer*> {
 }) : in0;
 
 if (LayerInfo(in0).has32BOutput() ||
-(LayerInfo(in0).isNonFunctional() && (LayerInfo(eltwiseFunctionalPrev).has32BOutput()))) {
+(LayerInfo(in0).isNonFunctional() && LayerInfo(eltwiseFunctionalPrev).has32BOutput())) {
 std::swap(in0, in1);
 std::swap(quantParams0, quantParams1);
 }
@@ -654,47 +666,50 @@ class ScaleFactorPerLayer<InferenceEngine::EltwiseLayer*> {
 // this path might result in significant data loss
 quantData->_bias_quant.SetScale(quantParams1->_dst_quant.GetScale() / quantParams0->_dst_quant.GetScale());
 auto weightsScale = quantParams1->_dst_quant.GetScale() / quantParams0->_dst_quant.GetScale();
-auto prevLayerIn1 = CNNNetPrevLayer(in1);
 // If a previous layer is a layer where freely weights scale factor can be selected,
 // try to find the scale factor that will allow to use integer as weights scale factor for eltwise
 // operation.
 // If the weights scale factor for eltwise sum/sub is not integer, it will cause accuracy degradation.
-if (fakeQuantize && LayerInfo(in1).isWeightableIdentity() &&
-(prevLayerIn1 == nullptr || LayerInfo(prevLayerIn1).has16BOutput())) {
-auto bestWeightsScale = 0.0f;
-auto bestError = static_cast<float>(std::numeric_limits<int16_t>::max());
-auto scaleIn0Dst = quantParams0->_dst_quant.GetScale();
-auto scaleIn1Src = quantParams1->_src_quant.GetScale();
-for (size_t i = MAX_VAL_2B_FEAT; i > 0; --i) {
-auto scaleIn1Dst = i * scaleIn1Src;
-auto eltwiseWeightsScale = scaleIn1Dst / scaleIn0Dst;
-if (eltwiseWeightsScale < 1.0 || eltwiseWeightsScale > std::numeric_limits<int16_t>::max() - 1) {
-continue;
-}
-
-auto error = std::abs(eltwiseWeightsScale - static_cast<int16_t>(eltwiseWeightsScale));
-if (error < bestError) {
-bestError = error;
-bestWeightsScale = i;
-}
-
-if (fp32eq(error, 0.0f)) {
-break;
-}
-}
-
-if (!fp32eq(bestWeightsScale, quantParams1->_weights_quant.GetScale())) {
-quantParams1->_weights_quant.SetScale(bestWeightsScale);
-quantParams1->_dst_quant.SetScale(quantParams1->_weights_quant.GetScale() * quantParams1->_src_quant.GetScale());
-result = ScaleFactorUpdateResult(in1.get());
-return true;
-}
-}
+if (fakeQuantize) {
+auto prevLayerIn1 = CNNNetPrevLayer(in1);
+if (LayerInfo(in1).isWeightableIdentity() &&
+(prevLayerIn1 == nullptr || LayerInfo(prevLayerIn1).has8BOr16BOutput())) {
+auto bestWeightsScale = 0.0f;
+auto bestError = static_cast<float>(std::numeric_limits<int16_t>::max());
+auto scaleIn0Dst = quantParams0->_dst_quant.GetScale();
+auto scaleIn1Src = quantParams1->_src_quant.GetScale();
+for (size_t i = MAX_VAL_2B_FEAT; i > 0; --i) {
+auto scaleIn1Dst = i * scaleIn1Src;
+auto eltwiseWeightsScale = scaleIn1Dst / scaleIn0Dst;
+if (eltwiseWeightsScale < 1.0 || eltwiseWeightsScale > std::numeric_limits<int16_t>::max() - 1) {
+continue;
+}
+
+auto error = std::abs(eltwiseWeightsScale - static_cast<int16_t>(eltwiseWeightsScale));
+if (error < bestError) {
+bestError = error;
+bestWeightsScale = i;
+}
+
+if (fp32eq(error, 0.0f)) {
+break;
+}
+}
+
+if (!fp32eq(bestWeightsScale, quantParams1->_weights_quant.GetScale())) {
+quantParams1->_weights_quant.SetScale(bestWeightsScale);
+quantParams1->_dst_quant.SetScale(quantParams1->_weights_quant.GetScale() * quantParams1->_src_quant.GetScale());
+result = ScaleFactorUpdateResult(in1.get());
+return true;
+}
+}
+}
 quantData->_weights_quant.SetScale(weightsScale);
 quantData->_dst_quant.SetScale(quantParams1->_dst_quant.GetScale());
 
-// eltwise will always work in int16
-auto maxValue = std::numeric_limits<int16_t>::max() - 1;
+// eltwise will work in int16 or int8 if low precision inputs are used
+auto maxValue = lowPrecision ? (std::numeric_limits<int8_t>::max() - 1) : (std::numeric_limits<int16_t>::max() - 1);
 if (quantData->_weights_quant.GetScale() > maxValue + 1) {
 // rescaling it's activation input
 // iterating thru previous layers of eltwise
@@ -710,7 +725,7 @@ class ScaleFactorPerLayer<InferenceEngine::EltwiseLayer*> {
 // this case for input from port 0
 if (info.isSplit() || info.isSlice()) {
 continue;
-} else if (info.has16BOutput() && info.isActivation()) {
+} else if (info.has8BOr16BOutput() && info.isActivation()) {
 auto newOutputScale = quantParams->_dst_quant.GetScale() / maxValue;
 if (newOutputScale > static_cast<float>(std::numeric_limits<int16_t>::max()) / 2) {
 break;

@@ -722,7 +737,7 @@ class ScaleFactorPerLayer<InferenceEngine::EltwiseLayer*> {
 quantDataForActivation->_dst_quant.SetScale(newOutputScale);
 result = ScaleFactorUpdateResult(in.get());
 return true;
-} else if (info.has16BOutput()) {
+} else if (info.has8BOr16BOutput()) {
 break;
 }
 
@@ -768,7 +783,7 @@ class ScaleFactorPerLayer<InferenceEngine::EltwiseLayer*> {
 template<>
 class ScaleFactorPerLayer<InferenceEngine::ConcatLayer*> {
  public:
-bool operator()(InferenceEngine::ConcatLayer* concatLayer, int weightsSize, ScaleFactorUpdateResult &result, const bool fakeQuantize) {
+bool operator()(InferenceEngine::ConcatLayer* concatLayer, int weightsSize, int inputsSize, ScaleFactorUpdateResult &result, const bool fakeQuantize) {
 if ( !concatLayer ) {
 THROW_GNA_EXCEPTION << "Incorrect Concat Layer pointer \n";
 }

@@ -960,7 +975,7 @@ class ScaleFactorPerLayer<InferenceEngine::ConcatLayer*> {
 auto prevLayer2 = prevLayer != nullptr ? CNNNetPrevLayerSkipCertain(prevLayer, 0, skipNonFunctional) : nullptr;
 
 if (fakeQuantize && prevLayer != nullptr && LayerInfo(prevLayer).isWeightableIdentity() &&
-(prevLayer2 == nullptr || LayerInfo(prevLayer2).has16BOutput())) {
+(prevLayer2 == nullptr || LayerInfo(prevLayer2).has8BOr16BOutput())) {
 auto weightsScales = generateScaleFactors(MIN_SEARCH_WEIGHTS_VAL, MAX_SEARCH_WEIGHTS_VAL,
 MAX_SEARCH_WEIGHTS_VAL - MIN_SEARCH_WEIGHTS_VAL);
 
@@ -1000,18 +1015,17 @@ class ScaleFactorPerLayer<InferenceEngine::ConcatLayer*> {
 template<>
 class ScaleFactorPerLayer<InferenceEngine::WeightableLayer*> {
  private:
-float const _scale_reduction_50 = 0.50;
-float const _scale_reduction_45 = 0.45;
-float const _scale_reduction_40 = 0.40;
-float const _scale_reduction_35 = 0.35;
-uint16_t const _scale_change_req_threshold = 30;
-uint16_t const _scale_change_threshold_100 = 100;
-uint16_t const _scale_change_threshold_150 = 150;
-uint16_t const _scale_change_threshold_200 = 200;
+std::vector<std::tuple<uint16_t const, float const, float const>> thresholds {
+// tuple values: scale factor threshold, scale factor reduction factor for I16 precision, for I8 precision
+std::make_tuple(30, 0.50f, 0.50f), // entry check value
+std::make_tuple(100, 0.50f, 0.50f), // if below this threshold, then use this factor
+std::make_tuple(150, 0.45f, 0.45f),
+std::make_tuple(200, 0.40f, 0.40f),
+std::make_tuple(200, 0.35f, 0.35f) // max level -> if above, then use this factor
+};
 
  public:
-bool operator()(InferenceEngine::WeightableLayer *wl, int weightsSize, ScaleFactorUpdateResult &result, const bool fakeQuantize) {
+bool operator()(InferenceEngine::WeightableLayer *wl, int weightsSize, int inputsSize, ScaleFactorUpdateResult &result, const bool fakeQuantize) {
 if ( !wl ) {
 THROW_GNA_EXCEPTION << "Incorrect Weightable Layer pointer \n";
 } else if (!wl->_weights) {
@@ -1063,18 +1077,30 @@ class ScaleFactorPerLayer<InferenceEngine::WeightableLayer*> {
 }
 
 if (wl->_biases) {
-quant->_bias_quant.SetScale(ScaleFactorForQuantization(wl->_biases->buffer().as<float *>(),
-MAX_VAL_4B_BIAS,
-wl->_biases->size()));
+// for now the only case of INT8 bias we support comes with INT8 inputs and weights as well
+if (inputsSize == 1 && weightsSize == 1) {
+quant->_bias_quant.SetScale(ScaleFactorForQuantization(wl->_biases->buffer().as<float*>(),
+MAX_VAL_1B_BIAS,
+wl->_biases->size()));
+} else {
+quant->_bias_quant.SetScale(ScaleFactorForQuantization(wl->_biases->buffer().as<float*>(),
+MAX_VAL_4B_BIAS,
+wl->_biases->size()));
+}
 if (quant->_bias_quant.GetScale() != -1.0f) {
-quant->_bias_quant.SetScale(
-std::min(quant->_weights_quant.GetScale() * quant->_src_quant.GetScale(), quant->_bias_quant.GetScale()));
+// for low precision we don't change bias scale factor based on source and weights scale factors
+// in order not to loose too much precision
+if (inputsSize != 1 || weightsSize != 1) {
+quant->_bias_quant.SetScale(
+std::min(quant->_weights_quant.GetScale() * quant->_src_quant.GetScale(), quant->_bias_quant.GetScale()));
+}
 quant->_weights_quant.SetScale(quant->_bias_quant.GetScale() / quant->_src_quant.GetScale());
 }
 }
 
-// TODO: findout why ???
-if (weightsSize == 1) {
+// use the MAX_OUT_MULTIPLIER only for int8_t weigths with compound bias (for now handled here only with int16_t inputs)
+// it gives the possibility to exetend the output dynamic range
+if (weightsSize == 1 && inputsSize == 2) {
 quant->_weights_quant.SetScale(quant->_weights_quant.GetScale() * MAX_OUT_MULTIPLIER);
 }
 
@@ -1089,23 +1115,22 @@ class ScaleFactorPerLayer<InferenceEngine::WeightableLayer*> {
 }
 
 double tmp_dst_quant_scale = quant->_weights_quant.GetScale() * quant->_src_quant.GetScale();
-if (weightsSize == 1 &&
-static_cast<uint64_t>(tmp_dst_quant_scale * quant->_src_quant.GetScale()) >
-static_cast<uint64_t>(std::numeric_limits<int32_t>::max() - 1) * _scale_change_req_threshold) {
-gnawarn() << "Output scale for " << wl->name
-<< " too large and are being reduced. Else saturations likely will happen \n";
-// reduce weight scale according experimental heuristic
-if (quant->_dst_quant.GetScale() * quant->_src_quant.GetScale() /
-static_cast<float>(std::numeric_limits<int32_t>::max()) < _scale_change_threshold_100) {
-quant->_weights_quant.SetScale(quant->_weights_quant.GetScale() * _scale_reduction_50);
-} else if (quant->_dst_quant.GetScale() * quant->_src_quant.GetScale() /
-static_cast<float>(std::numeric_limits<int32_t>::max()) < _scale_change_threshold_150) {
-quant->_weights_quant.SetScale(quant->_weights_quant.GetScale() * _scale_reduction_45);
-} else if (quant->_dst_quant.GetScale() * quant->_src_quant.GetScale() /
-static_cast<float>(std::numeric_limits<int32_t>::max()) < _scale_change_threshold_200) {
-quant->_weights_quant.SetScale(quant->_weights_quant.GetScale() * _scale_reduction_40);
-} else {
-quant->_weights_quant.SetScale(quant->_weights_quant.GetScale() * _scale_reduction_35);
+if (weightsSize == 1) {
+auto itt = thresholds.begin();
+auto limit = std::numeric_limits<int32_t>::max();
+
+if (inputsSize == 1) {
+limit = std::numeric_limits<int8_t>::max();
+}
+
+if (static_cast<uint64_t>(tmp_dst_quant_scale * quant->_src_quant.GetScale()) >
+static_cast<uint64_t>(limit - 1) * std::get<0>(*itt)) {
+gnawarn() << "Output scale for " << wl->name
+<< " too large and are being reduced. Else saturations likely will happen \n";
+// reduce weight scale according experimental heuristic
+while ((itt + 1) != thresholds.end() && quant->_dst_quant.GetScale() * quant->_src_quant.GetScale() /
+static_cast<float>(limit) >= std::get<0>(*(++itt))) {}
+quant->_weights_quant.SetScale(quant->_weights_quant.GetScale() * (inputsSize == 2 ? std::get<1>(*itt) : std::get<2>(*itt)));
 }
 }
 
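The hunk above replaces the hard-coded _scale_change_threshold_* / _scale_reduction_* constants with a single table of (threshold, I16 factor, I8 factor) tuples walked by an iterator. A self-contained sketch of that lookup (reduction_factor is a hypothetical helper; the table values match the diff):

    #include <cstdint>
    #include <tuple>
    #include <vector>

    // Sketch of the threshold-table walk: advance while the measured ratio is still
    // at or above the next threshold, then return the reduction factor for the
    // active precision (I16 inputs use the second field, I8 inputs the third).
    float reduction_factor(float ratio, bool int16_inputs) {
        static const std::vector<std::tuple<uint16_t, float, float>> thresholds {
            std::make_tuple(30, 0.50f, 0.50f),   // entry check value
            std::make_tuple(100, 0.50f, 0.50f),
            std::make_tuple(150, 0.45f, 0.45f),
            std::make_tuple(200, 0.40f, 0.40f),
            std::make_tuple(200, 0.35f, 0.35f)   // above the last threshold, use this factor
        };
        auto itt = thresholds.begin();
        while ((itt + 1) != thresholds.end() && ratio >= std::get<0>(*(++itt))) {}
        return int16_inputs ? std::get<1>(*itt) : std::get<2>(*itt);
    }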
@@ -1149,17 +1174,10 @@ class ScaleFactorPerLayer<InferenceEngine::WeightableLayer*> {
 
 template<>
 class ScaleFactorPerLayer<InferenceEngine::ScaleShiftLayer*> : public ScaleFactorPerLayer<InferenceEngine::WeightableLayer*> {
-public:
-bool operator()(InferenceEngine::WeightableLayer *wl, int weightsSize, ScaleFactorUpdateResult &result, const bool fakeQuantize) {
-return ScaleFactorPerLayer<InferenceEngine::WeightableLayer*>::operator()(wl, 2, result, fakeQuantize);
-}
 };
 
-/**
-* GNA convolutions cannot be quantized in int8, remove when library starts support that
-*/
 template<>
-class ScaleFactorPerLayer<InferenceEngine::ConvolutionLayer*> : public ScaleFactorPerLayer<InferenceEngine::ScaleShiftLayer*> {
+class ScaleFactorPerLayer<InferenceEngine::ConvolutionLayer*> : public ScaleFactorPerLayer<InferenceEngine::WeightableLayer*> {
 };
 
 
@@ -1174,12 +1192,15 @@ class ScaleFactorCalculator {
 Cnt net;
 mutable Cnt::const_iterator idx;
 mutable bool needRestart = false;
-int weightsBytesSize;
+int mandWeightsBytesSize;
+int optWeightsBytesSize;
 bool isFakeQuantize;
+int inputsBytesSize;
 
  public:
-ScaleFactorCalculator(Cnt &net, int weightsBytesSize, bool fakeQuantize)
-: net(net), weightsBytesSize(weightsBytesSize), isFakeQuantize(fakeQuantize) {
+ScaleFactorCalculator(Cnt &net, int mandWeightsBytesSize, int optWeightsBytesSize, int inputsBytesSize, bool fakeQuantize)
+: net(net), mandWeightsBytesSize(mandWeightsBytesSize), optWeightsBytesSize(optWeightsBytesSize),
+inputsBytesSize(inputsBytesSize), isFakeQuantize(fakeQuantize) {
 idx = std::begin(this->net);
 }
 bool needToRestart() const {
@@ -1195,7 +1216,13 @@ class ScaleFactorCalculator {
 bool operator()(T ptr) const {
 needRestart = false;
 frontend::ScaleFactorUpdateResult result;
-if (!frontend::ScaleFactorPerLayer<T>()(ptr, weightsBytesSize, result, isFakeQuantize)) {
+auto weightsBytesSize = mandWeightsBytesSize;
+
+if (LayerInfo(ptr).isConvolution() || LayerInfo(ptr).isScaleShift()) {
+weightsBytesSize = optWeightsBytesSize;
+}
+
+if (!frontend::ScaleFactorPerLayer<T>()(ptr, weightsBytesSize, inputsBytesSize, result, isFakeQuantize)) {
 return false;
 }
 if (result) {
@@ -235,7 +235,7 @@ void GNADeviceHelper::checkGna2Status(Gna2Status status, const Gna2Model& gnaModel
 ? errorReasons.at(reason)
 : "Unknown Error Reason";
 ss << " Reason (" << std::to_string(reason) << "): " << errorReason << "\n";
-ss << " Value (0x" << std::hex << std::to_string(error.Value) << ")";
+ss << " Value (0x" << std::hex << error.Value << ")";
 
 THROW_GNA_EXCEPTION << "\nUnsuccessful Gna2Status: (" << status << ") " <<
 gna2StatusBuffer.data() << ss.str() <<
@@ -35,6 +35,7 @@
 #include "round_float_define.hpp"
 #include "gna_plugin_policy.hpp"
 #include "gna_groups.hpp"
+#include "backend/gna_limitations.hpp"
 
 using namespace InferenceEngine;
 using namespace std;
@@ -773,17 +774,19 @@ void GNAGraphCompiler::PowerPrimitive(InferenceEngine::CNNLayerPtr layer) {
 }
 ptr_pwl_segments.resize(num_segments);
 
-PwlDesign16(activation_type,
+PwlDesign(activation_type,
 &*ptr_pwl_segments.begin(),
 static_cast<uint32_t>(ptr_pwl_segments.size()),
 input_pwl_scale_factor,
-output_pwl_scale_factor);
+output_pwl_scale_factor,
+gnaFlags->input_low_precision);
 } else {
-PwlDesignOpt16(activation_type,
+PwlDesignOpt(activation_type,
 ptr_pwl_segments,
 input_pwl_scale_factor,
 output_pwl_scale_factor,
-gnaFlags->pwlMaxErrorPercent);
+gnaFlags->pwlMaxErrorPercent,
+gnaFlags->input_low_precision);
 }
 }
 
@@ -1139,8 +1142,11 @@ void GNAGraphCompiler::SlicePrimitive(InferenceEngine::CNNLayerPtr layer) {
 void GNAGraphCompiler::EltwisePrimitive(InferenceEngine::CNNLayerPtr layer) {
 auto& eltwise = dynamic_cast<EltwiseLayer&>(*layer.get());
 auto quantized = InferenceEngine::getInjectedData<QuantizedLayerParams>(layer);
-// for eltwise should be one input of 4 bytes and one of 2 bytes - detecting that
+uint32_t noOfInputsDivisor = gnaFlags->input_low_precision ?
+GNALimitations::noOfInputsLowPrecDivisor : GNALimitations::noOfInputsDivisor;
+
+// for eltwise sum/sub in 16-bit precision one input should be 4 bytes and one 2 bytes - detecting that below
+// the names of variables are left for clarity although not always reflecting the real precision/size
 auto inputs2Bytes = layer->insData[0].lock();
 auto inputs4Bytes = layer->insData[1].lock();
 
@@ -1151,19 +1157,32 @@ void GNAGraphCompiler::EltwisePrimitive(InferenceEngine::CNNLayerPtr layer) {
 case InferenceEngine::EltwiseLayer::Sum:
 case InferenceEngine::EltwiseLayer::Sub:
 {
-if (inputs4Bytes->getPrecision().size() != 4) {
-std::swap(inputs4Bytes, inputs2Bytes);
-biasesLayerIdx = 0;
+if (gnaFlags->input_low_precision == false) {
+if (inputs4Bytes->getPrecision().size() != 4) {
+std::swap(inputs4Bytes, inputs2Bytes);
+biasesLayerIdx = 0;
+}
+GNA_LAYER_ASSERT(layer, inputs2Bytes->getPrecision().size() == 2);
+GNA_LAYER_ASSERT(layer, inputs4Bytes->getPrecision().size() == 4);
+} else {
+// for low precision both inputs should be 1 bytes in size
+GNA_LAYER_ASSERT(layer, inputs2Bytes->getPrecision().size() == 1);
+GNA_LAYER_ASSERT(layer, inputs4Bytes->getPrecision().size() == 1);
 }
-GNA_LAYER_ASSERT(layer, inputs2Bytes->getPrecision().size() == 2);
-GNA_LAYER_ASSERT(layer, inputs4Bytes->getPrecision().size() == 4);
+
 break;
 }
 case InferenceEngine::EltwiseLayer::Prod:
 {
-// for mul both inputs should be 2 bytes precision
-GNA_LAYER_ASSERT(layer, inputs2Bytes->getPrecision().size() == 2);
-GNA_LAYER_ASSERT(layer, inputs4Bytes->getPrecision().size() == 2);
+if (gnaFlags->input_low_precision == false) {
+// for mul both inputs should be 2 bytes precision
+GNA_LAYER_ASSERT(layer, inputs2Bytes->getPrecision().size() == 2);
+GNA_LAYER_ASSERT(layer, inputs4Bytes->getPrecision().size() == 2);
+} else {
+// for mul both inputs should be 1 byte precision
+GNA_LAYER_ASSERT(layer, inputs2Bytes->getPrecision().size() == 1);
+GNA_LAYER_ASSERT(layer, inputs4Bytes->getPrecision().size() == 1);
+}
+
 break;
 }
 default:
@@ -1196,7 +1215,7 @@ void GNAGraphCompiler::EltwisePrimitive(InferenceEngine::CNNLayerPtr layer) {
     uint32_t num_rows_in = in_4b_channels * in_4b_height * in_4b_width;
     uint32_t num_columns_in = in_4b_batch;
     uint32_t num_rows_out = num_rows_in;
-    uint32_t num_padding = ALIGN(num_rows_in, 8) - num_rows_in;
+    uint32_t num_padding = ALIGN(num_rows_in, noOfInputsDivisor) - num_rows_in;

     void* ptr_inputs = nullptr;
     void* ptr_outputs = nullptr;
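
To see what the new noOfInputsDivisor changes in practice, here is a small standalone sketch (not part of the patch). It assumes ALIGN rounds a value up to the nearest multiple and that the two divisor constants are 8 and 16, which may differ in the actual GNALimitations header:

    #include <cstdint>
    #include <iostream>

    // Assumed behaviour of the ALIGN macro: round up to the nearest multiple.
    static uint32_t align_up(uint32_t value, uint32_t multiple) {
        return ((value + multiple - 1) / multiple) * multiple;
    }

    int main() {
        const uint32_t num_rows_in = 20;           // example row count
        const uint32_t divisor16bitInputs = 8;     // assumed GNALimitations::noOfInputsDivisor
        const uint32_t divisorLowPrecInputs = 16;  // assumed GNALimitations::noOfInputsLowPrecDivisor

        // 16-bit inputs: 20 rows are padded up to 24, i.e. 4 rows of padding.
        std::cout << align_up(num_rows_in, divisor16bitInputs) - num_rows_in << "\n";
        // 8-bit (low-precision) inputs: 20 rows are padded up to 32, i.e. 12 rows of padding.
        std::cout << align_up(num_rows_in, divisorLowPrecInputs) - num_rows_in << "\n";
        return 0;
    }
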
@@ -1211,8 +1230,8 @@ void GNAGraphCompiler::EltwisePrimitive(InferenceEngine::CNNLayerPtr layer) {
         inputs2Bytes->getPrecision().size(),
         outputs->getPrecision().size(),
         // TODO: only fp32 and Int16 tested
-        quantized == nullptr ? inputs2Bytes->getPrecision().size() : 2,
-        quantized == nullptr ? inputs4Bytes->getPrecision().size() : 4,
+        quantized == nullptr ? inputs2Bytes->getPrecision().size() : (!gnaFlags->input_low_precision ? 2 : 1),
+        quantized == nullptr ? inputs4Bytes->getPrecision().size() : (!gnaFlags->input_low_precision ? 4 : 1),
         quantized == nullptr ? 1 : quantized->_weights_quant.GetScale(),
         quantized == nullptr ? 1 : quantized->_dst_quant.GetScale(),
         ptr_inputs,
@@ -1237,9 +1256,15 @@ void GNAGraphCompiler::EltwisePrimitive(InferenceEngine::CNNLayerPtr layer) {
         } else {
             auto scaledIdentity = -quantized->_weights_quant.GetScale();

-            auto quantizedIdentity = FLOAT_TO_INT16(std::min(scaledIdentity, static_cast<float>(INT16_MAX)));
-
-            gnamem->readonly().push_value<int16_t>(ptr_weights, quantizedIdentity, num_rows_out, 64);
+            if (gnaFlags->input_low_precision == false) {
+                auto quantizedIdentity = FLOAT_TO_INT16(std::min(scaledIdentity, static_cast<float>(INT16_MAX)));
+
+                gnamem->readonly().push_value<int16_t>(ptr_weights, quantizedIdentity, num_rows_out, 64);
+            } else {
+                auto quantizedIdentity = FLOAT_TO_INT8(std::min(scaledIdentity, static_cast<float>(INT8_MAX)));
+
+                gnamem->readonly().push_value<int8_t>(ptr_weights, quantizedIdentity, num_rows_out, 64);
+            }
         }
         connectInput(layer, ptr_biases, num_data_bytes_in, 0, biasesLayerIdx);
         break;
@@ -1249,9 +1274,15 @@ void GNAGraphCompiler::EltwisePrimitive(InferenceEngine::CNNLayerPtr layer) {
         } else {
             auto scaledIdentity = quantized->_weights_quant.GetScale();

-            auto quantizedIdentity = FLOAT_TO_INT16(std::min(scaledIdentity, static_cast<float>(INT16_MAX)));
-
-            gnamem->readonly().push_value<int16_t>(ptr_weights, quantizedIdentity, num_rows_out, 64);
+            if (gnaFlags->input_low_precision == false) {
+                auto quantizedIdentity = FLOAT_TO_INT16(std::min(scaledIdentity, static_cast<float>(INT16_MAX)));
+
+                gnamem->readonly().push_value<int16_t>(ptr_weights, quantizedIdentity, num_rows_out, 64);
+            } else {
+                auto quantizedIdentity = FLOAT_TO_INT8(std::min(scaledIdentity, static_cast<float>(INT8_MAX)));
+
+                gnamem->readonly().push_value<int8_t>(ptr_weights, quantizedIdentity, num_rows_out, 64);
+            }
         }
         connectInput(layer, ptr_biases, num_data_bytes_in, 0, biasesLayerIdx);
         break;
@@ -1260,7 +1291,11 @@ void GNAGraphCompiler::EltwisePrimitive(InferenceEngine::CNNLayerPtr layer) {
         if (quantized == nullptr) {
             gnamem->readonly().push_value(ptr_biases, 0.0f, num_rows_out, 64);
         } else {
-            gnamem->readonly().push_value<int32_t>(ptr_biases, 0, num_rows_out, 64);
+            if (gnaFlags->input_low_precision == false) {
+                gnamem->readonly().push_value<int32_t>(ptr_biases, 0, num_rows_out, 64);
+            } else {
+                gnamem->readonly().push_value<int8_t>(ptr_biases, 0, num_rows_out, 64);
+            }
         }
         connectInput(layer, ptr_weights, num_data_bytes_in, 0, biasesLayerIdx);
         break;
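
As the surrounding hunks suggest, eltwise sum/sub appears to be lowered onto a diagonal affine primitive: one operand is wired to the biases while the other passes through identity weights scaled by the weight quantization factor and clamped to the target integer range. A minimal sketch of that clamping in both modes (the scale factor value is invented; the FLOAT_TO_INT16/FLOAT_TO_INT8 rounding is re-implemented locally to keep the example self-contained):

    #include <algorithm>
    #include <cstdint>
    #include <iostream>

    // Same round-half-away-from-zero behaviour as the FLOAT_TO_INT* macros used above.
    static int16_t float_to_int16(float a) { return static_cast<int16_t>((a < 0) ? (a - 0.5f) : (a + 0.5f)); }
    static int8_t  float_to_int8(float a)  { return static_cast<int8_t>((a < 0) ? (a - 0.5f) : (a + 0.5f)); }

    int main() {
        float weightScale = 2049.0f;  // made-up _weights_quant scale factor

        // 16-bit input mode: the identity weight saturates at INT16_MAX.
        int16_t id16 = float_to_int16(std::min(weightScale, static_cast<float>(INT16_MAX)));
        // low-precision (8-bit input) mode: the identity weight saturates at INT8_MAX.
        int8_t id8 = float_to_int8(std::min(weightScale, static_cast<float>(INT8_MAX)));

        std::cout << id16 << " " << static_cast<int>(id8) << "\n";  // prints: 2049 127
        return 0;
    }
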
@@ -1278,7 +1313,17 @@ void GNAGraphCompiler::AffinePrimitive(InferenceEngine::CNNLayerPtr layer, bool
     IE_ASSERT(!layer->outData.empty());
     auto inputs = layer->insData.begin()->lock();
     auto outputs = *layer->outData.begin();
-    auto inputPrecision = quantized ? Precision(Precision::I16) : inputs->getPrecision();
+    Precision inputPrecision;
+    uint32_t noOfInputsDivisor = GNALimitations::noOfInputsDivisor;
+
+    if (!quantized) {
+        inputPrecision = inputs->getPrecision();
+    } else if (gnaFlags->input_low_precision == false) {
+        inputPrecision = Precision(Precision::I16);
+    } else {
+        inputPrecision = Precision(Precision::I8);
+        noOfInputsDivisor = GNALimitations::noOfInputsLowPrecDivisor;
+    }

     auto input_data = HasTo2DReshapeData(layer) ? Get2DReshapedData(inputs, 8) : inputs;
     auto in_dims = input_data->getDims();
@@ -1286,7 +1331,7 @@ void GNAGraphCompiler::AffinePrimitive(InferenceEngine::CNNLayerPtr layer, bool
     uint32_t num_rows_in = InferenceEngine::details::product(in_dims) / batch_size;
     uint32_t num_columns_in = batch_size;
     uint32_t num_rows_out = isDiag ? num_rows_in : GetDataDimSize(outputs, 1);
-    uint32_t num_padding = ALIGN(num_rows_in, 8) - num_rows_in;
+    uint32_t num_padding = ALIGN(num_rows_in, noOfInputsDivisor) - num_rows_in;
     uint32_t num_padding_out = isDiag ? num_padding : 0;

     void* ptr_inputs = nullptr;
@@ -1860,17 +1905,19 @@ case name:\
         default:
             THROW_GNA_EXCEPTION << "Activation function type not yet supported " << activation_type;
     }
-    PwlDesign16(activation_type,
+    PwlDesign(activation_type,
         &*ptr_pwl_segments.begin(),
         static_cast<uint32_t>(ptr_pwl_segments.size()),
         input_pwl_scale_factor,
-        output_pwl_scale_factor);
+        output_pwl_scale_factor,
+        gnaFlags->input_low_precision);
 } else {
-    PwlDesignOpt16(activation_type,
+    PwlDesignOpt(activation_type,
         ptr_pwl_segments,
         input_pwl_scale_factor,
         output_pwl_scale_factor,
-        gnaFlags->pwlMaxErrorPercent);
+        gnaFlags->pwlMaxErrorPercent,
+        gnaFlags->input_low_precision);
 }
 ptr_pwl_segments_target = reinterpret_cast<gna_pwl_segment_t*>(&ptr_pwl_segments_target);
 }
@@ -2229,8 +2276,9 @@ GNAPluginNS::ConnectionDetails GNAGraphCompiler::connectInput(CNNLayerPtr layer,
     // if request for allocation less that realTensorInput - we need to extend request
     auto minInput = inputDesc->minBytesRequiredForStoreInput(prevLayer);
     if (num_data_bytes_in < minInput) {
-        gnalog() << "[INPUT] : requested bytes: " << num_data_bytes_in << ", extended to" << ALIGN(minInput, 8);
-        num_data_bytes_in = ALIGN(minInput, 8);
+        uint32_t noOfInputsDivisor = gnaFlags->input_low_precision ? GNALimitations::noOfInputsLowPrecDivisor : GNALimitations::noOfInputsDivisor;
+        gnalog() << "[INPUT] : requested bytes: " << num_data_bytes_in << ", extended to" << ALIGN(minInput, noOfInputsDivisor);
+        num_data_bytes_in = ALIGN(minInput, noOfInputsDivisor);
     }

     // real allocation pointer will be kept in ptr not in ptr_inputs_global
@@ -107,7 +107,11 @@ void GNAPlugin::copyInputData(T *dst,
     for (uint32_t i = 0; i < num_frames; i++) {
         for (uint32_t j = 0; j < num_vector_elements; j++) {
             if (!std::is_same<T, U>::value) {
-                dst[j * num_group + i] = GNAPluginNS::ConvertFloatToInt16(src[i * num_vector_elements + j] * scaleFactor);
+                if (!gnaFlags->input_low_precision) {
+                    dst[j * num_group + i] = GNAPluginNS::ConvertFloatToInt16(src[i * num_vector_elements + j] * scaleFactor);
+                } else {
+                    dst[j * num_group + i] = GNAPluginNS::ConvertFloatToInt8(src[i * num_vector_elements + j] * scaleFactor);
+                }
             } else {
                 dst[j * num_group + i] = src[i * num_vector_elements + j];
             }
@@ -129,8 +133,14 @@ void GNAPlugin::copyInputData(T *dst,
         T *ptr_dst_vec = reinterpret_cast<T *>(dst) + i * num_vector_stride;
         const U *ptr_src_vec = reinterpret_cast<const U *>(src) + i * num_vector_elements;
         std::memset(ptr_dst_vec, 0, num_vector_stride * sizeof(T));
-        for (uint32_t j=0; j < num_vector_elements; j++) {
-            ptr_dst_vec[j] = GNAPluginNS::ConvertFloatToInt16(ptr_src_vec[j] * scaleFactor);
+        if (!gnaFlags->input_low_precision) {
+            for (uint32_t j = 0; j < num_vector_elements; j++) {
+                ptr_dst_vec[j] = GNAPluginNS::ConvertFloatToInt16(ptr_src_vec[j] * scaleFactor);
+            }
+        } else {
+            for (uint32_t j = 0; j < num_vector_elements; j++) {
+                ptr_dst_vec[j] = GNAPluginNS::ConvertFloatToInt8(ptr_src_vec[j] * scaleFactor);
+            }
         }
     }

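
A simplified, self-contained sketch of what the two branches of copyInputData above do: each float sample is multiplied by the scale factor and narrowed to int16_t in the default mode or to int8_t in the low-precision mode, written in the grouped layout dst[j * num_group + i]. Saturation and the num_vector_stride handling of the real helper are omitted, and the sample data and scale factor are invented:

    #include <cstdint>
    #include <iostream>
    #include <vector>

    // Rounds half away from zero, like the FLOAT_TO_INT* helpers; no saturation here.
    template <typename T>
    static T narrow(float v) {
        return static_cast<T>((v < 0) ? (v - 0.5f) : (v + 0.5f));
    }

    template <typename T>
    std::vector<T> scaleAndGroup(const std::vector<float>& src, uint32_t num_frames,
                                 uint32_t num_group, uint32_t num_vector_elements,
                                 float scaleFactor) {
        std::vector<T> dst(num_group * num_vector_elements);
        for (uint32_t i = 0; i < num_frames; i++) {
            for (uint32_t j = 0; j < num_vector_elements; j++) {
                dst[j * num_group + i] = narrow<T>(src[i * num_vector_elements + j] * scaleFactor);
            }
        }
        return dst;
    }

    int main() {
        std::vector<float> frames = {0.11f, -0.27f, 0.33f, 0.42f, -0.55f, 0.68f};  // 2 frames x 3 elements
        auto asInt16 = scaleAndGroup<int16_t>(frames, 2, 2, 3, 100.0f);  // default 16-bit input path
        auto asInt8 = scaleAndGroup<int8_t>(frames, 2, 2, 3, 100.0f);    // low-precision 8-bit input path
        for (auto v : asInt16) std::cout << v << ' ';
        std::cout << '\n';
        for (auto v : asInt8) std::cout << static_cast<int>(v) << ' ';
        std::cout << '\n';
        return 0;
    }
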
@@ -218,6 +228,10 @@ void GNAPlugin::ExportScores(void *ptr_dst,
                     auto dst_ptr = dst + (i * num_vector_elements + j);

                     switch (num_bytes_per_element_input) {
+                        case 1: {
+                            *dst_ptr = static_cast<int32_t>(*reinterpret_cast<const int8_t*>(input_ptr));
+                            break;
+                        }
                         case 2 : {
                             *dst_ptr = static_cast<int32_t>(*reinterpret_cast<const int16_t*>(input_ptr));
                             break;
@@ -284,21 +298,36 @@ void GNAPlugin::ImportFrames(
             // TODO : fix that as well
             if (input_precision == Precision::U8) {
                 auto src = reinterpret_cast<const uint8_t *>(ptr_src);
-                auto dst = reinterpret_cast<int16_t *>(ptr_dst);
-                copyInputData(dst, src, num_frames, num_group, num_vector_elements, num_vector_stride, orientation, scaleFactor);
+                if (!gnaFlags->input_low_precision) {
+                    auto dst = reinterpret_cast<int16_t*>(ptr_dst);
+                    copyInputData(dst, src, num_frames, num_group, num_vector_elements, num_vector_stride, orientation, scaleFactor);
+                } else {
+                    auto dst = reinterpret_cast<int8_t*>(ptr_dst);
+                    copyInputData(dst, src, num_frames, num_group, num_vector_elements, num_vector_stride, orientation, scaleFactor);
+                }
             } else if (input_precision.size() == 2) {
-                auto dst = reinterpret_cast<int16_t *>(ptr_dst);
                 auto src = reinterpret_cast<const int16_t *>(ptr_src);
-                copyInputData(dst, src, num_frames, num_group, num_vector_elements, num_vector_stride, orientation, scaleFactor);
+                if (!gnaFlags->input_low_precision) {
+                    auto dst = reinterpret_cast<int16_t*>(ptr_dst);
+                    copyInputData(dst, src, num_frames, num_group, num_vector_elements, num_vector_stride, orientation, scaleFactor);
+                } else {
+                    auto dst = reinterpret_cast<int8_t*>(ptr_dst);
+                    copyInputData(dst, src, num_frames, num_group, num_vector_elements, num_vector_stride, orientation, scaleFactor);
+                }
             } else if (input_precision.size() == 4) {
                 if (!gnadevice) {
                     auto dst = reinterpret_cast<float *>(ptr_dst);
                     auto src = reinterpret_cast<const float *>(ptr_src);
                     copyInputData(dst, src, num_frames, num_group, num_vector_elements, num_vector_stride, orientation, scaleFactor);
                 } else {
-                    auto dst = reinterpret_cast<int16_t *>(ptr_dst);
                     auto src = reinterpret_cast<const float *>(ptr_src);
-                    copyInputData(dst, src, num_frames, num_group, num_vector_elements, num_vector_stride, orientation, scaleFactor);
+                    if (!gnaFlags->input_low_precision) {
+                        auto dst = reinterpret_cast<int16_t*>(ptr_dst);
+                        copyInputData(dst, src, num_frames, num_group, num_vector_elements, num_vector_stride, orientation, scaleFactor);
+                    } else {
+                        auto dst = reinterpret_cast<int8_t*>(ptr_dst);
+                        copyInputData(dst, src, num_frames, num_group, num_vector_elements, num_vector_stride, orientation, scaleFactor);
+                    }
                 }
             }
         } else {
@@ -307,24 +336,36 @@ void GNAPlugin::ImportFrames(
             if (!gnadevice) {
                 auto dst = reinterpret_cast<float *>(ptr_dst);
                 copyInputData(dst, src, num_frames, num_group, num_vector_elements, num_vector_stride, orientation, scaleFactor);
+            } else if (!gnaFlags->input_low_precision) {
+                auto dst = reinterpret_cast<int16_t*>(ptr_dst);
+                copyInputData(dst, src, num_frames, num_group, num_vector_elements, num_vector_stride, orientation, scaleFactor);
             } else {
-                auto dst = reinterpret_cast<int16_t *>(ptr_dst);
+                auto dst = reinterpret_cast<int8_t*>(ptr_dst);
                 copyInputData(dst, src, num_frames, num_group, num_vector_elements, num_vector_stride, orientation, scaleFactor);
             }

         } else if (input_precision.size()== 2) {
-            auto dst = reinterpret_cast<int16_t *>(ptr_dst);
             auto src = reinterpret_cast<const int16_t *>(ptr_src);
-            copyInputData(dst, src, num_frames, num_group, num_vector_elements, num_vector_stride, orientation, scaleFactor);
+            if (!gnaFlags->input_low_precision) {
+                auto dst = reinterpret_cast<int16_t*>(ptr_dst);
+                copyInputData(dst, src, num_frames, num_group, num_vector_elements, num_vector_stride, orientation, scaleFactor);
+            } else {
+                auto dst = reinterpret_cast<int8_t*>(ptr_dst);
+                copyInputData(dst, src, num_frames, num_group, num_vector_elements, num_vector_stride, orientation, scaleFactor);
+            }
         } else if (input_precision.size() == 4) {
             if (!gnadevice) {
                 auto dst = reinterpret_cast<float *>(ptr_dst);
                 auto src = reinterpret_cast<const float *>(ptr_src);
                 copyInputData(dst, src, num_frames, num_group, num_vector_elements, num_vector_stride, orientation, scaleFactor);
             } else {
-                auto dst = reinterpret_cast<uint16_t *>(ptr_dst);
                 auto src = reinterpret_cast<const float *>(ptr_src);
-                copyInputData(dst, src, num_frames, num_group, num_vector_elements, num_vector_stride, orientation, scaleFactor);
+                if (!gnaFlags->input_low_precision) {
+                    auto dst = reinterpret_cast<int16_t*>(ptr_dst);
+                    copyInputData(dst, src, num_frames, num_group, num_vector_elements, num_vector_stride, orientation, scaleFactor);
+                } else {
+                    auto dst = reinterpret_cast<int8_t*>(ptr_dst);
+                    copyInputData(dst, src, num_frames, num_group, num_vector_elements, num_vector_stride, orientation, scaleFactor);
+                }
             }
         }
     }
@@ -663,8 +704,8 @@ void GNAPlugin::LoadNetwork(CNNNetwork & _network) {

     // network optimisation phases
     int passIdx = 0;
-    auto run_passes = [&] (const CNNNetwork& network, bool runBeforeCopy) {
-        auto passes = make_shared<PassManager>(PassManagerSettings{policy, runBeforeCopy}, network);
+    auto run_passes = [&] (const CNNNetwork& network, bool runBeforeCopy, bool lowPrecision) {
+        auto passes = make_shared<PassManager>(PassManagerSettings{policy, runBeforeCopy, lowPrecision}, network);
         passes->registerPass<RemoveConstPass>();
         passes->registerPass<UnrollTIPass>();
         passes->registerPass<RemoveConstPass>();
@@ -716,8 +757,8 @@ void GNAPlugin::LoadNetwork(CNNNetwork & _network) {
         };
         newNet = InferenceEngine::CNNNetCopy(network, visitor);
         // to run all passes need to have two calls to pass manager
-        run_passes(newNet, true);
-        run_passes(newNet, false);
+        run_passes(newNet, true, gnaFlags->input_low_precision);
+        run_passes(newNet, false, gnaFlags->input_low_precision);
     } else if (gnaFlags->fake_quantized) {
         switch (config.gnaPrecision) {
         case Precision::I16:
@@ -738,8 +779,13 @@ void GNAPlugin::LoadNetwork(CNNNetwork & _network) {
             newNet = q16.quantize(network, run_passes, inputsDesc->inputScaleFactors);
             break;
         case Precision::I8:
-            ModelQuantizer<QuantI8> q8;
-            newNet = q8.quantize(network, run_passes, inputsDesc->inputScaleFactors);
+            if (gnaFlags->input_low_precision == false) {
+                ModelQuantizer<QuantI8> q8;
+                newNet = q8.quantize(network, run_passes, inputsDesc->inputScaleFactors);
+            } else {
+                ModelQuantizer<QuantI8_I8> q8_8;
+                newNet = q8_8.quantize(network, run_passes, inputsDesc->inputScaleFactors);
+            }
             break;
         default:
             THROW_GNA_EXCEPTION << "unsupported GNA precision for quantisation: " << config.gnaPrecision;
@@ -1164,7 +1210,7 @@ uint32_t GNAPlugin::QueueInference(const InferenceEngine::BlobMap &inputs, Infer
     auto importedFrames = (is3D || is1D) ? 1 : dims[0];
     auto targetGroups = is1D ? 1 : dims[0]; // TODO: no proper support for groups yet

-    auto importedElementSizeBytes = gnaFlags->sw_fp32 ? 4 : 2;
+    auto importedElementSizeBytes = gnaFlags->sw_fp32 ? 4 : (gnaFlags->input_low_precision ? 1 : 2);
    auto importedBytes = importedElements * importedFrames * importedElementSizeBytes;

     if (inputsDesc->bytes_allocated_for_input[input.first] < importedBytes) {
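
The size check above needs half the bytes once inputs are imported as int8 instead of int16. A quick illustration with made-up dimensions (not from the patch):

    #include <cstdint>
    #include <iostream>

    int main() {
        const uint32_t importedElements = 440;  // example element count per frame
        const uint32_t importedFrames = 8;      // example batch
        for (uint32_t elementSize : {4u, 2u, 1u}) {  // sw_fp32, default int16, low-precision int8 paths
            std::cout << importedElements * importedFrames * elementSize << " bytes\n";
        }
        return 0;
    }
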
@@ -49,7 +49,7 @@ struct Config {
     std::string GetParameter(const std::string& name) const;
     std::vector<std::string> GetSupportedKeys() const;

-    // precision of GNA hardware model
+    // default precision of GNA hardware model (see QuantI16 quantizer struct)
     InferenceEngine::Precision gnaPrecision = InferenceEngine::Precision::I16;

     std::string dumpXNNPath;
@@ -54,10 +54,13 @@ class LayerInfo {
         IS_VALID();
         return layer->insData.size() > 1;
     }
-    bool has16BOutput() const noexcept {
+    // The name of the function may be somewhat misleading
+    // Explanation: when in low precision mode the listed layers have 8-bit outputs
+    // and when in 16-bit input mode, they have 16-bit outputs
+    bool has8BOr16BOutput() const noexcept {
         IS_VALID();
-        static InferenceEngine::details::caseless_set<std::string> layersWith16BOutputs = {"memory", "input", "split", "slice", "concat", "copy", "const"};
-        return layersWith16BOutputs.find(layer->type) != layersWith16BOutputs.end() ||
+        static InferenceEngine::details::caseless_set<std::string> layersWith8BOr16BOutputs = {"memory", "input", "split", "slice", "concat", "copy", "const"};
+        return layersWith8BOr16BOutputs.find(layer->type) != layersWith8BOr16BOutputs.end() ||
             isActivation() ||
             (isCrop() && !isCropAffined());
     }
@@ -126,7 +126,7 @@ static CNNLayerPtr InsertCopyLayer(CNNLayerPtr prevLayer, CNNLayerPtr nextLayer,
     return copyWithQuant;
 }

-static std::vector<CNNLayerPtr> getCandidatesForIdentityInsertion(const CNNLayerPtr l) {
+static std::vector<CNNLayerPtr> getCandidatesForIdentityInsertion(const CNNLayerPtr l, std::shared_ptr<IPassManager> passmanager) {
     std::vector<CNNLayerPtr> prevLayers;

     // skipping memory inputs and true inputs layers
@@ -148,15 +148,24 @@ static std::vector<CNNLayerPtr> getCandidatesForIdentityInsertion(const CNNLayer
     if (eltwise != nullptr) {
         // eltwise layer has 2 inputs, so depends on situation identity should or should not be inserted

-        // for sum if we have 4-4 inputs we will handle that by inserting identity activation case (1)
-        // for sum if we have 4-2 - OK
-        // for sum if we have 2-2 inputs we need to insert diagonal
+        // for sum with 16-bit input precision
+        // if we have 4-4 inputs - we will handle that by inserting identity activation case (1)
+        // if we have 4-2 inputs - OK
+        // if we have 2-2 inputs - we need to insert diagonal

-        // for mul if we have 2-2 - OK
-        // for mul if we have 2-4 - inputs we need to insert identity activation to make 2 bytes input
-        // for mul if we have 4-4 - there 2 options
-        // option 1 both inputs came from single outdata - we will insert 1 identity to just convert single input into 2 bytes
-        // option 2 each input came from it's own outdata - we need to insert 2 identities activations to convert both and feed weights and inputs
+        // for sum with 8-bit input precision
+        // if we have 1-1 inputs - OK
+        // if we have 4-4 inputs - there are 2 options
+        // option 1 both inputs came from single outdata - we need to insert 1 identity activation to just convert single input into 1 byte
+        // option 2 each input came from its own outdata - we need to insert 2 identity activations to convert both and feed weights and inputs
+
+        // for mul if we have 2-2 or 1-1 (low precision case) inputs - OK
+        // for mul if we have 2-4 or 1-4 (low precision case) inputs - we need to insert identity activation to make 2 bytes input
+        // or 1 byte input (low precision case)
+        // for mul if we have 4-4 inputs - there are 2 options
+        // option 1 both inputs came from single outdata - we need to insert 1 identity activation to just convert single input into 2 bytes
+        // or 1 byte (low precision case)
+        // option 2 each input came from its own outdata - we need to insert 2 identity activations to convert both and feed weights and inputs

         auto prev0 = PrevFunctionalLayer(l, 0);
         auto prev1 = PrevFunctionalLayer(l, 1);
@@ -164,14 +173,32 @@ static std::vector<CNNLayerPtr> getCandidatesForIdentityInsertion(const CNNLayer
     switch (eltwise->_operation) {
     case EltwiseLayer::Sub:
     case EltwiseLayer::Sum:
-        if (!LayerInfo(prev0).has32BOutput() || !LayerInfo(prev1).has32BOutput()) {
-            return prevLayers;
+        if (!passmanager->isLowPrecision()) {
+            if (!LayerInfo(prev0).has32BOutput() || !LayerInfo(prev1).has32BOutput()) {
+                return prevLayers;
+            }
+            // TODO: whether there are possibility to select after what layer identity gets inserted
+            prevLayers.push_back(CNNNetPrevLayer(l, 0));
+        } else {
+            if (LayerInfo(prev0).has8BOr16BOutput() && LayerInfo(prev1).has8BOr16BOutput()) {
+                return prevLayers;
+            }
+
+            if (LayerInfo(prev0).has32BOutput()) {
+                prevLayers.push_back(CNNNetPrevLayer(l, 0));
+            }
+
+            // if layers of outdata are different
+            auto prevData0 = l->insData[0].lock();
+            auto prevData1 = l->insData[1].lock();
+
+            if ((prev0 != prev1 || prevData0 != prevData1) && LayerInfo(prev1).has32BOutput()) {
+                prevLayers.push_back(CNNNetPrevLayer(l, 1));
+            }
         }
-        // TODO: whether there are possibility to select after what layer identity gets inserted
-        prevLayers.push_back(CNNNetPrevLayer(l, 0));
         break;
     case EltwiseLayer::Prod: {
-        if (LayerInfo(prev0).has16BOutput() && LayerInfo(prev1).has16BOutput()) {
+        if (LayerInfo(prev0).has8BOr16BOutput() && LayerInfo(prev1).has8BOr16BOutput()) {
             return prevLayers;
         }

@@ -227,6 +254,8 @@ static std::vector<CNNLayerPtr> getCandidatesForIdentityInsertion(const CNNLayer
 }

 void InsertDiagonalLayerPass::run() {
+    bool lowPrecision = getPassManager()->isLowPrecision();
+
     for (auto & l : *pLayers) {
         if (l->insData.empty()) continue;
         auto prevLayer = CNNNetPrevLayerSkipCertain(l, 0, [](CNNLayerPtr ptr) {
@@ -241,12 +270,16 @@ void InsertDiagonalLayerPass::run() {
         if (!eltwise) {
             continue;
         }
-        // in case of eltwise sum one of input would be 4 bytes one - 2
-        // in case of eltwise mull one of input would be 2 bytes one - 2
+        // in case of eltwise sum in 16-bit input precision one of input would be 4 bytes one - 2
+        // in case of eltwise mul in 16-bit input precision one of input would be 2 bytes one - 2
+        // in case of eltwise sum in low (8-bit) input precision both inputs are 1 byte
+        // in case of eltwise mul in low (8-bit) input precision both inputs are 1 byte
         // for e sum if we have 4-4 inputs we will handle that by inserting identity activation
         // for e sum if we have 4-2 - OK
         // for e sum if we have 2-2 inputs we need to insert diagonal -- handling here
+        // for e sum if we have 1-1 inputs in low precision mode - OK
         // for e mul if we have 2-2 - OK
+        // for e mul if we have 1-1 in low precision mode - OK
         // for e mul if we have 2-4 - inputs we need to insert identity to put 4 bytes input into weights
         // for e mul if we have 4-4 - inputs we need to insert 2 identities to put both 4 bytes input into weights

@@ -256,7 +289,10 @@ void InsertDiagonalLayerPass::run() {
         auto prevLayer1 = CNNNetPrevLayerSkipCertain(l, 1, [](CNNLayerPtr ptr) {
             return LayerInfo(ptr).isNonFunctional();
         });
-        if (!LayerInfo(prevLayer).has16BOutput() || !LayerInfo(prevLayer1).has16BOutput())
+        if (!LayerInfo(prevLayer).has8BOr16BOutput() || !LayerInfo(prevLayer1).has8BOr16BOutput())
+            continue;
+
+        if (lowPrecision && LayerInfo(prevLayer).has8BOr16BOutput() && LayerInfo(prevLayer1).has8BOr16BOutput())
             continue;
     }
     auto prevDirectLayer = CNNNetPrevLayer(l, 0);
@@ -736,7 +772,7 @@ void RemovePermutationsNHWCToNCHWPass::run() {
 void InsertIdentityLayerPass::run() {
     auto quantized = InferenceEngine::getInjectedData<QuantizedLayerParams>(pLayers->front());
     for (auto & l : *pLayers) {
-        for (auto && prev : getCandidatesForIdentityInsertion(l)) {
+        for (auto && prev : getCandidatesForIdentityInsertion(l, getPassManager())) {
             // Do an upstream search until Functional layer is found
             auto original_prev_layer = prev;
             auto true_layer = l;
@@ -811,7 +847,7 @@ void InsertIdentityLayerPass::run() {
             for (auto && nextLayer : getInputTo(nextData)) {
                 if (nextLayer.second.get() == l.get())
                     continue;
-                if (getCandidatesForIdentityInsertion(nextLayer.second).empty()) {
+                if (getCandidatesForIdentityInsertion(nextLayer.second, getPassManager()).empty()) {
                     notAll = true;
                 }
             }
@@ -30,6 +30,7 @@ public:
     virtual ~IPassManager() = default;
     virtual int &getIntVar(std::string name) = 0;
     virtual const Policy &getPolicy() const = 0;
+    virtual const bool& isLowPrecision() const = 0;
     virtual InferenceEngine::CNNNetwork &getNetwork() = 0;
 };

@@ -221,6 +222,7 @@ struct PassManagerSettings {
     Policy policy;
     /// @brief whether to run passes before copy
     bool runBeforeCopy;
+    bool lowPrecision;
 };


@@ -245,6 +247,9 @@ public:
     const Policy & getPolicy() const override {
         return settings.policy;
     }
+    const bool& isLowPrecision() const override {
+        return settings.lowPrecision;
+    }
     InferenceEngine::CNNNetwork& getNetwork() override {
         return network;
     }
@@ -15,6 +15,17 @@ int16_t GNAPluginNS::ConvertFloatToInt16(float src) {
     return (int16_t)value;
 }

+int8_t GNAPluginNS::ConvertFloatToInt8(float src) {
+    float rounding_value = (src > 0) ? 0.5f : -0.5f;
+    float value = src + rounding_value;
+    if (value > 127.0) {
+        return 127;
+    } else if (value < -128.0) {
+        return -128;
+    }
+    return (int8_t)value;
+}
+
 void GNAPluginNS::ConvertToInt16(int16_t *ptr_dst,
                                  const float *ptr_src,
                                  const uint32_t num_rows,
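
A usage sketch for the new ConvertFloatToInt8 helper above: values are rounded half away from zero and saturated to the int8_t range. The helper is re-declared locally so the example compiles on its own, and the sample inputs are made up:

    #include <cstdint>
    #include <iostream>

    namespace GNAPluginNS {
    // Same logic as the helper added in this patch.
    int8_t ConvertFloatToInt8(float src) {
        float rounding_value = (src > 0) ? 0.5f : -0.5f;
        float value = src + rounding_value;
        if (value > 127.0) return 127;
        if (value < -128.0) return -128;
        return (int8_t)value;
    }
    }  // namespace GNAPluginNS

    int main() {
        std::cout << static_cast<int>(GNAPluginNS::ConvertFloatToInt8(3.4f)) << "\n";    // 3
        std::cout << static_cast<int>(GNAPluginNS::ConvertFloatToInt8(-3.6f)) << "\n";   // -4
        std::cout << static_cast<int>(GNAPluginNS::ConvertFloatToInt8(200.0f)) << "\n";  // 127 (saturated)
        std::cout << static_cast<int>(GNAPluginNS::ConvertFloatToInt8(-500.0f)) << "\n"; // -128 (saturated)
        return 0;
    }
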
@@ -21,4 +21,5 @@ void ConvertToFloat(float *ptr_dst,
                     const float scale_factor);

 int16_t ConvertFloatToInt16(float src);
+int8_t ConvertFloatToInt8(float src);
 }  // namespace GNAPluginNS
@@ -7,5 +7,6 @@
 #include <cstdint>


+#define FLOAT_TO_INT8(a) static_cast<int8_t>(((a) < 0)?((a) - 0.5f):((a) + 0.5f))
 #define FLOAT_TO_INT16(a) static_cast<int16_t>(((a) < 0)?((a) - 0.5f):((a) + 0.5f))
 #define FLOAT_TO_INT32(a) static_cast<int32_t>(((a) < 0)?((a)-0.5f):((a)+0.5f))
@@ -496,11 +496,12 @@ std::vector<pwl_t> pwl_search(const DnnActivation& activation_type,
 }


-void PwlDesignOpt16(const DnnActivation activation_type,
+void PwlDesignOpt(const DnnActivation activation_type,
                     std::vector<gna_pwl_segment_t> &ptr_segment,
                     const float scale_in,
                     const float scale_out,
-                    const float pwlMaxErrorPercent) {
+                    const float pwlMaxErrorPercent,
+                    const bool low_precision) {
     std::vector<pwl_t> pwl;
     double err_pct = 0.0;
     auto minInputStats = 0.0f;
@@ -515,7 +516,7 @@ void PwlDesignOpt16(const DnnActivation activation_type,
             auto minInput = (activation_type.srcFQParams.set && absMax < SIGMOID_DOMAIN) ? -absMax : -SIGMOID_DOMAIN;
             auto maxInput = (activation_type.srcFQParams.set && absMax < SIGMOID_DOMAIN) ? absMax : SIGMOID_DOMAIN;
             pwl = pwl_search(activation_type, minInput, maxInput, PWL_DESIGN_THRESHOLD, pwlMaxErrorPercent, PWL_DESIGN_SAMPLES, err_pct);
-            make_gna_pwl(activation_type, pwl, minInput, maxInput, scale_in, scale_out, ptr_segment);
+            make_gna_pwl(activation_type, pwl, minInput, maxInput, scale_in, scale_out, low_precision, ptr_segment);
             break;
         }
         case kActTanh: {
@@ -523,7 +524,7 @@ void PwlDesignOpt16(const DnnActivation activation_type,
             auto minInput = (activation_type.srcFQParams.set && absMax < TANH_DOMAIN) ? -absMax : -TANH_DOMAIN;
             auto maxInput = (activation_type.srcFQParams.set && absMax < TANH_DOMAIN) ? absMax : TANH_DOMAIN;
             pwl = pwl_search(activation_type, minInput, maxInput, PWL_DESIGN_THRESHOLD, pwlMaxErrorPercent, PWL_DESIGN_SAMPLES, err_pct);
-            make_gna_pwl(activation_type, pwl, minInput, maxInput, scale_in, scale_out, ptr_segment);
+            make_gna_pwl(activation_type, pwl, minInput, maxInput, scale_in, scale_out, low_precision, ptr_segment);
             break;
         }
         case kActSoftSign: {
@@ -531,55 +532,56 @@ void PwlDesignOpt16(const DnnActivation activation_type,
             auto minInput = (activation_type.srcFQParams.set && absMax < SOFTSIGN_DOMAIN) ? -absMax : -SOFTSIGN_DOMAIN;
             auto maxInput = (activation_type.srcFQParams.set && absMax < SOFTSIGN_DOMAIN) ? absMax : SOFTSIGN_DOMAIN;
             pwl = pwl_search(activation_type, minInput, maxInput, PWL_DESIGN_THRESHOLD, pwlMaxErrorPercent, PWL_DESIGN_SAMPLES, err_pct);
-            make_gna_pwl(activation_type, pwl, minInput, maxInput, scale_in, scale_out, ptr_segment);
+            make_gna_pwl(activation_type, pwl, minInput, maxInput, scale_in, scale_out, low_precision, ptr_segment);
             break;
         }
         case kActRelu:
-            make_gna_pwl(activation_type, pwl, -1.0, 1.0, scale_in, scale_out, ptr_segment);
+            make_gna_pwl(activation_type, pwl, -1.0, 1.0, scale_in, scale_out, low_precision, ptr_segment);
             break;
         case kActLeakyRelu:
-            make_gna_pwl(activation_type, pwl, -1.0, 1.0, scale_in, scale_out, ptr_segment);
+            make_gna_pwl(activation_type, pwl, -1.0, 1.0, scale_in, scale_out, low_precision, ptr_segment);
             break;
         case kActIdentity:
         case kActFakeQuantize:
-            make_gna_pwl(activation_type, pwl, -1.0, 1.0, scale_in, scale_out, ptr_segment);
+            make_gna_pwl(activation_type, pwl, -1.0, 1.0, scale_in, scale_out, low_precision, ptr_segment);
             break;
         case kActKaldiLstmClipping:
-            make_gna_pwl(activation_type, pwl, activation_type.args.clamp.low, activation_type.args.clamp.high, scale_in, scale_out, ptr_segment);
+            make_gna_pwl(activation_type, pwl, activation_type.args.clamp.low, activation_type.args.clamp.high,
+                         scale_in, scale_out, low_precision, ptr_segment);
             break;
         case kActLog: {
             double x_min = (1 + ~XBASEMASK) / scale_in;
             double x_max = ((INT32_MAX / scale_in) < LOG_DOMAIN) ? (INT32_MAX / scale_in) : LOG_DOMAIN;
             pwl = pwl_search(activation_type, x_min, x_max, PWL_DESIGN_THRESHOLD, pwlMaxErrorPercent, PWL_DESIGN_SAMPLES, err_pct);
-            make_gna_pwl(activation_type, pwl, x_min, x_max, scale_in, scale_out, ptr_segment);
+            make_gna_pwl(activation_type, pwl, x_min, x_max, scale_in, scale_out, low_precision, ptr_segment);
             break;
         }
         case kActNegLog: {
             double x_min = (1 + ~XBASEMASK) / scale_in;
             double x_max = ((INT32_MAX / scale_in) < LOG_DOMAIN) ? (INT32_MAX / scale_in) : LOG_DOMAIN;
             pwl = pwl_search(activation_type, x_min, x_max, PWL_DESIGN_THRESHOLD, pwlMaxErrorPercent, PWL_DESIGN_SAMPLES, err_pct);
-            make_gna_pwl(activation_type, pwl, x_min, x_max, scale_in, scale_out, ptr_segment);
+            make_gna_pwl(activation_type, pwl, x_min, x_max, scale_in, scale_out, low_precision, ptr_segment);
             break;
         }
         case kActNegHalfLog: {
             double x_min = (1 + ~XBASEMASK) / scale_in;
             double x_max = ((INT32_MAX / scale_in) < LOG_DOMAIN) ? (INT32_MAX / scale_in) : LOG_DOMAIN;
             pwl = pwl_search(activation_type, x_min, x_max, PWL_DESIGN_THRESHOLD, pwlMaxErrorPercent, PWL_DESIGN_SAMPLES, err_pct);
-            make_gna_pwl(activation_type, pwl, x_min, x_max, scale_in, scale_out, ptr_segment);
+            make_gna_pwl(activation_type, pwl, x_min, x_max, scale_in, scale_out, low_precision, ptr_segment);
             break;
         }
         case kActExp: {
             double x_min = -log(scale_out);
             double x_max = x_min + log(INT16_MAX);
             pwl = pwl_search(activation_type, x_min, x_max, PWL_DESIGN_THRESHOLD, pwlMaxErrorPercent, PWL_DESIGN_SAMPLES, err_pct);
-            make_gna_pwl(activation_type, pwl, x_min, x_max, scale_in, scale_out, ptr_segment);
+            make_gna_pwl(activation_type, pwl, x_min, x_max, scale_in, scale_out, low_precision, ptr_segment);
             break;
         }
         case kActSign:
-            make_gna_pwl(activation_type, pwl, -1.0, 1.0, scale_in, scale_out, ptr_segment);
+            make_gna_pwl(activation_type, pwl, -1.0, 1.0, scale_in, scale_out, low_precision, ptr_segment);
             break;
         case kActAbs:
-            make_gna_pwl(activation_type, pwl, -1.0, 1.0, scale_in, scale_out, ptr_segment);
+            make_gna_pwl(activation_type, pwl, -1.0, 1.0, scale_in, scale_out, low_precision, ptr_segment);
             break;
         case kActPow: {
             auto fp32eq = [](float p1, float p2) -> bool {
@@ -600,7 +602,7 @@ void PwlDesignOpt16(const DnnActivation activation_type,
                 pwl = pwl_search(activation_type, x_min, x_max, PWL_DESIGN_THRESHOLD, maxError, PWL_DESIGN_SAMPLES, err_pct);
             }

-            make_gna_pwl(activation_type, pwl, x_min, x_max, scale_in, scale_out, ptr_segment);
+            make_gna_pwl(activation_type, pwl, x_min, x_max, scale_in, scale_out, low_precision, ptr_segment);
             break;
         }
         default:
@@ -608,11 +610,12 @@ void PwlDesignOpt16(const DnnActivation activation_type,
     }
 }

-void PwlDesign16(const DnnActivation activation_type,
+void PwlDesign(const DnnActivation activation_type,
                  gna_pwl_segment_t *ptr_segment,
                  const uint32_t num_segments,
                  const float scale_in,
-                 const float scale_out) {
+                 const float scale_out,
+                 const bool low_precision) {
     switch (activation_type) {
         case kActSigmoid:
         {
@@ -767,12 +770,12 @@ void PwlDesign16(const DnnActivation activation_type,
             else
                 gnalog() << "=========================== Identity Segments ===========================\n";
             if (x_lower_limit < INT32_MIN) {
-                std::cerr << "Warning: saturation in PwlDesign16! " << x_lower_limit << " < INT32_MIN"<< std::endl;
+                std::cerr << "Warning: saturation in PwlDesign! " << x_lower_limit << " < INT32_MIN"<< std::endl;
                 x_lower_limit = INT32_MIN;
                 y_lower_limit = static_cast<int16_t>((scale_out / scale_in)*static_cast<float>(INT32_MIN) - 0.5);
             }
             if (x_upper_limit > INT32_MAX) {
-                std::cerr << "Warning: saturation in PwlDesign16! " << x_upper_limit << " > INT32_MAX"<< std::endl;
+                std::cerr << "Warning: saturation in PwlDesign! " << x_upper_limit << " > INT32_MAX"<< std::endl;
                 x_upper_limit = INT32_MAX;
                 y_upper_limit = static_cast<int16_t>((scale_out / scale_in)*static_cast<float>(INT32_MAX) + 0.5);
             }
@@ -95,13 +95,15 @@ void PwlApply32(intel_dnn_component_t *component,
                 const uint32_t num_row_end,
                 const uint32_t num_col_start,
                 const uint32_t num_col_end);
-void PwlDesign16(const DnnActivation activation_type,
+void PwlDesign(const DnnActivation activation_type,
                const uint32_t num_segments,
                gna_pwl_segment_t *ptr_segment,
                const float scale_in,
-               const float scale_out);
-void PwlDesignOpt16(const DnnActivation activation_type,
+               const float scale_out,
+               const bool low_precision);
+void PwlDesignOpt(const DnnActivation activation_type,
                std::vector<gna_pwl_segment_t> &ptr_segment,
                const float scale_in,
                const float scale_out,
-               const float pwlMaxErrorPercent);
+               const float pwlMaxErrorPercent,
+               const bool low_precision);
@@ -22,7 +22,7 @@ protected:
         }
     }

-    void SetUp() {
+    void SetUp() override {
         ConvolutionLayerTest::SetUp();
     }
 };
@@ -24,7 +24,7 @@ protected:
         }
     }

-    void SetUp() {
+    void SetUp() override {
         ConvolutionReluSequenceTest::SetUp();
     }
 };