[GNA] Fixed a bug with channel multiplier for int8 weights quantization (#9234)

Elizaveta Lobanova 2021-12-20 18:12:56 +03:00 committed by GitHub
parent 1901f33bd3
commit d4fdbba7d8
3 changed files with 157 additions and 4 deletions


@@ -111,9 +111,9 @@ ngraph::pass::ConvertMatMulToFC::ConvertMatMulToFC() {
         // Check that if the second input is a Constant operation and its shape without ones dimensions has length <= 2
         // we replace MatMul with FullyConnected operation.
         // Otherwise we replace MatMul with Gemm.
-        if ((std::dynamic_pointer_cast<opset1::Constant> (fc_input_b.get_node_shared_ptr()) ||
-             std::dynamic_pointer_cast<opset1::FakeQuantize>(fc_input_b.get_node_shared_ptr())) &&
-            std::count_if(shape_b.begin(), shape_b.end(), [](size_t x) {
+        auto fq_after_const = std::dynamic_pointer_cast<opset1::FakeQuantize>(fc_input_b.get_node_shared_ptr());
+        if ((std::dynamic_pointer_cast<opset1::Constant> (fc_input_b.get_node_shared_ptr()) || fq_after_const) &&
+            std::count_if(shape_b.begin(), shape_b.end(), [](size_t x) {
                 return x != 1;
             }) <= 2) {
             Shape shape_a_aligned, shape_b_aligned;
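
As a quick standalone illustration of the rule described in the comment above (the shape value is made up, this is not plugin code): a weight input qualifies for FullyConnected when at most two of its dimensions differ from 1.

#include <algorithm>
#include <cstddef>
#include <vector>

int main() {
    std::vector<size_t> shape_b = {1, 64, 128};  // hypothetical weight shape
    auto non_unit_dims = std::count_if(shape_b.begin(), shape_b.end(),
                                       [](size_t x) { return x != 1; });
    // non_unit_dims == 2, so a MatMul with such a Constant/FakeQuantize weight input
    // would be replaced with FullyConnected rather than Gemm.
    return non_unit_dims <= 2 ? 0 : 1;
}
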
@@ -131,8 +131,18 @@ ngraph::pass::ConvertMatMulToFC::ConvertMatMulToFC() {
 
         // Weights normalization
         if (!matmul->get_transpose_b()) {
-            fc_input_b = create_transpose(fc_input_b, matmul->get_friendly_name() + "/transpose_b");
+            Output<ov::Node> constant = fc_input_b;
+            // transpose the constant itself, not the FQ output, to allow constant folding to apply this transpose
+            if (fq_after_const) {
+                constant = fc_input_b.get_node_shared_ptr()->input_value(0);
+            }
+            fc_input_b = create_transpose(constant, matmul->get_friendly_name() + "/transpose_b");
             new_ops.push_back(fc_input_b.get_node_shared_ptr());
+            if (fq_after_const) {
+                fc_input_b = fq_after_const->clone_with_new_inputs(OutputVector{fc_input_b, fq_after_const->input_value(1),
+                    fq_after_const->input_value(2), fq_after_const->input_value(3), fq_after_const->input_value(4)});
+                new_ops.push_back(fc_input_b.get_node_shared_ptr());
+            }
         }
 
         if (shape_b.size() != 2) {
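
The change above applies the Transpose introduced for non-transposed weights to the constant that feeds the FakeQuantize, not to the FakeQuantize output, so constant folding can materialize the transposed weights; the FakeQuantize is then rebuilt on top of the transposed constant with its original range inputs. A minimal standalone sketch of that re-wiring with the public ngraph API (the function name, the fixed 2-D permutation, and the use of opset1::Transpose instead of the plugin's create_transpose helper are illustrative assumptions):

#include <memory>
#include <vector>
#include <ngraph/ngraph.hpp>
#include <ngraph/opsets/opset1.hpp>

using namespace ngraph;

// 'weights' is assumed to point at a FakeQuantize whose first input is the weight Constant,
// i.e. the pattern handled by the fq_after_const branch above.
Output<Node> transpose_weights_under_fq(const Output<Node>& weights) {
    auto fq = std::dynamic_pointer_cast<opset1::FakeQuantize>(weights.get_node_shared_ptr());
    // Build the Transpose on the raw constant, not on the FQ output,
    // so constant folding can fold the transpose into the constant.
    auto order = opset1::Constant::create(element::i64, Shape{2}, std::vector<int64_t>{1, 0});
    auto transposed = std::make_shared<opset1::Transpose>(fq->input_value(0), order);
    // Re-create the FQ over the transposed constant, reusing its range inputs (1..4).
    return fq->clone_with_new_inputs(OutputVector{transposed, fq->input_value(1),
        fq->input_value(2), fq->input_value(3), fq->input_value(4)});
}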


@@ -258,6 +258,9 @@ void QuantizationCallback<int8_t, gna_compound_bias_t>::runFakeQuantize() const
             channel_multiplier = scaled_row_max / static_cast<float>(MAX_VAL_1B_WEIGHT);
         }
+        // channel multiplier shouldn't be 0
+        channel_multiplier = channel_multiplier == 0 ? 1 : channel_multiplier;
+
         ptr_int_biases[i].multiplier = static_cast<uint8_t> (channel_multiplier + 0.5f);
 
         if (channel_multiplier > MAX_OUT_MULTIPLIER) {
             THROW_GNA_EXCEPTION << "invalid channel multiplier: " << channel_multiplier;
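
The added clamp targets the failure named in the commit title: if a row of weights yields a per-channel multiplier of exactly 0 (for example, an all-zero weight row), the uint8 multiplier written to ptr_int_biases stays 0 and the whole output channel is silently zeroed. A small self-contained illustration of that arithmetic (the bound mirrors MAX_VAL_1B_WEIGHT above; the input value is made up):

#include <cstdint>
#include <iostream>

int main() {
    const float MAX_VAL_1B_WEIGHT = 127.0f;   // assumed int8 weight bound, as in the plugin code above
    // Hypothetical per-channel maximum that collapsed to zero (e.g. an all-zero weight row).
    float scaled_row_max = 0.0f;
    float channel_multiplier = scaled_row_max / MAX_VAL_1B_WEIGHT;          // 0.0f

    uint8_t without_fix = static_cast<uint8_t>(channel_multiplier + 0.5f);  // 0 -> channel output is zeroed
    channel_multiplier = channel_multiplier == 0 ? 1 : channel_multiplier;  // the added clamp
    uint8_t with_fix = static_cast<uint8_t>(channel_multiplier + 0.5f);     // 1 -> channel survives

    std::cout << "multiplier without fix: " << int(without_fix)
              << ", with fix: " << int(with_fix) << "\n";
    return 0;
}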


@@ -0,0 +1,140 @@
// Copyright (C) 2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include <vector>
#include <memory>
#include <tuple>
#include <string>
#include <ie_core.hpp>
#include "common_test_utils/common_utils.hpp"
#include "functional_test_utils/plugin_cache.hpp"
#include "shared_test_classes/base/layer_test_utils.hpp"
#include "functional_test_utils/blob_utils.hpp"
#include "ngraph_functions/utils/ngraph_helpers.hpp"
#include "ngraph_functions/builders.hpp"
#include "ngraph_functions/pass/convert_prc.hpp"
typedef std::tuple<
InferenceEngine::Precision, // Network Precision
std::string, // Target Device
std::map<std::string, std::string>, // Configuration
std::pair<float, float>, // Weights values
std::vector<size_t> // Input shapes
> matmulParams;
namespace LayerTestsDefinitions {
class PerchannelQuantTest : public testing::WithParamInterface<matmulParams>,
public LayerTestsUtils::LayerTestsCommon {
public:
static std::string getTestCaseName(testing::TestParamInfo<matmulParams> obj) {
InferenceEngine::Precision netPrecision;
std::string targetDevice;
std::map<std::string, std::string> configuration;
std::pair<float, float> weightsValues;
std::vector<size_t> inputShape;
std::tie(netPrecision, targetDevice, configuration, weightsValues, inputShape) = obj.param;
std::ostringstream result;
result << "netPRC=" << netPrecision.name() << "_";
result << "targetDevice=" << targetDevice << "_";
for (auto const& configItem : configuration) {
result << "_configItem=" << configItem.first << "_" << configItem.second;
}
result << "_range=(" << weightsValues.first << ", " << weightsValues.second << ")";
result << "_IS=(" << CommonTestUtils::vec2str(inputShape) << ")";
return result.str();
}
protected:
void SetUp() override {
InferenceEngine::Precision netPrecision;
std::pair<float, float> weightsValues;
std::vector<size_t> inputShape;
std::tie(netPrecision, targetDevice, configuration, weightsValues, inputShape) = this->GetParam();
auto ngPrc = FuncTestUtils::PrecisionUtils::convertIE2nGraphPrc(netPrecision);
const ngraph::Shape constShape = {inputShape.back(), inputShape.back()};
auto params = ngraph::builder::makeParams(ngPrc, {inputShape});
std::vector<float> weights;
std::vector<float> weightsMin, weightsMax;
for (int i = 0; i < constShape.front(); ++i) {
// multiplier to increase weights ranges difference between different channels
float mul = (i % 2 ? 1.0 : 0.001);
float rowMin = weightsValues.first * mul;
float rowMax = weightsValues.second * mul;
auto rowWeights = CommonTestUtils::generate_float_numbers(constShape.back(), rowMin, rowMax);
weights.insert(std::end(weights), std::begin(rowWeights), std::end(rowWeights));
weightsMin.push_back(rowMin);
weightsMax.push_back(rowMax);
}
auto constant = ngraph::builder::makeConstant<float>(ngPrc, constShape, weights);
auto wLowNode = ngraph::builder::makeConstant<float>(ngPrc, {constShape.front()}, { weightsMin });
auto wHighNode = ngraph::builder::makeConstant<float>(ngPrc, {constShape.front()}, { weightsMax });
auto wFq = std::make_shared<ngraph::opset8::FakeQuantize>(constant, wLowNode, wHighNode, wLowNode, wHighNode,
std::numeric_limits<uint8_t>::max() - 1);
auto matmul = std::make_shared<ngraph::opset8::MatMul>(params[0], wFq, false, true);
ngraph::ResultVector results{std::make_shared<ngraph::opset8::Result>(matmul)};
function = std::make_shared<ngraph::Function>(results, params, "PerchannelQuantTest");
}
};
TEST_P(PerchannelQuantTest, CompareWithRefImpl) {
LoadNetwork();
GenerateInputs();
Infer();
auto results = GetOutputs();
size_t size = results.front()->size();
auto memory = InferenceEngine::as<InferenceEngine::MemoryBlob>(results.front());
IE_ASSERT(memory);
const auto lockedMemory = memory->wmap();
const auto actualBuffer = lockedMemory.as<const float*>();
// check that outputs haven't been zeroed out by a channel multiplier
for (size_t i = 0; i < size; ++i) {
if (actualBuffer[i] == 0.0) {
IE_THROW() << "Unexpected 0 output value";
}
}
};
const std::vector<InferenceEngine::Precision> netPrecisions = {
InferenceEngine::Precision::FP32,
InferenceEngine::Precision::FP16
};
const std::vector<std::map<std::string, std::string>> configs = {
{
{"GNA_DEVICE_MODE", "GNA_SW_EXACT"}
}
};
const std::vector<std::pair<float, float>> weightsValues = {
{-0.1, 0.1},
{-1.0, 1.0},
{-10.0, 10.0}
};
const std::vector<std::vector<size_t>> inputShapes = {
{1, 128},
{1, 38},
{1, 8}
};
INSTANTIATE_TEST_SUITE_P(smoke_base, PerchannelQuantTest,
::testing::Combine(
::testing::ValuesIn(netPrecisions),
::testing::Values(CommonTestUtils::DEVICE_GNA),
::testing::ValuesIn(configs),
::testing::ValuesIn(weightsValues),
::testing::ValuesIn(inputShapes)),
PerchannelQuantTest::getTestCaseName);
} // namespace LayerTestsDefinitions