[GNA] Support of overload correction for MatMul with 2 non-constant layers (#10447)
Commit 4e0a740eb3 (parent 09246e2db8)
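Background for the change: on GNA a MatMul accumulates int16 products into an int32 result, so the product of the two input scale factors must leave headroom for the largest output value seen in the statistics. When the second input is constant, the plugin can simply shrink that constant's scale factor; when both inputs are non-constant (the case added by this commit), one of the inputs has to be re-quantized instead. The sketch below is illustrative only: the reducer formula is an assumption based on the int32 limit, and the names (overload_reducer, max_abs_output) are made up, not plugin APIs.

// Illustrative sketch only: estimates how much the combined scale factor of a
// GNA MatMul must be reduced so that the int32 accumulator does not overflow.
// The formula mirrors the idea behind calculateWeightsReducerFromDstStats but
// is an assumption, not the plugin's exact implementation.
#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <limits>

// max_abs_output: largest absolute output value observed in statistics.
// src_scale / weights_scale: scale factors chosen for the two MatMul inputs.
static double overload_reducer(double max_abs_output, double src_scale, double weights_scale) {
    const double output_scale = src_scale * weights_scale;            // scale of the int32 result
    const double max_int32 = std::numeric_limits<int32_t>::max();
    return std::max(1.0, max_abs_output * output_scale / max_int32);  // 1.0 means "no correction needed"
}

int main() {
    // Toy numbers: both inputs quantized close to full int16 range for values in [-10, 10].
    const double src_scale = 32767.0 / 10.0;
    const double weights_scale = 32767.0 / 10.0;
    const double max_abs_output = 12800.0;  // e.g. 10 * 10 * 128 accumulated products
    std::printf("reducer = %.1f\n", overload_reducer(max_abs_output, src_scale, weights_scale));
    // Prints roughly 64: one of the input scale factors has to be lowered
    // (re-quantized) before the MatMul can run safely on GNA.
    return 0;
}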
@@ -112,7 +112,9 @@ ngraph::pass::ConvertMatMulToFC::ConvertMatMulToFC() {
        // we replace MatMul with FullyConnected operation.
        // Otherwise we replace MatMul with Gemm.
        auto fq_after_const = std::dynamic_pointer_cast<opset1::FakeQuantize>(fc_input_b.get_node_shared_ptr());
        if ((std::dynamic_pointer_cast<opset1::Constant> (fc_input_b.get_node_shared_ptr()) || fq_after_const) &&
        bool is_fq_after_const = fq_after_const &&
            std::dynamic_pointer_cast<opset1::Constant>(fc_input_b.get_node_shared_ptr()->input_value(0).get_node_shared_ptr());
        if ((std::dynamic_pointer_cast<opset1::Constant> (fc_input_b.get_node_shared_ptr()) || is_fq_after_const) &&
            std::count_if(shape_b.begin(), shape_b.end(), [](size_t x) {
                return x != 1;
            }) <= 2) {
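For reference, a minimal standalone sketch (not part of the commit) of the pattern the new is_fq_after_const flag recognizes: a FakeQuantize whose data input is a Constant is now treated like a constant second MatMul input. The toy graph and values below are hypothetical; only the opset1 casts mirror the code above.

// Sketch: classify a FakeQuantize sitting on top of a Constant, as the
// is_fq_after_const check does for the second MatMul input.
#include <memory>
#include <ngraph/ngraph.hpp>
#include <ngraph/opsets/opset1.hpp>

using namespace ngraph;

int main() {
    // Constant weights followed by a FakeQuantize, as a quantized IR would produce.
    auto weights = opset1::Constant::create(element::f32, Shape{2, 2}, {1.f, 2.f, 3.f, 4.f});
    auto in_low  = opset1::Constant::create(element::f32, Shape{1}, {0.f});
    auto in_high = opset1::Constant::create(element::f32, Shape{1}, {4.f});
    auto fq = std::make_shared<opset1::FakeQuantize>(weights, in_low, in_high, in_low, in_high, 256);

    std::shared_ptr<Node> second_input = fq;  // plays the role of fc_input_b
    auto fq_node = std::dynamic_pointer_cast<opset1::FakeQuantize>(second_input);
    bool is_fq_after_const = fq_node &&
        std::dynamic_pointer_cast<opset1::Constant>(second_input->input_value(0).get_node_shared_ptr());
    return is_fq_after_const ? 0 : 1;  // 0: treated as a constant second input
}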
@@ -207,6 +207,86 @@ static double calculateWeightsReducerFromDstStats(Quantization dst_quant) {
    return weightsReducer;
}

/**
 * @brief Tries to re-quantize an input to reach the desired output scale factor value.
 * This function searches for layers above which output scale factors can be changed:
 * - activations,
 * - constants,
 * - weightable layers (output scale factor is modified by modification of weights scale factor).
 * @param input input to be re-quantized
 * @param newOutputScale the desired output scale factor value
 * @param result information about the restarted layer
 * @return true if the input can be re-quantized
 */
static bool requantizeInput(InferenceEngine::CNNLayerPtr input, float newOutputScale, ScaleFactorUpdateResult &result) {
    auto layer = input;
    while (layer && !LayerInfo(layer).isInput() && !LayerInfo(layer).isMemory() && !LayerInfo(layer).isCopy()) {
        size_t prevInputIdx = 0;
        auto info = LayerInfo(layer);
        auto quantDataForInputLayer = InferenceEngine::getInjectedData<QuantizedLayerParams>(*layer);
        if (quantDataForInputLayer->_dst_quant.IsStatsSet()) {
            auto levels = LayerInfo(layer).has32BOutput() ? (std::numeric_limits<uint32_t>::max() + 1ul) :
                (std::numeric_limits<uint16_t>::max() + 1ul);
            auto maxSF = CalculateScaleFactorFromStats(levels, quantDataForInputLayer->_dst_quant.GetMinValues().front(),
                quantDataForInputLayer->_dst_quant.GetMaxValues().front());
            if (newOutputScale > maxSF) {
                gnalog() << layer->name << ": Scale factor " << newOutputScale << " is too large. The maximum scale factor: "
                    << maxSF << " levels=" << levels << " min=" << quantDataForInputLayer->_dst_quant.GetMinValues().front()
                    << " max=" << quantDataForInputLayer->_dst_quant.GetMaxValues().front() << "\n";
                return false;
            }
        }
        if (info.isActivation() || info.isConst()) {
            gnawarn() << "[WARNING] requantize " << layer->name
                << ". Layer new output scale: " << newOutputScale
                << ", was " << quantDataForInputLayer->_dst_quant.GetScale() << std::endl;
            quantDataForInputLayer->_dst_quant.SetScale(newOutputScale);
            result = ScaleFactorUpdateResult(layer.get());
            return true;
        }

        if (info.isWeightableIdentity() && !fp32eq(quantDataForInputLayer->_weights_quant.GetScale(), 1.0f)) {
            auto reducer = std::max(1.0f, quantDataForInputLayer->_dst_quant.GetScale() / newOutputScale);
            auto newWeightsScale = std::max(1.0f, quantDataForInputLayer->_weights_quant.GetScale() / reducer);
            quantDataForInputLayer->_weights_quant.SetScale(static_cast<int32_t>(newWeightsScale));
            quantDataForInputLayer->_dst_quant.SetScale(quantDataForInputLayer->_weights_quant.GetScale() *
                quantDataForInputLayer->_src_quant.GetScale());

            result = ScaleFactorUpdateResult(layer.get());
            return true;
        }

        if (info.isFullyConnected() || info.isConvolution()) {
            quantDataForInputLayer->_dst_quant.SetScale(newOutputScale);
            quantDataForInputLayer->_weights_quant.SetScale(newOutputScale / quantDataForInputLayer->_src_quant.GetScale());
            result = ScaleFactorUpdateResult(layer.get());
            return true;
        }

        if (LayerInfo(layer).isEltwise()) {
            // re-quantize bias branch for Eltwise layer
            if (!LayerInfo(input).has32BOutput()) {
                break;
            }

            for (uint8_t ix = 0; ix < 2; ++ix) {
                if (LayerInfo(InferenceEngine::CNNNetPrevLayer(layer, ix)).has32BOutput()) {
                    prevInputIdx = ix;
                    break;
                }
            }
            auto prevLayer = InferenceEngine::CNNNetPrevLayer(layer, prevInputIdx);
            auto prevQuantData = InferenceEngine::getInjectedData<QuantizedLayerParams>(*prevLayer);
            newOutputScale *= prevQuantData->_dst_quant.GetScale() / quantDataForInputLayer->_dst_quant.GetScale();
        }

        layer = InferenceEngine::CNNNetHasPrevLayer(layer.get(), prevInputIdx) ?
            InferenceEngine::CNNNetPrevLayer(layer, prevInputIdx) : nullptr;
    }

    return false;
}
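The maxSF guard in requantizeInput relies on CalculateScaleFactorFromStats. A minimal sketch of the usual convention, scale = (levels - 1) / (max - min), follows; treat the formula and helper name below as an assumption rather than the plugin's exact implementation.

// Sketch (assumed formula): the largest scale factor that still maps the
// observed [min, max] range onto the available integer levels.
#include <cstddef>
#include <cstdio>

static float scale_from_stats(std::size_t levels, float min_value, float max_value) {
    if (max_value == min_value) {
        return 1.0f;  // degenerate range, keep the default scale
    }
    return (levels - 1) / (max_value - min_value);
}

int main() {
    // 16-bit path of requantizeInput: levels = 2^16, stats say the tensor stays in [-10, 10].
    const float max_sf = scale_from_stats(65536u, -10.0f, 10.0f);
    std::printf("max scale factor = %.1f\n", max_sf);  // ~3276.8
    // requantizeInput refuses any newOutputScale larger than this value.
    return 0;
}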
/**
 * @brief calculates output scale factor per layer
 * @tparam T
@@ -1292,22 +1372,16 @@ public:
        auto quantParams0 = InferenceEngine::getInjectedData<QuantizedLayerParams>(in0);
        quantData->_src_quant.SetScale(quantParams0->_dst_quant.GetScale());
        quantData->_weights_quant.SetScale(quantParams1->_dst_quant.GetScale());
        if (quantData->_src_quant.IsStatsSet()) {
            auto getScale = [&quantParams0](size_t i) {
                return CalculateScaleFactorFromStats(quantParams0->_dst_quant.GetLevels(),
                    quantParams0->_dst_quant.GetMinValues(false)[i], quantParams0->_dst_quant.GetMaxValues(false)[i]);
            };
            float min_channel_scale = getScale(0);
            quantParams0->_dst_quant.SetScale(min_channel_scale);
            quantData->_src_quant.SetScale(min_channel_scale);
        }
        quantData->_dst_quant.SetScale(
            quantData->_src_quant.GetScale() * quantData->_weights_quant.GetScale());

        // If the first input is const it's possible to reduce its scale factor to avoid overflow
        if (LayerInfo(in0).isConst() && quantData->_dst_quant.IsStatsSet()) {
        if (!quantData->_dst_quant.IsStatsSet()) {
            return true;
        }

        // Adjust weights scale factor if output values exceed int32 maximum value
        auto weightsReducer = calculateWeightsReducerFromDstStats(quantData->_dst_quant);
        if (LayerInfo(in0).isConst()) {
            if (!fp32eq(weightsReducer, 1.0f)) {
                quantParams0->_dst_quant.SetScale(quantData->_src_quant.GetScale() / weightsReducer);
                quantData->_src_quant.SetScale(quantData->_src_quant.GetScale() / weightsReducer);
@@ -1318,6 +1392,18 @@ public:
            }

            quantData->_dst_quant.SetScale(quantData->_weights_quant.GetScale() * quantData->_src_quant.GetScale());
        } else {
            if (!fp32eq(weightsReducer, 1.0f)) {
                for (int i = 0; i < 2; ++i) {
                    auto input = InferenceEngine::CNNNetPrevLayer(gemmLayer, i);
                    auto quantParams = InferenceEngine::getInjectedData<QuantizedLayerParams>(input);
                    float newOutputScale = quantParams->_dst_quant.GetScale() / weightsReducer;
                    if (requantizeInput(input, newOutputScale, result)) {
                        return true;
                    }
                }
                THROW_GNA_EXCEPTION << "Unable to quantize " << gemmLayer->name;
            }
        }
        return true;
    }
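When neither MatMul input is constant, the loop above asks requantizeInput to lower one input's output scale by the weights reducer. The sketch below replays the arithmetic of the weightable-identity branch shown earlier with made-up numbers; it only illustrates those update rules, not the plugin itself.

// Illustrative arithmetic only: how a weightable-identity layer absorbs a
// requantization request (see the isWeightableIdentity branch above).
#include <algorithm>
#include <cstdio>

int main() {
    // Hypothetical current state of the layer feeding the MatMul.
    float src_scale = 2048.0f;      // _src_quant
    float weights_scale = 16.0f;    // _weights_quant
    float dst_scale = src_scale * weights_scale;  // _dst_quant = 32768

    // The MatMul asks for a 64x smaller output scale to avoid int32 overload.
    float new_output_scale = dst_scale / 64.0f;

    float reducer = std::max(1.0f, dst_scale / new_output_scale);       // 64
    float new_weights_scale = std::max(1.0f, weights_scale / reducer);  // clamped to 1
    float new_dst_scale = new_weights_scale * src_scale;                // 2048

    std::printf("reducer=%.1f new_weights_scale=%.1f new_dst_scale=%.1f\n",
                reducer, new_weights_scale, new_dst_scale);
    // The weights scale is clamped at 1, so the achieved dst scale (2048) stays
    // above the requested 512; the update rule is a best-effort reduction.
    return 0;
}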
@@ -0,0 +1,141 @@
// Copyright (C) 2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#include <vector>
#include <memory>
#include <tuple>
#include <vector>
#include <string>

#include <ie_core.hpp>

#include "common_test_utils/common_utils.hpp"
#include "functional_test_utils/plugin_cache.hpp"
#include "shared_test_classes/base/layer_test_utils.hpp"
#include "functional_test_utils/blob_utils.hpp"
#include "ngraph_functions/utils/ngraph_helpers.hpp"
#include "ngraph_functions/builders.hpp"

#include "ngraph_functions/pass/convert_prc.hpp"
typedef std::tuple<
    InferenceEngine::Precision,         // Network Precision
    std::string,                        // Target Device
    std::map<std::string, std::string>, // Configuration
    std::vector<size_t>,                // Input shape
    bool,                               // Constant second input
    bool                                // Swap inputs
> matmulOverloadCorrectionParams;
namespace LayerTestsDefinitions {

class MatMulOverloadCorrectionTest : public testing::WithParamInterface<matmulOverloadCorrectionParams>,
    public LayerTestsUtils::LayerTestsCommon {
public:
    static std::string getTestCaseName(testing::TestParamInfo<matmulOverloadCorrectionParams> obj) {
        InferenceEngine::Precision netPrecision;
        std::string targetDevice;
        std::map<std::string, std::string> configuration;
        std::vector<size_t> inputShape;
        bool isSecondInputConst, swapInputs;
        std::tie(netPrecision, targetDevice, configuration, inputShape, isSecondInputConst, swapInputs) = obj.param;

        std::ostringstream result;
        result << "netPRC=" << netPrecision.name() << "_";
        result << "targetDevice=" << targetDevice << "_";
        for (auto const& configItem : configuration) {
            result << "_configItem=" << configItem.first << "_" << configItem.second;
        }
        result << "_IS=" << CommonTestUtils::vec2str(inputShape);
        result << "_secondInput=" << (isSecondInputConst ? "const" : "param");
        result << "_swapInputs=" << swapInputs;

        return result.str();
    }
protected:
    void SetUp() override {
        InferenceEngine::Precision netPrecision;
        bool isSecondInputConst, swapInputs;
        std::vector<size_t> inputShape;

        std::tie(netPrecision, targetDevice, configuration, inputShape, isSecondInputConst, swapInputs) = this->GetParam();
        auto ngPrc = FuncTestUtils::PrecisionUtils::convertIE2nGraphPrc(netPrecision);

        const ngraph::Shape shape1 = inputShape;
        const ngraph::Shape shape2 = {1, inputShape[1] * inputShape[1]};
        const float maxInputValue = 10.0f;
        auto params = ngraph::builder::makeParams(ngPrc, {shape1});
        std::shared_ptr<ngraph::Node> input2;
        if (isSecondInputConst) {
            input2 = ngraph::builder::makeConstant<float>(ngPrc, ngraph::Shape{shape1[1], shape1[1]},
                CommonTestUtils::generate_float_numbers(shape2[1], 0.0f, maxInputValue));
        } else {
            input2 = ngraph::builder::makeInputLayer(ngPrc, ngraph::helpers::InputLayerType::PARAMETER, shape2);
            params.push_back(std::dynamic_pointer_cast<ngraph::opset8::Parameter>(input2));
        }

        auto lowNodeIn1 = ngraph::builder::makeConstant<float>(ngPrc, {1}, { -maxInputValue });
        auto highNodeIn1 = ngraph::builder::makeConstant<float>(ngPrc, {1}, { maxInputValue });
        auto fqIn1 = std::make_shared<ngraph::opset8::FakeQuantize>(params[0], lowNodeIn1, highNodeIn1,
            lowNodeIn1, highNodeIn1, levels16);

        auto lowNodeIn2 = ngraph::builder::makeConstant<float>(ngPrc, {1}, { -maxInputValue });
        auto highNodeIn2 = ngraph::builder::makeConstant<float>(ngPrc, {1}, { maxInputValue });
        auto fqIn2 = std::make_shared<ngraph::opset8::FakeQuantize>(input2, lowNodeIn2, highNodeIn2,
            lowNodeIn2, highNodeIn2, levels16);

        std::shared_ptr<ngraph::Node> matmul_input2 = fqIn2;
        if (!isSecondInputConst) {
            auto pattern = std::make_shared<ngraph::opset8::Constant>(ngraph::element::Type_t::i64,
                ngraph::Shape{ 2 }, ngraph::Shape{shape1[1], shape1[1]});
            matmul_input2 = std::make_shared<ngraph::opset8::Reshape>(fqIn2, pattern, false);
        }

        auto matmul = swapInputs ? std::dynamic_pointer_cast<ngraph::opset8::MatMul>(ngraph::builder::makeMatMul(matmul_input2, fqIn1, false, true)) :
            std::dynamic_pointer_cast<ngraph::opset8::MatMul>(ngraph::builder::makeMatMul(fqIn1, matmul_input2, false, true));

        auto lowNodeOut = ngraph::builder::makeConstant<float>(ngPrc, {1}, { -maxInputValue * maxInputValue * inputShape[1] / 10 });
        auto highNodeOut = ngraph::builder::makeConstant<float>(ngPrc, {1}, { maxInputValue * maxInputValue * inputShape[1] / 10 });
        auto fqOut = std::make_shared<ngraph::opset8::FakeQuantize>(matmul, lowNodeOut, highNodeOut,
            lowNodeOut, highNodeOut, levels32);

        ngraph::ResultVector results{std::make_shared<ngraph::opset8::Result>(fqOut)};
        function = std::make_shared<ngraph::Function>(results, params, "MatMulOverloadCorrection");
    }

    const size_t levels16 = std::numeric_limits<uint16_t>::max();
    const size_t levels32 = std::numeric_limits<uint32_t>::max();
};
TEST_P(MatMulOverloadCorrectionTest, CompareWithRefImpl) {
    Run();
};
const std::vector<InferenceEngine::Precision> netPrecisions = {
    InferenceEngine::Precision::FP32,
    InferenceEngine::Precision::FP16
};

const std::vector<std::map<std::string, std::string>> configs = {
    {
        {"GNA_DEVICE_MODE", "GNA_SW_EXACT"}
    }
};

const std::vector<std::vector<size_t>> inputShapes = {
    {1, 128},
    {1, 256}
};

INSTANTIATE_TEST_SUITE_P(smoke_base, MatMulOverloadCorrectionTest,
    ::testing::Combine(
        ::testing::ValuesIn(netPrecisions),
        ::testing::Values(CommonTestUtils::DEVICE_GNA),
        ::testing::ValuesIn(configs),
        ::testing::ValuesIn(inputShapes),
        ::testing::ValuesIn({true, false}),
        ::testing::ValuesIn({true, false})),
    MatMulOverloadCorrectionTest::getTestCaseName);
} // namespace LayerTestsDefinitions