[GNA] Support of overload correction for MatMul with 2 non-constant layers (#10447)
parent 09246e2db8
commit 4e0a740eb3
@@ -112,7 +112,9 @@ ngraph::pass::ConvertMatMulToFC::ConvertMatMulToFC() {
         // we replace MatMul with FullyConnected operation.
         // Otherwise we replace MatMul with Gemm.
         auto fq_after_const = std::dynamic_pointer_cast<opset1::FakeQuantize>(fc_input_b.get_node_shared_ptr());
-        if ((std::dynamic_pointer_cast<opset1::Constant> (fc_input_b.get_node_shared_ptr()) || fq_after_const) &&
+        bool is_fq_after_const = fq_after_const &&
+            std::dynamic_pointer_cast<opset1::Constant>(fc_input_b.get_node_shared_ptr()->input_value(0).get_node_shared_ptr());
+        if ((std::dynamic_pointer_cast<opset1::Constant> (fc_input_b.get_node_shared_ptr()) || is_fq_after_const) &&
             std::count_if(shape_b.begin(), shape_b.end(), [](size_t x) {
                 return x != 1;
             }) <= 2) {
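For context, the new is_fq_after_const flag only treats the second MatMul input as constant weights when the FakeQuantize is fed directly by a Constant. The standalone sketch below (illustrative only, not part of this change; the function name, shapes, and values are made up) builds such a graph with the ngraph opset8 API so the accepted pattern is easy to see.

#include <ngraph/ngraph.hpp>
#include <ngraph/opsets/opset8.hpp>

// Second input is Constant -> FakeQuantize, i.e. the case the updated check accepts.
std::shared_ptr<ngraph::Function> makeFqWeightsMatMul() {
    using namespace ngraph;
    auto input = std::make_shared<opset8::Parameter>(element::f32, Shape{1, 8});
    auto weights = opset8::Constant::create(element::f32, Shape{8, 8}, std::vector<float>(64, 0.1f));
    auto low = opset8::Constant::create(element::f32, Shape{1}, {-1.0f});
    auto high = opset8::Constant::create(element::f32, Shape{1}, {1.0f});
    // FakeQuantize directly after the Constant: treated like a plain constant second input.
    auto fq = std::make_shared<opset8::FakeQuantize>(weights, low, high, low, high, 65535);
    auto matmul = std::make_shared<opset8::MatMul>(input, fq, false, true);
    return std::make_shared<Function>(OutputVector{matmul->output(0)}, ParameterVector{input});
}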
@@ -207,6 +207,86 @@ static double calculateWeightsReducerFromDstStats(Quantization dst_quant) {
     return weightsReducer;
 }
 
+/**
+ * @brief Tries to re-quantize an input to reach the desired output scale factor value.
+ * This function searches for layers above which output scale factors can be changed:
+ * - activations,
+ * - constants,
+ * - weightable layers (output scale factor is modified by modification of weights scale factor).
+ * @param input input to be re-quantized
+ * @param newOutputScale the desired output scale factor value
+ * @param result information about the restarted layer
+ * @return true if the input can be re-quantized
+ */
+static bool requantizeInput(InferenceEngine::CNNLayerPtr input, float newOutputScale, ScaleFactorUpdateResult &result) {
+    auto layer = input;
+    while (layer && !LayerInfo(layer).isInput() && !LayerInfo(layer).isMemory() && !LayerInfo(layer).isCopy()) {
+        size_t prevInputIdx = 0;
+        auto info = LayerInfo(layer);
+        auto quantDataForInputLayer = InferenceEngine::getInjectedData<QuantizedLayerParams>(*layer);
+        if (quantDataForInputLayer->_dst_quant.IsStatsSet()) {
+            auto levels = LayerInfo(layer).has32BOutput() ? (std::numeric_limits<uint32_t>::max() + 1ul) :
+                (std::numeric_limits<uint16_t>::max() + 1ul);
+            auto maxSF = CalculateScaleFactorFromStats(levels, quantDataForInputLayer->_dst_quant.GetMinValues().front(),
+                quantDataForInputLayer->_dst_quant.GetMaxValues().front());
+            if (newOutputScale > maxSF) {
+                gnalog() << layer->name << ": Scale factor " << newOutputScale << " is too large. The maximum scale factor: "
+                    << maxSF << " levels=" << levels << " min=" << quantDataForInputLayer->_dst_quant.GetMinValues().front()
+                    << " max=" << quantDataForInputLayer->_dst_quant.GetMaxValues().front() << "\n";
+                return false;
+            }
+        }
+        if (info.isActivation() || info.isConst()) {
+            gnawarn() << "[WARNING] requantize " << layer->name
+                << ". Layer new output scale: " << newOutputScale
+                << ", was " << quantDataForInputLayer->_dst_quant.GetScale() << std::endl;
+            quantDataForInputLayer->_dst_quant.SetScale(newOutputScale);
+            result = ScaleFactorUpdateResult(layer.get());
+            return true;
+        }
+
+        if (info.isWeightableIdentity() && !fp32eq(quantDataForInputLayer->_weights_quant.GetScale(), 1.0f)) {
+            auto reducer = std::max(1.0f, quantDataForInputLayer->_dst_quant.GetScale() / newOutputScale);
+            auto newWeightsScale = std::max(1.0f, quantDataForInputLayer->_weights_quant.GetScale() / reducer);
+            quantDataForInputLayer->_weights_quant.SetScale(static_cast<int32_t>(newWeightsScale));
+            quantDataForInputLayer->_dst_quant.SetScale(quantDataForInputLayer->_weights_quant.GetScale() *
+                quantDataForInputLayer->_src_quant.GetScale());
+
+            result = ScaleFactorUpdateResult(layer.get());
+            return true;
+        }
+
+        if (info.isFullyConnected() || info.isConvolution()) {
+            quantDataForInputLayer->_dst_quant.SetScale(newOutputScale);
+            quantDataForInputLayer->_weights_quant.SetScale(newOutputScale / quantDataForInputLayer->_src_quant.GetScale());
+            result = ScaleFactorUpdateResult(layer.get());
+            return true;
+        }
+
+        if (LayerInfo(layer).isEltwise()) {
+            // re-quantize bias branch for Eltwise layer
+            if (!LayerInfo(input).has32BOutput()) {
+                break;
+            }
+
+            for (uint8_t ix = 0; ix < 2; ++ix) {
+                if (LayerInfo(InferenceEngine::CNNNetPrevLayer(layer, ix)).has32BOutput()) {
+                    prevInputIdx = ix;
+                    break;
+                }
+            }
+            auto prevLayer = InferenceEngine::CNNNetPrevLayer(layer, prevInputIdx);
+            auto prevQuantData = InferenceEngine::getInjectedData<QuantizedLayerParams>(*prevLayer);
+            newOutputScale *= prevQuantData->_dst_quant.GetScale() / quantDataForInputLayer->_dst_quant.GetScale();
+        }
+
+        layer = InferenceEngine::CNNNetHasPrevLayer(layer.get(), prevInputIdx) ?
+            InferenceEngine::CNNNetPrevLayer(layer, prevInputIdx) : nullptr;
+    }
+
+    return false;
+}
+
 /**
  * @brief calculates output scale factor per layer
  * @tparam T
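The weightable-identity branch above lowers a layer's output scale factor indirectly, by shrinking its weights scale factor. A standalone arithmetic sketch of that computation (plain C++, made-up scale factor values, not plugin code):

#include <algorithm>
#include <cstdio>

int main() {
    float srcScale = 2048.0f;      // assumed incoming (source) scale factor
    float weightsScale = 16.0f;    // assumed identity weights scale factor
    float dstScale = srcScale * weightsScale;   // 32768
    float newOutputScale = 8192.0f;             // target requested by the layer below

    float reducer = std::max(1.0f, dstScale / newOutputScale);          // 4
    float newWeightsScale = std::max(1.0f, weightsScale / reducer);     // 4
    float newDstScale = newWeightsScale * srcScale;                     // 8192, the requested value

    std::printf("reducer=%g newWeightsScale=%g newDstScale=%g\n",
                reducer, newWeightsScale, newDstScale);
    return 0;
}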
@@ -1292,22 +1372,16 @@ public:
         auto quantParams0 = InferenceEngine::getInjectedData<QuantizedLayerParams>(in0);
         quantData->_src_quant.SetScale(quantParams0->_dst_quant.GetScale());
         quantData->_weights_quant.SetScale(quantParams1->_dst_quant.GetScale());
-        if (quantData->_src_quant.IsStatsSet()) {
-            auto getScale = [&quantParams0](size_t i) {
-                return CalculateScaleFactorFromStats(quantParams0->_dst_quant.GetLevels(),
-                    quantParams0->_dst_quant.GetMinValues(false)[i], quantParams0->_dst_quant.GetMaxValues(false)[i]);
-            };
-            float min_channel_scale = getScale(0);
-            quantParams0->_dst_quant.SetScale(min_channel_scale);
-            quantData->_src_quant.SetScale(min_channel_scale);
-        }
         quantData->_dst_quant.SetScale(
             quantData->_src_quant.GetScale() * quantData->_weights_quant.GetScale());
 
-        // If the first input is const it's possible to reduce its scale factor to avoid overflow
-        if (LayerInfo(in0).isConst() && quantData->_dst_quant.IsStatsSet()) {
-            // Adjust weights scale factor if output values exceed int32 maximum value
-            auto weightsReducer = calculateWeightsReducerFromDstStats(quantData->_dst_quant);
+        if (!quantData->_dst_quant.IsStatsSet()) {
+            return true;
+        }
+
+        // Adjust weights scale factor if output values exceed int32 maximum value
+        auto weightsReducer = calculateWeightsReducerFromDstStats(quantData->_dst_quant);
+        if (LayerInfo(in0).isConst()) {
             if (!fp32eq(weightsReducer, 1.0f)) {
                 quantParams0->_dst_quant.SetScale(quantData->_src_quant.GetScale() / weightsReducer);
                 quantData->_src_quant.SetScale(quantData->_src_quant.GetScale() / weightsReducer);
@@ -1318,6 +1392,18 @@ public:
             }
 
             quantData->_dst_quant.SetScale(quantData->_weights_quant.GetScale() * quantData->_src_quant.GetScale());
+        } else {
+            if (!fp32eq(weightsReducer, 1.0f)) {
+                for (int i = 0; i < 2; ++i) {
+                    auto input = InferenceEngine::CNNNetPrevLayer(gemmLayer, i);
+                    auto quantParams = InferenceEngine::getInjectedData<QuantizedLayerParams>(input);
+                    float newOutputScale = quantParams->_dst_quant.GetScale() / weightsReducer;
+                    if (requantizeInput(input, newOutputScale, result)) {
+                        return true;
+                    }
+                }
+                THROW_GNA_EXCEPTION << "Unable to quantize " << gemmLayer->name;
+            }
         }
         return true;
     }
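When neither Gemm input is constant, the overflow cannot be absorbed by rescaling constant weights, so each input is offered a new output scale of its current scale divided by weightsReducer, and requantizeInput walks upwards to apply it. calculateWeightsReducerFromDstStats itself is not shown in this diff; the standalone sketch below only illustrates the int32-accumulator saturation condition such a reducer guards against, with made-up statistics and scale factors rather than the plugin's exact formula.

#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <limits>

int main() {
    const float maxOutput = 12800.0f;    // assumed |max| of the real output, from collected dst statistics
    const float srcScale = 3276.0f;      // assumed input scale factor
    const float weightsScale = 1024.0f;  // assumed second-input (weights) scale factor
    const double dstScale = static_cast<double>(srcScale) * weightsScale;

    // Factor by which the scaled output would overshoot the int32 range, clamped to >= 1.
    const double reducer = std::max(1.0,
        maxOutput * dstScale / std::numeric_limits<int32_t>::max());

    std::printf("dstScale=%g reducer=%g corrected weightsScale=%g\n",
                dstScale, reducer, weightsScale / reducer);
    return 0;
}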
@@ -0,0 +1,141 @@
+// Copyright (C) 2022 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <vector>
+#include <memory>
+#include <tuple>
+#include <vector>
+#include <string>
+
+#include <ie_core.hpp>
+
+#include "common_test_utils/common_utils.hpp"
+#include "functional_test_utils/plugin_cache.hpp"
+#include "shared_test_classes/base/layer_test_utils.hpp"
+#include "functional_test_utils/blob_utils.hpp"
+#include "ngraph_functions/utils/ngraph_helpers.hpp"
+#include "ngraph_functions/builders.hpp"
+
+#include "ngraph_functions/pass/convert_prc.hpp"
+
+typedef std::tuple<
+    InferenceEngine::Precision,          // Network Precision
+    std::string,                         // Target Device
+    std::map<std::string, std::string>,  // Configuration
+    std::vector<size_t>,                 // Input shape
+    bool,                                // Constant second input
+    bool                                 // Swap inputs
+> matmulOverloadCorrectionParams;
+
+namespace LayerTestsDefinitions {
+
+class MatMulOverloadCorrectionTest : public testing::WithParamInterface<matmulOverloadCorrectionParams>,
+    public LayerTestsUtils::LayerTestsCommon {
+public:
+    static std::string getTestCaseName(testing::TestParamInfo<matmulOverloadCorrectionParams> obj) {
+        InferenceEngine::Precision netPrecision;
+        std::string targetDevice;
+        std::map<std::string, std::string> configuration;
+        std::vector<size_t> inputShape;
+        bool isSecondInputConst, swapInputs;
+        std::tie(netPrecision, targetDevice, configuration, inputShape, isSecondInputConst, swapInputs) = obj.param;
+
+        std::ostringstream result;
+        result << "netPRC=" << netPrecision.name() << "_";
+        result << "targetDevice=" << targetDevice << "_";
+        for (auto const& configItem : configuration) {
+            result << "_configItem=" << configItem.first << "_" << configItem.second;
+        }
+        result << "_IS=" << CommonTestUtils::vec2str(inputShape);
+        result << "_secondInput=" << (isSecondInputConst ? "const" : "param");
+        result << "_swapInputs=" << swapInputs;
+
+        return result.str();
+    }
+
+protected:
+    void SetUp() override {
+        InferenceEngine::Precision netPrecision;
+        bool isSecondInputConst, swapInputs;
+        std::vector<size_t> inputShape;
+
+        std::tie(netPrecision, targetDevice, configuration, inputShape, isSecondInputConst, swapInputs) = this->GetParam();
+        auto ngPrc = FuncTestUtils::PrecisionUtils::convertIE2nGraphPrc(netPrecision);
+
+        const ngraph::Shape shape1 = inputShape;
+        const ngraph::Shape shape2 = {1, inputShape[1] * inputShape[1]};
+        const float maxInputValue = 10.0f;
+        auto params = ngraph::builder::makeParams(ngPrc, {shape1});
+        std::shared_ptr<ngraph::Node> input2;
+        if (isSecondInputConst) {
+            input2 = ngraph::builder::makeConstant<float>(ngPrc, ngraph::Shape{shape1[1], shape1[1]},
+                CommonTestUtils::generate_float_numbers(shape2[1], 0.0f, maxInputValue));
+        } else {
+            input2 = ngraph::builder::makeInputLayer(ngPrc, ngraph::helpers::InputLayerType::PARAMETER, shape2);
+            params.push_back(std::dynamic_pointer_cast<ngraph::opset8::Parameter>(input2));
+        }
+
+        auto lowNodeIn1 = ngraph::builder::makeConstant<float>(ngPrc, {1}, { -maxInputValue });
+        auto highNodeIn1 = ngraph::builder::makeConstant<float>(ngPrc, {1}, { maxInputValue });
+        auto fqIn1 = std::make_shared<ngraph::opset8::FakeQuantize>(params[0], lowNodeIn1, highNodeIn1,
+            lowNodeIn1, highNodeIn1, levels16);
+
+        auto lowNodeIn2 = ngraph::builder::makeConstant<float>(ngPrc, {1}, { -maxInputValue });
+        auto highNodeIn2 = ngraph::builder::makeConstant<float>(ngPrc, {1}, { maxInputValue });
+        auto fqIn2 = std::make_shared<ngraph::opset8::FakeQuantize>(input2, lowNodeIn2, highNodeIn2,
+            lowNodeIn2, highNodeIn2, levels16);
+
+        std::shared_ptr<ngraph::Node> matmul_input2 = fqIn2;
+        if (!isSecondInputConst) {
+            auto pattern = std::make_shared<ngraph::opset8::Constant>(ngraph::element::Type_t::i64,
+                ngraph::Shape{ 2 }, ngraph::Shape{shape1[1], shape1[1]});
+            matmul_input2 = std::make_shared<ngraph::opset8::Reshape>(fqIn2, pattern, false);
+        }
+
+        auto matmul = swapInputs ? std::dynamic_pointer_cast<ngraph::opset8::MatMul>(ngraph::builder::makeMatMul(matmul_input2, fqIn1, false, true)) :
+            std::dynamic_pointer_cast<ngraph::opset8::MatMul>(ngraph::builder::makeMatMul(fqIn1, matmul_input2, false, true));
+
+        auto lowNodeOut = ngraph::builder::makeConstant<float>(ngPrc, {1}, { -maxInputValue * maxInputValue * inputShape[1] / 10 });
+        auto highNodeOut = ngraph::builder::makeConstant<float>(ngPrc, {1}, { maxInputValue * maxInputValue * inputShape[1] / 10 });
+        auto fqOut = std::make_shared<ngraph::opset8::FakeQuantize>(matmul, lowNodeOut, highNodeOut,
+            lowNodeOut, highNodeOut, levels32);
+
+        ngraph::ResultVector results{std::make_shared<ngraph::opset8::Result>(fqOut)};
+        function = std::make_shared<ngraph::Function>(results, params, "MatMulOverloadCorrection");
+    }
+
+    const size_t levels16 = std::numeric_limits<uint16_t>::max();
+    const size_t levels32 = std::numeric_limits<uint32_t>::max();
+};
+
+TEST_P(MatMulOverloadCorrectionTest, CompareWithRefImpl) {
+    Run();
+};
+
+const std::vector<InferenceEngine::Precision> netPrecisions = {
+    InferenceEngine::Precision::FP32,
+    InferenceEngine::Precision::FP16
+};
+
+const std::vector<std::map<std::string, std::string>> configs = {
+    {
+        {"GNA_DEVICE_MODE", "GNA_SW_EXACT"}
+    }
+};
+
+const std::vector<std::vector<size_t>> inputShapes = {
+    {1, 128},
+    {1, 256}
+};
+
+INSTANTIATE_TEST_SUITE_P(smoke_base, MatMulOverloadCorrectionTest,
+    ::testing::Combine(
+        ::testing::ValuesIn(netPrecisions),
+        ::testing::Values(CommonTestUtils::DEVICE_GNA),
+        ::testing::ValuesIn(configs),
+        ::testing::ValuesIn(inputShapes),
+        ::testing::ValuesIn({true, false}),
+        ::testing::ValuesIn({true, false})),
+    MatMulOverloadCorrectionTest::getTestCaseName);
+} // namespace LayerTestsDefinitions