[GNA] Added fix for eltwise layer with more than 65k elements (#1943)
This commit is contained in:
parent 245920a95d
commit bdbb04f47b
@@ -256,6 +256,22 @@ inline int CNNLayerFindOutDataIdx(CNNLayerPtr layer, int insDataIdx) {
    return std::distance(prevLayer->outData.begin(), outDataIt);
}

/// @brief utility to locate output data from a given insData index and a given layer
/// also returns an iterator that represents the link to this layer in the inputTo map
inline std::pair<DataPtr, std::map<std::string, CNNLayerPtr>::iterator> CNNLayerFindOutData(CNNLayerPtr layer, int insDataIdx) {
    auto oDataIdx = CNNLayerFindOutDataIdx(layer, insDataIdx);
    auto prevLayer = CNNNetPrevLayer(layer, insDataIdx);
    auto oData = prevLayer->outData[oDataIdx];
    for (auto inputTo = getInputTo(oData).begin();
         inputTo != getInputTo(oData).end();
         inputTo++) {
        if (inputTo->second == layer) {
            return {oData, inputTo};
        }
    }
    THROW_GNA_LAYER_EXCEPTION(layer) << "cannot locate input data for: " << insDataIdx;
}

/**
 * @brief swap two layers in a graph - modifying input/output references
 * also, if the layers have different dimensions those are preserved, so the layers should be dimension-agnostic
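For orientation, here is a minimal usage sketch of the new CNNLayerFindOutData helper (illustrative only, not part of the patch; `layer`, `k`, and `newConsumer` are hypothetical names). The returned pair carries the producer's output DataPtr and the iterator into that data's inputTo map, so a consumer edge can be re-pointed in place, exactly as EltwiseSplitOverChannelsPass does further down.

    // Illustrative sketch - re-point the k-th input edge of `layer` to `newConsumer`.
    auto found = CNNLayerFindOutData(layer, k);  // {producer outData, iterator into its inputTo map}
    DataPtr producerData = found.first;          // output data of the producing layer
    found.second->second = newConsumer;          // producer now feeds newConsumer instead of layer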
@@ -368,6 +368,7 @@ void GNAPlugin::LoadNetwork(ICNNNetwork & _network) {
    passes->registerPass<SubstituteSoftSignPass>();

    passes->registerPass<ReorderMaxPoolPass>();
    passes->registerPass<EltwiseSplitOverChannelsPass>();
    passes->registerPass<InsertSplitAligningFilterPass>();

    passes->registerPass<InsertConcatAligningFilterPass>();
@@ -49,6 +49,19 @@ class Policy {
        REMOVE_LAST,
        REMOVE_ALL
    } NHWCToNCHWPolicy = NHWCToNCHW::REMOVE_ALL;

    /**
     * @brief limit on the maximum number of elements for a GNA diagonal affine layer
     */
    class GNAAffineDiagonal {
     public:
        enum : uint32_t {
            UNLIMIT,
            // GNA limits this to 0xFFFF
            LIMITED_TO_DEFAULT_GNA2_65536 = 65536 - 64
        };
        uint32_t limitedTo = LIMITED_TO_DEFAULT_GNA2_65536;
    } GNAAffineDiagonalPolicy;
};

inline std::ostream& operator<<(std::ostream& os, Policy::ScaleShift policy) {
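A quick sanity check of the new default limit (a standalone sketch, not plugin code; the 0xFFFF cap comes from the comment above, while the reasoning about 64-element granularity is an assumption):

    // Standalone sketch of the limit arithmetic.
    #include <cstdint>

    constexpr uint32_t kGnaHwCap = 0xFFFF;           // per the comment above: GNA caps this at 0xFFFF
    constexpr uint32_t kDefaultLimit = 65536 - 64;   // LIMITED_TO_DEFAULT_GNA2_65536 == 65472

    static_assert(kDefaultLimit < kGnaHwCap, "default stays under the HW cap");
    static_assert(kDefaultLimit % 64 == 0, "assumption: kept to a multiple of 64");

So any eltwise output with more than 65472 elements (such as the 1x67000 tensor in the new test) triggers the split pass, while setting the policy to UNLIMIT disables it.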
@@ -626,6 +626,20 @@ void InsertIdentityLayerPass::run() {

        CNNLayerPtr activationLayer =
            std::make_shared<GenericLayer>(LayerParams({activationName, "identity", Precision::FP32}));

        // TODO: why is the index 0? - better to use direct indexing in getCandidateFunction
        // detecting ins-data-idx
        size_t insDataIdx = std::numeric_limits<size_t>::max();
        for (size_t i = 0; i != l->insData.size(); i++) {
            if (getCreatorLayer(l->insData[i].lock()).lock() == prev) {
                insDataIdx = i;
                break;
            }
        }
        if (insDataIdx == std::numeric_limits<size_t>::max()) {
            THROW_GNA_EXCEPTION << "cannot insert identity layer after " << prev->name << " and before " << l->name;
        }

        auto inputData = l->insData[0].lock();

        auto dataPtr = std::make_shared<Data>("identity_data_" + std::to_string(numOfIdentityLayers), inputData->getTensorDesc());
@@ -1009,6 +1023,107 @@ static InferenceEngine::Blob::Ptr tileBlob(Blob::Ptr& blob, size_t TileTo) {
    return tiledBlob;
}

void EltwiseSplitOverChannelsPass::run() {
    if (getPassManager()->getPolicy().GNAAffineDiagonalPolicy.limitedTo == Policy::GNAAffineDiagonal::UNLIMIT) {
        return;
    }

    for (auto & l : *pLayers) {
        if (!LayerInfo(l).isEltwise()) {
            continue;
        }
        auto masterEltwise = std::dynamic_pointer_cast<EltwiseLayer>(l);
        if (l->outData.size() != 1) {
            THROW_GNA_LAYER_EXCEPTION(l) << "number of outputs expected to be 1";
        }
        auto oData = l->outData.front();
        auto totalElementsForOutput = details::product(oData->getDims().begin(), oData->getDims().end());
        auto maxAffineElements = getPassManager()->getPolicy().GNAAffineDiagonalPolicy.limitedTo;
        if (totalElementsForOutput <= maxAffineElements) {
            continue;
        }

        // TODO: for now, restrict the split to 2 parts
        auto totalSplits = 1 + totalElementsForOutput / maxAffineElements;
        if (totalSplits > 2) {
            THROW_GNA_LAYER_EXCEPTION(l) << "splitting a layer over output channels into more than 2 parts is not supported";
        }

        pass_trace() << "transforming " << LAYER_NAME(l) << " by splitting it to multiple eltwise operations\n";
        auto quantized = InferenceEngine::getInjectedData<QuantizedLayerParams>(l);

        std::vector<CNNLayerPtr> splitLayers(2);
        for (size_t kThEltwiseInput = 0; kThEltwiseInput != 2; kThEltwiseInput++) {
            // create split layer
            auto splitRaw = std::make_shared<SplitLayer>(
                    LayerParams{l->name + "/split/" + std::to_string(kThEltwiseInput), "Split", Precision::FP32});
            auto split = quantized ? InferenceEngine::injectData<QuantizedLayerParams>(splitRaw) : splitRaw;
            splitLayers[kThEltwiseInput] = split;

            split->insData.push_back(l->insData[kThEltwiseInput]);
            auto inputDesc = l->insData[kThEltwiseInput].lock()->getTensorDesc();
            // need to split this desc
            if (inputDesc.getLayout() != Layout::NC) {
                THROW_GNA_LAYER_EXCEPTION(l)
                    << "cannot split over channels: input " << std::to_string(kThEltwiseInput)
                    << " layout needs to be NC";
            }

            // create split layer outputs
            for (size_t i = 0;; i++) {
                auto elements_num = std::min(totalElementsForOutput - i * maxAffineElements,
                        static_cast<size_t>(maxAffineElements));

                SizeVector newDims = {1, elements_num};
                auto newDesc = TensorDesc(inputDesc.getPrecision(), newDims, inputDesc.getLayout());
                auto data = std::make_shared<Data>(l->name + "/" + std::to_string(kThEltwiseInput) + "/1", newDesc);
                getCreatorLayer(data) = split;
                split->outData.push_back(data);

                if (elements_num != maxAffineElements) {
                    break;
                }
            }
            // replacing connection X->eltwise with X->split
            auto oData = CNNLayerFindOutData(l, kThEltwiseInput);
            oData.second->second = split;
        }

        // create concat layer
        auto concatRaw = std::make_shared<ConcatLayer>(
                LayerParams{l->name + "/concat", "Concat", Precision::FP32});
        auto concat = quantized ? InferenceEngine::injectData<QuantizedLayerParams>(concatRaw) : concatRaw;

        concat->outData.push_back(masterEltwise->outData.front());
        getCreatorLayer(masterEltwise->outData.front()) = concat;


        // create new eltwise layers - hardcoded to 2 here
        for (size_t k = 0; k != totalSplits; k++) {
            auto eltwiseRaw = std::make_shared<EltwiseLayer>(
                    LayerParams{l->name + "/eltwise/" + std::to_string(k), "Eltwise", Precision::FP32});
            eltwiseRaw->_operation = masterEltwise->_operation;
            eltwiseRaw->coeff = masterEltwise->coeff;
            auto eltwise = quantized ? InferenceEngine::injectData<QuantizedLayerParams>(eltwiseRaw) : eltwiseRaw;


            eltwise->insData.push_back(splitLayers[0]->outData[k]);
            eltwise->insData.push_back(splitLayers[1]->outData[k]);
            getInputTo(splitLayers[0]->outData[k])[eltwise->name] = eltwise;
            getInputTo(splitLayers[1]->outData[k])[eltwise->name] = eltwise;

            SizeVector newDims = splitLayers[1]->outData[k]->getDims();
            auto newDesc = TensorDesc(splitLayers[1]->outData[k]->getPrecision(), newDims,
                    splitLayers[1]->outData[k]->getLayout());
            auto data = std::make_shared<Data>(l->name + "/elwise/out/" + std::to_string(k), newDesc);
            getCreatorLayer(data) = eltwise;
            eltwise->outData.push_back(data);
            getInputTo(data)[concat->name] = concat;
            concat->insData.push_back(data);
        }
    }
}

void SubstituteScaleShiftBroadCastPass::run() {
    for (auto & l : *pLayers) {
        LayerInfo layerInfo(l);
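To make the split arithmetic above concrete, here is a small standalone sketch (illustrative only; it reuses the default limit and the 1x67000 shape from the new test) that partitions the element count the same way the inner `for (size_t i = 0;; i++)` loop does:

    // Illustrative sketch of the chunking done by EltwiseSplitOverChannelsPass.
    #include <algorithm>
    #include <cstddef>
    #include <iostream>
    #include <vector>

    int main() {
        const size_t maxAffineElements = 65536 - 64;  // Policy::GNAAffineDiagonal default (65472)
        const size_t totalElements = 67000;           // matches the 1x67000 test network

        // Same formula as the pass: one extra part for the remainder.
        const size_t totalSplits = 1 + totalElements / maxAffineElements;  // == 2

        std::vector<size_t> chunks;
        for (size_t i = 0;; i++) {
            size_t n = std::min(totalElements - i * maxAffineElements, maxAffineElements);
            chunks.push_back(n);
            if (n != maxAffineElements) break;        // last (short) chunk reached
        }
        // chunks == {65472, 1528}: each part fits a single GNA eltwise,
        // and the per-part results are concatenated back into the original output.
        std::cout << "splits=" << totalSplits << " chunks=" << chunks[0] << "," << chunks[1] << "\n";
        return 0;
    }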
@@ -101,6 +101,10 @@ DECL_PASS(SubstitutePRelu);
 */
DECL_PASS(SubstituteSoftSign);

/**
 * @brief split over channels for an Eltwise layer to avoid the GNA-HW limitation of 65k elements per eltwise
 */
DECL_PASS(EltwiseSplitOverChannels);
/**
 * diagonal layer insertion required in cases where an activation is followed by split layers, or any other
 * topology changing layers
@@ -0,0 +1,82 @@
// Copyright (C) 2020 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#include <vector>
#include <memory>
#include <tuple>
#include <string>

#include <ie_core.hpp>

#include "functional_test_utils/layer_test_utils.hpp"
#include "functional_test_utils/blob_utils.hpp"
#include "ngraph_functions/utils/ngraph_helpers.hpp"
#include "ngraph_functions/builders.hpp"


typedef std::tuple<
    InferenceEngine::Precision,          // Network Precision
    std::string,                         // Target Device
    std::map<std::string, std::string>   // Configuration
> EltwiseSplitOverChannelsPassParams;

namespace LayerTestsDefinitions {

class EltwiseSplitOverChannelsPassTest : public testing::WithParamInterface<EltwiseSplitOverChannelsPassParams>,
    public LayerTestsUtils::LayerTestsCommon {
public:
    static std::string getTestCaseName(testing::TestParamInfo<EltwiseSplitOverChannelsPassParams> obj) {
        InferenceEngine::Precision netPrecision;
        std::string targetDevice;
        std::map<std::string, std::string> configuration;
        std::tie(netPrecision, targetDevice, configuration) = obj.param;

        std::ostringstream result;
        result << "netPRC=" << netPrecision.name() << "_";
        result << "targetDevice=" << targetDevice << "_";
        for (auto const& configItem : configuration) {
            result << "_configItem=" << configItem.first << "_" << configItem.second;
        }
        return result.str();
    }

protected:
    void SetUp() override {
        InferenceEngine::Precision netPrecision;
        std::tie(netPrecision, targetDevice, configuration) = this->GetParam();
        auto ngPrc = FuncTestUtils::PrecisionUtils::convertIE2nGraphPrc(netPrecision);

        auto params = ngraph::builder::makeParams(ngPrc, { {1, 67000} });
        auto const_mult2 = ngraph::builder::makeConstant(ngPrc, {1, 67000}, {-1.0f});

        auto sum = ngraph::builder::makeEltwise(params[0], const_mult2, ngraph::helpers::EltwiseTypes::MULTIPLY);
        function = std::make_shared<ngraph::Function>(sum, params, "EltwiseSplitOverChannelsPass");
    }
};

TEST_P(EltwiseSplitOverChannelsPassTest, CompareWithRefImpl) {
    Run();
};

const std::vector<InferenceEngine::Precision> netPrecisions = {
    InferenceEngine::Precision::FP32,
    InferenceEngine::Precision::FP16,
};

const std::vector<std::map<std::string, std::string>> configs = {
    {
        {"GNA_DEVICE_MODE", "GNA_SW_EXACT"},
        {"GNA_COMPACT_MODE", "NO"}
    }
};

INSTANTIATE_TEST_CASE_P(EltwiseSplitOverChannels, EltwiseSplitOverChannelsPassTest,
    ::testing::Combine(
        ::testing::ValuesIn(netPrecisions),
        ::testing::Values(CommonTestUtils::DEVICE_GNA),
        ::testing::ValuesIn(configs)),
    EltwiseSplitOverChannelsPassTest::getTestCaseName);

} // namespace LayerTestsDefinitions