16 byte memory alignment and concat (#17712)

* use device-specific alignment instead of the ALIGN64 macro

* update tests

* update after review
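
The heart of the change: alignment checks that were hard-coded to the 64-byte ALIGN64 macro now go through Limitations::is_aligned(), which compares against the alignment reported for the target device (64 bytes for GNA3.5-class devices, 16 bytes for GNA3.6, as the tests below use). A minimal sketch of the predicate, assuming the usual round-up semantics of the ALIGN macro; align_up and device_alignment are illustrative names, not the plugin's API:

#include <cstddef>

// Illustrative stand-ins for the plugin's ALIGN macro and
// Limitations::get_memory_alignment(); not the real API.
static size_t align_up(size_t value, size_t alignment) {
    // round value up to the next multiple of alignment
    return ((value + alignment - 1) / alignment) * alignment;
}

static bool is_aligned(size_t addr, size_t device_alignment) {
    return addr == align_up(addr, device_alignment);
}

// is_aligned(96, 64) == false -> a 64-byte device must insert an aligning filter;
// is_aligned(96, 16) == true  -> a 16-byte device does not.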
Tomasz Adamowicz 2023-06-16 13:30:59 +02:00 committed by GitHub
parent 0b708b5eff
commit a4519f0a2c
11 changed files with 570 additions and 340 deletions

View File

@@ -661,7 +661,6 @@ constexpr uint32_t Limitations::kConvFiltersNumDivider;
 constexpr uint32_t Limitations::kConvFilterSizeDivider;
 constexpr uint32_t Limitations::kConvFilterMaxSize;
 constexpr uint32_t Limitations::kConvEachKernelByteAlignment;
-constexpr uint32_t Limitations::kInputByteAlignment;
 constexpr uint32_t Limitations::kNoOfInputsDivisor;
 constexpr uint32_t Limitations::kNoOfInputsLowPrecDivisor;
 constexpr uint32_t Limitations::kAffineMaxBatchSize;
@@ -673,6 +672,7 @@ constexpr uint32_t Limitations::kMaxLayersCountGNA2_0;
 constexpr uint32_t Limitations::kMaxLayersCountGNA3_X;
 constexpr uint32_t Limitations::kBytesPerSplitElement;
 constexpr uint32_t Limitations::kBytesPerCropElement;
+constexpr uint32_t Limitations::kBytesPerConcatElement;
 constexpr uint32_t Limitations::kMemoryPageSize;
 thread_local std::shared_ptr<Limitations> Limitations::k_instance{nullptr};

View File

@@ -248,6 +248,7 @@ public:
     bool use_only_16bit_convolution_weights() const;
     bool is_crop_affined_offset(size_t numberOfElements) const;
+    bool is_aligned(size_t addr) const;
     size_t get_memory_alignment() const;
     std::shared_ptr<cnn2d::AbstractValidator> get_cnn_validator() const;
@@ -260,7 +261,6 @@ public:
     constexpr static uint32_t kConvFilterSizeDivider = 8;
     constexpr static uint32_t kConvFilterMaxSize = 768;
     constexpr static uint32_t kConvEachKernelByteAlignment = 16;
-    constexpr static uint32_t kInputByteAlignment = 64;
    constexpr static uint32_t kNoOfInputsDivisor = 8;
     constexpr static uint32_t kNoOfInputsLowPrecDivisor = 16;
     constexpr static uint32_t kAffineMaxBatchSize = 8;
@@ -274,10 +274,12 @@ public:
     // Currently split layer only supports 2 bytes in int16 and int8 mode.
     // In fp32 mode this is not necessary but is useful for testing
     constexpr static uint32_t kBytesPerSplitElement = 2;
     // Currently crop layer only supports 2 bytes in int16 and int8 mode.
     // In fp32 mode this is not necessary but is useful for testing
     constexpr static uint32_t kBytesPerCropElement = 2;
+    // Currently concat layer only supports 2 bytes in int16 and int8 mode.
+    // In fp32 mode this is not necessary but is useful for testing
+    constexpr static uint32_t kBytesPerConcatElement = 2;
     constexpr static uint32_t kMemoryPageSize = 4096;

 private:
@@ -306,7 +308,11 @@ inline std::shared_ptr<Limitations> Limitations::get_instance() {
 inline bool Limitations::is_crop_affined_offset(size_t numberOfElements) const {
     const auto cropOffset = numberOfElements * kBytesPerCropElement;
-    return (ALIGN64(cropOffset) != cropOffset);
+    return !is_aligned(cropOffset);
+}
+
+inline bool Limitations::is_aligned(size_t addr) const {
+    return (addr == ALIGN(addr, get_memory_alignment()));
 }

 inline size_t Limitations::get_memory_alignment() const {

View File

@@ -87,7 +87,7 @@ inline bool is_aligned_split(const std::shared_ptr<ngraph::Node> input_op, size_
             offset += outputSize * limitations::Limitations::kBytesPerSplitElement;
         }
     }
-    return (offset == ALIGN64(offset));
+    return limitations::Limitations::get_instance()->is_aligned(offset);
 }

 inline bool is_crop_affined(std::shared_ptr<ngraph::Node> node) {

View File

@@ -47,12 +47,11 @@ public:
     std::vector<SplitConnectedLayerInfo> splitOutputLayers;
 };

-// @brief Returns sizes of split outputs to split the input tensor to aligned parts not greater than the specified size
-inline std::vector<uint32_t> GetAlignedSplitSizes(uint32_t totalSize,
-                                                  uint32_t maxSplitSize,
-                                                  uint32_t alignment = limitations::Limitations::kInputByteAlignment) {
+// @brief Returns sizes of split outputs to split the input tensor into aligned parts that are not greater than the
+// specified split size or alignment, whichever is larger
+inline std::vector<uint32_t> GetAlignedSplitSizes(uint32_t totalSize, uint32_t splitSize, uint32_t alignment) {
     std::vector<uint32_t> splitSizes;
-    uint32_t maxAlignedSplitSize = std::max(maxSplitSize - maxSplitSize % alignment, alignment);
+    uint32_t maxAlignedSplitSize = std::max(splitSize - splitSize % alignment, alignment);
     uint32_t usedSize = 0;
     while (usedSize < totalSize) {
         uint32_t partSize = std::min(totalSize - usedSize, maxAlignedSplitSize);
@@ -73,22 +72,21 @@ inline std::pair<int64_t, std::vector<uint32_t>> AlignedSplitSizesPerAxis(Infere
     IE_ASSERT(firstValuableDim != std::end(dims));
     auto splittedElementsSize = *firstValuableDim;
     auto splittedDimIx = std::distance(std::begin(dims), firstValuableDim);
-    auto alignment = limitations::Limitations::kInputByteAlignment;
+    auto alignment = limitations::Limitations::get_instance()->get_memory_alignment();

-    // Split output size should be multiple by 64 to avoid align filters insertion,
-    // but we need to check if our input size to split exceeds 64; if not we can always
+    // Split output size should be a multiple of the device memory alignment to avoid align filters insertion,
+    // but we need to check if our input size to split exceeds the alignment; if not we can always
     // split if the remaining size is aligned
-    if (splittedElementsSize <= alignment) {
+    auto split_size = limitations::Limitations::kBufferMaxSize * splittedElementsSize / totalElementsSize;
+    if (splittedElementsSize <= alignment || split_size < alignment) {
         if ((totalElementsSize / splittedElementsSize) % alignment == 0) {
             alignment = 1;
         } else {
             return {splittedDimIx, splitSizes};
         }
     }
-    splitSizes =
-        GetAlignedSplitSizes(splittedElementsSize,
-                             limitations::Limitations::kBufferMaxSize * splittedElementsSize / totalElementsSize,
-                             alignment);
+    splitSizes = GetAlignedSplitSizes(splittedElementsSize, split_size, alignment);
     return {splittedDimIx, splitSizes};
 }
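
A standalone copy of GetAlignedSplitSizes for checking the arithmetic by hand. The hunk above cuts the loop off after partSize, so the push/accumulate tail here is an assumption, though one consistent with the unit-test expectations further below:

#include <algorithm>
#include <cstdint>
#include <iostream>
#include <vector>

std::vector<uint32_t> get_aligned_split_sizes(uint32_t totalSize, uint32_t splitSize, uint32_t alignment) {
    std::vector<uint32_t> splitSizes;
    // largest part that is <= splitSize and a multiple of alignment (at least one alignment unit)
    uint32_t maxAlignedSplitSize = std::max(splitSize - splitSize % alignment, alignment);
    uint32_t usedSize = 0;
    while (usedSize < totalSize) {
        uint32_t partSize = std::min(totalSize - usedSize, maxAlignedSplitSize);
        splitSizes.push_back(partSize);  // assumed tail of the loop, not shown in the hunk
        usedSize += partSize;
    }
    return splitSizes;
}

int main() {
    // 151 elements, max part 100, 64-byte alignment -> 64 64 23
    for (uint32_t part : get_aligned_split_sizes(151, 100, 64))
        std::cout << part << ' ';
    std::cout << '\n';
}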

View File

@@ -1247,9 +1247,6 @@ void FlattenTrivialConcatPass::run() {
 void InsertConcatAligningFilterPass::run() {
     OV_ITT_SCOPED_TASK(itt::domains::GNA_LT, "InsertConcatAligningFilterPass");
     auto quantized = InferenceEngine::getInjectedData<QuantizedLayerParams>(pLayers->front());
-    // currently concat layer only supports 2 bytes in int16 and int8 mode. In fp32 mode this no necessary but usefull
-    // for testing
-    const int bytesPerConcatElement = 2;

     int numOfFilterLayers = 0;
@@ -1273,7 +1270,7 @@ void InsertConcatAligningFilterPass::run() {
             auto concatInput = getLayerByIndex(input_idx);
             auto dims = concatInput->getDims();
-            auto outputSize = details::product(++dims.begin(), dims.end()) * bytesPerConcatElement;
+            auto outputSize = details::product(++dims.begin(), dims.end()) * Limitations::kBytesPerConcatElement;

             auto useAlignFilterIf = [&concatLayer, &getLayerByIndex](int concat_input_idx) {
                 if (concatLayer->insData.size() <= concat_input_idx)
@@ -1290,7 +1287,8 @@ void InsertConcatAligningFilterPass::run() {
             // correcting offset by copy layer insertion. This can be improved by collapsing copy and affine or diagonal
             // later-on if next concat inputs requires align filter - then current input also requires either copy or
             // align filter
-            if (ALIGN64(offset) != offset || (ALIGN64(outputSize) != outputSize && useAlignFilterIf(input_idx + 1))) {
+            if ((!Limitations::get_instance()->is_aligned(offset)) ||
+                ((!Limitations::get_instance()->is_aligned(outputSize)) && useAlignFilterIf(input_idx + 1))) {
                 auto prevLayer = getCreatorLayer(concatInput).lock();
                 // input layer parameters are copied not using GNA-primitives - so nothing to allign here.
                 if (!useAlignFilterIf(input_idx))
@@ -1310,13 +1308,17 @@ void InsertConcatAligningFilterPass::run() {
                 }

                 auto num_rows_in = dims[1];
-                size_t aligned64_offset = std::max(0, static_cast<int>(ALIGN64(offset) - 64));
-                size_t num_rows_padded = (offset - aligned64_offset) / bytesPerConcatElement;
+                size_t aligned_offset =
+                    std::max(0,
+                             static_cast<int>(ALIGN(offset, Limitations::get_instance()->get_memory_alignment()) -
+                                              Limitations::get_instance()->get_memory_alignment()));
+                size_t num_rows_padded = (offset - aligned_offset) / Limitations::kBytesPerConcatElement;
                 size_t num_rows_out = num_rows_padded + num_rows_in;

                 // encodes offset to beginning of split layer input
-                size_t bytesOffset =
-                    (aligned64_offset / bytesPerConcatElement) * (quantized ? bytesPerConcatElement : 4);
+                size_t bytesOffset =
+                    (aligned_offset / Limitations::kBytesPerConcatElement) *
+                    (quantized ? Limitations::kBytesPerConcatElement : Precision(Precision::FP32).size());
                 concatAligningFilter->params["output_offset"] = std::to_string(bytesOffset);

                 // for padded rows we cannot use copy layer - TBD how to implement
@@ -1496,7 +1498,7 @@ void InsertSplitAligningFilterPass::run() {
         for (auto&& splitOutput : l->outData) {
             auto outputSize = product(begin(splitOutput->getDims()), end(splitOutput->getDims()));
-            if ((currentOffset != ALIGN64(currentOffset)) || (padding != 0)) {
+            if ((!Limitations::get_instance()->is_aligned(currentOffset)) || (padding != 0)) {
                 // check that this split output actually connected to further layers
                 if (getInputTo(splitOutput).empty()) {
                     log::debug() << "Output port: " << splitOutIndex << " of " << l->name << " unconnected, skipping\n";
@@ -1507,7 +1509,7 @@ void InsertSplitAligningFilterPass::run() {
                         << " Convolution Filter doesn't support batch=" << splitOutput->getDims().front();
                 }

-                // this split output not beginning from 64 bytes aligned boundary - need to correct by aligning
+                // this split output not beginning from an aligned byte boundary - need to correct by aligning
                 // filter layer insert the filter
                 auto filterName = std::string("AlignFilter_") + std::to_string(numOfFilterLayers++);
@@ -1527,20 +1529,22 @@ void InsertSplitAligningFilterPass::run() {
                 auto inputData = splitOutput;

-                size_t aligned64_offset = std::max(0, static_cast<int>(ALIGN64(currentOffset) - 64));
+                size_t aligned_offset = std::max(
+                    0,
+                    static_cast<int>(ALIGN(currentOffset, Limitations::get_instance()->get_memory_alignment()) -
+                                     Limitations::get_instance()->get_memory_alignment()));

                 IE_ASSERT(filterLayer != nullptr);

                 // encodes offset to beginning of split layer input
-                filterLayer->params["offset"] =
-                    std::to_string(aligned64_offset / Limitations::kBytesPerSplitElement);
+                filterLayer->params["offset"] = std::to_string(aligned_offset / Limitations::kBytesPerSplitElement);

                 auto dims = splitOutput->getTensorDesc().getDims();
                 if (dims.size() > 3) {
                     THROW_GNA_EXCEPTION << "unsupported split layer dims size: " << dims.size();
                 }

                 const auto offsetOfUnalignment =
-                    (currentOffset - aligned64_offset) / Limitations::kBytesPerSplitElement;
+                    (currentOffset - aligned_offset) / Limitations::kBytesPerSplitElement;
                 // TODO consider to use a different number of filters do decrese the number of trailing zeros
                 // (additionalPaddingOfFilter)
                 const auto numberOfFilters = Limitations::kConvMinFiltersNum;
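
With the alignment parameterized, the aligned_offset/num_rows_padded computation above pads fewer rows on 16-byte hardware. Worked numbers in a standalone sketch; align_up mirrors the assumed round-up behavior of the ALIGN macro:

#include <algorithm>
#include <iostream>

// aligned_offset is the last aligned boundary at or below the raw offset.
static int align_up(int value, int alignment) {
    return ((value + alignment - 1) / alignment) * alignment;
}

int main() {
    const int bytes_per_element = 2;  // kBytesPerConcatElement
    const int offset = 100;           // an unaligned concat input offset, in bytes
    const int alignments[] = {64, 16};
    for (int alignment : alignments) {
        int aligned_offset = std::max(0, align_up(offset, alignment) - alignment);
        int num_rows_padded = (offset - aligned_offset) / bytes_per_element;
        std::cout << alignment << "-byte device: aligned_offset=" << aligned_offset
                  << ", num_rows_padded=" << num_rows_padded << '\n';
    }
    // prints: 64-byte device: aligned_offset=64, num_rows_padded=18
    //         16-byte device: aligned_offset=96, num_rows_padded=2
}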

View File

@@ -152,7 +152,7 @@ DECL_PASS(InsertSplitAligningFilter);
 DECL_PASS(FlattenTrivialConcat);
 /**
- * @brief concat-aligning filter layer insertion required in cases when concat inputs size are not 64-aligned
+ * @brief concat-aligning filter layer insertion required in cases when concat input sizes are not aligned
  */
 DECL_PASS(InsertConcatAligningFilter);

View File

@@ -64,7 +64,9 @@ static bool Convert(std::shared_ptr<ngraph::Node> conv,
     auto& input = conv->get_input_shape(0);
     uint32_t width = input.back();
     uint32_t in_channels = input.at(1);
-    auto split_sizes = GetAlignedSplitSizes(width, Limitations::kBufferMaxSize / in_channels);
+    auto split_sizes = GetAlignedSplitSizes(width,
+                                            Limitations::kBufferMaxSize / in_channels,
+                                            Limitations::get_instance()->get_memory_alignment());
     IE_ASSERT(split_sizes.size() > 1);
     std::vector<int64_t> split_sizes_casted(split_sizes.size());
     std::transform(std::begin(split_sizes), std::end(split_sizes), std::begin(split_sizes_casted), [](uint32_t size) {

View File

@@ -7,9 +7,13 @@
 #include <vector>

 // to suppress deprecated definition errors
 #define IMPLEMENT_INFERENCE_ENGINE_PLUGIN
+#include "common/gna_target.hpp"
 #include "layers/gna_split_layer.hpp"
 #include "ngraph/opsets/opset9.hpp"

+using namespace ov::intel_gna::limitations;
+using namespace ov::intel_gna::target;
+
 namespace {

 using GetAlignedSplitSizesData = std::tuple<uint32_t,  // total size
@@ -19,10 +23,15 @@ using GetAlignedSplitSizesData = std::tuple<uint32_t,  // total size
                                             >;

 const std::vector<GetAlignedSplitSizesData> data = {
+    GetAlignedSplitSizesData{10, 100, 64, std::vector<uint32_t>{10}},
     GetAlignedSplitSizesData{1024, 100, 64, std::vector<uint32_t>(16, 64)},
     GetAlignedSplitSizesData{151, 100, 64, std::vector<uint32_t>{64, 64, 23}},
     GetAlignedSplitSizesData{151, 65, 32, std::vector<uint32_t>{64, 64, 23}},
-    GetAlignedSplitSizesData{151, 65, 1, std::vector<uint32_t>{65, 65, 21}}};
+    GetAlignedSplitSizesData{151, 33, 32, std::vector<uint32_t>{32, 32, 32, 32, 23}},
+    GetAlignedSplitSizesData{151, 17, 16, std::vector<uint32_t>{16, 16, 16, 16, 16, 16, 16, 16, 16, 7}},
+    GetAlignedSplitSizesData{151, 65, 1, std::vector<uint32_t>{65, 65, 21}},
+    GetAlignedSplitSizesData{67000, 65528, 64, std::vector<uint32_t>{65472, 1528}},
+    GetAlignedSplitSizesData{67000, 65528, 16, std::vector<uint32_t>{65520, 1480}}};

 TEST(GetAlignedSplitSizesTest, testAlignedSplitSizes) {
     for (const auto& dataItem : data) {
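
The two new 67000-element rows pin down the per-alignment arithmetic: the largest part is the split size rounded down to a multiple of the alignment, and the remainder forms the tail. A quick standalone check of those expectations (not plugin code):

#include <cstdint>
#include <iostream>

int main() {
    const uint32_t total = 67000;
    const uint32_t split = 65528;  // split size taken from the test rows above
    const uint32_t alignments[] = {64, 16};
    for (uint32_t alignment : alignments) {
        uint32_t max_part = split - split % alignment;  // largest aligned part
        std::cout << alignment << "-byte alignment: parts " << max_part << " and " << (total - max_part) << '\n';
    }
    // prints: 64-byte alignment: parts 65472 and 1528
    //         16-byte alignment: parts 65520 and 1480
}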
@@ -38,55 +47,86 @@ using VariadicSplitParameters = std::tuple<ov::Shape,  // input size
                                            bool  // supported
                                            >;

-const std::vector<VariadicSplitParameters> variadic_split_data = {
-    VariadicSplitParameters{ov::Shape{1024}, 0, std::vector<int32_t>{192, 192, 320, 320}, true},
-    VariadicSplitParameters{ov::Shape{1, 1024}, 1, std::vector<int32_t>{640, 192, 192}, true},
-    VariadicSplitParameters{ov::Shape{1024}, 0, std::vector<int32_t>{500, 24, 500}, false},
-    VariadicSplitParameters{ov::Shape{1, 1024}, 1, std::vector<int32_t>{700, 300, 24}, false},
-};
-
-TEST(CheckSplitSupported, CheckVariadicSplitSupported) {
+void RunVariadicSplitSupportedTest(DeviceVersion device_version, std::vector<VariadicSplitParameters> test_vectors) {
     ov::Shape input_shape;
     uint32_t axis;
     std::vector<int32_t> split_lengths;
     bool result;
-    for (const auto& item : variadic_split_data) {
+
+    Limitations::init(device_version);
+    for (const auto& item : test_vectors) {
         std::tie(input_shape, axis, split_lengths, result) = item;
         auto split = std::make_shared<ngraph::opset9::VariadicSplit>(
             std::make_shared<ngraph::opset9::Parameter>(ngraph::element::f32, input_shape),
             ngraph::opset9::Constant::create(ngraph::element::i64, ngraph::Shape({1}), {axis}),
             ngraph::opset9::Constant::create(ngraph::element::i64,
                                              ngraph::Shape({split_lengths.size()}),
                                              split_lengths));
-        ASSERT_TRUE(ov::intel_gna::limitations::Limitations::is_split_supported(split, false) == result);
+        ASSERT_TRUE(Limitations::is_split_supported(split, false) == result);
     }
 }

+TEST(CheckSplitSupported, CheckVariadicSplitSupported_GNA3_5) {
+    RunVariadicSplitSupportedTest(
+        DeviceVersion::GNA3_5,
+        {VariadicSplitParameters{ov::Shape{1024}, 0, std::vector<int32_t>{192, 192, 320, 320}, true},
+         VariadicSplitParameters{ov::Shape{1, 1024}, 1, std::vector<int32_t>{640, 192, 192}, true},
+         VariadicSplitParameters{ov::Shape{1024}, 0, std::vector<int32_t>{16, 1008}, false},
+         VariadicSplitParameters{ov::Shape{1024}, 0, std::vector<int32_t>{500, 24, 500}, false},
+         VariadicSplitParameters{ov::Shape{1, 1024}, 1, std::vector<int32_t>{700, 300, 24}, false}});
+}
+
+TEST(CheckSplitSupported, CheckVariadicSplitSupported_GNA3_6) {
+    RunVariadicSplitSupportedTest(
+        DeviceVersion::GNA3_6,
+        {VariadicSplitParameters{ov::Shape{1024}, 0, std::vector<int32_t>{192, 192, 320, 320}, true},
+         VariadicSplitParameters{ov::Shape{1, 1024}, 1, std::vector<int32_t>{640, 192, 192}, true},
+         VariadicSplitParameters{ov::Shape{1024}, 0, std::vector<int32_t>{16, 1008}, true},
+         VariadicSplitParameters{ov::Shape{1024}, 0, std::vector<int32_t>{500, 24, 500}, false},
+         VariadicSplitParameters{ov::Shape{1, 1024}, 1, std::vector<int32_t>{700, 300, 24}, false}});
+}
+
 using SplitParameters = std::tuple<ov::Shape,  // input size
                                    uint32_t,  // axis
                                    uint32_t,  // num_splits
                                    bool  // supported
                                    >;

-const std::vector<SplitParameters> split_data = {
-    SplitParameters{ov::Shape{1024}, 0, 4, true},
-    SplitParameters{ov::Shape{1, 1024}, 1, 16, true},
-    SplitParameters{ov::Shape{1024}, 0, 64, false},
-    SplitParameters{ov::Shape{1, 1024}, 1, 256, false},
-};
-
-TEST(CheckSplitSupported, CheckSplitSupported) {
+void RunSplitSupportedTest(DeviceVersion device_version, std::vector<SplitParameters> test_vectors) {
     ov::Shape input_shape;
     uint32_t axis;
     uint32_t num_splits;
     bool result;
-    for (const auto& item : split_data) {
+
+    Limitations::init(device_version);
+    for (const auto& item : test_vectors) {
         std::tie(input_shape, axis, num_splits, result) = item;
         auto split = std::make_shared<ngraph::opset9::Split>(
             std::make_shared<ngraph::opset9::Parameter>(ngraph::element::f32, input_shape),
             ngraph::opset9::Constant::create(ngraph::element::i64, ngraph::Shape({}), {axis}),
             num_splits);
-        ASSERT_TRUE(ov::intel_gna::limitations::Limitations::is_split_supported(split, false) == result);
+        ASSERT_TRUE(Limitations::is_split_supported(split, false) == result);
     }
 }

+TEST(CheckSplitSupported, CheckSplitSupported_GNA3_5) {
+    RunSplitSupportedTest(DeviceVersion::GNA3_5,
+                          {
+                              SplitParameters{ov::Shape{1024}, 0, 4, true},
+                              SplitParameters{ov::Shape{1, 1024}, 1, 16, true},
+                              SplitParameters{ov::Shape{1024}, 0, 64, false},
+                              SplitParameters{ov::Shape{1, 1024}, 1, 256, false},
+                          });
+}
+
+TEST(CheckSplitSupported, CheckSplitSupported_GNA3_6) {
+    RunSplitSupportedTest(DeviceVersion::GNA3_6,
+                          {
+                              SplitParameters{ov::Shape{1024}, 0, 4, true},
+                              SplitParameters{ov::Shape{1, 1024}, 1, 16, true},
+                              SplitParameters{ov::Shape{1024}, 0, 64, true},
+                              SplitParameters{ov::Shape{1, 1024}, 1, 256, false},
+                          });
+}
+
 }  // namespace
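
The only variadic-split vector whose verdict differs between the two device runs is {16, 1008}: per is_aligned_split shown earlier, each split output's byte offset (elements times kBytesPerSplitElement, here 16 * 2 = 32) must be aligned, and 32 bytes passes the 16-byte check but not the 64-byte one. A quick check using only constants from this commit:

#include <cstdint>
#include <iostream>

int main() {
    // second output of {16, 1008} starts at 16 * kBytesPerSplitElement bytes
    const uint32_t offset_bytes = 16 * 2;
    std::cout << "64-byte aligned: " << (offset_bytes % 64 == 0) << '\n';  // prints 0: GNA3_5 rejects
    std::cout << "16-byte aligned: " << (offset_bytes % 16 == 0) << '\n';  // prints 1: GNA3_6 accepts
}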

View File

@@ -9,9 +9,14 @@
 #include <ngraph/pass/manager.hpp>
 #include <transformations/init_node_info.hpp>

+#include "backend/gna_limitations.hpp"
+#include "common/gna_target.hpp"
 #include "common_test_utils/ngraph_test_utils.hpp"
 #include "transformations/split_convolution_with_large_buffer_size.hpp"

+using namespace ov::intel_gna::limitations;
+using namespace ov::intel_gna::target;
+
 namespace testing {
 namespace {
@@ -126,43 +131,41 @@ ngraph::Output<ngraph::Node> CreateConvolution::createOutputNode(const ngraph::O
 }

 // should be used only after CreateBaseDecorator
+template <const ngraph::Shape& kernel_shape, const ngraph::Shape& split_shape>
 class CreateSplittedConvolution : public CreateGraphDecorator {
 public:
-    CreateSplittedConvolution(CreateGraphDecoratorPtr prev,
-                              const ngraph::Shape& kernel_shape = ngraph::Shape{1, 64, 1, 1},
-                              const ngraph::Shape& split_shape = ngraph::Shape{960, 960, 960, 960, 256})
+    CreateSplittedConvolution(CreateGraphDecoratorPtr prev)
         : CreateGraphDecorator(std::move(prev)),
           kernel_shape_(kernel_shape),
           split_shape_(split_shape) {}

 protected:
-    void updateGraph(Graph& graph) override;
+    void updateGraph(Graph& graph) override {
+        auto split_node_c1 =
+            ngraph::opset7::Constant::create(ngraph::element::i64, ngraph::Shape({1}), std::vector<int64_t>{3});
+        auto split_node_c2 =
+            ngraph::opset7::Constant::create(ngraph::element::i64, ngraph::Shape({split_shape_.size()}), split_shape_);
+        auto split_node =
+            std::make_shared<ngraph::opset7::VariadicSplit>(graph.input_params, split_node_c1, split_node_c2);
+        auto kernel = ngraph::opset7::Constant::create(ngraph::element::f32, kernel_shape_, {1});
+        for (int i = 0; i < split_shape_.size(); ++i) {
+            auto convolution_operation = std::make_shared<ngraph::opset7::Convolution>(split_node->output(i),
+                                                                                       kernel,
+                                                                                       ngraph::Strides{1, 1},
+                                                                                       ngraph::CoordinateDiff{0, 0},
+                                                                                       ngraph::CoordinateDiff{0, 0},
+                                                                                       ngraph::Strides{1, 1});
+            graph.output_nodes.push_back(convolution_operation);
+        }
+    }

 private:
     const ngraph::Shape kernel_shape_;
     const ngraph::Shape split_shape_;
 };

-void CreateSplittedConvolution::updateGraph(Graph& graph) {
-    auto split_node_c1 =
-        ngraph::opset7::Constant::create(ngraph::element::i64, ngraph::Shape({1}), std::vector<int64_t>{3});
-    auto split_node_c2 =
-        ngraph::opset7::Constant::create(ngraph::element::i64, ngraph::Shape({split_shape_.size()}), split_shape_);
-    auto split_node = std::make_shared<ngraph::opset7::VariadicSplit>(graph.input_params, split_node_c1, split_node_c2);
-    auto kernel = ngraph::opset7::Constant::create(ngraph::element::f32, kernel_shape_, {1});
-    for (int i = 0; i < split_shape_.size(); ++i) {
-        auto convolution_operation = std::make_shared<ngraph::opset7::Convolution>(split_node->output(i),
-                                                                                   kernel,
-                                                                                   ngraph::Strides{1, 1},
-                                                                                   ngraph::CoordinateDiff{0, 0},
-                                                                                   ngraph::CoordinateDiff{0, 0},
-                                                                                   ngraph::Strides{1, 1});
-        graph.output_nodes.push_back(convolution_operation);
-    }
-}
-
 class CreateAdd : public CreateAppendableGraphDecorator {
 public:
     CreateAdd(CreateGraphDecoratorPtr prev) : CreateAppendableGraphDecorator(std::move(prev)) {}
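
The decorator now receives the kernel and split shapes as reference non-type template parameters instead of defaulted constructor arguments, so each device flavor can be a distinct type (see CreateSplitedConvolution3_5 and 3_6 below). One consequence, and the reason kernel_shape_3_5 and friends are declared at namespace scope: a reference template argument must name an object with linkage. A minimal illustration, with std::array standing in for ngraph::Shape:

#include <array>

template <const std::array<int, 2>& shape>
struct Decorator {
    int channels() const { return shape[1]; }
};

std::array<int, 2> kernel_shape = {1, 64};  // namespace scope: has linkage
using DecoratorFor64Channels = Decorator<kernel_shape>;  // a local array would not compile here

int main() {
    return DecoratorFor64Channels{}.channels() == 64 ? 0 : 1;
}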
@@ -261,9 +264,10 @@ Graph createSolidGraph(const ngraph::Shape& input_shape, const ngraph::Shape& ke
 // -------------------------------------------------------------------------------------------------------

+using TestParams = std::tuple<Graph, Graph, ngraph::pass::Manager>;
+
 class SplitConvolutionFixture : public CommonTestUtils::TestsCommon,
-                                public ::testing::WithParamInterface<
-                                    std::tuple<Graph /* tranformed */, Graph /* reference */, ngraph::pass::Manager>> {
+                                public ::testing::WithParamInterface<std::tuple<DeviceVersion, TestParams>> {
 public:
     void SetUp() override;
@@ -274,10 +278,14 @@ public:
 void SplitConvolutionFixture::SetUp() {
     // TODO: use auto & [transformed_graph, reference_graph] = this->GetParam() when C++17
+    DeviceVersion device_version;
+    TestParams params;
     Graph transformed_graph;
     Graph reference_graph;
-    std::tie(transformed_graph, reference_graph, pass_manager) = this->GetParam();
+    std::tie(device_version, params) = this->GetParam();
+    std::tie(transformed_graph, reference_graph, pass_manager) = params;
+
+    Limitations::init(device_version);
     function = transformed_graph.createFunction();
     reference_function = reference_graph.createFunction();
 }
@@ -305,34 +313,70 @@ TEST_P(SplitConvolutionFixture, CompareFunctions) {
 }

-INSTANTIATE_TEST_SUITE_P(
-    SplitConvolutionTestSuite,
-    SplitConvolutionFixture,
-    ::testing::Values(
-        std::make_tuple(createGraph<CreateConvolution>(),
-                        createGraph<CreateConcat, CreateSplittedConvolution>(),
-                        createPassManager<ov::intel_gna::pass::SplitConvolution>()),
-        std::make_tuple(createGraph<CreateAdd, CreateConvolution>(),
-                        createGraph<CreateConcat, CreateAdd, CreateSplittedConvolution>(),
-                        createPassManager<ov::intel_gna::pass::SplitConvolutionWithBias>()),
-        std::make_tuple(createGraph<CreateFakeQuantize, CreateConvolution>(),
-                        createGraph<CreateConcat, CreateFakeQuantize, CreateSplittedConvolution>(),
-                        createPassManager<ov::intel_gna::pass::SplitConvolutionWithFq>()),
-        std::make_tuple(createGraph<CreateFakeQuantize, CreateAdd, CreateConvolution>(),
-                        createGraph<CreateConcat, CreateFakeQuantize, CreateAdd, CreateSplittedConvolution>(),
-                        createPassManager<ov::intel_gna::pass::SplitConvolutionWithFq>()),
-        std::make_tuple(createSolidGraph(ngraph::Shape{1, 1, 1, 1}, ngraph::Shape{1, 1, 1, 1}),
-                        createSolidGraph(ngraph::Shape{1, 1, 1, 1}, ngraph::Shape{1, 1, 1, 1}),
-                        createPassManager<ov::intel_gna::pass::SplitConvolution>()),
-        std::make_tuple(createSolidGraph<CreateAdd>(ngraph::Shape{1, 1, 1, 1}, ngraph::Shape{1, 1, 1, 1}),
-                        createSolidGraph<CreateAdd>(ngraph::Shape{1, 1, 1, 1}, ngraph::Shape{1, 1, 1, 1}),
-                        createPassManager<ov::intel_gna::pass::SplitConvolutionWithBias>()),
-        std::make_tuple(createSolidGraph<CreateFakeQuantize>(ngraph::Shape{1, 1, 1, 1}, ngraph::Shape{1, 1, 1, 1}),
-                        createSolidGraph<CreateFakeQuantize>(ngraph::Shape{1, 1, 1, 1}, ngraph::Shape{1, 1, 1, 1}),
-                        createPassManager<ov::intel_gna::pass::SplitConvolutionWithFq>()),
-        std::make_tuple(
-            createSolidGraph<CreateAdd, CreateFakeQuantize>(ngraph::Shape{1, 1, 1, 1}, ngraph::Shape{1, 1, 1, 1}),
-            createSolidGraph<CreateAdd, CreateFakeQuantize>(ngraph::Shape{1, 1, 1, 1}, ngraph::Shape{1, 1, 1, 1}),
-            createPassManager<ov::intel_gna::pass::SplitConvolutionWithFq>())));
+INSTANTIATE_TEST_SUITE_P(
+    SplitConvolution_GNA3_0_3_5_3_6_TestSuite,
+    SplitConvolutionFixture,
+    ::testing::Combine(
+        ::testing::Values(DeviceVersion::GNA3_0, DeviceVersion::GNA3_5, DeviceVersion::GNA3_6),
+        ::testing::Values(
+            std::make_tuple(createSolidGraph(ngraph::Shape{1, 1, 1, 1}, ngraph::Shape{1, 1, 1, 1}),
+                            createSolidGraph(ngraph::Shape{1, 1, 1, 1}, ngraph::Shape{1, 1, 1, 1}),
+                            createPassManager<ov::intel_gna::pass::SplitConvolution>()),
+            std::make_tuple(createSolidGraph<CreateAdd>(ngraph::Shape{1, 1, 1, 1}, ngraph::Shape{1, 1, 1, 1}),
+                            createSolidGraph<CreateAdd>(ngraph::Shape{1, 1, 1, 1}, ngraph::Shape{1, 1, 1, 1}),
+                            createPassManager<ov::intel_gna::pass::SplitConvolutionWithBias>()),
+            std::make_tuple(createSolidGraph<CreateFakeQuantize>(ngraph::Shape{1, 1, 1, 1}, ngraph::Shape{1, 1, 1, 1}),
+                            createSolidGraph<CreateFakeQuantize>(ngraph::Shape{1, 1, 1, 1}, ngraph::Shape{1, 1, 1, 1}),
+                            createPassManager<ov::intel_gna::pass::SplitConvolutionWithFq>()),
+            std::make_tuple(
+                createSolidGraph<CreateAdd, CreateFakeQuantize>(ngraph::Shape{1, 1, 1, 1}, ngraph::Shape{1, 1, 1, 1}),
+                createSolidGraph<CreateAdd, CreateFakeQuantize>(ngraph::Shape{1, 1, 1, 1}, ngraph::Shape{1, 1, 1, 1}),
+                createPassManager<ov::intel_gna::pass::SplitConvolutionWithFq>()))));
+
+ngraph::Shape kernel_shape_3_5 = {1, 64, 1, 1};
+ngraph::Shape split_shape_3_5 = {960, 960, 960, 960, 256};
+using CreateSplitedConvolution3_5 = CreateSplittedConvolution<kernel_shape_3_5, split_shape_3_5>;
+
+INSTANTIATE_TEST_SUITE_P(
+    SplitConvolution_GNA3_0_3_5_TestSuite,
+    SplitConvolutionFixture,
+    ::testing::Combine(
+        ::testing::Values(DeviceVersion::GNA3_0, DeviceVersion::GNA3_5),
+        ::testing::Values(
+            std::make_tuple(createGraph<CreateConvolution>(),
+                            createGraph<CreateConcat, CreateSplitedConvolution3_5>(),
+                            createPassManager<ov::intel_gna::pass::SplitConvolution>()),
+            std::make_tuple(createGraph<CreateAdd, CreateConvolution>(),
+                            createGraph<CreateConcat, CreateAdd, CreateSplitedConvolution3_5>(),
+                            createPassManager<ov::intel_gna::pass::SplitConvolutionWithBias>()),
+            std::make_tuple(createGraph<CreateFakeQuantize, CreateConvolution>(),
+                            createGraph<CreateConcat, CreateFakeQuantize, CreateSplitedConvolution3_5>(),
+                            createPassManager<ov::intel_gna::pass::SplitConvolutionWithFq>()),
+            std::make_tuple(createGraph<CreateFakeQuantize, CreateAdd, CreateConvolution>(),
+                            createGraph<CreateConcat, CreateFakeQuantize, CreateAdd, CreateSplitedConvolution3_5>(),
+                            createPassManager<ov::intel_gna::pass::SplitConvolutionWithFq>()))));
+
+ngraph::Shape kernel_shape_3_6 = {1, 64, 1, 1};
+ngraph::Shape split_shape_3_6 = {1008, 1008, 1008, 1008, 64};
+using CreateSplitedConvolution3_6 = CreateSplittedConvolution<kernel_shape_3_6, split_shape_3_6>;
+
+INSTANTIATE_TEST_SUITE_P(
+    SplitConvolution_GNA3_6_TestSuite,
+    SplitConvolutionFixture,
+    ::testing::Combine(
+        ::testing::Values(DeviceVersion::GNA3_6),
+        ::testing::Values(
+            std::make_tuple(createGraph<CreateConvolution>(),
+                            createGraph<CreateConcat, CreateSplitedConvolution3_6>(),
+                            createPassManager<ov::intel_gna::pass::SplitConvolution>()),
+            std::make_tuple(createGraph<CreateAdd, CreateConvolution>(),
+                            createGraph<CreateConcat, CreateAdd, CreateSplitedConvolution3_6>(),
+                            createPassManager<ov::intel_gna::pass::SplitConvolutionWithBias>()),
+            std::make_tuple(createGraph<CreateFakeQuantize, CreateConvolution>(),
+                            createGraph<CreateConcat, CreateFakeQuantize, CreateSplitedConvolution3_6>(),
+                            createPassManager<ov::intel_gna::pass::SplitConvolutionWithFq>()),
+            std::make_tuple(createGraph<CreateFakeQuantize, CreateAdd, CreateConvolution>(),
+                            createGraph<CreateConcat, CreateFakeQuantize, CreateAdd, CreateSplitedConvolution3_6>(),
+                            createPassManager<ov::intel_gna::pass::SplitConvolutionWithFq>()))));
+
 }  // namespace
 }  // namespace testing
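
The two split shapes above are the same GetAlignedSplitSizes arithmetic specialized per device. A sketch that reproduces them, assuming an input width of 4096, 64 input channels, and kBufferMaxSize == 65528; the constant's value is not shown in this diff, but the assumption is consistent with the 65528 split size used in the unit-test rows earlier:

#include <algorithm>
#include <cstdint>
#include <iostream>

int main() {
    const uint32_t width = 4096;
    const uint32_t in_channels = 64;
    const uint32_t buffer_max = 65528;                     // assumed kBufferMaxSize
    const uint32_t split_size = buffer_max / in_channels;  // 1023
    const uint32_t alignments[] = {64, 16};
    for (uint32_t alignment : alignments) {
        std::cout << alignment << "-byte parts:";
        uint32_t part = std::max(split_size - split_size % alignment, alignment);
        for (uint32_t used = 0; used < width; used += part)
            std::cout << ' ' << std::min(part, width - used);
        std::cout << '\n';
    }
    // prints: 64-byte parts: 960 960 960 960 256    (split_shape_3_5)
    //         16-byte parts: 1008 1008 1008 1008 64 (split_shape_3_6)
}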

View File

@@ -11,10 +11,15 @@
 #include <ngraph/pass/manager.hpp>
 #include <transformations/init_node_info.hpp>

+#include "backend/gna_limitations.hpp"
+#include "common/gna_target.hpp"
 #include "common_test_utils/common_utils.hpp"
 #include "common_test_utils/ngraph_test_utils.hpp"
 #include "transformations/split_eltwise.hpp"

+using namespace ov::intel_gna::limitations;
+using namespace ov::intel_gna::target;
+
 namespace testing {
 namespace {
@@ -87,21 +92,24 @@ static std::shared_ptr<ngraph::Function> createFunction(const ngraph::Shape& inp
     }
 }

-typedef std::tuple<ngraph::Shape,
-                   bool,          // with const
-                   bool,          // with fq
-                   ELTWISE_TYPE   // eltwise type
+typedef std::tuple<DeviceVersion,  // device version
+                   ngraph::Shape,  // input shape
+                   bool,           // with const
+                   bool,           // with fq
+                   ELTWISE_TYPE    // eltwise type
                    >
     EltwiseSplitParams;

 static std::string getTestCaseName(testing::TestParamInfo<EltwiseSplitParams> obj) {
+    DeviceVersion device_ver;
     ngraph::Shape shape;
     bool with_const;
     bool with_fq;
     ELTWISE_TYPE type;
-    std::tie(shape, with_const, with_fq, type) = obj.param;
+    std::tie(device_ver, shape, with_const, with_fq, type) = obj.param;

     std::ostringstream result;
+    result << DeviceToString(device_ver) << "_";
     result << "IS=" << CommonTestUtils::vec2str(shape) << "_";
     result << "wConst=" << with_const << "_";
     result << "wFQ=" << with_fq << "_";
@@ -132,11 +140,13 @@ public:
 };

 void SplitEltwiseTestSuiteFixture::SetUp() {
+    DeviceVersion device_ver;
     ngraph::Shape shape;
     bool with_const;
     bool with_fq;
     ELTWISE_TYPE type;
-    std::tie(shape, with_const, with_fq, type) = this->GetParam();
+    std::tie(device_ver, shape, with_const, with_fq, type) = this->GetParam();
+    Limitations::init(device_ver);
     function = createFunction(shape, with_const, with_fq, type, false);
     reference_function = createFunction(shape, with_const, with_fq, type, true);
 }
@@ -158,16 +168,19 @@ TEST_P(SplitEltwiseTestSuiteFixture, CompareFunctions) {

 const std::vector<ov::Shape> inputShape = {{1, 67000}, {1, 500000}, {1, 936, 513}, {1, 64, 64, 64}, {1, 256, 64, 64}};

-INSTANTIATE_TEST_SUITE_P(SplitEltwiseTestSuite,
-                         SplitEltwiseTestSuiteFixture,
-                         ::testing::Combine(::testing::ValuesIn(inputShape),
-                                            ::testing::ValuesIn(std::vector<bool>{true, false}),  // with const
-                                            ::testing::ValuesIn(std::vector<bool>{true, false}),  // with fq
-                                            ::testing::ValuesIn(std::vector<ELTWISE_TYPE>{
-                                                ELTWISE_TYPE::Sum,
-                                                ELTWISE_TYPE::Sub,
-                                                ELTWISE_TYPE::Prod})),  // eltwise type
-                         getTestCaseName);
+INSTANTIATE_TEST_SUITE_P(
+    SplitEltwiseTestSuite,
+    SplitEltwiseTestSuiteFixture,
+    ::testing::Combine(::testing::ValuesIn(std::vector<DeviceVersion>{DeviceVersion::GNA3_0,  // device version
+                                                                      DeviceVersion::GNA3_5,
+                                                                      DeviceVersion::GNA3_6}),
+                       ::testing::ValuesIn(inputShape),
+                       ::testing::ValuesIn(std::vector<bool>{true, false}),  // with const
+                       ::testing::ValuesIn(std::vector<bool>{true, false}),  // with fq
+                       ::testing::ValuesIn(std::vector<ELTWISE_TYPE>{ELTWISE_TYPE::Sum,
+                                                                     ELTWISE_TYPE::Sub,
+                                                                     ELTWISE_TYPE::Prod})),  // eltwise type
+    getTestCaseName);

 }  // namespace
 }  // namespace testing