16 byte memory alignment and concat (#17712)
* use device specific alignment instead of ALIGN64 macro
* update for tests
* update after review
parent 0b708b5eff
commit a4519f0a2c
@@ -661,7 +661,6 @@ constexpr uint32_t Limitations::kConvFiltersNumDivider;
constexpr uint32_t Limitations::kConvFilterSizeDivider;
constexpr uint32_t Limitations::kConvFilterMaxSize;
constexpr uint32_t Limitations::kConvEachKernelByteAlignment;
constexpr uint32_t Limitations::kInputByteAlignment;
constexpr uint32_t Limitations::kNoOfInputsDivisor;
constexpr uint32_t Limitations::kNoOfInputsLowPrecDivisor;
constexpr uint32_t Limitations::kAffineMaxBatchSize;
@@ -673,6 +672,7 @@ constexpr uint32_t Limitations::kMaxLayersCountGNA2_0;
constexpr uint32_t Limitations::kMaxLayersCountGNA3_X;
constexpr uint32_t Limitations::kBytesPerSplitElement;
constexpr uint32_t Limitations::kBytesPerCropElement;
constexpr uint32_t Limitations::kBytesPerConcatElement;
constexpr uint32_t Limitations::kMemoryPageSize;

thread_local std::shared_ptr<Limitations> Limitations::k_instance{nullptr};
@@ -248,6 +248,7 @@ public:

    bool use_only_16bit_convolution_weights() const;
    bool is_crop_affined_offset(size_t numberOfElements) const;
+   bool is_aligned(size_t addr) const;
    size_t get_memory_alignment() const;
    std::shared_ptr<cnn2d::AbstractValidator> get_cnn_validator() const;
@@ -260,7 +261,6 @@ public:
    constexpr static uint32_t kConvFilterSizeDivider = 8;
    constexpr static uint32_t kConvFilterMaxSize = 768;
    constexpr static uint32_t kConvEachKernelByteAlignment = 16;
    constexpr static uint32_t kInputByteAlignment = 64;
    constexpr static uint32_t kNoOfInputsDivisor = 8;
    constexpr static uint32_t kNoOfInputsLowPrecDivisor = 16;
    constexpr static uint32_t kAffineMaxBatchSize = 8;
@@ -274,10 +274,12 @@ public:
    // Currently split layer only supports 2 bytes in int16 and int8 mode.
    // In fp32 mode this is not necessary but is useful for testing
    constexpr static uint32_t kBytesPerSplitElement = 2;

    // Currently crop layer only supports 2 bytes in int16 and int8 mode.
    // In fp32 mode this is not necessary but is useful for testing
    constexpr static uint32_t kBytesPerCropElement = 2;

    // Currently concat layer only supports 2 bytes in int16 and int8 mode.
    // In fp32 mode this is not necessary but is useful for testing
    constexpr static uint32_t kBytesPerConcatElement = 2;
    constexpr static uint32_t kMemoryPageSize = 4096;

private:
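Note that the fixed kInputByteAlignment = 64 constant stays in the public API while callers migrate to get_memory_alignment(). Judging by the commit title and the GNA3_6 test expectations further down, the device-specific value appears to be 16 bytes on GNA3_6 and 64 bytes on earlier targets; the following self-contained sketch of that mapping is an assumption for illustration, not the plugin's actual implementation:

    #include <cstddef>

    // Hypothetical stand-in for ov::intel_gna::target::DeviceVersion.
    enum class DeviceVersion { GNA3_0, GNA3_5, GNA3_6 };

    // Assumed mapping, inferred from the "16 byte memory alignment" title and
    // the GNA3_6 test vectors below; not taken from the patch itself.
    size_t memory_alignment_for(DeviceVersion version) {
        return version == DeviceVersion::GNA3_6 ? 16 : 64;
    }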
@@ -306,7 +308,11 @@ inline std::shared_ptr<Limitations> Limitations::get_instance() {

inline bool Limitations::is_crop_affined_offset(size_t numberOfElements) const {
    const auto cropOffset = numberOfElements * kBytesPerCropElement;
-   return (ALIGN64(cropOffset) != cropOffset);
+   return !is_aligned(cropOffset);
}

+inline bool Limitations::is_aligned(size_t addr) const {
+    return (addr == ALIGN(addr, get_memory_alignment()));
+}

inline size_t Limitations::get_memory_alignment() const {
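The old check hard-coded 64-byte alignment through the ALIGN64 macro; the new is_aligned() defers to the device-reported alignment instead. A minimal standalone sketch of the check, assuming ALIGN(x, a) rounds x up to the nearest multiple of a (align_up below is a hypothetical helper, not the plugin's macro):

    #include <cstddef>

    // Round x up to the nearest multiple of a (the assumed ALIGN semantics).
    static size_t align_up(size_t x, size_t a) {
        return ((x + a - 1) / a) * a;
    }

    // Equivalent of Limitations::is_aligned() with the alignment made explicit.
    static bool is_aligned(size_t addr, size_t alignment) {
        return addr == align_up(addr, alignment);
    }

    // is_aligned(100, 64) -> false; is_aligned(128, 64) -> true.
    // With a 16-byte device, is_aligned(112, 16) -> true even though 112 % 64 != 0,
    // which is exactly the relaxation this commit enables.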
@@ -87,7 +87,7 @@ inline bool is_aligned_split(const std::shared_ptr<ngraph::Node> input_op, size_
            offset += outputSize * limitations::Limitations::kBytesPerSplitElement;
        }
    }
-   return (offset == ALIGN64(offset));
+   return limitations::Limitations::get_instance()->is_aligned(offset);
}

inline bool is_crop_affined(std::shared_ptr<ngraph::Node> node) {
@@ -47,12 +47,11 @@ public:
    std::vector<SplitConnectedLayerInfo> splitOutputLayers;
};

-// @brief Returns sizes of split outputs to split the input tensor to aligned parts not greater than the specified size
-inline std::vector<uint32_t> GetAlignedSplitSizes(uint32_t totalSize,
-                                                  uint32_t maxSplitSize,
-                                                  uint32_t alignment = limitations::Limitations::kInputByteAlignment) {
+// @brief Returns sizes of split outputs to split the input tensor into aligned parts that are not greater than the
+// specified split size or alignment, depending on which one is larger
+inline std::vector<uint32_t> GetAlignedSplitSizes(uint32_t totalSize, uint32_t splitSize, uint32_t alignment) {
    std::vector<uint32_t> splitSizes;
-   uint32_t maxAlignedSplitSize = std::max(maxSplitSize - maxSplitSize % alignment, alignment);
+   uint32_t maxAlignedSplitSize = std::max(splitSize - splitSize % alignment, alignment);
    uint32_t usedSize = 0;
    while (usedSize < totalSize) {
        uint32_t partSize = std::min(totalSize - usedSize, maxAlignedSplitSize);
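To see how the rounding behaves, here is a standalone mirror of GetAlignedSplitSizes with the plugin types dropped; the tail of the loop (accumulating parts) is reconstructed from context, so treat it as a sketch. The results match the test vectors added later in this commit:

    #include <algorithm>
    #include <cstdint>
    #include <vector>

    // Each part is the largest multiple of `alignment` not exceeding `splitSize`
    // (but at least `alignment`); the final part carries the remainder.
    std::vector<uint32_t> aligned_split_sizes(uint32_t totalSize, uint32_t splitSize, uint32_t alignment) {
        std::vector<uint32_t> splitSizes;
        uint32_t maxAlignedSplitSize = std::max(splitSize - splitSize % alignment, alignment);
        uint32_t usedSize = 0;
        while (usedSize < totalSize) {
            uint32_t partSize = std::min(totalSize - usedSize, maxAlignedSplitSize);
            splitSizes.push_back(partSize);
            usedSize += partSize;
        }
        return splitSizes;
    }

    // aligned_split_sizes(151, 100, 64) -> {64, 64, 23}
    // aligned_split_sizes(151, 17, 16)  -> {16, 16, 16, 16, 16, 16, 16, 16, 16, 7}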
@@ -73,22 +72,21 @@ inline std::pair<int64_t, std::vector<uint32_t>> AlignedSplitSizesPerAxis(Infere
    IE_ASSERT(firstValuableDim != std::end(dims));
    auto splittedElementsSize = *firstValuableDim;
    auto splittedDimIx = std::distance(std::begin(dims), firstValuableDim);
-   auto alignment = limitations::Limitations::kInputByteAlignment;
+   auto alignment = limitations::Limitations::get_instance()->get_memory_alignment();

-   // Split output size should be multiple by 64 to avoid align filters insertion,
-   // but we need to check if our input size to split exceeds 64; if not we can always
+   // Split output size should be multiple of device memory alignment to avoid align filters insertion,
+   // but we need to check if our input size to split exceeds alignment; if not we can always
    // split if the remaining size is aligned
-   if (splittedElementsSize <= alignment) {
+   auto split_size = limitations::Limitations::kBufferMaxSize * splittedElementsSize / totalElementsSize;
+
+   if (splittedElementsSize <= alignment || split_size < alignment) {
        if ((totalElementsSize / splittedElementsSize) % alignment == 0) {
            alignment = 1;
        } else {
            return {splittedDimIx, splitSizes};
        }
    }
-   splitSizes =
-       GetAlignedSplitSizes(splittedElementsSize,
-                            limitations::Limitations::kBufferMaxSize * splittedElementsSize / totalElementsSize,
-                            alignment);
+   splitSizes = GetAlignedSplitSizes(splittedElementsSize, split_size, alignment);
    return {splittedDimIx, splitSizes};
}
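The reworked guard reads as follows: split_size is the share of the maximum buffer budget that falls on the dimension being split, and aligned splitting is only attempted when both that dimension and the budget reach the alignment. A sketch of just the decision logic, with kBufferMaxSize assumed to be 65528 (the value the test vectors below imply; treat it as an assumption):

    #include <cstdint>

    constexpr uint32_t kBufferMaxSize = 65528;  // assumption, inferred from the test data

    // Mirrors the new guard in AlignedSplitSizesPerAxis. When the split dimension
    // or the per-part budget is smaller than the alignment, splitting is only
    // allowed with alignment relaxed to 1, and only if every part then still
    // starts on an aligned boundary.
    bool can_split(uint32_t splittedElementsSize, uint32_t totalElementsSize, uint32_t& alignment) {
        uint32_t split_size = kBufferMaxSize * splittedElementsSize / totalElementsSize;
        if (splittedElementsSize <= alignment || split_size < alignment) {
            if ((totalElementsSize / splittedElementsSize) % alignment == 0) {
                alignment = 1;  // row stride keeps parts aligned; element-wise split is safe
                return true;
            }
            return false;  // would produce misaligned parts; caller bails out
        }
        return true;
    }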
@@ -1247,9 +1247,6 @@ void FlattenTrivialConcatPass::run() {
void InsertConcatAligningFilterPass::run() {
    OV_ITT_SCOPED_TASK(itt::domains::GNA_LT, "InsertConcatAligningFilterPass");
    auto quantized = InferenceEngine::getInjectedData<QuantizedLayerParams>(pLayers->front());
-   // currently concat layer only supports 2 bytes in int16 and int8 mode. In fp32 mode this no necessary but usefull
-   // for testing
-   const int bytesPerConcatElement = 2;

    int numOfFilterLayers = 0;
@@ -1273,7 +1270,7 @@ void InsertConcatAligningFilterPass::run() {

        auto concatInput = getLayerByIndex(input_idx);
        auto dims = concatInput->getDims();
-       auto outputSize = details::product(++dims.begin(), dims.end()) * bytesPerConcatElement;
+       auto outputSize = details::product(++dims.begin(), dims.end()) * Limitations::kBytesPerConcatElement;

        auto useAlignFilterIf = [&concatLayer, &getLayerByIndex](int concat_input_idx) {
            if (concatLayer->insData.size() <= concat_input_idx)
|
||||
// correcting offset by copy layer insertion. This can be improved by collapsing copy and affine or diagonal
|
||||
// later-on if next concat inputs requires align filter - then current input also requires either copy or
|
||||
// align filter
|
||||
if (ALIGN64(offset) != offset || (ALIGN64(outputSize) != outputSize && useAlignFilterIf(input_idx + 1))) {
|
||||
if ((!Limitations::get_instance()->is_aligned(offset)) ||
|
||||
((!Limitations::get_instance()->is_aligned(outputSize)) && useAlignFilterIf(input_idx + 1))) {
|
||||
auto prevLayer = getCreatorLayer(concatInput).lock();
|
||||
// input layer parameters are copied not using GNA-primitives - so nothing to allign here.
|
||||
if (!useAlignFilterIf(input_idx))
|
||||
@@ -1310,13 +1308,17 @@ void InsertConcatAligningFilterPass::run() {
            }

            auto num_rows_in = dims[1];
-           size_t aligned64_offset = std::max(0, static_cast<int>(ALIGN64(offset) - 64));
-           size_t num_rows_padded = (offset - aligned64_offset) / bytesPerConcatElement;
+           size_t aligned_offset =
+               std::max(0,
+                        static_cast<int>(ALIGN(offset, Limitations::get_instance()->get_memory_alignment()) -
+                                         Limitations::get_instance()->get_memory_alignment()));
+           size_t num_rows_padded = (offset - aligned_offset) / Limitations::kBytesPerConcatElement;
            size_t num_rows_out = num_rows_padded + num_rows_in;

            // encodes offset to beginning of split layer input
            size_t bytesOffset =
-               (aligned64_offset / bytesPerConcatElement) * (quantized ? bytesPerConcatElement : 4);
+               (aligned_offset / Limitations::kBytesPerConcatElement) *
+               (quantized ? Limitations::kBytesPerConcatElement : Precision(Precision::FP32).size());
            concatAligningFilter->params["output_offset"] = std::to_string(bytesOffset);

            // for padded rows we cannot use copy layer - TBD how to implement
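A worked numeric example of the new offset arithmetic (an illustration, not code from the patch): assume a 16-byte device alignment and the 2-byte concat element size, with an input landing at byte offset 100.

    #include <algorithm>
    #include <cstddef>
    #include <iostream>

    int main() {
        const size_t alignment = 16;       // assumed device alignment
        const size_t bytesPerElement = 2;  // Limitations::kBytesPerConcatElement
        const size_t offset = 100;         // unaligned concat input offset (illustrative)

        // ALIGN(100, 16) == 112; stepping back one alignment unit gives 96, the
        // aligned boundary at or before the real data.
        const size_t aligned_up = ((offset + alignment - 1) / alignment) * alignment;
        const size_t aligned_offset = std::max<size_t>(alignment, aligned_up) - alignment;  // 96

        // Two 2-byte padded rows cover the 4 bytes between 96 and 100.
        const size_t num_rows_padded = (offset - aligned_offset) / bytesPerElement;  // 2

        std::cout << aligned_offset << " " << num_rows_padded << "\n";  // prints "96 2"
        return 0;
    }

Under the old code the boundary would have been ALIGN64(100) - 64 = 64 with 18 padded rows, so the device-specific alignment shrinks the aligning filter considerably.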
@@ -1496,7 +1498,7 @@ void InsertSplitAligningFilterPass::run() {
        for (auto&& splitOutput : l->outData) {
            auto outputSize = product(begin(splitOutput->getDims()), end(splitOutput->getDims()));

-           if ((currentOffset != ALIGN64(currentOffset)) || (padding != 0)) {
+           if ((!Limitations::get_instance()->is_aligned(currentOffset)) || (padding != 0)) {
                // check that this split output actually connected to further layers
                if (getInputTo(splitOutput).empty()) {
                    log::debug() << "Output port: " << splitOutIndex << " of " << l->name << " unconnected, skipping\n";
@@ -1507,7 +1509,7 @@ void InsertSplitAligningFilterPass::run() {
                                        << " Convolution Filter doesn't support batch=" << splitOutput->getDims().front();
                }

-               // this split output not beginning from 64 bytes aligned boundary - need to correct by aligning
+               // this split output not beginning from aligned bytes boundary - need to correct by aligning
                // filter layer insert the filter
                auto filterName = std::string("AlignFilter_") + std::to_string(numOfFilterLayers++);
@@ -1527,20 +1529,22 @@ void InsertSplitAligningFilterPass::run() {

                auto inputData = splitOutput;

-               size_t aligned64_offset = std::max(0, static_cast<int>(ALIGN64(currentOffset) - 64));
+               size_t aligned_offset = std::max(
+                   0,
+                   static_cast<int>(ALIGN(currentOffset, Limitations::get_instance()->get_memory_alignment()) -
+                                    Limitations::get_instance()->get_memory_alignment()));

                IE_ASSERT(filterLayer != nullptr);

                // encodes offset to beginning of split layer input
-               filterLayer->params["offset"] =
-                   std::to_string(aligned64_offset / Limitations::kBytesPerSplitElement);
+               filterLayer->params["offset"] = std::to_string(aligned_offset / Limitations::kBytesPerSplitElement);
                auto dims = splitOutput->getTensorDesc().getDims();
                if (dims.size() > 3) {
                    THROW_GNA_EXCEPTION << "unsupported split layer dims size: " << dims.size();
                }

                const auto offsetOfUnalignment =
-                   (currentOffset - aligned64_offset) / Limitations::kBytesPerSplitElement;
+                   (currentOffset - aligned_offset) / Limitations::kBytesPerSplitElement;
                // TODO consider to use a different number of filters to decrease the number of trailing zeros
                // (additionalPaddingOfFilter)
                const auto numberOfFilters = Limitations::kConvMinFiltersNum;
@@ -152,7 +152,7 @@ DECL_PASS(InsertSplitAligningFilter);
DECL_PASS(FlattenTrivialConcat);

/**
- * @brief concat-aligning filter layer insertion required in cases when concat inputs size are not 64-aligned
+ * @brief concat-aligning filter layer insertion required in cases when concat inputs size are not aligned
 */
DECL_PASS(InsertConcatAligningFilter);
@@ -64,7 +64,9 @@ static bool Convert(std::shared_ptr<ngraph::Node> conv,
    auto& input = conv->get_input_shape(0);
    uint32_t width = input.back();
    uint32_t in_channels = input.at(1);
-   auto split_sizes = GetAlignedSplitSizes(width, Limitations::kBufferMaxSize / in_channels);
+   auto split_sizes = GetAlignedSplitSizes(width,
+                                           Limitations::kBufferMaxSize / in_channels,
+                                           Limitations::get_instance()->get_memory_alignment());
    IE_ASSERT(split_sizes.size() > 1);
    std::vector<int64_t> split_sizes_casted(split_sizes.size());
    std::transform(std::begin(split_sizes), std::end(split_sizes), std::begin(split_sizes_casted), [](uint32_t size) {
@@ -7,9 +7,13 @@
#include <vector>
// to suppress deprecated definition errors
#define IMPLEMENT_INFERENCE_ENGINE_PLUGIN
+#include "common/gna_target.hpp"
#include "layers/gna_split_layer.hpp"
#include "ngraph/opsets/opset9.hpp"

+using namespace ov::intel_gna::limitations;
+using namespace ov::intel_gna::target;

namespace {

using GetAlignedSplitSizesData = std::tuple<uint32_t,  // total size
@@ -19,10 +23,15 @@ using GetAlignedSplitSizesData = std::tuple<uint32_t,  // total size
                                            >;

const std::vector<GetAlignedSplitSizesData> data = {
    GetAlignedSplitSizesData{10, 100, 64, std::vector<uint32_t>{10}},
    GetAlignedSplitSizesData{1024, 100, 64, std::vector<uint32_t>(16, 64)},
    GetAlignedSplitSizesData{151, 100, 64, std::vector<uint32_t>{64, 64, 23}},
    GetAlignedSplitSizesData{151, 65, 32, std::vector<uint32_t>{64, 64, 23}},
-   GetAlignedSplitSizesData{151, 65, 1, std::vector<uint32_t>{65, 65, 21}}};
+   GetAlignedSplitSizesData{151, 33, 32, std::vector<uint32_t>{32, 32, 32, 32, 23}},
+   GetAlignedSplitSizesData{151, 17, 16, std::vector<uint32_t>{16, 16, 16, 16, 16, 16, 16, 16, 16, 7}},
+   GetAlignedSplitSizesData{151, 65, 1, std::vector<uint32_t>{65, 65, 21}},
+   GetAlignedSplitSizesData{67000, 65528, 64, std::vector<uint32_t>{65472, 1528}},
+   GetAlignedSplitSizesData{67000, 65528, 16, std::vector<uint32_t>{65520, 1480}}};

TEST(GetAlignedSplitSizesTest, testAlignedSplitSizes) {
    for (const auto& dataItem : data) {
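As a cross-check of the two largest vectors: with splitSize = 65528 and 64-byte alignment the largest aligned part is 65528 - 65528 % 64 = 65472, so 67000 splits into {65472, 1528}; with 16-byte alignment it is 65528 - 65528 % 16 = 65520, giving {65520, 1480}.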
@@ -38,55 +47,86 @@ using VariadicSplitParameters = std::tuple<ov::Shape,  // input size
                                           bool        // supported
                                           >;

-const std::vector<VariadicSplitParameters> variadic_split_data = {
-   VariadicSplitParameters{ov::Shape{1024}, 0, std::vector<int32_t>{192, 192, 320, 320}, true},
-   VariadicSplitParameters{ov::Shape{1, 1024}, 1, std::vector<int32_t>{640, 192, 192}, true},
-   VariadicSplitParameters{ov::Shape{1024}, 0, std::vector<int32_t>{500, 24, 500}, false},
-   VariadicSplitParameters{ov::Shape{1, 1024}, 1, std::vector<int32_t>{700, 300, 24}, false},
-};

-TEST(CheckSplitSupported, CheckVariadicSplitSupported) {
+void RunVariadicSplitSupportedTest(DeviceVersion device_version, std::vector<VariadicSplitParameters> test_vectors) {
    ov::Shape input_shape;
    uint32_t axis;
    std::vector<int32_t> split_lengths;
    bool result;
-   for (const auto& item : variadic_split_data) {
+   Limitations::init(device_version);
+   for (const auto& item : test_vectors) {
        std::tie(input_shape, axis, split_lengths, result) = item;
        auto split = std::make_shared<ngraph::opset9::VariadicSplit>(
            std::make_shared<ngraph::opset9::Parameter>(ngraph::element::f32, input_shape),
            ngraph::opset9::Constant::create(ngraph::element::i64, ngraph::Shape({1}), {axis}),
            ngraph::opset9::Constant::create(ngraph::element::i64,
                                             ngraph::Shape({split_lengths.size()}),
                                             split_lengths));
-       ASSERT_TRUE(ov::intel_gna::limitations::Limitations::is_split_supported(split, false) == result);
+       ASSERT_TRUE(Limitations::is_split_supported(split, false) == result);
    }
}

+TEST(CheckSplitSupported, CheckVariadicSplitSupported_GNA3_5) {
+   RunVariadicSplitSupportedTest(
+       DeviceVersion::GNA3_5,
+       {VariadicSplitParameters{ov::Shape{1024}, 0, std::vector<int32_t>{192, 192, 320, 320}, true},
+        VariadicSplitParameters{ov::Shape{1, 1024}, 1, std::vector<int32_t>{640, 192, 192}, true},
+        VariadicSplitParameters{ov::Shape{1024}, 0, std::vector<int32_t>{16, 1008}, false},
+        VariadicSplitParameters{ov::Shape{1024}, 0, std::vector<int32_t>{500, 24, 500}, false},
+        VariadicSplitParameters{ov::Shape{1, 1024}, 1, std::vector<int32_t>{700, 300, 24}, false}});
+}

+TEST(CheckSplitSupported, CheckVariadicSplitSupported_GNA3_6) {
+   RunVariadicSplitSupportedTest(
+       DeviceVersion::GNA3_6,
+       {VariadicSplitParameters{ov::Shape{1024}, 0, std::vector<int32_t>{192, 192, 320, 320}, true},
+        VariadicSplitParameters{ov::Shape{1, 1024}, 1, std::vector<int32_t>{640, 192, 192}, true},
+        VariadicSplitParameters{ov::Shape{1024}, 0, std::vector<int32_t>{16, 1008}, true},
+        VariadicSplitParameters{ov::Shape{1024}, 0, std::vector<int32_t>{500, 24, 500}, false},
+        VariadicSplitParameters{ov::Shape{1, 1024}, 1, std::vector<int32_t>{700, 300, 24}, false}});
+}

using SplitParameters = std::tuple<ov::Shape,  // input size
                                   uint32_t,   // axis
                                   uint32_t,   // num_splits
                                   bool        // supported
                                   >;

-const std::vector<SplitParameters> split_data = {
-   SplitParameters{ov::Shape{1024}, 0, 4, true},
-   SplitParameters{ov::Shape{1, 1024}, 1, 16, true},
-   SplitParameters{ov::Shape{1024}, 0, 64, false},
-   SplitParameters{ov::Shape{1, 1024}, 1, 256, false},
-};

-TEST(CheckSplitSupported, CheckSplitSupported) {
+void RunSplitSupportedTest(DeviceVersion device_version, std::vector<SplitParameters> test_vectors) {
    ov::Shape input_shape;
    uint32_t axis;
    uint32_t num_splits;
    bool result;
-   for (const auto& item : split_data) {
+   Limitations::init(device_version);
+   for (const auto& item : test_vectors) {
        std::tie(input_shape, axis, num_splits, result) = item;
        auto split = std::make_shared<ngraph::opset9::Split>(
            std::make_shared<ngraph::opset9::Parameter>(ngraph::element::f32, input_shape),
            ngraph::opset9::Constant::create(ngraph::element::i64, ngraph::Shape({}), {axis}),
            num_splits);
-       ASSERT_TRUE(ov::intel_gna::limitations::Limitations::is_split_supported(split, false) == result);
+       ASSERT_TRUE(Limitations::is_split_supported(split, false) == result);
    }
}

+TEST(CheckSplitSupported, CheckSplitSupported_GNA3_5) {
+   RunSplitSupportedTest(DeviceVersion::GNA3_5,
+                         {
+                             SplitParameters{ov::Shape{1024}, 0, 4, true},
+                             SplitParameters{ov::Shape{1, 1024}, 1, 16, true},
+                             SplitParameters{ov::Shape{1024}, 0, 64, false},
+                             SplitParameters{ov::Shape{1, 1024}, 1, 256, false},
+                         });
+}

+TEST(CheckSplitSupported, CheckSplitSupported_GNA3_6) {
+   RunSplitSupportedTest(DeviceVersion::GNA3_6,
+                         {
+                             SplitParameters{ov::Shape{1024}, 0, 4, true},
+                             SplitParameters{ov::Shape{1, 1024}, 1, 16, true},
+                             SplitParameters{ov::Shape{1024}, 0, 64, true},
+                             SplitParameters{ov::Shape{1, 1024}, 1, 256, false},
+                         });
+}
}  // namespace
File diff suppressed because it is too large
@@ -9,9 +9,14 @@
#include <ngraph/pass/manager.hpp>
#include <transformations/init_node_info.hpp>

+#include "backend/gna_limitations.hpp"
+#include "common/gna_target.hpp"
#include "common_test_utils/ngraph_test_utils.hpp"
#include "transformations/split_convolution_with_large_buffer_size.hpp"

+using namespace ov::intel_gna::limitations;
+using namespace ov::intel_gna::target;

namespace testing {
namespace {
@@ -126,29 +131,22 @@ ngraph::Output<ngraph::Node> CreateConvolution::createOutputNode(const ngraph::O
}

// should be used only after CreateBaseDecorator
+template <const ngraph::Shape& kernel_shape, const ngraph::Shape& split_shape>
class CreateSplittedConvolution : public CreateGraphDecorator {
public:
-   CreateSplittedConvolution(CreateGraphDecoratorPtr prev,
-                             const ngraph::Shape& kernel_shape = ngraph::Shape{1, 64, 1, 1},
-                             const ngraph::Shape& split_shape = ngraph::Shape{960, 960, 960, 960, 256})
+   CreateSplittedConvolution(CreateGraphDecoratorPtr prev)
        : CreateGraphDecorator(std::move(prev)),
          kernel_shape_(kernel_shape),
          split_shape_(split_shape) {}

protected:
-   void updateGraph(Graph& graph) override;
-
-private:
-   const ngraph::Shape kernel_shape_;
-   const ngraph::Shape split_shape_;
-};
-
-void CreateSplittedConvolution::updateGraph(Graph& graph) {
+   void updateGraph(Graph& graph) override {
        auto split_node_c1 =
            ngraph::opset7::Constant::create(ngraph::element::i64, ngraph::Shape({1}), std::vector<int64_t>{3});
        auto split_node_c2 =
            ngraph::opset7::Constant::create(ngraph::element::i64, ngraph::Shape({split_shape_.size()}), split_shape_);
-       auto split_node = std::make_shared<ngraph::opset7::VariadicSplit>(graph.input_params, split_node_c1, split_node_c2);
+       auto split_node =
+           std::make_shared<ngraph::opset7::VariadicSplit>(graph.input_params, split_node_c1, split_node_c2);

        auto kernel = ngraph::opset7::Constant::create(ngraph::element::f32, kernel_shape_, {1});
@@ -163,6 +161,11 @@ void CreateSplittedConvolution::updateGraph(Graph& graph) {
    }
}

+private:
+   const ngraph::Shape kernel_shape_;
+   const ngraph::Shape split_shape_;
+};

class CreateAdd : public CreateAppendableGraphDecorator {
public:
    CreateAdd(CreateGraphDecoratorPtr prev) : CreateAppendableGraphDecorator(std::move(prev)) {}
@@ -261,9 +264,10 @@ Graph createSolidGraph(const ngraph::Shape& input_shape, const ngraph::Shape& ke

// -------------------------------------------------------------------------------------------------------

+using TestParams = std::tuple<Graph, Graph, ngraph::pass::Manager>;

class SplitConvolutionFixture : public CommonTestUtils::TestsCommon,
-                               public ::testing::WithParamInterface<
-                                   std::tuple<Graph /* tranformed */, Graph /* reference */, ngraph::pass::Manager>> {
+                               public ::testing::WithParamInterface<std::tuple<DeviceVersion, TestParams>> {
public:
    void SetUp() override;
@@ -274,10 +278,14 @@ public:

void SplitConvolutionFixture::SetUp() {
    // TODO: use auto & [transformed_graph, reference_graph] = this->GetParam() when C++17
+   DeviceVersion device_version;
+   TestParams params;
    Graph transformed_graph;
    Graph reference_graph;
-   std::tie(transformed_graph, reference_graph, pass_manager) = this->GetParam();
+   std::tie(device_version, params) = this->GetParam();
+   std::tie(transformed_graph, reference_graph, pass_manager) = params;

+   Limitations::init(device_version);
    function = transformed_graph.createFunction();
    reference_function = reference_graph.createFunction();
}
@@ -305,21 +313,11 @@ TEST_P(SplitConvolutionFixture, CompareFunctions) {
}

INSTANTIATE_TEST_SUITE_P(
-   SplitConvolutionTestSuite,
+   SplitConvolution_GNA3_0_3_5_3_6_TestSuite,
    SplitConvolutionFixture,
+   ::testing::Combine(
+       ::testing::Values(DeviceVersion::GNA3_0, DeviceVersion::GNA3_5, DeviceVersion::GNA3_6),
+       ::testing::Values(
-           std::make_tuple(createGraph<CreateConvolution>(),
-                           createGraph<CreateConcat, CreateSplittedConvolution>(),
-                           createPassManager<ov::intel_gna::pass::SplitConvolution>()),
-           std::make_tuple(createGraph<CreateAdd, CreateConvolution>(),
-                           createGraph<CreateConcat, CreateAdd, CreateSplittedConvolution>(),
-                           createPassManager<ov::intel_gna::pass::SplitConvolutionWithBias>()),
-           std::make_tuple(createGraph<CreateFakeQuantize, CreateConvolution>(),
-                           createGraph<CreateConcat, CreateFakeQuantize, CreateSplittedConvolution>(),
-                           createPassManager<ov::intel_gna::pass::SplitConvolutionWithFq>()),
-           std::make_tuple(createGraph<CreateFakeQuantize, CreateAdd, CreateConvolution>(),
-                           createGraph<CreateConcat, CreateFakeQuantize, CreateAdd, CreateSplittedConvolution>(),
-                           createPassManager<ov::intel_gna::pass::SplitConvolutionWithFq>()),
            std::make_tuple(createSolidGraph(ngraph::Shape{1, 1, 1, 1}, ngraph::Shape{1, 1, 1, 1}),
                            createSolidGraph(ngraph::Shape{1, 1, 1, 1}, ngraph::Shape{1, 1, 1, 1}),
                            createPassManager<ov::intel_gna::pass::SplitConvolution>()),
@@ -332,7 +330,53 @@ INSTANTIATE_TEST_SUITE_P(
            std::make_tuple(
                createSolidGraph<CreateAdd, CreateFakeQuantize>(ngraph::Shape{1, 1, 1, 1}, ngraph::Shape{1, 1, 1, 1}),
                createSolidGraph<CreateAdd, CreateFakeQuantize>(ngraph::Shape{1, 1, 1, 1}, ngraph::Shape{1, 1, 1, 1}),
-               createPassManager<ov::intel_gna::pass::SplitConvolutionWithFq>())));
+               createPassManager<ov::intel_gna::pass::SplitConvolutionWithFq>()))));

+ngraph::Shape kernel_shape_3_5 = {1, 64, 1, 1};
+ngraph::Shape split_shape_3_5 = {960, 960, 960, 960, 256};
+using CreateSplitedConvolution3_5 = CreateSplittedConvolution<kernel_shape_3_5, split_shape_3_5>;

+INSTANTIATE_TEST_SUITE_P(
+   SplitConvolution_GNA3_0_3_5_TestSuite,
+   SplitConvolutionFixture,
+   ::testing::Combine(
+       ::testing::Values(DeviceVersion::GNA3_0, DeviceVersion::GNA3_5),
+       ::testing::Values(
+           std::make_tuple(createGraph<CreateConvolution>(),
+                           createGraph<CreateConcat, CreateSplitedConvolution3_5>(),
+                           createPassManager<ov::intel_gna::pass::SplitConvolution>()),
+           std::make_tuple(createGraph<CreateAdd, CreateConvolution>(),
+                           createGraph<CreateConcat, CreateAdd, CreateSplitedConvolution3_5>(),
+                           createPassManager<ov::intel_gna::pass::SplitConvolutionWithBias>()),
+           std::make_tuple(createGraph<CreateFakeQuantize, CreateConvolution>(),
+                           createGraph<CreateConcat, CreateFakeQuantize, CreateSplitedConvolution3_5>(),
+                           createPassManager<ov::intel_gna::pass::SplitConvolutionWithFq>()),
+           std::make_tuple(createGraph<CreateFakeQuantize, CreateAdd, CreateConvolution>(),
+                           createGraph<CreateConcat, CreateFakeQuantize, CreateAdd, CreateSplitedConvolution3_5>(),
+                           createPassManager<ov::intel_gna::pass::SplitConvolutionWithFq>()))));

+ngraph::Shape kernel_shape_3_6 = {1, 64, 1, 1};
+ngraph::Shape split_shape_3_6 = {1008, 1008, 1008, 1008, 64};
+using CreateSplitedConvolution3_6 = CreateSplittedConvolution<kernel_shape_3_6, split_shape_3_6>;

+INSTANTIATE_TEST_SUITE_P(
+   SplitConvolution_GNA3_6_TestSuite,
+   SplitConvolutionFixture,
+   ::testing::Combine(
+       ::testing::Values(DeviceVersion::GNA3_6),
+       ::testing::Values(
+           std::make_tuple(createGraph<CreateConvolution>(),
+                           createGraph<CreateConcat, CreateSplitedConvolution3_6>(),
+                           createPassManager<ov::intel_gna::pass::SplitConvolution>()),
+           std::make_tuple(createGraph<CreateAdd, CreateConvolution>(),
+                           createGraph<CreateConcat, CreateAdd, CreateSplitedConvolution3_6>(),
+                           createPassManager<ov::intel_gna::pass::SplitConvolutionWithBias>()),
+           std::make_tuple(createGraph<CreateFakeQuantize, CreateConvolution>(),
+                           createGraph<CreateConcat, CreateFakeQuantize, CreateSplitedConvolution3_6>(),
+                           createPassManager<ov::intel_gna::pass::SplitConvolutionWithFq>()),
+           std::make_tuple(createGraph<CreateFakeQuantize, CreateAdd, CreateConvolution>(),
+                           createGraph<CreateConcat, CreateFakeQuantize, CreateAdd, CreateSplitedConvolution3_6>(),
+                           createPassManager<ov::intel_gna::pass::SplitConvolutionWithFq>()))));

}  // namespace
}  // namespace testing
@@ -11,10 +11,15 @@
#include <ngraph/pass/manager.hpp>
#include <transformations/init_node_info.hpp>

+#include "backend/gna_limitations.hpp"
+#include "common/gna_target.hpp"
#include "common_test_utils/common_utils.hpp"
#include "common_test_utils/ngraph_test_utils.hpp"
#include "transformations/split_eltwise.hpp"

+using namespace ov::intel_gna::limitations;
+using namespace ov::intel_gna::target;

namespace testing {
namespace {
@@ -87,7 +92,8 @@ static std::shared_ptr<ngraph::Function> createFunction(const ngraph::Shape& inp
    }
}

-typedef std::tuple<ngraph::Shape,
+typedef std::tuple<DeviceVersion,  // device version
+                   ngraph::Shape,  // input shape
                   bool,           // with const
                   bool,           // with fq
                   ELTWISE_TYPE    // eltwise type
@@ -95,13 +101,15 @@ typedef std::tuple<ngraph::Shape,
    EltwiseSplitParams;

static std::string getTestCaseName(testing::TestParamInfo<EltwiseSplitParams> obj) {
+   DeviceVersion device_ver;
    ngraph::Shape shape;
    bool with_const;
    bool with_fq;
    ELTWISE_TYPE type;
-   std::tie(shape, with_const, with_fq, type) = obj.param;
+   std::tie(device_ver, shape, with_const, with_fq, type) = obj.param;

    std::ostringstream result;
+   result << DeviceToString(device_ver) << "_";
    result << "IS=" << CommonTestUtils::vec2str(shape) << "_";
    result << "wConst=" << with_const << "_";
    result << "wFQ=" << with_fq << "_";
@@ -132,11 +140,13 @@ public:
};

void SplitEltwiseTestSuiteFixture::SetUp() {
+   DeviceVersion device_ver;
    ngraph::Shape shape;
    bool with_const;
    bool with_fq;
    ELTWISE_TYPE type;
-   std::tie(shape, with_const, with_fq, type) = this->GetParam();
+   std::tie(device_ver, shape, with_const, with_fq, type) = this->GetParam();
+   Limitations::init(device_ver);
    function = createFunction(shape, with_const, with_fq, type, false);
    reference_function = createFunction(shape, with_const, with_fq, type, true);
}
@@ -158,13 +168,16 @@ TEST_P(SplitEltwiseTestSuiteFixture, CompareFunctions) {

const std::vector<ov::Shape> inputShape = {{1, 67000}, {1, 500000}, {1, 936, 513}, {1, 64, 64, 64}, {1, 256, 64, 64}};

-INSTANTIATE_TEST_SUITE_P(SplitEltwiseTestSuite,
+INSTANTIATE_TEST_SUITE_P(
+   SplitEltwiseTestSuite,
    SplitEltwiseTestSuiteFixture,
-   ::testing::Combine(::testing::ValuesIn(inputShape),
+   ::testing::Combine(::testing::ValuesIn(std::vector<DeviceVersion>{DeviceVersion::GNA3_0,  // device version
+                                                                     DeviceVersion::GNA3_5,
+                                                                     DeviceVersion::GNA3_6}),
+                      ::testing::ValuesIn(inputShape),
                       ::testing::ValuesIn(std::vector<bool>{true, false}),  // with const
                       ::testing::ValuesIn(std::vector<bool>{true, false}),  // with fq
-                      ::testing::ValuesIn(std::vector<ELTWISE_TYPE>{
-                          ELTWISE_TYPE::Sum,
+                      ::testing::ValuesIn(std::vector<ELTWISE_TYPE>{ELTWISE_TYPE::Sum,
                                                                     ELTWISE_TYPE::Sub,
                                                                     ELTWISE_TYPE::Prod})),  // eltwise type
    getTestCaseName);