[GNA] Introduce 16Byte memory alignment for LNL (GNA3.6) (#16363)

* [GNA] Introduce 16Byte memory alignment for LNL (GNA3.6)

* update after review
This commit is contained in:
Tomasz Adamowicz 2023-03-27 11:42:34 +02:00 committed by GitHub
parent 5e835e327b
commit 4936d4bb1d
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 190 additions and 9 deletions

View File

@ -41,6 +41,20 @@ const std::set<ov::element::Type> SupportedElementTypes::supported_parameter_typ
ov::element::i16,
ov::element::f32};
// Returns the required memory alignment in bytes for the given device
// generation. Generations up to GNA3.5 need 64-byte alignment; GNA3.6 and
// GNA4.0 need only 16 bytes. Lookups for targets absent from the table
// (e.g. DeviceVersion::NotSet) propagate the error from GetValueForKey
// (the unit tests expect a throw in that case).
size_t getMemoryAlignmentBytes(target::DeviceVersion target) {
    static const std::unordered_map<target::DeviceVersion, size_t> alignment_by_device{
        {target::DeviceVersion::GNA1_0, 64},
        {target::DeviceVersion::GNA2_0, 64},
        {target::DeviceVersion::GNA3_0, 64},
        {target::DeviceVersion::GNA3_1, 64},
        {target::DeviceVersion::GNA3_5, 64},
        {target::DeviceVersion::GNAEmbedded3_5, 64},
        {target::DeviceVersion::GNA3_6, 16},
        {target::DeviceVersion::GNA4_0, 16}};

    return common::GetValueForKey<target::DeviceVersion, size_t>(target, alignment_by_device);
}
bool SupportedElementTypes::is_parameter_type_supported(ov::element::Type elem_type, bool is_exception_allowed) {
if (supported_parameter_types.count(elem_type) == 0) {
if (is_exception_allowed) {

View File

@ -11,6 +11,7 @@
#include <ie_algorithm.hpp>
#include "common/gna_target.hpp"
#include "common/misc_utils.hpp"
#include "dnn_types.hpp"
#include "gna_lib_ver_selector.hpp"
#include "legacy/ngraph_ops/convolution_ie.hpp"
@ -55,7 +56,7 @@ constexpr uint32_t bytesPerSplitElement = 2;
// In fp32 mode this is not necessary but is useful for testing
constexpr uint32_t bytesPerCropElement = 2;
constexpr uint32_t kMemoryAlignmentBytes = 64;
constexpr uint32_t kMemoryPageSize = 4096;
inline bool isCropAffinedOffset(size_t numberOfElements) {
const auto cropOffset = numberOfElements * bytesPerCropElement;
@ -78,6 +79,8 @@ inline bool IsTransposeSupported(const std::vector<size_t>& shape) {
return min <= 8 && max % 8 == 0 && max >= 8 && max <= transposeMaxSize;
}
size_t getMemoryAlignmentBytes(target::DeviceVersion target);
class SupportedElementTypes {
public:
static bool is_parameter_type_supported(ov::element::Type type, bool is_exception_allowed = false);

View File

@ -38,7 +38,8 @@ GNADeviceHelper::GNADeviceHelper(std::shared_ptr<Target> targetIn, bool isPerfor
: target(targetIn),
nGnaDeviceIndex{selectGnaDevice()},
useDeviceEmbeddedExport(deviceEmbedded),
isPerformanceMeasuring(isPerformanceMeasuring) {
isPerformanceMeasuring(isPerformanceMeasuring),
m_mem_alignment(limitations::getMemoryAlignmentBytes(targetIn->get_effective_compile_target())) {
per_request_diagnostics = log::get_log_level() >= ov::log::Level::TRACE;
per_model_diagnostics = log::get_log_level() >= ov::log::Level::DEBUG;
open();
@ -48,8 +49,6 @@ GNADeviceHelper::GNADeviceHelper(std::shared_ptr<Target> targetIn, bool isPerfor
GetGnaLibraryVersion();
maxLayersCount_ = retrieveMaxLayersCount();
m_mem_alignment = limitations::kMemoryAlignmentBytes;
}
GNADeviceHelper::~GNADeviceHelper() {

View File

@ -47,7 +47,6 @@ class GNADeviceHelper : public GNADevice {
uint32_t nGnaDeviceIndex = 0;
bool useDeviceEmbeddedExport = false;
uint32_t maxLayersCount_ = 0;
size_t m_mem_alignment = 0;
static const uint32_t TotalGna2InstrumentationPoints = 2;
Gna2InstrumentationPoint gna2InstrumentationPoints[TotalGna2InstrumentationPoints] = {
@ -68,6 +67,7 @@ class GNADeviceHelper : public GNADevice {
uint64_t debugLogIndexRequestWait = 0;
static constexpr const char* kDumpExt = ".bin";
static constexpr const char* kDumpDelimiter = ".";
const size_t m_mem_alignment;
public:
explicit GNADeviceHelper(std::shared_ptr<target::Target> target = std::make_shared<target::Target>(),
@ -128,7 +128,7 @@ public:
return allAllocations;
}
const size_t getMemAlignment() const {
size_t getMemAlignment() const {
return m_mem_alignment;
}

View File

@ -375,9 +375,10 @@ void GNAPlugin::InitGNADevice() {
gnadevice = std::make_shared<GNADeviceHelper>(config.target,
gnaFlags->performance_counting,
!config.embedded_export_path.empty());
size_t page_size_bytes = 4096;
size_t mem_alignment = gnadevice->getMemAlignment();
gnamem = std::make_shared<gna_memory_device>(memory::GNAAllocator(gnadevice), mem_alignment, page_size_bytes);
gnamem = std::make_shared<gna_memory_device>(memory::GNAAllocator(gnadevice),
gnadevice->getMemAlignment(),
limitations::kMemoryPageSize);
}
graphCompiler.setGNAMemoryPtr(gnamem);
}

View File

@ -0,0 +1,164 @@
// Copyright (C) 2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include <gtest/gtest.h>
#include "any_copy.hpp"
#include "backend/gna_limitations.hpp"
#include "common/gna_target.hpp"
#include "gna_data_types.hpp"
#include "gna_plugin.hpp"
#include "memory/gna_memory.hpp"
#include "ngraph_functions/builders.hpp"
using namespace InferenceEngine;
using namespace ov::intel_gna::target;
using namespace ov::intel_gna::limitations;
namespace testing {
// Tuple of (execution mode, compile target, {input shape, expected size in
// bytes of the REGION_INPUTS memory region after LoadNetwork}).
using MemAlignmentTestParams =
    std::tuple<ExecutionMode,                     // execution mode
               HWGeneration,                      // compile target
               std::pair<ngraph::Shape, size_t>   // input shape vs expected memory size of the input region in bytes.
                                                  // For this specific model and when the value of input_shape_H = 1,
                                                  // the memory input region size can be calculated using below formula:
                                                  // mem_input_region_size = ALIGN8(input_shape_W)*inputPrecInBytes.
                                                  // Refer to GNAGraphCompiler::AffinePrimitive for more details.
               >;

// fp32 input (4 bytes/element): e.g. {1,9} -> ALIGN8(9)*4 = 64.
const std::vector<std::pair<ngraph::Shape, size_t>> param_16B_alignment_prec_fp32{{{1, 2}, 32},
                                                                                  {{1, 8}, 32},
                                                                                  {{1, 9}, 64}};

// int16 input (2 bytes/element), 64-byte-aligned targets (<= GNA3.5): sizes
// from the formula above are rounded up to a 64-byte multiple
// (e.g. {1,33} -> ALIGN8(33)*2 = 80 -> 128).
const std::vector<std::pair<ngraph::Shape, size_t>> param_64B_alignment_prec_int16{{{1, 2}, 64},
                                                                                   {{1, 32}, 64},
                                                                                   {{1, 33}, 128}};

// int16 input (2 bytes/element), 16-byte-aligned targets (GNA3.6/GNA4.0):
// the same sizes rounded up only to a 16-byte multiple ({1,33} -> 80).
const std::vector<std::pair<ngraph::Shape, size_t>> param_16B_alignment_prec_int16{{{1, 2}, 16},
                                                                                   {{1, 8}, 16},
                                                                                   {{1, 9}, 32},
                                                                                   {{1, 33}, 80}};
// GNAPlugin subclass for the alignment tests: replaces the device-backed
// memory object with a float-allocator one that keeps the alignment reported
// by the device for the configured compile target, then drops the device so
// compilation proceeds without hardware. Also exposes the size of the
// REGION_INPUTS memory queue for assertions.
class GNAPluginForMemoryAlignmentTest : public GNAPlugin {
public:
    GNAPluginForMemoryAlignmentTest(const std::map<std::string, std::string>& configMap) : GNAPlugin(configMap) {
        if (gnadevice) {
            // Recreate the memory object with the device's alignment before
            // detaching the device; the graph compiler must see the new
            // memory pointer.
            gnamem.reset(new gna_memory_float(memory::GNAFloatAllocator{},
                                              gnadevice->getMemAlignment(),
                                              limitations::kMemoryPageSize));
            graphCompiler.setGNAMemoryPtr(gnamem);
            gnadevice.reset();
        }
    }

    // Bytes reserved in the REGION_INPUTS region after LoadNetwork.
    const size_t get_memory_REGION_INPUTS_size() const {
        return this->gnamem->getQueue(ov::intel_gna::memory::REGION_INPUTS)->calcSize();
    }
};
// Parameterized fixture: loads a minimal MatMul network through the plugin
// and checks that the reserved REGION_INPUTS size matches the expectation for
// the chosen execution mode / compile target.
class GNAPluginLoadNetworkTests : public ::testing::TestWithParam<MemAlignmentTestParams> {
public:
    static std::string GetTestCaseName(const testing::TestParamInfo<MemAlignmentTestParams>& obj) {
        const auto& shape_vs_size = std::get<2>(obj.param);
        std::ostringstream name;
        name << "inp=" << shape_vs_size.first.to_string() << "_";
        name << "mem_region_size=" << shape_vs_size.second;
        return name.str();
    }

protected:
    void Run() {
        const auto& params = this->GetParam();
        const ExecutionMode exe_mode = std::get<0>(params);
        const HWGeneration hw_gen = std::get<1>(params);
        const ngraph::Shape inp_shape = std::get<2>(params).first;
        const size_t expected_region_size = std::get<2>(params).second;

        const ov::AnyMap gna_config = {ov::intel_gna::execution_mode(exe_mode), ov::intel_gna::compile_target(hw_gen)};

        auto plugin = GNAPluginForMemoryAlignmentTest(any_copy(gna_config));
        CNNNetwork cnnNetwork(getMulFunction(inp_shape));
        plugin.LoadNetwork(cnnNetwork);
        EXPECT_EQ(plugin.get_memory_REGION_INPUTS_size(), expected_region_size);
    }

    void SetUp() override {
        test_params = GetParam();
    }

private:
    // Builds Parameter -> MatMul(transpose_b = true) -> Result in fp32.
    std::shared_ptr<ov::Model> getMulFunction(const ngraph::Shape input_shape) {
        const ngraph::element::Type net_precision = ngraph::element::f32;
        auto param = std::make_shared<ngraph::opset8::Parameter>(net_precision, input_shape);
        auto weights = std::make_shared<ngraph::opset8::Constant>(net_precision, input_shape);
        auto matmul = std::make_shared<ngraph::opset8::MatMul>(param, weights, false, true);
        auto result = std::make_shared<ngraph::opset8::Result>(matmul);
        return std::make_shared<ov::Model>(ov::ResultVector({result}), ov::ParameterVector({param}), "MatMul");
    }

    MemAlignmentTestParams test_params;
};
// One test body shared by all instantiations below.
TEST_P(GNAPluginLoadNetworkTests, CompareInpShapeVsReservedMemRegion) {
    Run();
}

// SW_FP32 mode: fp32 expectations (16-byte-aligned sizes).
INSTANTIATE_TEST_SUITE_P(MemoryAlignment_FP32,
                         GNAPluginLoadNetworkTests,
                         ::testing::Combine(::testing::Values(ExecutionMode::SW_FP32),
                                            ::testing::Values(HWGeneration::UNDEFINED),
                                            ::testing::ValuesIn(param_16B_alignment_prec_fp32)),
                         GNAPluginLoadNetworkTests::GetTestCaseName);

// GNA 3.0 compile target: 64-byte alignment expected.
INSTANTIATE_TEST_SUITE_P(MemoryAlignment_GNA_3_0,
                         GNAPluginLoadNetworkTests,
                         ::testing::Combine(::testing::Values(ExecutionMode::SW_EXACT),
                                            ::testing::Values(HWGeneration::GNA_3_0),
                                            ::testing::ValuesIn(param_64B_alignment_prec_int16)),
                         GNAPluginLoadNetworkTests::GetTestCaseName);

// GNA 3.5 compile target: 64-byte alignment expected.
INSTANTIATE_TEST_SUITE_P(MemoryAlignment_GNA_3_5,
                         GNAPluginLoadNetworkTests,
                         ::testing::Combine(::testing::Values(ExecutionMode::SW_EXACT),
                                            ::testing::Values(HWGeneration::GNA_3_5),
                                            ::testing::ValuesIn(param_64B_alignment_prec_int16)),
                         GNAPluginLoadNetworkTests::GetTestCaseName);

// GNA 3.6 compile target: the new 16-byte alignment expected.
INSTANTIATE_TEST_SUITE_P(MemoryAlignment_GNA_3_6,
                         GNAPluginLoadNetworkTests,
                         ::testing::Combine(::testing::Values(ExecutionMode::SW_EXACT),
                                            ::testing::Values(HWGeneration::GNA_3_6),
                                            ::testing::ValuesIn(param_16B_alignment_prec_int16)),
                         GNAPluginLoadNetworkTests::GetTestCaseName);

// GNA 4.0 compile target: 16-byte alignment expected.
INSTANTIATE_TEST_SUITE_P(MemoryAlignment_GNA_4_0,
                         GNAPluginLoadNetworkTests,
                         ::testing::Combine(::testing::Values(ExecutionMode::SW_EXACT),
                                            ::testing::Values(HWGeneration::GNA_4_0),
                                            ::testing::ValuesIn(param_16B_alignment_prec_int16)),
                         GNAPluginLoadNetworkTests::GetTestCaseName);
// NOTE(review): this fixture class is unused — the tests below use TEST (not
// TEST_F), so gtest registers its own suite named "MemoryAlignmentTest".
// Consider removing the class or switching the tests to TEST_F.
class MemoryAlignmentTest : public ::testing::Test {};

// NotSet is absent from the alignment table, so the lookup must throw.
TEST(MemoryAlignmentTest, getMemoryAlignmentBytes_ExpectExceptionWhenTargetIsUnset) {
    EXPECT_ANY_THROW(getMemoryAlignmentBytes(DeviceVersion::NotSet));
}

// Pre-3.6 generations keep the legacy 64-byte alignment.
TEST(MemoryAlignmentTest, getMemoryAlignmentBytes_Expect64ByteAlignmentWhenTargetIsGNA3_0) {
    EXPECT_EQ(getMemoryAlignmentBytes(DeviceVersion::GNA3_0), 64);
}

// GNA3.6 (LNL) switches to 16-byte alignment — the change under test.
TEST(MemoryAlignmentTest, getMemoryAlignmentBytes_Expect16ByteAlignmentWhenTargetIsGNA3_6) {
    EXPECT_EQ(getMemoryAlignmentBytes(DeviceVersion::GNA3_6), 16);
}
} // namespace testing