[ARM CPU] Update TBB ACL Scheduler (#18885)
parent 5170350cf5
commit b7b5d4cd93
@@ -10,6 +10,11 @@ namespace intel_cpu {

using namespace arm_compute;

static std::mutex & get_mtx_ifunc() {
    static std::mutex mtx_ifunc;
    return mtx_ifunc;
}

inline VectorDims reshape_sizes(VectorDims dims) {
    const size_t MAX_NUM_SHAPE = arm_compute::MAX_DIMS;
    VectorDims result_dims(MAX_NUM_SHAPE - 1);
@@ -494,6 +499,11 @@ bool AclEltwiseExecutor::init(const EltwiseAttrs &eltwiseAttrs, const std::vecto
    default:
        IE_THROW() << "Unsupported operation type for ACL Eltwise executor: " << static_cast<int>(aclEltwiseAttrs.algorithm);
    }

    // Running several configure(...) calls in parallel causes problems (seg. faults, data races, etc.)
    // for eltwise operations; reported at https://github.com/ARM-software/ComputeLibrary/issues/1073
    // TODO: remove this workaround once the upstream issue is resolved
    std::lock_guard<std::mutex> _lock {get_mtx_ifunc()};
    ifunc = exec_func();
    return true;
}
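The two hunks above introduce a function-local static mutex and take it around ACL's configure step only, leaving execution parallel. A minimal sketch of the same pattern, with hypothetical names (`configure_kernel`, `build_executor`) standing in for the ACL calls:

```cpp
#include <mutex>

// Meyers-singleton mutex: thread-safe to construct (guaranteed since C++11)
// and alive for the whole program, so any thread may lock it at any time.
static std::mutex& get_configure_mutex() {
    static std::mutex m;
    return m;
}

// Hypothetical stand-in for an ACL configure(...) call that is not safe to
// run from several threads at once.
static void configure_kernel() { /* not thread-safe */ }

void build_executor() {
    // Serialize only the configure step; running the kernel stays parallel.
    std::lock_guard<std::mutex> lock{get_configure_mutex()};
    configure_kernel();
}

int main() {
    build_executor();
}
```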
@@ -0,0 +1,77 @@
// Copyright (C) 2020-2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#include "acl_ie_scheduler.hpp"

#include "arm_compute/core/CPP/ICPPKernel.h"
#include "arm_compute/core/Error.h"
#include "arm_compute/core/Helpers.h"
#include <ie_parallel.hpp>

namespace ov {
namespace intel_cpu {

using namespace arm_compute;

ACLScheduler::ACLScheduler() = default;

unsigned int ACLScheduler::num_threads() const {
    return parallel_get_num_threads();
}

void ACLScheduler::set_num_threads(unsigned int num_threads) {}

void ACLScheduler::schedule_custom(ICPPKernel *kernel, const Hints &hints, const Window &window, ITensorPack &tensors) {
    const Window & max_window = window;
    const unsigned int num_iterations = max_window.num_iterations_total();
    const auto _num_threads = std::min(num_iterations, static_cast<unsigned int>(parallel_get_num_threads()));

    if (num_iterations == 0) {
        return;
    }

    std::function<void(const Window &window, const ThreadInfo &info)> main_run;
    if (tensors.empty()) {
        main_run = [&](const Window &window, const ThreadInfo &info) {
            kernel->run(window, info);
        };
    } else {
        main_run = [&](const Window &window, const ThreadInfo &info) {
            kernel->run_op(tensors, window, info);
        };
    }

    if (!kernel->is_parallelisable() || _num_threads == 1) {
        ThreadInfo info;
        info.cpu_info = &cpu_info();
        main_run(max_window, info);
    } else {
        const auto num_windows = _num_threads;
        const auto hints_split_dimension = hints.split_dimension();

        InferenceEngine::parallel_for(num_windows, [&](int wid) {
            Window win = max_window.split_window(hints_split_dimension, wid, num_windows);
            win.validate();
            main_run(win, {wid, static_cast<int>(_num_threads), &cpu_info()});
        });
    }
}

void ACLScheduler::schedule(ICPPKernel *kernel, const Hints &hints) {
    ITensorPack tensors;
    schedule_custom(kernel, hints, kernel->window(), tensors);
}

void ACLScheduler::schedule_op(ICPPKernel *kernel, const Hints &hints, const Window &window, ITensorPack &tensors) {
    schedule_custom(kernel, hints, window, tensors);
}

void ACLScheduler::run_workloads(std::vector<arm_compute::IScheduler::Workload> &workloads) {
    InferenceEngine::parallel_for(workloads.size(), [&](int wid) {
        workloads[wid]({wid, static_cast<int>(parallel_get_num_threads()), &cpu_info()});
    });
}

}  // namespace intel_cpu
}  // namespace ov
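The scheduler above clamps the worker count to the iteration count, splits the kernel's window along the hinted dimension, and dispatches each slice through `InferenceEngine::parallel_for`. A minimal standalone sketch of that dispatch logic, modelling a window as a plain index range; `Range` and `split` are illustrative stand-ins for ACL's `Window`/`split_window`, and giving the remainder to the last worker is an assumption about the split policy:

```cpp
#include <algorithm>
#include <cstdio>
#include <thread>
#include <vector>

// A half-open index range standing in for an ACL Window.
struct Range { unsigned begin, end; };

// split(w, wid, n): slice `wid` of `n` roughly equal slices of `w`.
static Range split(Range w, unsigned wid, unsigned n) {
    const unsigned chunk = (w.end - w.begin) / n;
    const unsigned lo = w.begin + wid * chunk;
    // Last worker takes the remainder (assumed split policy).
    const unsigned hi = (wid == n - 1) ? w.end : lo + chunk;
    return {lo, hi};
}

int main() {
    const Range max_window{0, 100};
    const unsigned num_iterations = max_window.end - max_window.begin;
    // Clamp the worker count to the iteration count, as in schedule_custom().
    const unsigned num_threads =
        std::min(num_iterations, std::max(1u, std::thread::hardware_concurrency()));

    std::vector<std::thread> workers;
    for (unsigned wid = 0; wid < num_threads; ++wid) {
        workers.emplace_back([&, wid] {
            const Range win = split(max_window, wid, num_threads);
            // main_run(win, info) would execute the kernel on this slice.
            std::printf("worker %u: [%u, %u)\n", wid, win.begin, win.end);
        });
    }
    for (auto& t : workers) t.join();
}
```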
@@ -0,0 +1,31 @@
// Copyright (C) 2020-2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma once

#include <arm_compute/runtime/Scheduler.h>
#include <arm_compute/core/CPP/ICPPKernel.h>
#include <arm_compute/core/ITensorPack.h>
#include "support/Mutex.h"

namespace ov {
namespace intel_cpu {

using namespace arm_compute;

class ACLScheduler final : public IScheduler {
public:
    ACLScheduler();
    ~ACLScheduler() override = default;
    std::uint32_t num_threads() const override;
    void set_num_threads(unsigned int num_threads) override;
    void schedule(ICPPKernel *kernel, const Hints &hints) override;
    void schedule_op(ICPPKernel *kernel, const Hints &hints, const Window &window, ITensorPack &tensors) override;
protected:
    void run_workloads(std::vector<Workload> &workloads) override;
private:
    void schedule_custom(ICPPKernel *kernel, const Hints &hints, const Window &window, ITensorPack &tensors);
};
}  // namespace intel_cpu
}  // namespace ov
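The class is installed through `arm_compute::Scheduler::set`, as the plugin changes below do; a minimal standalone sketch, assuming the plugin headers are on the include path:

```cpp
#include <memory>

#include <arm_compute/runtime/Scheduler.h>

#include "acl_ie_scheduler.hpp"

int main() {
    // Install the custom scheduler; every ACL kernel scheduled afterwards is
    // dispatched through ACLScheduler::schedule()/schedule_op().
    arm_compute::Scheduler::set(std::make_shared<ov::intel_cpu::ACLScheduler>());
    // arm_compute::Scheduler::get() now returns the custom instance.
    return 0;
}
```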
@@ -38,6 +38,11 @@
#include <cpu/x64/cpu_isa_traits.hpp>
#include <itt.h>

#if defined(OV_CPU_WITH_ACL)
#include "nodes/executors/acl/acl_ie_scheduler.hpp"
#include "arm_compute/runtime/CPP/CPPScheduler.h"
#endif

using namespace InferenceEngine;

#define IE_CPU_PLUGIN_THROW(...) IE_THROW(__VA_ARGS__) << "CPU plugin: "
@@ -137,11 +142,44 @@ public:
};
#endif // __linux__

#if defined(OV_CPU_WITH_ACL)
std::mutex Engine::SchedulerGuard::mutex;
std::weak_ptr<Engine::SchedulerGuard> Engine::SchedulerGuard::ptr;

Engine::SchedulerGuard::SchedulerGuard() {
#if IE_THREAD == IE_THREAD_SEQ
    // Keep ACL on its single-threaded scheduler when the plugin runs in sequential mode
    arm_compute::Scheduler::set(arm_compute::Scheduler::Type::ST);
#else
    arm_compute::Scheduler::set(std::make_shared<ACLScheduler>());
#endif
}

std::shared_ptr<Engine::SchedulerGuard> Engine::SchedulerGuard::instance() {
    std::lock_guard<std::mutex> lock{SchedulerGuard::mutex};
    auto scheduler_guard_ptr = SchedulerGuard::ptr.lock();
    if (scheduler_guard_ptr == nullptr) {
        SchedulerGuard::ptr = scheduler_guard_ptr = std::make_shared<SchedulerGuard>();
    }
    return scheduler_guard_ptr;
}

Engine::SchedulerGuard::~SchedulerGuard() {
    // Restore the single-threaded scheduler once the last user of ACLScheduler is gone
    // TODO: find out why this reset is required
    std::lock_guard<std::mutex> lock{this->dest_mutex};
    arm_compute::Scheduler::set(arm_compute::Scheduler::Type::ST);
}
#endif

Engine::Engine() :
    deviceFullName(getDeviceFullName()),
    specialSetup(new CPUSpecialSetup) {
    _pluginName = "CPU";
    extensionManager->AddExtension(std::make_shared<Extension>());
#if defined(OV_CPU_WITH_ACL)
    scheduler_guard = SchedulerGuard::instance();
#endif
}

Engine::~Engine() {
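`SchedulerGuard` ties the ACL scheduler's lifetime to the set of live `Engine` objects through a `weak_ptr`-backed singleton: the first caller creates the guard (installing the scheduler), later callers share it, and the destructor runs only after the last `shared_ptr` is released. A minimal sketch of that lifetime pattern, with `Guard` as an illustrative stand-in:

```cpp
#include <memory>
#include <mutex>

struct Guard {
    Guard()  { /* install the custom scheduler */ }
    ~Guard() { /* restore the default scheduler */ }

    static std::shared_ptr<Guard> instance() {
        static std::mutex m;
        static std::weak_ptr<Guard> weak;
        std::lock_guard<std::mutex> lock{m};
        // Promote the weak reference; recreate the guard if it has expired.
        auto strong = weak.lock();
        if (!strong)
            weak = strong = std::make_shared<Guard>();
        return strong;
    }
};

int main() {
    auto a = Guard::instance();  // constructs the guard
    auto b = Guard::instance();  // shares the same instance
    // The guard is destroyed only when both a and b go out of scope.
}
```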
@@ -63,6 +63,20 @@ private:
    const std::string deviceFullName;

    std::shared_ptr<void> specialSetup;

#if defined(OV_CPU_WITH_ACL)
    struct SchedulerGuard {
        SchedulerGuard();
        ~SchedulerGuard();
        static std::shared_ptr<SchedulerGuard> instance();
        static std::mutex mutex;
        // separate mutex so the destructor can safely reset the scheduler state
        mutable std::mutex dest_mutex;
        static std::weak_ptr<SchedulerGuard> ptr;
    };

    std::shared_ptr<SchedulerGuard> scheduler_guard;
#endif
};

}  // namespace intel_cpu