[ARM CPU] Update TBB ACL Scheduler (#18885)
parent 5170350cf5
commit b7b5d4cd93
@@ -10,6 +10,11 @@ namespace intel_cpu {

using namespace arm_compute;

static std::mutex & get_mtx_ifunc() {
    static std::mutex mtx_ifunc;
    return mtx_ifunc;
}

inline VectorDims reshape_sizes(VectorDims dims) {
    const size_t MAX_NUM_SHAPE = arm_compute::MAX_DIMS;
    VectorDims result_dims(MAX_NUM_SHAPE - 1);
@@ -494,6 +499,11 @@ bool AclEltwiseExecutor::init(const EltwiseAttrs &eltwiseAttrs, const std::vecto
    default:
        IE_THROW() << "Unsupported operation type for ACL Eltwise executor: " << static_cast<int>(aclEltwiseAttrs.algorithm);
    }

    // Running several configure(...) calls in parallel causes problems (seg. faults, data races, etc.)
    // for eltwise operations; reported at https://github.com/ARM-software/ComputeLibrary/issues/1073
    // TODO: remove this workaround once the upstream issue is resolved
    std::lock_guard<std::mutex> _lock {get_mtx_ifunc()};
    ifunc = exec_func();
    return true;
}
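The two hunks above introduce a function-local static mutex and take it around ACL's configure step only, leaving execution parallel. A minimal sketch of the same pattern, with hypothetical names (`configure_kernel`, `build_executor`) standing in for the ACL calls:

```cpp
#include <mutex>

// Meyers-singleton mutex: thread-safe to construct (guaranteed since C++11)
// and alive for the whole program, so any thread may lock it at any time.
static std::mutex& get_configure_mutex() {
    static std::mutex m;
    return m;
}

// Hypothetical stand-in for an ACL configure(...) call that is not safe to
// run from several threads at once.
static void configure_kernel() { /* not thread-safe */ }

void build_executor() {
    // Serialize only the configure step; running the kernel stays parallel.
    std::lock_guard<std::mutex> lock{get_configure_mutex()};
    configure_kernel();
}

int main() {
    build_executor();
}
```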
@@ -0,0 +1,77 @@
// Copyright (C) 2020-2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#include "acl_ie_scheduler.hpp"

#include "arm_compute/core/CPP/ICPPKernel.h"
#include "arm_compute/core/Error.h"
#include "arm_compute/core/Helpers.h"
#include <ie_parallel.hpp>

namespace ov {
namespace intel_cpu {

using namespace arm_compute;

ACLScheduler::ACLScheduler() = default;

unsigned int ACLScheduler::num_threads() const {
    return parallel_get_num_threads();
}

void ACLScheduler::set_num_threads(unsigned int num_threads) {}

void ACLScheduler::schedule_custom(ICPPKernel *kernel, const Hints &hints, const Window &window, ITensorPack &tensors) {
    const Window & max_window = window;
    const unsigned int num_iterations = max_window.num_iterations_total();
    const auto _num_threads = std::min(num_iterations, static_cast<unsigned int>(parallel_get_num_threads()));

    if (num_iterations == 0) {
        return;
    }

    std::function<void(const Window &window, const ThreadInfo &info)> main_run;
    if (tensors.empty()) {
        main_run = [&](const Window &window, const ThreadInfo &info) {
            kernel->run(window, info);
        };
    } else {
        main_run = [&](const Window &window, const ThreadInfo &info) {
            kernel->run_op(tensors, window, info);
        };
    }

    if (!kernel->is_parallelisable() || _num_threads == 1) {
        ThreadInfo info;
        info.cpu_info = &cpu_info();
        main_run(max_window, info);
    } else {
        const auto num_windows = _num_threads;
        const auto hints_split_dimension = hints.split_dimension();

        InferenceEngine::parallel_for(num_windows, [&](int wid) {
            Window win = max_window.split_window(hints_split_dimension, wid, num_windows);
            win.validate();
            main_run(win, {wid, static_cast<int>(_num_threads), &cpu_info()});
        });
    }
}

void ACLScheduler::schedule(ICPPKernel *kernel, const Hints &hints) {
    ITensorPack tensors;
    schedule_custom(kernel, hints, kernel->window(), tensors);
}

void ACLScheduler::schedule_op(ICPPKernel *kernel, const Hints &hints, const Window &window, ITensorPack &tensors) {
    schedule_custom(kernel, hints, window, tensors);
}

void ACLScheduler::run_workloads(std::vector<arm_compute::IScheduler::Workload> &workloads) {
    InferenceEngine::parallel_for(workloads.size(), [&](int wid) {
        workloads[wid]({wid, static_cast<int>(parallel_get_num_threads()), &cpu_info()});
    });
}

}  // namespace intel_cpu
}  // namespace ov
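The scheduler above clamps the worker count to the iteration count, splits the kernel's window along the hinted dimension, and dispatches each slice through `InferenceEngine::parallel_for`. A minimal standalone sketch of that dispatch logic, modelling a window as a plain index range; `Range` and `split` are illustrative stand-ins for ACL's `Window`/`split_window`, and giving the remainder to the last worker is an assumption about the split policy:

```cpp
#include <algorithm>
#include <cstdio>
#include <thread>
#include <vector>

// A half-open index range standing in for an ACL Window.
struct Range { unsigned begin, end; };

// split(w, wid, n): slice `wid` of `n` roughly equal slices of `w`.
static Range split(Range w, unsigned wid, unsigned n) {
    const unsigned chunk = (w.end - w.begin) / n;
    const unsigned lo = w.begin + wid * chunk;
    // Last worker takes the remainder (assumed split policy).
    const unsigned hi = (wid == n - 1) ? w.end : lo + chunk;
    return {lo, hi};
}

int main() {
    const Range max_window{0, 100};
    const unsigned num_iterations = max_window.end - max_window.begin;
    // Clamp the worker count to the iteration count, as in schedule_custom().
    const unsigned num_threads =
        std::min(num_iterations, std::max(1u, std::thread::hardware_concurrency()));

    std::vector<std::thread> workers;
    for (unsigned wid = 0; wid < num_threads; ++wid) {
        workers.emplace_back([&, wid] {
            const Range win = split(max_window, wid, num_threads);
            // main_run(win, info) would execute the kernel on this slice.
            std::printf("worker %u: [%u, %u)\n", wid, win.begin, win.end);
        });
    }
    for (auto& t : workers) t.join();
}
```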
@@ -0,0 +1,31 @@
// Copyright (C) 2020-2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma once

#include <arm_compute/runtime/Scheduler.h>
#include <arm_compute/core/CPP/ICPPKernel.h>
#include <arm_compute/core/ITensorPack.h>
#include "support/Mutex.h"

namespace ov {
namespace intel_cpu {

using namespace arm_compute;

class ACLScheduler final : public IScheduler {
public:
    ACLScheduler();
    ~ACLScheduler() override = default;
    std::uint32_t num_threads() const override;
    void set_num_threads(unsigned int num_threads) override;
    void schedule(ICPPKernel *kernel, const Hints &hints) override;
    void schedule_op(ICPPKernel *kernel, const Hints &hints, const Window &window, ITensorPack &tensors) override;
protected:
    void run_workloads(std::vector<Workload> &workloads) override;
private:
    void schedule_custom(ICPPKernel *kernel, const Hints &hints, const Window &window, ITensorPack &tensors);
};
}  // namespace intel_cpu
}  // namespace ov
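The class is installed through `arm_compute::Scheduler::set`, as the plugin changes below do; a minimal standalone sketch, assuming the plugin headers are on the include path:

```cpp
#include <memory>

#include <arm_compute/runtime/Scheduler.h>

#include "acl_ie_scheduler.hpp"

int main() {
    // Install the custom scheduler; every ACL kernel scheduled afterwards is
    // dispatched through ACLScheduler::schedule()/schedule_op().
    arm_compute::Scheduler::set(std::make_shared<ov::intel_cpu::ACLScheduler>());
    // arm_compute::Scheduler::get() now returns the custom instance.
    return 0;
}
```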
@@ -38,6 +38,11 @@
#include <cpu/x64/cpu_isa_traits.hpp>
#include <itt.h>

#if defined(OV_CPU_WITH_ACL)
#include "nodes/executors/acl/acl_ie_scheduler.hpp"
#include "arm_compute/runtime/CPP/CPPScheduler.h"
#endif

using namespace InferenceEngine;

#define IE_CPU_PLUGIN_THROW(...) IE_THROW(__VA_ARGS__) << "CPU plugin: "
@@ -137,11 +142,44 @@ public:
};
#endif // __linux__

#if defined(OV_CPU_WITH_ACL)
std::mutex Engine::SchedulerGuard::mutex;
std::weak_ptr<Engine::SchedulerGuard> Engine::SchedulerGuard::ptr;

Engine::SchedulerGuard::SchedulerGuard() {
#if IE_THREAD == IE_THREAD_SEQ
    // Keep ACL on its single-threaded scheduler when the plugin runs in sequential mode
    arm_compute::Scheduler::set(arm_compute::Scheduler::Type::ST);
#else
    arm_compute::Scheduler::set(std::make_shared<ACLScheduler>());
#endif
}

std::shared_ptr<Engine::SchedulerGuard> Engine::SchedulerGuard::instance() {
    std::lock_guard<std::mutex> lock{SchedulerGuard::mutex};
    auto scheduler_guard_ptr = SchedulerGuard::ptr.lock();
    if (scheduler_guard_ptr == nullptr) {
        SchedulerGuard::ptr = scheduler_guard_ptr = std::make_shared<SchedulerGuard>();
    }
    return scheduler_guard_ptr;
}

Engine::SchedulerGuard::~SchedulerGuard() {
    // Restore the single-threaded scheduler once the last user of ACLScheduler is gone
    // TODO: find out why this reset is required
    std::lock_guard<std::mutex> lock{this->dest_mutex};
    arm_compute::Scheduler::set(arm_compute::Scheduler::Type::ST);
}
#endif

Engine::Engine() :
    deviceFullName(getDeviceFullName()),
    specialSetup(new CPUSpecialSetup) {
    _pluginName = "CPU";
    extensionManager->AddExtension(std::make_shared<Extension>());
#if defined(OV_CPU_WITH_ACL)
    scheduler_guard = SchedulerGuard::instance();
#endif
}

Engine::~Engine() {
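`SchedulerGuard` ties the ACL scheduler's lifetime to the set of live `Engine` objects through a `weak_ptr`-backed singleton: the first caller creates the guard (installing the scheduler), later callers share it, and the destructor runs only after the last `shared_ptr` is released. A minimal sketch of that lifetime pattern, with `Guard` as an illustrative stand-in:

```cpp
#include <memory>
#include <mutex>

struct Guard {
    Guard()  { /* install the custom scheduler */ }
    ~Guard() { /* restore the default scheduler */ }

    static std::shared_ptr<Guard> instance() {
        static std::mutex m;
        static std::weak_ptr<Guard> weak;
        std::lock_guard<std::mutex> lock{m};
        // Promote the weak reference; recreate the guard if it has expired.
        auto strong = weak.lock();
        if (!strong)
            weak = strong = std::make_shared<Guard>();
        return strong;
    }
};

int main() {
    auto a = Guard::instance();  // constructs the guard
    auto b = Guard::instance();  // shares the same instance
    // The guard is destroyed only when both a and b go out of scope.
}
```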
@@ -63,6 +63,20 @@ private:
    const std::string deviceFullName;

    std::shared_ptr<void> specialSetup;

#if defined(OV_CPU_WITH_ACL)
    struct SchedulerGuard {
        SchedulerGuard();
        ~SchedulerGuard();
        static std::shared_ptr<SchedulerGuard> instance();
        static std::mutex mutex;
        // separate mutex so the destructor can safely reset the scheduler state
        mutable std::mutex dest_mutex;
        static std::weak_ptr<SchedulerGuard> ptr;
    };

    std::shared_ptr<SchedulerGuard> scheduler_guard;
#endif
};

}  // namespace intel_cpu