[GPU] Async static kernels compile for dynamic flow (#14205)
* [GPU] Async static kernels compile * [GPU] Added interface for async compilation worker * [GPU] refactoring
This commit is contained in:
committed by
GitHub
parent
a37723e8cf
commit
63b80a4fee
@@ -50,6 +50,7 @@ private:
|
||||
};
|
||||
|
||||
class primitive_inst;
|
||||
class ICompilationContext;
|
||||
|
||||
struct network {
|
||||
public:
|
||||
@@ -233,6 +234,9 @@ public:
|
||||
/// Return in_mem_kernels_cache
|
||||
KernelsCache& get_in_mem_kernels_cache() const { return *_in_mem_kernels_cache; }
|
||||
|
||||
ICompilationContext& get_compilation_context() const { return *_compilation_context; }
|
||||
std::mutex& get_impl_cache_mutex() const { return _in_mem_cache_mutex; }
|
||||
|
||||
private:
|
||||
using output_chains_map = std::map<primitive_id, std::vector<std::shared_ptr<primitive_inst>>>;
|
||||
uint32_t net_id = 0;
|
||||
@@ -257,6 +261,9 @@ private:
|
||||
std::unordered_map<primitive_id, event::ptr> _events;
|
||||
output_chains_map _output_chains;
|
||||
|
||||
mutable std::mutex _in_mem_cache_mutex;
|
||||
std::unique_ptr<ICompilationContext> _compilation_context;
|
||||
|
||||
void build_exec_order();
|
||||
void allocate_primitive_instance(program_node const& node);
|
||||
void transfer_memory_to_device(std::shared_ptr<primitive_inst> instance, program_node const& node);
|
||||
|
||||
54
src/plugins/intel_gpu/src/graph/compilation_context.cpp
Normal file
54
src/plugins/intel_gpu/src/graph/compilation_context.cpp
Normal file
@@ -0,0 +1,54 @@
|
||||
// Copyright (C) 2022 Intel Corporation
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#include "compilation_context.hpp"
|
||||
#include "threading/ie_thread_safe_containers.hpp"
|
||||
#include "kernel_selector/kernel_base.h"
|
||||
|
||||
namespace cldnn {
|
||||
|
||||
class CompilationContext : public ICompilationContext {
|
||||
public:
|
||||
using compilation_queue_t = InferenceEngine::ThreadSafeQueue<ICompilationContext::Task>;
|
||||
|
||||
CompilationContext(cldnn::engine& engine, size_t program_id) {
|
||||
_kernels_cache = cldnn::make_unique<kernels_cache>(engine, program_id, kernel_selector::KernelBase::get_db().get_batch_header_str());
|
||||
_worker = std::thread([this](){
|
||||
while (!_stop_compilation) {
|
||||
CompilationContext::Task task;
|
||||
bool success = _queue.try_pop(task);
|
||||
if (success) {
|
||||
task(*_kernels_cache);
|
||||
} else {
|
||||
std::chrono::milliseconds ms{1};
|
||||
std::this_thread::sleep_for(ms);
|
||||
}
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
void push_task(ICompilationContext::Task&& task) override {
|
||||
_queue.push(task);
|
||||
}
|
||||
|
||||
void cancel() noexcept override {
|
||||
_stop_compilation = true;
|
||||
if (_worker.joinable())
|
||||
_worker.join();
|
||||
}
|
||||
|
||||
~CompilationContext() noexcept { cancel(); }
|
||||
|
||||
private:
|
||||
std::unique_ptr<kernels_cache> _kernels_cache;
|
||||
compilation_queue_t _queue;
|
||||
std::thread _worker;
|
||||
std::atomic_bool _stop_compilation{false};
|
||||
};
|
||||
|
||||
// Factory for the concrete CompilationContext defined above; keeps the
// implementation type private to this translation unit.  Ownership of the
// context (and its worker thread) transfers to the caller via unique_ptr.
std::unique_ptr<ICompilationContext> ICompilationContext::create(cldnn::engine& engine, size_t program_id) {
    return cldnn::make_unique<CompilationContext>(engine, program_id);
}
|
||||
|
||||
} // namespace cldnn
|
||||
@@ -0,0 +1,23 @@
|
||||
// Copyright (C) 2022 Intel Corporation
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "kernels_cache.hpp"
|
||||
#include <functional>
|
||||
#include <memory>
|
||||
|
||||
namespace cldnn {
|
||||
|
||||
// Interface for an asynchronous kernel-compilation worker.
// Implementations accept Tasks (callables operating on a kernels_cache) via
// push_task() and execute them off the inference thread; cancel() requests
// shutdown and must be safe to call from a destructor (it is noexcept).
class ICompilationContext {
public:
    // A unit of compilation work; receives the worker-owned kernels_cache.
    using Task = std::function<void(kernels_cache&)>;
    // Enqueue a task for background execution.  Takes the task by rvalue
    // reference: the caller relinquishes it.
    virtual void push_task(Task&& task) = 0;
    // Stop processing and release worker resources.  Expected to be
    // idempotent; tasks not yet started may be dropped.
    virtual void cancel() noexcept = 0;
    virtual ~ICompilationContext() = default;

    // Create the default implementation (see compilation_context.cpp).
    static std::unique_ptr<ICompilationContext> create(cldnn::engine& engine, size_t program_id);
};
|
||||
|
||||
} // namespace cldnn
|
||||
@@ -33,6 +33,7 @@
|
||||
#include "program_helpers.h"
|
||||
#include "runtime/cldnn_itt.hpp"
|
||||
#include "kernels_cache.hpp"
|
||||
#include "compilation_context.hpp"
|
||||
|
||||
#include <algorithm>
|
||||
#include <string>
|
||||
@@ -308,6 +309,7 @@ network::network(program::ptr program, stream::ptr stream, bool is_internal, boo
|
||||
kernel_selector::KernelBase::get_db().get_batch_header_str()));
|
||||
_impls_cache = std::unique_ptr<ImplementationsCache>(new ImplementationsCache(_impls_cache_capacity));
|
||||
_in_mem_kernels_cache = std::unique_ptr<KernelsCache>(new KernelsCache(_in_mem_kernels_cache_capacity));
|
||||
_compilation_context = std::move(ICompilationContext::create(program->get_engine(), program->get_id()));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -425,6 +427,8 @@ network::network(cldnn::BinaryInputBuffer& ib, stream::ptr stream, engine& engin
|
||||
}
|
||||
|
||||
network::~network() {
|
||||
if (_compilation_context)
|
||||
_compilation_context->cancel();
|
||||
_memory_pool->clear_pool_for_network(net_id);
|
||||
GPU_DEBUG_GET_INSTANCE(debug_config);
|
||||
GPU_DEBUG_IF(!debug_config->dump_profiling_data.empty()) {
|
||||
|
||||
@@ -16,15 +16,16 @@
|
||||
#include "shape_of_inst.h"
|
||||
#include "strided_slice_inst.h"
|
||||
#include "experimental_detectron_roi_feature_extractor_inst.hpp"
|
||||
#include "intel_gpu/plugin/common_utils.hpp"
|
||||
#include "compilation_context.hpp"
|
||||
|
||||
#include "intel_gpu/plugin/common_utils.hpp"
|
||||
#include "intel_gpu/graph/network.hpp"
|
||||
#include "intel_gpu/graph/serialization/set_serializer.hpp"
|
||||
#include "intel_gpu/runtime/engine.hpp"
|
||||
#include "intel_gpu/runtime/memory.hpp"
|
||||
|
||||
#include "intel_gpu/runtime/error_handler.hpp"
|
||||
#include "intel_gpu/runtime/debug_configuration.hpp"
|
||||
|
||||
#include "json_object.h"
|
||||
#include <string>
|
||||
#include <stack>
|
||||
@@ -220,7 +221,8 @@ void primitive_inst::update_shape() {
|
||||
layout.data_padding = padding::max(_node->get_primitive()->output_padding, layout.data_padding);
|
||||
if (_impl_params->get_output_layout(idx) != layout) {
|
||||
GPU_DEBUG_IF(debug_config->verbose >= 4) {
|
||||
GPU_DEBUG_COUT << id() << ": update shape: was: " << _impl_params->get_output_layout(idx) << "\nnow: " << layout << std::endl;
|
||||
GPU_DEBUG_COUT << id() << ": update shape: was: " << _impl_params->get_output_layout(idx).to_short_string()
|
||||
<< " now: " << layout.to_short_string() << std::endl;
|
||||
}
|
||||
set_shape_change();
|
||||
}
|
||||
@@ -341,15 +343,48 @@ void primitive_inst::update_impl() {
|
||||
};
|
||||
|
||||
if (!_node->is_type<data>() && !(_node->is_type<mutable_data>() && _node->get_dependencies().empty())) {
|
||||
GPU_DEBUG_GET_INSTANCE(debug_config);
|
||||
// Update param if fake_alignment is available
|
||||
auto updated_params = _node->type()->get_fake_aligned_params(*_impl_params);
|
||||
auto layout_key = get_layout_key(updated_params);
|
||||
auto& cache = get_network().get_implementations_cache();
|
||||
if (cache.has(layout_key)) {
|
||||
_impl = cache.get(layout_key)->clone();
|
||||
GPU_DEBUG_PROFILED_STAGE_CACHE_HIT(true);
|
||||
} else {
|
||||
bool has_cached_impl = false;
|
||||
{
|
||||
std::lock_guard<std::mutex> lock(get_network().get_impl_cache_mutex());
|
||||
has_cached_impl = cache.has(layout_key);
|
||||
if (has_cached_impl) {
|
||||
_impl = cache.get(layout_key)->clone();
|
||||
GPU_DEBUG_PROFILED_STAGE_CACHE_HIT(true);
|
||||
|
||||
GPU_DEBUG_IF(debug_config->verbose >= 4) {
|
||||
GPU_DEBUG_COUT << id() << ": get impl from cache " << _impl->get_kernel_name() << std::endl;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (!has_cached_impl) {
|
||||
if (_dynamic_impl) {
|
||||
auto& compilation_context = get_network().get_compilation_context();
|
||||
compilation_context.push_task([this, updated_params, layout_key](kernels_cache& kc) {
|
||||
auto& cache = get_network().get_implementations_cache();
|
||||
{
|
||||
std::lock_guard<std::mutex> lock(get_network().get_impl_cache_mutex());
|
||||
// Check existence in the cache one more time, as several iterations of model execution could happen and multiple compilation
|
||||
// tasks created for same shapes
|
||||
if (cache.has(layout_key))
|
||||
return;
|
||||
}
|
||||
|
||||
auto impl = _node->type()->choose_impl(*_node, updated_params);
|
||||
auto kernel_ids = kc.add_kernels_source(impl->get_kernels_source());
|
||||
impl->set_kernel_ids(kernel_ids);
|
||||
kc.compile();
|
||||
impl->init_kernels(kc);
|
||||
kc.reset();
|
||||
|
||||
std::lock_guard<std::mutex> lock(get_network().get_impl_cache_mutex());
|
||||
cache.add(layout_key, impl->clone());
|
||||
});
|
||||
|
||||
_impl = _dynamic_impl->clone();
|
||||
_impl->update_dispatch_data(updated_params);
|
||||
update_shape_info(updated_params);
|
||||
@@ -360,17 +395,17 @@ void primitive_inst::update_impl() {
|
||||
_impl->set_kernel_ids(kernel_ids);
|
||||
kernels_cache.compile();
|
||||
_impl->init_kernels(kernels_cache);
|
||||
cache.add(layout_key, _impl->clone());
|
||||
kernels_cache.reset();
|
||||
std::lock_guard<std::mutex> lock(get_network().get_impl_cache_mutex());
|
||||
cache.add(layout_key, _impl->clone());
|
||||
GPU_DEBUG_IF(debug_config->verbose >= 4) {
|
||||
auto new_impl_str = _impl != nullptr ? _impl->get_kernel_name() : "nullptr";
|
||||
GPU_DEBUG_COUT << id() << ": update impl from " << prev_impl_str << " to " << new_impl_str << std::endl;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
reset_shape_change();
|
||||
GPU_DEBUG_GET_INSTANCE(debug_config);
|
||||
GPU_DEBUG_IF(debug_config->verbose >= 4) {
|
||||
auto new_impl_str = _impl != nullptr ? _impl->get_kernel_name() : "nullptr";
|
||||
GPU_DEBUG_COUT << id() << ": update impl from " << prev_impl_str << " to " << new_impl_str << std::endl;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -707,13 +742,15 @@ event::ptr primitive_inst::update_weights() {
|
||||
auto& cache = get_network().get_in_mem_kernels_cache();
|
||||
if (cache.has(layout_key)) {
|
||||
GPU_DEBUG_IF(debug_config->verbose >= 4) {
|
||||
GPU_DEBUG_COUT << id() << ": reorder weights (cached) from " << original_layout << "\nto " << expected_layout << std::endl;
|
||||
GPU_DEBUG_COUT << id() << ": reorder weights (cached) from " << original_layout.to_short_string()
|
||||
<< " to " << expected_layout.to_short_string() << std::endl;
|
||||
}
|
||||
GPU_DEBUG_PROFILED_STAGE_CACHE_HIT(true);
|
||||
kernel = cache.get(layout_key);
|
||||
} else {
|
||||
GPU_DEBUG_IF(debug_config->verbose >= 4) {
|
||||
GPU_DEBUG_COUT << id() << ": reorder weights from " << original_layout << "\nto " << expected_layout << std::endl;
|
||||
GPU_DEBUG_COUT << id() << ": reorder weights from " << original_layout.to_short_string()
|
||||
<< " to " << expected_layout.to_short_string() << std::endl;
|
||||
}
|
||||
auto& kernels_cache = get_network().get_kernels_cache();
|
||||
auto kernel_id = kernels_cache.set_kernel_source(weights_params.clKernel->code.kernelString, false);
|
||||
|
||||
Reference in New Issue
Block a user