[GPU] Share compilation context with sub-networks (#20706)
This commit is contained in:
parent
be25d9038e
commit
cc10b14bab
@ -131,6 +131,7 @@ public:
|
||||
topology const& topology,
|
||||
const ExecutionConfig& config,
|
||||
std::shared_ptr<ov::threading::IStreamsExecutor> task_executor,
|
||||
std::shared_ptr<ICompilationContext> compilation_context,
|
||||
bool is_internal = false,
|
||||
bool no_optimizations = false,
|
||||
bool is_body_program = false);
|
||||
@ -251,6 +252,14 @@ public:
|
||||
bool is_internal = false,
|
||||
bool no_optimizations = false,
|
||||
bool is_body_program = false);
|
||||
static ptr build_program(engine& engine,
|
||||
const topology& topology,
|
||||
const ExecutionConfig& config,
|
||||
std::shared_ptr<ov::threading::IStreamsExecutor> task_executor,
|
||||
std::shared_ptr<ICompilationContext> compilation_context,
|
||||
bool is_internal = false,
|
||||
bool no_optimizations = false,
|
||||
bool is_body_program = false);
|
||||
static ptr build_program(engine& engine,
|
||||
const std::set<std::shared_ptr<program_node>>& nodes,
|
||||
const ExecutionConfig& config,
|
||||
@ -266,9 +275,11 @@ public:
|
||||
|
||||
ImplementationsCache& get_implementations_cache() const { return *_impls_cache; }
|
||||
ICompilationContext& get_compilation_context() const { return *_compilation_context; }
|
||||
std::shared_ptr<ICompilationContext> get_compilation_context_ptr() const { return _compilation_context; }
|
||||
void cancel_compilation_context();
|
||||
|
||||
static std::shared_ptr<ov::threading::IStreamsExecutor> make_task_executor(const ExecutionConfig& config);
|
||||
static std::shared_ptr<ICompilationContext> make_compilation_context(const ExecutionConfig& config);
|
||||
|
||||
private:
|
||||
uint32_t prog_id = 0;
|
||||
@ -286,8 +297,7 @@ private:
|
||||
bool is_body_program;
|
||||
std::unique_ptr<ImplementationsCache> _impls_cache;
|
||||
const size_t _impls_cache_capacity = 10000;
|
||||
const int _num_async_build_threads = 1;
|
||||
std::unique_ptr<ICompilationContext> _compilation_context;
|
||||
std::shared_ptr<ICompilationContext> _compilation_context;
|
||||
|
||||
std::map<primitive_id, std::shared_ptr<program_node>> nodes_map;
|
||||
std::list<primitive_id> optimized_out;
|
||||
|
@ -10,6 +10,7 @@
|
||||
#include "intel_gpu/plugin/custom_layer.hpp"
|
||||
#include "intel_gpu/runtime/engine.hpp"
|
||||
#include "intel_gpu/runtime/execution_config.hpp"
|
||||
#include "intel_gpu/runtime/compilation_context.hpp"
|
||||
#include "intel_gpu/graph/topology.hpp"
|
||||
#include "intel_gpu/graph/program.hpp"
|
||||
|
||||
@ -75,7 +76,9 @@ class ProgramBuilder final {
|
||||
public:
|
||||
ProgramBuilder(std::shared_ptr<ov::Model> model, cldnn::engine& engine, const ExecutionConfig& config,
|
||||
bool createTopologyOnly = false, bool partialBuild = false,
|
||||
std::shared_ptr<ov::threading::IStreamsExecutor> task_executor = nullptr, bool innerProgram = false);
|
||||
std::shared_ptr<ov::threading::IStreamsExecutor> task_executor = nullptr,
|
||||
std::shared_ptr<cldnn::ICompilationContext> compilation_context = nullptr,
|
||||
bool innerProgram = false);
|
||||
ProgramBuilder(cldnn::engine& engine, const ExecutionConfig& config);
|
||||
|
||||
static const cldnn::primitive_id m_preProcessTag;
|
||||
@ -136,6 +139,7 @@ public:
|
||||
bool requires_new_shape_infer(const ov::Node& op) const;
|
||||
|
||||
std::shared_ptr<ov::threading::IStreamsExecutor> get_task_executor() const { return m_task_executor; }
|
||||
std::shared_ptr<cldnn::ICompilationContext> get_compilation_context() const { return m_compilation_context; }
|
||||
|
||||
private:
|
||||
static factories_map_t factories_map;
|
||||
@ -153,6 +157,7 @@ private:
|
||||
bool queryMode;
|
||||
|
||||
std::shared_ptr<ov::threading::IStreamsExecutor> m_task_executor;
|
||||
std::shared_ptr<cldnn::ICompilationContext> m_compilation_context;
|
||||
|
||||
void EnableQueryMode() { queryMode = true; }
|
||||
void DisableQueryMode() { queryMode = false; }
|
||||
|
@ -4,10 +4,10 @@
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "openvino/runtime/threading/cpu_streams_executor.hpp"
|
||||
#include <functional>
|
||||
#include <memory>
|
||||
#include "intel_gpu/graph/kernel_impl_params.hpp"
|
||||
#include "openvino/runtime/threading/istreams_executor.hpp"
|
||||
|
||||
namespace cldnn {
|
||||
|
||||
@ -21,7 +21,7 @@ public:
|
||||
virtual void cancel() = 0;
|
||||
virtual void wait_all() = 0;
|
||||
|
||||
static std::unique_ptr<ICompilationContext> create(ov::threading::IStreamsExecutor::Config task_executor_config);
|
||||
static std::shared_ptr<ICompilationContext> create(ov::threading::IStreamsExecutor::Config task_executor_config);
|
||||
};
|
||||
|
||||
} // namespace cldnn
|
@ -2,12 +2,14 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#include "compilation_context.hpp"
|
||||
#include <mutex>
|
||||
#include <atomic>
|
||||
#include <unordered_set>
|
||||
#include <future>
|
||||
#include "intel_gpu/runtime/utils.hpp"
|
||||
#include "intel_gpu/runtime/compilation_context.hpp"
|
||||
|
||||
#include "openvino/runtime/threading/cpu_streams_executor.hpp"
|
||||
|
||||
namespace cldnn {
|
||||
class CompilationContext : public ICompilationContext {
|
||||
@ -83,7 +85,7 @@ private:
|
||||
std::vector<std::future<void>> futures;
|
||||
};
|
||||
|
||||
std::unique_ptr<ICompilationContext> ICompilationContext::create(ov::threading::IStreamsExecutor::Config task_executor_config) {
|
||||
std::shared_ptr<ICompilationContext> ICompilationContext::create(ov::threading::IStreamsExecutor::Config task_executor_config) {
|
||||
return cldnn::make_unique<CompilationContext>(task_executor_config);
|
||||
}
|
||||
|
||||
|
@ -13,6 +13,7 @@
|
||||
#include "intel_gpu/runtime/engine.hpp"
|
||||
#include "intel_gpu/runtime/event.hpp"
|
||||
#include "intel_gpu/runtime/stream.hpp"
|
||||
#include "intel_gpu/runtime/compilation_context.hpp"
|
||||
#include "intel_gpu/runtime/debug_configuration.hpp"
|
||||
#include "intel_gpu/runtime/itt.hpp"
|
||||
|
||||
@ -34,7 +35,6 @@
|
||||
#include "program_helpers.h"
|
||||
#include "to_string_utils.h"
|
||||
#include "kernels_cache.hpp"
|
||||
#include "compilation_context.hpp"
|
||||
|
||||
// TODO: Remove once we have an abstraction for kernels_cache
|
||||
#include "kernel_base.h"
|
||||
|
@ -25,7 +25,6 @@
|
||||
#include "read_value_inst.h"
|
||||
#include "condition_inst.h"
|
||||
#include "experimental_detectron_roi_feature_extractor_inst.hpp"
|
||||
#include "compilation_context.hpp"
|
||||
#include "implementation_map.hpp"
|
||||
#include "graph_optimizer/prepare_buffer_fusing.h"
|
||||
|
||||
@ -36,6 +35,7 @@
|
||||
#include "intel_gpu/runtime/memory.hpp"
|
||||
#include "intel_gpu/runtime/error_handler.hpp"
|
||||
#include "intel_gpu/runtime/debug_configuration.hpp"
|
||||
#include "intel_gpu/runtime/compilation_context.hpp"
|
||||
|
||||
#include "json_object.h"
|
||||
#include <string>
|
||||
@ -1502,7 +1502,13 @@ cldnn::network::ptr primitive_inst::get_unfused_subgraph() {
|
||||
ov::intel_gpu::allow_static_input_reorder(true),
|
||||
ov::intel_gpu::allow_new_shape_infer(true)
|
||||
};
|
||||
auto prog = program::build_program(get_network().get_engine(), t, subgraph_config, get_network().get_program()->get_task_executor(), true, false);
|
||||
auto prog = program::build_program(get_network().get_engine(),
|
||||
t,
|
||||
subgraph_config,
|
||||
get_network().get_program()->get_task_executor(),
|
||||
get_network().get_program()->get_compilation_context_ptr(),
|
||||
true,
|
||||
false);
|
||||
|
||||
_unfused_subgraph = network::allocate_network(get_network().get_stream_ptr(), prog, true, get_network().is_primary_stream());
|
||||
}
|
||||
|
@ -8,6 +8,7 @@
|
||||
#include "intel_gpu/runtime/engine.hpp"
|
||||
#include "intel_gpu/runtime/debug_configuration.hpp"
|
||||
#include "intel_gpu/runtime/itt.hpp"
|
||||
#include "intel_gpu/runtime/compilation_context.hpp"
|
||||
#include "intel_gpu/graph/program.hpp"
|
||||
|
||||
#include "auto_tuner.h"
|
||||
@ -17,7 +18,6 @@
|
||||
#include "program_dump_graph.h"
|
||||
#include "sliding_window_utils.hpp"
|
||||
#include "program_helpers.h"
|
||||
#include "compilation_context.hpp"
|
||||
|
||||
#include "matrix_nms_inst.h"
|
||||
#include "roi_pooling_inst.h"
|
||||
@ -145,10 +145,17 @@ std::shared_ptr<ov::threading::IStreamsExecutor> program::make_task_executor(con
|
||||
return std::make_shared<ov::threading::CPUStreamsExecutor>(task_executor_config);
|
||||
}
|
||||
|
||||
std::shared_ptr<ICompilationContext> program::make_compilation_context(const ExecutionConfig& config) {
|
||||
const int _num_async_build_threads = 1;
|
||||
return ICompilationContext::create(make_task_executor_config(config,
|
||||
"Task executor config for CompilationContext in GPU plugin", _num_async_build_threads));
|
||||
}
|
||||
|
||||
program::program(engine& engine_ref,
|
||||
topology const& topology,
|
||||
const ExecutionConfig& config,
|
||||
std::shared_ptr<ov::threading::IStreamsExecutor> task_executor,
|
||||
std::shared_ptr<ICompilationContext> compilation_context,
|
||||
bool is_internal,
|
||||
bool no_optimizations,
|
||||
bool is_body_program)
|
||||
@ -158,7 +165,8 @@ program::program(engine& engine_ref,
|
||||
_task_executor(std::move(task_executor)),
|
||||
processing_order(),
|
||||
is_internal(is_internal),
|
||||
is_body_program(is_body_program) {
|
||||
is_body_program(is_body_program),
|
||||
_compilation_context(compilation_context) {
|
||||
_config.apply_user_properties(_engine.get_device_info());
|
||||
init_primitives();
|
||||
GPU_DEBUG_INFO << "Program config\n" << config.to_string();
|
||||
@ -214,8 +222,8 @@ void program::init_program() {
|
||||
_kernels_cache = std::unique_ptr<kernels_cache>(new kernels_cache(_engine, _config, prog_id, _task_executor,
|
||||
kernel_selector::KernelBase::get_db().get_batch_header_str()));
|
||||
|
||||
_compilation_context = ICompilationContext::create(make_task_executor_config(_config,
|
||||
"Task executor config for CompilationContext in GPU plugin", _num_async_build_threads));
|
||||
if (!_compilation_context)
|
||||
_compilation_context = program::make_compilation_context(_config);
|
||||
|
||||
_impls_cache = cldnn::make_unique<ImplementationsCache>(_impls_cache_capacity);
|
||||
// Remove items of compilation context's internal queue when some impl is popped in kernels_cache
|
||||
@ -253,7 +261,18 @@ program::ptr program::build_program(engine& engine,
|
||||
bool is_internal,
|
||||
bool no_optimizations,
|
||||
bool is_body_program) {
|
||||
return std::make_shared<program>(engine, topology, config, task_executor, is_internal, no_optimizations, is_body_program);
|
||||
return std::make_shared<program>(engine, topology, config, task_executor, nullptr, is_internal, no_optimizations, is_body_program);
|
||||
}
|
||||
|
||||
program::ptr program::build_program(engine& engine,
|
||||
const topology& topology,
|
||||
const ExecutionConfig& config,
|
||||
std::shared_ptr<ov::threading::IStreamsExecutor> task_executor,
|
||||
std::shared_ptr<ICompilationContext> compilation_context,
|
||||
bool is_internal,
|
||||
bool no_optimizations,
|
||||
bool is_body_program) {
|
||||
return std::make_shared<program>(engine, topology, config, task_executor, compilation_context, is_internal, no_optimizations, is_body_program);
|
||||
}
|
||||
|
||||
program::ptr program::build_program(engine& engine,
|
||||
@ -262,7 +281,7 @@ program::ptr program::build_program(engine& engine,
|
||||
bool is_internal,
|
||||
bool no_optimizations,
|
||||
bool is_body_program) {
|
||||
return std::make_shared<program>(engine, topology, config, nullptr, is_internal, no_optimizations, is_body_program);
|
||||
return std::make_shared<program>(engine, topology, config, nullptr, nullptr, is_internal, no_optimizations, is_body_program);
|
||||
}
|
||||
|
||||
program::ptr program::build_program(engine& engine,
|
||||
|
@ -31,7 +31,7 @@ static cldnn::condition::branch gen_branch(ProgramBuilder& p, const std::shared_
|
||||
config.set_property(ov::intel_gpu::max_dynamic_batch(1));
|
||||
config.set_property(ov::intel_gpu::allow_new_shape_infer(op->is_dynamic()));
|
||||
|
||||
ProgramBuilder prog(internal_body, p.get_engine(), config, false, false, p.get_task_executor(), true);
|
||||
ProgramBuilder prog(internal_body, p.get_engine(), config, false, false, p.get_task_executor(), p.get_compilation_context(), true);
|
||||
branch.inner_program = prog.get_compiled_program();
|
||||
|
||||
auto& input_map = branch.input_map;
|
||||
|
@ -280,7 +280,7 @@ static void CreateCommonLoopOp(ProgramBuilder& p, const std::shared_ptr<ov::op::
|
||||
config.set_property(ov::intel_gpu::allow_new_shape_infer(is_dynamic));
|
||||
|
||||
// get body program from ov::Model
|
||||
ProgramBuilder prog(ov_model, p.get_engine(), config, false, false, p.get_task_executor(), true);
|
||||
ProgramBuilder prog(ov_model, p.get_engine(), config, false, false, p.get_task_executor(), p.get_compilation_context(), true);
|
||||
auto body_program = prog.get_compiled_program();
|
||||
|
||||
GPU_DEBUG_LOG << "* trip_count_id : " << trip_count_id << std::endl;
|
||||
|
@ -55,14 +55,20 @@ std::string layer_type_name_ID(const std::shared_ptr<ov::Node>& op) {
|
||||
|
||||
ProgramBuilder::ProgramBuilder(std::shared_ptr<ov::Model> model, cldnn::engine& engine, const ExecutionConfig& config,
|
||||
bool create_topology_only, bool partial_build,
|
||||
std::shared_ptr<ov::threading::IStreamsExecutor> task_executor, bool is_inner_program)
|
||||
std::shared_ptr<ov::threading::IStreamsExecutor> task_executor,
|
||||
std::shared_ptr<cldnn::ICompilationContext> compilation_context,
|
||||
bool is_inner_program)
|
||||
: m_config(config)
|
||||
, m_engine(engine)
|
||||
, queryMode(false)
|
||||
, m_task_executor(task_executor) {
|
||||
, m_task_executor(task_executor)
|
||||
, m_compilation_context(compilation_context) {
|
||||
if (m_task_executor == nullptr)
|
||||
m_task_executor = cldnn::program::make_task_executor(m_config);
|
||||
|
||||
if (m_compilation_context == nullptr) {
|
||||
m_compilation_context = cldnn::program::make_compilation_context(m_config);
|
||||
}
|
||||
// locate global custom kernel config
|
||||
// and auto-load kernels from it
|
||||
#ifdef _WIN32
|
||||
@ -158,7 +164,14 @@ std::shared_ptr<cldnn::program> ProgramBuilder::build(const std::vector<std::sha
|
||||
OV_ITT_SCOPED_TASK(itt::domains::intel_gpu_plugin, "ProgramBuilder::CreateProgram");
|
||||
cldnn::program::ptr program;
|
||||
try {
|
||||
program = cldnn::program::build_program(m_engine, *m_topology, m_config, get_task_executor(), false, false, is_inner_program);
|
||||
program = cldnn::program::build_program(m_engine,
|
||||
*m_topology,
|
||||
m_config,
|
||||
get_task_executor(),
|
||||
get_compilation_context(),
|
||||
false,
|
||||
false,
|
||||
is_inner_program);
|
||||
} catch (std::exception& e) {
|
||||
OPENVINO_ASSERT(false, "[GPU] ProgramBuilder build failed!\n", e.what());
|
||||
}
|
||||
|
@ -7,9 +7,9 @@
|
||||
#include <intel_gpu/primitives/input_layout.hpp>
|
||||
#include <intel_gpu/primitives/fully_connected.hpp>
|
||||
#include <intel_gpu/primitives/data.hpp>
|
||||
#include "intel_gpu/runtime/compilation_context.hpp"
|
||||
|
||||
#include "fully_connected_inst.h"
|
||||
#include "compilation_context.hpp"
|
||||
|
||||
#include "program_wrapper.h"
|
||||
|
||||
|
@ -14,7 +14,7 @@
|
||||
#include <intel_gpu/primitives/quantize.hpp>
|
||||
#include <intel_gpu/primitives/data.hpp>
|
||||
|
||||
#include "compilation_context.hpp"
|
||||
#include "intel_gpu/runtime/compilation_context.hpp"
|
||||
#include "fully_connected_inst.h"
|
||||
|
||||
#include <cmath>
|
||||
|
@ -10,7 +10,7 @@
|
||||
#include <intel_gpu/primitives/crop.hpp>
|
||||
#include "openvino/reference/matmul.hpp"
|
||||
|
||||
#include "compilation_context.hpp"
|
||||
#include "intel_gpu/runtime/compilation_context.hpp"
|
||||
#include "gemm_inst.h"
|
||||
|
||||
#include <cstddef>
|
||||
|
@ -7,7 +7,7 @@
|
||||
#include <intel_gpu/primitives/input_layout.hpp>
|
||||
#include <intel_gpu/primitives/group_normalization.hpp>
|
||||
#include "openvino/reference/group_normalization.hpp"
|
||||
#include "compilation_context.hpp"
|
||||
#include "intel_gpu/runtime/compilation_context.hpp"
|
||||
|
||||
|
||||
using namespace cldnn;
|
||||
|
Loading…
Reference in New Issue
Block a user