[GPU] Share compilation context with sub-networks (#20706)

This commit is contained in:
Vladimir Paramuzov 2023-10-27 09:58:49 +04:00 committed by GitHub
parent be25d9038e
commit cc10b14bab
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
14 changed files with 80 additions and 25 deletions

View File

@@ -131,6 +131,7 @@ public:
topology const& topology,
const ExecutionConfig& config,
std::shared_ptr<ov::threading::IStreamsExecutor> task_executor,
std::shared_ptr<ICompilationContext> compilation_context,
bool is_internal = false,
bool no_optimizations = false,
bool is_body_program = false);
@@ -251,6 +252,14 @@ public:
bool is_internal = false,
bool no_optimizations = false,
bool is_body_program = false);
static ptr build_program(engine& engine,
const topology& topology,
const ExecutionConfig& config,
std::shared_ptr<ov::threading::IStreamsExecutor> task_executor,
std::shared_ptr<ICompilationContext> compilation_context,
bool is_internal = false,
bool no_optimizations = false,
bool is_body_program = false);
static ptr build_program(engine& engine,
const std::set<std::shared_ptr<program_node>>& nodes,
const ExecutionConfig& config,
@@ -266,9 +275,11 @@ public:
ImplementationsCache& get_implementations_cache() const { return *_impls_cache; }
ICompilationContext& get_compilation_context() const { return *_compilation_context; }
std::shared_ptr<ICompilationContext> get_compilation_context_ptr() const { return _compilation_context; }
void cancel_compilation_context();
static std::shared_ptr<ov::threading::IStreamsExecutor> make_task_executor(const ExecutionConfig& config);
static std::shared_ptr<ICompilationContext> make_compilation_context(const ExecutionConfig& config);
private:
uint32_t prog_id = 0;
@@ -286,8 +297,7 @@ private:
bool is_body_program;
std::unique_ptr<ImplementationsCache> _impls_cache;
const size_t _impls_cache_capacity = 10000;
const int _num_async_build_threads = 1;
std::unique_ptr<ICompilationContext> _compilation_context;
std::shared_ptr<ICompilationContext> _compilation_context;
std::map<primitive_id, std::shared_ptr<program_node>> nodes_map;
std::list<primitive_id> optimized_out;

View File

@@ -10,6 +10,7 @@
#include "intel_gpu/plugin/custom_layer.hpp"
#include "intel_gpu/runtime/engine.hpp"
#include "intel_gpu/runtime/execution_config.hpp"
#include "intel_gpu/runtime/compilation_context.hpp"
#include "intel_gpu/graph/topology.hpp"
#include "intel_gpu/graph/program.hpp"
@@ -75,7 +76,9 @@ class ProgramBuilder final {
public:
ProgramBuilder(std::shared_ptr<ov::Model> model, cldnn::engine& engine, const ExecutionConfig& config,
bool createTopologyOnly = false, bool partialBuild = false,
std::shared_ptr<ov::threading::IStreamsExecutor> task_executor = nullptr, bool innerProgram = false);
std::shared_ptr<ov::threading::IStreamsExecutor> task_executor = nullptr,
std::shared_ptr<cldnn::ICompilationContext> compilation_context = nullptr,
bool innerProgram = false);
ProgramBuilder(cldnn::engine& engine, const ExecutionConfig& config);
static const cldnn::primitive_id m_preProcessTag;
@@ -136,6 +139,7 @@ public:
bool requires_new_shape_infer(const ov::Node& op) const;
std::shared_ptr<ov::threading::IStreamsExecutor> get_task_executor() const { return m_task_executor; }
std::shared_ptr<cldnn::ICompilationContext> get_compilation_context() const { return m_compilation_context; }
private:
static factories_map_t factories_map;
@@ -153,6 +157,7 @@ private:
bool queryMode;
std::shared_ptr<ov::threading::IStreamsExecutor> m_task_executor;
std::shared_ptr<cldnn::ICompilationContext> m_compilation_context;
void EnableQueryMode() { queryMode = true; }
void DisableQueryMode() { queryMode = false; }

View File

@@ -4,10 +4,10 @@
#pragma once
#include "openvino/runtime/threading/cpu_streams_executor.hpp"
#include <functional>
#include <memory>
#include "intel_gpu/graph/kernel_impl_params.hpp"
#include "openvino/runtime/threading/istreams_executor.hpp"
namespace cldnn {
@@ -21,7 +21,7 @@ public:
virtual void cancel() = 0;
virtual void wait_all() = 0;
static std::unique_ptr<ICompilationContext> create(ov::threading::IStreamsExecutor::Config task_executor_config);
static std::shared_ptr<ICompilationContext> create(ov::threading::IStreamsExecutor::Config task_executor_config);
};
} // namespace cldnn

View File

@@ -2,12 +2,14 @@
// SPDX-License-Identifier: Apache-2.0
//
#include "compilation_context.hpp"
#include <mutex>
#include <atomic>
#include <unordered_set>
#include <future>
#include "intel_gpu/runtime/utils.hpp"
#include "intel_gpu/runtime/compilation_context.hpp"
#include "openvino/runtime/threading/cpu_streams_executor.hpp"
namespace cldnn {
class CompilationContext : public ICompilationContext {
@@ -83,7 +85,7 @@ private:
std::vector<std::future<void>> futures;
};
std::unique_ptr<ICompilationContext> ICompilationContext::create(ov::threading::IStreamsExecutor::Config task_executor_config) {
// Factory for the GPU plugin's asynchronous-compilation context.
// Returns shared ownership (changed from unique_ptr in this commit) so a
// parent program can hand the same context to its sub-networks/body programs.
std::shared_ptr<ICompilationContext> ICompilationContext::create(ov::threading::IStreamsExecutor::Config task_executor_config) {
// NOTE(review): cldnn::make_unique converts implicitly to shared_ptr here;
// std::make_shared<CompilationContext>(...) would avoid the extra
// control-block allocation — confirm CompilationContext is constructible that way.
return cldnn::make_unique<CompilationContext>(task_executor_config);
}

View File

@@ -13,6 +13,7 @@
#include "intel_gpu/runtime/engine.hpp"
#include "intel_gpu/runtime/event.hpp"
#include "intel_gpu/runtime/stream.hpp"
#include "intel_gpu/runtime/compilation_context.hpp"
#include "intel_gpu/runtime/debug_configuration.hpp"
#include "intel_gpu/runtime/itt.hpp"
@@ -34,7 +35,6 @@
#include "program_helpers.h"
#include "to_string_utils.h"
#include "kernels_cache.hpp"
#include "compilation_context.hpp"
// TODO: Remove once we have an abstraction for kernels_cache
#include "kernel_base.h"

View File

@@ -25,7 +25,6 @@
#include "read_value_inst.h"
#include "condition_inst.h"
#include "experimental_detectron_roi_feature_extractor_inst.hpp"
#include "compilation_context.hpp"
#include "implementation_map.hpp"
#include "graph_optimizer/prepare_buffer_fusing.h"
@@ -36,6 +35,7 @@
#include "intel_gpu/runtime/memory.hpp"
#include "intel_gpu/runtime/error_handler.hpp"
#include "intel_gpu/runtime/debug_configuration.hpp"
#include "intel_gpu/runtime/compilation_context.hpp"
#include "json_object.h"
#include <string>
@@ -1502,7 +1502,13 @@ cldnn::network::ptr primitive_inst::get_unfused_subgraph() {
ov::intel_gpu::allow_static_input_reorder(true),
ov::intel_gpu::allow_new_shape_infer(true)
};
auto prog = program::build_program(get_network().get_engine(), t, subgraph_config, get_network().get_program()->get_task_executor(), true, false);
auto prog = program::build_program(get_network().get_engine(),
t,
subgraph_config,
get_network().get_program()->get_task_executor(),
get_network().get_program()->get_compilation_context_ptr(),
true,
false);
_unfused_subgraph = network::allocate_network(get_network().get_stream_ptr(), prog, true, get_network().is_primary_stream());
}

View File

@@ -8,6 +8,7 @@
#include "intel_gpu/runtime/engine.hpp"
#include "intel_gpu/runtime/debug_configuration.hpp"
#include "intel_gpu/runtime/itt.hpp"
#include "intel_gpu/runtime/compilation_context.hpp"
#include "intel_gpu/graph/program.hpp"
#include "auto_tuner.h"
@@ -17,7 +18,6 @@
#include "program_dump_graph.h"
#include "sliding_window_utils.hpp"
#include "program_helpers.h"
#include "compilation_context.hpp"
#include "matrix_nms_inst.h"
#include "roi_pooling_inst.h"
@@ -145,10 +145,17 @@ std::shared_ptr<ov::threading::IStreamsExecutor> program::make_task_executor(con
return std::make_shared<ov::threading::CPUStreamsExecutor>(task_executor_config);
}
// Builds a fresh compilation context for async kernel builds, configured from
// `config`. Used as the fallback when no externally shared context was passed
// in (see init_program: the member is only created if still null).
std::shared_ptr<ICompilationContext> program::make_compilation_context(const ExecutionConfig& config) {
// Single async build thread — mirrors the former _num_async_build_threads
// member that this commit removed from program.
const int _num_async_build_threads = 1;
return ICompilationContext::create(make_task_executor_config(config,
"Task executor config for CompilationContext in GPU plugin", _num_async_build_threads));
}
program::program(engine& engine_ref,
topology const& topology,
const ExecutionConfig& config,
std::shared_ptr<ov::threading::IStreamsExecutor> task_executor,
std::shared_ptr<ICompilationContext> compilation_context,
bool is_internal,
bool no_optimizations,
bool is_body_program)
@@ -158,7 +165,8 @@ program::program(engine& engine_ref,
_task_executor(std::move(task_executor)),
processing_order(),
is_internal(is_internal),
is_body_program(is_body_program) {
is_body_program(is_body_program),
_compilation_context(compilation_context) {
_config.apply_user_properties(_engine.get_device_info());
init_primitives();
GPU_DEBUG_INFO << "Program config\n" << config.to_string();
@@ -214,8 +222,8 @@ void program::init_program() {
_kernels_cache = std::unique_ptr<kernels_cache>(new kernels_cache(_engine, _config, prog_id, _task_executor,
kernel_selector::KernelBase::get_db().get_batch_header_str()));
_compilation_context = ICompilationContext::create(make_task_executor_config(_config,
"Task executor config for CompilationContext in GPU plugin", _num_async_build_threads));
if (!_compilation_context)
_compilation_context = program::make_compilation_context(_config);
_impls_cache = cldnn::make_unique<ImplementationsCache>(_impls_cache_capacity);
// Remove items of compilation context's internal queue when some impl is popped in kernels_cache
@@ -253,7 +261,18 @@ program::ptr program::build_program(engine& engine,
bool is_internal,
bool no_optimizations,
bool is_body_program) {
return std::make_shared<program>(engine, topology, config, task_executor, is_internal, no_optimizations, is_body_program);
return std::make_shared<program>(engine, topology, config, task_executor, nullptr, is_internal, no_optimizations, is_body_program);
}
// New overload added by this commit: lets the caller share an existing
// compilation context with the program being built (e.g. a parent network
// passing its context to a loop/condition body sub-network).
// Passing nullptr makes the program create its own context during
// init_program(), matching the behavior of the older overloads.
program::ptr program::build_program(engine& engine,
const topology& topology,
const ExecutionConfig& config,
std::shared_ptr<ov::threading::IStreamsExecutor> task_executor,
std::shared_ptr<ICompilationContext> compilation_context,
bool is_internal,
bool no_optimizations,
bool is_body_program) {
return std::make_shared<program>(engine, topology, config, task_executor, compilation_context, is_internal, no_optimizations, is_body_program);
}
program::ptr program::build_program(engine& engine,
@@ -262,7 +281,7 @@ program::ptr program::build_program(engine& engine,
bool is_internal,
bool no_optimizations,
bool is_body_program) {
return std::make_shared<program>(engine, topology, config, nullptr, is_internal, no_optimizations, is_body_program);
return std::make_shared<program>(engine, topology, config, nullptr, nullptr, is_internal, no_optimizations, is_body_program);
}
program::ptr program::build_program(engine& engine,

View File

@@ -31,7 +31,7 @@ static cldnn::condition::branch gen_branch(ProgramBuilder& p, const std::shared_
config.set_property(ov::intel_gpu::max_dynamic_batch(1));
config.set_property(ov::intel_gpu::allow_new_shape_infer(op->is_dynamic()));
ProgramBuilder prog(internal_body, p.get_engine(), config, false, false, p.get_task_executor(), true);
ProgramBuilder prog(internal_body, p.get_engine(), config, false, false, p.get_task_executor(), p.get_compilation_context(), true);
branch.inner_program = prog.get_compiled_program();
auto& input_map = branch.input_map;

View File

@@ -280,7 +280,7 @@ static void CreateCommonLoopOp(ProgramBuilder& p, const std::shared_ptr<ov::op::
config.set_property(ov::intel_gpu::allow_new_shape_infer(is_dynamic));
// get body program from ov::Model
ProgramBuilder prog(ov_model, p.get_engine(), config, false, false, p.get_task_executor(), true);
ProgramBuilder prog(ov_model, p.get_engine(), config, false, false, p.get_task_executor(), p.get_compilation_context(), true);
auto body_program = prog.get_compiled_program();
GPU_DEBUG_LOG << "* trip_count_id : " << trip_count_id << std::endl;

View File

@@ -55,14 +55,20 @@ std::string layer_type_name_ID(const std::shared_ptr<ov::Node>& op) {
ProgramBuilder::ProgramBuilder(std::shared_ptr<ov::Model> model, cldnn::engine& engine, const ExecutionConfig& config,
bool create_topology_only, bool partial_build,
std::shared_ptr<ov::threading::IStreamsExecutor> task_executor, bool is_inner_program)
std::shared_ptr<ov::threading::IStreamsExecutor> task_executor,
std::shared_ptr<cldnn::ICompilationContext> compilation_context,
bool is_inner_program)
: m_config(config)
, m_engine(engine)
, queryMode(false)
, m_task_executor(task_executor) {
, m_task_executor(task_executor)
, m_compilation_context(compilation_context) {
if (m_task_executor == nullptr)
m_task_executor = cldnn::program::make_task_executor(m_config);
if (m_compilation_context == nullptr) {
m_compilation_context = cldnn::program::make_compilation_context(m_config);
}
// locate global custom kernel config
// and auto-load kernels from it
#ifdef _WIN32
@@ -158,7 +164,14 @@ std::shared_ptr<cldnn::program> ProgramBuilder::build(const std::vector<std::sha
OV_ITT_SCOPED_TASK(itt::domains::intel_gpu_plugin, "ProgramBuilder::CreateProgram");
cldnn::program::ptr program;
try {
program = cldnn::program::build_program(m_engine, *m_topology, m_config, get_task_executor(), false, false, is_inner_program);
program = cldnn::program::build_program(m_engine,
*m_topology,
m_config,
get_task_executor(),
get_compilation_context(),
false,
false,
is_inner_program);
} catch (std::exception& e) {
OPENVINO_ASSERT(false, "[GPU] ProgramBuilder build failed!\n", e.what());
}

View File

@@ -7,9 +7,9 @@
#include <intel_gpu/primitives/input_layout.hpp>
#include <intel_gpu/primitives/fully_connected.hpp>
#include <intel_gpu/primitives/data.hpp>
#include "intel_gpu/runtime/compilation_context.hpp"
#include "fully_connected_inst.h"
#include "compilation_context.hpp"
#include "program_wrapper.h"

View File

@@ -14,7 +14,7 @@
#include <intel_gpu/primitives/quantize.hpp>
#include <intel_gpu/primitives/data.hpp>
#include "compilation_context.hpp"
#include "intel_gpu/runtime/compilation_context.hpp"
#include "fully_connected_inst.h"
#include <cmath>

View File

@@ -10,7 +10,7 @@
#include <intel_gpu/primitives/crop.hpp>
#include "openvino/reference/matmul.hpp"
#include "compilation_context.hpp"
#include "intel_gpu/runtime/compilation_context.hpp"
#include "gemm_inst.h"
#include <cstddef>

View File

@@ -7,7 +7,7 @@
#include <intel_gpu/primitives/input_layout.hpp>
#include <intel_gpu/primitives/group_normalization.hpp>
#include "openvino/reference/group_normalization.hpp"
#include "compilation_context.hpp"
#include "intel_gpu/runtime/compilation_context.hpp"
using namespace cldnn;