[GPU] Remote context reuse and internal config update (#14635)

Vladimir Paramuzov 2023-01-11 15:14:03 +04:00 committed by GitHub
parent 1d59a5a29b
commit 4feaeaad68
198 changed files with 4477 additions and 4876 deletions

View File

@ -1,488 +0,0 @@
// Copyright (C) 2018-2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#pragma once
#include "intel_gpu/runtime/engine.hpp"
#include "intel_gpu/primitives/implementation_desc.hpp"
#include "topology.hpp"
#include <memory>
#include <vector>
#include <string>
#include <map>
#include <utility>
namespace cldnn {
/// @addtogroup cpp_api C++ API
/// @{
/// @defgroup cpp_program Program compilation
/// @{
/// @brief Represents user-provided program build option type.
enum class build_option_type {
/// @brief Allow primitives fusing during program build (default: false).
fusing,
/// @brief Enable implicit reordering for user inputs (default: false).
optimize_data,
/// @brief Enable implicit static input reordering for user inputs (default: false).
allow_static_input_reorder,
/// @brief Enable debug mode (default: false).
/// @details This option enforces all program primitives to be accessible as outputs.
debug,
/// @brief User selected list of program outputs.
outputs,
/// @brief User defined learning parameters.
learning_config,
/// @brief Tuning config (default: Tuning is disabled).
/// @details The tuner will automatically find the optimal kernel/config for each node in the graph,
/// by running multiple implementations and configurations per node and storing the optimal one in cache.
/// Expect long execution time in the first run.
/// After the first run a cache with the tuning results will be created in the path provided.
/// This cache will be used in the next runs.
tuning_config,
/// @brief Specifies a directory to which stages of network compilation should be dumped. (default: empty, i.e. no dumping)
graph_dumps_dir,
/// @brief Specifies a directory to which compiled kernels should be cached or can be loaded from. (default: empty, i.e. no caching)
kernels_cache_dir,
/// @brief Name for serialization process
serialize_network,
load_program,
force_implementations,
partial_build_program,
allow_new_shape_infer
};
/// @brief Tuning mode.
enum class tuning_mode {
/// @brief Tuning is disabled.
tuning_disabled,
/// @brief Tuning using the cached data (no on-line tuning for non-existing data).
tuning_use_cache,
/// @brief Tuning using the cached data if it exists; tune and update the cache otherwise.
tuning_tune_and_cache,
/// @brief Tuning using the cached data and update tasks.
/// @details Performs updating tasks like removal of invalid caches, promoting to new format, etc.
/// No tuning for non-existing data.
tuning_use_and_update,
/// @brief Retune the cache data even if it exists.
tuning_retune_and_cache
};
/// @brief Tuning configuration.
struct tuning_config_options {
tuning_mode mode;
std::string cache_file_path;
tuning_config_options() : mode(tuning_mode::tuning_disabled), cache_file_path("") {}
};
/// @brief Learning parameters.
struct learning_params {
float momentum = 0.0;
float weights_decay = 0.0;
learning_params() : momentum(0.9f), weights_decay(0.0005f) {}
};
/// @brief Represents user-provided program build option.
struct build_option {
/// @brief Allow primitives fusing during program build (default: false).
static std::shared_ptr<const build_option> fusing(bool enable = false);
/// @brief Enable implicit reordering for user inputs (default: false).
static std::shared_ptr<const build_option> optimize_data(bool enable = false);
/// @brief Enable implicit reordering for static user inputs (default: false).
static std::shared_ptr<const build_option> allow_static_input_reorder(bool enable = false);
/// @brief Enable debug mode (default: false).
/// @details This option enforces all program primitives to be accessible as outputs.
static std::shared_ptr<const build_option> debug(bool enable = false);
/// @brief User selected list of program outputs.
static std::shared_ptr<const build_option> outputs(const std::vector<primitive_id>& outs);
/// @brief Tuning configuration (default: Tuning is disabled).
/// @details This option will automatically find the optimal kernel/config for each node in the graph,
/// by running multiple implementations and configurations per node and storing the optimal one in cache.
/// Expect long execution time in the first run (unless the cache-only mode is enabled).
/// After the first run a cache with the tuning results will be created in the path provided.
/// This cache will be used in the next runs.
static std::shared_ptr<const build_option> tuning_config(
const tuning_config_options& config = tuning_config_options());
/// @brief Specifies a directory to which stages of network compilation should be dumped (default: empty, i.e. no dumping)
static std::shared_ptr<const build_option> graph_dumps_dir(const std::string& dir_path);
/// @brief Specifies a directory to which compiled kernels should be cached or can be loaded from. (default: empty, i.e. no caching)
static std::shared_ptr<const build_option> kernels_cache_dir(const std::string& dir_path);
/// @brief Specifies a name for serialization process.
static std::shared_ptr<const build_option> serialize_network(const std::string& network_name);
/// @brief Specifies a name of load_program process.
static std::shared_ptr<const build_option> load_program(const std::string& network_name);
/// @brief User defined learning parameters.
static std::shared_ptr<const build_option> learning_config(const learning_params& params = learning_params());
/// @brief Specifies user defined implementation details to use.
static std::shared_ptr<const build_option> force_implementations(implementation_forcing_map forcing);
static std::shared_ptr<const build_option> partial_build_program(bool set = false);
static std::shared_ptr<const build_option> allow_new_shape_infer(bool set = false);
virtual ~build_option() = default;
private:
/// @brief Returns option type represented by this object.
virtual build_option_type get_type() const = 0;
friend class build_options;
};
/// @brief @ref build_option specialization for boolean options.
template <build_option_type OptType>
struct build_option_bool : build_option {
/// @brief Constructs option.
/// @param value Is option enabled.
explicit build_option_bool(bool value) : _value(value ? 1 : 0) {}
/// @brief Is option enabled.
bool enabled() const { return _value != 0; }
private:
build_option_type get_type() const override { return OptType; }
uintptr_t _value;
};
/// @brief @ref build_option specialization for program outputs list.
struct build_option_outputs : build_option {
/// @brief The list of output ids (names)
const std::vector<primitive_id> outputs;
/// @brief Constructs option.
/// @param outs List of output ids (names)
explicit build_option_outputs(const std::vector<primitive_id>& outs)
: outputs(outs) {}
private:
/// @brief Returns build_option_type::outputs.
build_option_type get_type() const override { return build_option_type::outputs; }
build_option_outputs(const build_option_outputs& other) = delete;
build_option_outputs& operator=(const build_option_outputs& other) = delete;
};
/// @brief @ref build_option specialization for learning config.
struct build_option_learning_config : build_option {
/// @brief Learning parameters.
const learning_params params;
/// @brief Constructs learning config build option.
/// @param params Parameters for learning.
explicit build_option_learning_config(const learning_params& params)
: params(params) {}
private:
/// @brief Returns build_option_type::learning_config.
build_option_type get_type() const override { return build_option_type::learning_config; }
build_option_learning_config(const build_option_learning_config& other) = delete;
build_option_learning_config& operator=(const build_option_learning_config& other) = delete;
};
/// @brief @ref build_option specialization for tuning config.
struct build_option_tuning_config : build_option {
/// @brief Tuning configuration
const tuning_config_options config;
/// @brief Constructs tuning config build option.
/// @param tuning_config Configuration for the tuning.
explicit build_option_tuning_config(const tuning_config_options& tuning_config)
: config(tuning_config) {}
private:
/// @brief Returns build_option_type::tuning_config.
build_option_type get_type() const override { return build_option_type::tuning_config; }
build_option_tuning_config(const build_option_tuning_config& other) = delete;
build_option_tuning_config& operator=(const build_option_tuning_config& other) = delete;
};
/// @brief @ref build_option specialization for selecting a directory.
template <build_option_type OptType>
struct build_option_directory : build_option {
const std::string directory_path;
/// @brief Constructs option.
/// @param dir_path Path to the directory.
explicit build_option_directory(const std::string& dir_path) : directory_path(dir_path) {}
private:
/// @brief Returns build_option_type::graph_dumps_dir.
build_option_type get_type() const override { return build_option_type::graph_dumps_dir; }
build_option_directory(const build_option_directory& other) = delete;
build_option_directory& operator=(const build_option_directory& other) = delete;
};
/// @brief @ref build_option specialization for selecting a directory.
template <build_option_type OptType>
struct build_option_kernels_cache_dir : build_option {
const std::string directory_path;
explicit build_option_kernels_cache_dir(const std::string& dir_path) : directory_path(dir_path) {}
private:
/// @brief Returns build_option_type::kernels_cache_dir.
build_option_type get_type() const override { return build_option_type::kernels_cache_dir; }
build_option_kernels_cache_dir(const build_option_kernels_cache_dir& other) = delete;
build_option_kernels_cache_dir& operator=(const build_option_kernels_cache_dir& other) = delete;
};
/// @brief @ref build_option specialization for serialization process.
template <build_option_type OptType>
struct build_option_serialization : build_option {
const std::string serialization_network_name;
explicit build_option_serialization(const std::string& name) : serialization_network_name(name) {}
private:
build_option_type get_type() const override { return build_option_type::serialize_network; }
build_option_serialization(const build_option_serialization& other) = delete;
build_option_serialization& operator=(const build_option_serialization& other) = delete;
};
/// @brief @ref build_option specialization for load_program process.
template <build_option_type OptType>
struct build_option_load_program : build_option {
const std::string load_program_name;
explicit build_option_load_program(const std::string& name) : load_program_name(name) {}
private:
build_option_type get_type() const override { return build_option_type::load_program; }
build_option_load_program(const build_option_load_program& other) = delete;
build_option_load_program& operator=(const build_option_load_program& other) = delete;
};
struct build_option_force_implementations : build_option {
implementation_forcing_map forcing;
explicit build_option_force_implementations(implementation_forcing_map _forcing) : forcing(std::move(_forcing)) {}
private:
build_option_type get_type() const override { return build_option_type::force_implementations; }
build_option_force_implementations(const build_option_force_implementations& other) = delete;
build_option_force_implementations& operator=(const build_option_force_implementations& other) = delete;
};
namespace detail {
/// @brief Helper template to convert @ref build_option_type value to particular @ref build_option class.
template <build_option_type OptType>
struct build_option_traits {
/// @brief @ref build_option object type which represents the particular @p OptType.
typedef build_option object_type;
/// @brief Make default @ref build_option corresponding @p OptType
static std::shared_ptr<const build_option> make_default();
};
#ifndef DOXYGEN_SHOULD_SKIP_THIS
template <>
struct build_option_traits<build_option_type::fusing> {
typedef build_option_bool<build_option_type::fusing> object_type;
static std::shared_ptr<const build_option> make_default() { return build_option::fusing(); }
};
template <>
struct build_option_traits<build_option_type::optimize_data> {
typedef build_option_bool<build_option_type::optimize_data> object_type;
static std::shared_ptr<const build_option> make_default() { return build_option::optimize_data(); }
};
template <>
struct build_option_traits<build_option_type::allow_static_input_reorder> {
typedef build_option_bool<build_option_type::allow_static_input_reorder> object_type;
static std::shared_ptr<const build_option> make_default() { return build_option::allow_static_input_reorder(); }
};
template <>
struct build_option_traits<build_option_type::debug> {
typedef build_option_bool<build_option_type::debug> object_type;
static std::shared_ptr<const build_option> make_default() { return build_option::debug(); }
};
template <>
struct build_option_traits<build_option_type::outputs> {
typedef build_option_outputs object_type;
static std::shared_ptr<const build_option> make_default() { return build_option::outputs({}); }
};
template <>
struct build_option_traits<build_option_type::learning_config> {
typedef build_option_learning_config object_type;
static std::shared_ptr<const build_option> make_default() { return build_option::learning_config(); }
};
template <>
struct build_option_traits<build_option_type::tuning_config> {
typedef build_option_tuning_config object_type;
static std::shared_ptr<const build_option> make_default() { return build_option::tuning_config(); }
};
template <>
struct build_option_traits<build_option_type::graph_dumps_dir> {
typedef build_option_directory<build_option_type::graph_dumps_dir> object_type;
static std::shared_ptr<const build_option> make_default() { return build_option::graph_dumps_dir({}); }
};
template <>
struct build_option_traits<build_option_type::kernels_cache_dir> {
typedef build_option_directory<build_option_type::kernels_cache_dir> object_type;
static std::shared_ptr<const build_option> make_default() { return build_option::kernels_cache_dir({}); }
};
template <>
struct build_option_traits<build_option_type::serialize_network> {
typedef build_option_serialization<build_option_type::serialize_network> object_type;
static std::shared_ptr<const build_option> make_default() { return build_option::serialize_network({}); }
};
template <>
struct build_option_traits<build_option_type::load_program> {
typedef build_option_load_program<build_option_type::load_program> object_type;
static std::shared_ptr<const build_option> make_default() { return build_option::load_program({}); }
};
template <>
struct build_option_traits<build_option_type::force_implementations> {
using object_type = build_option_force_implementations;
static std::shared_ptr<const build_option> make_default() { return build_option::force_implementations({}); }
};
template <>
struct build_option_traits<build_option_type::partial_build_program> {
typedef build_option_bool<build_option_type::partial_build_program> object_type;
static std::shared_ptr<const build_option> make_default() { return build_option::partial_build_program(); }
};
template <>
struct build_option_traits<build_option_type::allow_new_shape_infer> {
typedef build_option_bool<build_option_type::allow_new_shape_infer> object_type;
static std::shared_ptr<const build_option> make_default() { return build_option::allow_new_shape_infer(); }
};
#endif
} // namespace detail
#ifndef DOXYGEN_SHOULD_SKIP_THIS
inline std::shared_ptr<const build_option> build_option::fusing(bool enable) {
return std::make_shared<build_option_bool<build_option_type::fusing>>(enable);
}
inline std::shared_ptr<const build_option> build_option::optimize_data(bool enable) {
return std::make_shared<build_option_bool<build_option_type::optimize_data>>(enable);
}
inline std::shared_ptr<const build_option> build_option::allow_static_input_reorder(bool enable) {
return std::make_shared<build_option_bool<build_option_type::allow_static_input_reorder>>(enable);
}
inline std::shared_ptr<const build_option> build_option::debug(bool enable) {
return std::make_shared<build_option_bool<build_option_type::debug>>(enable);
}
inline std::shared_ptr<const build_option> build_option::outputs(const std::vector<primitive_id>& outs) {
return std::make_shared<build_option_outputs>(outs);
}
inline std::shared_ptr<const build_option> build_option::learning_config(const learning_params& params) {
return std::make_shared<build_option_learning_config>(params);
}
inline std::shared_ptr<const build_option> build_option::tuning_config(const tuning_config_options& config) {
return std::make_shared<build_option_tuning_config>(config);
}
inline std::shared_ptr<const build_option> build_option::graph_dumps_dir(const std::string& dir_path) {
return std::make_shared<build_option_directory<build_option_type::graph_dumps_dir>>(dir_path);
}
inline std::shared_ptr<const build_option> build_option::kernels_cache_dir(const std::string& dir_path) {
return std::make_shared<build_option_directory<build_option_type::kernels_cache_dir>>(dir_path);
}
inline std::shared_ptr<const build_option> build_option::serialize_network(const std::string& name) {
return std::make_shared<build_option_serialization<build_option_type::serialize_network>>(name);
}
inline std::shared_ptr<const build_option> build_option::load_program(const std::string& name) {
return std::make_shared<build_option_load_program<build_option_type::load_program>>(name);
}
inline std::shared_ptr<const build_option> build_option::force_implementations(implementation_forcing_map forcing) {
return std::make_shared<build_option_force_implementations>(std::move(forcing));
}
inline std::shared_ptr<const build_option> build_option::partial_build_program(bool enable) {
return std::make_shared<build_option_bool<build_option_type::partial_build_program>>(enable);
}
inline std::shared_ptr<const build_option> build_option::allow_new_shape_infer(bool enable) {
return std::make_shared<build_option_bool<build_option_type::allow_new_shape_infer>>(enable);
}
#endif
/// @brief Represents program build options list.
class build_options {
public:
/// @brief Adds or replace option to the options list
void set_option(std::shared_ptr<const build_option> opt) { add_or_replace_option(opt); }
/// @brief Adds or replace options to the options list
template <typename... Args>
void set_option(std::shared_ptr<const build_option> opt, Args... args) {
add_or_replace_option(opt);
set_option(args...);
}
/// @brief Constructs build options list from its arguments.
template <typename... Args>
explicit build_options(Args... args) {
set_option(args...);
}
/// @brief Returns program build option for @p OptType
template <build_option_type OptType>
std::shared_ptr<const typename detail::build_option_traits<OptType>::object_type> get() const {
using T = typename detail::build_option_traits<OptType>::object_type;
for (auto& option : _options) {
if (option->get_type() == OptType)
return std::static_pointer_cast<const T>(option);
}
return std::static_pointer_cast<const T>(detail::build_option_traits<OptType>::make_default());
}
private:
friend struct program;
std::vector<std::shared_ptr<const build_option>> _options;
void set_option(void) {}
void add_or_replace_option(std::shared_ptr<const build_option> opt) {
for (auto& p : _options) {
if (p->get_type() == opt->get_type()) {
p = opt;
return;
}
}
_options.push_back(opt);
}
};
/// @}
/// @}
} // namespace cldnn
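
For reference, a minimal sketch (not part of the diff) of how the removed build_options interface was typically composed; all identifiers below come from the header above, while the function and variable names are illustrative. After this commit the same intent is expressed through the ExecutionConfig type used in the files below.

void legacy_build_options_usage() {
    // Compose options through the variadic constructor declared above.
    cldnn::build_options opts(
        cldnn::build_option::optimize_data(true),
        cldnn::build_option::graph_dumps_dir("/tmp/cldnn_dumps"));   // illustrative path

    // get<>() returns the stored option, or the per-option default when it was never set.
    bool debug_enabled = opts.get<cldnn::build_option_type::debug>()->enabled();
    (void)debug_enabled;
}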

View File

@ -33,7 +33,7 @@ struct network_output {
memory::ptr get_memory() const {
// TODO: in_order queue doesn't create proper output event in some cases which leads to synchronization issues with user app
// So call finish for the associated stream to ensure that the output data is ready.
if (_stream->get_queue_type() == queue_types::in_order) {
if (_stream->get_queue_type() == QueueTypes::in_order) {
_stream->finish();
} else {
_event->wait();
@ -67,14 +67,15 @@ public:
};
using variables_states_map = std::map<std::string, VariableState::Ptr>;
explicit network(program::ptr program, stream::ptr stream, bool is_internal = false, bool is_primary_stream = true);
explicit network(program::ptr program, const ExecutionConfig& config, stream::ptr stream, bool is_internal = false, bool is_primary_stream = true);
network(engine& engine,
const topology& topo,
const build_options& options = build_options(),
const ExecutionConfig& config = {},
bool is_internal = false);
network(engine& engine,
const std::set<std::shared_ptr<program_node>>& nodes,
const build_options& options,
const ExecutionConfig& config,
std::shared_ptr<InferenceEngine::CPUStreamsExecutor> task_executor,
bool is_internal);
network(program::ptr program, uint16_t stream_id = 0);
@ -82,6 +83,7 @@ public:
network(program::ptr program, stream::ptr stream, uint16_t stream_id);
network(cldnn::BinaryInputBuffer& ifs, stream::ptr stream, engine& engine, uint16_t stream_id = 0);
network(cldnn::BinaryInputBuffer& ifs, const ExecutionConfig& config, stream::ptr stream, engine& engine, uint16_t stream_id = 0);
~network();
@ -89,11 +91,12 @@ public:
static ptr build_network(engine& engine,
const topology& topology,
const build_options& options = build_options(),
const ExecutionConfig& config = {},
bool is_internal = false);
static ptr build_network(engine& engine,
const std::set<std::shared_ptr<program_node>>& nodes,
const build_options& options,
const ExecutionConfig& config,
std::shared_ptr<InferenceEngine::CPUStreamsExecutor> task_executor,
bool is_internal);
static ptr allocate_network(stream::ptr stream,
@ -121,7 +124,7 @@ public:
network_output get_output(const primitive_id& output_id) {
event::ptr evt;
if (get_stream().get_queue_type() == queue_types::out_of_order)
if (get_stream().get_queue_type() == QueueTypes::out_of_order)
evt = get_primitive_event(output_id);
return network_output(evt, get_output_memory(output_id), get_stream_ptr());
}
@ -236,10 +239,13 @@ public:
ICompilationContext& get_compilation_context() const { return *_compilation_context; }
std::mutex& get_impl_cache_mutex() const { return _in_mem_cache_mutex; }
const ExecutionConfig& get_config() const { return _config; }
private:
using output_chains_map = std::map<primitive_id, std::vector<std::shared_ptr<primitive_inst>>>;
uint32_t net_id = 0;
program::ptr _program;
ExecutionConfig _config;
engine& _engine;
stream::ptr _stream;
std::unique_ptr<memory_pool> _memory_pool;
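
A hedged sketch of the corresponding call-site change: network construction and the static build_network helper now take an ExecutionConfig instead of build_options. The function below is illustrative calling code, not part of the diff; the ExecutionConfig namespace is assumed from the plugin headers elsewhere in this commit.

void build_with_config(cldnn::engine& engine, const cldnn::topology& topo) {
    // A default-constructed config matches the "= {}" default argument above.
    ov::intel_gpu::ExecutionConfig cfg;
    auto net = cldnn::network::build_network(engine, topo, cfg);

    // get_output() synchronizes as described in the TODO comment above; the id is a placeholder.
    auto out = net->get_output("output_id");
    (void)out;
}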

View File

@ -7,7 +7,7 @@
#include "intel_gpu/runtime/engine.hpp"
#include "intel_gpu/runtime/stream.hpp"
#include "intel_gpu/runtime/lru_cache.hpp"
#include "build_options.hpp"
#include "intel_gpu/runtime/execution_config.hpp"
#include <list>
#include <string>
@ -126,19 +126,22 @@ public:
program(engine& engine_ref,
topology const& topology,
build_options const& options,
const ExecutionConfig& config,
bool is_internal = false,
bool no_optimizations = false,
bool is_body_program = false);
/* constructor used to build a program from subset of nodes of other program (used in propagate_constants) */
program(engine& engine_ref,
std::set<std::shared_ptr<program_node>> const& nodes,
build_options const& options,
const ExecutionConfig& config,
std::shared_ptr<InferenceEngine::CPUStreamsExecutor> task_executor,
bool is_internal);
explicit program(engine& engine);
~program();
engine& get_engine() const { return _engine; }
const build_options& get_options() const { return options; }
const ExecutionConfig& get_config() const { return _config; }
InferenceEngine::CPUStreamsExecutor::Ptr get_task_executor() const { return _task_executor; }
std::list<program_node*>& get_inputs() {
return inputs;
} // ToDo: redesign trim to output pass to make it const as well as get_engine and get options
@ -146,7 +149,6 @@ public:
return outputs;
} // ToDo: redesign reorder-inputs pass to make it const as well as get_engine and get options
bool is_loop_body() const { return is_body_program; }
bool is_debug_build() const { return options.get<build_option_type::debug>()->enabled(); }
const nodes_ordering& get_processing_order() const;
nodes_ordering& get_processing_order();
uint32_t get_prog_id() { return prog_id; }
@ -230,13 +232,14 @@ public:
static ptr build_program(engine& engine,
const topology& topology,
const build_options& options,
const ExecutionConfig& config,
bool is_internal = false,
bool no_optimizations = false,
bool is_body_program = false);
static ptr build_program(engine& engine,
const std::set<std::shared_ptr<program_node>>& nodes,
const build_options& options,
const ExecutionConfig& config,
std::shared_ptr<InferenceEngine::CPUStreamsExecutor> task_executor,
bool is_internal);
static void init_primitives();
void compile();
@ -261,7 +264,8 @@ private:
stream::ptr _stream;
// TODO: Consider moving it to engine
std::unique_ptr<kernels_cache> _kernels_cache;
build_options options;
ExecutionConfig _config;
std::shared_ptr<InferenceEngine::CPUStreamsExecutor> _task_executor = nullptr;
std::list<program_node*> inputs;
std::vector<program_node*> outputs;
nodes_ordering processing_order;
@ -308,6 +312,7 @@ private:
void cleanup();
void transfer_memory_to_device();
std::shared_ptr<InferenceEngine::CPUStreamsExecutor> make_task_executor(const ExecutionConfig& config) const;
/*
** Analysis functions
*/
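
Similarly, a hedged sketch (assumed calling code) of building a program with the updated signature; the accessors in the hunks above replace get_options() and is_debug_build() with get_config().

void build_program_with_config(cldnn::engine& engine, const cldnn::topology& topo,
                               const ov::intel_gpu::ExecutionConfig& cfg) {
    auto prog = cldnn::program::build_program(engine, topo, cfg);
    const auto& used_cfg = prog->get_config();   // replaces the removed get_options()
    (void)used_cfg;
}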

View File

@ -14,8 +14,8 @@
#include "cpp/ie_cnn_network.h"
#include <cpp_interfaces/impl/ie_executable_network_thread_safe_default.hpp>
#include "intel_gpu/plugin/graph.hpp"
#include "intel_gpu/plugin/device_config.hpp"
#include "intel_gpu/plugin/remote_context.hpp"
#include "intel_gpu/runtime/execution_config.hpp"
namespace ov {
namespace intel_gpu {
@ -24,8 +24,8 @@ class CompiledModel : public InferenceEngine::ExecutableNetworkThreadSafeDefault
public:
typedef std::shared_ptr<CompiledModel> Ptr;
CompiledModel(InferenceEngine::CNNNetwork &network, std::shared_ptr<InferenceEngine::RemoteContext> context, Config config);
CompiledModel(std::istream& networkModel, std::shared_ptr<InferenceEngine::RemoteContext> context, Config config);
CompiledModel(InferenceEngine::CNNNetwork &network, InferenceEngine::RemoteContext::Ptr context, const ExecutionConfig& config);
CompiledModel(std::istream& networkModel, InferenceEngine::RemoteContext::Ptr context, const ExecutionConfig& config);
void Export(std::ostream& networkModel) override;
std::shared_ptr<ngraph::Function> GetExecGraphInfo() override;
@ -42,8 +42,8 @@ public:
std::shared_ptr<InferenceEngine::RemoteContext> GetContext() const override;
std::vector<std::shared_ptr<Graph>> m_graphs;
InferenceEngine::gpu::ClContext::Ptr m_context;
Config m_config;
InferenceEngine::RemoteContext::Ptr m_context;
ExecutionConfig m_config;
InferenceEngine::ITaskExecutor::Ptr m_taskExecutor;
InferenceEngine::ITaskExecutor::Ptr m_waitExecutor;
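
A minimal illustration (assumed calling code, not from the diff) of the updated CompiledModel constructor, which now takes the remote context pointer and an ExecutionConfig by const reference:

std::shared_ptr<ov::intel_gpu::CompiledModel> make_compiled_model(
        InferenceEngine::CNNNetwork& network,
        InferenceEngine::RemoteContext::Ptr context,
        const ov::intel_gpu::ExecutionConfig& cfg) {
    // Matches the first updated constructor declared above.
    return std::make_shared<ov::intel_gpu::CompiledModel>(network, context, cfg);
}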

View File

@ -1,105 +0,0 @@
// Copyright (C) 2018-2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#pragma once
#include <map>
#include <string>
#include "intel_gpu/plugin/custom_layer.hpp"
#include "intel_gpu/runtime/debug_configuration.hpp"
#include "intel_gpu/graph/network.hpp"
#include "openvino/runtime/intel_gpu/properties.hpp"
#include <ie_performance_hints.hpp>
#include <threading/ie_cpu_streams_executor.hpp>
namespace ov {
namespace intel_gpu {
struct Config {
Config(std::string device_id = "0") : device_id(device_id),
throughput_streams(1),
useProfiling(false),
dumpCustomKernels(false),
exclusiveAsyncRequests(false),
enableDynamicBatch(false),
enableInt8(true),
nv12_two_inputs(false),
queuePriority(cldnn::priority_mode_types::med),
queueThrottle(cldnn::throttle_mode_types::med),
max_dynamic_batch(1),
customLayers({}),
kernels_cache_dir(""),
inference_precision(ov::element::f16),
task_exec_config({"GPU plugin internal task executor", // name
std::max(1, static_cast<int>(std::thread::hardware_concurrency())), // # of streams
1, // # of threads per streams
InferenceEngine::IStreamsExecutor::ThreadBindingType::HYBRID_AWARE, // thread binding type
1, // thread binding step
0, // thread binding offset
1, // # of threads
InferenceEngine::IStreamsExecutor::Config::ANY}), // preferred core type
enable_loop_unrolling(true) {
GPU_DEBUG_GET_INSTANCE(debug_config);
GPU_DEBUG_IF(debug_config->serialize_compile == 1) {
task_exec_config._streams = 1;
}
adjustKeyMapValues();
}
uint32_t GetDefaultNStreamsForThroughputMode() const {
return 2;
}
void UpdateFromMap(const std::map<std::string, std::string>& configMap, const cldnn::device_info& info);
void adjustKeyMapValues();
static bool isNewApiProperty(std::string property);
static std::string ConvertPropertyToLegacy(const std::string& key, const std::string& value);
bool CanShareContextWith(const Config& other) const;
std::string device_id;
uint16_t throughput_streams;
bool useProfiling;
bool dumpCustomKernels;
bool exclusiveAsyncRequests;
bool enableDynamicBatch;
bool enableInt8;
bool nv12_two_inputs;
cldnn::priority_mode_types queuePriority;
cldnn::throttle_mode_types queueThrottle;
int max_dynamic_batch;
CustomLayerMap customLayers;
std::string kernels_cache_dir;
ov::element::Type inference_precision;
InferenceEngine::IStreamsExecutor::Config task_exec_config;
bool enable_loop_unrolling;
std::map<std::string, std::string> key_config_map;
InferenceEngine::PerfHintsConfig perfHintsConfig;
};
struct Configs {
using conf_iter = std::map<std::string, Config>::iterator;
Configs(Config conf = Config()) : configs({std::make_pair(default_device_id, conf.device_id = default_device_id)}) { }
void CreateConfig(std::string device_id);
Config& GetConfig(std::string device_id);
Config& GetDefaultDeviceConfig();
void SetDefaultDeviceID(std::string default_device_id) { this->default_device_id = default_device_id; }
std::string GetDefaultDeviceID() { return default_device_id; }
conf_iter begin() { return configs.begin(); }
conf_iter end() { return configs.end(); }
private:
std::string default_device_id = "0";
std::map<std::string, Config> configs;
};
} // namespace intel_gpu
} // namespace ov
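
The removed Config/Configs structs give way to property-based configuration. A hedged migration sketch follows: get_property usage mirrors the graph.hpp hunk later in this diff, while set_property and the specific ov:: properties chosen here are assumptions about the ExecutionConfig API rather than code shown in this commit.

#include "intel_gpu/runtime/execution_config.hpp"
#include "openvino/runtime/properties.hpp"

void configure_like_old_config() {
    ov::intel_gpu::ExecutionConfig cfg;
    // Former Config fields are expected to map onto ov:: properties (set_property is an assumed API):
    cfg.set_property(ov::cache_dir("/tmp/gpu_cache"));                    // was Config::kernels_cache_dir
    cfg.set_property(ov::hint::inference_precision(ov::element::f16));    // was Config::inference_precision
    cfg.set_property(ov::enable_profiling(true));                         // was Config::useProfiling
}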

View File

@ -23,8 +23,8 @@
#include <cpp_interfaces/impl/ie_executable_network_thread_safe_default.hpp>
#include "intel_gpu/plugin/custom_layer.hpp"
#include "intel_gpu/plugin/device_config.hpp"
#include "intel_gpu/plugin/remote_context.hpp"
#include "intel_gpu/plugin/remote_blob.hpp"
#include "intel_gpu/plugin/program.hpp"
namespace ov {
@ -40,8 +40,11 @@ public:
typedef std::shared_ptr<Graph> Ptr;
using variable_states_map = std::map<std::string, std::vector<cldnn::network::VariableState::Ptr>>;
Graph(InferenceEngine::CNNNetwork& network, InferenceEngine::gpu::ClContext::Ptr context, Config config, uint16_t stream_id = 0);
Graph(cldnn::BinaryInputBuffer& ib, InferenceEngine::gpu::ClContext::Ptr context, Config config, uint16_t stream_id = 0);
Graph(InferenceEngine::CNNNetwork& network,
RemoteContextImpl::Ptr context,
const ExecutionConfig& config,
uint16_t stream_id = 0);
Graph(cldnn::BinaryInputBuffer& ib, RemoteContextImpl::Ptr context, const ExecutionConfig& config, uint16_t stream_id = 0);
explicit Graph(std::shared_ptr<Graph> graph, uint16_t stream_id = 0);
void Export(cldnn::BinaryOutputBuffer &ob);
std::shared_ptr<ngraph::Function> GetExecGraphInfo();
@ -51,10 +54,10 @@ public:
std::map<std::string, InferenceEngine::InferenceEngineProfileInfo> GetPerformanceCounts() const;
void UpdatePerfStatistics();
const Config& getConfig() const { return m_config; }
InferenceEngine::gpu::ClContext::Ptr GetContext() { return m_context; }
std::shared_ptr<cldnn::engine> GetEngine() const { return getContextImpl(m_context)->GetEngine(); }
int GetMaxDynamicBatchSize() const { return getConfig().max_dynamic_batch; }
cldnn::engine& get_engine() const { return m_context->get_engine(); }
const ExecutionConfig& get_config() const { return m_config; }
int GetMaxDynamicBatchSize() const { return m_config.get_property(ov::intel_gpu::max_dynamic_batch); }
const std::map<std::string, cldnn::layout>& GetInputLayouts() const { return m_program->GetInputLayouts(); }
const InferenceEngine::InputsDataMap GetNetworkInputs() const { return m_program->GetNetworkInputs(); }
const InferenceEngine::OutputsDataMap GetNetworkOutputs() const { return m_program->GetNetworkOutputs(); }
@ -85,16 +88,15 @@ public:
bool use_external_queue() const;
protected:
InferenceEngine::gpu::ClContext::Ptr m_context;
RemoteContextImpl::Ptr m_context;
std::shared_ptr<Program> m_program;
std::string m_networkName;
Config m_config;
ExecutionConfig m_config;
uint16_t m_stream_id;
uint32_t m_state;
std::condition_variable m_cv;
std::mutex m_infer_mutex;
std::vector<std::shared_ptr<cldnn::network>> m_networks;
std::map<std::string, cldnn::primitive_id> primitiveIDs;
std::map<std::string, std::vector<cldnn::primitive_id>> prevPrimitiveIDs;
@ -104,7 +106,6 @@ protected:
std::map<std::string, InferenceEngine::SizeVector> outputDims;
std::shared_ptr<cldnn::network> BuildNetwork(std::shared_ptr<cldnn::program> program);
void Build();
void UpdateLayersMaps();
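
A short usage sketch of the renamed Graph accessors shown above (the function below is illustrative and assumes an existing Graph instance):

void inspect_graph(ov::intel_gpu::Graph& graph) {
    cldnn::engine& engine = graph.get_engine();        // was GetEngine()
    const auto& cfg = graph.get_config();              // was getConfig()
    int max_batch = graph.GetMaxDynamicBatchSize();    // now read via the ov::intel_gpu::max_dynamic_batch property
    (void)engine; (void)cfg; (void)max_batch;
}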

View File

@ -70,6 +70,7 @@ private:
bool m_useStreams = false;
bool m_useExternalQueue = false;
std::shared_ptr<Graph> m_graph;
InferenceEngine::gpu::ClContext::Ptr m_context = nullptr;
InferenceEngine::IStreamsExecutor* streamExecutor = nullptr;
@ -90,7 +91,7 @@ private:
template<typename RemoteBlobType, typename = typename std::enable_if<std::is_same<RemoteBlobType, RemoteCLbuffer>::value ||
std::is_same<RemoteBlobType, RemoteUSMbuffer>::value>::type>
InferenceEngine::Blob::Ptr create_remote_blob(const InferenceEngine::TensorDesc& desc, const cldnn::layout& layout,
const RemoteBlobImpl::BlobType mem_type, void* mem_ptr = nullptr);
const BlobType mem_type, void* mem_ptr = nullptr);
InferenceEngine::Blob::Ptr create_shared_device_blob(const InferenceEngine::TensorDesc& desc, const cldnn::layout& layout, void* usm_host_mem);
void allocate_inputs();
void allocate_outputs();

View File

@ -79,6 +79,7 @@ private:
bool m_useStreams = false;
bool m_useExternalQueue = false;
std::shared_ptr<Graph> m_graph;
InferenceEngine::gpu::ClContext::Ptr m_context = nullptr;
// dynamic batch stuff
std::map<std::string, std::vector<buf_info>> batchInputs;

View File

@ -1,23 +0,0 @@
// Copyright (C) 2018-2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#pragma once
#include "openvino/runtime/properties.hpp"
namespace ov {
namespace intel_gpu {
/**
* @brief Read-only property to get GPU driver version
*/
static constexpr Property<std::string, PropertyMutability::RO> driver_version{"GPU_DRIVER_VERSION"};
/**
* @brief Read-only property to get GPU device ID
*/
static constexpr Property<std::string, PropertyMutability::RO> device_id{"GPU_DEVICE_ID"};
} // namespace intel_gpu
} // namespace ov

View File

@ -0,0 +1,23 @@
// Copyright (C) 2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include "intel_gpu/runtime/execution_config.hpp"
namespace ov {
namespace intel_gpu {
class LegacyAPIHelper {
public:
static ov::AnyMap convert_legacy_properties(const std::map<std::string, std::string>& properties, bool is_new_api);
static ov::AnyMap convert_legacy_properties(const ov::AnyMap& properties, bool is_new_api);
static std::pair<std::string, ov::Any> convert_legacy_property(const std::pair<std::string, ov::Any>& legacy_property);
static std::pair<std::string, ov::Any> convert_to_legacy_property(const std::pair<std::string, ov::Any>& property);
static bool is_legacy_property(const std::pair<std::string, ov::Any>& property, bool is_new_api);
static bool is_new_api_property(const std::pair<std::string, ov::Any>& property);
static std::vector<std::string> get_supported_configs();
static std::vector<std::string> get_supported_metrics(bool model_caching_enabled);
};
} // namespace intel_gpu
} // namespace ov
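
A hedged usage sketch for the new LegacyAPIHelper (the wrapper name below is hypothetical and the header's include path is not shown in this hunk); it illustrates translating legacy 1.0-API string configuration into ov:: properties.

ov::AnyMap translate_legacy_config(const std::map<std::string, std::string>& legacy_config,
                                   bool is_new_api) {
    // Converts legacy string key/value pairs into ov::AnyMap properties.
    return ov::intel_gpu::LegacyAPIHelper::convert_legacy_properties(legacy_config, is_new_api);
}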

View File

@ -15,34 +15,37 @@
namespace ov {
namespace intel_gpu {
using CustomLayerPtr = std::shared_ptr<class CustomLayer>;
class Plugin : public InferenceEngine::IInferencePlugin,
public InferenceEngine::gpu::details::param_map_obj_getter {
class Plugin : public InferenceEngine::IInferencePlugin {
struct impl;
std::shared_ptr<impl> _impl;
bool streamsSet = false;
bool throttlingSet = false;
bool isModelCachingEnabled = false;
std::string default_device_id = "0";
// key: device_id, value: cldnn device
std::map<std::string, cldnn::device::ptr> device_map;
std::map<std::string, ExecutionConfig> m_configs_map;
// key: cldnn context, value: memory statistics
mutable std::map<RemoteCLContext::Ptr, std::map<std::string, uint64_t>> statistics_map;
mutable std::map<RemoteContextImpl::Ptr, std::map<std::string, uint64_t>> statistics_map;
mutable std::mutex engine_mutex;
mutable std::map<std::string, RemoteCLContext::Ptr> m_defaultContexts;
mutable std::map<std::string, RemoteCLContext::Ptr> m_default_contexts;
cldnn::device_info GetDeviceInfo(const std::map<std::string, std::string> &config) const;
InferenceEngine::CNNNetwork CloneAndTransformNetwork(const InferenceEngine::CNNNetwork& network,
const Config& config) const;
void TransformNetwork(std::shared_ptr<ov::Model>& model, const Config& config) const;
std::map<std::string, std::string> ConvertPerfHintsToConfig(const std::map<std::string, std::string>& network_config,
const Config& plugin_config) const;
InferenceEngine::CNNNetwork clone_and_transform_model(const InferenceEngine::CNNNetwork& network,
const ExecutionConfig& config) const;
void transform_model(std::shared_ptr<ov::Model>& model, const ExecutionConfig& config) const;
void register_primitives();
void update_memory_statistics(const RemoteContextImpl::Ptr& context) const;
std::string get_device_id_from_config(const std::map<std::string, std::string>& config) const;
std::string get_device_id(const std::map<std::string, std::string>& config) const;
RemoteCLContext::Ptr get_default_context(const std::string& device_id) const;
std::vector<ov::PropertyName> get_supported_properties() const;
std::vector<std::string> get_device_capabilities(const cldnn::device_info& info) const;
uint32_t get_optimal_batch_size(const std::map<std::string, InferenceEngine::Parameter>& options) const;
uint32_t get_max_batch_size(const std::map<std::string, InferenceEngine::Parameter>& options) const;
ov::AnyMap preprocess_config(const std::map<std::string, std::string>& orig_config) const;
void RegisterPrimitives();
void UpdateConfig(Config& conf, const InferenceEngine::CNNNetwork &network, const std::map<std::string, std::string> &params) const;
void UpdateStatistics(const RemoteCLContext::Ptr& context) const;
public:
Plugin();
@ -54,7 +57,6 @@ public:
const std::map<std::string, std::string> &config) override;
void SetConfig(const std::map<std::string, std::string> &config) override;
std::string GetDeviceIDFromConfig(const std::map<std::string, std::string>& config) const;
InferenceEngine::Parameter GetConfig(const std::string& name, const std::map<std::string, InferenceEngine::Parameter>& options) const override;
InferenceEngine::Parameter GetMetric(const std::string& name, const std::map<std::string, InferenceEngine::Parameter>& options) const override;
InferenceEngine::QueryNetworkResult QueryNetwork(const InferenceEngine::CNNNetwork& network,
@ -64,31 +66,6 @@ public:
std::shared_ptr<InferenceEngine::RemoteContext> CreateContext(const InferenceEngine::ParamMap& params) override;
std::shared_ptr<InferenceEngine::RemoteContext> GetDefaultContext(const InferenceEngine::ParamMap& params) override;
struct PluginParams {
cldnn::queue_types queue_type;
cldnn::engine_types engine_type;
cldnn::runtime_types runtime_type;
bool use_unified_shared_memory;
InferenceEngine::ITaskExecutor::Ptr task_executor;
};
static PluginParams GetParams(const Config& config, const cldnn::device::ptr& dev,
InferenceEngine::gpu_handle_param external_queue = nullptr) {
PluginParams params;
params.engine_type = cldnn::engine_types::ocl;
params.runtime_type = cldnn::runtime_types::ocl;
if (external_queue) {
params.queue_type = cldnn::stream::detect_queue_type(params.engine_type, external_queue);
} else if (dev->get_info().supports_immad) {
params.queue_type = cldnn::queue_types::in_order;
} else {
params.queue_type = cldnn::queue_types::out_of_order;
}
params.use_unified_shared_memory = true;
params.task_executor = std::make_shared<InferenceEngine::CPUStreamsExecutor>(config.task_exec_config);
return params;
}
};
} // namespace intel_gpu
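
The removed PluginParams::GetParams helper above encoded how the queue type was chosen; a condensed restatement of that branch order for reference (the helper name below is hypothetical, and note that the queue enum is spelled QueueTypes elsewhere in this commit):

cldnn::queue_types choose_queue_type(const cldnn::device::ptr& dev,
                                     InferenceEngine::gpu_handle_param external_queue) {
    // Same precedence as the removed GetParams(): external queue, then immad devices, then default.
    if (external_queue)
        return cldnn::stream::detect_queue_type(cldnn::engine_types::ocl, external_queue);
    if (dev->get_info().supports_immad)
        return cldnn::queue_types::in_order;
    return cldnn::queue_types::out_of_order;
}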

View File

@ -14,11 +14,14 @@
#include <cpp/ie_cnn_network.h>
#include <ngraph/ngraph.hpp>
#include "gpu/gpu_config.hpp"
#include "intel_gpu/plugin/device_config.hpp"
#include "intel_gpu/plugin/custom_layer.hpp"
#include "intel_gpu/runtime/engine.hpp"
#include "intel_gpu/runtime/execution_config.hpp"
#include "intel_gpu/graph/topology.hpp"
#include "intel_gpu/graph/program.hpp"
// Forward declarations for cldnn part
namespace cldnn {
@ -78,20 +81,14 @@ public:
class Program {
public:
Program(InferenceEngine::CNNNetwork& network, std::shared_ptr<cldnn::engine> engine, const Config& config,
Program(InferenceEngine::CNNNetwork& network, cldnn::engine& engine, const ExecutionConfig& config,
bool createTopologyOnly = false, bool partialBuild = false);
Program(std::shared_ptr<cldnn::engine> engine, const Config& config)
Program(cldnn::engine& engine, const ExecutionConfig& config)
: m_max_batch(1)
, m_curBatch(-1)
, m_config(config)
, m_engine(engine)
, queryMode(false) {}
Program()
: m_max_batch(1)
, m_curBatch(-1)
, m_config()
, m_engine(nullptr)
, queryMode(false) {}
static const cldnn::primitive_id m_preProcessTag;
static const cldnn::primitive_id m_meanValuesTag;
@ -109,6 +106,7 @@ public:
std::map<std::string, cldnn::layout> inputLayouts;
using BlobCacheKey = std::pair<const char*, std::vector<size_t>>;
std::map<BlobCacheKey, cldnn::primitive_id> blobMemCache;
CustomLayerMap m_custom_layers;
int m_max_batch;
int m_curBatch;
@ -119,9 +117,8 @@ public:
const std::map<std::string, cldnn::layout>& GetInputLayouts() const { return inputLayouts; }
InferenceEngine::InputsDataMap GetNetworkInputs() const { return m_networkInputs; }
InferenceEngine::OutputsDataMap GetNetworkOutputs() const { return m_networkOutputs; }
cldnn::engine& GetEngine() const { return *m_engine; }
std::shared_ptr<cldnn::engine> GetEnginePtr() const { return m_engine; }
const Config& GetConfig() const { return m_config; }
cldnn::engine& get_engine() const { return m_engine; }
const ExecutionConfig& get_config() const { return m_config; }
int GetMaxBatchSizeForSingleProgram();
bool IsOpSupported(const InferenceEngine::CNNNetwork& network, const std::shared_ptr<ngraph::Node>& op);
@ -166,8 +163,8 @@ public:
private:
static factories_map_t factories_map;
std::vector<std::shared_ptr<cldnn::program>> m_programs;
Config m_config;
std::shared_ptr<cldnn::engine> m_engine;
ExecutionConfig m_config;
cldnn::engine& m_engine;
std::shared_ptr<cldnn::topology> m_topology;
InferenceEngine::InputsDataMap m_networkInputs;
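
A hedged sketch of constructing the plugin-side Program with its updated signature, which now takes a cldnn::engine reference and an ExecutionConfig instead of a shared engine pointer and the removed Config struct (the function below is illustrative calling code):

void build_plugin_program(InferenceEngine::CNNNetwork& network,
                          cldnn::engine& engine,
                          const ov::intel_gpu::ExecutionConfig& cfg) {
    ov::intel_gpu::Program prog(network, engine, cfg);
    cldnn::engine& used_engine = prog.get_engine();    // replaces GetEngine()/GetEnginePtr()
    const auto& used_cfg = prog.get_config();          // replaces GetConfig()
    (void)used_engine; (void)used_cfg;
}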

View File

@ -0,0 +1,99 @@
// Copyright (C) 2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#pragma once
#include "intel_gpu/plugin/remote_context.hpp"
#include <string>
#include <map>
#include <memory>
#include <atomic>
namespace ov {
namespace intel_gpu {
class RemoteBlobImpl;
class RemoteAllocator : public InferenceEngine::IAllocator {
protected:
friend class RemoteBlobImpl;
std::atomic_flag _lock;
std::map<void*, const RemoteBlobImpl*> m_lockedBlobs;
void regLockedBlob(void* handle, const RemoteBlobImpl* blob);
public:
using Ptr = std::shared_ptr<RemoteAllocator>;
RemoteAllocator() { _lock.clear(std::memory_order_relaxed); }
/**
* @brief Maps handle to heap memory accessible by any memory manipulation routines.
* @return Generic pointer to memory
*/
void* lock(void* handle, InferenceEngine::LockOp = InferenceEngine::LOCK_FOR_WRITE) noexcept override { return handle; };
/**
* @brief Unmaps memory by handle with multiple sequential mappings of the same handle.
* Multiple sequential mappings of the same handle are supposed to return the same
* result; no reference counting is supported.
*/
void unlock(void* handle) noexcept override;
/**
* @brief Allocates memory
* @param size The size in bytes to allocate
* @return Handle to the allocated resource
*/
void* alloc(size_t size) noexcept override { return nullptr; }
/**
* @brief Releases handle and all associated memory resources which invalidates the handle.
* @return false if handle cannot be released, otherwise - true.
*/
bool free(void* handle) noexcept override { return true; }
void lock() {
while (_lock.test_and_set(std::memory_order_acquire)) {}
}
void unlock() {
_lock.clear(std::memory_order_release);
}
};
class USMHostAllocator : public InferenceEngine::IAllocator {
protected:
InferenceEngine::gpu::USMBlob::Ptr _usm_host_blob = nullptr;
InferenceEngine::gpu::ClContext::Ptr _context = nullptr;
public:
using Ptr = std::shared_ptr<USMHostAllocator>;
USMHostAllocator(InferenceEngine::gpu::ClContext::Ptr context) : _context(context) { }
/**
* @brief Maps handle to heap memory accessible by any memory manipulation routines.
* @return Generic pointer to memory
*/
void* lock(void* handle, InferenceEngine::LockOp = InferenceEngine::LOCK_FOR_WRITE) noexcept override;
/**
* @brief Unmaps memory by handle with multiple sequential mappings of the same handle.
* Multiple sequential mappings of the same handle are supposed to return the same
* result; no reference counting is supported.
*/
void unlock(void* handle) noexcept override;
/**
* @brief Allocates memory
* @param size The size in bytes to allocate
* @return Handle to the allocated resource
*/
void* alloc(size_t size) noexcept override;
/**
* @brief Releases handle and all associated memory resources which invalidates the handle.
* @return false if handle cannot be released, otherwise - true.
*/
bool free(void* handle) noexcept override;
};
} // namespace intel_gpu
} // namespace ov
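
A brief usage sketch of the relocated USMHostAllocator interface above (the function name is hypothetical, ctx is an existing ClContext pointer, and the allocation size is illustrative):

void usm_host_allocator_example(InferenceEngine::gpu::ClContext::Ptr ctx) {
    auto allocator = std::make_shared<ov::intel_gpu::USMHostAllocator>(ctx);
    if (void* handle = allocator->alloc(4096)) {       // allocates a USM host blob internally
        void* mapped = allocator->lock(handle);        // returns the mapped USM host pointer
        (void)mapped;
        allocator->unlock(handle);
        allocator->free(handle);                       // releases the underlying blob
    }
}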

View File

@ -0,0 +1,171 @@
// Copyright (C) 2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#pragma once
#include "intel_gpu/runtime/memory.hpp"
#include "intel_gpu/runtime/engine.hpp"
#include "intel_gpu/plugin/common_utils.hpp"
#ifndef NOMINMAX
# define NOMINMAX
#endif
#ifndef OV_GPU_USE_OPENCL_HPP
#define OV_GPU_USE_OPENCL_HPP
#endif
#ifdef _WIN32
# include <gpu/gpu_context_api_dx.hpp>
#else
# include <gpu/gpu_context_api_va.hpp>
#endif
#include <string>
#include <map>
#include <memory>
namespace ov {
namespace intel_gpu {
class RemoteContextImpl;
class RemoteBlobImpl : public InferenceEngine::gpu::details::param_map_obj_getter {
friend class RemoteAllocator;
public:
explicit RemoteBlobImpl(InferenceEngine::gpu::ClContext::Ptr context,
cldnn::stream& stream,
const cldnn::layout& layout,
cldnn::shared_handle mem = nullptr,
cldnn::shared_surface surf = 0,
uint32_t plane = 0,
BlobType mem_type = BlobType::BT_BUF_INTERNAL);
void allocate();
bool deallocate() noexcept;
InferenceEngine::ParamMap getParams() const;
std::string getDeviceName() const noexcept;
std::shared_ptr<InferenceEngine::RemoteContext> getContext() const noexcept;
InferenceEngine::LockedMemory<void> buffer() noexcept;
InferenceEngine::LockedMemory<const void> cbuffer() const noexcept;
InferenceEngine::LockedMemory<void> rwmap() noexcept;
InferenceEngine::LockedMemory<const void> rmap() const noexcept;
InferenceEngine::LockedMemory<void> wmap() noexcept;
const std::shared_ptr<InferenceEngine::IAllocator> &getAllocator() const noexcept;
void *getHandle() const noexcept { return _handle; }
void reinterpret(cldnn::layout new_layout);
bool is_allocated() const noexcept;
bool is_locked() const noexcept;
cldnn::memory::ptr get_memory() { return m_memory_object; }
protected:
std::shared_ptr<InferenceEngine::IAllocator> m_allocator;
InferenceEngine::gpu::ClContext::Ptr m_context;
cldnn::stream& m_stream;
// constructor stuff
cldnn::shared_handle m_mem;
cldnn::shared_surface m_surf;
uint32_t m_plane;
cldnn::layout m_layout;
BlobType m_mem_type;
size_t m_hash;
cldnn::memory::ptr m_memory_object;
mutable std::mutex lockedMutex;
mutable size_t lockedCounter;
mutable std::unique_ptr<cldnn::mem_lock<uint8_t>> lockedHolder;
mutable void* _handle;
void lock() const;
void unlock() const;
bool supports_caching() const;
};
template<typename TpublicAPI>
class TypedRemoteBlob : public TpublicAPI {
public:
using Ptr = std::shared_ptr<TypedRemoteBlob>;
explicit TypedRemoteBlob(InferenceEngine::gpu::ClContext::Ptr context,
cldnn::stream& stream,
const InferenceEngine::TensorDesc& desc,
const cldnn::layout& layout,
cldnn::shared_handle mem = nullptr,
cldnn::shared_surface surf = 0,
uint32_t plane = 0,
BlobType mem_type = BlobType::BT_BUF_INTERNAL)
: TpublicAPI(desc)
, _impl(context, stream, layout, mem, surf, plane, mem_type) {}
void allocate() noexcept override {
try {
if (!_impl.is_allocated())
_impl.allocate();
} catch (...) {}
}
bool deallocate() noexcept override { return _impl.deallocate(); }
InferenceEngine::ParamMap getParams() const override { return _impl.getParams(); }
std::string getDeviceName() const noexcept override { return _impl.getDeviceName(); }
std::shared_ptr<InferenceEngine::RemoteContext> getContext() const noexcept override { return _impl.getContext(); }
InferenceEngine::LockedMemory<void> buffer() noexcept override { return _impl.buffer(); }
InferenceEngine::LockedMemory<const void> cbuffer() const noexcept override { return _impl.cbuffer(); }
InferenceEngine::LockedMemory<void> rwmap() noexcept override { return _impl.rwmap(); }
InferenceEngine::LockedMemory<const void> rmap() const noexcept override { return _impl.rmap(); }
InferenceEngine::LockedMemory<void> wmap() noexcept override { return _impl.wmap(); }
RemoteBlobImpl* getImpl() { return &_impl; }
protected:
const std::shared_ptr<InferenceEngine::IAllocator> &getAllocator() const noexcept override { return _impl.getAllocator(); }
void *getHandle() const noexcept override { return _impl.getHandle(); }
RemoteBlobImpl _impl;
};
using RemoteCLbuffer = TypedRemoteBlob<InferenceEngine::gpu::ClBufferBlob>;
using RemoteUSMbuffer = TypedRemoteBlob<InferenceEngine::gpu::USMBlob>;
using RemoteCLImage2D = TypedRemoteBlob<InferenceEngine::gpu::ClImage2DBlob>;
#ifdef _WIN32
using RemoteD3DBuffer = TypedRemoteBlob<InferenceEngine::gpu::D3DBufferBlob>;
using RemoteD3DSurface = TypedRemoteBlob<InferenceEngine::gpu::D3DSurface2DBlob>;
#else
using RemoteVASurface = TypedRemoteBlob<InferenceEngine::gpu::VASurfaceBlob>;
#endif
inline RemoteBlobImpl* getBlobImpl(InferenceEngine::gpu::ClBlob* blobPtr) {
#ifdef _WIN32
{
auto ptr = blobPtr->as<RemoteD3DSurface>();
if (ptr) return ptr->getImpl();
}
{
auto ptr = blobPtr->as<RemoteD3DBuffer>();
if (ptr) return ptr->getImpl();
}
#else
{
auto ptr = blobPtr->as<RemoteVASurface>();
if (ptr) return ptr->getImpl();
}
#endif
{
auto ptr = blobPtr->as<RemoteCLbuffer>();
if (ptr) return ptr->getImpl();
}
{
auto ptr = blobPtr->as<RemoteCLImage2D>();
if (ptr) return ptr->getImpl();
}
{
auto ptr = blobPtr->as<RemoteUSMbuffer>();
if (ptr) return ptr->getImpl();
}
return nullptr;
}
} // namespace intel_gpu
} // namespace ov
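
A small helper sketch (hypothetical function name, not from the diff) showing how the getBlobImpl dispatcher above is typically used to reach the implementation behind a generic ClBlob pointer:

void ensure_remote_blob_allocated(InferenceEngine::gpu::ClBlob* blob) {
    // getBlobImpl() tries each known TypedRemoteBlob alias and returns nullptr if none matches.
    if (auto* impl = ov::intel_gpu::getBlobImpl(blob)) {
        if (!impl->is_allocated())
            impl->allocate();
    }
}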

View File

@ -6,7 +6,7 @@
#include "intel_gpu/runtime/memory.hpp"
#include "intel_gpu/runtime/engine.hpp"
#include "intel_gpu/plugin/device_config.hpp"
#include "intel_gpu/runtime/lru_cache.hpp"
#include "intel_gpu/plugin/common_utils.hpp"
#include <ie_parameter.hpp>
@ -35,12 +35,8 @@
namespace ov {
namespace intel_gpu {
class RemoteAllocator;
class RemoteBlobImpl : public InferenceEngine::gpu::details::param_map_obj_getter {
friend class RemoteAllocator;
public:
enum BlobType {
enum class BlobType {
BT_EMPTY,
BT_BUF_INTERNAL,
BT_BUF_SHARED,
@ -50,544 +46,139 @@ public:
BT_IMG_SHARED,
BT_SURF_SHARED,
BT_DX_BUF_SHARED,
};
explicit RemoteBlobImpl(InferenceEngine::gpu::ClContext::Ptr context,
cldnn::stream& stream,
const cldnn::layout& layout,
cldnn::shared_handle mem = nullptr,
cldnn::shared_surface surf = 0,
uint32_t plane = 0,
BlobType mem_type = BT_BUF_INTERNAL);
void allocate();
bool deallocate() noexcept;
InferenceEngine::ParamMap getParams() const;
std::string getDeviceName() const noexcept;
std::shared_ptr<InferenceEngine::RemoteContext> getContext() const noexcept;
InferenceEngine::LockedMemory<void> buffer() noexcept;
InferenceEngine::LockedMemory<const void> cbuffer() const noexcept;
InferenceEngine::LockedMemory<void> rwmap() noexcept;
InferenceEngine::LockedMemory<const void> rmap() const noexcept;
InferenceEngine::LockedMemory<void> wmap() noexcept;
const std::shared_ptr<InferenceEngine::IAllocator> &getAllocator() const noexcept;
void *getHandle() const noexcept { return _handle; }
void reinterpret(cldnn::layout new_layout);
bool is_allocated() const noexcept;
bool is_locked() const noexcept;
cldnn::memory::ptr getMemory() { return m_memObject; }
protected:
static RemoteAllocator m_allocator;
std::weak_ptr<InferenceEngine::gpu::ClContext> m_context;
// retain engine ptr to ensure that the memory object can be released properly in cases when RemoteContext is deleted before RemoteTensor
std::shared_ptr<cldnn::engine> m_engine;
cldnn::stream& m_stream;
// constructor stuff
cldnn::shared_handle m_mem;
cldnn::shared_surface m_surf;
uint32_t m_plane;
cldnn::layout m_layout;
BlobType m_mem_type;
cldnn::memory::ptr m_memObject;
mutable std::mutex lockedMutex;
mutable size_t lockedCounter;
mutable std::unique_ptr<cldnn::mem_lock<uint8_t>> lockedHolder;
mutable void* _handle;
mutable std::shared_ptr<InferenceEngine::IAllocator> _allocator;
void lock() const;
void unlock() const;
};
template<typename TpublicAPI>
class TypedRemoteBlob : public TpublicAPI {
public:
using Ptr = std::shared_ptr<TypedRemoteBlob>;
explicit TypedRemoteBlob(InferenceEngine::gpu::ClContext::Ptr context,
cldnn::stream& stream,
const InferenceEngine::TensorDesc& desc,
const cldnn::layout& layout,
cldnn::shared_handle mem = nullptr,
cldnn::shared_surface surf = 0,
uint32_t plane = 0,
RemoteBlobImpl::BlobType mem_type = RemoteBlobImpl::BlobType::BT_BUF_INTERNAL)
: TpublicAPI(desc)
, _impl(context, stream, layout, mem, surf, plane, mem_type) {}
void allocate() noexcept override {
try {
if (!_impl.is_allocated())
_impl.allocate();
} catch (...) {}
}
bool deallocate() noexcept override { return _impl.deallocate(); }
InferenceEngine::ParamMap getParams() const override { return _impl.getParams(); }
std::string getDeviceName() const noexcept override { return _impl.getDeviceName(); }
std::shared_ptr<InferenceEngine::RemoteContext> getContext() const noexcept override { return _impl.getContext(); }
InferenceEngine::LockedMemory<void> buffer() noexcept override { return _impl.buffer(); }
InferenceEngine::LockedMemory<const void> cbuffer() const noexcept override { return _impl.cbuffer(); }
InferenceEngine::LockedMemory<void> rwmap() noexcept override { return _impl.rwmap(); }
InferenceEngine::LockedMemory<const void> rmap() const noexcept override { return _impl.rmap(); }
InferenceEngine::LockedMemory<void> wmap() noexcept override { return _impl.wmap(); }
RemoteBlobImpl* getImpl() { return &_impl; }
protected:
const std::shared_ptr<InferenceEngine::IAllocator> &getAllocator() const noexcept override { return _impl.getAllocator(); }
void *getHandle() const noexcept override { return _impl.getHandle(); }
RemoteBlobImpl _impl;
};
using RemoteCLbuffer = TypedRemoteBlob<InferenceEngine::gpu::ClBufferBlob>;
using RemoteUSMbuffer = TypedRemoteBlob<InferenceEngine::gpu::USMBlob>;
using RemoteCLImage2D = TypedRemoteBlob<InferenceEngine::gpu::ClImage2DBlob>;
#ifdef _WIN32
using RemoteD3DBuffer = TypedRemoteBlob<InferenceEngine::gpu::D3DBufferBlob>;
using RemoteD3DSurface = TypedRemoteBlob<InferenceEngine::gpu::D3DSurface2DBlob>;
#else
using RemoteVASurface = TypedRemoteBlob<InferenceEngine::gpu::VASurfaceBlob>;
#endif
inline RemoteBlobImpl* getBlobImpl(InferenceEngine::gpu::ClBlob* blobPtr) {
#ifdef _WIN32
{
auto ptr = blobPtr->as<RemoteD3DSurface>();
if (ptr) return ptr->getImpl();
}
{
auto ptr = blobPtr->as<RemoteD3DBuffer>();
if (ptr) return ptr->getImpl();
}
#else
{
auto ptr = blobPtr->as<RemoteVASurface>();
if (ptr) return ptr->getImpl();
}
#endif
{
auto ptr = blobPtr->as<RemoteCLbuffer>();
if (ptr) return ptr->getImpl();
}
{
auto ptr = blobPtr->as<RemoteCLImage2D>();
if (ptr) return ptr->getImpl();
}
{
auto ptr = blobPtr->as<RemoteUSMbuffer>();
if (ptr) return ptr->getImpl();
}
return nullptr;
template <typename Result>
Result extract_object(const InferenceEngine::ParamMap& params, const std::string& key) {
auto itrHandle = params.find(key);
OPENVINO_ASSERT(itrHandle != params.end(), "[GPU] No parameter ", key, " found in ParamsMap");
return itrHandle->second.as<Result>();
}
class RemoteAllocator : public InferenceEngine::IAllocator {
protected:
friend class RemoteBlobImpl;
std::atomic_flag _lock;
std::map<void*, const RemoteBlobImpl*> m_lockedBlobs;
void regLockedBlob(void* handle, const RemoteBlobImpl* blob);
public:
using Ptr = std::shared_ptr<RemoteAllocator>;
RemoteAllocator() { _lock.clear(std::memory_order_relaxed); }
/**
* @brief Maps handle to heap memory accessible by any memory manipulation routines.
* @return Generic pointer to memory
*/
void* lock(void* handle, InferenceEngine::LockOp = InferenceEngine::LOCK_FOR_WRITE) noexcept override { return handle; };
/**
* @brief Unmaps memory by handle; supports multiple sequential mappings of the same handle.
* Multiple sequential mappings of the same handle are supposed to return the same
* result, since reference counting is not supported.
*/
void unlock(void* handle) noexcept override;
/**
* @brief Allocates memory
* @param size The size in bytes to allocate
* @return Handle to the allocated resource
*/
void* alloc(size_t size) noexcept override { return nullptr; }
/**
* @brief Releases handle and all associated memory resources which invalidates the handle.
* @return false if handle cannot be released, otherwise - true.
*/
bool free(void* handle) noexcept override { return true; }
void lock() {
while (_lock.test_and_set(std::memory_order_acquire)) {}
}
void unlock() {
_lock.clear(std::memory_order_release);
}
};
class USMHostAllocator : public InferenceEngine::IAllocator {
protected:
InferenceEngine::gpu::USMBlob::Ptr _usm_host_blob = nullptr;
InferenceEngine::gpu::ClContext* _context = nullptr;
public:
using Ptr = std::shared_ptr<USMHostAllocator>;
USMHostAllocator(InferenceEngine::gpu::ClContext* context) : _context(context) { }
/**
* @brief Maps handle to heap memory accessible by any memory manipulation routines.
* @return Generic pointer to memory
*/
void* lock(void* handle, InferenceEngine::LockOp = InferenceEngine::LOCK_FOR_WRITE) noexcept override {
if (!_usm_host_blob)
return nullptr;
try {
return _usm_host_blob->get();
} catch (...) {
return nullptr;
}
};
/**
* @brief Unmaps memory by handle; supports multiple sequential mappings of the same handle.
* Multiple sequential mappings of the same handle are supposed to return the same
* result, since reference counting is not supported.
*/
void unlock(void* handle) noexcept override {}
/**
* @brief Allocates memory
* @param size The size in bytes to allocate
* @return Handle to the allocated resource
*/
void* alloc(size_t size) noexcept override {
try {
auto td = InferenceEngine::TensorDesc(InferenceEngine::Precision::U8, InferenceEngine::SizeVector{size}, InferenceEngine::Layout::C);
InferenceEngine::ParamMap params = {{GPU_PARAM_KEY(SHARED_MEM_TYPE), GPU_PARAM_VALUE(USM_HOST_BUFFER)}};
_usm_host_blob = std::dynamic_pointer_cast<InferenceEngine::gpu::USMBlob>(_context->CreateBlob(td, params));
_usm_host_blob->allocate();
if (!getBlobImpl(_usm_host_blob.get())->is_allocated()) {
return nullptr;
}
return _usm_host_blob->get();
} catch (...) {
return nullptr;
}
}
/**
* @brief Releases handle and all associated memory resources which invalidates the handle.
* @return false if handle cannot be released, otherwise - true.
*/
bool free(void* handle) noexcept override {
try {
_usm_host_blob = nullptr;
} catch(...) { }
return true;
}
};
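// Usage sketch (illustrative only; example_usm_host_blob is a hypothetical helper and assumes
// 'ctx' is a ClContext created by the GPU plugin): the allocator backs a regular IE blob so
// that its storage is USM host memory, mirroring how CreateHostBlob uses it below.
inline InferenceEngine::MemoryBlob::Ptr example_usm_host_blob(InferenceEngine::gpu::ClContext* ctx,
                                                              const InferenceEngine::TensorDesc& desc) {
    return std::dynamic_pointer_cast<InferenceEngine::MemoryBlob>(
        make_blob_with_precision(desc, std::make_shared<USMHostAllocator>(ctx)));
}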
class ExecutionContextImpl : public InferenceEngine::gpu::details::param_map_obj_getter {
class RemoteContextImpl {
public:
enum ContextType {
OCL,
DEV_SHARED
};
using Ptr = std::shared_ptr<ExecutionContextImpl>;
using CPtr = std::shared_ptr<const ExecutionContextImpl>;
using Ptr = std::shared_ptr<RemoteContextImpl>;
using CPtr = std::shared_ptr<const RemoteContextImpl>;
explicit ExecutionContextImpl(std::shared_ptr<InferenceEngine::IInferencePlugin> plugin,
const InferenceEngine::ParamMap& params,
const Config& config = {});
RemoteContextImpl(std::string device_name, std::vector<cldnn::device::ptr> devices);
RemoteContextImpl(const std::vector<RemoteContextImpl::Ptr>& known_contexts, const InferenceEngine::ParamMap& params);
InferenceEngine::ParamMap getParams() const;
std::string getDeviceName() const noexcept;
InferenceEngine::ParamMap get_params() const;
std::string get_device_name() const noexcept;
InferenceEngine::MemoryBlob::Ptr create_host_blob(InferenceEngine::gpu::ClContext::Ptr public_context, const InferenceEngine::TensorDesc& desc);
InferenceEngine::RemoteBlob::Ptr create_blob(InferenceEngine::gpu::ClContext::Ptr public_context,
const InferenceEngine::TensorDesc& desc,
const InferenceEngine::ParamMap& params = {});
std::shared_ptr<cldnn::engine> GetEngine() const { return m_engine; }
Config& GetConfig() { return m_config; }
ContextType GetType() const { return m_type; }
InferenceEngine::gpu_handle_param GetExternalQueue() const { return m_external_queue; }
const std::weak_ptr<InferenceEngine::IInferencePlugin> GetPlugin() const { return m_plugin; }
cldnn::engine& get_engine() { return *m_engine; }
InferenceEngine::gpu_handle_param get_external_queue() const { return m_external_queue; }
void lock() {
while (m_lock.test_and_set(std::memory_order_acquire)) {}
}
cldnn::memory::ptr try_get_cached_memory(size_t hash);
void add_to_cache(size_t hash, cldnn::memory::ptr memory);
void unlock() {
m_lock.clear(std::memory_order_release);
}
private:
std::string get_device_name(const std::vector<RemoteContextImpl::Ptr>& known_contexts,
const cldnn::device::ptr current_device);
InferenceEngine::RemoteBlob::Ptr reuse_surface(InferenceEngine::gpu::ClContext::Ptr public_context,
const InferenceEngine::TensorDesc& desc,
const InferenceEngine::ParamMap& params);
InferenceEngine::RemoteBlob::Ptr reuse_memory(InferenceEngine::gpu::ClContext::Ptr public_context,
const InferenceEngine::TensorDesc& desc,
cldnn::shared_handle mem,
BlobType blob_type);
InferenceEngine::RemoteBlob::Ptr create_buffer(InferenceEngine::gpu::ClContext::Ptr public_context, const InferenceEngine::TensorDesc& desc);
InferenceEngine::RemoteBlob::Ptr create_usm(InferenceEngine::gpu::ClContext::Ptr public_context,
const InferenceEngine::TensorDesc& desc,
BlobType alloc_type);
void check_if_shared();
protected:
// TODO: refactor to unique_ptr
std::shared_ptr<cldnn::engine> m_engine;
InferenceEngine::gpu_handle_param m_va_display;
InferenceEngine::gpu_handle_param m_external_queue;
Config m_config;
static const size_t cache_capacity = 100;
ContextType m_type;
std::weak_ptr<InferenceEngine::IInferencePlugin> m_plugin;
std::atomic_flag m_lock;
std::string m_device_name = "";
const std::string m_plugin_name;
cldnn::LruCache<size_t, cldnn::memory::ptr> m_memory_cache;
std::mutex m_cache_mutex;
};
template<typename TpublicContextAPI>
class TypedExecutionContext : public TpublicContextAPI {
template<typename T1, typename T2>
struct _Key {
T1 _surf;
T2 _plane;
_Key(T1 surf, T2 plane) : _surf(surf), _plane(plane) {}
bool operator<(const _Key &that) const {
return _surf < that._surf || (_surf == that._surf && _plane < that._plane);
}
};
#ifdef _WIN32
using surf_key = _Key<cldnn::shared_handle, uint32_t>;
#else
using surf_key = _Key<cldnn::shared_surface, uint32_t>;
#endif
std::map<surf_key, InferenceEngine::RemoteBlob::Ptr> shared_surf_reg;
std::map<cldnn::shared_handle, InferenceEngine::RemoteBlob::Ptr> shared_obj_reg;
InferenceEngine::RemoteBlob::Ptr reuse_surf(const InferenceEngine::TensorDesc& tensorDesc, const InferenceEngine::ParamMap& params) {
using namespace InferenceEngine;
using InferenceEngine::gpu::details::param_map_obj_getter;
InferenceEngine::RemoteBlob::Ptr ret = nullptr;
auto& stream = _impl.GetEngine()->get_program_stream();
uint32_t plane = param_map_obj_getter::_ObjFromParamSimple<uint32_t>(params, GPU_PARAM_KEY(VA_PLANE));
#ifdef _WIN32
cldnn::shared_handle mem = param_map_obj_getter::_ObjFromParamSimple<cldnn::shared_handle>(params, GPU_PARAM_KEY(DEV_OBJECT_HANDLE));
surf_key skey(mem, plane);
#else
cldnn::shared_surface surf = param_map_obj_getter::_ObjFromParamSimple<cldnn::shared_surface>(params, GPU_PARAM_KEY(DEV_OBJECT_HANDLE));
surf_key skey(surf, plane);
#endif
std::lock_guard<ExecutionContextImpl> locker(_impl);
// try to locate previously shared surface
auto itr = shared_surf_reg.find(skey);
if (itr != shared_surf_reg.end()) {
ret = itr->second;
} else {
// unluckily, not found - create a new one and insert into registry
cldnn::layout layout(DataTypeFromPrecision(tensorDesc.getPrecision()),
ImageFormatFromLayout(tensorDesc.getLayout()),
tensor_from_dims(tensorDesc.getDims()));
auto smart_this =
std::dynamic_pointer_cast<InferenceEngine::gpu::ClContext>(this->shared_from_this());
#ifdef _WIN32
ret = std::make_shared<RemoteD3DSurface>(smart_this, stream,
tensorDesc, layout, mem, 0, plane,
RemoteBlobImpl::BlobType::BT_SURF_SHARED);
#else
ret = std::make_shared<RemoteVASurface>(smart_this, stream,
tensorDesc, layout, nullptr, surf, plane,
RemoteBlobImpl::BlobType::BT_SURF_SHARED);
#endif
shared_surf_reg[skey] = ret;
}
return ret;
}
InferenceEngine::RemoteBlob::Ptr reuse_obj(const InferenceEngine::TensorDesc& tensorDesc,
cldnn::shared_handle mem,
RemoteBlobImpl::BlobType blob_type) {
InferenceEngine::RemoteBlob::Ptr ret = nullptr;
std::lock_guard<ExecutionContextImpl> locker(_impl);
auto& stream = _impl.GetEngine()->get_program_stream();
// try to locate previously shared object
auto itr = shared_obj_reg.find(mem);
if (itr != shared_obj_reg.end()) {
ret = itr->second;
} else {
// unluckily, not found - create a new one and insert into registry
cldnn::layout layout(DataTypeFromPrecision(tensorDesc.getPrecision()),
FormatFromLayout(tensorDesc.getLayout()),
tensor_from_dims(tensorDesc.getDims()));
auto smart_this =
std::dynamic_pointer_cast<InferenceEngine::gpu::ClContext>(this->shared_from_this());
switch (blob_type) {
case RemoteBlobImpl::BlobType::BT_BUF_SHARED:
ret = std::make_shared<RemoteCLbuffer>(smart_this, stream, tensorDesc, layout, mem, 0, 0, blob_type);
break;
case RemoteBlobImpl::BlobType::BT_USM_SHARED:
ret = std::make_shared<RemoteUSMbuffer>(smart_this, stream, tensorDesc, layout, mem, 0, 0, blob_type);
break;
case RemoteBlobImpl::BlobType::BT_IMG_SHARED:
layout.format = ImageFormatFromLayout(tensorDesc.getLayout());
ret = std::make_shared<RemoteCLImage2D>(smart_this, stream, tensorDesc, layout, mem, 0, 0, blob_type);
break;
#ifdef _WIN32
case RemoteBlobImpl::BlobType::BT_DX_BUF_SHARED:
ret = std::make_shared<RemoteD3DBuffer>(smart_this, stream, tensorDesc, layout, mem, 0, 0, blob_type);
break;
#endif
default:
break;
}
shared_obj_reg[mem] = ret;
}
return ret;
}
InferenceEngine::RemoteBlob::Ptr create_buffer(const InferenceEngine::TensorDesc& tensorDesc) {
cldnn::layout layout(DataTypeFromPrecision(tensorDesc.getPrecision()),
FormatFromLayout(tensorDesc.getLayout()),
tensor_from_dims(tensorDesc.getDims()));
auto smart_this = std::dynamic_pointer_cast<InferenceEngine::gpu::ClContext>(this->shared_from_this());
auto& stream = _impl.GetEngine()->get_program_stream();
return std::make_shared<RemoteCLbuffer>(smart_this,
stream,
tensorDesc,
layout,
nullptr, 0, 0,
RemoteBlobImpl::BlobType::BT_BUF_INTERNAL);
}
InferenceEngine::RemoteBlob::Ptr create_usm(const InferenceEngine::TensorDesc& tensorDesc, RemoteBlobImpl::BlobType alloc_type) {
cldnn::layout layout(DataTypeFromPrecision(tensorDesc.getPrecision()),
FormatFromLayout(tensorDesc.getLayout()),
tensor_from_dims(tensorDesc.getDims()));
auto smart_this = std::dynamic_pointer_cast<InferenceEngine::gpu::ClContext>(this->shared_from_this());
auto& stream = _impl.GetEngine()->get_program_stream();
return std::make_shared<RemoteUSMbuffer>(smart_this,
stream,
tensorDesc,
layout,
nullptr, 0, 0,
alloc_type);
}
void check_if_shared() {
if (GetType() != ExecutionContextImpl::ContextType::DEV_SHARED)
IE_THROW() << "Shared context is required to to share this type of memory";
}
// The template class below is needed to allow proper casting of user contexts
// We have the following public class hierarchy:
// RemoteContext
// |
// ClContext
// | |
// VAContext D3DContext
// So our implementation must allow casting of the context object to the proper user type (ClContext, VAContext or D3DContext)
// Thus we introduce this template which has 3 instances with different base classes:
// RemoteContext
// |
// ---------- ClContext -----------
// | | |
// VAContext | D3DContext
// | | |
// RemoteVAContext RemoteCLContext RemoteD3DContext
//
// All these context types are just thin wrappers that call the common internal context implementation (RemoteContextImpl)
template<typename PublicContextType>
class TypedRemoteContext : public PublicContextType {
public:
using Ptr = std::shared_ptr<TypedExecutionContext>;
using CPtr = std::shared_ptr<const TypedExecutionContext>;
using Ptr = std::shared_ptr<TypedRemoteContext>;
explicit TypedExecutionContext(std::shared_ptr<InferenceEngine::IInferencePlugin> plugin,
const InferenceEngine::ParamMap& params,
const Config& config = {})
: _impl(plugin, params, config) {}
TypedRemoteContext(std::string device_name, std::vector<cldnn::device::ptr> devices)
: m_impl(std::make_shared<RemoteContextImpl>(device_name, devices)) {}
TypedRemoteContext(const std::vector<RemoteContextImpl::Ptr>& known_contexts, const InferenceEngine::ParamMap& params)
: m_impl(std::make_shared<RemoteContextImpl>(known_contexts, params)) {}
~TypedExecutionContext() {
shared_surf_reg.clear();
shared_obj_reg.clear();
}
InferenceEngine::ParamMap getParams() const override { return m_impl->get_params(); }
std::string getDeviceName() const noexcept override { return m_impl->get_device_name(); }
InferenceEngine::MemoryBlob::Ptr CreateHostBlob(const InferenceEngine::TensorDesc& desc) override {
return m_impl->create_host_blob(std::dynamic_pointer_cast<InferenceEngine::gpu::ClContext>(this->shared_from_this()), desc);
}
InferenceEngine::RemoteBlob::Ptr CreateBlob(const InferenceEngine::TensorDesc& desc, const InferenceEngine::ParamMap& params = {}) override {
return m_impl->create_blob(std::dynamic_pointer_cast<InferenceEngine::gpu::ClContext>(this->shared_from_this()), desc, params);
}
InferenceEngine::ParamMap getParams() const override { return _impl.getParams(); }
std::string getDeviceName() const noexcept override { return _impl.getDeviceName(); }
RemoteContextImpl::Ptr get_impl() { return m_impl; }
InferenceEngine::MemoryBlob::Ptr CreateHostBlob(const InferenceEngine::TensorDesc& tensorDesc) override {
if (_impl.GetEngine()->use_unified_shared_memory())
return std::dynamic_pointer_cast<InferenceEngine::MemoryBlob>(make_blob_with_precision(tensorDesc, std::make_shared<USMHostAllocator>(this)));
else
return std::dynamic_pointer_cast<InferenceEngine::MemoryBlob>(make_blob_with_precision(tensorDesc));
}
InferenceEngine::RemoteBlob::Ptr CreateBlob(const InferenceEngine::TensorDesc& tensorDesc, const InferenceEngine::ParamMap& params = {}) override {
using namespace InferenceEngine;
using InferenceEngine::gpu::details::param_map_obj_getter;
if (params.empty()) {
// user wants plugin to allocate blob by itself and return handle
return create_buffer(tensorDesc);
} else {
// user will supply shared object handle
std::string memTypeStr = param_map_obj_getter::_StrFromParams(params, GPU_PARAM_KEY(SHARED_MEM_TYPE));
bool is_usm = memTypeStr == GPU_PARAM_VALUE(USM_HOST_BUFFER) ||
memTypeStr == GPU_PARAM_VALUE(USM_DEVICE_BUFFER) ||
memTypeStr == GPU_PARAM_VALUE(USM_USER_BUFFER);
if (is_usm && !_impl.GetEngine()->use_unified_shared_memory()) {
IE_THROW(NotAllocated) << "Can't create USM tensor as USM is not supported (or manually disabled) on current device";
}
if (GPU_PARAM_VALUE(VA_SURFACE) == memTypeStr) {
check_if_shared();
return reuse_surf(tensorDesc, params);
} else if (GPU_PARAM_VALUE(USM_HOST_BUFFER) == memTypeStr) {
return create_usm(tensorDesc, RemoteBlobImpl::BlobType::BT_USM_HOST_INTERNAL);
} else if (GPU_PARAM_VALUE(USM_DEVICE_BUFFER) == memTypeStr) {
return create_usm(tensorDesc, RemoteBlobImpl::BlobType::BT_USM_DEVICE_INTERNAL);
} else {
RemoteBlobImpl::BlobType blob_type;
cldnn::shared_handle mem = nullptr;
if (GPU_PARAM_VALUE(OCL_BUFFER) == memTypeStr) {
blob_type = RemoteBlobImpl::BlobType::BT_BUF_SHARED;
mem = param_map_obj_getter::_ObjFromParamSimple<cldnn::shared_handle>(params, GPU_PARAM_KEY(MEM_HANDLE));
} else if (GPU_PARAM_VALUE(USM_USER_BUFFER) == memTypeStr) {
blob_type = RemoteBlobImpl::BlobType::BT_USM_SHARED;
mem = param_map_obj_getter::_ObjFromParamSimple<cldnn::shared_handle>(params, GPU_PARAM_KEY(MEM_HANDLE));
} else if (GPU_PARAM_VALUE(OCL_IMAGE2D) == memTypeStr) {
blob_type = RemoteBlobImpl::BlobType::BT_IMG_SHARED;
mem = param_map_obj_getter::_ObjFromParamSimple<cldnn::shared_handle>(params, GPU_PARAM_KEY(MEM_HANDLE));
#ifdef _WIN32
} else if (GPU_PARAM_VALUE(DX_BUFFER) == memTypeStr) {
blob_type = RemoteBlobImpl::BlobType::BT_DX_BUF_SHARED;
mem = param_map_obj_getter::_ObjFromParamSimple<cldnn::shared_handle>(params, GPU_PARAM_KEY(DEV_OBJECT_HANDLE));
check_if_shared();
#endif
} else {
IE_THROW() << "Unsupported shared object type " << memTypeStr;
}
return reuse_obj(tensorDesc, mem, blob_type);
}
}
}
Config& GetConfig() { return _impl.GetConfig(); }
ExecutionContextImpl::ContextType GetType() const { return _impl.GetType(); }
ExecutionContextImpl* getImpl() { return &_impl; }
protected:
ExecutionContextImpl _impl;
private:
std::shared_ptr<RemoteContextImpl> m_impl;
};
using RemoteCLContext = TypedExecutionContext<InferenceEngine::gpu::ClContext>;
using RemoteCLContext = TypedRemoteContext<InferenceEngine::gpu::ClContext>;
#ifdef _WIN32
using RemoteD3DContext = TypedExecutionContext<InferenceEngine::gpu::D3DContext>;
using RemoteD3DContext = TypedRemoteContext<InferenceEngine::gpu::D3DContext>;
#else
using RemoteVAContext = TypedExecutionContext<InferenceEngine::gpu::VAContext>;
using RemoteVAContext = TypedRemoteContext<InferenceEngine::gpu::VAContext>;
#endif
inline ExecutionContextImpl* getContextImpl(InferenceEngine::gpu::ClContext::Ptr ctxPtr) {
inline std::shared_ptr<RemoteContextImpl> get_context_impl(InferenceEngine::gpu::ClContext::Ptr context) {
OPENVINO_ASSERT(context != nullptr, "[GPU] Couldn't get impl from invalid context object");
#ifdef _WIN32
{
auto ptr = ctxPtr->as<RemoteD3DContext>();
if (ptr) return ptr->getImpl();
}
if (auto ptr = context->as<RemoteD3DContext>())
return ptr->get_impl();
#else
{
auto ptr = ctxPtr->as<RemoteVAContext>();
if (ptr) return ptr->getImpl();
}
if (auto ptr = context->as<RemoteVAContext>())
return ptr->get_impl();
#endif
{
auto ptr = ctxPtr->as<RemoteCLContext>();
if (ptr) return ptr->getImpl();
}
return nullptr;
if (auto ptr = context->as<RemoteCLContext>())
return ptr->get_impl();
OPENVINO_ASSERT(false, "[GPU] Couldn't get context impl from public context object.");
}
inline std::shared_ptr<RemoteContextImpl> get_context_impl(InferenceEngine::RemoteContext::Ptr context) {
OPENVINO_ASSERT(context != nullptr, "[GPU] Couldn't get impl from invalid context object");
auto casted = std::dynamic_pointer_cast<InferenceEngine::gpu::ClContext>(context);
OPENVINO_ASSERT(casted != nullptr, "[GPU] Couldn't get context impl: Context type is not ClContext or it's derivatives");
return get_context_impl(casted);
}
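// Usage sketch (illustrative only; example_engine_from_context is a hypothetical helper):
// once the impl is extracted from a user-provided remote context, the plugin can reach
// the underlying cldnn::engine through it.
inline cldnn::engine& example_engine_from_context(InferenceEngine::RemoteContext::Ptr context) {
    return get_context_impl(context)->get_engine();
}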
} // namespace intel_gpu


@ -8,19 +8,20 @@
#include <ngraph/function.hpp>
#include "intel_gpu/plugin/device_config.hpp"
#include "intel_gpu/runtime/execution_config.hpp"
#include "intel_gpu/runtime/device.hpp"
namespace ov {
namespace intel_gpu {
class TransformationsPipeline {
public:
explicit TransformationsPipeline(const Config &conf, const cldnn::device_info &device_info)
explicit TransformationsPipeline(const ExecutionConfig &conf, const cldnn::device_info &device_info)
: config(conf), device_info(device_info) {}
void apply(std::shared_ptr<ov::Model> func);
private:
Config config;
const ExecutionConfig& config;
cldnn::device_info device_info;
};


@ -13,7 +13,7 @@ namespace intel_gpu {
class VariableState : public InferenceEngine::IVariableStateInternal {
public:
VariableState(const std::string& name, const std::vector<cldnn::network::VariableState::Ptr>& states,
std::shared_ptr<cldnn::engine> engine, int currentBatch);
cldnn::engine& engine, int currentBatch);
/**
* @brief Reset internal variable state for relevant infer request, to a value specified as
@ -41,7 +41,7 @@ private:
int currentBatch_;
std::vector<cldnn::network::VariableState::Ptr> states_;
InferenceEngine::TensorDesc desc_;
std::shared_ptr<cldnn::engine> engine_;
cldnn::engine& engine_;
};
} // namespace intel_gpu


@ -49,6 +49,25 @@ inline std::ostream& operator<<(std::ostream& out, const impl_types& impl_type)
return out;
}
inline std::istream& operator>>(std::istream& is, impl_types& impl_type) {
std::string str;
is >> str;
if (str == "cpu") {
impl_type = impl_types::cpu;
} else if (str == "common") {
impl_type = impl_types::common;
} else if (str == "ocl") {
impl_type = impl_types::ocl;
} else if (str == "onednn") {
impl_type = impl_types::onednn;
} else if (str == "any") {
impl_type = impl_types::any;
} else {
throw ov::Exception{"Unsupported impl type: " + str};
}
return is;
}
/// @brief Possible supported shape types.
enum class shape_types : uint8_t {
static_shape = 1 << 0,
@ -82,25 +101,35 @@ inline std::ostream& operator<<(std::ostream& out, const shape_types& shape_type
return out;
}
/// @brief Description of primitives implementation.
struct implementation_desc {
format::type output_format; ///< Output format.
} // namespace cldnn
namespace ov {
namespace intel_gpu {
struct ImplementationDesc {
cldnn::format::type output_format; ///< Output format.
std::string kernel_name; ///< GPU kernel name.
impl_types impl_type; ///< GPU implementation type.
cldnn::impl_types impl_type; ///< GPU implementation type.
implementation_desc() :
output_format(format::any),
ImplementationDesc() :
output_format(cldnn::format::any),
kernel_name(""),
impl_type(impl_types::any) {}
impl_type(cldnn::impl_types::any) {}
implementation_desc(format::type output_format,
ImplementationDesc(cldnn::format::type output_format,
std::string kernel_name,
impl_types impl_type = impl_types::any) :
cldnn::impl_types impl_type = cldnn::impl_types::any) :
output_format(output_format),
kernel_name(kernel_name),
impl_type(impl_type) {}
};
using implementation_forcing_map = std::map<primitive_id, implementation_desc>;
inline std::ostream& operator<<(std::ostream& out, const ImplementationDesc& desc) {
out << desc.impl_type << ":" << desc.kernel_name << ":" << desc.output_format;
return out;
}
} // namespace cldnn
using ImplForcingMap = std::map<cldnn::primitive_id, ImplementationDesc>;
} // namespace intel_gpu
} // namespace ov
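// Usage sketch (illustrative only; "conv1" is a hypothetical primitive id): an ImplForcingMap
// entry forces an OCL implementation with bfyx output format for a single primitive.
inline ov::intel_gpu::ImplForcingMap example_forcing_map() {
    return {{"conv1", ov::intel_gpu::ImplementationDesc(cldnn::format::bfyx, "", cldnn::impl_types::ocl)}};
}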


@ -5,11 +5,12 @@
#pragma once
#include "device.hpp"
#include "engine_configuration.hpp"
#include "event.hpp"
#include "memory_caps.hpp"
#include "memory_pool.hpp"
#include "layout.hpp"
#include "execution_config.hpp"
#include "engine_configuration.hpp"
#include <threading/ie_cpu_streams_executor.hpp>
#include <memory>
@ -91,9 +92,6 @@ public:
/// Checks if the current engine supports the specified allocation @p type
bool supports_allocation(allocation_type type) const;
/// Returns configuration of current engine
const engine_configuration& configuration() const { return _configuration; }
/// Returns device structure which stores device capabilities
device_info get_device_info() const;
@ -129,22 +127,23 @@ public:
uint64_t get_max_memory_size() const;
/// Create stream object for current engine
virtual stream_ptr create_stream() const = 0;
virtual stream_ptr create_stream(const ExecutionConfig& config) const = 0;
/// Creates stream object from user handle
virtual stream_ptr create_stream(void *handle) const = 0;
virtual stream_ptr create_stream(const ExecutionConfig& config, void *handle) const = 0;
/// Returns service stream which can be used during program build and optimizations
virtual stream& get_program_stream() const = 0;
virtual stream& get_service_stream() const = 0;
virtual allocation_type detect_usm_allocation_type(const void* memory) const = 0;
#ifdef ENABLE_ONEDNN_FOR_GPU
/// Creates onednn engine object which shares device and context with current engine
virtual void create_onednn_engine(const ExecutionConfig& config) = 0;
/// Returns onednn engine object which shares device and context with current engine
virtual dnnl::engine& get_onednn_engine() const = 0;
#endif
/// Return GPU plugin internal task executor
const InferenceEngine::ITaskExecutor::Ptr get_task_executor();
/// Factory method which creates engine object with impl configured by @p engine_type
/// @param engine_type requested engine type
@ -152,13 +151,7 @@ public:
/// @param runtime_type requested execution runtime for the engine. @note some runtime/engine types configurations might be unsupported
/// @param device specifies the device which the engine is created for
/// @param configuration options for the engine
static std::shared_ptr<cldnn::engine> create(engine_types engine_type,
runtime_types runtime_type,
const device::ptr device,
const engine_configuration& configuration = engine_configuration(),
const InferenceEngine::ITaskExecutor::Ptr task_executor =
std::make_shared<InferenceEngine::CPUStreamsExecutor>(
InferenceEngine::CPUStreamsExecutor::Config()));
static std::shared_ptr<cldnn::engine> create(engine_types engine_type, runtime_types runtime_type, const device::ptr device);
/// Factory method which creates engine object with impl configured by @p engine_type
/// @param engine_type requested engine type
@ -166,19 +159,12 @@ public:
/// @param task_executor GPU plugin internal task executor
/// @param configuration options for the engine
/// @note engine is created for the first device returned by devices query
static std::shared_ptr<cldnn::engine> create(engine_types engine_type,
runtime_types runtime_type,
const engine_configuration& configuration = engine_configuration(),
const InferenceEngine::ITaskExecutor::Ptr task_executor =
std::make_shared<InferenceEngine::CPUStreamsExecutor>(
InferenceEngine::CPUStreamsExecutor::Config()));
static std::shared_ptr<cldnn::engine> create(engine_types engine_type, runtime_types runtime_type);
protected:
/// Create engine for given @p device and @p configuration
engine(const device::ptr device, const engine_configuration& configuration, const InferenceEngine::ITaskExecutor::Ptr task_executor);
const InferenceEngine::ITaskExecutor::Ptr _task_executor;
engine(const device::ptr device);
const device::ptr _device;
engine_configuration _configuration;
mutable std::mutex _mutex;
std::map<allocation_type, std::atomic<uint64_t>> _memory_usage_map;


@ -13,101 +13,23 @@
namespace cldnn {
/// @addtogroup cpp_api C++ API
/// @{
/// @defgroup cpp_engine Execution Engine
/// @{
/// @brief Defines available engine types
enum class engine_types : int32_t {
ocl,
};
inline std::ostream& operator<<(std::ostream& os, engine_types type) {
switch (type) {
case engine_types::ocl: os << "ocl"; break;
default: os << "unknown"; break;
}
return os;
}
/// @brief Defines available runtime types
enum class runtime_types : int32_t {
ocl,
};
/// @brief Defines available priority mode types
enum class priority_mode_types : int16_t {
disabled,
low,
med,
high
};
/// @brief Defines available throttle mode types
enum class throttle_mode_types : int16_t {
disabled,
low,
med,
high
};
/// @brief Defines supported queue types
enum class queue_types : int16_t {
in_order,
out_of_order
};
/// @brief Configuration parameters for created engine.
struct engine_configuration {
const bool enable_profiling; ///< Enable per-primitive profiling.
const queue_types queue_type; ///< Specifies type of queue used by the runtime
const std::string sources_dumps_dir; ///< Specifies a directory where sources of cldnn::program objects should be dumped.
///< Empty by default (means no dumping).
const priority_mode_types priority_mode; ///< Priority mode (support of priority hints in command queue). If cl_khr_priority_hints extension
///< is not supported by current OpenCL implementation, the value must be set to cldnn_priority_disabled.
const throttle_mode_types throttle_mode; ///< Throttle mode (support of throttle hints in command queue). If cl_khr_throttle_hints extension
///< is not supported by current OpenCL implementation, the value must be set to cldnn_throttle_disabled.
bool use_memory_pool; ///< Enables memory usage optimization. memory objects will be reused when possible
///< (switched off for older drivers than NEO).
bool use_unified_shared_memory; ///< Enables USM usage
const std::string kernels_cache_path; ///< Path to compiled kernels cache
uint16_t throughput_streams; ///< Number of queues/streams executed in parallel by GPU plugin
const std::string tuning_cache_path; ///< Path to tuning kernel cache
/// @brief Constructs engine configuration with specified options.
/// @param enable_profiling Enable per-primitive profiling.
/// @param queue_type Specifies type of queue used by the runtime
/// @param sources_dumps_dir Specifies a directory where sources of cldnn::program objects should be dumped
/// @param priority_mode Priority mode for all streams created within the engine
/// @param throttle_mode Throttle mode for all streams created within the engine
/// @param use_memory_pool Controls whether engine is allowed to reuse intermediate memory buffers within a network
/// @param use_unified_shared_memory If this option is true and the device supports USM, then the engine will use USM for all memory allocations
/// @param kernels_cache_path Path to existing directory where plugin can cache compiled kernels
/// @param n_threads Max number of host threads used in gpu plugin
/// @param throughput_streams Number of queues/streams executed in parallel by GPU plugin
/// @param tuning_cache_path Path to tuning kernel cache
engine_configuration(
bool enable_profiling = false,
queue_types queue_type = queue_types::out_of_order,
const std::string& sources_dumps_dir = std::string(),
priority_mode_types priority_mode = priority_mode_types::med,
throttle_mode_types throttle_mode = throttle_mode_types::med,
bool use_memory_pool = true,
bool use_unified_shared_memory = true,
const std::string& kernels_cache_path = "",
uint16_t throughput_streams = 1,
const std::string& tuning_cache_path = "cache.json")
: enable_profiling(enable_profiling)
, queue_type(queue_type)
, sources_dumps_dir(sources_dumps_dir)
, priority_mode(priority_mode)
, throttle_mode(throttle_mode)
, use_memory_pool(use_memory_pool)
, use_unified_shared_memory(use_unified_shared_memory)
, kernels_cache_path(kernels_cache_path)
, throughput_streams(throughput_streams)
, tuning_cache_path(tuning_cache_path) { }
};
/// @}
/// @}
} // namespace cldnn


@ -0,0 +1,162 @@
// Copyright (C) 2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#pragma once
#include "intel_gpu/runtime/internal_properties.hpp"
#include "intel_gpu/runtime/device.hpp"
namespace ov {
namespace intel_gpu {
enum class PropertyVisibility {
INTERNAL = 0,
PUBLIC = 1
};
inline std::ostream& operator<<(std::ostream& os, const PropertyVisibility& visibility) {
switch (visibility) {
case PropertyVisibility::PUBLIC: os << "PUBLIC"; break;
case PropertyVisibility::INTERNAL: os << "INTERNAL"; break;
default: os << "UNKNOWN"; break;
}
return os;
}
class BaseValidator {
public:
using Ptr = std::shared_ptr<BaseValidator>;
virtual ~BaseValidator() = default;
virtual bool is_valid(const ov::Any& v) const = 0;
};
class FuncValidator : public BaseValidator {
public:
explicit FuncValidator(std::function<bool(const ov::Any)> func) : m_func(func) { }
bool is_valid(const ov::Any& v) const override {
return m_func(v);
}
private:
std::function<bool(const ov::Any)> m_func;
};
// PropertyTypeValidator ensures that value can be converted to given property type
template<typename T>
class PropertyTypeValidator : public BaseValidator {
public:
bool is_valid(const ov::Any& v) const override {
try {
v.as<T>();
return true;
} catch (ov::Exception&) {
return false;
}
}
};
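// Sketch (illustrative only; example_is_valid_bool is a hypothetical helper):
// PropertyTypeValidator<bool> accepts any ov::Any convertible to bool and rejects everything else.
inline bool example_is_valid_bool(const ov::Any& value) {
    return PropertyTypeValidator<bool>().is_valid(value);
}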
class ExecutionConfig {
public:
ExecutionConfig();
ExecutionConfig(std::initializer_list<ov::AnyMap::value_type> values) : ExecutionConfig() { set_property(ov::AnyMap(values)); }
explicit ExecutionConfig(const ov::AnyMap& properties) : ExecutionConfig() { set_property(properties); }
explicit ExecutionConfig(const ov::AnyMap::value_type& property) : ExecutionConfig() { set_property(property); }
void set_default();
void set_property(const ov::AnyMap& properties);
void set_user_property(const ov::AnyMap& properties);
Any get_property(const std::string& name) const;
bool is_set_by_user(const std::string& name) const;
bool is_supported(const std::string& name) const;
void register_property_impl(const std::pair<std::string, ov::Any>& property, PropertyVisibility visibility, BaseValidator::Ptr validator);
template <PropertyVisibility visibility, typename... PropertyInitializer, typename std::enable_if<(sizeof...(PropertyInitializer) == 0), bool>::type = true>
void register_property_impl() { }
template <PropertyVisibility visibility, typename T, PropertyMutability mutability, typename ValueT, typename... PropertyInitializer>
void register_property_impl(const std::tuple<ov::Property<T, mutability>, ValueT>& property, PropertyInitializer&&... properties) {
auto p = std::get<0>(property)(std::get<1>(property));
auto v = std::dynamic_pointer_cast<BaseValidator>(std::make_shared<PropertyTypeValidator<T>>());
register_property_impl(std::move(p), visibility, std::move(v));
register_property_impl<visibility>(properties...);
}
template <PropertyVisibility visibility,
typename T,
PropertyMutability mutability,
typename ValueT,
typename ValidatorT,
typename... PropertyInitializer>
typename std::enable_if<std::is_base_of<BaseValidator, ValidatorT>::value, void>::type
register_property_impl(const std::tuple<ov::Property<T, mutability>, ValueT, ValidatorT>& property, PropertyInitializer&&... properties) {
auto p = std::get<0>(property)(std::get<1>(property));
auto v = std::dynamic_pointer_cast<BaseValidator>(std::make_shared<ValidatorT>(std::get<2>(property)));
register_property_impl(std::move(p), visibility, std::move(v));
register_property_impl<visibility>(properties...);
}
template <PropertyVisibility visibility,
typename T,
PropertyMutability mutability,
typename ValueT,
typename ValidatorT,
typename... PropertyInitializer>
typename std::enable_if<std::is_same<std::function<bool(const ov::Any&)>, ValidatorT>::value, void>::type
register_property_impl(const std::tuple<ov::Property<T, mutability>, ValueT, ValidatorT>& property, PropertyInitializer&&... properties) {
auto p = std::get<0>(property)(std::get<1>(property));
auto v = std::dynamic_pointer_cast<BaseValidator>(std::make_shared<FuncValidator>(std::get<2>(property)));
register_property_impl(std::move(p), visibility, std::move(v));
register_property_impl<visibility>(properties...);
}
template <PropertyVisibility visibility, typename... PropertyInitializer>
void register_property(PropertyInitializer&&... properties) {
register_property_impl<visibility>(properties...);
}
template <typename... Properties>
util::EnableIfAllStringAny<void, Properties...> set_property(Properties&&... properties) {
set_property(ov::AnyMap{std::forward<Properties>(properties)...});
}
template <typename... Properties>
util::EnableIfAllStringAny<void, Properties...> set_user_property(Properties&&... properties) {
set_user_property(ov::AnyMap{std::forward<Properties>(properties)...});
}
template <typename T, PropertyMutability mutability>
bool is_set_by_user(const ov::Property<T, mutability>& property) const {
return is_set_by_user(property.name());
}
template <typename T, PropertyMutability mutability>
T get_property(const ov::Property<T, mutability>& property) const {
return get_property(property.name()).template as<T>();
}
void apply_user_properties(const cldnn::device_info& info);
std::string to_string() const;
protected:
void apply_hints(const cldnn::device_info& info);
void apply_performance_hints(const cldnn::device_info& info);
void apply_priority_hints(const cldnn::device_info& info);
void apply_debug_options(const cldnn::device_info& info);
private:
ov::AnyMap internal_properties;
ov::AnyMap user_properties;
std::map<std::string, PropertyVisibility> supported_properties;
std::map<std::string, BaseValidator::Ptr> property_validators;
};
} // namespace intel_gpu
} // namespace ov
namespace cldnn {
using ov::intel_gpu::ExecutionConfig;
} // namespace cldnn
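// Usage sketch (illustrative only; example_execution_config is a hypothetical helper and assumes
// ov::enable_profiling and ov::hint::inference_precision are registered by the plugin defaults):
// construct a config, layer a user property on top of it, and read a value back.
inline bool example_execution_config() {
    ov::intel_gpu::ExecutionConfig config{ov::enable_profiling(true)};
    config.set_user_property(ov::hint::inference_precision(ov::element::f16));
    return config.get_property(ov::enable_profiling);
}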


@ -0,0 +1,99 @@
// Copyright (C) 2018-2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#pragma once
#include "openvino/runtime/properties.hpp"
#include "openvino/runtime/intel_gpu/properties.hpp"
#include "intel_gpu/primitives/implementation_desc.hpp"
namespace ov {
namespace intel_gpu {
/**
* @brief Read-only property to get GPU driver version
*/
static constexpr Property<std::string, PropertyMutability::RO> driver_version{"GPU_DRIVER_VERSION"};
/**
* @brief Read-only property to get GPU driver version
*/
static constexpr Property<std::string, PropertyMutability::RO> device_id{"GPU_DEVICE_ID"};
enum class QueueTypes : int16_t {
in_order,
out_of_order
};
inline std::ostream& operator<<(std::ostream& os, const QueueTypes& val) {
switch (val) {
case QueueTypes::in_order: os << "in-order"; break;
case QueueTypes::out_of_order: os << "out-of-order"; break;
default: os << "unknown";
}
return os;
}
/**
* @brief Defines queue type that must be used for model execution
*/
static constexpr Property<QueueTypes, PropertyMutability::RW> queue_type{"GPU_QUEUE_TYPE"};
static constexpr Property<bool, PropertyMutability::RW> enable_memory_pool{"GPU_ENABLE_MEMORY_POOL"};
static constexpr Property<bool, PropertyMutability::RW> optimize_data{"GPU_OPTIMIZE_DATA"};
static constexpr Property<bool, PropertyMutability::RW> allow_static_input_reorder{"GPU_ALLOW_STATIC_INPUT_REORDER"};
static constexpr Property<bool, PropertyMutability::RW> partial_build_program{"GPU_PARTIAL_BUILD"};
static constexpr Property<bool, PropertyMutability::RW> allow_new_shape_infer{"GPU_ALLOW_NEW_SHAPE_INFER"};
static constexpr Property<std::string, PropertyMutability::RW> dump_graphs{"GPU_DUMP_GRAPHS"};
static constexpr Property<std::vector<std::string>, PropertyMutability::RW> custom_outputs{"GPU_CUSTOM_OUTPUTS"};
/// @brief Tuning mode.
enum class TuningMode {
/// @brief Tuning is disabled.
tuning_disabled,
/// @brief Tuning using the cached data (no on-line tuning for non-existing data).
tuning_use_cache,
/// @brief Tuning using the cached data if exist, tune and update cache otherwise.
tuning_tune_and_cache,
/// @brief Tuning using the cached data and update tasks.
/// @details Performs updating tasks like removal of invalid caches, promoting to new format, etc.
/// No tuning for non-existing data.
tuning_use_and_update,
/// @brief Retune the cache data even if it exists.
tuning_retune_and_cache
};
struct TuningConfig {
TuningMode mode;
std::string cache_file_path;
TuningConfig() : mode(TuningMode::tuning_disabled), cache_file_path("") {}
};
inline std::ostream& operator<<(std::ostream& os, const TuningConfig& val) {
os << val.cache_file_path;
return os;
}
static constexpr Property<TuningConfig, PropertyMutability::RW> tuning_config{"GPU_TUNING_CONFIG"};
static constexpr Property<ImplForcingMap, PropertyMutability::RW> force_implementations{"GPU_FORCE_IMPLEMENTATIONS"};
static constexpr Property<std::string, PropertyMutability::RW> config_file{"CONFIG_FILE"};
static constexpr Property<bool, PropertyMutability::RW> enable_lp_transformations{"LP_TRANSFORMS_MODE"};
static constexpr Property<bool, PropertyMutability::RW> enable_dynamic_batch{"DYN_BATCH_ENABLED"};
static constexpr Property<size_t, PropertyMutability::RW> max_dynamic_batch{"DYN_BATCH_LIMIT"};
static constexpr Property<bool, PropertyMutability::RW> exclusive_async_requests{"EXCLUSIVE_ASYNC_REQUESTS"};
static constexpr Property<bool, PropertyMutability::RW> nv12_two_inputs{"GPU_NV12_TWO_INPUTS"};
} // namespace intel_gpu
} // namespace ov
namespace cldnn {
using ov::intel_gpu::QueueTypes;
} // namespace cldnn
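// Usage sketch (illustrative only; example_internal_defaults is a hypothetical helper):
// the internal GPU properties declared above compose into an ov::AnyMap just like public ones,
// which is how they are fed into ExecutionConfig::set_property.
inline ov::AnyMap example_internal_defaults() {
    return {ov::intel_gpu::queue_type(ov::intel_gpu::QueueTypes::in_order),
            ov::intel_gpu::optimize_data(true)};
}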


@ -9,6 +9,8 @@
#include <functional>
#include <iostream>
#include "kernel.hpp"
namespace cldnn {
struct primitive_impl;


@ -7,6 +7,7 @@
#include "event.hpp"
#include "kernel.hpp"
#include "kernel_args.hpp"
#include "execution_config.hpp"
#include <memory>
#include <vector>
@ -20,7 +21,7 @@ namespace cldnn {
class stream {
public:
using ptr = std::shared_ptr<stream>;
explicit stream(queue_types queue_type) : queue_type(queue_type) {}
explicit stream(QueueTypes queue_type) : queue_type(queue_type) {}
virtual ~stream() = default;
virtual void flush() const = 0;
@ -39,16 +40,16 @@ public:
virtual event::ptr create_user_event(bool set) = 0;
virtual event::ptr create_base_event() = 0;
queue_types get_queue_type() const { return queue_type; }
QueueTypes get_queue_type() const { return queue_type; }
static queue_types detect_queue_type(engine_types engine_type, void* queue_handle);
static QueueTypes detect_queue_type(engine_types engine_type, void* queue_handle);
#ifdef ENABLE_ONEDNN_FOR_GPU
virtual dnnl::stream& get_onednn_stream() const = 0;
virtual dnnl::stream& get_onednn_stream() = 0;
#endif
protected:
queue_types queue_type;
QueueTypes queue_type;
};
} // namespace cldnn


@ -12,8 +12,8 @@ class CompilationContext : public ICompilationContext {
public:
using compilation_queue_t = InferenceEngine::ThreadSafeQueue<ICompilationContext::Task>;
CompilationContext(cldnn::engine& engine, size_t program_id) {
_kernels_cache = cldnn::make_unique<kernels_cache>(engine, program_id, kernel_selector::KernelBase::get_db().get_batch_header_str());
CompilationContext(cldnn::engine& engine, const ExecutionConfig& config, size_t program_id) {
_kernels_cache = cldnn::make_unique<kernels_cache>(engine, config, program_id, nullptr, kernel_selector::KernelBase::get_db().get_batch_header_str());
_worker = std::thread([this](){
while (!_stop_compilation) {
CompilationContext::Task task;
@ -47,8 +47,8 @@ private:
std::atomic_bool _stop_compilation{false};
};
std::unique_ptr<ICompilationContext> ICompilationContext::create(cldnn::engine& engine, size_t program_id) {
return cldnn::make_unique<CompilationContext>(engine, program_id);
std::unique_ptr<ICompilationContext> ICompilationContext::create(cldnn::engine& engine, const ExecutionConfig& config, size_t program_id) {
return cldnn::make_unique<CompilationContext>(engine, config, program_id);
}
} // namespace cldnn


@ -46,7 +46,7 @@ void add_required_reorders::add_reorder(program& p, program_node* node, program_
}
void add_required_reorders::run(program& p) {
bool optimize_data = p.get_options().get<build_option_type::optimize_data>()->enabled();
bool optimize_data = p.get_config().get_property(ov::intel_gpu::optimize_data);
auto usr_itr = p.get_processing_order().begin();
while (usr_itr != p.get_processing_order().end()) {
auto& usr = *usr_itr++;


@ -26,7 +26,7 @@ void compile_graph::run(program& p) {
}
}
auto task_executor = p.get_engine().get_task_executor();
auto task_executor = p.get_task_executor();
auto& proc_order = p.get_processing_order();
std::vector<InferenceEngine::Task> tasks;
std::exception_ptr exception;


@ -400,9 +400,9 @@ void graph_initializations::handle_dynamic_lstm_node(program& p, lstm_dynamic_no
}
void graph_initializations::set_outputs(program& p) {
auto outputs_option = p.get_options().get<build_option_type::outputs>();
if (!outputs_option->outputs.empty()) {
for (auto const& output : outputs_option->outputs) {
auto custom_outputs = p.get_config().get_property(ov::intel_gpu::custom_outputs);
if (!custom_outputs.empty()) {
for (auto const& output : custom_outputs) {
auto o_node = p.get_node_ptr(output);
o_node->set_output(true);
p.outputs.push_back(o_node.get());


@ -29,7 +29,7 @@ void pre_replace_deconv::run(program& p) {
if (node->is_type<deconvolution>()) {
if (node->is_dynamic())
continue;
if (!p.get_options().get<build_option_type::optimize_data>()->enabled())
if (!p.get_config().get_property(ov::intel_gpu::optimize_data))
continue;
auto& deconv_node = node->as<deconvolution>();


@ -63,7 +63,7 @@ struct concat_in_place_optimization : pattern_match_optimization_typed<concat_in
};
bool concat_noop_optimization::match(concatenation_node& node) {
if (node.is_output() && !get_program().is_debug_build())
if (node.is_output())
return false;
if (node.is_dynamic())
return false;
@ -82,7 +82,7 @@ bool concat_noop_optimization::optimize(concatenation_node& node) {
}
bool concat_in_place_optimization::match(concatenation_node& node) {
if (node.is_output() && !get_program().is_debug_build())
if (node.is_output())
return false;
if (node.has_fused_primitives() || !node.get_fused_activations_funcs().empty())
return false;
@ -191,8 +191,7 @@ bool concat_in_place_optimization::match(concatenation_node& node) {
// if an input is marked as network output, prevent optimizations
// which would affect a form of its output (unless debug flag is set),
// we also need to restrict input types to those which support padding on all axis
if ((input.first->is_output() && !get_program().is_debug_build()) ||
!input.first->is_padding_supported(concat_axis, lower_padd_in_axis))
if (input.first->is_output() || !input.first->is_padding_supported(concat_axis, lower_padd_in_axis))
return false;
// TODO: Investigate if this condition is needed
@ -306,7 +305,6 @@ static bool can_reshape_be_optimized(const reshape_node& node) {
// ToDo remove friendship relation from program_node
void prepare_buffer_fusing::run(program& p) {
bool is_debug = p.get_options().get<build_option_type::debug>()->enabled();
/*
We need to take care of proper ordering by types.
1. Concats
@ -348,10 +346,10 @@ void prepare_buffer_fusing::run(program& p) {
if (!can_optimize(node))
continue;
// zero copy
program_helpers::do_for_types<crop>(*node, [&p, is_debug](crop_node& node) {
program_helpers::do_for_types<crop>(*node, [&p](crop_node& node) {
// if the node is marked as network output, prevent optimizations which would affect a form of its output,
// unless debug flag is set
if (node.is_output() && !is_debug)
if (node.is_output())
return;
// do not optimize when next node is concatenation which is not output


@ -227,13 +227,12 @@ void prepare_primitive_fusing::fuse_reorders(program &p) {
}
void prepare_primitive_fusing::fuse_activations(program &p) {
bool is_debug = p.get_options().get<build_option_type::debug>()->enabled();
std::map<primitive_id, std::vector<std::pair<primitive_id, size_t>>> fusing_history;
bool use_onednn_impls = false;
#ifdef ENABLE_ONEDNN_FOR_GPU
auto& engine = p.get_engine();
if (engine.get_device_info().supports_immad && engine.configuration().queue_type == queue_types::in_order)
if (engine.get_device_info().supports_immad && p.get_config().get_property(ov::intel_gpu::queue_type) == QueueTypes::in_order)
use_onednn_impls = true;
#endif
@ -242,7 +241,7 @@ void prepare_primitive_fusing::fuse_activations(program &p) {
auto node_itr = itr++;
auto& node = (*node_itr);
program_helpers::do_for_types<activation>(*node, [&p, &is_debug, &fusing_history, &use_onednn_impls](activation_node& node) {
program_helpers::do_for_types<activation>(*node, [&p, &fusing_history, &use_onednn_impls](activation_node& node) {
auto& input = node.input();
auto id = node.id();
// Restrictions:
@ -251,7 +250,7 @@ void prepare_primitive_fusing::fuse_activations(program &p) {
// - no activation additional input
// - input was optimized
// - can't have fused primitives
if (node.has_padded_dependency() || (input.is_output() && !is_debug) || node.is_output() ||
if (node.has_padded_dependency() || input.is_output() || node.is_output() ||
node.get_dependencies().size() != 1 || input.can_be_optimized() || node.is_constant() ||
node.has_fused_primitives())
return;


@ -24,7 +24,7 @@ void propagate_constants::run(program& p) {
handle_constant(p, *node);
}
auto&& to_replace = calculate(p.get_engine(), p.get_options());
auto&& to_replace = calculate(p.get_engine(), p.get_config(), p.get_task_executor());
// remove all nodes which are no longer relevant, i.e. nodes which:
// 1. are constants, and
@ -108,13 +108,16 @@ bool propagate_constants::has_non_const_user(program_node& node) const {
return false;
}
std::list<std::pair<primitive_id, memory::ptr>> propagate_constants::calculate(engine& engine, build_options bo) {
std::list<std::pair<primitive_id, memory::ptr>> propagate_constants::calculate(engine& engine,
const ExecutionConfig& config,
std::shared_ptr<InferenceEngine::CPUStreamsExecutor> task_executor) {
if (!has_non_trivial_constants)
return {};
bo.set_option(build_option::optimize_data(false));
bo.set_option(build_option::outputs(const_outputs));
network::ptr net = network::build_network(engine, nodes, bo, true);
ExecutionConfig cf_config = config;
cf_config.set_property(ov::intel_gpu::optimize_data(false));
cf_config.set_property(ov::intel_gpu::custom_outputs(const_outputs));
network::ptr net = network::build_network(engine, nodes, cf_config, task_executor, true);
for (auto& cin : const_inputs)
net->set_input_data(cin->id(), cin->get_attached_memory_ptr());


@ -30,6 +30,7 @@ void select_preferred_formats::run(program& p) {
return;
#ifdef ENABLE_ONEDNN_FOR_GPU
engine.create_onednn_engine(p.get_config());
for (auto n : p.get_processing_order()) {
// Onednn primitive descriptor creation may fail, for example, due to asymmetric weight.
try {


@ -75,10 +75,10 @@ public:
uint32_t dilation_x = dilation.size() >= 1 ? dilation[dilation.size() - 1] : 1;
params.dilation = {dilation_x, dilation_y, dilation_z};
const auto& tuning_config = impl_param.get_program().get_options().get<build_option_type::tuning_config>();
const auto& tuning_config = impl_param.get_program().get_config().get_property(ov::intel_gpu::tuning_config);
if (tuning_config->config.mode == tuning_mode::tuning_tune_and_cache ||
tuning_config->config.mode == tuning_mode::tuning_retune_and_cache) {
if (tuning_config.mode == ov::intel_gpu::TuningMode::tuning_tune_and_cache ||
tuning_config.mode == ov::intel_gpu::TuningMode::tuning_retune_and_cache) {
optional_params.tuningParams.runner =
std::make_shared<gpu::kernel_runner>(impl_param.get_program().get_engine(), impl_param.get_program().get_id(), true);
}


@ -166,10 +166,10 @@ public:
auto& kernel_selector = kernel_selector::convolution_kernel_selector::Instance();
const auto& tuning_config = arg.get_program().get_options().get<build_option_type::tuning_config>();
const auto& tuning_config = impl_param.get_program().get_config().get_property(ov::intel_gpu::tuning_config);
if (tuning_config->config.mode == tuning_mode::tuning_tune_and_cache ||
tuning_config->config.mode == tuning_mode::tuning_retune_and_cache) {
if (tuning_config.mode == ov::intel_gpu::TuningMode::tuning_tune_and_cache ||
tuning_config.mode == ov::intel_gpu::TuningMode::tuning_retune_and_cache) {
conv_optional_params.tuningParams.runner =
std::make_shared<gpu::kernel_runner>(arg.get_program().get_engine(), arg.get_program().get_id(), true, true);
}


@ -108,6 +108,7 @@ public:
static std::unique_ptr<primitive_impl> create(const concatenation_node& arg, const kernel_impl_params& impl_params) {
auto& engine = impl_params.prog->get_engine();
auto& config = impl_params.prog->get_config();
if (arg.can_be_optimized())
return make_unique<concatenation_onednn>(engine);
auto prim = impl_params.typed_desc<concatenation>();
@ -116,7 +117,7 @@ public:
std::shared_ptr<void> dummy = nullptr;
return cldnn::make_unique<concatenation_onednn>(engine, dummy, attr, *desc);
return cldnn::make_unique<concatenation_onednn>(engine, config, dummy, attr, *desc);
}
};


@ -190,11 +190,12 @@ public:
static std::unique_ptr<primitive_impl> create(const convolution_node& arg, const kernel_impl_params& impl_params) {
auto& engine = impl_params.prog->get_engine();
auto& config = impl_params.prog->get_config();
auto desc = get_convolution_descriptor(impl_params);
auto attr = get_primitive_attributes(arg);
dnnl::primitive_desc prim_desc{&desc->data, attr.get(), engine.get_onednn_engine(), nullptr};
return cldnn::make_unique<convolution_onednn>(engine, desc, attr, prim_desc, get_weights_reorder(impl_params, prim_desc, arg.get_transposed()));
return cldnn::make_unique<convolution_onednn>(engine, config, desc, attr, prim_desc, get_weights_reorder(impl_params, prim_desc, arg.get_transposed()));
}
};


@ -115,11 +115,12 @@ public:
static std::unique_ptr<primitive_impl> create(const deconvolution_node& arg, const kernel_impl_params& impl_params) {
auto& engine = impl_params.prog->get_engine();
auto& config = impl_params.prog->get_config();
auto desc = get_deconvolution_descriptor(impl_params);
auto attr = get_primitive_attributes(arg);
dnnl::primitive_desc prim_desc{&desc->data, attr.get(), engine.get_onednn_engine(), nullptr};
return cldnn::make_unique<deconvolution_onednn>(engine, desc, attr, prim_desc, get_weights_reorder(impl_params, prim_desc));
return cldnn::make_unique<deconvolution_onednn>(engine, config, desc, attr, prim_desc, get_weights_reorder(impl_params, prim_desc));
}
};


@ -177,11 +177,12 @@ public:
static std::unique_ptr<primitive_impl> create(const fully_connected_node& arg, const kernel_impl_params& impl_params) {
auto& engine = impl_params.prog->get_engine();
auto& config = impl_params.prog->get_config();
auto desc = get_fully_connected_descriptor(impl_params);
auto attr = arg.get_onednn_primitive_attributes();
dnnl::primitive_desc prim_desc{&desc->data, attr.get(), engine.get_onednn_engine(), nullptr};
return cldnn::make_unique<fully_connected_onednn>(engine, desc, attr, prim_desc, get_weights_reorder(impl_params, prim_desc));
return cldnn::make_unique<fully_connected_onednn>(engine, config, desc, attr, prim_desc, get_weights_reorder(impl_params, prim_desc));
}
};


@ -158,11 +158,12 @@ public:
static std::unique_ptr<primitive_impl> create(const gemm_node& arg, const kernel_impl_params& impl_params) {
auto& engine = impl_params.prog->get_engine();
auto& config = impl_params.prog->get_config();
auto desc = get_gemm_descriptor(impl_params);
auto attr = arg.get_onednn_primitive_attributes();
dnnl::primitive_desc prim_desc{&desc->data, attr.get(), engine.get_onednn_engine(), nullptr};
return cldnn::make_unique<gemm_onednn>(engine, desc, attr, prim_desc);
return cldnn::make_unique<gemm_onednn>(engine, config, desc, attr, prim_desc);
}
};


@ -102,11 +102,12 @@ public:
static std::unique_ptr<primitive_impl> create(const pooling_node& arg, const kernel_impl_params& impl_params) {
auto& engine = impl_params.prog->get_engine();
auto& config = impl_params.prog->get_config();
auto desc = get_pooling_descriptor(impl_params);
auto attr = arg.get_onednn_primitive_attributes();
dnnl::primitive_desc prim_desc{&desc->data, attr.get(), engine.get_onednn_engine(), nullptr};
return cldnn::make_unique<pooling_onednn>(engine, desc, attr, prim_desc);
return cldnn::make_unique<pooling_onednn>(engine, config, desc, attr, prim_desc);
}
};


@ -40,6 +40,7 @@ struct typed_primitive_onednn_impl : public typed_primitive_impl<PType> {
std::unordered_map<uint32_t, std::unordered_map<int, dnnl::memory>> _args;
typed_primitive_onednn_impl(const engine& engine,
const ExecutionConfig& config,
std::shared_ptr<DescType> desc,
std::shared_ptr<dnnl::primitive_attr> attrs,
const PrimDescType& pd,
@ -49,7 +50,7 @@ struct typed_primitive_onednn_impl : public typed_primitive_impl<PType> {
_desc(desc),
_attrs(attrs),
_pd(pd) {
build_primitive();
build_primitive(config);
}
typed_primitive_onednn_impl(const engine& engine)
@ -362,8 +363,8 @@ struct typed_primitive_onednn_impl : public typed_primitive_impl<PType> {
}
private:
std::string get_cache_directory() const {
auto path = _engine->configuration().kernels_cache_path;
std::string get_cache_directory(const ExecutionConfig& config) const {
auto path = config.get_property(ov::cache_dir);
if (path.empty()) {
return {};
}
@ -374,8 +375,8 @@ private:
return path;
}
std::string generate_cache_path_from_key(std::vector<uint8_t> key) const {
auto path = get_cache_directory();
std::string generate_cache_path_from_key(const ExecutionConfig& config, std::vector<uint8_t> key) const {
auto path = get_cache_directory(config);
if (path.empty()) {
return {};
}
@ -385,8 +386,8 @@ private:
return path + std::to_string(hash) + ".onednn.cl_cache";
}
void build_primitive() {
auto cache_outpath = get_cache_directory();
void build_primitive(const ExecutionConfig& config) {
auto cache_outpath = get_cache_directory(config);
if (const char* env_p = std::getenv("OV_GPU_CACHE_MODEL")) {
if (env_p[0] == '1') {
@ -403,7 +404,7 @@ private:
std::vector<uint8_t> cache;
{
std::lock_guard<std::mutex> lock(cacheAccessMutex);
cache = ov::util::load_binary(generate_cache_path_from_key(key));
cache = ov::util::load_binary(generate_cache_path_from_key(config, key));
}
if (cache.empty()) {
@ -412,7 +413,7 @@ private:
{
std::lock_guard<std::mutex> lock(cacheAccessMutex);
ov::util::save_binary(generate_cache_path_from_key(key), cache);
ov::util::save_binary(generate_cache_path_from_key(config, key), cache);
}
} else {
_prim = PrimType(_pd, cache);
@ -563,9 +564,8 @@ protected:
event::ptr execute_impl(const std::vector<event::ptr>& /* events */,
typed_primitive_inst<PType>& instance) override {
auto& network = instance.get_network();
auto& engine = network.get_engine();
auto& stream = network.get_stream();
auto profiling = engine.configuration().enable_profiling;
auto profiling = network.get_config().get_property(ov::enable_profiling);
auto net_id = network.get_id();
event::ptr event;
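Common to the onednn implementation hunks above: build- and run-time options are now read from an ExecutionConfig property map instead of engine.configuration(). Below is a minimal hedged sketch of that API, restricted to the two properties these hunks actually use (ov::cache_dir, ov::enable_profiling); the include path and the wrapper function are assumptions, not code from this patch.

#include "openvino/runtime/intel_gpu/properties.hpp"  // property keys; this header appears elsewhere in the diff
// The ExecutionConfig header path is assumed and not shown in this patch.

void sketch_configure(ExecutionConfig& config) {
    config.set_property(ov::cache_dir("/tmp/ov_gpu_cache"));  // replaces engine configuration().kernels_cache_path
    config.set_property(ov::enable_profiling(true));          // replaces engine.configuration().enable_profiling

    // Read side, mirroring get_cache_directory(config) and execute_impl() above:
    std::string cache_dir = config.get_property(ov::cache_dir);
    bool profiling        = config.get_property(ov::enable_profiling);
    (void)cache_dir; (void)profiling;
}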

View File

@ -118,11 +118,12 @@ public:
static std::unique_ptr<primitive_impl> create(const reduce_node& arg, const kernel_impl_params& impl_params) {
auto& engine = impl_params.prog->get_engine();
auto& config = impl_params.prog->get_config();
auto desc = get_reduction_descriptor(impl_params);
auto attr = arg.get_onednn_primitive_attributes();
dnnl::primitive_desc prim_desc{&desc->data, attr.get(), engine.get_onednn_engine(), nullptr};
return cldnn::make_unique<reduction_onednn>(engine, desc, attr, prim_desc);
return cldnn::make_unique<reduction_onednn>(engine, config, desc, attr, prim_desc);
}
};

View File

@ -86,12 +86,13 @@ public:
static std::unique_ptr<primitive_impl> create(const reorder_node& arg, const kernel_impl_params& impl_params) {
auto& engine = impl_params.prog->get_engine();
auto& config = impl_params.prog->get_config();
auto attr = arg.get_onednn_primitive_attributes();
auto desc = get_reorder_descriptor(impl_params, *attr, impl_params.prog->get_engine());
std::shared_ptr<void> dummy = nullptr;
return cldnn::make_unique<reorder_onednn>(engine, dummy, attr, *desc);
return cldnn::make_unique<reorder_onednn>(engine, config, dummy, attr, *desc);
}
};

View File

@ -113,7 +113,7 @@ dnnl::memory::desc create_memory_desc_from_format_string(dnnl::memory::dims dims
template <typename T>
cldnn::memory::ptr convert_zp_data_to_s32(const memory::ptr zp_memory) {
auto engine = zp_memory->get_engine();
auto& stream = engine->get_program_stream();
auto& stream = engine->get_service_stream();
auto zp_s32_layout = zp_memory->get_layout();
zp_s32_layout.data_type = data_types::i32;
@ -493,7 +493,7 @@ template <typename T>
bool is_per_tensor(cldnn::data_node& node, int32_t& zp_val) {
auto ptr = node.get_attached_memory_ptr();
auto engine = ptr->get_engine();
auto& stream = engine->get_program_stream();
auto& stream = engine->get_service_stream();
auto num_elems = node.get_output_layout().count();
mem_lock<T, mem_lock_type::read> old_data {ptr, stream};
auto val = old_data[0];

View File

@ -17,7 +17,7 @@ public:
virtual void cancel() noexcept = 0;
virtual ~ICompilationContext() = default;
static std::unique_ptr<ICompilationContext> create(cldnn::engine& engine, size_t program_id);
static std::unique_ptr<ICompilationContext> create(cldnn::engine& engine, const ExecutionConfig& config, size_t program_id);
};
} // namespace cldnn

View File

@ -26,7 +26,7 @@ private:
add_or_change_input_layout(node);
_program = program::build_program(node.get_program().get_engine(),
_topology,
node.get_program().get_options(),
node.get_program().get_config(),
true); // rebuild program
}
program::ptr get() const { return _program; }

View File

@ -101,7 +101,7 @@ kernel_selector::data_layout to_data_layout(format f);
cldnn::format from_data_layout(kernel_selector::data_layout l);
kernel_selector::weights_layout to_weights_layout(format f, bool is_grouped);
cldnn::format::type from_weights_layout(kernel_selector::weights_layout l);
kernel_selector::tuning_mode to_tuning_mode(cldnn::tuning_mode mode);
kernel_selector::tuning_mode to_tuning_mode(ov::intel_gpu::TuningMode mode);
kernel_selector::data_tensor convert_data_tensor(const layout& l, const tensor view_offset = tensor {});
kernel_selector::weights_tensor convert_weights_tensor(const layout& l, bool is_grouped = false);
layout from_weights_tensor(const kernel_selector::weights_tensor& t);

View File

@ -199,7 +199,7 @@ public:
void set_optimization_attribute(optimization_attributes_type attribute, int32_t val);
optimization_attributes get_optimization_attributes() { return _optimization_attributes; }
void set_implementation_forcing(const implementation_forcing_map& map);
void set_implementation_forcing(const ov::intel_gpu::ImplForcingMap& map);
void update_formats_map(const convolution_node& node);
bool is_format_optimized(const convolution_node& node, const format& format, bool use_weak_restrictions = false);

View File

@ -311,10 +311,10 @@ public:
output_names.insert(get_condition_id());
}
auto opts = get_program().get_options();
std::vector<primitive_id> output_names_vec(output_names.begin(), output_names.end());
opts.set_option(build_option::outputs(output_names_vec));
body_program = program::build_program(get_program().get_engine(), body, opts, false, false, true);
auto config = get_program().get_config();
config.set_property(ov::intel_gpu::custom_outputs(output_names_vec));
body_program = program::build_program(get_program().get_engine(), body, config, false, false, true);
}
const primitive_id& get_trip_count_id() const { return get_primitive()->trip_count_id; }
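For the loop body rebuild above, ov::intel_gpu::custom_outputs replaces build_option::outputs. A compact hedged sketch of the copy-modify-build pattern follows; `parent` and `body_topology` are illustrative names, not identifiers from this patch.

auto config = parent.get_program().get_config();                   // copy, so the parent's config stays untouched
std::vector<primitive_id> outputs_vec(output_names.begin(), output_names.end());
config.set_property(ov::intel_gpu::custom_outputs(outputs_vec));   // was build_option::outputs
auto body_program = program::build_program(parent.get_program().get_engine(), body_topology, config,
                                           false /*is_internal*/, false /*no_optimizations*/, true /*is_body_program*/);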

View File

@ -276,7 +276,9 @@ public:
private:
void run(program& p) override;
std::list<std::pair<primitive_id, memory::ptr>> calculate(engine& engine, build_options bo);
std::list<std::pair<primitive_id, memory::ptr>> calculate(engine& engine,
const ExecutionConfig& config,
std::shared_ptr<InferenceEngine::CPUStreamsExecutor> task_executor);
bool has_non_const_user(program_node& node) const;
void handle_constant(program& prog, program_node& node);
void add_constant(program& prog, program_node& node);

View File

@ -8,8 +8,7 @@
#include <string>
namespace cldnn {
std::string get_dir_path(build_options);
std::string get_serialization_network_name(build_options);
std::string get_dir_path(const ExecutionConfig& config);
void dump_graph_optimized(std::ofstream&, const program&);
void dump_graph_processing_order(std::ofstream&, const program&);

View File

@ -162,7 +162,7 @@ void kernel_runner::prepare_kernel_args(const kernel_selector::KernelsData& kern
std::vector<std::chrono::nanoseconds> kernel_runner::run_kernels(const kernel_selector::KernelsData& kernels_data) {
std::vector<std::chrono::nanoseconds> run_times;
stream::ptr stream = _engine.create_stream();
stream::ptr stream = _engine.create_stream({});
int num_of_kernels_to_run = static_cast<int>(kernels_data.size());
int num_of_kernels_run = 0;
@ -174,7 +174,7 @@ std::vector<std::chrono::nanoseconds> kernel_runner::run_kernels(const kernel_se
batch_end = batch_start + current_compilation_batch;
std::vector<kernel::ptr> kernels;
kernels_cache cache(_engine, program_id);
kernels_cache cache(_engine, {}, program_id);
for (auto it = batch_start; it < batch_end; it++) {
auto kernel_id = cache.set_kernel_source(it->kernels[0].code.kernelString, false);

View File

@ -819,17 +819,17 @@ cldnn::format::type from_weights_layout(kernel_selector::weights_layout l) {
}
}
kernel_selector::tuning_mode to_tuning_mode(cldnn::tuning_mode mode) {
kernel_selector::tuning_mode to_tuning_mode(ov::intel_gpu::TuningMode mode) {
switch (mode) {
case cldnn::tuning_mode::tuning_disabled:
case ov::intel_gpu::TuningMode::tuning_disabled:
return kernel_selector::tuning_mode::TUNING_DISABLED;
case cldnn::tuning_mode::tuning_use_cache:
case ov::intel_gpu::TuningMode::tuning_use_cache:
return kernel_selector::tuning_mode::TUNING_USE_CACHE;
case cldnn::tuning_mode::tuning_tune_and_cache:
case ov::intel_gpu::TuningMode::tuning_tune_and_cache:
return kernel_selector::tuning_mode::TUNING_TUNE_AND_CACHE;
case cldnn::tuning_mode::tuning_use_and_update:
case ov::intel_gpu::TuningMode::tuning_use_and_update:
return kernel_selector::tuning_mode::TUNING_USE_AND_UPDATE;
case cldnn::tuning_mode::tuning_retune_and_cache:
case ov::intel_gpu::TuningMode::tuning_retune_and_cache:
return kernel_selector::tuning_mode::TUNING_RETUNE_AND_CACHE;
default:
return kernel_selector::tuning_mode::TUNING_DISABLED;
@ -1041,8 +1041,7 @@ void set_params(const kernel_impl_params& param_info, kernel_selector::params& p
params.engineInfo.supportedSimdSizes = device_info.supported_simd_sizes;
params.engineInfo.vendor_id = device_info.vendor_id;
auto impl_forcing_bo = program->get_options().get<build_option_type::force_implementations>();
const auto& impl_forcing = impl_forcing_bo->forcing;
auto impl_forcing = program->get_config().get_property(ov::intel_gpu::force_implementations);
if (impl_forcing.count(param_info.desc->id) != 0) {
params.forceImplementation = impl_forcing.at(param_info.desc->id).kernel_name;
@ -1051,14 +1050,14 @@ void set_params(const kernel_impl_params& param_info, kernel_selector::params& p
void set_optional_params(const program& program, kernel_selector::optional_params& params) {
params.meaningfulKernelsNames = false;
params.allowStaticInputReordering = program.get_options().get<build_option_type::optimize_data>()->enabled() ||
program.get_options().get<build_option_type::allow_static_input_reorder>()->enabled();
params.allowStaticInputReordering = program.get_config().get_property(ov::intel_gpu::optimize_data) ||
program.get_config().get_property(ov::intel_gpu::allow_static_input_reorder);
params.allowInputReordering = false;
params.allowOutputReordering = false;
const auto& tuning_config = program.get_options().get<build_option_type::tuning_config>();
params.tuningParams.mode = to_tuning_mode(tuning_config->config.mode);
params.tuningParams.cacheFilePath = tuning_config->config.cache_file_path;
const auto& tuning_config = program.get_config().get_property(ov::intel_gpu::tuning_config);
params.tuningParams.mode = to_tuning_mode(tuning_config.mode);
params.tuningParams.cacheFilePath = tuning_config.cache_file_path;
}
void kernel_impl_params::save(BinaryOutputBuffer& ob) const {
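The same property map also replaces the force_implementations build option. A minimal hedged sketch of the lookup performed in set_params() above; `prim_id` is an illustrative primitive id, and the descriptor field (kernel_name) is the one referenced in this diff.

auto impl_forcing = program->get_config().get_property(ov::intel_gpu::force_implementations);
if (impl_forcing.count(prim_id) != 0) {
    params.forceImplementation = impl_forcing.at(prim_id).kernel_name;  // same access pattern as above
}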

View File

@ -1416,7 +1416,7 @@ impl_types layout_optimizer::get_preferred_impl_type(program_node& node, format
const size_t kBatchNum = scores_layout.batch();
const size_t kClassNum = scores_layout.feature();
const size_t kNStreams =
static_cast<size_t>(node.get_program().get_engine().configuration().throughput_streams);
static_cast<size_t>(node.get_program().get_config().get_property(ov::streams::num));
const size_t kKeyValue = kBatchNum * std::min(kClassNum, static_cast<size_t>(8)) * kNStreams;
preferred_impl = (kKeyValue > 64) ? impl_types::ocl : impl_types::cpu;
}
@ -1668,7 +1668,7 @@ format layout_optimizer::get_preferred_format(program_node& node) {
auto output_layout = node.get_output_layout();
bool use_onednn_impls = _optimization_attributes.use_onednn_impls;
bool allow_new_shape_infer = node.get_program().get_options().get<build_option_type::allow_new_shape_infer>()->enabled();
bool allow_new_shape_infer = node.get_program().get_config().get_property(ov::intel_gpu::allow_new_shape_infer);
if (allow_new_shape_infer) {
if (node.is_type<shape_of>())
@ -2013,7 +2013,7 @@ bool layout_optimizer::is_format_optimized(const deconvolution_node& node, const
}
}
void layout_optimizer::set_implementation_forcing(const implementation_forcing_map& map) {
void layout_optimizer::set_implementation_forcing(const ov::intel_gpu::ImplForcingMap& map) {
for (const auto& kv : map) {
_forcing_map.emplace(kv.first, std::make_pair(kv.second.output_format, kv.second.impl_type));
}

View File

@ -277,8 +277,9 @@ static uint32_t get_unique_net_id() {
Network will always have net_id = 0 when it is a cldnn internal micronetwork (created e.g. by the propagate_constants
opt pass).
*/
network::network(program::ptr program, stream::ptr stream, bool is_internal, bool is_primary_stream)
network::network(program::ptr program, const ExecutionConfig& config, stream::ptr stream, bool is_internal, bool is_primary_stream)
: _program(program)
, _config(config)
, _engine(program->get_engine())
, _stream(stream)
, _memory_pool(new memory_pool(program->get_engine()))
@ -304,34 +305,42 @@ network::network(program::ptr program, stream::ptr stream, bool is_internal, boo
if (is_dynamic()) {
GPU_DEBUG_DEFINE_MEM_LOGGER("dynamic_network_initialization");
_kernels_cache = std::unique_ptr<kernels_cache>(new kernels_cache(program->get_engine(), program->get_id(),
_kernels_cache = std::unique_ptr<kernels_cache>(new kernels_cache(program->get_engine(),
program->get_config(),
program->get_id(),
program->get_task_executor(),
kernel_selector::KernelBase::get_db().get_batch_header_str()));
_impls_cache = std::unique_ptr<ImplementationsCache>(new ImplementationsCache(_impls_cache_capacity));
_in_mem_kernels_cache = std::unique_ptr<KernelsCache>(new KernelsCache(_in_mem_kernels_cache_capacity));
_compilation_context = std::move(ICompilationContext::create(program->get_engine(), program->get_id()));
_compilation_context = std::move(ICompilationContext::create(program->get_engine(), program->get_config(), program->get_id()));
}
}
network::network(engine& engine,
const topology& topo,
const build_options& options,
const ExecutionConfig& config,
bool is_internal)
: network(program::build_program(engine, topo, options, is_internal), engine.create_stream(), is_internal) {}
: network(program::build_program(engine, topo, config, is_internal), config, engine.create_stream(config), is_internal) {}
network::network(engine& engine,
const std::set<std::shared_ptr<program_node>>& nodes,
const build_options& options,
const ExecutionConfig& config,
std::shared_ptr<InferenceEngine::CPUStreamsExecutor> task_executor,
bool is_internal)
: network(program::build_program(engine, nodes, options, is_internal), engine.create_stream(), is_internal) {}
: network(program::build_program(engine, nodes, config, task_executor, is_internal), config, engine.create_stream(config), is_internal) {}
network::network(program::ptr program, uint16_t stream_id)
: network(program, program->get_engine().create_stream(), false, stream_id == 0) {}
: network(program, program->get_config(), program->get_engine().create_stream(program->get_config()), false, stream_id == 0) {}
network::network(program::ptr program, stream::ptr stream, uint16_t stream_id)
: network(program, stream, false, stream_id == 0) {}
: network(program, program->get_config(), stream, false, stream_id == 0) {}
network::network(cldnn::BinaryInputBuffer& ib, stream::ptr stream, engine& engine, uint16_t stream_id)
: network(ib, ExecutionConfig{}, stream, engine, stream_id) {}
network::network(cldnn::BinaryInputBuffer& ib, const ExecutionConfig& config, stream::ptr stream, engine& engine, uint16_t stream_id)
: _program(nullptr)
, _config(config)
, _engine(engine)
, _stream(stream)
, _memory_pool(new memory_pool(engine))
@ -340,7 +349,7 @@ network::network(cldnn::BinaryInputBuffer& ib, stream::ptr stream, engine& engin
, _reset_arguments(true) {
net_id = get_unique_net_id();
kernels_cache kernels_cache(get_engine(), 0, {""});
kernels_cache kernels_cache(get_engine(), config, 0, nullptr, {""});
ib >> kernels_cache;
int num_data_nodes;
@ -442,7 +451,7 @@ network::~network() {
// [ executable primitive_inst ]
// [ memory reuse information ]
void network::save(cldnn::BinaryOutputBuffer& ob) {
kernels_cache kernels_cache(get_engine(), 0, {""});
kernels_cache kernels_cache(get_engine(), _config, 0, nullptr, {""});
for (const auto& p_inst : _exec_order) {
if (p_inst->get_impl() != nullptr)
kernels_cache.add_kernels(p_inst->get_impl()->get_kernel_ids(), p_inst->get_impl()->get_kernels());
@ -505,26 +514,27 @@ void network::save(cldnn::BinaryOutputBuffer& ob) {
}
network::ptr network::allocate_network(stream::ptr stream, program::ptr program, bool is_internal, bool is_primary_stream) {
return std::make_shared<network>(program, stream, is_internal, is_primary_stream);
return std::make_shared<network>(program, program->get_config(), stream, is_internal, is_primary_stream);
}
network::ptr network::allocate_network(engine& engine, program::ptr program, bool is_internal, bool is_primary_stream) {
auto stream = engine.create_stream();
return std::make_shared<network>(program, stream, is_internal, is_primary_stream);
auto stream = engine.create_stream(program->get_config());
return std::make_shared<network>(program, program->get_config(), stream, is_internal, is_primary_stream);
}
network::ptr network::build_network(engine& engine,
const topology& topology,
const build_options& options,
const ExecutionConfig& config,
bool is_internal) {
return std::make_shared<network>(engine, topology, options, is_internal);
return std::make_shared<network>(engine, topology, config, is_internal);
}
network::ptr network::build_network(engine& engine,
const std::set<std::shared_ptr<program_node>>& nodes,
const build_options& options,
const ExecutionConfig& config,
std::shared_ptr<InferenceEngine::CPUStreamsExecutor> task_executor,
bool is_internal) {
return std::make_shared<network>(engine, nodes, options, is_internal);
return std::make_shared<network>(engine, nodes, config, task_executor, is_internal);
}
void network::validate_primitives() {
@ -963,8 +973,7 @@ void network::execute_impl(const std::vector<event::ptr>& events) {
}
// Store events only in case of OOO queue or enabled Profiling
auto store_events = get_stream().get_queue_type() == queue_types::out_of_order ||
get_engine().configuration().enable_profiling;
auto store_events = get_stream().get_queue_type() == QueueTypes::out_of_order || _config.get_property(ov::enable_profiling);
if (store_events) {
if (_program != nullptr) {
for (auto& inst : _program->get_processing_order()) {
@ -1113,8 +1122,8 @@ void network::execute_primitive(const std::shared_ptr<primitive_inst>& primitive
event::ptr ev = primitive->execute(events);
// Collect events only for OOO queue and Profiling mode
if (get_stream().get_queue_type() == queue_types::out_of_order ||
get_engine().configuration().enable_profiling) {
if (get_stream().get_queue_type() == QueueTypes::out_of_order ||
get_config().get_property(ov::enable_profiling)) {
auto id = primitive->id();
_events.insert({id, ev});
}
@ -1203,7 +1212,7 @@ memory::ptr network::get_memory_from_pool(const layout& layout,
std::set<primitive_id> dependencies,
allocation_type type,
bool reusable) {
if (get_engine().configuration().use_memory_pool)
if (_config.get_property(ov::intel_gpu::enable_memory_pool))
return _memory_pool->get_memory(layout, id, get_id(), dependencies, type, reusable);
return _memory_pool->get_memory(layout, type);
}
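Taken together, the network.cpp changes make a network a (program, config, stream) triple, with the stream itself created from the config. A minimal hedged creation sketch using only signatures visible in these hunks; `engine` and `topology` are assumed to exist, and ExecutionConfig is the alias used throughout the patch.

ExecutionConfig config{ov::enable_profiling(false)};                        // brace-init with properties, as above
auto program = cldnn::program::build_program(engine, topology, config, false /*is_internal*/);
auto stream  = engine.create_stream(config);                                // stream now derives from the config
auto net     = cldnn::network::allocate_network(stream, program,
                                                false /*is_internal*/, true /*is_primary_stream*/);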

View File

@ -15,7 +15,7 @@
pass_manager::pass_manager(program& p) {
pass_count = 0;
auto path = get_dir_path(p.get_options());
auto path = get_dir_path(p.get_config());
if (!path.empty()) {
graph_opt_log.open(path + std::to_string(p.get_prog_id()) + "_cldnn_graph_optimizer.log");
if (graph_opt_log.is_open()) {

View File

@ -96,9 +96,13 @@ void primitive_inst::check_memory_to_set(const memory& mem, const layout& layout
// check shared image/buffer compatibility, if applicable
auto params = mem.get_internal_params();
if (params.mem_type != shared_mem_type::shared_mem_empty) {
if (!mem.is_allocated_by(get_network().get_engine())) {
CLDNN_ERROR_MESSAGE(_node->id(), "Memory object is not suitable");
}
auto& net_engine = get_network().get_engine();
auto& mem_engine = *mem.get_engine();
OPENVINO_ASSERT(mem.is_allocated_by(net_engine), "[GPU] Can't set memory due to engines mismatch. ",
"Network was created for ", &net_engine, " (",
net_engine.get_device_info().dev_name, ") engine",
" while memory object was allocated for ", &mem_engine, "(",
mem_engine.get_device_info().dev_name, ")");
switch (params.mem_type) {
case shared_mem_type::shared_mem_vasurface:
@ -182,7 +186,7 @@ void primitive_inst::update_shape() {
auto& dep = _node->get_dependency(i);
auto dep_id = dep.id();
// Events may be not created for in-order queue, so take them for OOO queue only
if (_network.has_event(dep.id()) && queue_type == queue_types::out_of_order) {
if (_network.has_event(dep.id()) && queue_type == QueueTypes::out_of_order) {
dependencies_events.push_back(_network.get_primitive_event(dep_id));
GPU_DEBUG_TRACE_DETAIL << id() << ": shape infer waits for " << i << " dependency\n";
}
@ -192,9 +196,9 @@ void primitive_inst::update_shape() {
}
if (has_runtime_deps) {
if (!dependencies_events.empty() && queue_type == queue_types::out_of_order) {
if (!dependencies_events.empty() && queue_type == QueueTypes::out_of_order) {
_network.get_stream().wait_for_events(dependencies_events);
} else if (queue_type == queue_types::in_order) {
} else if (queue_type == QueueTypes::in_order) {
_network.get_stream().finish();
}
}
@ -446,7 +450,7 @@ event::ptr primitive_inst::execute(const std::vector<event::ptr>& events) {
dependencies = events;
} else {
auto queue_type = get_network().get_stream().get_queue_type();
if (queue_type == queue_types::out_of_order) {
if (queue_type == QueueTypes::out_of_order) {
dependencies.reserve(dependencies.size() + _exec_deps.size());
for (auto& input : _exec_deps) {
auto id = input->id();
@ -755,7 +759,7 @@ memory::ptr primitive_inst::allocate_output(engine& _engine, memory_pool& pool,
uint32_t net_id, bool is_internal, size_t idx) {
auto get_memory_from_pool = [&](engine& _engine, const layout& layout, const primitive_id id, std::set<primitive_id> dependencies,
allocation_type type, bool reusable) {
if (_engine.configuration().use_memory_pool)
if (_node.get_program().get_config().get_property(ov::intel_gpu::enable_memory_pool))
return pool.get_memory(layout, id, net_id, dependencies, type, reusable);
return pool.get_memory(layout, type);
};
@ -933,10 +937,11 @@ cldnn::network::ptr primitive_inst::get_unfused_subgraph() {
in = _node->get_dependency(i).id();
}
}
build_options bo;
bo.set_option(build_option::allow_static_input_reorder(true));
bo.set_option(build_option::allow_new_shape_infer(true));
auto prog = program::build_program(get_network().get_engine(), t, bo, true, false);
ExecutionConfig subgraph_config{
ov::intel_gpu::allow_static_input_reorder(true),
ov::intel_gpu::allow_new_shape_infer(true)
};
auto prog = program::build_program(get_network().get_engine(), t, subgraph_config, true, false);
_unfused_subgraph = network::allocate_network(get_network().get_stream_ptr(), prog, true, get_network().is_primary_stream());
}

View File

@ -8,6 +8,8 @@
#include "intel_gpu/runtime/debug_configuration.hpp"
#include "intel_gpu/graph/program.hpp"
#include <ie_system_conf.h>
#include "kernel_selector_helper.h"
#include "device_cache_reader.h"
#include "auto_tuner.h"
@ -98,13 +100,13 @@ using namespace ov::intel_gpu;
program::program(engine& engine_ref,
topology const& topology,
build_options const& options,
const ExecutionConfig& config,
bool is_internal,
bool no_optimizations,
bool is_body_program)
: _engine(engine_ref),
_stream(_engine.create_stream()),
options(options),
_stream(_engine.create_stream(config)),
_config(config),
processing_order(),
tuning_cache(nullptr),
is_body_program(is_body_program),
@ -112,10 +114,13 @@ program::program(engine& engine_ref,
init_primitives();
set_options();
query_local_block_io_supported();
_task_executor = make_task_executor(_config);
GPU_DEBUG_INFO << "Program config\n" << config.to_string();
pm = std::unique_ptr<pass_manager>(new pass_manager(*this));
prepare_nodes(topology);
_kernels_cache = std::unique_ptr<kernels_cache>(new kernels_cache(_engine, prog_id,
_kernels_cache = std::unique_ptr<kernels_cache>(new kernels_cache(_engine, _config, prog_id, _task_executor,
kernel_selector::KernelBase::get_db().get_batch_header_str()));
program_node::reset_unique_id();
@ -128,11 +133,13 @@ program::program(engine& engine_ref,
program::program(engine& engine_ref,
std::set<std::shared_ptr<program_node>> const& nodes,
build_options const& options,
const ExecutionConfig& config,
std::shared_ptr<InferenceEngine::CPUStreamsExecutor> task_executor,
bool is_internal)
: _engine(engine_ref),
_stream(_engine.create_stream()),
options(options),
_stream(_engine.create_stream(config)),
_config(config),
_task_executor(task_executor),
processing_order(),
tuning_cache(nullptr),
is_subgroup_local_block_io_supported(-1) {
@ -140,7 +147,9 @@ program::program(engine& engine_ref,
set_options();
query_local_block_io_supported();
_kernels_cache = std::unique_ptr<kernels_cache>(new kernels_cache(_engine, prog_id,
_task_executor = make_task_executor(_config);
_kernels_cache = std::unique_ptr<kernels_cache>(new kernels_cache(_engine, _config, prog_id, _task_executor,
kernel_selector::KernelBase::get_db().get_batch_header_str()));
pm = std::unique_ptr<pass_manager>(new pass_manager(*this));
prepare_nodes(nodes);
@ -149,8 +158,8 @@ program::program(engine& engine_ref,
program::program(engine& engine)
: _engine(engine),
_stream(_engine.create_stream()),
options(build_options()),
_stream(_engine.create_stream({})),
_config(),
processing_order(),
tuning_cache(nullptr),
is_subgroup_local_block_io_supported(-1) { }
@ -171,6 +180,42 @@ void program::init_primitives() {
}
}
static void adjust_num_cores(InferenceEngine::CPUStreamsExecutor::Config& config) {
if (InferenceEngine::getAvailableCoresTypes().size() == 1) {
return;
}
const auto total_num_cores = InferenceEngine::getNumberOfLogicalCPUCores();
const auto total_num_big_cores = InferenceEngine::getNumberOfLogicalCPUCores(true);
const auto total_num_little_cores = total_num_cores - total_num_big_cores;
auto core_type = config._threadPreferredCoreType;
int num_cores = total_num_cores;
if (core_type == InferenceEngine::IStreamsExecutor::Config::BIG) {
num_cores = total_num_big_cores;
} else if (core_type == InferenceEngine::IStreamsExecutor::Config::LITTLE) {
num_cores = total_num_little_cores;
}
config._streams = std::min(config._streams, num_cores);
}
std::shared_ptr<InferenceEngine::CPUStreamsExecutor> program::make_task_executor(const ExecutionConfig& config) const {
InferenceEngine::CPUStreamsExecutor::Config task_executor_config("CPU Tasks executor for GPU plugin", 1);
task_executor_config._streams = config.get_property(ov::compilation_num_threads);
auto priority = config.get_property(ov::intel_gpu::hint::host_task_priority);
switch (priority) {
case ov::hint::Priority::LOW: task_executor_config._threadPreferredCoreType = InferenceEngine::IStreamsExecutor::Config::LITTLE; break;
case ov::hint::Priority::MEDIUM: task_executor_config._threadPreferredCoreType = InferenceEngine::IStreamsExecutor::Config::ANY; break;
case ov::hint::Priority::HIGH: task_executor_config._threadPreferredCoreType = InferenceEngine::IStreamsExecutor::Config::BIG; break;
default: OPENVINO_ASSERT(false, "[GPU] Can't create task executor: invalid host task priority value: ", priority);
}
adjust_num_cores(task_executor_config);
return std::make_shared<InferenceEngine::CPUStreamsExecutor>(task_executor_config);
}
void program::compile() {
GPU_DEBUG_DEFINE_MEM_LOGGER("compile");
_kernels_cache->build_all();
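The new make_task_executor() derives the compilation executor entirely from config properties. A short hedged sketch of the two knobs it reads; the values are illustrative.

ExecutionConfig config;
config.set_property(ov::compilation_num_threads(4));                                      // -> task_executor_config._streams
config.set_property(ov::intel_gpu::hint::host_task_priority(ov::hint::Priority::HIGH));   // -> BIG core type
// program's constructors then call make_task_executor(config) and hand the executor to kernels_cache.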
@ -190,7 +235,7 @@ void program::load_tuning_cache() {
OV_ITT_SCOPED_TASK(ov::intel_gpu::itt::domains::intel_gpu_plugin, "ProgramImpl::LoadTuningCache");
GPU_DEBUG_DEFINE_MEM_LOGGER("ProgramImpl::LoadTuningCache");
try {
tuning_cache = kernel_selector::CreateTuningCacheFromFile(get_engine().configuration().tuning_cache_path);
tuning_cache = kernel_selector::CreateTuningCacheFromFile("cache.json");
} catch (...) {
tuning_cache = std::make_shared<kernel_selector::TuningCache>();
}
@ -210,18 +255,19 @@ kernels_cache& program::get_kernels_cache() const {
program::ptr program::build_program(engine& engine,
const topology& topology,
const build_options& options,
const ExecutionConfig& config,
bool is_internal,
bool no_optimizations,
bool is_body_program) {
return std::make_shared<program>(engine, topology, options, is_internal, no_optimizations, is_body_program);
return std::make_shared<program>(engine, topology, config, is_internal, no_optimizations, is_body_program);
}
program::ptr program::build_program(engine& engine,
const std::set<std::shared_ptr<program_node>>& nodes,
const build_options& options,
const ExecutionConfig& config,
std::shared_ptr<InferenceEngine::CPUStreamsExecutor> task_executor,
bool is_internal) {
return std::make_shared<program>(engine, nodes, options, is_internal);
return std::make_shared<program>(engine, nodes, config, task_executor, is_internal);
}
program_node& program::get_node(primitive_id const& id) {
@ -449,20 +495,8 @@ void program::set_options() {
static std::atomic<uint32_t> id_gen{0};
prog_id = ++id_gen;
assert(prog_id != 0);
if ((options.get<build_option_type::tuning_config>()->config.mode == tuning_mode::tuning_tune_and_cache ||
options.get<build_option_type::tuning_config>()->config.mode == tuning_mode::tuning_retune_and_cache) &&
!_engine.configuration().enable_profiling) {
throw std::invalid_argument("Engine must be created with profiling enabled in tune_and_cache mode!");
}
GPU_DEBUG_GET_INSTANCE(debug_config);
GPU_DEBUG_IF(!debug_config->dump_graphs.empty()) {
options.set_option(cldnn::build_option::graph_dumps_dir(debug_config->dump_graphs));
}
if (!options.get<build_option_type::force_implementations>()->forcing.empty()) {
options.set_option(build_option::optimize_data(true));
if (!_config.get_property(ov::intel_gpu::force_implementations).empty()) {
_config.set_property(ov::intel_gpu::optimize_data(true));
}
}
@ -502,7 +536,7 @@ void program::query_local_block_io_supported() {
kernel_string->batch_compilation = true;
try {
auto _kernels_cache_device_query = std::unique_ptr<kernels_cache>(new kernels_cache(_engine, prog_id,
auto _kernels_cache_device_query = std::unique_ptr<kernels_cache>(new kernels_cache(_engine, _config, prog_id, nullptr,
kernel_selector::KernelBase::get_db().get_batch_header_str()));
auto id = _kernels_cache_device_query->set_kernel_source(kernel_string, false);
_kernels_cache_device_query->build_all();
@ -533,7 +567,7 @@ void program::build_program(bool is_internal) {
#endif
prepare_memory_dependencies();
if (options.get<build_option_type::partial_build_program>()->enabled()) {
if (_config.get_property(ov::intel_gpu::partial_build_program)) {
return;
}
@ -582,7 +616,8 @@ void program::pre_optimize_graph(bool is_internal) {
node->get_output_layouts();
}
if (options.get<build_option_type::optimize_data>()->enabled()) {
bool optimize_data = _config.get_property(ov::intel_gpu::optimize_data);
if (optimize_data) {
apply_opt_pass<prepare_quantization>();
}
@ -590,7 +625,7 @@ void program::pre_optimize_graph(bool is_internal) {
set_layout_optimizer_attributes(lo);
reorder_factory rf;
if (options.get<build_option_type::optimize_data>()->enabled()) {
if (optimize_data) {
apply_opt_pass<prepare_primitive_fusing_through>();
apply_opt_pass<pre_replace_deconv>(lo);
@ -623,7 +658,7 @@ void program::pre_optimize_graph(bool is_internal) {
apply_opt_pass<prepare_padding>(output_size_handling_enabled);
apply_opt_pass<remove_redundant_reorders>(lo, options.get<build_option_type::optimize_data>()->enabled());
apply_opt_pass<remove_redundant_reorders>(lo, optimize_data);
if (!is_internal) {
// ToDo remove hidden dependencies from propagate_constants pass
@ -631,7 +666,7 @@ void program::pre_optimize_graph(bool is_internal) {
}
// try to fuse buffers (i.e. depth_concat in bfyx format) after padding calculations
if (options.get<build_option_type::optimize_data>()->enabled()) {
if (optimize_data) {
apply_opt_pass<prepare_buffer_fusing>();
}
@ -653,17 +688,18 @@ void program::post_optimize_graph(bool is_internal) {
apply_opt_pass<remove_redundant_reorders>(lo, false, true); // TODO: do we need it at this place also?
auto partial_build = _config.get_property(ov::intel_gpu::partial_build_program);
#ifdef GPU_DEBUG_CONFIG
GPU_DEBUG_GET_INSTANCE(debug_config);
if (!is_internal && (!options.get<build_option_type::partial_build_program>()->enabled() || !debug_config->dry_run_path.empty())) {
if (!is_internal && (!partial_build || !debug_config->dry_run_path.empty())) {
#else
if (!is_internal && !options.get<build_option_type::partial_build_program>()->enabled()) {
if (!is_internal && !partial_build) {
#endif
// ToDo remove hidden dependencies from propagate_constants pass
apply_opt_pass<propagate_constants>();
}
if (options.get<build_option_type::optimize_data>()->enabled())
if (_config.get_property(ov::intel_gpu::optimize_data))
apply_opt_pass<remove_redundant_reorders>(lo, false, true, true); // pass to remove output reorders while all others graph optimizations were done
// update loop input/output primitive mappings
@ -743,17 +779,6 @@ void program::cleanup() {
for (auto& node : processing_order)
node->get_output_layout();
// in debug build, at the end, mark all nodes as outputs so user can query for buffers of all not-optimized nodes,
// including internal ones etc.
if (is_debug_build()) {
for (auto& node : processing_order) {
if (!node->is_output()) {
node->set_output(true);
outputs.push_back(node);
}
}
}
_kernels_cache->reset();
}
@ -786,7 +811,7 @@ program::nodes_ordering& program::get_processing_order() { return processing_ord
const program::nodes_ordering& program::get_processing_order() const { return processing_order; }
void program::prepare_memory_dependencies() {
if (!get_engine().configuration().use_memory_pool)
if (!_config.get_property(ov::intel_gpu::enable_memory_pool))
return;
apply_opt_pass<basic_memory_dependencies>();
@ -1046,7 +1071,7 @@ bool program::remove_if_dangling(program_node& node) {
if (!node.dependencies.empty())
return false;
if (!node.is_output() || is_debug_build()) {
if (!node.is_output()) {
if (node.is_input())
inputs.remove(&node);
@ -1062,7 +1087,7 @@ bool program::extract(program_node& node) {
if (node.get_dependencies().size() != 1)
return false;
if (node.is_output() && !is_debug_build()) {
if (node.is_output()) {
auto& prev = node.get_dependency(0);
auto node_id = node.id();
@ -1248,7 +1273,7 @@ void program::remove_nodes(std::vector<program_node*>& to_remove) {
void program::dump_program(const char* stage,
bool with_full_info,
std::function<bool(program_node const&)> const& filter) const {
std::string path = get_dir_path(options);
std::string path = get_dir_path(_config);
if (path.empty() || !with_full_info) {
return;
}
@ -1372,7 +1397,7 @@ program::primitives_info program::get_current_stage_info() const {
void program::save_pass_info(std::string pass_name) {
// TODO: Directory path here can be probably changed to some bool flag
if (!options.get<build_option_type::graph_dumps_dir>()->directory_path.empty())
if (!_config.get_property(ov::intel_gpu::dump_graphs).empty())
optimizer_passes_info.emplace_back(pass_name, get_current_stage_info());
}
@ -1400,7 +1425,8 @@ const program::primitives_info& program::get_primitives_info() const { return pr
void program::apply_opt_pass(base_pass& pass) { pm->run(*this, pass); }
void program::set_layout_optimizer_attributes(layout_optimizer& lo) {
lo.set_implementation_forcing(options.get<build_option_type::force_implementations>()->forcing);
lo.set_implementation_forcing(_config.get_property(ov::intel_gpu::force_implementations));
// first pass to set layout optimization_attributes for topology
bool can_use_fsv16 = true;
@ -1625,7 +1651,7 @@ void program::set_layout_optimizer_attributes(layout_optimizer& lo) {
auto& engine = get_engine();
if (engine.get_device_info().supports_immad &&
engine.get_device_info().vendor_id == INTEL_VENDOR_ID &&
engine.configuration().queue_type == queue_types::in_order)
get_config().get_property(ov::intel_gpu::queue_type) == QueueTypes::in_order)
lo.set_optimization_attribute(layout_optimizer::optimization_attributes_type::use_onednn_impls, 1);
#endif
}

View File

@ -139,8 +139,8 @@ std::string get_node_id(const program_node* ptr) { return "node_" + std::to_stri
void dump_full_node(std::ofstream& out, const program_node* node) { out << node->type()->to_string(*node); }
} // namespace
std::string get_dir_path(build_options opts) {
auto path = opts.get<build_option_type::graph_dumps_dir>()->directory_path;
std::string get_dir_path(const ExecutionConfig& config) {
auto path = config.get_property(ov::intel_gpu::dump_graphs);
if (path.empty()) {
return {};
}
@ -151,15 +151,6 @@ std::string get_dir_path(build_options opts) {
return path;
}
/// Returns given name for serialization process.
inline std::string get_serialization_network_name(build_options opts) {
return opts.get<build_option_type::serialize_network>()->serialization_network_name;
}
inline std::string get_load_program_name(build_options opts) {
return opts.get<build_option_type::load_program>()->load_program_name;
}
void dump_graph_init(std::ofstream& graph,
const program& program,
std::function<bool(program_node const&)> const& filter) {

View File

@ -245,8 +245,7 @@ bool program_node::is_detached(bool whole_branch) {
}
layout program_node::calc_output_layout() const {
bool allow_new_shape_infer =
get_program().get_options().get<build_option_type::allow_new_shape_infer>()->enabled();
bool allow_new_shape_infer = get_program().get_config().get_property(ov::intel_gpu::allow_new_shape_infer);
if (allow_new_shape_infer) {
auto out_layouts = type()->calc_output_layouts(*this, *get_kernel_impl_params());
if (!out_layouts.empty()) {
@ -262,8 +261,7 @@ layout program_node::calc_output_layout() const {
}
std::vector<layout> program_node::calc_output_layouts() const {
bool allow_new_shape_infer =
get_program().get_options().get<build_option_type::allow_new_shape_infer>()->enabled();
bool allow_new_shape_infer = get_program().get_config().get_property(ov::intel_gpu::allow_new_shape_infer);
if (allow_new_shape_infer) {
auto out_layouts = type()->calc_output_layouts(*this, *get_kernel_impl_params());
if (!out_layouts.empty())
@ -802,7 +800,7 @@ dnnl::post_ops program_node::try_optimize_post_ops(dnnl::post_ops& p_ops, const
memory::ptr cur_bin_mem_ptr = cur_node.as<data>().get_attached_memory_ptr();
if (cur_bin_mem_ptr == nullptr)
throw std::runtime_error("OneDNN post-ops optimization error: nonexistent node for bin + eltw");
auto& stream = cur_bin_mem_ptr->get_engine()->get_program_stream();
auto& stream = cur_bin_mem_ptr->get_engine()->get_service_stream();
mem_lock<float, mem_lock_type::read_write> bin_and_eltw_lock(cur_bin_mem_ptr, stream);
size_t cur_bin_mem_size = cur_node.get_output_layout().count();
@ -844,7 +842,7 @@ dnnl::post_ops program_node::try_optimize_post_ops(dnnl::post_ops& p_ops, const
memory::ptr prev_bin_mem_ptr = prev_node.as<data>().get_attached_memory_ptr();
if (prev_bin_mem_ptr == nullptr)
throw std::runtime_error("OneDNN post-ops optimization error: nonexistent node for eltw + bin");
auto& stream = prev_bin_mem_ptr->get_engine()->get_program_stream();
auto& stream = prev_bin_mem_ptr->get_engine()->get_service_stream();
mem_lock<float, mem_lock_type::read_write> eltw_and_bin_lock(prev_bin_mem_ptr, stream);
size_t prev_bin_mem_size = prev_node.get_output_layout().count();
@ -932,7 +930,7 @@ dnnl::post_ops program_node::try_optimize_post_ops(dnnl::post_ops& p_ops, const
memory::ptr prev_scale_mem_ptr = prev_node.as<data>().get_attached_memory_ptr();
if (prev_scale_mem_ptr == nullptr)
throw std::runtime_error("OneDNN post-ops optimization error: nonexistent node for eltw + scale");
auto& stream = prev_scale_mem_ptr->get_engine()->get_program_stream();
auto& stream = prev_scale_mem_ptr->get_engine()->get_service_stream();
mem_lock<float, mem_lock_type::read_write> eltw_and_scale_lock(prev_scale_mem_ptr, stream);
size_t prev_scale_mem_size = prev_node.get_output_layout().count();

View File

@ -11,6 +11,7 @@
#include "intel_gpu/plugin/compiled_model.hpp"
#include "intel_gpu/plugin/async_infer_request.hpp"
#include "intel_gpu/plugin/async_infer_request_legacy.hpp"
#include "intel_gpu/plugin/legacy_api_helper.hpp"
#include "openvino/runtime/intel_gpu/properties.hpp"
#include <description_buffer.hpp>
@ -35,30 +36,27 @@ using namespace InferenceEngine::details;
namespace ov {
namespace intel_gpu {
CompiledModel::CompiledModel(InferenceEngine::CNNNetwork &network, std::shared_ptr<InferenceEngine::RemoteContext> context, Config config) :
CompiledModel::CompiledModel(InferenceEngine::CNNNetwork &network,
InferenceEngine::RemoteContext::Ptr context,
const ExecutionConfig& config) :
InferenceEngine::ExecutableNetworkThreadSafeDefault{[&]() -> InferenceEngine::ITaskExecutor::Ptr {
if (config.exclusiveAsyncRequests) {
if (config.get_property(ov::intel_gpu::exclusive_async_requests)) {
//exclusiveAsyncRequests essentially disables the streams (and hence should be checked first) => aligned with the CPU behavior
return executorManager()->getExecutor("GPU");
} else if (config.throughput_streams > 1) {
} else if (config.get_property(ov::num_streams) > 1) {
return std::make_shared<InferenceEngine::CPUStreamsExecutor>(
IStreamsExecutor::Config{"Intel GPU plugin executor", config.throughput_streams});
IStreamsExecutor::Config{"Intel GPU plugin executor", config.get_property(ov::num_streams)});
} else {
return std::make_shared<InferenceEngine::CPUStreamsExecutor>(
IStreamsExecutor::Config{"Intel GPU plugin executor", 1});
}
}()},
m_context(context),
m_config(config),
m_taskExecutor{ _taskExecutor },
m_waitExecutor(executorManager()->getIdleCPUStreamsExecutor({ "GPUWaitExecutor" })) {
auto casted_context = std::dynamic_pointer_cast<gpu::ClContext>(context);
OPENVINO_ASSERT((casted_context != nullptr), "Invalid remote context");
m_context = casted_context;
auto graph_base = std::make_shared<Graph>(network, m_context, m_config, 0);
for (uint16_t n = 0; n < m_config.throughput_streams; n++) {
auto graph_base = std::make_shared<Graph>(network, get_context_impl(m_context), m_config, 0);
for (uint16_t n = 0; n < m_config.get_property(ov::num_streams); n++) {
auto graph = n == 0 ? graph_base : std::make_shared<Graph>(graph_base, n);
m_graphs.push_back(graph);
}
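On the plugin side the same config drives both the number of Graph copies and the executor streams. A hedged sketch of the user-facing properties involved; the values are illustrative.

ExecutionConfig config;
config.set_property(ov::num_streams(2));           // -> two Graph instances and a 2-stream CPUStreamsExecutor above
config.set_property(ov::enable_profiling(true));   // -> EnableProfiling() on the created infer requests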
@ -87,29 +85,27 @@ static InferenceEngine::Layout layout_from_string(const std::string & name) {
IE_THROW(NetworkNotRead) << "Unknown layout with name '" << name << "'";
}
CompiledModel::CompiledModel(std::istream& networkModel, std::shared_ptr<InferenceEngine::RemoteContext> context, Config config) :
CompiledModel::CompiledModel(std::istream& networkModel, InferenceEngine::RemoteContext::Ptr context, const ExecutionConfig& config) :
InferenceEngine::ExecutableNetworkThreadSafeDefault{[&]() -> InferenceEngine::ITaskExecutor::Ptr {
if (config.exclusiveAsyncRequests) {
if (config.get_property(ov::intel_gpu::exclusive_async_requests)) {
//exclusiveAsyncRequests essentially disables the streams (and hence should be checked first) => aligned with the CPU behavior
return executorManager()->getExecutor("GPU");
} else if (config.throughput_streams > 1) {
} else if (config.get_property(ov::num_streams) > 1) {
return std::make_shared<InferenceEngine::CPUStreamsExecutor>(
IStreamsExecutor::Config{"Intel GPU plugin executor", config.throughput_streams});
IStreamsExecutor::Config{"Intel GPU plugin executor", config.get_property(ov::num_streams)});
} else {
return std::make_shared<InferenceEngine::CPUStreamsExecutor>(
IStreamsExecutor::Config{"Intel GPU plugin executor", 1});
}
}()},
m_context(context),
m_config(config),
m_taskExecutor{ _taskExecutor },
m_waitExecutor(executorManager()->getIdleCPUStreamsExecutor({ "GPUWaitExecutor" })) {
auto casted_context = std::dynamic_pointer_cast<gpu::ClContext>(context);
auto context_impl = get_context_impl(m_context);
auto& engine = context_impl->get_engine();
OPENVINO_ASSERT((casted_context != nullptr), "Invalid remote context");
m_context = casted_context;
cldnn::BinaryInputBuffer ib(networkModel, *getContextImpl(m_context)->GetEngine());
cldnn::BinaryInputBuffer ib(networkModel, engine);
// InputsInfo and OutputsInfo for CNNNetwork
{
@ -255,8 +251,8 @@ CompiledModel::CompiledModel(std::istream& networkModel, std::shared_ptr<Inferen
setOutputs(new_results);
}
auto graph_base = std::make_shared<Graph>(ib, m_context, m_config, 0);
for (uint16_t n = 0; n < m_config.throughput_streams; n++) {
auto graph_base = std::make_shared<Graph>(ib, context_impl, m_config, 0);
for (uint16_t n = 0; n < m_config.get_property(ov::num_streams); n++) {
auto graph = n == 0 ? graph_base : std::make_shared<Graph>(graph_base, n);
m_graphs.push_back(graph);
}
@ -266,9 +262,9 @@ template <class T>
IInferRequestInternal::Ptr CompiledModel::GetInferRequestImpl(const std::vector<std::shared_ptr<const ov::Node>>& inputs,
const std::vector<std::shared_ptr<const ov::Node>>& outputs) {
auto ptr = std::make_shared<T>(inputs, outputs, std::static_pointer_cast<CompiledModel>(shared_from_this()));
if (m_config.throughput_streams > 1)
if (m_config.get_property(ov::num_streams) > 1)
ptr->EnableStreams();
if (m_config.useProfiling)
if (m_config.get_property(ov::enable_profiling))
ptr->EnableProfiling();
if (m_graphs.front()->use_external_queue())
ptr->enable_external_queue();
@ -282,9 +278,9 @@ IInferRequestInternal::Ptr CompiledModel::CreateInferRequestImpl(InputsDataMap n
OV_ITT_SCOPED_TASK(itt::domains::intel_gpu_plugin, "CompiledModel::CreateInferRequestImpl");
auto ptr = std::make_shared<InferRequestLegacy>(networkInputs, networkOutputs,
std::static_pointer_cast<CompiledModel>(shared_from_this()));
if (m_config.throughput_streams > 1)
if (m_config.get_property(ov::num_streams) > 1)
ptr->EnableStreams();
if (m_config.useProfiling)
if (m_config.get_property(ov::enable_profiling))
ptr->EnableProfiling();
if (m_graphs.front()->use_external_queue())
ptr->enable_external_queue();
@ -469,50 +465,17 @@ std::shared_ptr<ngraph::Function> CompiledModel::GetExecGraphInfo() {
}
InferenceEngine::Parameter CompiledModel::GetConfig(const std::string &name) const {
const bool is_new_api = _plugin->IsNewAPI();
auto it = m_config.key_config_map.find(name);
if (it != m_config.key_config_map.end()) {
std::string val = it->second;
if (is_new_api) {
if (name == ov::enable_profiling) {
return val == PluginConfigParams::YES ? true : false;
} else if (name == ov::hint::model_priority) {
return ov::util::from_string(val, ov::hint::model_priority);
} else if (name == ov::intel_gpu::hint::host_task_priority) {
return ov::util::from_string(val, ov::intel_gpu::hint::host_task_priority);
} else if (name == ov::intel_gpu::hint::queue_priority) {
return ov::util::from_string(val, ov::intel_gpu::hint::queue_priority);
} else if (name == ov::intel_gpu::hint::queue_throttle) {
return ov::util::from_string(val, ov::intel_gpu::hint::queue_throttle);
} else if (name == ov::intel_gpu::enable_loop_unrolling) {
return val == PluginConfigParams::YES ? true : false;
} else if (name == ov::cache_dir) {
return ov::util::from_string(val, ov::cache_dir);
} else if (name == ov::hint::performance_mode) {
return ov::util::from_string(val, ov::hint::performance_mode);
} else if (name == ov::compilation_num_threads) {
return ov::util::from_string(val, ov::compilation_num_threads);
} else if (name == ov::num_streams) {
return ov::util::from_string(val, ov::num_streams);
} else if (name == ov::hint::num_requests) {
return ov::util::from_string(val, ov::hint::num_requests);
} else if (name == ov::hint::inference_precision) {
return ov::util::from_string(val, ov::hint::inference_precision);
} else if (name == ov::device::id) {
return ov::util::from_string(val, ov::device::id);
} else {
auto actual_name = name;
if (LegacyAPIHelper::is_legacy_property({name, nullptr}, _plugin->IsNewAPI())) {
actual_name = LegacyAPIHelper::convert_legacy_property({name, nullptr}).first;
}
auto val = m_config.get_property(actual_name);
if (LegacyAPIHelper::is_legacy_property({name, nullptr}, _plugin->IsNewAPI())) {
val = LegacyAPIHelper::convert_to_legacy_property({actual_name, val}).second;
}
return val;
}
} else {
if (name == PluginConfigParams::KEY_MODEL_PRIORITY ||
name == GPUConfigParams::KEY_GPU_HOST_TASK_PRIORITY)
return Config::ConvertPropertyToLegacy(name, val);
else
return val;
}
} else {
IE_THROW() << "Unsupported ExecutableNetwork config key: " << name;
}
}
InferenceEngine::Parameter CompiledModel::GetMetric(const std::string &name) const {
@ -550,14 +513,28 @@ InferenceEngine::Parameter CompiledModel::GetMetric(const std::string &name) con
metrics.push_back(METRIC_KEY(OPTIMAL_NUMBER_OF_INFER_REQUESTS));
IE_SET_METRIC_RETURN(SUPPORTED_METRICS, metrics);
} else if (name == METRIC_KEY(SUPPORTED_CONFIG_KEYS)) {
std::vector<std::string> configKeys;
for (auto && value : m_config.key_config_map)
if (!Config::isNewApiProperty(value.first))
configKeys.push_back(value.first);
static const std::vector<std::string> configKeys {
CONFIG_KEY(MODEL_PRIORITY),
CONFIG_KEY(PERFORMANCE_HINT),
CONFIG_KEY(PERFORMANCE_HINT_NUM_REQUESTS),
CONFIG_KEY(PERF_COUNT),
CONFIG_KEY(DYN_BATCH_ENABLED),
CONFIG_KEY(CONFIG_FILE),
CONFIG_KEY(DEVICE_ID),
CONFIG_KEY(EXCLUSIVE_ASYNC_REQUESTS),
CONFIG_KEY(CACHE_DIR),
CONFIG_KEY(GPU_THROUGHPUT_STREAMS),
GPU_CONFIG_KEY(PLUGIN_PRIORITY),
GPU_CONFIG_KEY(PLUGIN_THROTTLE),
GPU_CONFIG_KEY(HOST_TASK_PRIORITY),
GPU_CONFIG_KEY(NV12_TWO_INPUTS),
GPU_CONFIG_KEY(MAX_NUM_THREADS),
GPU_CONFIG_KEY(ENABLE_LOOP_UNROLLING),
};
IE_SET_METRIC_RETURN(SUPPORTED_CONFIG_KEYS, configKeys);
} else if (name == ov::optimal_number_of_infer_requests) {
unsigned int nr = m_config.throughput_streams;
if (m_config.perfHintsConfig.ovPerfHint != CONFIG_VALUE(LATENCY))
unsigned int nr = m_config.get_property(ov::num_streams);
if (m_config.get_property(ov::hint::performance_mode) != ov::hint::PerformanceMode::LATENCY)
nr *= 2;
return decltype(ov::optimal_number_of_infer_requests)::value_type {nr};
} else if (name == ov::execution_devices) {

View File

@ -1,499 +0,0 @@
// Copyright (C) 2018-2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include "intel_gpu/plugin/device_config.hpp"
#include <ie_system_conf.h>
#include <sys/stat.h>
#include <gpu/gpu_config.hpp>
#include <thread>
#include "cpp_interfaces/interface/ie_internal_plugin_config.hpp"
#include "file_utils.h"
#include "ie_api.h"
#include "intel_gpu/runtime/itt.hpp"
#include "openvino/runtime/intel_gpu/properties.hpp"
#include <openvino/util/common_util.hpp>
#ifdef _WIN32
# include <direct.h>
# ifdef OPENVINO_ENABLE_UNICODE_PATH_SUPPORT
# define mkdir(dir, mode) _wmkdir(dir)
# else
# define mkdir(dir, mode) _mkdir(dir)
# endif // OPENVINO_ENABLE_UNICODE_PATH_SUPPORT
#endif // _WIN32
using namespace InferenceEngine;
namespace ov {
namespace intel_gpu {
static void createDirectory(std::string _path) {
#if defined(OPENVINO_ENABLE_UNICODE_PATH_SUPPORT) && defined(_WIN32)
std::wstring widepath = ov::util::string_to_wstring(_path.c_str());
const wchar_t* path = widepath.c_str();
#else
const char* path = _path.c_str();
#endif
auto err = mkdir(path, 0755);
if (err != 0 && errno != EEXIST) {
IE_THROW() << "Couldn't create directory! (err=" << err << "; errno=" << errno << ")";
}
}
static int getNumberOfCores(const IStreamsExecutor::Config::PreferredCoreType core_type) {
const auto total_num_cores = getNumberOfLogicalCPUCores();
const auto total_num_big_cores = getNumberOfLogicalCPUCores(true);
const auto total_num_little_cores = total_num_cores - total_num_big_cores;
int num_cores = total_num_cores;
if (core_type == IStreamsExecutor::Config::BIG) {
num_cores = total_num_big_cores;
} else if (core_type == IStreamsExecutor::Config::LITTLE) {
num_cores = total_num_little_cores;
}
return num_cores;
}
IE_SUPPRESS_DEPRECATED_START
void Config::UpdateFromMap(const std::map<std::string, std::string>& configMap, const cldnn::device_info& info) {
OV_ITT_SCOPED_TASK(itt::domains::intel_gpu_plugin, "Config::UpdateFromMap");
for (auto& kvp : configMap) {
std::string key = kvp.first;
std::string val = kvp.second;
const auto hints = perfHintsConfig.SupportedKeys();
if (hints.end() != std::find(hints.begin(), hints.end(), key)) {
perfHintsConfig.SetConfig(key, val);
} else if (key == ov::hint::inference_precision) {
std::stringstream ss(val);
ss >> inference_precision;
OPENVINO_ASSERT(inference_precision == ov::element::f16 ||
inference_precision == ov::element::f32 ||
inference_precision == ov::element::undefined,
"Unexpected inference precision set: ", inference_precision);
} else if (key.compare(PluginConfigParams::KEY_PERF_COUNT) == 0 || key == ov::enable_profiling) {
if (val.compare(PluginConfigParams::YES) == 0) {
useProfiling = true;
} else if (val.compare(PluginConfigParams::NO) == 0) {
useProfiling = false;
} else {
IE_THROW(NotFound) << "Unsupported property value by plugin: " << val;
}
} else if (key.compare(PluginConfigParams::KEY_DYN_BATCH_ENABLED) == 0) {
if (val.compare(PluginConfigParams::YES) == 0) {
enableDynamicBatch = true;
} else if (val.compare(PluginConfigParams::NO) == 0) {
enableDynamicBatch = false;
} else {
IE_THROW(NotFound) << "Unsupported property value by plugin: " << val;
}
} else if (key.compare(GPUConfigParams::KEY_GPU_PLUGIN_PRIORITY) == 0) {
std::stringstream ss(val);
uint32_t uVal(0);
ss >> uVal;
if (ss.fail()) {
IE_THROW(NotFound) << "Unsupported property value by plugin: " << val;
}
switch (uVal) {
case 0:
case 2:
queuePriority = cldnn::priority_mode_types::med;
break;
case 1:
queuePriority = cldnn::priority_mode_types::low;
break;
case 3:
queuePriority = cldnn::priority_mode_types::high;
break;
default:
IE_THROW(ParameterMismatch) << "Unsupported queue priority value: " << uVal;
}
} else if (key == ov::intel_gpu::hint::queue_priority) {
std::stringstream ss(val);
ov::hint::Priority priority;
ss >> priority;
if (priority == ov::hint::Priority::HIGH)
queuePriority = cldnn::priority_mode_types::high;
else if (priority == ov::hint::Priority::MEDIUM)
queuePriority = cldnn::priority_mode_types::med;
else
queuePriority = cldnn::priority_mode_types::low;
} else if (key.compare(PluginConfigParams::KEY_MODEL_PRIORITY) == 0 || key == ov::hint::model_priority) {
if (val.compare(PluginConfigParams::MODEL_PRIORITY_HIGH) == 0 ||
val.compare(ov::util::to_string(ov::hint::Priority::HIGH)) == 0) {
queuePriority = cldnn::priority_mode_types::high;
task_exec_config._threadPreferredCoreType = IStreamsExecutor::Config::BIG;
} else if (val.compare(PluginConfigParams::MODEL_PRIORITY_MED) == 0 ||
val.compare(ov::util::to_string(ov::hint::Priority::MEDIUM)) == 0) {
queuePriority = cldnn::priority_mode_types::med;
task_exec_config._threadPreferredCoreType = IStreamsExecutor::Config::ANY;
} else if (val.compare(PluginConfigParams::MODEL_PRIORITY_LOW) == 0 ||
val.compare(ov::util::to_string(ov::hint::Priority::LOW)) == 0) {
queuePriority = cldnn::priority_mode_types::low;
task_exec_config._threadPreferredCoreType = IStreamsExecutor::Config::LITTLE;
} else {
IE_THROW() << "Not found appropriate value for config key " << PluginConfigParams::KEY_MODEL_PRIORITY
<< ".\n";
}
if (getAvailableCoresTypes().size() > 1) {
if (task_exec_config._threadPreferredCoreType == IStreamsExecutor::Config::BIG ||
task_exec_config._threadPreferredCoreType == IStreamsExecutor::Config::LITTLE) {
task_exec_config._streams = std::min(task_exec_config._streams,
getNumberOfCores(task_exec_config._threadPreferredCoreType));
}
} else {
task_exec_config._threadPreferredCoreType = IStreamsExecutor::Config::ANY;
task_exec_config._streams =
std::min(task_exec_config._streams, static_cast<int>(std::thread::hardware_concurrency()));
}
} else if (key.compare(GPUConfigParams::KEY_GPU_PLUGIN_THROTTLE) == 0) {
std::stringstream ss(val);
uint32_t uVal(0);
ss >> uVal;
if (ss.fail()) {
IE_THROW(NotFound) << "Unsupported property value by plugin: " << val;
}
switch (uVal) {
case 0:
case 2:
queueThrottle = cldnn::throttle_mode_types::med;
break;
case 1:
queueThrottle = cldnn::throttle_mode_types::low;
break;
case 3:
queueThrottle = cldnn::throttle_mode_types::high;
break;
default:
IE_THROW(ParameterMismatch) << "Unsupported queue throttle value: " << uVal;
}
} else if (key == ov::intel_gpu::hint::queue_throttle) {
std::stringstream ss(val);
ov::intel_gpu::hint::ThrottleLevel throttle;
ss >> throttle;
if (throttle == ov::intel_gpu::hint::ThrottleLevel::HIGH)
queueThrottle = cldnn::throttle_mode_types::high;
else if (throttle == ov::intel_gpu::hint::ThrottleLevel::MEDIUM)
queueThrottle = cldnn::throttle_mode_types::med;
else
queueThrottle = cldnn::throttle_mode_types::low;
} else if (key.compare(PluginConfigParams::KEY_CONFIG_FILE) == 0) {
std::stringstream ss(val);
std::istream_iterator<std::string> begin(ss);
std::istream_iterator<std::string> end;
std::vector<std::string> configFiles(begin, end);
for (auto& file : configFiles) {
CustomLayer::LoadFromFile(file, customLayers);
}
} else if (key.compare(PluginConfigParams::KEY_CACHE_DIR) == 0 || key == ov::cache_dir) {
if (!val.empty()) {
kernels_cache_dir = val;
createDirectory(kernels_cache_dir);
}
} else if (key.compare(PluginConfigParams::KEY_EXCLUSIVE_ASYNC_REQUESTS) == 0) {
if (val.compare(PluginConfigParams::YES) == 0) {
exclusiveAsyncRequests = true;
} else if (val.compare(PluginConfigParams::NO) == 0) {
exclusiveAsyncRequests = false;
} else {
IE_THROW(NotFound) << "Unsupported property value by plugin: " << val;
}
} else if (key.compare(PluginConfigParams::KEY_GPU_THROUGHPUT_STREAMS) == 0 || key == ov::num_streams) {
if (val.compare(PluginConfigParams::GPU_THROUGHPUT_AUTO) == 0 ||
val.compare(ov::util::to_string(ov::streams::AUTO)) == 0) {
throughput_streams = std::max(GetDefaultNStreamsForThroughputMode(), info.num_ccs);
} else {
int val_i;
try {
val_i = std::stoi(val);
} catch (const std::exception&) {
IE_THROW() << "Wrong value for property key " << PluginConfigParams::KEY_GPU_THROUGHPUT_STREAMS
<< ". Expected only positive numbers (#streams) or "
<< "PluginConfigParams::GPU_THROUGHPUT_AUTO";
}
if (val_i > 0)
throughput_streams = static_cast<uint16_t>(val_i);
}
} else if (key.compare(PluginConfigParams::KEY_DEVICE_ID) == 0 || key == ov::device::id) {
// Validate that the passed value is a positive number.
try {
int val_i = std::stoi(val);
(void)val_i;
} catch (const std::exception&) {
IE_THROW() << "Wrong value for property key " << ov::device::id.name()
<< ". DeviceIDs are only represented by positive numbers";
}
// Set this value.
device_id = val;
} else if (key.compare(PluginConfigInternalParams::KEY_LP_TRANSFORMS_MODE) == 0) {
if (val.compare(PluginConfigParams::YES) == 0) {
enableInt8 = true;
} else if (val.compare(PluginConfigParams::NO) == 0) {
enableInt8 = false;
} else {
IE_THROW(NotFound) << "Unsupported property value by plugin: " << val;
}
} else if (key.compare(GPUConfigParams::KEY_GPU_NV12_TWO_INPUTS) == 0) {
if (val.compare(PluginConfigParams::YES) == 0) {
nv12_two_inputs = true;
} else if (val.compare(PluginConfigParams::NO) == 0) {
nv12_two_inputs = false;
} else {
IE_THROW(NotFound) << "Unsupported NV12 flag value: " << val;
}
} else if (key.compare(GPUConfigParams::KEY_GPU_MAX_NUM_THREADS) == 0 || key == ov::compilation_num_threads) {
int max_threads = std::max(1, static_cast<int>(std::thread::hardware_concurrency()));
try {
int val_i = std::stoi(val);
if (val_i <= 0 || val_i > max_threads) {
val_i = max_threads;
}
task_exec_config._streams = std::min(task_exec_config._streams, val_i);
} catch (const std::exception&) {
IE_THROW() << "Wrong value for property key " << GPUConfigParams::KEY_GPU_MAX_NUM_THREADS << ": " << val
<< "\nSpecify the number of threads use for build as an integer."
<< "\nOut of range value will be set as a default value, maximum concurrent threads.";
}
} else if (key.compare(GPUConfigParams::KEY_GPU_ENABLE_LOOP_UNROLLING) == 0 ||
key == ov::intel_gpu::enable_loop_unrolling) {
if (val.compare(PluginConfigParams::YES) == 0) {
enable_loop_unrolling = true;
} else if (val.compare(PluginConfigParams::NO) == 0) {
enable_loop_unrolling = false;
} else {
IE_THROW(ParameterMismatch) << "Unsupported KEY_GPU_ENABLE_LOOP_UNROLLING flag value: " << val;
}
} else if (key.compare(GPUConfigParams::KEY_GPU_HOST_TASK_PRIORITY) == 0 ||
key == ov::intel_gpu::hint::host_task_priority) {
if (val.compare(GPUConfigParams::GPU_HOST_TASK_PRIORITY_HIGH) == 0 ||
val.compare(ov::util::to_string(ov::hint::Priority::HIGH)) == 0) {
task_exec_config._threadPreferredCoreType = IStreamsExecutor::Config::BIG;
} else if (val.compare(GPUConfigParams::GPU_HOST_TASK_PRIORITY_MEDIUM) == 0 ||
val.compare(ov::util::to_string(ov::hint::Priority::MEDIUM)) == 0) {
task_exec_config._threadPreferredCoreType = IStreamsExecutor::Config::ANY;
} else if (val.compare(GPUConfigParams::GPU_HOST_TASK_PRIORITY_LOW) == 0 ||
val.compare(ov::util::to_string(ov::hint::Priority::LOW)) == 0) {
task_exec_config._threadPreferredCoreType = IStreamsExecutor::Config::LITTLE;
} else {
IE_THROW(NotFound) << "Unsupported host task priority by plugin: " << val;
}
} else {
IE_THROW(NotFound) << "Unsupported property key by plugin: " << key;
}
adjustKeyMapValues();
}
}
void Config::adjustKeyMapValues() {
OV_ITT_SCOPED_TASK(itt::domains::intel_gpu_plugin, "Config::AdjustKeyMapValues");
if (useProfiling) {
key_config_map[PluginConfigParams::KEY_PERF_COUNT] = PluginConfigParams::YES;
key_config_map[ov::enable_profiling.name()] = PluginConfigParams::YES;
} else {
key_config_map[PluginConfigParams::KEY_PERF_COUNT] = PluginConfigParams::NO;
key_config_map[ov::enable_profiling.name()] = PluginConfigParams::NO;
}
if (exclusiveAsyncRequests)
key_config_map[PluginConfigParams::KEY_EXCLUSIVE_ASYNC_REQUESTS] = PluginConfigParams::YES;
else
key_config_map[PluginConfigParams::KEY_EXCLUSIVE_ASYNC_REQUESTS] = PluginConfigParams::NO;
if (enableDynamicBatch)
key_config_map[PluginConfigParams::KEY_DYN_BATCH_ENABLED] = PluginConfigParams::YES;
else
key_config_map[PluginConfigParams::KEY_DYN_BATCH_ENABLED] = PluginConfigParams::NO;
if (nv12_two_inputs) {
key_config_map[GPUConfigParams::KEY_GPU_NV12_TWO_INPUTS] = PluginConfigParams::YES;
} else {
key_config_map[GPUConfigParams::KEY_GPU_NV12_TWO_INPUTS] = PluginConfigParams::NO;
}
key_config_map[ov::hint::inference_precision.name()] = inference_precision.get_type_name();
{
if (queuePriority == cldnn::priority_mode_types::high &&
(task_exec_config._threadPreferredCoreType == IStreamsExecutor::Config::BIG ||
getAvailableCoresTypes().size() == 1)) {
key_config_map[ov::hint::model_priority.name()] =
ov::util::to_string(ov::hint::Priority::HIGH);
} else if (queuePriority == cldnn::priority_mode_types::low &&
(task_exec_config._threadPreferredCoreType == IStreamsExecutor::Config::LITTLE ||
getAvailableCoresTypes().size() == 1)) {
key_config_map[ov::hint::model_priority.name()] =
ov::util::to_string(ov::hint::Priority::LOW);
} else if (queuePriority == cldnn::priority_mode_types::med &&
task_exec_config._threadPreferredCoreType == IStreamsExecutor::Config::ANY) {
key_config_map[ov::hint::model_priority.name()] =
ov::util::to_string(ov::hint::Priority::MEDIUM);
}
}
{
std::string qp = "0";
switch (queuePriority) {
case cldnn::priority_mode_types::low:
qp = "1";
break;
case cldnn::priority_mode_types::med:
qp = "2";
break;
case cldnn::priority_mode_types::high:
qp = "3";
break;
default:
break;
}
key_config_map[GPUConfigParams::KEY_GPU_PLUGIN_PRIORITY] = qp;
}
{
std::string priority;
if (queuePriority == cldnn::priority_mode_types::high)
priority = ov::util::to_string(ov::hint::Priority::HIGH);
else if (queuePriority == cldnn::priority_mode_types::low)
priority = ov::util::to_string(ov::hint::Priority::LOW);
else
priority = ov::util::to_string(ov::hint::Priority::MEDIUM);
key_config_map[ov::intel_gpu::hint::queue_priority.name()] = priority;
}
{
std::string qt = "0";
switch (queueThrottle) {
case cldnn::throttle_mode_types::low:
qt = "1";
break;
case cldnn::throttle_mode_types::med:
qt = "2";
break;
case cldnn::throttle_mode_types::high:
qt = "3";
break;
default:
break;
}
key_config_map[GPUConfigParams::KEY_GPU_PLUGIN_THROTTLE] = qt;
}
{
std::string throttleLevel;
if (queueThrottle == cldnn::throttle_mode_types::high)
throttleLevel = ov::util::to_string(ov::intel_gpu::hint::ThrottleLevel::HIGH);
else if (queueThrottle == cldnn::throttle_mode_types::low)
throttleLevel = ov::util::to_string(ov::intel_gpu::hint::ThrottleLevel::LOW);
else
throttleLevel = ov::util::to_string(ov::intel_gpu::hint::ThrottleLevel::MEDIUM);
key_config_map[ov::intel_gpu::hint::queue_throttle.name()] = throttleLevel;
}
{
std::string hostTaskPriority;
if (task_exec_config._threadPreferredCoreType == IStreamsExecutor::Config::LITTLE)
hostTaskPriority = ov::util::to_string(ov::hint::Priority::LOW);
else if (task_exec_config._threadPreferredCoreType == IStreamsExecutor::Config::BIG)
hostTaskPriority = ov::util::to_string(ov::hint::Priority::HIGH);
else
hostTaskPriority = ov::util::to_string(ov::hint::Priority::MEDIUM);
key_config_map[ov::intel_gpu::hint::host_task_priority.name()] = hostTaskPriority;
}
key_config_map[PluginConfigParams::KEY_CACHE_DIR] = kernels_cache_dir;
key_config_map[ov::cache_dir.name()] = kernels_cache_dir;
key_config_map[PluginConfigParams::KEY_GPU_THROUGHPUT_STREAMS] = std::to_string(throughput_streams);
key_config_map[ov::num_streams.name()] = std::to_string(throughput_streams);
key_config_map[PluginConfigParams::KEY_DEVICE_ID] = device_id;
key_config_map[ov::device::id.name()] = device_id;
key_config_map[PluginConfigParams::KEY_CONFIG_FILE] = "";
key_config_map[GPUConfigParams::KEY_GPU_MAX_NUM_THREADS] = std::to_string(task_exec_config._streams);
key_config_map[ov::compilation_num_threads.name()] = std::to_string(task_exec_config._streams);
if (enable_loop_unrolling) {
key_config_map[GPUConfigParams::KEY_GPU_ENABLE_LOOP_UNROLLING] = PluginConfigParams::YES;
key_config_map[ov::intel_gpu::enable_loop_unrolling.name()] = PluginConfigParams::YES;
} else {
key_config_map[GPUConfigParams::KEY_GPU_ENABLE_LOOP_UNROLLING] = PluginConfigParams::NO;
key_config_map[ov::intel_gpu::enable_loop_unrolling.name()] = PluginConfigParams::NO;
}
key_config_map[PluginConfigParams::KEY_PERFORMANCE_HINT] = perfHintsConfig.ovPerfHint;
key_config_map[ov::hint::performance_mode.name()] = perfHintsConfig.ovPerfHint;
key_config_map[PluginConfigParams::KEY_PERFORMANCE_HINT_NUM_REQUESTS] =
std::to_string(perfHintsConfig.ovPerfHintNumRequests);
}
bool Config::isNewApiProperty(std::string property) {
static const std::set<std::string> new_api_keys{
ov::intel_gpu::hint::queue_priority.name(),
ov::intel_gpu::hint::queue_throttle.name(),
ov::hint::inference_precision.name(),
ov::compilation_num_threads.name(),
ov::num_streams.name(),
};
return new_api_keys.find(property) != new_api_keys.end();
}
std::string Config::ConvertPropertyToLegacy(const std::string& key, const std::string& value) {
if (key == PluginConfigParams::KEY_MODEL_PRIORITY) {
auto priority = ov::util::from_string(value, ov::hint::model_priority);
if (priority == ov::hint::Priority::HIGH)
return PluginConfigParams::MODEL_PRIORITY_HIGH;
else if (priority == ov::hint::Priority::MEDIUM)
return PluginConfigParams::MODEL_PRIORITY_MED;
else if (priority == ov::hint::Priority::LOW)
return PluginConfigParams::MODEL_PRIORITY_LOW;
} else if (key == GPUConfigParams::KEY_GPU_HOST_TASK_PRIORITY) {
auto priority = ov::util::from_string(value, ov::intel_gpu::hint::host_task_priority);
if (priority == ov::hint::Priority::HIGH)
return GPUConfigParams::GPU_HOST_TASK_PRIORITY_HIGH;
else if (priority == ov::hint::Priority::MEDIUM)
return GPUConfigParams::GPU_HOST_TASK_PRIORITY_MEDIUM;
else if (priority == ov::hint::Priority::LOW)
return GPUConfigParams::GPU_HOST_TASK_PRIORITY_LOW;
}
IE_THROW() << "Unsupported value for legacy key : " << key;
}
bool Config::CanShareContextWith(const Config& other) const {
return this->throughput_streams == other.throughput_streams &&
this->useProfiling == other.useProfiling &&
this->dumpCustomKernels == other.dumpCustomKernels &&
this->queueThrottle == other.queueThrottle &&
this->queuePriority == other.queuePriority &&
this->kernels_cache_dir == other.kernels_cache_dir &&
this->device_id == other.device_id &&
this->task_exec_config._streams == other.task_exec_config._streams &&
this->task_exec_config._threadPreferredCoreType == other.task_exec_config._threadPreferredCoreType &&
this->enable_loop_unrolling == other.enable_loop_unrolling;
}
void Configs::CreateConfig(std::string device_id) {
if (configs.find(device_id) == configs.end()) {
configs.emplace(device_id, Config(device_id));
}
}
Config& Configs::GetConfig(std::string device_id) {
if (device_id.empty()) {
return GetDefaultDeviceConfig();
}
if (configs.find(device_id) == configs.end()) {
IE_THROW() << "Config for device with " << device_id << " ID is not registered in GPU plugin";
}
return configs.find(device_id)->second;
}
Config& Configs::GetDefaultDeviceConfig() {
return GetConfig(default_device_id);
}
IE_SUPPRESS_DEPRECATED_END
} // namespace intel_gpu
} // namespace ov
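The adjustKeyMapValues() logic above keeps the deprecated GPU config keys and the OpenVINO 2.0 properties in sync, so either form should end up in the same cldnn setting. Below is a minimal sketch of the two equivalent ways to request a low-priority GPU queue; the call site, the include paths, and the assumption that the deprecated key is still accepted through ov::Core::set_property are illustrative only, while the property names and the "1" -> low mapping come from the code above.
#include <openvino/runtime/core.hpp>
#include <openvino/runtime/intel_gpu/properties.hpp>
#include <gpu/gpu_config.hpp>          // deprecated GPUConfigParams keys (assumed include path)
void request_low_queue_priority(ov::Core& core) {
    // 2.0 property: handled by the ov::intel_gpu::hint::queue_priority branch above.
    core.set_property("GPU", ov::intel_gpu::hint::queue_priority(ov::hint::Priority::LOW));
    // Deprecated key: "1" corresponds to cldnn::priority_mode_types::low in the mappings above.
    core.set_property("GPU", ov::AnyMap{{InferenceEngine::GPUConfigParams::KEY_GPU_PLUGIN_PRIORITY, "1"}});
}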

View File

@ -45,32 +45,32 @@ using namespace InferenceEngine::details;
namespace ov {
namespace intel_gpu {
Graph::Graph(InferenceEngine::CNNNetwork& network, gpu::ClContext::Ptr context, Config config, uint16_t stream_id)
Graph::Graph(InferenceEngine::CNNNetwork& network, RemoteContextImpl::Ptr context, const ExecutionConfig& config, uint16_t stream_id)
: m_context(context)
, m_networkName(network.getName())
, m_config(config)
, m_stream_id(stream_id)
, m_state(0) {
m_program = std::make_shared<Program>(network, GetEngine(), m_config);
m_program = std::make_shared<Program>(network, get_engine(), config);
if (m_program->m_max_batch > 1)
m_config.max_dynamic_batch = m_program->m_max_batch;
m_config.set_property(ov::intel_gpu::max_dynamic_batch(m_program->m_max_batch));
Build();
}
Graph::Graph(cldnn::BinaryInputBuffer &ib, gpu::ClContext::Ptr context, Config config, uint16_t stream_id)
Graph::Graph(cldnn::BinaryInputBuffer &ib, RemoteContextImpl::Ptr context, const ExecutionConfig& config, uint16_t stream_id)
: m_context(context)
, m_config(config)
, m_stream_id(stream_id)
, m_state(0) {
m_program = std::make_shared<Program>(GetEngine(), m_config);
m_program = std::make_shared<Program>(get_engine(), config);
if (m_program->m_max_batch > 1)
m_config.max_dynamic_batch = m_program->m_max_batch;
m_config.set_property(ov::intel_gpu::max_dynamic_batch(m_program->m_max_batch));
ib >> m_program->inputLayouts;
ib >> primitiveIDs;
ib >> outputDims;
m_networks.emplace_back(std::make_shared<cldnn::network>(ib, GetEngine()->create_stream(), *GetEngine(), m_stream_id));
m_networks.emplace_back(std::make_shared<cldnn::network>(ib, get_engine().create_stream(config), get_engine(), m_stream_id));
}
Graph::Graph(std::shared_ptr<Graph> graph, uint16_t stream_id)
@ -130,21 +130,19 @@ void Graph::Build() {
}
bool Graph::use_external_queue() const {
auto impl = getContextImpl(m_context);
return impl->GetExternalQueue() != nullptr;
return m_context->get_external_queue() != nullptr;
}
std::shared_ptr<cldnn::network> Graph::BuildNetwork(std::shared_ptr<cldnn::program> program) {
OV_ITT_SCOPED_TASK(itt::domains::intel_gpu_plugin, "Graph::BuildNetwork");
std::shared_ptr<cldnn::network> network = nullptr;
auto impl = getContextImpl(m_context);
auto externalQueue = impl->GetExternalQueue();
auto externalQueue = m_context->get_external_queue();
if (externalQueue) {
if (m_config.throughput_streams != 1)
if (m_config.get_property(ov::num_streams) != 1)
IE_THROW(ParameterMismatch) << "Throughput streams can't be used with shared queue!\n";
auto &engine = m_program->GetEngine();
network = std::make_shared<cldnn::network>(program, engine.create_stream(externalQueue), m_stream_id);
auto &engine = m_program->get_engine();
network = std::make_shared<cldnn::network>(program, engine.create_stream(m_config, externalQueue), m_stream_id);
} else {
network = std::make_shared<cldnn::network>(program, m_stream_id);
}
@ -164,7 +162,7 @@ Graph::variable_states_map Graph::AllocateVariablesMemories() {
std::vector<cldnn::network::VariableState::Ptr> memoryStates;
memoryStates.reserve(orderedLayouts.size());
for (const auto& layout : orderedLayouts)
memoryStates.push_back(std::make_shared<cldnn::network::VariableState>(GetEngine()->allocate_memory(layout, false)));
memoryStates.push_back(std::make_shared<cldnn::network::VariableState>(get_engine().allocate_memory(layout, false)));
states.insert({memStateInfo.first, memoryStates });
}
return states;
@ -173,7 +171,7 @@ Graph::variable_states_map Graph::AllocateVariablesMemories() {
std::shared_ptr<ngraph::Function> Graph::GetExecGraphInfoByPrimitivesInfo(std::vector<cldnn::primitive_info>& primitives_info,
bool filter_const_primitives) {
OV_ITT_SCOPED_TASK(itt::domains::intel_gpu_plugin, "Graph::GetExecGraphInfoByPrimitivesInfo");
if (m_config.useProfiling) {
if (m_config.get_property(ov::enable_profiling)) {
try {
// Update may throw an exception for step-by-step runtime graph dump,
// since network->get_executed_primitives() method can't be called before network execution
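The BuildNetwork() change above rejects multiple throughput streams whenever an external command queue is attached to the remote context. A hedged application-side sketch of how such a queue would be supplied is shown below, assuming the public ov::intel_gpu::ocl::ClContext wrapper and a cl_command_queue created by the application; this is an illustration, not part of the commit.
#include <openvino/openvino.hpp>
#include <openvino/runtime/intel_gpu/ocl/ocl.hpp>
// `queue` is a valid cl_command_queue owned by the application (assumption).
ov::CompiledModel compile_on_shared_queue(ov::Core& core,
                                          const std::shared_ptr<ov::Model>& model,
                                          cl_command_queue queue) {
    ov::intel_gpu::ocl::ClContext remote_context(core, queue);   // remote context built on the user queue
    // The check above throws if more than one stream is combined with a shared queue,
    // so request a single stream explicitly.
    return core.compile_model(model, remote_context, ov::num_streams(1));
}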

View File

@ -10,6 +10,7 @@
#include <description_buffer.hpp>
#include "intel_gpu/plugin/infer_request.hpp"
#include "intel_gpu/plugin/remote_context.hpp"
#include "intel_gpu/plugin/remote_allocators.hpp"
#include "intel_gpu/plugin/compiled_model.hpp"
#include "intel_gpu/runtime/itt.hpp"
#include "intel_gpu/plugin/variable_state.hpp"
@ -394,6 +395,8 @@ InferRequest::InferRequest(InputsDataMap networkInputs, OutputsDataMap networkOu
: IInferRequestInternal(networkInputs, networkOutputs) {
IE_ASSERT(nullptr != execNetwork);
streamExecutor = dynamic_cast<InferenceEngine::IStreamsExecutor*>(execNetwork->m_taskExecutor.get());
m_context = std::dynamic_pointer_cast<InferenceEngine::gpu::ClContext>(execNetwork->GetContext());
OPENVINO_ASSERT(m_context != nullptr, "[GPU] Can't initialize context of InferRequest: wrong context type");
}
InferRequest::InferRequest(const std::vector<std::shared_ptr<const ov::Node>>& inputs,
@ -402,6 +405,8 @@ InferRequest::InferRequest(const std::vector<std::shared_ptr<const ov::Node>>& i
: IInferRequestInternal(inputs, outputs) {
IE_ASSERT(nullptr != execNetwork);
streamExecutor = dynamic_cast<InferenceEngine::IStreamsExecutor*>(execNetwork->m_taskExecutor.get());
m_context = std::dynamic_pointer_cast<InferenceEngine::gpu::ClContext>(execNetwork->GetContext());
OPENVINO_ASSERT(m_context != nullptr, "[GPU] Can't initialize context of InferRequest: wrong context type");
}
// ----------------------------------------------------------------------------------------- //
@ -450,7 +455,7 @@ void InferRequest::enqueue() {
FormatFromTensorDesc(blobsDesc),
tensor_from_dims(blobsDesc.getDims()));
auto mergedBlobs = create_remote_blob<RemoteCLbuffer>(blobsDesc, layout, RemoteBlobImpl::BlobType::BT_BUF_INTERNAL);
auto mergedBlobs = create_remote_blob<RemoteCLbuffer>(blobsDesc, layout, BlobType::BT_BUF_INTERNAL);
dst = mergedBlobs->buffer().as<uint8_t*>();
_inputs[name] = mergedBlobs;
@ -591,8 +596,8 @@ Blob::Ptr InferRequest::create_host_blob(const TensorDesc& desc, bool is_dynamic
OV_ITT_SCOPED_TASK(itt::domains::intel_gpu_plugin, "InferRequest::create_host_blob");
// Disable USM usage as USMHostAllocator may fail when attempting to allocate 0 bytes
// If we add a WA for such a case to avoid the driver call, the deallocate method will return false and the Blob::setShape call will throw an exception
bool use_usm = m_graph->GetEngine()->use_unified_shared_memory() && !is_dynamic;
auto alloc = use_usm ? std::make_shared<USMHostAllocator>(m_graph->GetContext().get()) : CreateDefaultAllocator();
bool use_usm = m_graph->get_engine().use_unified_shared_memory() && !is_dynamic;
auto alloc = use_usm ? std::make_shared<USMHostAllocator>(m_context) : CreateDefaultAllocator();
auto blob = make_blob_with_precision(desc, alloc);
blob->allocate();
return blob;
@ -600,8 +605,8 @@ Blob::Ptr InferRequest::create_host_blob(const TensorDesc& desc, bool is_dynamic
template<typename RemoteBlobType, typename>
InferenceEngine::Blob::Ptr InferRequest::create_remote_blob(const InferenceEngine::TensorDesc& desc, const cldnn::layout& layout,
const RemoteBlobImpl::BlobType mem_type, void* mem_ptr) {
auto blob = std::make_shared<RemoteBlobType>(m_graph->GetContext(),
const BlobType mem_type, void* mem_ptr) {
auto blob = std::make_shared<RemoteBlobType>(m_context,
m_graph->GetNetwork()->get_stream(),
desc,
layout,
@ -615,12 +620,12 @@ InferenceEngine::Blob::Ptr InferRequest::create_remote_blob(const InferenceEngin
}
template InferenceEngine::Blob::Ptr InferRequest::create_remote_blob<RemoteCLbuffer>(const InferenceEngine::TensorDesc&, const cldnn::layout&,
const RemoteBlobImpl::BlobType, void*);
const BlobType, void*);
template InferenceEngine::Blob::Ptr InferRequest::create_remote_blob<RemoteUSMbuffer>(const InferenceEngine::TensorDesc&, const cldnn::layout&,
const RemoteBlobImpl::BlobType, void*);
const BlobType, void*);
Blob::Ptr InferRequest::create_shared_device_blob(const InferenceEngine::TensorDesc& desc, const cldnn::layout& layout, void* usm_host_mem) {
auto blob = create_remote_blob<RemoteUSMbuffer>(desc, layout, RemoteBlobImpl::BlobType::BT_USM_SHARED, usm_host_mem);
auto blob = create_remote_blob<RemoteUSMbuffer>(desc, layout, BlobType::BT_USM_SHARED, usm_host_mem);
OPENVINO_ASSERT(blob, "[GPU] Failed to allocate shared host <-> device blob");
return blob;
}
@ -771,7 +776,7 @@ void InferRequest::allocate_inputs() {
_inputs[name] = create_host_blob(desc, input_layout.is_dynamic());
// Pre-allocate the device input only if USM is not supported; otherwise it will be allocated
// in the prepare_input() function later
if (input_layout.is_static() && !m_graph->GetEngine()->use_unified_shared_memory()) {
if (input_layout.is_static() && !m_graph->get_engine().use_unified_shared_memory()) {
_deviceInputs[name] = create_device_blob(desc);
}
}
@ -813,7 +818,7 @@ void InferRequest::allocate_outputs() {
_outputs[no.first] = create_host_blob(desc, output_layout.is_dynamic());
// Pre-allocate the device output only if USM is not supported; otherwise it will be allocated
// in the prepare_output() function later
if (output_layout.is_static() && !m_graph->GetEngine()->use_unified_shared_memory()) {
if (output_layout.is_static() && !m_graph->get_engine().use_unified_shared_memory()) {
_deviceOutputs[no.first] = create_device_blob(desc);
}
}
@ -840,7 +845,7 @@ std::map<std::string, InferenceEngineProfileInfo> InferRequest::GetPerformanceCo
void InferRequest::allocate_dev_mem_if_needed(InferenceEngine::BlobMap& device_mems, InferenceEngine::Blob::Ptr& user_blob,
const cldnn::primitive_id& blob_name, const cldnn::layout& layout, bool need_lockable_mem) {
const auto input_ptr = static_cast<const void*>(user_blob->cbuffer());
const auto alloc_type = m_graph->GetEngine()->detect_usm_allocation_type(input_ptr);
const auto alloc_type = m_graph->get_engine().detect_usm_allocation_type(input_ptr);
const auto is_usm_host = alloc_type == cldnn::allocation_type::usm_host;
const auto has_device_blob = device_mems.find(blob_name) != device_mems.end();
bool can_skip_allocation = false;
@ -851,7 +856,7 @@ void InferRequest::allocate_dev_mem_if_needed(InferenceEngine::BlobMap& device_m
OPENVINO_ASSERT(impl, str_device_output_unsupported_blob);
OPENVINO_ASSERT(impl->is_allocated(), str_input_not_allocated);
auto impl_mem = impl->getMemory();
auto impl_mem = impl->get_memory();
auto src_ptr = user_blob->cbuffer().as<uint8_t*>();
// If device mem already exists, we can reuse blob if buffer has usm_host type and points to the same memory,
// so we don't need to allocate new memory
@ -875,7 +880,7 @@ void InferRequest::allocate_dev_mem_if_needed(InferenceEngine::BlobMap& device_m
device_mems[blob_name] = create_shared_device_blob(user_blob->getTensorDesc(), layout, user_blob->buffer().as<void*>());
} else if (need_lockable_mem) {
device_mems[blob_name] =
create_remote_blob<RemoteUSMbuffer>(user_blob->getTensorDesc(), layout, RemoteBlobImpl::BlobType::BT_USM_HOST_INTERNAL);
create_remote_blob<RemoteUSMbuffer>(user_blob->getTensorDesc(), layout, BlobType::BT_USM_HOST_INTERNAL);
} else {
device_mems[blob_name] = create_device_blob(user_blob->getTensorDesc());
}
@ -894,7 +899,7 @@ void InferRequest::prepare_input(const cldnn::primitive_id& inputName, Blob::Ptr
auto remote_ptr = inputBlob->as<gpu::ClBlob>();
auto& stream = m_graph->GetNetwork()->get_stream();
const bool is_dev_input = remote_ptr != nullptr;
const bool can_use_usm = m_graph->GetEngine()->use_unified_shared_memory();
const bool can_use_usm = m_graph->get_engine().use_unified_shared_memory();
auto conv_to_supported_prec = [](Precision::ePrecision prec) {
switch (prec) {
@ -951,7 +956,7 @@ void InferRequest::prepare_input(const cldnn::primitive_id& inputName, Blob::Ptr
if (!impl->is_allocated()) {
IE_THROW() << str_input_not_allocated;
}
auto inputMem = impl->getMemory();
auto inputMem = impl->get_memory();
auto input_layout = m_graph->GetInputLayouts().find(inputName);
if (input_layout != m_graph->GetInputLayouts().end()) {
@ -1003,7 +1008,7 @@ void InferRequest::prepare_output(const cldnn::primitive_id& outputName, Blob::P
const auto output_id = outputsMap.at(outputName);
const auto output_layout = m_graph->GetNetwork()->get_node_output_layout(output_id);
const bool is_static = output_layout.is_static();
const bool can_use_usm = m_graph->GetEngine()->use_unified_shared_memory();
const bool can_use_usm = m_graph->get_engine().use_unified_shared_memory();
auto remote_ptr = outputBlob->as<gpu::ClBlob>();
const bool is_dev_input = remote_ptr != nullptr;
@ -1027,7 +1032,7 @@ void InferRequest::prepare_output(const cldnn::primitive_id& outputName, Blob::P
if (!impl->is_allocated()) {
IE_THROW(NotAllocated) << str_output_not_allocated;
}
auto outputMem = impl->getMemory();
auto outputMem = impl->get_memory();
_nw_ptr->set_output_memory(internalName, outputMem);
}
@ -1038,10 +1043,10 @@ InferenceEngine::Blob::Ptr InferRequest::create_device_blob(const InferenceEngin
auto l = cldnn::layout(shape, dt, format);
if (m_graph->GetEngine()->use_unified_shared_memory()) {
return create_remote_blob<RemoteUSMbuffer>(desc, l, RemoteBlobImpl::BlobType::BT_USM_DEVICE_INTERNAL);
if (m_graph->get_engine().use_unified_shared_memory()) {
return create_remote_blob<RemoteUSMbuffer>(desc, l, BlobType::BT_USM_DEVICE_INTERNAL);
} else {
return create_remote_blob<RemoteCLbuffer>(desc, l, RemoteBlobImpl::BlobType::BT_BUF_INTERNAL);
return create_remote_blob<RemoteCLbuffer>(desc, l, BlobType::BT_BUF_INTERNAL);
}
}
@ -1049,7 +1054,7 @@ std::vector<std::shared_ptr<InferenceEngine::IVariableStateInternal>> InferReque
std::vector<std::shared_ptr<InferenceEngine::IVariableStateInternal>> ret{};
ret.reserve(variables_states_.size());
for (const auto& pair : variables_states_)
ret.push_back(std::make_shared<VariableState>(pair.first, pair.second, m_graph->GetEngine(), m_curBatch));
ret.push_back(std::make_shared<VariableState>(pair.first, pair.second, m_graph->get_engine(), m_curBatch));
return ret;
}

View File

@ -10,6 +10,8 @@
#include <description_buffer.hpp>
#include "intel_gpu/plugin/infer_request_legacy.hpp"
#include "intel_gpu/plugin/remote_context.hpp"
#include "intel_gpu/plugin/remote_blob.hpp"
#include "intel_gpu/plugin/remote_allocators.hpp"
#include "intel_gpu/plugin/compiled_model.hpp"
#include "intel_gpu/runtime/itt.hpp"
#include "intel_gpu/plugin/variable_state.hpp"
@ -289,7 +291,7 @@ void InferRequestLegacy::SetBlob(const std::string& name, const Blob::Ptr& data)
bool is_nv12 = nv12_ptr != nullptr;
int expected_batch = is_batched ? desc.getDims()[0] : 1;
if (ColorFormat::NV12 == foundInput->getPreProcess().getColorFormat() &&
m_graph->getConfig().nv12_two_inputs) {
m_graph->get_config().get_property(ov::intel_gpu::nv12_two_inputs)) {
// try extracting Y and UV remote blobs from it
// and put them into appropriate network inputs
// that should then go into biplanar NV12 reorder
@ -500,7 +502,7 @@ void InferRequestLegacy::checkBlobs() {
auto node = findInputByNodeName(input.first);
bool is_dynamic = (node && node->get_output_partial_shape(0).is_dynamic());
if (!is_dynamic)
checkInputBlob(input.second, input.first, foundInput, m_graph->getConfig().nv12_two_inputs);
checkInputBlob(input.second, input.first, foundInput, m_graph->get_config().get_property(ov::intel_gpu::nv12_two_inputs));
}
for (auto const &output : _outputs) {
DataPtr foundOutput = nullptr;
@ -619,6 +621,8 @@ InferRequestLegacy::InferRequestLegacy(InputsDataMap networkInputs, OutputsDataM
: IInferRequestInternal(networkInputs, networkOutputs) {
IE_ASSERT(nullptr != execNetwork);
streamExecutor = dynamic_cast<InferenceEngine::IStreamsExecutor*>(execNetwork->m_taskExecutor.get());
m_context = std::dynamic_pointer_cast<InferenceEngine::gpu::ClContext>(execNetwork->GetContext());
OPENVINO_ASSERT(m_context != nullptr, "[GPU] Can't initialize context of InferRequestLegacy: wrong context type");
}
InferRequestLegacy::InferRequestLegacy(const std::vector<std::shared_ptr<const ov::Node>>& inputs,
@ -627,6 +631,8 @@ InferRequestLegacy::InferRequestLegacy(const std::vector<std::shared_ptr<const o
: IInferRequestInternal(inputs, outputs) {
IE_ASSERT(nullptr != execNetwork);
streamExecutor = dynamic_cast<InferenceEngine::IStreamsExecutor*>(execNetwork->m_taskExecutor.get());
m_context = std::dynamic_pointer_cast<InferenceEngine::gpu::ClContext>(execNetwork->GetContext());
OPENVINO_ASSERT(m_context != nullptr, "[GPU] Can't initialize context of InferRequestLegacy: wrong context type");
}
// ----------------------------------------------------------------------------------------- //
@ -698,7 +704,7 @@ void InferRequestLegacy::enqueue() {
FormatFromTensorDesc(blobsDesc),
tensor_from_dims(blobsDesc.getDims()));
auto mergedBlobs = std::make_shared<RemoteCLbuffer>(m_graph->GetContext(),
auto mergedBlobs = std::make_shared<RemoteCLbuffer>(m_context,
m_graph->GetNetwork()->get_stream(),
blobsDesc,
layout);
@ -914,14 +920,14 @@ Blob::Ptr InferRequestLegacy::create_host_blob(const TensorDesc& desc, std::shar
}
Blob::Ptr InferRequestLegacy::create_shared_device_blob(const InferenceEngine::TensorDesc& desc, const cldnn::layout& layout, void* usm_host_mem) {
auto blob = std::make_shared<RemoteUSMbuffer>(m_graph->GetContext(),
auto blob = std::make_shared<RemoteUSMbuffer>(m_context,
m_graph->GetNetwork()->get_stream(),
desc,
layout,
usm_host_mem,
0,
0,
RemoteBlobImpl::BlobType::BT_USM_SHARED);
BlobType::BT_USM_SHARED);
if (!blob)
IE_THROW(NotAllocated) << "Failed to allocate shared host <-> device blob";
blob->allocate();
@ -1009,7 +1015,7 @@ void InferRequestLegacy::allocate_inputs() {
const TensorDesc& desc = ni.second->getTensorDesc();
bool is_nv12_input = ColorFormat::NV12 == ni.second->getPreProcess().getColorFormat() &&
m_graph->getConfig().nv12_two_inputs;
m_graph->get_config().get_property(ov::intel_gpu::nv12_two_inputs);
auto parameter = std::find_if(_parameters.begin(), _parameters.end(), [&](const std::shared_ptr<const ov::Node>& node) {
return node->get_friendly_name() == name;
@ -1040,10 +1046,10 @@ void InferRequestLegacy::allocate_inputs() {
Blob::Ptr inputBlob = create_host_blob(desc);
_inputs[name] = inputBlob;
} else {
if (m_graph->GetEngine()->use_unified_shared_memory()) {
if (m_graph->get_engine().use_unified_shared_memory()) {
// For USM case we create host blob using custom USM host allocator
// and then create shared device blob on top of this buffer
auto host_blob = create_host_blob(desc, std::make_shared<USMHostAllocator>(m_graph->GetContext().get()));
auto host_blob = create_host_blob(desc, std::make_shared<USMHostAllocator>(m_context));
_inputs[name] = host_blob;
_deviceInputs[name] = create_shared_device_blob(desc, litr->second, host_blob->buffer().as<void*>());
} else {
@ -1103,10 +1109,10 @@ void InferRequestLegacy::allocate_outputs() {
auto device_blob = create_device_blob(device_blob_desc, output_layout);
_deviceOutputs[no.first] = device_blob;
} else {
if (m_graph->GetEngine()->use_unified_shared_memory()) {
if (m_graph->get_engine().use_unified_shared_memory()) {
// For USM case we create host blob using custom USM host allocator
// and then create shared device blob on top of this buffer
auto host_blob = create_host_blob(desc, std::make_shared<USMHostAllocator>(m_graph->GetContext().get()));
auto host_blob = create_host_blob(desc, std::make_shared<USMHostAllocator>(m_context));
_outputs[no.first] = host_blob;
_deviceOutputs[no.first] = create_shared_device_blob(desc, output_layout, host_blob->buffer().as<void*>());
} else {
@ -1183,7 +1189,7 @@ void InferRequestLegacy::prepare_input(const cldnn::primitive_id& inputName, Blo
if (!impl->is_allocated()) {
IE_THROW() << str_input_not_allocated;
}
auto inputMem = impl->getMemory();
auto inputMem = impl->get_memory();
auto input_layout = m_graph->GetInputLayouts().find(inputName);
if (input_layout != m_graph->GetInputLayouts().end()) {
@ -1241,25 +1247,25 @@ void InferRequestLegacy::prepare_output(const cldnn::primitive_id& outputName, B
if (!impl->is_allocated()) {
IE_THROW(NotAllocated) << str_output_not_allocated;
}
auto outputMem = impl->getMemory();
auto outputMem = impl->get_memory();
_nw_ptr->set_output_memory(internalName, outputMem);
}
InferenceEngine::Blob::Ptr InferRequestLegacy::create_device_blob(const InferenceEngine::TensorDesc& desc, const cldnn::layout& layout) {
if (m_graph->GetEngine()->use_unified_shared_memory()) {
auto blobPtr = std::make_shared<RemoteUSMbuffer>(m_graph->GetContext(),
if (m_graph->get_engine().use_unified_shared_memory()) {
auto blobPtr = std::make_shared<RemoteUSMbuffer>(m_context,
m_graph->GetNetwork()->get_stream(),
desc,
layout,
nullptr,
0,
0,
RemoteBlobImpl::BlobType::BT_USM_HOST_INTERNAL);
BlobType::BT_USM_HOST_INTERNAL);
getBlobImpl(blobPtr.get())->allocate();
checkAlloc(blobPtr, str_device_mem_not_allocated);
return blobPtr;
} else {
auto blobPtr = std::make_shared<RemoteCLbuffer>(m_graph->GetContext(),
auto blobPtr = std::make_shared<RemoteCLbuffer>(m_context,
m_graph->GetNetwork()->get_stream(),
desc,
layout);
@ -1273,7 +1279,7 @@ std::vector<std::shared_ptr<InferenceEngine::IVariableStateInternal>> InferReque
std::vector<std::shared_ptr<InferenceEngine::IVariableStateInternal>> ret{};
ret.reserve(variables_states_.size());
for (const auto& pair : variables_states_)
ret.push_back(std::make_shared<VariableState>(pair.first, pair.second, m_graph->GetEngine(), m_curBatch));
ret.push_back(std::make_shared<VariableState>(pair.first, pair.second, m_graph->get_engine(), m_curBatch));
return ret;
}

View File

@ -0,0 +1,272 @@
// Copyright (C) 2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include "intel_gpu/plugin/legacy_api_helper.hpp"
#include "ie_plugin_config.hpp"
#include "gpu/gpu_config.hpp"
namespace ov {
namespace intel_gpu {
bool LegacyAPIHelper::is_new_api_property(const std::pair<std::string, ov::Any>& property) {
static const std::vector<std::string> new_properties_list = {
ov::intel_gpu::hint::queue_priority.name(),
ov::intel_gpu::hint::queue_throttle.name(),
ov::hint::inference_precision.name(),
ov::compilation_num_threads.name(),
ov::num_streams.name(),
};
return std::find(new_properties_list.begin(), new_properties_list.end(), property.first) != new_properties_list.end();
}
bool LegacyAPIHelper::is_legacy_property(const std::pair<std::string, ov::Any>& property, bool is_new_api) {
static const std::vector<std::string> legacy_properties_list = {
InferenceEngine::PluginConfigParams::KEY_GPU_THROUGHPUT_STREAMS,
InferenceEngine::GPUConfigParams::KEY_GPU_MAX_NUM_THREADS,
InferenceEngine::GPUConfigParams::KEY_GPU_PLUGIN_PRIORITY,
InferenceEngine::GPUConfigParams::KEY_GPU_PLUGIN_THROTTLE,
};
static const std::vector<std::string> legacy_property_values_list = {
InferenceEngine::PluginConfigParams::KEY_MODEL_PRIORITY,
InferenceEngine::GPUConfigParams::KEY_GPU_HOST_TASK_PRIORITY,
};
bool legacy_property = std::find(legacy_properties_list.begin(), legacy_properties_list.end(), property.first) != legacy_properties_list.end();
bool need_value_conversion = !is_new_api &&
std::find(legacy_property_values_list.begin(), legacy_property_values_list.end(), property.first) != legacy_property_values_list.end();
return legacy_property || need_value_conversion;
}
ov::AnyMap LegacyAPIHelper::convert_legacy_properties(const std::map<std::string, std::string>& properties, bool is_new_api) {
return convert_legacy_properties(ov::AnyMap(properties.begin(), properties.end()), is_new_api);
}
ov::AnyMap LegacyAPIHelper::convert_legacy_properties(const ov::AnyMap& properties, bool is_new_api) {
ov::AnyMap converted_properties;
for (auto& property : properties) {
if (is_legacy_property(property, is_new_api)) {
auto new_property = convert_legacy_property(property);
converted_properties[new_property.first] = new_property.second;
} else {
converted_properties[property.first] = property.second;
}
}
return converted_properties;
}
std::pair<std::string, ov::Any> LegacyAPIHelper::convert_legacy_property(const std::pair<std::string, ov::Any>& legacy_property) {
auto legacy_name = legacy_property.first;
if (legacy_name == InferenceEngine::PluginConfigParams::KEY_GPU_THROUGHPUT_STREAMS) {
ov::Any converted_val{legacy_property.second};
auto legacy_val = legacy_property.second.as<std::string>();
if (legacy_val == InferenceEngine::PluginConfigParams::GPU_THROUGHPUT_AUTO)
converted_val = ov::streams::AUTO;
return { ov::num_streams.name(), converted_val };
} else if (legacy_name == InferenceEngine::PluginConfigParams::KEY_MODEL_PRIORITY) {
ov::Any converted_val{nullptr};
auto legacy_val = legacy_property.second.as<std::string>();
if (legacy_val == InferenceEngine::PluginConfigParams::MODEL_PRIORITY_HIGH) {
converted_val = ov::hint::Priority::HIGH;
} else if (legacy_val == InferenceEngine::PluginConfigParams::MODEL_PRIORITY_MED) {
converted_val = ov::hint::Priority::MEDIUM;
} else if (legacy_val == InferenceEngine::PluginConfigParams::MODEL_PRIORITY_LOW) {
converted_val = ov::hint::Priority::LOW;
} else {
converted_val = legacy_val;
}
return { ov::hint::model_priority.name(), converted_val };
} else if (legacy_name == InferenceEngine::GPUConfigParams::KEY_GPU_MAX_NUM_THREADS) {
return { ov::compilation_num_threads.name(), legacy_property.second };
} else if (legacy_name == InferenceEngine::GPUConfigParams::KEY_GPU_HOST_TASK_PRIORITY) {
ov::Any converted_val{nullptr};
auto legacy_val = legacy_property.second.as<std::string>();
if (legacy_val == InferenceEngine::GPUConfigParams::GPU_HOST_TASK_PRIORITY_HIGH) {
converted_val = ov::hint::Priority::HIGH;
} else if (legacy_val == InferenceEngine::GPUConfigParams::GPU_HOST_TASK_PRIORITY_MEDIUM) {
converted_val = ov::hint::Priority::MEDIUM;
} else if (legacy_val == InferenceEngine::GPUConfigParams::GPU_HOST_TASK_PRIORITY_LOW) {
converted_val = ov::hint::Priority::LOW;
} else {
converted_val = legacy_val;
}
return { ov::intel_gpu::hint::host_task_priority.name(), converted_val };
} else if (legacy_name == InferenceEngine::GPUConfigParams::KEY_GPU_PLUGIN_PRIORITY) {
ov::Any converted_val{nullptr};
auto legacy_val = legacy_property.second.as<std::string>();
if (!legacy_val.empty()) {
std::stringstream ss(legacy_val);
uint32_t uVal(0);
ss >> uVal;
OPENVINO_ASSERT(!ss.fail(), "[GPU] Unsupported property value by plugin: ", legacy_val);
switch (uVal) {
case 0:
case 2:
converted_val = ov::hint::Priority::MEDIUM;
break;
case 1:
converted_val = ov::hint::Priority::LOW;
break;
case 3:
converted_val = ov::hint::Priority::HIGH;
break;
default:
OPENVINO_ASSERT(false, "[GPU] Unsupported queue priority value ", uVal);
}
}
return { ov::intel_gpu::hint::queue_priority.name(), converted_val };
} else if (legacy_name == InferenceEngine::GPUConfigParams::KEY_GPU_PLUGIN_THROTTLE) {
ov::Any converted_val{nullptr};
auto legacy_val = legacy_property.second.as<std::string>();
if (!legacy_val.empty()) {
std::stringstream ss(legacy_val);
uint32_t uVal(0);
ss >> uVal;
OPENVINO_ASSERT(!ss.fail(), "[GPU] Unsupported property value by plugin: ", legacy_val);
switch (uVal) {
case 0:
case 2:
converted_val = ov::intel_gpu::hint::ThrottleLevel::MEDIUM;
break;
case 1:
converted_val = ov::intel_gpu::hint::ThrottleLevel::LOW;
break;
case 3:
converted_val = ov::intel_gpu::hint::ThrottleLevel::HIGH;
break;
default:
OPENVINO_ASSERT(false, "[GPU] Unsupported queue throttle value ", uVal);
}
}
return { ov::intel_gpu::hint::queue_throttle.name(), converted_val };
}
OPENVINO_ASSERT(false, "[GPU] Unhandled legacy property in convert_legacy_property method: ", legacy_property.first);
}
std::pair<std::string, ov::Any> LegacyAPIHelper::convert_to_legacy_property(const std::pair<std::string, ov::Any>& property) {
auto name = property.first;
if (name == ov::num_streams.name()) {
ov::Any legacy_val{property.second};
if (!property.second.empty()) {
if (property.second.as<ov::streams::Num>() == ov::streams::AUTO) {
legacy_val = InferenceEngine::PluginConfigParams::GPU_THROUGHPUT_AUTO;
}
}
return { InferenceEngine::PluginConfigParams::KEY_GPU_THROUGHPUT_STREAMS, legacy_val };
} else if (name == ov::hint::model_priority.name()) {
ov::Any legacy_val{nullptr};
if (!property.second.empty()) {
ov::hint::Priority val = property.second.as<ov::hint::Priority>();
switch (val) {
case ov::hint::Priority::LOW: legacy_val = InferenceEngine::PluginConfigParams::MODEL_PRIORITY_LOW; break;
case ov::hint::Priority::MEDIUM: legacy_val = InferenceEngine::PluginConfigParams::MODEL_PRIORITY_MED; break;
case ov::hint::Priority::HIGH: legacy_val = InferenceEngine::PluginConfigParams::MODEL_PRIORITY_HIGH; break;
default: OPENVINO_ASSERT(false, "[GPU] Unsupported model priority value ", val);
}
}
return { InferenceEngine::PluginConfigParams::KEY_MODEL_PRIORITY, legacy_val };
} else if (name == ov::compilation_num_threads.name()) {
return { InferenceEngine::GPUConfigParams::KEY_GPU_MAX_NUM_THREADS, property.second };
} else if (name == ov::intel_gpu::hint::host_task_priority.name()) {
ov::Any legacy_val{nullptr};
if (!property.second.empty()) {
ov::hint::Priority val = property.second.as<ov::hint::Priority>();
switch (val) {
case ov::hint::Priority::LOW: legacy_val = InferenceEngine::GPUConfigParams::GPU_HOST_TASK_PRIORITY_LOW; break;
case ov::hint::Priority::MEDIUM: legacy_val = InferenceEngine::GPUConfigParams::GPU_HOST_TASK_PRIORITY_MEDIUM; break;
case ov::hint::Priority::HIGH: legacy_val = InferenceEngine::GPUConfigParams::GPU_HOST_TASK_PRIORITY_HIGH; break;
default: OPENVINO_ASSERT(false, "[GPU] Unsupported host task priority value ", val);
}
}
return { InferenceEngine::GPUConfigParams::KEY_GPU_HOST_TASK_PRIORITY, legacy_val };
} else if (name == ov::intel_gpu::hint::queue_priority.name()) {
ov::Any legacy_val{nullptr};
if (!property.second.empty()) {
ov::hint::Priority val = property.second.as<ov::hint::Priority>();
switch (val) {
case ov::hint::Priority::LOW: legacy_val = "1"; break;
case ov::hint::Priority::MEDIUM: legacy_val = "2"; break;
case ov::hint::Priority::HIGH: legacy_val = "3"; break;
default: OPENVINO_ASSERT(false, "[GPU] Unsupported queue throttle value ", val);
}
}
return { InferenceEngine::GPUConfigParams::KEY_GPU_PLUGIN_PRIORITY, legacy_val };
} else if (name == ov::intel_gpu::hint::queue_throttle.name()) {
ov::Any legacy_val{nullptr};
if (!property.second.empty()) {
ov::intel_gpu::hint::ThrottleLevel val = property.second.as<ov::intel_gpu::hint::ThrottleLevel>();
switch (val) {
case ov::intel_gpu::hint::ThrottleLevel::LOW: legacy_val = "1"; break;
case ov::intel_gpu::hint::ThrottleLevel::MEDIUM: legacy_val = "2"; break;
case ov::intel_gpu::hint::ThrottleLevel::HIGH: legacy_val = "3"; break;
default: OPENVINO_ASSERT(false, "[GPU] Unsupported queue throttle value ", val);
}
}
return { InferenceEngine::GPUConfigParams::KEY_GPU_PLUGIN_THROTTLE, legacy_val };
}
OPENVINO_ASSERT(false, "[GPU] Unhandled legacy property in convert_to_legacy_property method: ", property.first);
}
std::vector<std::string> LegacyAPIHelper::get_supported_configs() {
static const std::vector<std::string> supported_config = {
CONFIG_KEY(MODEL_PRIORITY),
CONFIG_KEY(PERFORMANCE_HINT),
CONFIG_KEY(PERFORMANCE_HINT_NUM_REQUESTS),
CONFIG_KEY(PERF_COUNT),
CONFIG_KEY(DYN_BATCH_ENABLED),
CONFIG_KEY(CONFIG_FILE),
CONFIG_KEY(DEVICE_ID),
CONFIG_KEY(EXCLUSIVE_ASYNC_REQUESTS),
CONFIG_KEY(CACHE_DIR),
CONFIG_KEY(GPU_THROUGHPUT_STREAMS),
GPU_CONFIG_KEY(PLUGIN_PRIORITY),
GPU_CONFIG_KEY(PLUGIN_THROTTLE),
GPU_CONFIG_KEY(HOST_TASK_PRIORITY),
GPU_CONFIG_KEY(NV12_TWO_INPUTS),
GPU_CONFIG_KEY(MAX_NUM_THREADS),
GPU_CONFIG_KEY(ENABLE_LOOP_UNROLLING),
};
return supported_config;
}
std::vector<std::string> LegacyAPIHelper::get_supported_metrics(bool model_caching_enabled) {
std::vector<std::string> supported_metrics = {
METRIC_KEY(AVAILABLE_DEVICES),
METRIC_KEY(SUPPORTED_METRICS),
METRIC_KEY(FULL_DEVICE_NAME),
METRIC_KEY(OPTIMIZATION_CAPABILITIES),
METRIC_KEY(SUPPORTED_CONFIG_KEYS),
METRIC_KEY(RANGE_FOR_ASYNC_INFER_REQUESTS),
METRIC_KEY(RANGE_FOR_STREAMS),
METRIC_KEY(DEVICE_TYPE),
METRIC_KEY(DEVICE_GOPS),
METRIC_KEY(OPTIMAL_BATCH_SIZE),
METRIC_KEY(MAX_BATCH_SIZE),
GPU_METRIC_KEY(DEVICE_TOTAL_MEM_SIZE),
GPU_METRIC_KEY(UARCH_VERSION),
GPU_METRIC_KEY(EXECUTION_UNITS_COUNT),
GPU_METRIC_KEY(MEMORY_STATISTICS),
};
if (model_caching_enabled)
supported_metrics.push_back(METRIC_KEY(IMPORT_EXPORT_SUPPORT));
return supported_metrics;
}
} // namespace intel_gpu
} // namespace ov
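A brief sketch of how this helper might be called to normalize a mixed legacy configuration before it reaches ExecutionConfig; the call site is an assumption, while the helper API and the value mappings are taken from the file above.
#include <map>
#include <string>
#include "intel_gpu/plugin/legacy_api_helper.hpp"
#include "ie_plugin_config.hpp"
#include "gpu/gpu_config.hpp"
ov::AnyMap normalize_user_options() {
    std::map<std::string, std::string> user_options = {
        {InferenceEngine::PluginConfigParams::KEY_GPU_THROUGHPUT_STREAMS,
         InferenceEngine::PluginConfigParams::GPU_THROUGHPUT_AUTO},        // -> ov::num_streams(ov::streams::AUTO)
        {InferenceEngine::GPUConfigParams::KEY_GPU_PLUGIN_THROTTLE, "1"},  // -> queue_throttle(ThrottleLevel::LOW)
    };
    // Legacy keys are rewritten to 2.0 properties; anything already in the new form is passed through.
    return ov::intel_gpu::LegacyAPIHelper::convert_legacy_properties(user_options, /*is_new_api=*/false);
}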

View File

@ -38,7 +38,7 @@ static void CreateAdaptiveMaxPoolOp(Program& p, const std::shared_ptr<ngraph::op
const cldnn::layout indices_layout{cldnn::element_type_to_data_type(indices_precision),
cldnn::format::get_default_format(indices_shape.size()),
tensor_from_dims(indices_shape)};
const auto indices_memory = p.GetEngine().allocate_memory(indices_layout);
const auto indices_memory = p.get_engine().allocate_memory(indices_layout);
const cldnn::primitive_id indices_id_w = layer_type_name + "_md_write";
const cldnn::mutable_data indices_mutable_prim_w{indices_id_w, indices_memory};

View File

@ -202,8 +202,8 @@ void createClDnnConstant(Program& p, const ngraph::Shape& constDims, const std::
p.profiling_ids.push_back(initialconstPrimID);
} else {
GPU_DEBUG_LOG << "[" << initialconstPrimID << ": constant]" << std::endl;
cldnn::memory::ptr mem = p.GetEngine().allocate_memory(constLayout, false);
auto& stream = p.GetEngine().get_program_stream();
cldnn::memory::ptr mem = p.get_engine().allocate_memory(constLayout, false);
auto& stream = p.get_engine().get_service_stream();
cldnn::mem_lock<char> lock{mem, stream};
auto buf = lock.data();
auto bufSize = constLayout.bytes_count();

View File

@ -314,7 +314,7 @@ static void DeformableConvolutionImpl(Program& p,
std::vector<cldnn::primitive_id> weights = {inputs[2].pid};
// Remove weights from inputs
inputs.erase(inputs.begin() + 2);
auto device_info = p.GetEngine().get_device_info();
auto device_info = p.get_engine().get_device_info();
bool supports_subgroups = device_info.supports_khr_subgroups || device_info.supports_intel_subgroups;
if (groups == 1 && supports_subgroups) {
std::string defConvLayerNameInterp = layerName + "_interp";

View File

@ -74,7 +74,7 @@ static void CreateCommonCTCGreedyDecoderOp(Program& p, const std::shared_ptr<ngr
tensor_from_dims(op->get_output_shape(1)));
GPU_DEBUG_LOG << "[" << layer_type_name_ID(op) << ": mutable data]" << std::endl;
shared_memory.emplace_back(p.GetEngine().allocate_memory(mutableLayout));
shared_memory.emplace_back(p.get_engine().allocate_memory(mutableLayout));
cldnn::primitive_id ctc_gd_mutable_id_w = layer_type_name_ID(op) + "_md_write";
auto ctc_gd_mutable_prim = cldnn::mutable_data(ctc_gd_mutable_id_w,

View File

@ -33,7 +33,7 @@ static void CreateExperimentalDetectronDetectionOutputOp(
const cldnn::layout mutable_layout1{cldnn::element_type_to_data_type(mutable_precision1),
cldnn::format::get_default_format(output_shape1.size()),
tensor_from_dims(output_shape1)};
cldnn::memory::ptr shared_memory1{p.GetEngine().allocate_memory(mutable_layout1)};
cldnn::memory::ptr shared_memory1{p.get_engine().allocate_memory(mutable_layout1)};
const auto mutable_id_w1 = layer_type_name + "_md_write.1";
const cldnn::mutable_data mutable_prim_w{mutable_id_w1, shared_memory1};
@ -45,7 +45,7 @@ static void CreateExperimentalDetectronDetectionOutputOp(
const cldnn::layout mutable_layout2{cldnn::element_type_to_data_type(mutable_precision2),
cldnn::format::get_default_format(output_shape2.size()),
tensor_from_dims(output_shape2)};
cldnn::memory::ptr shared_memory2{p.GetEngine().allocate_memory(mutable_layout2)};
cldnn::memory::ptr shared_memory2{p.get_engine().allocate_memory(mutable_layout2)};
const auto mutable_id_w2 = layer_type_name + "_md_write.2";
const cldnn::mutable_data mutable_prim_w2{mutable_id_w2, shared_memory2};

View File

@ -33,7 +33,7 @@ static void CreateExperimentalDetectronGenerateProposalsSingleImageOp(
const cldnn::layout mutable_layout{cldnn::element_type_to_data_type(mutable_precision),
cldnn::format::get_default_format(output_shape.size()),
tensor_from_dims(output_shape)};
cldnn::memory::ptr shared_memory{p.GetEngine().allocate_memory(mutable_layout)};
cldnn::memory::ptr shared_memory{p.get_engine().allocate_memory(mutable_layout)};
const auto mutable_id_w = layer_type_name + "_md_write";
const cldnn::mutable_data mutable_prim_w{mutable_id_w, shared_memory};

View File

@ -22,7 +22,7 @@ static void CreateExperimentalDetectronROIFeatureExtractorOp(Program& p, const s
cldnn::format::get_default_format(op->get_output_shape(1).size()),
tensor_from_dims(op->get_output_shape(1)));
cldnn::memory::ptr shared_memory {p.GetEngine().allocate_memory(mutableLayout)};
cldnn::memory::ptr shared_memory {p.get_engine().allocate_memory(mutableLayout)};
cldnn::primitive_id experimental_detectron_mutable_id_w = layer_type_name_ID(op) + "_md_write";
cldnn::mutable_data experimenta_detectron_mutable_prim(experimental_detectron_mutable_id_w,

View File

@ -32,7 +32,7 @@ static void CreateGenerateProposalsIEInternalOp(
const cldnn::layout mutable_layout_1{cldnn::element_type_to_data_type(mutable_precision_1),
cldnn::format::get_default_format(output_shape_1.size()),
tensor_from_dims(output_shape_1)};
cldnn::memory::ptr shared_memory_1{p.GetEngine().allocate_memory(mutable_layout_1)};
cldnn::memory::ptr shared_memory_1{p.get_engine().allocate_memory(mutable_layout_1)};
const auto mutable_id_w_1 = layer_type_name + "_md_write.1";
const cldnn::mutable_data mutable_prim_w_1{mutable_id_w_1, shared_memory_1};
@ -45,7 +45,7 @@ static void CreateGenerateProposalsIEInternalOp(
const cldnn::layout mutable_layout_2{cldnn::element_type_to_data_type(mutable_precision_2),
cldnn::format::get_default_format(output_shape_2.size()),
tensor_from_dims(output_shape_2)};
cldnn::memory::ptr shared_memory_2{p.GetEngine().allocate_memory(mutable_layout_2)};
cldnn::memory::ptr shared_memory_2{p.get_engine().allocate_memory(mutable_layout_2)};
const auto mutable_id_w_2 = layer_type_name + "_md_write.2";
const cldnn::mutable_data mutable_prim_w_2{mutable_id_w_2, shared_memory_2};

View File

@ -29,8 +29,8 @@ namespace intel_gpu {
template<class DATA_TYPE>
static DATA_TYPE CreateScalarData(Program &p, const cldnn::primitive_id& id, int64_t num) {
auto mem = p.GetEngine().allocate_memory({ cldnn::data_types::i64, cldnn::format::bfyx, { 1, 1, 1, 1 } });
cldnn::mem_lock<int64_t> ptr{mem, p.GetEngine().get_program_stream()};
auto mem = p.get_engine().allocate_memory({ cldnn::data_types::i64, cldnn::format::bfyx, { 1, 1, 1, 1 } });
cldnn::mem_lock<int64_t> ptr{mem, p.get_engine().get_service_stream()};
*ptr.begin() = num;
return {id, mem};
}
@ -42,7 +42,7 @@ static cldnn::mutable_data CreateAdditionalOutputData(Program &p, const std::sha
const auto format = cldnn::format::get_default_format(op->get_output_shape(output_idx).size());
const auto tensor = tensor_from_dims(op->get_output_shape(output_idx));
cldnn::layout output_layout = cldnn::layout(precision, format, tensor);
auto mem = p.GetEngine().allocate_memory(output_layout);
auto mem = p.get_engine().allocate_memory(output_layout);
auto md = cldnn::mutable_data(id, {cldnn::input_info(input)}, mem); // cldnn::data cannot set dependency
return md;
}
@ -82,7 +82,7 @@ static void CreateLoopOp(Program& p, const std::shared_ptr<Loop>& op) {
}
// get body topology from ngraph function
Program body_program(body_network, p.GetEnginePtr(), p.GetConfig(), true);
Program body_program(body_network, p.get_engine(), p.get_config(), true);
auto body_topology = *body_program.GetTopology();
// setup input_primitive_maps/ output_primitive_maps and back_edges
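The hunks above (and the matching ones in the TensorIterator op further down) move helper allocations from the removed program stream to the engine's service stream. A restated sketch of that allocate-and-fill pattern follows; it mirrors CreateScalarData rather than introducing any new plugin API.
// Allocate a 1x1x1x1 i64 buffer and write a scalar into it via the engine's service stream.
static cldnn::memory::ptr make_i64_scalar(cldnn::engine& engine, int64_t value) {
    auto mem = engine.allocate_memory({cldnn::data_types::i64, cldnn::format::bfyx, {1, 1, 1, 1}});
    cldnn::mem_lock<int64_t> lock{mem, engine.get_service_stream()};
    *lock.begin() = value;
    return mem;
}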

View File

@ -34,7 +34,7 @@ void CreateNmsStaticShapeIE8Op(Program& p, const std::shared_ptr<ngraph::op::int
cldnn::format::bfyx,
cldnn::tensor(static_cast<int32_t>(outputIndices), 1, 1, 1));
shared_memory.emplace_back(p.GetEngine().allocate_memory(mutableLayoutFirst));
shared_memory.emplace_back(p.get_engine().allocate_memory(mutableLayoutFirst));
cldnn::primitive_id matrix_nms_mutable_id_w_first = layer_type_name_ID(op) + "_md_write_first";
auto matrix_nms_mutable_prim_first = cldnn::mutable_data(matrix_nms_mutable_id_w_first, shared_memory.back());
@ -46,7 +46,7 @@ void CreateNmsStaticShapeIE8Op(Program& p, const std::shared_ptr<ngraph::op::int
cldnn::format::bfyx,
cldnn::tensor(static_cast<int32_t>(batches_num), 1, 1, 1));
shared_memory.emplace_back(p.GetEngine().allocate_memory(mutableLayoutSecond));
shared_memory.emplace_back(p.get_engine().allocate_memory(mutableLayoutSecond));
cldnn::primitive_id matrix_nms_mutable_id_w_second = layer_type_name_ID(op) + "_md_write_second";
auto matrix_nms_mutable_prim_second = cldnn::mutable_data(matrix_nms_mutable_id_w_second, shared_memory.back());

View File

@ -32,7 +32,7 @@ static void CreateMulticlassNmsIEInternalOp(Program& p, const std::shared_ptr<ng
const cldnn::layout mutable_layout1{cldnn::element_type_to_data_type(mutable_precision1),
cldnn::format::get_default_format(output_shape1.size()),
tensor_from_dims(output_shape1)};
cldnn::memory::ptr shared_memory1{p.GetEngine().allocate_memory(mutable_layout1)};
cldnn::memory::ptr shared_memory1{p.get_engine().allocate_memory(mutable_layout1)};
const auto mutable_id_w1 = layer_type_name + "_md_write.1";
const cldnn::mutable_data mutable_prim_w{mutable_id_w1, shared_memory1};
@ -44,7 +44,7 @@ static void CreateMulticlassNmsIEInternalOp(Program& p, const std::shared_ptr<ng
const cldnn::layout mutable_layout2{cldnn::element_type_to_data_type(mutable_precision2),
cldnn::format::get_default_format(output_shape2.size()),
tensor_from_dims(output_shape2)};
cldnn::memory::ptr shared_memory2{p.GetEngine().allocate_memory(mutable_layout2)};
cldnn::memory::ptr shared_memory2{p.get_engine().allocate_memory(mutable_layout2)};
const auto mutable_id_w2 = layer_type_name + "_md_write.2";
const cldnn::mutable_data mutable_prim_w2{mutable_id_w2, shared_memory2};

View File

@ -105,7 +105,7 @@ static void CreateNonMaxSuppressionIEInternalOp(Program& p, const std::shared_pt
tensor_from_dims(op->get_output_shape(2)));
GPU_DEBUG_LOG << "[" << layer_type_name_ID(op) << ": mutable data]" << std::endl;
shared_memory.emplace_back(p.GetEngine().allocate_memory(mutableLayoutSecond));
shared_memory.emplace_back(p.get_engine().allocate_memory(mutableLayoutSecond));
cldnn::primitive_id non_max_supression_mutable_id_w_second = layer_type_name_ID(op) + "_md_write_second";
auto nms_mutable_prim_second = cldnn::mutable_data(non_max_supression_mutable_id_w_second,
@ -121,7 +121,7 @@ static void CreateNonMaxSuppressionIEInternalOp(Program& p, const std::shared_pt
cldnn::tensor(static_cast<int32_t>(outputIndices), 3, 1, 1));
GPU_DEBUG_LOG << "[" << layer_type_name_ID(op) << ": mutable data]" << std::endl;
shared_memory.emplace_back(p.GetEngine().allocate_memory(mutableLayoutFirst));
shared_memory.emplace_back(p.get_engine().allocate_memory(mutableLayoutFirst));
cldnn::primitive_id non_max_supression_mutable_id_w_first = layer_type_name_ID(op) + "_md_write_first";
auto nms_mutable_prim_first = cldnn::mutable_data(non_max_supression_mutable_id_w_first,

View File

@ -36,8 +36,8 @@ static void CreateNormalizeL2Op(Program& p, const std::shared_ptr<ngraph::op::v0
// We create fake scale constant and fill it with ones to keep the same behavior as current primitive
auto scale = std::make_shared<ngraph::op::v0::Constant>(op->get_output_element_type(0), ngraph::Shape{1}, std::vector<float>{1.0});
cldnn::layout constLayout = cldnn::layout(cldnn::element_type_to_data_type(op->get_output_element_type(0)), cldnn::format::bfyx, cldnn::tensor{1});
auto mem = p.GetEngine().allocate_memory(constLayout, false);
cldnn::mem_lock<int8_t> tmpPointer{mem, p.GetEngine().get_program_stream()};
auto mem = p.get_engine().allocate_memory(constLayout, false);
cldnn::mem_lock<int8_t> tmpPointer{mem, p.get_engine().get_service_stream()};
auto buf = tmpPointer.data();
auto bufSize = scale->get_output_tensor(0).size();

View File

@ -108,8 +108,8 @@ static void CreateParameterOp(Program& p, const std::shared_ptr<ngraph::op::v0::
if (bufIter != p.blobMemCache.end()) {
meanBlobID = bufIter->second;
} else {
auto mem = p.GetEngine().allocate_memory(meanBlobLayout, false);
cldnn::mem_lock<int8_t> tmpPointer{ mem, p.GetEngine().get_program_stream() };
auto mem = p.get_engine().allocate_memory(meanBlobLayout, false);
cldnn::mem_lock<int8_t> tmpPointer{ mem, p.get_engine().get_service_stream() };
auto buf = tmpPointer.data();
auto bufSize = meanBlobLayout.bytes_count();
@ -197,7 +197,7 @@ static void CreateParameterOp(Program& p, const std::shared_ptr<ngraph::op::v0::
p.inputLayouts.insert({ inputInfo->name(), networkInputLayout });
p.add_primitive(*op, cldnn::input_layout(inputName, networkInputLayout));
} else {
if (ColorFormat::NV12 == preProcess.getColorFormat() && p.GetConfig().nv12_two_inputs) {
if (ColorFormat::NV12 == preProcess.getColorFormat() && p.get_config().get_property(ov::intel_gpu::nv12_two_inputs)) {
// for NV12, create two input layouts with a reorder instead of one,
// and then expect a compound blob in the InferRequest
if (InferenceEngine::Layout::NCHW != l &&

View File

@ -90,7 +90,7 @@ static void CreateMaxPoolOp(Program& p, const std::shared_ptr<ngraph::op::v8::Ma
cldnn::layout mutableLayout = cldnn::layout(cldnn::element_type_to_data_type(mutable_precision),
cldnn::format::get_default_format(output_shape.size()),
tensor_from_dims(output_shape));
const auto shared_memory = p.GetEngine().allocate_memory(mutableLayout);
const auto shared_memory = p.get_engine().allocate_memory(mutableLayout);
const cldnn::primitive_id maxpool_mutable_id_w = layer_type_name + "_md_write";
auto indices_mutable_prim = cldnn::mutable_data(maxpool_mutable_id_w,
shared_memory);

View File

@ -65,7 +65,7 @@ static void CreateProposalOp(Program& p, const std::shared_ptr<ngraph::op::v0::P
tensor_from_dims(op->get_output_shape(1)));
GPU_DEBUG_LOG << "[" << layer_type_name_ID(op) << ": mutable data]" << std::endl;
auto shared_memory = p.GetEngine().allocate_memory(mutableLayout);
auto shared_memory = p.get_engine().allocate_memory(mutableLayout);
cldnn::primitive_id proposal_mutable_id_w = layer_type_name_ID(op) + "_md_write";
auto argmax_mutable_prim = cldnn::mutable_data(proposal_mutable_id_w,

View File

@ -28,8 +28,8 @@ namespace intel_gpu {
template<class DATA_TYPE>
static DATA_TYPE CreateScalarData(Program &p, const cldnn::primitive_id& id, int64_t num) {
auto mem = p.GetEngine().allocate_memory({ cldnn::data_types::i64, cldnn::format::bfyx, { 1, 1, 1, 1 } });
cldnn::mem_lock<int64_t> ptr{mem, p.GetEngine().get_program_stream()};
auto mem = p.get_engine().allocate_memory({ cldnn::data_types::i64, cldnn::format::bfyx, { 1, 1, 1, 1 } });
cldnn::mem_lock<int64_t> ptr{mem, p.get_engine().get_service_stream()};
*ptr.begin() = num;
return {id, mem};
}
@ -41,7 +41,7 @@ static cldnn::mutable_data CreateAdditionalOutputData(Program &p, const std::sha
const auto format = cldnn::format::get_default_format(op->get_output_shape(output_idx).size());
const auto tensor = tensor_from_dims(op->get_output_shape(output_idx));
cldnn::layout output_layout = cldnn::layout(precision, format, tensor);
auto mem = p.GetEngine().allocate_memory(output_layout);
auto mem = p.get_engine().allocate_memory(output_layout);
auto md = cldnn::mutable_data(id, {cldnn::input_info(input)}, mem); // cldnn::data cannot set dependency
return md;
}
@ -51,7 +51,7 @@ static void CreateTensorIteratorOp(Program &p, const std::shared_ptr<TensorItera
// get body topology from ngraph function
InferenceEngine::CNNNetwork body_network(op->get_body());
Program body_program(body_network, p.GetEnginePtr(), p.GetConfig(), true);
Program body_program(body_network, p.get_engine(), p.get_config(), true);
auto body_topology = *body_program.GetTopology();
// setup input_primitive_maps/ output_primitive_maps and back_edges

View File

@ -66,7 +66,7 @@ static void CreateTopKOp(Program& p, const std::shared_ptr<ngraph::op::v1::TopK>
tensor_from_dims(op->get_output_shape(1)));
GPU_DEBUG_LOG << "[" << layer_type_name_ID(op) << ": mutable data]" << std::endl;
auto shared_memory = p.GetEngine().allocate_memory(mutableLayout);
auto shared_memory = p.get_engine().allocate_memory(mutableLayout);
cldnn::primitive_id argmax_mutable_id_w = layer_type_name_ID(op) + "_md_write";
auto argmax_mutable_prim = cldnn::mutable_data(argmax_mutable_id_w,

File diff suppressed because it is too large

View File

@ -16,6 +16,10 @@
#include "intel_gpu/primitives/mutable_data.hpp"
#include "intel_gpu/primitives/data.hpp"
#ifdef __linux__
# include <dlfcn.h>
#endif
using namespace InferenceEngine;
using namespace InferenceEngine::details;
@ -121,7 +125,7 @@ bool Program::IsDynBatchModel(const std::shared_ptr<ov::Model>& model,
return dyn_shape_batch_found;
}
Program::Program(InferenceEngine::CNNNetwork& network, std::shared_ptr<cldnn::engine> engine, const Config& config,
Program::Program(InferenceEngine::CNNNetwork& network, cldnn::engine& engine, const ExecutionConfig& config,
bool createTopologyOnly, bool partialBuild)
: m_curBatch(-1)
, m_config(config)
@ -136,30 +140,60 @@ Program::Program(InferenceEngine::CNNNetwork& network, std::shared_ptr<cldnn::en
IE_THROW() << "Function pointer inside CNNNetwork is nullptr";
}
// locate global custom kernel config
// and auto-load kernels from it
#ifdef _WIN32
CHAR mpath[MAX_PATH + 1];
HMODULE nModule;
GetModuleHandleEx(GET_MODULE_HANDLE_EX_FLAG_FROM_ADDRESS | GET_MODULE_HANDLE_EX_FLAG_UNCHANGED_REFCOUNT,
(LPCSTR)CustomLayer::LoadFromFile,
&nModule);
GetModuleFileName(nModule, mpath, sizeof(mpath));
#elif __linux__
Dl_info dl_info;
dladdr(reinterpret_cast<void *>(CustomLayer::LoadFromFile), &dl_info);
const char* mpath = dl_info.dli_fname;
#endif
std::string configFile(mpath);
std::size_t dir_split_pos = configFile.find_last_of("/\\");
std::string config_path;
if (dir_split_pos != std::string::npos) {
// path contains directory
config_path = configFile.substr(0, dir_split_pos);
}
config_path += "/cldnn_global_custom_kernels/cldnn_global_custom_kernels.xml";
CustomLayer::LoadFromFile(config_path, m_custom_layers, true);
auto custom_layers_config = m_config.get_property(ov::intel_gpu::config_file);
CustomLayer::LoadFromFile(custom_layers_config, m_custom_layers, custom_layers_config.empty());
auto ops = func->get_ordered_ops();
bool dyn_shape_batch_found = false;
std::map<std::string, ngraph::PartialShape> shapes;
std::map<std::string, std::pair<int64_t, int64_t>> batch_dim;
if (m_config.enableDynamicBatch) {
auto enable_dynamic_batch = m_config.get_property(ov::intel_gpu::enable_dynamic_batch);
if (enable_dynamic_batch) {
m_config.set_property(ov::intel_gpu::max_dynamic_batch(network.getBatchSize()));
// in case of legacy dynamic batch,
// we assume 4D input with 0 batch dim
auto param = func->get_parameters().front();
auto pname = getParamName(param);
shapes[pname] = param->get_output_partial_shape(0);
batch_dim[pname].first = 0;
batch_dim[pname].second = m_config.max_dynamic_batch;
batch_dim[pname].second = m_config.get_property(ov::intel_gpu::max_dynamic_batch);
} else {
dyn_shape_batch_found = IsDynBatchModel(func, shapes, batch_dim);
if (dyn_shape_batch_found) {
m_config.max_dynamic_batch = batch_dim.begin()->second.second;
m_config.set_property(ov::intel_gpu::max_dynamic_batch(batch_dim.begin()->second.second));
}
}
int m_bv_sz = GetMaxBatchSizeForSingleProgram();
m_max_batch = m_config.max_dynamic_batch;
m_max_batch = m_config.get_property(ov::intel_gpu::max_dynamic_batch);
if (dyn_shape_batch_found || config.max_dynamic_batch > 1) {
if (dyn_shape_batch_found || m_max_batch > 1) {
// compile log2 networks to serve dynamic batch requests
for (int b = m_bv_sz - 1; b >= 0; b--) {
inputLayouts.clear();
@ -188,8 +222,8 @@ Program::Program(InferenceEngine::CNNNetwork& network, std::shared_ptr<cldnn::en
}
new_func->reshape(new_shapes);
{
auto deviceInfo = engine->get_device_info();
TransformationsPipeline transformations(config, deviceInfo);
auto deviceInfo = engine.get_device_info();
TransformationsPipeline transformations(m_config, deviceInfo);
transformations.apply(new_func);
}
@ -275,9 +309,10 @@ Program::Program(InferenceEngine::CNNNetwork& network, std::shared_ptr<cldnn::en
}
int Program::GetMaxBatchSizeForSingleProgram() {
if (m_config.max_dynamic_batch > 1) {
auto max_dynamic_batch = m_config.get_property(ov::intel_gpu::max_dynamic_batch);
if (max_dynamic_batch > 1) {
// calculate number of networks necessary based on binary log
unsigned int tmp = m_config.max_dynamic_batch;
unsigned int tmp = max_dynamic_batch;
unsigned int mask = 1U << 31;
unsigned int ldigit = 31;
@ -324,7 +359,6 @@ std::shared_ptr<cldnn::program> Program::BuildProgram(const std::vector<std::sha
InferenceEngine::OutputsDataMap networkOutputs,
bool createTopologyOnly, bool partialBuild) {
OV_ITT_SCOPED_TASK(itt::domains::intel_gpu_plugin, "Program::BuildProgram");
cldnn::build_options options;
for (const auto& op : ops) {
if (op->is_dynamic()) {
@ -333,11 +367,10 @@ std::shared_ptr<cldnn::program> Program::BuildProgram(const std::vector<std::sha
}
}
options.set_option(cldnn::build_option::allow_new_shape_infer(allow_new_shape_infer));
options.set_option(cldnn::build_option::optimize_data(true));
if (partialBuild) {
options.set_option(cldnn::build_option::partial_build_program(true));
}
m_config.set_property(ov::intel_gpu::partial_build_program(partialBuild));
m_config.set_property(ov::intel_gpu::optimize_data(true));
m_config.set_property(ov::intel_gpu::allow_new_shape_infer(allow_new_shape_infer));
PrepareBuild(networkInputs, networkOutputs);
{
GPU_DEBUG_DEFINE_MEM_LOGGER("CreateSingleLayerPrimitives");
@ -351,7 +384,7 @@ std::shared_ptr<cldnn::program> Program::BuildProgram(const std::vector<std::sha
OV_ITT_SCOPED_TASK(itt::domains::intel_gpu_plugin, "Program::CreateProgram");
cldnn::program::ptr program;
try {
program = cldnn::program::build_program(*m_engine, *m_topology, options);
program = cldnn::program::build_program(m_engine, *m_topology, m_config);
} catch (std::exception& e) {
IE_THROW() << "cldnn program build failed! " << e.what();
}
@ -396,8 +429,8 @@ void Program::CreateSingleLayerPrimitive(cldnn::topology& topology, const std::s
bool is_created = false;
const ngraph::NodeTypeInfo* op_type_info = &op->get_type_info();
while (op_type_info != nullptr) {
auto customLayer = m_config.customLayers.find(op->get_type_name());
if (customLayer != m_config.customLayers.end()) {
auto customLayer = m_custom_layers.find(op->get_type_name());
if (customLayer != m_custom_layers.end()) {
CreateCustomOp(*this, op, customLayer->second);
return;
}
@ -488,7 +521,7 @@ void Program::add_primitive(const ngraph::Node& op, std::shared_ptr<cldnn::primi
prim->origin_op_type_name = prim->type_string();
}
if (this->m_config.useProfiling && should_profile) {
if (this->m_config.get_property(ov::enable_profiling) && should_profile) {
profiling_ids.push_back(prim_id);
init_profile_info(*prim);
}
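
The hunks above replace cldnn::build_options with properties set directly on the ExecutionConfig that is later handed to build_program. A minimal sketch of the resulting flow, assuming an already created engine and topology (the helper name and the availability of the graph headers are illustrative assumptions, not part of this commit):

#include "intel_gpu/runtime/engine.hpp"
#include "intel_gpu/runtime/execution_config.hpp"
// headers declaring cldnn::topology / cldnn::program are assumed to be available as well

cldnn::program::ptr build_with_config(cldnn::engine& engine, cldnn::topology& topology) {
    ov::intel_gpu::ExecutionConfig config;                              // starts from the set_default() values
    config.set_property(ov::intel_gpu::optimize_data(true));           // formerly cldnn::build_option::optimize_data
    config.set_property(ov::intel_gpu::allow_new_shape_infer(false));  // formerly cldnn::build_option::allow_new_shape_infer
    config.set_property(ov::intel_gpu::partial_build_program(false));  // formerly cldnn::build_option::partial_build_program
    return cldnn::program::build_program(engine, topology, config);    // config replaces the old build_options argument
}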

View File

@ -0,0 +1,68 @@
// Copyright (C) 2018-2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include <memory>
#include "intel_gpu/plugin/remote_allocators.hpp"
#include "intel_gpu/plugin/remote_blob.hpp"
using namespace InferenceEngine;
using namespace InferenceEngine::gpu;
using namespace InferenceEngine::details;
namespace ov {
namespace intel_gpu {
void RemoteAllocator::regLockedBlob(void* handle, const RemoteBlobImpl* blob) {
std::lock_guard<RemoteAllocator> locker(*this);
auto iter = m_lockedBlobs.find(handle);
if (iter == m_lockedBlobs.end()) {
m_lockedBlobs.emplace(handle, blob);
}
}
void RemoteAllocator::unlock(void* handle) noexcept {
std::lock_guard<RemoteAllocator> locker(*this);
auto iter = m_lockedBlobs.find(handle);
if (iter != m_lockedBlobs.end()) {
iter->second->unlock();
m_lockedBlobs.erase(iter);
}
}
void* USMHostAllocator::lock(void* handle, InferenceEngine::LockOp) noexcept {
if (!_usm_host_blob)
return nullptr;
try {
return _usm_host_blob->get();
} catch (...) {
return nullptr;
}
};
void USMHostAllocator::unlock(void* handle) noexcept {}
void* USMHostAllocator::alloc(size_t size) noexcept {
try {
auto td = TensorDesc(Precision::U8, SizeVector{size}, InferenceEngine::Layout::C);
ParamMap params = {{GPU_PARAM_KEY(SHARED_MEM_TYPE), GPU_PARAM_VALUE(USM_HOST_BUFFER)}};
_usm_host_blob = std::dynamic_pointer_cast<USMBlob>(_context->CreateBlob(td, params));
_usm_host_blob->allocate();
if (!getBlobImpl(_usm_host_blob.get())->is_allocated()) {
return nullptr;
}
return _usm_host_blob->get();
} catch (...) {
return nullptr;
}
}
bool USMHostAllocator::free(void* handle) noexcept {
try {
_usm_host_blob = nullptr;
} catch(...) { }
return true;
}
} // namespace intel_gpu
} // namespace ov
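
For context, the allocator above is what backs host blobs created through the remote context. A hedged usage sketch; `public_context` and `desc` are placeholders supplied by the caller, and the blob-factory include is assumed to be present:

using namespace InferenceEngine;

// public_context: gpu::ClContext::Ptr, desc: TensorDesc describing the tensor to allocate
Blob::Ptr make_usm_host_blob(gpu::ClContext::Ptr public_context, const TensorDesc& desc) {
    auto allocator = std::make_shared<ov::intel_gpu::USMHostAllocator>(public_context);
    auto blob = make_blob_with_precision(desc, allocator);  // alloc() above requests a USM_HOST_BUFFER from the context
    blob->allocate();
    return blob;
}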

View File

@ -0,0 +1,285 @@
// Copyright (C) 2018-2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include <memory>
#include "intel_gpu/plugin/remote_context.hpp"
#include "intel_gpu/plugin/remote_blob.hpp"
#include "intel_gpu/plugin/remote_allocators.hpp"
#include "intel_gpu/plugin/plugin.hpp"
#include "intel_gpu/runtime/itt.hpp"
#include "intel_gpu/runtime/device_query.hpp"
using namespace InferenceEngine;
using namespace InferenceEngine::gpu;
using namespace InferenceEngine::details;
namespace ov {
namespace intel_gpu {
RemoteBlobImpl::RemoteBlobImpl(InferenceEngine::gpu::ClContext::Ptr context,
cldnn::stream& stream,
const cldnn::layout& layout,
cldnn::shared_handle mem,
cldnn::shared_surface surf,
uint32_t plane,
BlobType mem_type)
: m_allocator(std::make_shared<RemoteAllocator>())
, m_context(context)
, m_stream(stream)
, m_mem(mem)
, m_surf(surf)
, m_plane(plane)
, m_layout(layout)
, m_mem_type(mem_type)
, m_memory_object(nullptr)
, lockedCounter(0)
, lockedHolder(nullptr)
, _handle(nullptr) {
if (supports_caching()) {
m_hash = cldnn::hash_combine(0, m_mem);
m_hash = cldnn::hash_combine(m_hash, m_surf);
m_hash = cldnn::hash_combine(m_hash, plane);
m_hash = cldnn::hash_combine(m_hash, static_cast<std::underlying_type<cldnn::format::type>::type>(layout.format));
m_hash = cldnn::hash_combine(m_hash, static_cast<std::underlying_type<cldnn::data_types>::type>(layout.data_type));
for (auto& d : layout.get_shape()) {
m_hash = cldnn::hash_combine(m_hash, d);
}
}
}
AnyMap RemoteBlobImpl::getParams() const {
OPENVINO_ASSERT(is_allocated(), "[GPU] Can't get RemoteBlob params as blob wasn't allocated properly");
auto params = m_memory_object->get_internal_params();
switch (m_mem_type) {
case BlobType::BT_BUF_INTERNAL:
case BlobType::BT_BUF_SHARED:
return{
{ GPU_PARAM_KEY(SHARED_MEM_TYPE), GPU_PARAM_VALUE(OCL_BUFFER) },
{ GPU_PARAM_KEY(OCL_CONTEXT), params.context },
{ GPU_PARAM_KEY(MEM_HANDLE), params.mem }
};
case BlobType::BT_USM_SHARED:
return{
{ GPU_PARAM_KEY(SHARED_MEM_TYPE), GPU_PARAM_VALUE(USM_USER_BUFFER) },
{ GPU_PARAM_KEY(OCL_CONTEXT), params.context },
{ GPU_PARAM_KEY(MEM_HANDLE), params.mem }
};
case BlobType::BT_USM_HOST_INTERNAL:
return{
{ GPU_PARAM_KEY(SHARED_MEM_TYPE), GPU_PARAM_VALUE(USM_HOST_BUFFER) },
{ GPU_PARAM_KEY(OCL_CONTEXT), params.context },
{ GPU_PARAM_KEY(MEM_HANDLE), params.mem }
};
case BlobType::BT_USM_DEVICE_INTERNAL:
return{
{ GPU_PARAM_KEY(SHARED_MEM_TYPE), GPU_PARAM_VALUE(USM_DEVICE_BUFFER) },
{ GPU_PARAM_KEY(OCL_CONTEXT), params.context },
{ GPU_PARAM_KEY(MEM_HANDLE), params.mem }
};
#ifdef _WIN32
case BlobType::BT_DX_BUF_SHARED:
return{
{ GPU_PARAM_KEY(SHARED_MEM_TYPE), GPU_PARAM_VALUE(DX_BUFFER) },
{ GPU_PARAM_KEY(OCL_CONTEXT), params.context },
{ GPU_PARAM_KEY(VA_DEVICE), params.user_device },
{ GPU_PARAM_KEY(MEM_HANDLE), params.mem },
{ GPU_PARAM_KEY(DEV_OBJECT_HANDLE), params.surface }
};
#endif
case BlobType::BT_IMG_SHARED:
return{
{ GPU_PARAM_KEY(SHARED_MEM_TYPE), GPU_PARAM_VALUE(OCL_IMAGE2D) },
{ GPU_PARAM_KEY(OCL_CONTEXT), params.context },
{ GPU_PARAM_KEY(MEM_HANDLE), params.mem }
};
case BlobType::BT_SURF_SHARED:
return{
{ GPU_PARAM_KEY(SHARED_MEM_TYPE), GPU_PARAM_VALUE(VA_SURFACE) },
{ GPU_PARAM_KEY(OCL_CONTEXT), params.context },
{ GPU_PARAM_KEY(VA_DEVICE), params.user_device },
{ GPU_PARAM_KEY(MEM_HANDLE), params.mem },
{ GPU_PARAM_KEY(DEV_OBJECT_HANDLE), params.surface },
{ GPU_PARAM_KEY(VA_PLANE), params.plane }
};
default:
IE_THROW() << "Unsupported shared object type " << static_cast<int>(m_mem_type);
}
}
bool RemoteBlobImpl::deallocate() noexcept {
m_memory_object.reset();
return m_memory_object == nullptr;
}
bool RemoteBlobImpl::is_allocated() const noexcept {
return m_memory_object != nullptr;
}
bool RemoteBlobImpl::is_locked() const noexcept {
return lockedHolder != nullptr;
}
void RemoteBlobImpl::allocate() {
OV_ITT_SCOPED_TASK(itt::domains::intel_gpu_plugin, "RemoteBlobImpl::Allocate");
auto context = get_context_impl(m_context);
auto enable_caching = supports_caching();
if (enable_caching) {
m_memory_object = context->try_get_cached_memory(m_hash);
if (m_memory_object)
return;
}
auto& engine = context->get_engine();
switch (m_mem_type) {
case BlobType::BT_BUF_INTERNAL: {
m_memory_object = engine.allocate_memory(m_layout, cldnn::allocation_type::cl_mem);
break;
}
case BlobType::BT_USM_HOST_INTERNAL: {
m_memory_object = engine.allocate_memory(m_layout, cldnn::allocation_type::usm_host);
break;
}
case BlobType::BT_USM_DEVICE_INTERNAL: {
m_memory_object = engine.allocate_memory(m_layout, cldnn::allocation_type::usm_device);
break;
}
case BlobType::BT_BUF_SHARED: {
m_memory_object = engine.share_buffer(m_layout, m_mem);
break;
}
case BlobType::BT_USM_SHARED: {
m_memory_object = engine.share_usm(m_layout, m_mem);
break;
}
#ifdef _WIN32
case BlobType::BT_SURF_SHARED: {
m_memory_object = engine.share_surface(m_layout, m_mem, m_plane);
break;
}
case BlobType::BT_DX_BUF_SHARED: {
m_memory_object = engine.share_dx_buffer(m_layout, m_mem);
break;
}
#else
case BlobType::BT_SURF_SHARED: {
m_memory_object = engine.share_surface(m_layout, m_surf, m_plane);
break;
}
#endif
case BlobType::BT_IMG_SHARED: {
m_memory_object = engine.share_image(m_layout, m_mem);
break;
}
default:
m_memory_object.reset();
}
if (enable_caching)
context->add_to_cache(m_hash, m_memory_object);
}
const std::shared_ptr<IAllocator>& RemoteBlobImpl::getAllocator() const noexcept {
return m_allocator;
};
std::string RemoteBlobImpl::getDeviceName() const noexcept {
return m_context->getDeviceName();
};
std::shared_ptr<InferenceEngine::RemoteContext> RemoteBlobImpl::getContext() const noexcept {
return m_context;
}
void RemoteBlobImpl::reinterpret(cldnn::layout new_layout) {
OPENVINO_ASSERT(m_layout.bytes_count() >= new_layout.bytes_count(),
"[GPU] Can't reinterpret blob to the size bigger than allocated memory buffer");
m_layout = new_layout;
auto engine = m_memory_object->get_engine();
m_memory_object = engine->reinterpret_buffer(*m_memory_object, new_layout);
}
void RemoteBlobImpl::lock() const {
if (!is_allocated()) {
IE_THROW(NotAllocated) << "[GPU] Remote blob can't be locked as it's not allocated";
}
std::lock_guard<std::mutex> locker(lockedMutex);
if (lockedCounter == 0) {
lockedHolder = std::unique_ptr<cldnn::mem_lock<uint8_t>>(new cldnn::mem_lock<uint8_t>(m_memory_object, m_stream));
auto ptr = lockedHolder->data();
_handle = reinterpret_cast<void*>(ptr);
auto casted_allocator = std::dynamic_pointer_cast<RemoteAllocator>(m_allocator);
OPENVINO_ASSERT(casted_allocator, "[GPU] Invalid remote allocator type");
casted_allocator->regLockedBlob(_handle, this);
}
lockedCounter++;
}
void RemoteBlobImpl::unlock() const {
std::lock_guard<std::mutex> locker(lockedMutex);
lockedCounter--;
if (lockedCounter == 0)
lockedHolder.reset();
}
LockedMemory<void> RemoteBlobImpl::buffer() noexcept {
try {
lock();
return LockedMemory<void>(m_allocator.get(), _handle, 0);
} catch (...) {
return LockedMemory<void>(nullptr, nullptr, 0);
}
}
LockedMemory<const void> RemoteBlobImpl::cbuffer() const noexcept {
try {
lock();
return LockedMemory<const void>(m_allocator.get(), _handle, 0);
} catch (...) {
return LockedMemory<const void>(nullptr, nullptr, 0);
}
}
LockedMemory<void> RemoteBlobImpl::rwmap() noexcept {
try {
lock();
return LockedMemory<void>(m_allocator.get(), _handle, 0);
} catch (...) {
return LockedMemory<void>(nullptr, nullptr, 0);
}
}
LockedMemory<const void> RemoteBlobImpl::rmap() const noexcept {
try {
lock();
return LockedMemory<const void>(m_allocator.get(), _handle, 0);
} catch (...) {
return LockedMemory<const void>(nullptr, nullptr, 0);
}
}
LockedMemory<void> RemoteBlobImpl::wmap() noexcept {
try {
lock();
return LockedMemory<void>(m_allocator.get(), _handle, 0);
} catch (...) {
return LockedMemory<void>(nullptr, nullptr, 0);
}
}
bool RemoteBlobImpl::supports_caching() const {
return m_mem_type == BlobType::BT_BUF_SHARED ||
m_mem_type == BlobType::BT_USM_SHARED ||
m_mem_type == BlobType::BT_IMG_SHARED ||
m_mem_type == BlobType::BT_SURF_SHARED ||
m_mem_type == BlobType::BT_DX_BUF_SHARED;
}
} // namespace intel_gpu
} // namespace ov
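
The allocate() path above cooperates with the per-context memory cache. The handshake roughly follows the sketch below, assuming a RemoteContextImpl pointer `ctx` plus `hash`, `layout` and shared `handle` values computed as in the constructor (names are placeholders):

cldnn::memory::ptr mem = ctx->try_get_cached_memory(hash);   // reuse a previously imported object if parameters match
if (!mem) {
    mem = ctx->get_engine().share_buffer(layout, handle);    // import the user-provided buffer into the engine
    ctx->add_to_cache(hash, mem);                            // the next blob with the same hash will pick it up
}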

View File

@ -4,7 +4,8 @@
#include <memory>
#include "intel_gpu/plugin/remote_context.hpp"
#include "intel_gpu/plugin/plugin.hpp"
#include "intel_gpu/plugin/remote_blob.hpp"
#include "intel_gpu/plugin/remote_allocators.hpp"
#include "intel_gpu/runtime/itt.hpp"
#include "intel_gpu/runtime/device_query.hpp"
@ -14,285 +15,28 @@ using namespace InferenceEngine::details;
namespace ov {
namespace intel_gpu {
RemoteAllocator RemoteBlobImpl::m_allocator;
RemoteBlobImpl::RemoteBlobImpl(ClContext::Ptr context,
cldnn::stream& stream,
const cldnn::layout& layout,
cldnn::shared_handle mem,
cldnn::shared_surface surf,
uint32_t plane,
BlobType mem_type)
: m_context(context)
, m_stream(stream)
, m_mem(mem)
, m_surf(surf)
, m_plane(plane)
, m_layout(layout)
, m_mem_type(mem_type)
, m_memObject(nullptr)
, lockedCounter(0)
, lockedHolder(nullptr)
, _handle(nullptr)
, _allocator(nullptr) {
auto _impl = getContextImpl(m_context.lock());
m_engine = _impl->GetEngine();
// Verify shared buffer/usm memory and ensure that requested byte size is not greater than allocated one
switch (m_mem_type) {
case BlobType::BT_BUF_SHARED: {
m_engine->share_buffer(m_layout, m_mem);
break;
}
case BlobType::BT_USM_SHARED: {
m_engine->share_usm(m_layout, m_mem);
break;
}
default: break;
}
}
AnyMap RemoteBlobImpl::getParams() const {
assert(m_memObject != nullptr);
auto params = m_memObject->get_internal_params();
switch (m_mem_type) {
case BT_BUF_INTERNAL:
case BT_BUF_SHARED:
return{
{ GPU_PARAM_KEY(SHARED_MEM_TYPE), GPU_PARAM_VALUE(OCL_BUFFER) },
{ GPU_PARAM_KEY(OCL_CONTEXT), params.context },
{ GPU_PARAM_KEY(MEM_HANDLE), params.mem }
};
case BT_USM_SHARED:
return{
{ GPU_PARAM_KEY(SHARED_MEM_TYPE), GPU_PARAM_VALUE(USM_USER_BUFFER) },
{ GPU_PARAM_KEY(OCL_CONTEXT), params.context },
{ GPU_PARAM_KEY(MEM_HANDLE), params.mem }
};
case BT_USM_HOST_INTERNAL:
return{
{ GPU_PARAM_KEY(SHARED_MEM_TYPE), GPU_PARAM_VALUE(USM_HOST_BUFFER) },
{ GPU_PARAM_KEY(OCL_CONTEXT), params.context },
{ GPU_PARAM_KEY(MEM_HANDLE), params.mem }
};
case BT_USM_DEVICE_INTERNAL:
return{
{ GPU_PARAM_KEY(SHARED_MEM_TYPE), GPU_PARAM_VALUE(USM_DEVICE_BUFFER) },
{ GPU_PARAM_KEY(OCL_CONTEXT), params.context },
{ GPU_PARAM_KEY(MEM_HANDLE), params.mem }
};
#ifdef _WIN32
case BT_DX_BUF_SHARED:
return{
{ GPU_PARAM_KEY(SHARED_MEM_TYPE), GPU_PARAM_VALUE(DX_BUFFER) },
{ GPU_PARAM_KEY(OCL_CONTEXT), params.context },
{ GPU_PARAM_KEY(VA_DEVICE), params.user_device },
{ GPU_PARAM_KEY(MEM_HANDLE), params.mem },
{ GPU_PARAM_KEY(DEV_OBJECT_HANDLE), params.surface }
};
#endif
case BT_IMG_SHARED:
return{
{ GPU_PARAM_KEY(SHARED_MEM_TYPE), GPU_PARAM_VALUE(OCL_IMAGE2D) },
{ GPU_PARAM_KEY(OCL_CONTEXT), params.context },
{ GPU_PARAM_KEY(MEM_HANDLE), params.mem }
};
case BT_SURF_SHARED:
return{
{ GPU_PARAM_KEY(SHARED_MEM_TYPE), GPU_PARAM_VALUE(VA_SURFACE) },
{ GPU_PARAM_KEY(OCL_CONTEXT), params.context },
{ GPU_PARAM_KEY(VA_DEVICE), params.user_device },
{ GPU_PARAM_KEY(MEM_HANDLE), params.mem },
{ GPU_PARAM_KEY(DEV_OBJECT_HANDLE), params.surface },
{ GPU_PARAM_KEY(VA_PLANE), params.plane }
};
default:
IE_THROW() << "Unsupported shared object type " << m_mem_type;
}
}
bool RemoteBlobImpl::deallocate() noexcept {
m_memObject.reset();
return m_memObject == nullptr;
}
bool RemoteBlobImpl::is_allocated() const noexcept {
return m_memObject != nullptr;
}
bool RemoteBlobImpl::is_locked() const noexcept {
return lockedHolder != nullptr;
}
void RemoteBlobImpl::allocate() {
OV_ITT_SCOPED_TASK(itt::domains::intel_gpu_plugin, "RemoteBlobImpl::Allocate");
assert(m_memObject == nullptr);
auto _impl = getContextImpl(m_context.lock());
std::lock_guard<ExecutionContextImpl> locker(*_impl);
switch (m_mem_type) {
case BlobType::BT_BUF_INTERNAL: {
m_memObject = m_engine->allocate_memory(m_layout, cldnn::allocation_type::cl_mem);
break;
}
case BlobType::BT_USM_HOST_INTERNAL: {
m_memObject = m_engine->allocate_memory(m_layout, cldnn::allocation_type::usm_host);
break;
}
case BlobType::BT_USM_DEVICE_INTERNAL: {
m_memObject = m_engine->allocate_memory(m_layout, cldnn::allocation_type::usm_device);
break;
}
case BlobType::BT_BUF_SHARED: {
m_memObject = m_engine->share_buffer(m_layout, m_mem);
break;
}
case BlobType::BT_USM_SHARED: {
m_memObject = m_engine->share_usm(m_layout, m_mem);
break;
}
#ifdef _WIN32
case BlobType::BT_SURF_SHARED: {
m_memObject = m_engine->share_surface(m_layout, m_mem, m_plane);
break;
}
case BlobType::BT_DX_BUF_SHARED: {
m_memObject = m_engine->share_dx_buffer(m_layout, m_mem);
break;
}
#else
case BlobType::BT_SURF_SHARED: {
m_memObject = m_engine->share_surface(m_layout, m_surf, m_plane);
break;
}
#endif
case BlobType::BT_IMG_SHARED: {
m_memObject = m_engine->share_image(m_layout, m_mem);
break;
}
default:
m_memObject.reset();
}
}
const std::shared_ptr<IAllocator>& RemoteBlobImpl::getAllocator() const noexcept {
if (!_allocator) {
_allocator = std::shared_ptr<IAllocator>(&m_allocator, [] (IAllocator*) {});
}
return _allocator;
};
std::string RemoteBlobImpl::getDeviceName() const noexcept {
return getContextImpl(m_context.lock())->getDeviceName();
};
std::shared_ptr<InferenceEngine::RemoteContext> RemoteBlobImpl::getContext() const noexcept {
return m_context.lock();
}
void RemoteBlobImpl::reinterpret(cldnn::layout new_layout) {
OPENVINO_ASSERT(m_layout.bytes_count() >= new_layout.bytes_count(),
"[GPU] Can't reinterpret blob to the size bigger than allocated memory buffer");
m_layout = new_layout;
auto engine = m_memObject->get_engine();
m_memObject = engine->reinterpret_buffer(*m_memObject, new_layout);
}
void RemoteBlobImpl::lock() const {
if (!is_allocated()) {
IE_THROW(NotAllocated) << "[GPU] Remote blob can't be locked as it's not allocated";
}
std::lock_guard<std::mutex> locker(lockedMutex);
if (lockedCounter == 0) {
lockedHolder = std::unique_ptr<cldnn::mem_lock<uint8_t>>(new cldnn::mem_lock<uint8_t>(m_memObject, m_stream));
auto ptr = lockedHolder->data();
_handle = reinterpret_cast<void*>(ptr);
m_allocator.regLockedBlob(_handle, this);
}
lockedCounter++;
}
void RemoteBlobImpl::unlock() const {
std::lock_guard<std::mutex> locker(lockedMutex);
lockedCounter--;
if (lockedCounter == 0)
lockedHolder.reset();
}
LockedMemory<void> RemoteBlobImpl::buffer() noexcept {
try {
lock();
return LockedMemory<void>(reinterpret_cast<IAllocator*>(&m_allocator), _handle, 0);
} catch (...) {
return LockedMemory<void>(nullptr, nullptr, 0);
}
}
LockedMemory<const void> RemoteBlobImpl::cbuffer() const noexcept {
try {
lock();
return LockedMemory<const void>(reinterpret_cast<IAllocator*>(&m_allocator), _handle, 0);
} catch (...) {
return LockedMemory<const void>(nullptr, nullptr, 0);
}
}
LockedMemory<void> RemoteBlobImpl::rwmap() noexcept {
try {
lock();
return LockedMemory<void>(reinterpret_cast<IAllocator *>(&m_allocator), _handle, 0);
} catch (...) {
return LockedMemory<void>(nullptr, nullptr, 0);
}
}
LockedMemory<const void> RemoteBlobImpl::rmap() const noexcept {
try {
lock();
return LockedMemory<const void>(reinterpret_cast<IAllocator *>(&m_allocator), _handle, 0);
} catch (...) {
return LockedMemory<const void>(nullptr, nullptr, 0);
}
}
LockedMemory<void> RemoteBlobImpl::wmap() noexcept {
try {
lock();
return LockedMemory<void>(reinterpret_cast<IAllocator *>(&m_allocator), _handle, 0);
} catch (...) {
return LockedMemory<void>(nullptr, nullptr, 0);
}
}
void RemoteAllocator::regLockedBlob(void* handle, const RemoteBlobImpl* blob) {
std::lock_guard<RemoteAllocator> locker(*this);
auto iter = m_lockedBlobs.find(handle);
if (iter == m_lockedBlobs.end()) {
m_lockedBlobs.emplace(handle, blob);
}
}
void RemoteAllocator::unlock(void* handle) noexcept {
std::lock_guard<RemoteAllocator> locker(*this);
auto iter = m_lockedBlobs.find(handle);
if (iter != m_lockedBlobs.end()) {
iter->second->unlock();
m_lockedBlobs.erase(iter);
}
}
ExecutionContextImpl::ExecutionContextImpl(const std::shared_ptr<IInferencePlugin> plugin,
const AnyMap& params,
const Config& config)
RemoteContextImpl::RemoteContextImpl(std::string device_name, std::vector<cldnn::device::ptr> devices)
: m_va_display(nullptr)
, m_external_queue(nullptr)
, m_config(config)
, m_type(ContextType::OCL)
, m_plugin(plugin) {
m_lock.clear(std::memory_order_relaxed);
, m_device_name(device_name)
, m_memory_cache(cache_capacity) {
OPENVINO_ASSERT(devices.size() == 1, "[GPU] Currently context can be created for a single device only");
// TODO: Parameterize this based on plugin config and compilation options
auto engine_type = cldnn::engine_types::ocl;
auto runtime_type = cldnn::runtime_types::ocl;
m_engine = cldnn::engine::create(engine_type, runtime_type, devices.front());
GPU_DEBUG_LOG << "Initialize RemoteContext for " << m_device_name << " (" << m_engine->get_device_info().dev_name << ")" << std::endl;
}
RemoteContextImpl::RemoteContextImpl(const std::vector<RemoteContextImpl::Ptr>& known_contexts, const AnyMap& params)
: m_va_display(nullptr)
, m_external_queue(nullptr)
, m_type(ContextType::OCL)
, m_memory_cache(cache_capacity) {
gpu_handle_param _context_id = nullptr;
gpu_handle_param _va_device = nullptr;
int ctx_device_id = 0;
@ -300,18 +44,18 @@ ExecutionContextImpl::ExecutionContextImpl(const std::shared_ptr<IInferencePlugi
if (params.size()) {
// parameter map is non-empty
std::string contextTypeStr = _StrFromParams(params, GPU_PARAM_KEY(CONTEXT_TYPE));
std::string contextTypeStr = extract_object<std::string>(params, GPU_PARAM_KEY(CONTEXT_TYPE));
if (GPU_PARAM_VALUE(OCL) == contextTypeStr) {
_context_id = _ObjFromParamSimple<gpu_handle_param>(params, GPU_PARAM_KEY(OCL_CONTEXT));
_context_id = extract_object<gpu_handle_param>(params, GPU_PARAM_KEY(OCL_CONTEXT));
if (params.find(GPU_PARAM_KEY(OCL_QUEUE)) != params.end())
m_external_queue = _ObjFromParamSimple<gpu_handle_param>(params, GPU_PARAM_KEY(OCL_QUEUE));
m_external_queue = extract_object<gpu_handle_param>(params, GPU_PARAM_KEY(OCL_QUEUE));
if (params.find(GPU_PARAM_KEY(OCL_CONTEXT_DEVICE_ID)) != params.end())
ctx_device_id = _ObjFromParamSimple<int>(params, GPU_PARAM_KEY(OCL_CONTEXT_DEVICE_ID));
ctx_device_id = extract_object<int>(params, GPU_PARAM_KEY(OCL_CONTEXT_DEVICE_ID));
} else if (GPU_PARAM_VALUE(VA_SHARED) == contextTypeStr) {
m_va_display = _va_device = _ObjFromParamSimple<gpu_handle_param>(params, GPU_PARAM_KEY(VA_DEVICE));
m_va_display = _va_device = extract_object<gpu_handle_param>(params, GPU_PARAM_KEY(VA_DEVICE));
m_type = ContextType::DEV_SHARED;
} else {
IE_THROW() << "Invalid execution context type" << contextTypeStr;
@ -329,29 +73,15 @@ ExecutionContextImpl::ExecutionContextImpl(const std::shared_ptr<IInferencePlugi
cldnn::device_query device_query(engine_type, runtime_type, _context_id, _va_device, ctx_device_id, target_tile_id);
auto device_map = device_query.get_available_devices();
auto iter = device_map.find(std::to_string(cldnn::device_query::device_id));
if (iter == device_map.end())
iter = device_map.find(m_config.device_id);
if (iter == device_map.end())
iter = device_map.begin();
auto& dev = iter->second;
OPENVINO_ASSERT(device_map.size() == 1, "[GPU] Only one device expected in case of context sharing");
auto engine_params = Plugin::GetParams(m_config, dev, m_external_queue);
m_engine = cldnn::engine::create(engine_params.engine_type,
engine_params.runtime_type, dev,
cldnn::engine_configuration(m_config.useProfiling,
engine_params.queue_type,
std::string(),
m_config.queuePriority,
m_config.queueThrottle,
true,
engine_params.use_unified_shared_memory,
m_config.kernels_cache_dir,
m_config.throughput_streams),
engine_params.task_executor);
m_engine = cldnn::engine::create(engine_type, runtime_type, device_map.begin()->second);
m_device_name = get_device_name(known_contexts, m_engine->get_device());
GPU_DEBUG_LOG << "Initialize RemoteContext for " << m_device_name << " (" << m_engine->get_device_info().dev_name << ")" << std::endl;
}
AnyMap ExecutionContextImpl::getParams() const {
AnyMap RemoteContextImpl::get_params() const {
AnyMap ret = { { GPU_PARAM_KEY(OCL_CONTEXT), m_engine->get_user_context() } };
switch (m_type) {
@ -370,26 +100,191 @@ AnyMap ExecutionContextImpl::getParams() const {
return ret;
}
std::string ExecutionContextImpl::getDeviceName() const noexcept {
auto devName = m_plugin.lock()->GetName();
auto engine_type = cldnn::engine_types::ocl;
auto runtime_type = cldnn::runtime_types::ocl;
try {
// Use actual runtime and engine types
cldnn::device_query device_query(engine_type, runtime_type);
auto all_devices = device_query.get_available_devices();
auto current_device = m_engine->get_device();
for (auto& kv : all_devices) {
if (current_device->is_same(kv.second))
return devName + "." + kv.first;
// For external contexts we try to match underlying handles with default contexts created by plugin to find device name
std::string RemoteContextImpl::get_device_name(const std::vector<RemoteContextImpl::Ptr>& known_contexts,
const cldnn::device::ptr current_device) {
std::string device_name = "GPU";
for (auto& c : known_contexts) {
if (c->get_engine().get_device()->is_same(current_device)) {
device_name = c->get_device_name();
break;
}
} catch (...) { }
}
return device_name;
}
if (!m_config.device_id.empty())
devName += "." + m_config.device_id;
return devName;
std::string RemoteContextImpl::get_device_name() const noexcept {
return m_device_name;
}
cldnn::memory::ptr RemoteContextImpl::try_get_cached_memory(size_t hash) {
std::lock_guard<std::mutex> lock(m_cache_mutex);
if (m_memory_cache.has(hash))
return m_memory_cache.get(hash);
return nullptr;
}
void RemoteContextImpl::add_to_cache(size_t hash, cldnn::memory::ptr memory) {
std::lock_guard<std::mutex> lock(m_cache_mutex);
m_memory_cache.add(hash, memory);
}
InferenceEngine::RemoteBlob::Ptr RemoteContextImpl::reuse_surface(InferenceEngine::gpu::ClContext::Ptr public_context,
const InferenceEngine::TensorDesc& desc,
const InferenceEngine::ParamMap& params) {
using namespace InferenceEngine;
auto& stream = m_engine->get_service_stream();
uint32_t plane = extract_object<uint32_t>(params, GPU_PARAM_KEY(VA_PLANE));
#ifdef _WIN32
cldnn::shared_handle surf = extract_object<cldnn::shared_handle>(params, GPU_PARAM_KEY(DEV_OBJECT_HANDLE));
#else
cldnn::shared_surface surf = extract_object<cldnn::shared_surface>(params, GPU_PARAM_KEY(DEV_OBJECT_HANDLE));
#endif
cldnn::layout layout(DataTypeFromPrecision(desc.getPrecision()),
ImageFormatFromLayout(desc.getLayout()),
tensor_from_dims(desc.getDims()));
#ifdef _WIN32
auto blob = std::make_shared<RemoteD3DSurface>(public_context, stream,
desc, layout, surf, 0, plane,
BlobType::BT_SURF_SHARED);
#else
auto blob = std::make_shared<RemoteVASurface>(public_context, stream,
desc, layout, nullptr, surf, plane,
BlobType::BT_SURF_SHARED);
#endif
return blob;
}
InferenceEngine::RemoteBlob::Ptr RemoteContextImpl::reuse_memory(InferenceEngine::gpu::ClContext::Ptr public_context,
const InferenceEngine::TensorDesc& desc,
cldnn::shared_handle mem,
BlobType blob_type) {
auto& stream = m_engine->get_service_stream();
cldnn::layout layout(DataTypeFromPrecision(desc.getPrecision()),
FormatFromLayout(desc.getLayout()),
tensor_from_dims(desc.getDims()));
switch (blob_type) {
case BlobType::BT_BUF_SHARED: {
return std::make_shared<RemoteCLbuffer>(public_context, stream, desc, layout, mem, 0, 0, blob_type);
}
case BlobType::BT_USM_SHARED: {
return std::make_shared<RemoteUSMbuffer>(public_context, stream, desc, layout, mem, 0, 0, blob_type);
}
case BlobType::BT_IMG_SHARED: {
layout.format = ImageFormatFromLayout(desc.getLayout());
return std::make_shared<RemoteCLImage2D>(public_context, stream, desc, layout, mem, 0, 0, blob_type);
}
#ifdef _WIN32
case BlobType::BT_DX_BUF_SHARED: {
return std::make_shared<RemoteD3DBuffer>(public_context, stream, desc, layout, mem, 0, 0, blob_type);
}
#endif
default:
break;
}
return nullptr;
}
InferenceEngine::RemoteBlob::Ptr RemoteContextImpl::create_buffer(InferenceEngine::gpu::ClContext::Ptr public_context,
const InferenceEngine::TensorDesc& desc) {
cldnn::layout layout(DataTypeFromPrecision(desc.getPrecision()),
FormatFromLayout(desc.getLayout()),
tensor_from_dims(desc.getDims()));
auto& stream = m_engine->get_service_stream();
return std::make_shared<RemoteCLbuffer>(public_context,
stream,
desc,
layout,
nullptr, 0, 0,
BlobType::BT_BUF_INTERNAL);
}
InferenceEngine::RemoteBlob::Ptr RemoteContextImpl::create_usm(InferenceEngine::gpu::ClContext::Ptr public_context,
const InferenceEngine::TensorDesc& desc,
BlobType alloc_type) {
cldnn::layout layout(DataTypeFromPrecision(desc.getPrecision()),
FormatFromLayout(desc.getLayout()),
tensor_from_dims(desc.getDims()));
auto& stream = m_engine->get_service_stream();
return std::make_shared<RemoteUSMbuffer>(public_context,
stream,
desc,
layout,
nullptr, 0, 0,
alloc_type);
}
void RemoteContextImpl::check_if_shared() {
OPENVINO_ASSERT(m_type == RemoteContextImpl::ContextType::DEV_SHARED, "[GPU] Shared context is required to share this type of memory");
}
InferenceEngine::MemoryBlob::Ptr RemoteContextImpl::create_host_blob(InferenceEngine::gpu::ClContext::Ptr public_context,
const InferenceEngine::TensorDesc& desc) {
if (m_engine->use_unified_shared_memory())
return std::dynamic_pointer_cast<InferenceEngine::MemoryBlob>(make_blob_with_precision(desc, std::make_shared<USMHostAllocator>(public_context)));
else
return std::dynamic_pointer_cast<InferenceEngine::MemoryBlob>(make_blob_with_precision(desc));
}
InferenceEngine::RemoteBlob::Ptr RemoteContextImpl::create_blob(InferenceEngine::gpu::ClContext::Ptr public_context,
const InferenceEngine::TensorDesc& desc,
const InferenceEngine::ParamMap& params) {
using namespace InferenceEngine;
if (params.empty()) {
// user wants plugin to allocate blob by itself and return handle
return create_buffer(public_context, desc);
} else {
// user will supply shared object handle
std::string mem_type = extract_object<std::string>(params, GPU_PARAM_KEY(SHARED_MEM_TYPE));
bool is_usm = mem_type == GPU_PARAM_VALUE(USM_HOST_BUFFER) ||
mem_type == GPU_PARAM_VALUE(USM_DEVICE_BUFFER) ||
mem_type == GPU_PARAM_VALUE(USM_USER_BUFFER);
OPENVINO_ASSERT(!is_usm || m_engine->use_unified_shared_memory(),
"[GPU] Can't create USM tensor as USM is not supported (or manually disabled) on current device");
if (GPU_PARAM_VALUE(VA_SURFACE) == mem_type) {
check_if_shared();
return reuse_surface(public_context, desc, params);
} else if (GPU_PARAM_VALUE(USM_HOST_BUFFER) == mem_type) {
return create_usm(public_context, desc, BlobType::BT_USM_HOST_INTERNAL);
} else if (GPU_PARAM_VALUE(USM_DEVICE_BUFFER) == mem_type) {
return create_usm(public_context, desc, BlobType::BT_USM_DEVICE_INTERNAL);
} else {
BlobType blob_type;
cldnn::shared_handle mem = nullptr;
if (GPU_PARAM_VALUE(OCL_BUFFER) == mem_type) {
blob_type = BlobType::BT_BUF_SHARED;
mem = extract_object<cldnn::shared_handle>(params, GPU_PARAM_KEY(MEM_HANDLE));
} else if (GPU_PARAM_VALUE(USM_USER_BUFFER) == mem_type) {
blob_type = BlobType::BT_USM_SHARED;
mem = extract_object<cldnn::shared_handle>(params, GPU_PARAM_KEY(MEM_HANDLE));
} else if (GPU_PARAM_VALUE(OCL_IMAGE2D) == mem_type) {
blob_type = BlobType::BT_IMG_SHARED;
mem = extract_object<cldnn::shared_handle>(params, GPU_PARAM_KEY(MEM_HANDLE));
#ifdef _WIN32
} else if (GPU_PARAM_VALUE(DX_BUFFER) == mem_type) {
blob_type = BlobType::BT_DX_BUF_SHARED;
mem = extract_object<cldnn::shared_handle>(params, GPU_PARAM_KEY(DEV_OBJECT_HANDLE));
check_if_shared();
#endif
} else {
OPENVINO_ASSERT(false, "[GPU] Unsupported shared object type ", mem_type);
}
return reuse_memory(public_context, desc, mem, blob_type);
}
}
}
} // namespace intel_gpu
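
As a usage reference for create_blob() above, sharing a user USM pointer through the public ClContext boils down to the sketch below; `context`, `desc` and `usm_ptr` are placeholders for caller-provided objects:

using namespace InferenceEngine;

// context: gpu::ClContext::Ptr, desc: TensorDesc, usm_ptr: user-allocated USM pointer (all provided by the caller)
ParamMap params = {
    { GPU_PARAM_KEY(SHARED_MEM_TYPE), GPU_PARAM_VALUE(USM_USER_BUFFER) },
    { GPU_PARAM_KEY(MEM_HANDLE),      usm_ptr }
};
auto remote_blob = context->CreateBlob(desc, params);   // dispatched to reuse_memory(..., BlobType::BT_USM_SHARED) above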

View File

@ -127,11 +127,12 @@ void TransformationsPipeline::apply(std::shared_ptr<ov::Model> func) {
const auto defaultPrecisions = ngraph::pass::low_precision::precision_set::int8_support;
bool enableInt8;
bool enable_loop_unrolling = config.get_property(ov::intel_gpu::enable_loop_unrolling);
{
ngraph::pass::Manager manager;
manager.set_per_pass_validation(false);
enableInt8 = config.enableInt8 && ngraph::pass::low_precision::LowPrecision::isFunctionQuantized(func);
enableInt8 = config.get_property(ov::intel_gpu::enable_lp_transformations) && ngraph::pass::low_precision::LowPrecision::isFunctionQuantized(func);
if (enableInt8) {
manager.register_pass<ov::pass::MarkDequantizationSubgraph>(
std::vector<ngraph::element::Type>{ ngraph::element::i8, ngraph::element::u8, ngraph::element::i4, ngraph::element::u4 });
@ -144,7 +145,7 @@ void TransformationsPipeline::apply(std::shared_ptr<ov::Model> func) {
manager.register_pass<ngraph::pass::WrapInterpolateIntoTransposes>();
manager.register_pass<ngraph::pass::TransposeSinking>();
if (!config.enable_loop_unrolling) {
if (!enable_loop_unrolling) {
manager.register_pass<ngraph::pass::BidirectionalLSTMSequenceDecomposition>();
manager.register_pass<ngraph::pass::BidirectionalGRUSequenceDecomposition>();
manager.register_pass<ngraph::pass::BidirectionalRNNSequenceDecomposition>();
@ -158,7 +159,7 @@ void TransformationsPipeline::apply(std::shared_ptr<ov::Model> func) {
manager.register_pass<ngraph::pass::GRUCellDecomposition>();
manager.register_pass<ngraph::pass::RNNCellDecomposition>();
if (config.enable_loop_unrolling) {
if (enable_loop_unrolling) {
manager.register_pass<ngraph::pass::BidirectionalLSTMSequenceDecomposition>();
manager.register_pass<ngraph::pass::BidirectionalGRUSequenceDecomposition>();
manager.register_pass<ngraph::pass::BidirectionalRNNSequenceDecomposition>();
@ -205,14 +206,14 @@ void TransformationsPipeline::apply(std::shared_ptr<ov::Model> func) {
};
// Add conversion from FP data types to infer precision if it's specified
if (config.inference_precision != ov::element::undefined) {
auto inference_precision = config.inference_precision;
if (!fp_precision_supported(inference_precision))
inference_precision = fallback_precision;
auto infer_precision = config.get_property(ov::hint::inference_precision);
if (infer_precision != ov::element::undefined) {
if (!fp_precision_supported(infer_precision))
infer_precision = fallback_precision;
for (auto& et : fp_element_types) {
if (et != inference_precision) {
convert_precision_list.push_back({et, inference_precision});
if (et != infer_precision) {
convert_precision_list.push_back({et, infer_precision});
}
}
}
@ -330,7 +331,7 @@ void TransformationsPipeline::apply(std::shared_ptr<ov::Model> func) {
return isCellPrimitiveSupported(node);
});
if (config.enable_loop_unrolling) {
if (enable_loop_unrolling) {
pass_config->set_callback<ngraph::pass::ConvertRNNSequenceToTensorIterator,
ngraph::pass::ConvertGRUSequenceToTensorIterator,
ngraph::pass::ConvertLSTMSequenceToTensorIterator>(
@ -550,10 +551,10 @@ void TransformationsPipeline::apply(std::shared_ptr<ov::Model> func) {
manager.register_pass<ngraph::pass::UnrollTensorIterator>();
auto pass_config = manager.get_pass_config();
pass_config->set_callback<ngraph::pass::UnrollTensorIterator>(
[this](const std::shared_ptr<const ngraph::Node> &node) -> bool {
[enable_loop_unrolling](const std::shared_ptr<const ngraph::Node> &node) -> bool {
auto sub_graph_op = std::dynamic_pointer_cast<const ngraph::op::util::SubGraphOp>(node);
int64_t num_iter = sub_graph_op->get_num_iterations();
if (!config.enable_loop_unrolling)
if (!enable_loop_unrolling)
return num_iter != 1;
return num_iter >= 16;
});

View File

@ -9,17 +9,16 @@ namespace intel_gpu {
VariableState::VariableState(const std::string &name,
const std::vector<cldnn::network::VariableState::Ptr> &states,
std::shared_ptr<cldnn::engine> engine, int currentBatch) :
InferenceEngine::IVariableStateInternal {name},
currentBatch_ {currentBatch},
states_ {states},
desc_{
cldnn::engine& engine, int currentBatch)
: InferenceEngine::IVariableStateInternal {name}
, currentBatch_ {currentBatch}
, states_ {states}
, desc_ {
PrecisionFromDataType(states.front()->memory->get_layout().data_type),
AggregateShape(states.front()->memory->get_layout()),
InferenceEngine::Layout::ANY
},
engine_ {std::move(engine)} {
}
}
, engine_(engine) { }
void VariableState::Reset() {
IterateOverStates([this](cldnn::network::VariableState &state) {
@ -31,11 +30,11 @@ void VariableState::SetState(const InferenceEngine::Blob::Ptr &newState) {
auto lock = std::dynamic_pointer_cast<InferenceEngine::MemoryBlob>(newState)->rmap();
auto data = lock.as<char*>();
IterateOverStates([&data, this](cldnn::network::VariableState &state) {
state.memory->copy_from(engine_->get_program_stream(), data);
state.memory->copy_from(engine_.get_service_stream(), data);
data += state.memory->get_layout().bytes_count();
state.is_set = true;
});
engine_->get_program_stream().enqueue_barrier();
engine_.get_service_stream().enqueue_barrier();
}
InferenceEngine::Blob::CPtr VariableState::GetState() const {
@ -44,7 +43,7 @@ InferenceEngine::Blob::CPtr VariableState::GetState() const {
auto blobLock = std::dynamic_pointer_cast<InferenceEngine::MemoryBlob>(blob)->wmap();
auto data = blobLock.as<char*>();
IterateOverStates([&data, this](cldnn::network::VariableState &state) {
cldnn::mem_lock<char, cldnn::mem_lock_type::read> lock { state.memory, engine_->get_program_stream() };
cldnn::mem_lock<char, cldnn::mem_lock_type::read> lock { state.memory, engine_.get_service_stream() };
std::copy(lock.begin(), lock.end(), data);
data += state.memory->get_layout().bytes_count();
});

View File

@ -56,10 +56,8 @@ static size_t get_cpu_ram_size() {
namespace cldnn {
engine::engine(const device::ptr device, const engine_configuration& configuration, const InferenceEngine::ITaskExecutor::Ptr task_executor)
: _task_executor(task_executor)
, _device(device)
, _configuration(configuration) {}
engine::engine(const device::ptr device)
: _device(device) {}
device_info engine::get_device_info() const {
return _device->get_info();
@ -74,7 +72,7 @@ bool engine::use_unified_shared_memory() const {
GPU_DEBUG_IF(debug_config->disable_usm) {
return false;
}
if (_device->get_mem_caps().supports_usm() && _configuration.use_unified_shared_memory) {
if (_device->get_mem_caps().supports_usm()) {
return true;
}
return false;
@ -248,19 +246,11 @@ void engine::subtract_memory_used(uint64_t bytes, allocation_type type) {
}
}
const InferenceEngine::ITaskExecutor::Ptr engine::get_task_executor() {
return _task_executor;
}
std::shared_ptr<cldnn::engine> engine::create(engine_types engine_type,
runtime_types runtime_type,
const device::ptr device,
const engine_configuration& configuration,
const InferenceEngine::ITaskExecutor::Ptr task_executor) {
std::shared_ptr<cldnn::engine> engine::create(engine_types engine_type, runtime_types runtime_type, const device::ptr device) {
std::shared_ptr<cldnn::engine> ret;
switch (engine_type) {
case engine_types::ocl:
ret = ocl::create_ocl_engine(device, runtime_type, configuration, task_executor);
ret = ocl::create_ocl_engine(device, runtime_type);
break;
default:
throw std::runtime_error("Invalid engine type");
@ -270,17 +260,14 @@ std::shared_ptr<cldnn::engine> engine::create(engine_types engine_type,
return ret;
}
std::shared_ptr<cldnn::engine> engine::create(engine_types engine_type,
runtime_types runtime_type,
const engine_configuration& configuration,
const InferenceEngine::ITaskExecutor::Ptr task_executor) {
std::shared_ptr<cldnn::engine> engine::create(engine_types engine_type, runtime_types runtime_type) {
device_query query(engine_type, runtime_type);
auto devices = query.get_available_devices();
auto iter = devices.find(std::to_string(device_query::device_id));
auto& device = iter != devices.end() ? iter->second : devices.begin()->second;
return engine::create(engine_type, runtime_type, device, configuration, task_executor);
return engine::create(engine_type, runtime_type, device);
}
} // namespace cldnn
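
With engine_configuration and the task executor removed from the factory, engine creation reduces to the calls below (a sketch of the two overloads kept above):

// Default device: device_query is run internally and a suitable device is picked.
std::shared_ptr<cldnn::engine> engine = cldnn::engine::create(cldnn::engine_types::ocl, cldnn::runtime_types::ocl);

// Explicit device: enumerate devices first and pass the chosen one.
cldnn::device_query query(cldnn::engine_types::ocl, cldnn::runtime_types::ocl);
auto devices = query.get_available_devices();             // std::map<std::string, cldnn::device::ptr>
auto engine_for_dev = cldnn::engine::create(cldnn::engine_types::ocl, cldnn::runtime_types::ocl, devices.begin()->second);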

View File

@ -0,0 +1,196 @@
// Copyright (C) 2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include "intel_gpu/runtime/execution_config.hpp"
#include "intel_gpu/runtime/debug_configuration.hpp"
#include <thread>
namespace ov {
namespace intel_gpu {
ExecutionConfig::ExecutionConfig() {
set_default();
}
class InferencePrecisionValidator : public BaseValidator {
public:
bool is_valid(const ov::Any& v) const override {
auto precision = v.as<ov::element::Type>();
return precision == ov::element::f16 || precision == ov::element::f32;
}
};
class PerformanceModeValidator : public BaseValidator {
public:
bool is_valid(const ov::Any& v) const override {
auto mode = v.as<ov::hint::PerformanceMode>();
return mode == ov::hint::PerformanceMode::CUMULATIVE_THROUGHPUT ||
mode == ov::hint::PerformanceMode::THROUGHPUT ||
mode == ov::hint::PerformanceMode::LATENCY ||
mode == ov::hint::PerformanceMode::UNDEFINED;
}
};
void ExecutionConfig::set_default() {
register_property<PropertyVisibility::PUBLIC>(
std::make_tuple(ov::device::id, "0"),
std::make_tuple(ov::enable_profiling, false),
std::make_tuple(ov::cache_dir, ""),
std::make_tuple(ov::num_streams, 1),
std::make_tuple(ov::compilation_num_threads, std::max(1, static_cast<int>(std::thread::hardware_concurrency()))),
std::make_tuple(ov::hint::inference_precision, ov::element::f16, InferencePrecisionValidator()),
std::make_tuple(ov::hint::model_priority, ov::hint::Priority::MEDIUM),
std::make_tuple(ov::hint::performance_mode, ov::hint::PerformanceMode::LATENCY, PerformanceModeValidator()),
std::make_tuple(ov::hint::num_requests, 0),
std::make_tuple(ov::intel_gpu::hint::host_task_priority, ov::hint::Priority::MEDIUM),
std::make_tuple(ov::intel_gpu::hint::queue_throttle, ov::intel_gpu::hint::ThrottleLevel::MEDIUM),
std::make_tuple(ov::intel_gpu::hint::queue_priority, ov::hint::Priority::MEDIUM),
std::make_tuple(ov::intel_gpu::enable_loop_unrolling, true),
// Legacy API properties
std::make_tuple(ov::intel_gpu::enable_dynamic_batch, false),
std::make_tuple(ov::intel_gpu::exclusive_async_requests, false),
std::make_tuple(ov::intel_gpu::nv12_two_inputs, false),
std::make_tuple(ov::intel_gpu::config_file, ""),
std::make_tuple(ov::intel_gpu::enable_lp_transformations, false));
register_property<PropertyVisibility::INTERNAL>(
std::make_tuple(ov::intel_gpu::max_dynamic_batch, 1),
std::make_tuple(ov::intel_gpu::queue_type, QueueTypes::out_of_order),
std::make_tuple(ov::intel_gpu::optimize_data, false),
std::make_tuple(ov::intel_gpu::enable_memory_pool, true),
std::make_tuple(ov::intel_gpu::allow_static_input_reorder, false),
std::make_tuple(ov::intel_gpu::custom_outputs, std::vector<std::string>{}),
std::make_tuple(ov::intel_gpu::tuning_config, ov::intel_gpu::TuningConfig{}),
std::make_tuple(ov::intel_gpu::dump_graphs, ""),
std::make_tuple(ov::intel_gpu::force_implementations, ImplForcingMap{}),
std::make_tuple(ov::intel_gpu::partial_build_program, false),
std::make_tuple(ov::intel_gpu::allow_new_shape_infer, false));
}
void ExecutionConfig::register_property_impl(const std::pair<std::string, ov::Any>& property, PropertyVisibility visibility, BaseValidator::Ptr validator) {
property_validators[property.first] = validator;
supported_properties[property.first] = visibility;
internal_properties[property.first] = property.second;
}
void ExecutionConfig::set_property(const AnyMap& config) {
for (auto& kv : config) {
auto& name = kv.first;
auto& val = kv.second;
OPENVINO_ASSERT(is_supported(kv.first), "[GPU] Attempt to set property ", name, " (", val.as<std::string>(), ") which was not registered!\n");
OPENVINO_ASSERT(property_validators.at(name)->is_valid(val), "[GPU] Invalid value for property ", name, ": ", val.as<std::string>());
internal_properties[name] = val;
}
}
bool ExecutionConfig::is_supported(const std::string& name) const {
bool supported = supported_properties.find(name) != supported_properties.end();
bool has_validator = property_validators.find(name) != property_validators.end();
return supported && has_validator;
}
bool ExecutionConfig::is_set_by_user(const std::string& name) const {
return user_properties.find(name) != user_properties.end();
}
void ExecutionConfig::set_user_property(const AnyMap& config) {
for (auto& kv : config) {
auto& name = kv.first;
auto& val = kv.second;
bool supported = is_supported(name) && supported_properties.at(name) == PropertyVisibility::PUBLIC;
OPENVINO_ASSERT(supported, "[GPU] Attempt to set user property ", name, " (", val.as<std::string>(), ") which was not registered or is internal!\n");
OPENVINO_ASSERT(property_validators.at(name)->is_valid(val), "[GPU] Invalid value for property ", name, ": `", val.as<std::string>(), "`");
user_properties[kv.first] = kv.second;
}
}
Any ExecutionConfig::get_property(const std::string& name) const {
if (user_properties.find(name) != user_properties.end()) {
return user_properties.at(name);
}
OPENVINO_ASSERT(internal_properties.find(name) != internal_properties.end(), "[GPU] Can't get internal property with name ", name);
return internal_properties.at(name);
}
void ExecutionConfig::apply_performance_hints(const cldnn::device_info& info) {
if (is_set_by_user(ov::hint::performance_mode)) {
const auto mode = get_property(ov::hint::performance_mode);
if (!is_set_by_user(ov::num_streams)) {
if (mode == ov::hint::PerformanceMode::LATENCY) {
set_property(ov::num_streams(1));
} else if (mode == ov::hint::PerformanceMode::THROUGHPUT) {
set_property(ov::num_streams(ov::streams::AUTO));
}
}
}
if (get_property(ov::num_streams) == ov::streams::AUTO) {
int32_t n_streams = std::max<int32_t>(info.num_ccs, 2);
set_property(ov::num_streams(n_streams));
}
}
void ExecutionConfig::apply_priority_hints(const cldnn::device_info& info) {
if (is_set_by_user(ov::hint::model_priority)) {
const auto priority = get_property(ov::hint::model_priority);
if (!is_set_by_user(ov::intel_gpu::hint::queue_priority)) {
set_property(ov::intel_gpu::hint::queue_priority(priority));
}
}
}
void ExecutionConfig::apply_debug_options(const cldnn::device_info& info) {
GPU_DEBUG_GET_INSTANCE(debug_config);
GPU_DEBUG_IF(!debug_config->dump_graphs.empty()) {
set_property(ov::intel_gpu::dump_graphs(debug_config->dump_graphs));
}
GPU_DEBUG_IF(debug_config->serialize_compile == 1) {
set_property(ov::compilation_num_threads(1));
}
}
void ExecutionConfig::apply_hints(const cldnn::device_info& info) {
apply_performance_hints(info);
apply_priority_hints(info);
apply_debug_options(info);
}
void ExecutionConfig::apply_user_properties(const cldnn::device_info& info) {
// Copy internal properties before applying hints to ensure that
// a property set by a hint won't be overridden by a value in the user config.
// E.g. num_streams=AUTO && hint=THROUGHPUT
// If we applied hints first and then copied all values from the user config to the internal one,
// we'd end up with num_streams=AUTO in the final config while a concrete integer is expected.
for (auto& kv : user_properties) {
internal_properties[kv.first] = kv.second;
}
apply_hints(info);
if (!is_set_by_user(ov::intel_gpu::enable_lp_transformations)) {
set_property(ov::intel_gpu::enable_lp_transformations(info.supports_imad || info.supports_immad));
}
user_properties.clear();
}
std::string ExecutionConfig::to_string() const {
std::stringstream s;
s << "internal properties:\n";
for (auto& kv : internal_properties) {
s << "\t" << kv.first << ": " << kv.second.as<std::string>() << std::endl;
}
s << "user properties:\n";
for (auto& kv : user_properties) {
s << "\t" << kv.first << ": " << kv.second.as<std::string>() << std::endl;
}
return s.str();
}
} // namespace intel_gpu
} // namespace ov
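
To illustrate how the pieces above fit together (user properties layered over the registered defaults, then hints resolved against the device), a small sketch, assuming `device_info` was taken from an existing engine:

ov::intel_gpu::ExecutionConfig cfg;                                       // defaults registered in set_default()
cfg.set_user_property(ov::AnyMap{ov::hint::performance_mode(ov::hint::PerformanceMode::THROUGHPUT)});
cfg.apply_user_properties(device_info);      // copies user values over internal ones, then apply_hints() resolves them
auto n_streams = cfg.get_property(ov::num_streams);                      // ov::streams::AUTO replaced by a real number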

View File

@ -58,7 +58,7 @@ namespace cldnn {
std::mutex kernels_cache::_mutex;
std::string kernels_cache::get_cache_path() const {
auto path = _engine.configuration().kernels_cache_path;
auto path = _config.get_property(ov::cache_dir);
if (path.empty()) {
return {};
}
@ -76,7 +76,7 @@ bool kernels_cache::is_cache_enabled() const {
}
}
return !_engine.configuration().kernels_cache_path.empty();
return !_config.get_property(ov::cache_dir).empty();
}
size_t kernels_cache::get_max_kernels_per_batch() const {
@ -156,8 +156,16 @@ void kernels_cache::get_program_source(const kernels_code& kernels_source_code,
}
}
kernels_cache::kernels_cache(engine& engine, uint32_t prog_id, const std::vector<std::string>& batch_header_str)
: _engine(engine), _prog_id(prog_id), batch_header_str(std::move(batch_header_str)) { }
kernels_cache::kernels_cache(engine& engine,
const ExecutionConfig& config,
uint32_t prog_id,
InferenceEngine::CPUStreamsExecutor::Ptr task_executor,
const std::vector<std::string>& batch_header_str)
: _engine(engine)
, _task_executor(task_executor)
, _config(config)
, _prog_id(prog_id)
, batch_header_str(std::move(batch_header_str)) { }
kernel_id kernels_cache::set_kernel_source(
const std::shared_ptr<kernel_string>& kernel_string,
@ -188,8 +196,8 @@ void kernels_cache::build_batch(const engine& build_engine, const batch_program&
auto& cl_build_engine = dynamic_cast<const ocl::ocl_engine&>(build_engine);
bool dump_sources = !_engine.configuration().sources_dumps_dir.empty() || batch.dump_custom_program;
std::string dump_sources_dir = _engine.configuration().sources_dumps_dir;
bool dump_sources = batch.dump_custom_program;
std::string dump_sources_dir = "";
GPU_DEBUG_GET_INSTANCE(debug_config);
GPU_DEBUG_IF(!debug_config->dump_sources.empty()) {
dump_sources = true;
@ -371,25 +379,21 @@ void kernels_cache::build_all() {
if (!_pending_compilation)
return;
std::unique_ptr<ocl::ocl_engine> _build_engine = nullptr;
if (_engine.type() == engine_types::ocl) {
_build_engine = std::unique_ptr<ocl::ocl_engine>(new ocl::ocl_engine(_engine.get_device(), runtime_types::ocl,
_engine.configuration(), _engine.get_task_executor()));
}
ocl::ocl_engine& _build_engine = downcast<ocl::ocl_engine>(_engine);
std::vector<batch_program> batches;
{
std::lock_guard<std::mutex> lock(_mutex);
get_program_source(_kernels_code, &batches);
}
auto _task_executor = _engine.get_task_executor();
if (_task_executor) {
std::exception_ptr exception;
std::vector<InferenceEngine::Task> tasks;
for (size_t idx = 0; idx < batches.size(); idx++) {
auto& batch = batches[idx];
tasks.push_back([this, &_build_engine, &batch, &exception] {
try {
build_batch(*_build_engine, batch);
build_batch(_build_engine, batch);
} catch(...) {
exception = std::current_exception();
}
@ -401,6 +405,11 @@ void kernels_cache::build_all() {
if (exception) {
std::rethrow_exception(exception);
}
} else {
for (size_t idx = 0; idx < batches.size(); idx++) {
build_batch(_build_engine, batches[idx]);
}
}
{
std::lock_guard<std::mutex> lock(_mutex);
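The reworked build_all() above keeps the parallel compilation path when a task executor was injected through the constructor and adds a serial fallback otherwise. Reduced to a schematic (the standalone function, the Job alias and the runAndWait call are illustrative assumptions, and the locking and exception details of the real code are simplified):

#include <threading/ie_cpu_streams_executor.hpp>
#include <exception>
#include <functional>
#include <vector>

using Job = std::function<void()>;

void run_jobs_sketch(InferenceEngine::CPUStreamsExecutor::Ptr executor, const std::vector<Job>& jobs) {
    if (executor) {
        std::exception_ptr first_error;             // captured by reference, as in build_all()
        std::vector<InferenceEngine::Task> tasks;
        for (auto& job : jobs)
            tasks.push_back([&first_error, &job] {
                try { job(); } catch (...) { first_error = std::current_exception(); }
            });
        executor->runAndWait(tasks);                // assumed executor helper; blocks until every batch is processed
        if (first_error)
            std::rethrow_exception(first_error);
    } else {
        for (auto& job : jobs)                      // serial fallback, new in this patch
            job();
    }
}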
@ -458,8 +467,7 @@ void kernels_cache::compile() {
std::unique_ptr<ocl::ocl_engine> _build_engine = nullptr;
if (_engine.type() == engine_types::ocl) {
_build_engine = std::unique_ptr<ocl::ocl_engine>(new ocl::ocl_engine(_engine.get_device(), runtime_types::ocl,
_engine.configuration(), _engine.get_task_executor()));
_build_engine = std::unique_ptr<ocl::ocl_engine>(new ocl::ocl_engine(_engine.get_device(), runtime_types::ocl));
}
// create batches
@ -497,8 +505,7 @@ void kernels_cache::save(BinaryOutputBuffer& ob) const {
}
ob << entry_point_to_id;
std::unique_ptr<ocl::ocl_engine> build_engine =
cldnn::make_unique<ocl::ocl_engine>(_engine.get_device(), runtime_types::ocl, _engine.configuration(), _engine.get_task_executor());
std::unique_ptr<ocl::ocl_engine> build_engine = cldnn::make_unique<ocl::ocl_engine>(_engine.get_device(), runtime_types::ocl);
std::vector<std::vector<unsigned char>> precompiled_kernels;
@ -540,7 +547,7 @@ void kernels_cache::load(BinaryInputBuffer& ib) {
OPENVINO_ASSERT(_engine.type() == engine_types::ocl, "[GPU] Not supported engine type");
std::unique_ptr<ocl::ocl_engine> build_engine =
cldnn::make_unique<ocl::ocl_engine>(_engine.get_device(), runtime_types::ocl, _engine.configuration(), _engine.get_task_executor());
cldnn::make_unique<ocl::ocl_engine>(_engine.get_device(), runtime_types::ocl);
std::map<std::string, std::string> entry_point_to_id;
std::vector<std::vector<unsigned char>> precompiled_kernels;


@ -7,6 +7,7 @@
#include "intel_gpu/graph/serialization/binary_buffer.hpp"
#include "intel_gpu/runtime/engine.hpp"
#include "intel_gpu/runtime/kernel.hpp"
#include "intel_gpu/runtime/execution_config.hpp"
#include <map>
#include <mutex>
@ -76,6 +77,8 @@ public:
private:
static std::mutex _mutex;
engine& _engine;
InferenceEngine::CPUStreamsExecutor::Ptr _task_executor;
ExecutionConfig _config;
uint32_t _prog_id = 0;
kernels_code _kernels_code;
size_t _kernel_idx = 0;
@ -91,7 +94,11 @@ private:
size_t get_max_kernels_per_batch() const;
public:
explicit kernels_cache(engine& engine, uint32_t prog_id, const std::vector<std::string>& batch_header_str = {});
explicit kernels_cache(engine& engine,
const ExecutionConfig& config,
uint32_t prog_id,
InferenceEngine::CPUStreamsExecutor::Ptr task_executor = nullptr,
const std::vector<std::string>& batch_header_str = {});
kernel_id set_kernel_source(const std::shared_ptr<kernel_string>& kernel_string,
bool dump_custom_program);
kernel::ptr get_kernel(kernel_id id) const;


@ -14,20 +14,20 @@ command_queues_builder::command_queues_builder()
: _profiling(false),
_out_of_order(false),
_supports_queue_families(false),
_priority_mode(priority_mode_types::disabled),
_throttle_mode(throttle_mode_types::disabled) {}
_priority_mode(),
_throttle_mode() {}
#if CL_TARGET_OPENCL_VERSION >= 200
std::vector<cl_queue_properties> command_queues_builder::get_properties(const cl::Device& device, uint16_t stream_id) {
std::vector<cl_queue_properties> properties;
if (_priority_mode != priority_mode_types::disabled) {
if (_priority_mode.has_value()) {
unsigned cl_queue_priority_value = CL_QUEUE_PRIORITY_MED_KHR;
switch (_priority_mode) {
case priority_mode_types::high:
switch (_priority_mode.value()) {
case ov::hint::Priority::HIGH:
cl_queue_priority_value = CL_QUEUE_PRIORITY_HIGH_KHR;
break;
case priority_mode_types::low:
case ov::hint::Priority::LOW:
cl_queue_priority_value = CL_QUEUE_PRIORITY_LOW_KHR;
break;
default:
@ -37,13 +37,13 @@ std::vector<cl_queue_properties> command_queues_builder::get_properties(const cl
properties.insert(properties.end(), {CL_QUEUE_PRIORITY_KHR, cl_queue_priority_value});
}
if (_throttle_mode != throttle_mode_types::disabled) {
if (_throttle_mode.has_value()) {
unsigned cl_queue_throttle_value = CL_QUEUE_THROTTLE_MED_KHR;
switch (_throttle_mode) {
case throttle_mode_types::high:
switch (_throttle_mode.value()) {
case ov::intel_gpu::hint::ThrottleLevel::HIGH:
cl_queue_throttle_value = CL_QUEUE_THROTTLE_HIGH_KHR;
break;
case throttle_mode_types::low:
case ov::intel_gpu::hint::ThrottleLevel::LOW:
cl_queue_throttle_value = CL_QUEUE_THROTTLE_LOW_KHR;
break;
default:
@ -107,27 +107,19 @@ ocl_queue_type command_queues_builder::build(const cl::Context& context, const c
#else
queue = clCreateCommandQueue(context.get(), device.get(), properties, &error_code);
#endif
if (error_code != CL_SUCCESS) {
CLDNN_ERROR_MESSAGE("Command queues builders",
"clCreateCommandQueueWithPropertiesINTEL error " + std::to_string(error_code));
}
OPENVINO_ASSERT(error_code == CL_SUCCESS, "[GPU] Command queues builder returned ", error_code, " error code");
return queue;
}
void command_queues_builder::set_priority_mode(priority_mode_types priority, bool extension_support) {
void command_queues_builder::set_priority_mode(ov::hint::Priority priority, bool extension_support) {
if (extension_support) {
_priority_mode = priority;
} else {
_priority_mode = priority_mode_types::disabled;
}
}
void command_queues_builder::set_throttle_mode(throttle_mode_types throttle, bool extension_support) {
void command_queues_builder::set_throttle_mode(ov::intel_gpu::hint::ThrottleLevel throttle, bool extension_support) {
if (extension_support) {
_throttle_mode = throttle;
} else {
_throttle_mode = throttle_mode_types::disabled;
}
}
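Since priority and throttle are now optionals rather than enums with a disabled sentinel, a queue property is only emitted when a mode was actually set and the extension is supported. A reduced sketch of the priority mapping, using std::optional as a stand-in for the in-tree optional_value:

// Mirrors the mapping above; the free function and std::optional are illustrative.
#define CL_TARGET_OPENCL_VERSION 300    // cl_queue_properties needs OpenCL >= 2.0, as guarded above
#include <CL/cl_ext.h>
#include <optional>
#include <vector>
#include "openvino/runtime/properties.hpp"

std::vector<cl_queue_properties> priority_properties_sketch(std::optional<ov::hint::Priority> mode) {
    std::vector<cl_queue_properties> props;
    if (!mode.has_value())
        return props;                               // nothing requested -> no hint is passed to the queue
    cl_queue_properties value = CL_QUEUE_PRIORITY_MED_KHR;    // default branch above
    if (*mode == ov::hint::Priority::HIGH) value = CL_QUEUE_PRIORITY_HIGH_KHR;
    if (*mode == ov::hint::Priority::LOW)  value = CL_QUEUE_PRIORITY_LOW_KHR;
    props.insert(props.end(), {CL_QUEUE_PRIORITY_KHR, value});
    return props;
}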


@ -6,6 +6,7 @@
#include "ocl_common.hpp"
#include "intel_gpu/runtime/engine.hpp"
#include "intel_gpu/runtime/optionals.hpp"
namespace cldnn {
namespace ocl {
@ -14,8 +15,8 @@ class command_queues_builder {
public:
command_queues_builder();
ocl_queue_type build(const cl::Context& context, const cl::Device& device);
void set_throttle_mode(throttle_mode_types throttle, bool extension_support);
void set_priority_mode(priority_mode_types priority, bool extension_support);
void set_throttle_mode(ov::intel_gpu::hint::ThrottleLevel throttle, bool extension_support);
void set_priority_mode(ov::hint::Priority priority, bool extension_support);
void set_profiling(bool flag) { _profiling = flag; }
void set_out_of_order(bool flag) { _out_of_order = flag; }
void set_supports_queue_families(bool extension_support);
@ -24,8 +25,8 @@ private:
bool _profiling;
bool _out_of_order;
bool _supports_queue_families;
priority_mode_types _priority_mode;
throttle_mode_types _throttle_mode;
optional_value<ov::hint::Priority> _priority_mode;
optional_value<ov::intel_gpu::hint::ThrottleLevel> _throttle_mode;
#if CL_TARGET_OPENCL_VERSION >= 200
std::vector<cl_queue_properties> get_properties(const cl::Device& device, uint16_t stream_id = 0);
#else
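A typical call site for the builder with the new property types, as a sketch. Only the builder methods and the engine accessors appear in this diff; the config lookups, the extension names and the wrapper function are assumptions about the surrounding stream code, and the in-tree includes are omitted because the header paths are not shown in this excerpt:

cldnn::ocl::ocl_queue_type make_queue_sketch(const cldnn::ocl::ocl_engine& engine,
                                             const ov::intel_gpu::ExecutionConfig& config) {
    cldnn::ocl::command_queues_builder builder;
    builder.set_profiling(config.get_property(ov::enable_profiling));
    builder.set_priority_mode(config.get_property(ov::intel_gpu::hint::queue_priority),
                              engine.extension_supported("cl_khr_priority_hints"));
    builder.set_throttle_mode(config.get_property(ov::intel_gpu::hint::queue_throttle),
                              engine.extension_supported("cl_khr_throttle_hints"));
    return builder.build(engine.get_cl_context(), engine.get_cl_device());
}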


@ -288,7 +288,7 @@ bool ocl_device::is_same(const device::ptr other) {
if (!casted)
return false;
return _context == casted->get_context() && _device == casted->get_device() && _platform == casted->get_platform();
return _device == casted->get_device() && _platform == casted->get_platform();
}
} // namespace ocl


@ -41,9 +41,8 @@ namespace ocl {
ocl_error::ocl_error(cl::Error const& err)
: ov::Exception("[GPU] " + std::string(err.what()) + std::string(", error code: ") + std::to_string(err.err())) {}
ocl_engine::ocl_engine(const device::ptr dev, runtime_types runtime_type,
const engine_configuration& conf, const InferenceEngine::ITaskExecutor::Ptr task_executor)
: engine(dev, conf, task_executor) {
ocl_engine::ocl_engine(const device::ptr dev, runtime_types runtime_type)
: engine(dev) {
OPENVINO_ASSERT(runtime_type == runtime_types::ocl, "[GPU] Invalid runtime type specified for OCL engine. Only OCL runtime is supported");
auto casted = dynamic_cast<ocl_device*>(dev.get());
@ -52,12 +51,11 @@ ocl_engine::ocl_engine(const device::ptr dev, runtime_types runtime_type,
casted->get_device().getInfo(CL_DEVICE_EXTENSIONS, &_extensions);
_usm_helper.reset(new cl::UsmHelper(get_cl_context(), get_cl_device(), use_unified_shared_memory()));
_program_stream.reset(new ocl_stream(*this));
_service_stream.reset(new ocl_stream(*this, ExecutionConfig()));
}
#ifdef ENABLE_ONEDNN_FOR_GPU
dnnl::engine& ocl_engine::get_onednn_engine() const {
void ocl_engine::create_onednn_engine(const ExecutionConfig& config) {
const std::lock_guard<std::mutex> lock(onednn_mutex);
OPENVINO_ASSERT(_device->get_info().vendor_id == INTEL_VENDOR_ID, "[GPU] OneDNN engine can be used for Intel GPUs only");
if (!_onednn_engine) {
@ -65,12 +63,12 @@ dnnl::engine& ocl_engine::get_onednn_engine() const {
if (!casted)
throw ov::Exception("[GPU] Invalid device type stored in ocl_engine");
auto config = this->configuration();
if (config.kernels_cache_path.empty()) {
std::string cache_dir = config.get_property(ov::cache_dir);
if (cache_dir.empty()) {
_onednn_engine = std::make_shared<dnnl::engine>(dnnl::ocl_interop::make_engine(casted->get_device().get(), casted->get_context().get()));
} else {
// Use cached blob
auto path = config.kernels_cache_path;
auto path = cache_dir;
if (path.back() != '/' && path.back() != '\\') {
path += "/";
}
@ -79,7 +77,7 @@ dnnl::engine& ocl_engine::get_onednn_engine() const {
if (blob_id.empty()) {
// Create engine without cache_blob
_onednn_engine = std::make_shared<dnnl::engine>(dnnl::ocl_interop::make_engine(casted->get_device().get(), casted->get_context().get()));
return *_onednn_engine;
return;
}
std::string id_str(blob_id.begin(), blob_id.end());
@ -98,7 +96,10 @@ dnnl::engine& ocl_engine::get_onednn_engine() const {
}
}
}
}
dnnl::engine& ocl_engine::get_onednn_engine() const {
OPENVINO_ASSERT(_onednn_engine, "[GPU] Can't get onednn engine handle as it was not initialized. Please check that create_onednn_engine() was called");
return *_onednn_engine;
}
#endif
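get_onednn_engine() no longer builds the engine lazily from the removed engine configuration; creation is an explicit step that receives the ExecutionConfig (and honours ov::cache_dir for oneDNN kernel blobs). The expected call order, sketched with a hypothetical wrapper:

#ifdef ENABLE_ONEDNN_FOR_GPU
void init_onednn_sketch(cldnn::ocl::ocl_engine& engine, const ov::intel_gpu::ExecutionConfig& config) {
    engine.create_onednn_engine(config);                      // must happen first, e.g. during program compilation
    dnnl::engine& dnnl_engine = engine.get_onednn_engine();   // asserts above if creation was skipped
    (void)dnnl_engine;
}
#endif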
@ -154,7 +155,7 @@ memory::ptr ocl_engine::allocate_memory(const layout& layout, allocation_type ty
}
if (reset || res->is_memory_reset_needed(layout)) {
res->fill(get_program_stream());
res->fill(get_service_stream());
}
return res;
@ -266,26 +267,24 @@ bool ocl_engine::extension_supported(std::string extension) const {
return _extensions.find(extension) != std::string::npos;
}
stream::ptr ocl_engine::create_stream() const {
return std::make_shared<ocl_stream>(*this);
stream::ptr ocl_engine::create_stream(const ExecutionConfig& config) const {
return std::make_shared<ocl_stream>(*this, config);
}
stream::ptr ocl_engine::create_stream(void* handle) const {
return std::make_shared<ocl_stream>(*this, handle);
stream::ptr ocl_engine::create_stream(const ExecutionConfig& config, void* handle) const {
return std::make_shared<ocl_stream>(*this, config, handle);
}
stream& ocl_engine::get_program_stream() const {
return *_program_stream;
stream& ocl_engine::get_service_stream() const {
return *_service_stream;
}
std::shared_ptr<cldnn::engine> ocl_engine::create(const device::ptr device, runtime_types runtime_type,
const engine_configuration& configuration, const InferenceEngine::ITaskExecutor::Ptr task_executor) {
return std::make_shared<ocl::ocl_engine>(device, runtime_type, configuration, task_executor);
std::shared_ptr<cldnn::engine> ocl_engine::create(const device::ptr device, runtime_types runtime_type) {
return std::make_shared<ocl::ocl_engine>(device, runtime_type);
}
std::shared_ptr<cldnn::engine> create_ocl_engine(const device::ptr device, runtime_types runtime_type,
const engine_configuration& configuration, const InferenceEngine::ITaskExecutor::Ptr task_executor) {
return ocl_engine::create(device, runtime_type, configuration, task_executor);
std::shared_ptr<cldnn::engine> create_ocl_engine(const device::ptr device, runtime_types runtime_type) {
return ocl_engine::create(device, runtime_type);
}
} // namespace ocl
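Taken together, engines are now built without an engine_configuration or task executor, and streams receive the ExecutionConfig instead. A combined sketch of the factory and stream calls shown above; how the device pointer is obtained, and the dispatch through the base engine interface, are assumptions:

void engine_and_streams_sketch(cldnn::device::ptr device, const ov::intel_gpu::ExecutionConfig& config) {
    auto engine = cldnn::ocl::ocl_engine::create(device, cldnn::runtime_types::ocl);
    auto stream = engine->create_stream(config);             // per-request queue, configured via ExecutionConfig
    cldnn::stream& service = engine->get_service_stream();   // internal queue, e.g. for allocate_memory() fills
    (void)stream; (void)service;
}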
