[GPU] Move tuning cache loading to kernel selector (#15112)

* [GPU] Move tuning cache loading to kernel selector. Remove tuning modes

* [GPU] Remove kernel runner
Vladimir Paramuzov 2023-01-20 15:17:12 +04:00 committed by GitHub
parent c1a9152d1c
commit 34d16b8777
38 changed files with 77 additions and 1089 deletions
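
In outline: instead of each program loading a tuning cache and optionally running on-line tuning through a kernel_runner, the kernel selector now consults a single read-only, process-wide cache and otherwise falls back to the first supported implementation. A minimal, self-contained C++ sketch of that pattern, with hypothetical demo names (the real counterparts are TuningCache::get(), AutoTuner::LoadKernelOffline() and GetNaiveBestKernel() in the diffs below):

#include <iostream>
#include <map>
#include <memory>
#include <mutex>
#include <string>

// Stand-in for kernel_selector::TuningCache: a lazily created,
// process-wide, read-only cache (the real one parses cache.json
// located next to the plugin binary).
class TuningCacheDemo {
public:
    static TuningCacheDemo* get() {
        static std::mutex m;
        static std::shared_ptr<TuningCacheDemo> instance;
        std::lock_guard<std::mutex> lock(m);
        if (!instance)
            instance = std::make_shared<TuningCacheDemo>();
        return instance.get();
    }
    // Returns the cached implementation name, or "" on a miss.
    std::string load(const std::string& key) const {
        auto it = entries_.find(key);
        return it == entries_.end() ? std::string() : it->second;
    }
private:
    std::map<std::string, std::string> entries_{
        {"conv_3x3_f32", "convolution_gpu_ref"}};
};

// Stand-in for the selector: offline cache lookup first, then the
// naive "first supported implementation" fallback.
std::string select_kernel(const std::string& key) {
    if (auto* cache = TuningCacheDemo::get()) {
        auto name = cache->load(key);
        if (!name.empty())
            return name;              // offline cache hit
    }
    return "first_supported_kernel";  // GetNaiveBestKernel path
}

int main() {
    std::cout << select_kernel("conv_3x3_f32") << "\n";  // cache hit
    std::cout << select_kernel("unknown_shape") << "\n"; // fallback
}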

View File

@ -17,10 +17,6 @@
#include <utility>
#include <set>
namespace kernel_selector {
class TuningCache;
} // namespace kernel_selector
namespace cldnn {
struct topology;
@ -248,9 +244,6 @@ public:
kernel::ptr get_kernel(kernel_id id);
kernels_cache& get_kernels_cache() const;
void load_tuning_cache();
std::shared_ptr<kernel_selector::TuningCache> get_tuning_cache() const { return tuning_cache; }
// returns {-1, -1} if it fails to estimate by allocating the given batch size
std::pair<int64_t/*const alloc*/, int64_t/*general alloc*/> get_estimated_device_mem_usage();
@ -270,7 +263,6 @@ private:
std::vector<program_node*> outputs;
nodes_ordering processing_order;
std::unique_ptr<pass_manager> pm;
std::shared_ptr<kernel_selector::TuningCache> tuning_cache;
bool is_body_program;
int8_t is_subgroup_local_block_io_supported;

View File

@ -48,41 +48,6 @@ static constexpr Property<bool, PropertyMutability::RW> partial_build_program{"G
static constexpr Property<bool, PropertyMutability::RW> allow_new_shape_infer{"GPU_ALLOW_NEW_SHAPE_INFER"};
static constexpr Property<std::string, PropertyMutability::RW> dump_graphs{"GPU_DUMP_GRAPHS"};
static constexpr Property<std::vector<std::string>, PropertyMutability::RW> custom_outputs{"GPU_CUSTOM_OUTPUTS"};
/// @brief Tuning mode.
enum class TuningMode {
/// @brief Tuning is disabled.
tuning_disabled,
/// @brief Tuning using the cached data (no on-line tuning for non-existing data).
tuning_use_cache,
/// @brief Tuning using the cached data if it exists; tune and update the cache otherwise.
tuning_tune_and_cache,
/// @brief Tuning using the cached data and update tasks.
/// @details Performs updating tasks like removal of invalid caches, promoting to new format, etc.
/// No tuning for non-existing data.
tuning_use_and_update,
/// @brief Retune the cache data even if it exists.
tuning_retune_and_cache
};
struct TuningConfig {
TuningMode mode;
std::string cache_file_path;
TuningConfig() : mode(TuningMode::tuning_disabled), cache_file_path("") {}
};
inline std::ostream& operator<<(std::ostream& os, const TuningConfig& val) {
os << val.cache_file_path;
return os;
}
static constexpr Property<TuningConfig, PropertyMutability::RW> tuning_config{"GPU_TUNING_CONFIG"};
static constexpr Property<ImplForcingMap, PropertyMutability::RW> force_implementations{"GPU_FORCE_IMPLEMENTATIONS"};
static constexpr Property<std::string, PropertyMutability::RW> config_file{"CONFIG_FILE"};
static constexpr Property<bool, PropertyMutability::RW> enable_lp_transformations{"LP_TRANSFORMS_MODE"};
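
For reference, the block removed above was the public surface for configuring tuning. A sketch of how it was typically driven before this commit, adapted from the deleted cache_test helper near the end of this diff (headers omitted; this API ceases to exist after this change):

// Configure on-line tuning against a user-supplied cache file
// (removed by this commit together with TuningMode/TuningConfig).
ov::intel_gpu::TuningConfig tune_conf;
tune_conf.cache_file_path = "my_cache.json";
tune_conf.mode = ov::intel_gpu::TuningMode::tuning_tune_and_cache;

ExecutionConfig config{
    ov::intel_gpu::tuning_config(tune_conf),
    ov::intel_gpu::optimize_data(true)
};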

View File

@ -9,7 +9,6 @@
#include "kernel_selector_helper.h"
#include "arg_max_min/arg_max_min_kernel_selector.h"
#include "arg_max_min/arg_max_min_kernel_base.h"
#include "kernel_runner.h"
namespace cldnn {
namespace ocl {

View File

@ -8,7 +8,6 @@
#include "impls/implementation_map.hpp"
#include "intel_gpu/runtime/error_handler.hpp"
#include "kernel_selector_helper.h"
#include "kernel_runner.h"
#include "kernel_selector/kernels/binary_convolution/binary_convolution_kernel_selector.h"
#include "kernel_selector/kernels/binary_convolution/binary_convolution_params.h"
#include <algorithm>
@ -75,14 +74,6 @@ public:
uint32_t dilation_x = dilation.size() >= 1 ? dilation[dilation.size() - 1] : 1;
params.dilation = {dilation_x, dilation_y, dilation_z};
const auto& tuning_config = impl_param.get_program().get_config().get_property(ov::intel_gpu::tuning_config);
if (tuning_config.mode == ov::intel_gpu::TuningMode::tuning_tune_and_cache ||
tuning_config.mode == ov::intel_gpu::TuningMode::tuning_retune_and_cache) {
optional_params.tuningParams.runner =
std::make_shared<gpu::kernel_runner>(impl_param.get_program().get_engine(), impl_param.get_program().get_id(), true);
}
return {params, optional_params};
}
};

View File

@ -8,7 +8,6 @@
#include "impls/implementation_map.hpp"
#include "intel_gpu/runtime/error_handler.hpp"
#include "kernel_selector_helper.h"
#include "kernel_runner.h"
#include "convolution/convolution_kernel_selector.h"
#include "convolution/convolution_params.h"
#include <algorithm>
@ -166,14 +165,6 @@ public:
auto& kernel_selector = kernel_selector::convolution_kernel_selector::Instance();
const auto& tuning_config = impl_param.get_program().get_config().get_property(ov::intel_gpu::tuning_config);
if (tuning_config.mode == ov::intel_gpu::TuningMode::tuning_tune_and_cache ||
tuning_config.mode == ov::intel_gpu::TuningMode::tuning_retune_and_cache) {
conv_optional_params.tuningParams.runner =
std::make_shared<gpu::kernel_runner>(arg.get_program().get_engine(), arg.get_program().get_id(), true, true);
}
auto best_kernel = kernel_selector.get_best_kernel(conv_params, conv_optional_params);
return make_unique<convolution_impl>(best_kernel);

View File

@ -7,7 +7,6 @@
#include "impls/implementation_map.hpp"
#include "intel_gpu/runtime/error_handler.hpp"
#include "kernel_selector_helper.h"
#include "kernel_runner.h"
#include "convolution/convolution_kernel_selector.h"
#include "convolution/convolution_params.h"
#include <algorithm>

View File

@ -10,7 +10,6 @@
#include "fully_connected/fully_connected_params.h"
#include "intel_gpu/runtime/error_handler.hpp"
#include "kernel_runner.h"
#include "intel_gpu/primitives/reorder.hpp"
#include "intel_gpu/primitives/input_layout.hpp"
@ -119,7 +118,6 @@ public:
params.quantization = kernel_selector::QuantizationType::NONE;
}
optional_params.tuningParams.runner = std::make_shared<gpu::kernel_runner>(progam.get_engine(), progam.get_id(), true);
return {params, optional_params};
}

View File

@ -28,7 +28,6 @@ using namespace cldnn;
namespace cldnn {
enum class data_types : size_t;
enum class tuning_mode;
struct format;
struct layout;
struct program;
@ -65,7 +64,6 @@ using softmax_dim = kernel_selector::SoftmaxDim;
using mean_subtruct_mode = kernel_selector::MeanSubtractMode;
using mean_op = kernel_selector::MeanOp;
using concat_axis = kernel_selector::ConcatAxis;
using tuning_mode = kernel_selector::TuningMode;
using sample_type = kernel_selector::ResampleType;
using coordinate_transformation_mode = kernel_selector::CoordinateTransformationMode;
using nearest_mode = kernel_selector::NearestMode;
@ -101,7 +99,6 @@ kernel_selector::data_layout to_data_layout(format f);
cldnn::format from_data_layout(kernel_selector::data_layout l);
kernel_selector::weights_layout to_weights_layout(format f, bool is_grouped);
cldnn::format::type from_weights_layout(kernel_selector::weights_layout l);
kernel_selector::tuning_mode to_tuning_mode(ov::intel_gpu::TuningMode mode);
kernel_selector::data_tensor convert_data_tensor(const layout& l, const tensor view_offset = tensor {});
kernel_selector::weights_tensor convert_weights_tensor(const layout& l, bool is_grouped = false);
layout from_weights_tensor(const kernel_selector::weights_tensor& t);

View File

@ -1,249 +0,0 @@
// Copyright (C) 2018-2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include "kernel_runner.h"
#include "runtime/kernels_cache.hpp"
#include "intel_gpu/runtime/stream.hpp"
#include "weight_bias_params.h"
#include "kernel_selector_helper.h"
#include <chrono>
#include <vector>
#include <limits>
#include <algorithm>
namespace cldnn {
namespace gpu {
kernel_runner::kernel_runner(engine& engine_ref, uint32_t program_id, bool weights_and_bias_exist, bool zero_points_exist)
: _engine(engine_ref), program_id(program_id), weights_and_bias_exist(weights_and_bias_exist), zero_points_exist(zero_points_exist) {}
void kernel_runner::prepare_kernel_args(const kernel_selector::KernelsData& kernels_data,
kernel_arguments_data& args) {
const auto& base_params = *static_cast<kernel_selector::base_params*>(kernels_data[0].params.get());
// Prepare input buffers
if (input_buffers.empty()) {
for (const auto& input : base_params.inputs) {
int num_of_input_elements = static_cast<int>(input.PhysicalSize());
input_buffers.push_back(_engine.allocate_memory(
{from_data_type(input.GetDType()), format::bfyx, tensor(1, 1, num_of_input_elements, 1)}));
}
}
for (const auto& input : input_buffers) {
args.inputs.push_back(input);
}
// Prepare fused operations buffers
if (fused_ops_buffers.empty()) {
for (auto& fused_op : base_params.fused_ops) {
for (auto& fused_ops_input : fused_op.tensors) {
auto num_of_elements = static_cast<int>(fused_ops_input.PhysicalSize());
fused_ops_buffers.push_back(_engine.allocate_memory(
{ from_data_type(fused_ops_input.GetDType()), format::bfyx, tensor(1, 1, num_of_elements, 1) }));
}
}
}
for (const auto& fused_op_input : fused_ops_buffers) {
args.fused_op_inputs.push_back(fused_op_input);
}
// Prepare output buffer
if (output_buffers.empty()) {
for (size_t i = 0; i < base_params.outputs.size(); ++i) {
int num_of_output_elements = static_cast<int>(base_params.outputs[i].PhysicalSize());
output_buffers.push_back(_engine.allocate_memory({from_data_type(base_params.outputs[0].GetDType()),
format::bfyx, tensor(1, 1, num_of_output_elements, 1)}));
}
}
for (const auto& output : output_buffers) {
args.outputs.push_back(output);
}
if (weights_and_bias_exist) {
// Prepare weight buffer
const auto& weights_bias_params =
*static_cast<kernel_selector::weight_bias_params*>(kernels_data[0].params.get());
int num_of_weight_elements_ifm = static_cast<int>(weights_bias_params.weights.IFM().v);
int num_of_weight_elements_spatial_y = static_cast<int>(weights_bias_params.weights.Y().v);
int num_of_weight_elements_spatial_x = static_cast<int>(weights_bias_params.weights.X().v);
int num_of_weight_elements_spatial = static_cast<int>(weights_bias_params.weights.PhysicalSize());
int num_of_weight_elements_ofm = 1;
cldnn::format::type fmt = cldnn::format::bfyx;
if (!cldnn::format::is_image_2d(from_weights_layout(weights_bias_params.weights.GetLayout()))) {
if (weight_buffers.empty())
weight_buffers.push_back(
_engine.allocate_memory({from_weights_type(weights_bias_params.weights.GetDType()),
fmt,
tensor(num_of_weight_elements_ofm, 1, num_of_weight_elements_spatial, 1)}));
if (weight_buffers[0]->get_layout().format != fmt)
weight_buffers[0] =
_engine.allocate_memory({from_weights_type(weights_bias_params.weights.GetDType()),
fmt,
tensor(num_of_weight_elements_ofm, 1, num_of_weight_elements_spatial, 1)});
while (weight_buffers[0]->get_layout().bytes_count() < weights_bias_params.weights.PhysicalSizeInBytes()) {
// Weights layout depends on the kernel. Multiply the buffer size by 2 until it is big enough
// (to avoid complex computations of the exact buffer size according to the chosen layout).
weight_buffers.clear();
num_of_weight_elements_spatial *= 2;
weight_buffers.push_back(
_engine.allocate_memory({from_weights_type(weights_bias_params.weights.GetDType()),
fmt,
tensor(num_of_weight_elements_ofm, 1, num_of_weight_elements_spatial, 1)}));
}
} else {
weight_buffers.clear();
fmt = from_weights_layout(weights_bias_params.weights.GetLayout());
num_of_weight_elements_ofm = static_cast<int>(weights_bias_params.weights.OFM().v);
weight_buffers.push_back(_engine.allocate_memory({from_weights_type(weights_bias_params.weights.GetDType()),
fmt,
tensor(num_of_weight_elements_ofm,
num_of_weight_elements_ifm,
num_of_weight_elements_spatial_x,
num_of_weight_elements_spatial_y)}));
}
args.weights = weight_buffers[0];
// Prepare bias buffer
if (!weights_bias_params.bias.empty()) {
if (bias_buffers.empty()) {
int num_of_bias_elements = static_cast<int>(weights_bias_params.bias[0].PhysicalSize());
bias_buffers.push_back(_engine.allocate_memory({from_data_type(weights_bias_params.bias[0].GetDType()),
format::bfyx,
tensor(1, num_of_bias_elements, 1, 1)}));
}
args.bias = bias_buffers[0];
}
if (zero_points_exist) {
const auto& zero_point_params =
static_cast<const kernel_selector::weight_bias_zero_point_params&>(weights_bias_params);
if (!zero_point_params.weights_zero_points.empty()) {
if (weight_zero_point_buffers.empty()) {
auto& weight_zero_point = zero_point_params.weights_zero_points[0];
auto num_of_elements = static_cast<int>(weight_zero_point.PhysicalSize());
weight_zero_point_buffers.push_back(
_engine.allocate_memory({
from_data_type(weight_zero_point.GetDType()),
format::bfyx,
tensor(1, num_of_elements, 1, 1) }));
}
args.weights_zero_points = weight_zero_point_buffers[0];
}
if (!zero_point_params.activations_zero_points.empty()) {
if (activation_zero_point_buffers.empty()) {
auto& activation_zero_point = zero_point_params.activations_zero_points[0];
auto num_of_elements = static_cast<int>(activation_zero_point.PhysicalSize());
activation_zero_point_buffers.push_back(
_engine.allocate_memory({
from_data_type(activation_zero_point.GetDType()),
format::bfyx,
tensor(1, num_of_elements, 1, 1) }));
}
args.activations_zero_points = activation_zero_point_buffers[0];
}
if (!zero_point_params.compensation.empty()) {
if (compensation_buffers.empty()) {
auto& compensation = zero_point_params.compensation[0];
auto num_of_elements = static_cast<int>(compensation.PhysicalSize());
compensation_buffers.push_back(
_engine.allocate_memory({
from_data_type(compensation.GetDType()),
format::bfyx,
tensor(1, num_of_elements, 1, 1) }));
}
args.compensation = compensation_buffers[0];
}
}
}
}
std::vector<std::chrono::nanoseconds> kernel_runner::run_kernels(const kernel_selector::KernelsData& kernels_data) {
std::vector<std::chrono::nanoseconds> run_times;
stream::ptr stream = _engine.create_stream({});
int num_of_kernels_to_run = static_cast<int>(kernels_data.size());
int num_of_kernels_run = 0;
kernel_selector::KernelsData::const_iterator batch_start = kernels_data.begin();
kernel_selector::KernelsData::const_iterator batch_end;
while (num_of_kernels_to_run > 0) {
int current_compilation_batch = std::min(num_of_kernels_to_run, compilation_batch_size);
batch_end = batch_start + current_compilation_batch;
std::vector<kernel::ptr> kernels;
kernels_cache cache(_engine, {}, program_id);
for (auto it = batch_start; it < batch_end; it++) {
auto kernel_id = cache.set_kernel_source(it->kernels[0].code.kernelString, false);
kernels.push_back(cache.get_kernel(kernel_id));
}
kernel_arguments_data args;
prepare_kernel_args(kernels_data, args);
stream->finish();
int i = 0;
for (auto it = batch_start; it < batch_end; it++) {
std::vector<event::ptr> events;
auto kernel_run_time = std::chrono::nanoseconds::max();
int num_of_runs = 0;
for (int iteration = 0; iteration < runs_per_kernel; iteration++) {
event::ptr event;
try {
stream->set_arguments(*kernels[i], it->kernels[0].params, args);
event = stream->enqueue_kernel(*kernels[i], it->kernels[0].params, args, {});
} catch (std::exception& e) {
std::cout << "[clDNN] Could not run kernel for auto-tune: " << it->kernelName
<< " with auto-tune index " << it->autoTuneIndex << std::endl
<< ", error message:" << e.what();
} catch (...) {
// Could not run this kernel. Push back NULL event (will be ignored later).
std::cout << "[clDNN] Could not run kernel for auto-tune: " << it->kernelName
<< " with auto-tune index " << it->autoTuneIndex << std::endl;
}
events.push_back(event);
}
stream->finish();
for (auto& event : events) {
if (event.get() != NULL) {
auto profiling_intervals = event->get_profiling_info();
for (auto const& profiling_interval : profiling_intervals) {
if (profiling_interval.stage == instrumentation::profiling_stage::executing) {
kernel_run_time = std::min(profiling_interval.value->value(), kernel_run_time);
num_of_runs++;
break;
}
}
}
}
if (num_of_runs > 0) {
run_times.push_back(kernel_run_time);
num_of_kernels_run += 1;
} else {
run_times.push_back(std::chrono::nanoseconds::max());
}
i++;
}
num_of_kernels_to_run -= current_compilation_batch;
batch_start += current_compilation_batch;
}
if (num_of_kernels_run == 0) {
// If all kernels failed to run throw to avoid corrupting cache
throw std::runtime_error("kernel_runner::run_kernels - could not run any of provided kernels");
}
return run_times;
}
} // namespace gpu
} // namespace cldnn
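
The deleted runner scored each candidate kernel by the minimum execution time over runs_per_kernel attempts, keeping nanoseconds::max() as a sentinel for candidates that never ran successfully. A self-contained miniature of that measurement strategy, using host-side std::chrono timing in place of OpenCL profiling events (illustrative only):

#include <algorithm>
#include <chrono>
#include <functional>
#include <vector>

using candidate = std::function<void()>;

// Best-of-N timing per candidate; candidates that throw on every run keep
// the nanoseconds::max() sentinel, mirroring the deleted run_kernels().
std::vector<std::chrono::nanoseconds> run_candidates(
        const std::vector<candidate>& kernels, int runs_per_kernel = 15) {
    std::vector<std::chrono::nanoseconds> run_times;
    for (const auto& k : kernels) {
        auto best = std::chrono::nanoseconds::max();
        for (int i = 0; i < runs_per_kernel; ++i) {
            try {
                auto start = std::chrono::steady_clock::now();
                k();
                auto elapsed = std::chrono::steady_clock::now() - start;
                best = std::min(best,
                    std::chrono::duration_cast<std::chrono::nanoseconds>(elapsed));
            } catch (...) {
                // Failed run: ignore it, like the null-event path above.
            }
        }
        run_times.push_back(best);
    }
    return run_times;
}

int main() {
    auto times = run_candidates({ [] { /* pretend kernel A */ }, [] { throw 1; } });
    return times.size() == 2 ? 0 : 1;
}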

View File

@ -1,45 +0,0 @@
// Copyright (C) 2018-2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#pragma once
#include "intel_gpu/runtime/engine.hpp"
#include "kernel_selector_common.h"
#include "kernel_selector_helper.h"
#include "kernel_runner_interface.h"
#include <vector>
namespace cldnn {
namespace gpu {
class kernel_runner : public kernel_selector::KernelRunnerInterface {
public:
kernel_runner(engine& engine_ref, uint32_t program_id, bool weights_and_bias_exist = false, bool zero_points_exist = false);
std::vector<std::chrono::nanoseconds> run_kernels(const kernel_selector::KernelsData& kernelsData) override;
private:
const int compilation_batch_size = 50;
const int runs_per_kernel = 15;
void prepare_kernel_args(const kernel_selector::KernelsData& kernels_data,
kernel_arguments_data& args);
engine& _engine;
uint32_t program_id;
bool weights_and_bias_exist;
bool zero_points_exist;
std::vector<memory::cptr> input_buffers;
std::vector<memory::cptr> fused_ops_buffers;
std::vector<memory::ptr> output_buffers;
std::vector<memory::cptr> weight_buffers;
std::vector<memory::cptr> bias_buffers;
std::vector<memory::cptr> weight_zero_point_buffers;
std::vector<memory::cptr> activation_zero_point_buffers;
std::vector<memory::cptr> compensation_buffers;
};
//////////////////////////////////////////////////////////////////////////////////////////////////////////
} // namespace gpu
} // namespace cldnn

View File

@ -837,23 +837,6 @@ cldnn::format::type from_weights_layout(kernel_selector::weights_layout l) {
}
}
kernel_selector::tuning_mode to_tuning_mode(ov::intel_gpu::TuningMode mode) {
switch (mode) {
case ov::intel_gpu::TuningMode::tuning_disabled:
return kernel_selector::tuning_mode::TUNING_DISABLED;
case ov::intel_gpu::TuningMode::tuning_use_cache:
return kernel_selector::tuning_mode::TUNING_USE_CACHE;
case ov::intel_gpu::TuningMode::tuning_tune_and_cache:
return kernel_selector::tuning_mode::TUNING_TUNE_AND_CACHE;
case ov::intel_gpu::TuningMode::tuning_use_and_update:
return kernel_selector::tuning_mode::TUNING_USE_AND_UPDATE;
case ov::intel_gpu::TuningMode::tuning_retune_and_cache:
return kernel_selector::tuning_mode::TUNING_RETUNE_AND_CACHE;
default:
return kernel_selector::tuning_mode::TUNING_DISABLED;
}
}
kernel_selector::data_tensor convert_data_tensor(const layout& l, const tensor view_offset) {
const auto& pad = l.data_padding;
const auto& vals_original = l.get_partial_shape();
@ -1103,7 +1086,6 @@ void set_params(const kernel_impl_params& param_info, kernel_selector::params& p
params.engineInfo.computeUnitsCount = device_info.execution_units_count;
params.engineInfo.maxThreadsPerExecutionUnit = device_info.num_threads_per_eu > 0 ? device_info.num_threads_per_eu : 7;
params.engineInfo.maxThreadsPerDevice = params.engineInfo.maxThreadsPerExecutionUnit * device_info.execution_units_count;
params.engineInfo.deviceCache = program->get_tuning_cache();
params.engineInfo.driverVersion = device_info.driver_version;
params.engineInfo.supportedSimdSizes = device_info.supported_simd_sizes;
params.engineInfo.vendor_id = device_info.vendor_id;
@ -1121,10 +1103,6 @@ void set_optional_params(const program& program, kernel_selector::optional_param
program.get_config().get_property(ov::intel_gpu::allow_static_input_reorder);
params.allowInputReordering = false;
params.allowOutputReordering = false;
const auto& tuning_config = program.get_config().get_property(ov::intel_gpu::tuning_config);
params.tuningParams.mode = to_tuning_mode(tuning_config.mode);
params.tuningParams.cacheFilePath = tuning_config.cache_file_path;
}
void kernel_impl_params::save(BinaryOutputBuffer& ob) const {

View File

@ -11,7 +11,6 @@
#include <ie_system_conf.h>
#include "kernel_selector_helper.h"
#include "device_cache_reader.h"
#include "auto_tuner.h"
#include "layout_optimizer.h"
#include "pass_manager.h"
@ -108,7 +107,6 @@ program::program(engine& engine_ref,
_stream(_engine.create_stream(config)),
_config(config),
processing_order(),
tuning_cache(nullptr),
is_body_program(is_body_program),
is_subgroup_local_block_io_supported(-1) {
init_primitives();
@ -141,7 +139,6 @@ program::program(engine& engine_ref,
_config(config),
_task_executor(task_executor),
processing_order(),
tuning_cache(nullptr),
is_subgroup_local_block_io_supported(-1) {
init_primitives();
set_options();
@ -161,7 +158,6 @@ program::program(engine& engine)
_stream(_engine.create_stream({})),
_config(),
processing_order(),
tuning_cache(nullptr),
is_subgroup_local_block_io_supported(-1) { }
program::~program() {
query_local_block_io_supported();
@ -231,16 +227,6 @@ void program::init_kernels() {
}
}
void program::load_tuning_cache() {
OV_ITT_SCOPED_TASK(ov::intel_gpu::itt::domains::intel_gpu_plugin, "ProgramImpl::LoadTuningCache");
GPU_DEBUG_DEFINE_MEM_LOGGER("ProgramImpl::LoadTuningCache");
try {
tuning_cache = kernel_selector::CreateTuningCacheFromFile("cache.json");
} catch (...) {
tuning_cache = std::make_shared<kernel_selector::TuningCache>();
}
}
kernel_id program::add_kernel(const std::shared_ptr<kernel_string>& kernelSring) {
return _kernels_cache->set_kernel_source(kernelSring, false);
}
@ -597,9 +583,6 @@ void program::run_graph_compilation() { apply_opt_pass<compile_graph>(); }
void program::pre_optimize_graph(bool is_internal) {
OV_ITT_SCOPED_TASK(ov::intel_gpu::itt::domains::intel_gpu_plugin, "ProgramImpl::PreOptimizeGraph");
if (!is_internal)
load_tuning_cache();
// trim to outputs
apply_opt_pass<trim_to_outputs>(); // ToDo remove hidden dependencies from trim pass

View File

@ -15,6 +15,20 @@
#include <utility>
#include <tuple>
#ifdef _WIN32
#define WIN32_LEAN_AND_MEAN
#define NOMINMAX
#include <windows.h>
#include <SetupAPI.h>
#include <devguid.h>
#include <cstring>
#else
#include <unistd.h>
#include <limits.h>
#include <link.h>
#include <dlfcn.h>
#endif
namespace kernel_selector {
TuningCache::TuningCache(const std::string& cacheFilePath, bool createMode)
@ -273,52 +287,10 @@ void TuningCache::Save(const std::string& cacheFilePath) {
needsSave = false;
}
std::tuple<std::string, int> AutoTuner::LoadKernelOnline(const TuningMode tuningMode,
const std::string& cacheFilePath,
const Params& params) {
std::lock_guard<std::mutex> lock(mutex);
if (!onlineCache || lastCachePath != cacheFilePath) {
onlineCache = std::make_shared<TuningCache>(cacheFilePath, PerformTuning(tuningMode));
lastCachePath = cacheFilePath;
}
auto result = onlineCache->LoadKernel(params, PerformUpdates(tuningMode));
if (onlineCache->NeedsSave() && PerformUpdates(tuningMode)) {
onlineCache->Save(cacheFilePath);
}
return result;
}
void AutoTuner::StoreKernel(const std::string& cacheFilePath,
const Params& params,
std::string implementationName,
const int tuneIndex) {
std::lock_guard<std::mutex> lock(mutex);
if (!onlineCache || lastCachePath != cacheFilePath) {
onlineCache = std::make_shared<TuningCache>(cacheFilePath, true);
lastCachePath = cacheFilePath;
}
onlineCache->StoreKernel(params, implementationName, tuneIndex);
onlineCache->Save(cacheFilePath);
}
void AutoTuner::RemoveKernel(const std::string& cacheFilePath,
const Params& params) {
std::lock_guard<std::mutex> lock(mutex);
if (!onlineCache || lastCachePath != cacheFilePath) {
onlineCache = std::make_shared<TuningCache>(cacheFilePath, false);
lastCachePath = cacheFilePath;
}
onlineCache->RemoveKernel(params);
if (onlineCache->NeedsSave()) {
onlineCache->Save(cacheFilePath);
}
}
std::tuple<std::string, int> AutoTuner::LoadKernelOffline(TuningCache* deviceCache,
const Params& params) {
std::tuple<std::string, int> AutoTuner::LoadKernelOffline(const Params& params) {
std::lock_guard<std::mutex> lock(mutex);
static const uint32_t defaultComputeUnits = 24;
TuningCache* deviceCache = TuningCache::get();
if (!deviceCache)
return {};
auto result = deviceCache->LoadKernel(params, false);
@ -328,4 +300,37 @@ std::tuple<std::string, int> AutoTuner::LoadKernelOffline(TuningCache* deviceCac
return result;
}
TuningCache* TuningCache::get() {
static std::mutex m;
static std::shared_ptr<TuningCache> cache_instance = nullptr;
std::lock_guard<std::mutex> lock(m);
std::string path = "cache.json";
#ifdef _WIN32
char module_path[MAX_PATH];
HMODULE hm = NULL;
GetModuleHandleEx(GET_MODULE_HANDLE_EX_FLAG_FROM_ADDRESS | GET_MODULE_HANDLE_EX_FLAG_UNCHANGED_REFCOUNT,
(LPCSTR)&TuningCache::get,
&hm);
GetModuleFileName(hm, module_path, sizeof(module_path));
std::string bin_path(module_path);
path = bin_path.substr(0, bin_path.find_last_of("\\")) + "\\cache.json";
#else
const char* device_info_failed_msg = "Device lookup failed";
Dl_info dl_info;
dladdr((void*)(device_info_failed_msg), &dl_info); // NOLINT
std::string bin_path(dl_info.dli_fname);
path = bin_path.substr(0, bin_path.find_last_of("/")) + "/cache.json";
#endif
if (!cache_instance) {
try {
cache_instance = std::make_shared<kernel_selector::TuningCache>(path, false);
} catch (...) {
cache_instance = std::make_shared<kernel_selector::TuningCache>();
}
}
return cache_instance.get();
}
} // namespace kernel_selector
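
The new TuningCache::get() above resolves cache.json relative to the loaded binary rather than the current working directory. On Linux this hinges on dladdr(): given any address inside a loaded object, it reports that object's file path. A self-contained sketch of just that trick (Linux-only; the Windows branch above uses GetModuleHandleEx/GetModuleFileName instead):

#include <dlfcn.h>  // dladdr(); may require linking with -ldl
#include <iostream>
#include <string>

static std::string cache_path_next_to_binary() {
    Dl_info dl_info{};
    // Any address inside this binary serves as the anchor; this function will do.
    if (dladdr(reinterpret_cast<void*>(&cache_path_next_to_binary), &dl_info) == 0 ||
        dl_info.dli_fname == nullptr) {
        return "cache.json";  // fall back to the working directory
    }
    std::string bin_path(dl_info.dli_fname);
    auto pos = bin_path.find_last_of('/');
    return (pos == std::string::npos ? std::string()
                                     : bin_path.substr(0, pos + 1)) + "cache.json";
}

int main() { std::cout << cache_path_next_to_binary() << "\n"; }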

View File

@ -47,6 +47,8 @@ public:
bool NeedsSave() const { return needsSave; }
static TuningCache* get();
private:
Entry LoadKernel_v1(const Params& params, uint32_t computeUnitsCount);
Entry LoadKernel_v2(const Params& params, uint32_t computeUnitsCount);
@ -65,21 +67,9 @@ private:
class AutoTuner {
public:
AutoTuner() = default;
std::tuple<std::string, int> LoadKernelOnline(const TuningMode tuningMode,
const std::string& cacheFilePath,
const Params& params);
void StoreKernel(const std::string& cacheFilePath,
const Params& params,
std::string implementationName,
const int tuneIndex);
void RemoveKernel(const std::string& cacheFilePath,
const Params& params);
std::tuple<std::string, int> LoadKernelOffline(TuningCache* cache,
const Params& params);
std::tuple<std::string, int> LoadKernelOffline(const Params& params);
private:
std::string lastCachePath;
std::shared_ptr<TuningCache> onlineCache;
std::mutex mutex; // Mutex to synchronize cache updates
/*

View File

@ -443,36 +443,6 @@ struct DimTensor {
DimTensor(T b, T f, T w, T z, T y, T x) : b(b), f(f), w(w), z(z), y(y), x(x) {}
};
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// AutoTunerMode
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
enum class TuningMode {
TUNING_DISABLED, // Tuning is disabled.
TUNING_USE_CACHE, // Tuning using the cached data (no on-line tuning for non-existing data).
TUNING_TUNE_AND_CACHE, // Tuning using the cached data if it exists; tune and update the cache otherwise.
TUNING_USE_AND_UPDATE, // Tuning using the cached data and other updating tasks.
// Performs updating tasks like removal of invalid caches, promoting to new formats, etc.
// No tuning for non-existing data.
TUNING_RETUNE_AND_CACHE // Perform tuning even if the cached data exists.
};
inline bool UseCached(const TuningMode& mode) {
return mode == TuningMode::TUNING_USE_CACHE
|| mode == TuningMode::TUNING_TUNE_AND_CACHE
|| mode == TuningMode::TUNING_USE_AND_UPDATE;
}
inline bool PerformTuning(const TuningMode& mode) {
return mode == TuningMode::TUNING_TUNE_AND_CACHE
|| mode == TuningMode::TUNING_RETUNE_AND_CACHE;
}
inline bool PerformUpdates(const TuningMode& mode) {
return mode == TuningMode::TUNING_TUNE_AND_CACHE
|| mode == TuningMode::TUNING_USE_AND_UPDATE
|| mode == TuningMode::TUNING_RETUNE_AND_CACHE;
}
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Aliases:
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

View File

@ -1,52 +0,0 @@
// Copyright (C) 2018-2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include "device_cache_reader.h"
#include "auto_tuner.h"
#include <limits>
#include "istreamwrapper.h"
#ifdef _WIN32
#define WIN32_LEAN_AND_MEAN
#define NOMINMAX
#include <windows.h>
#include <SetupAPI.h>
#include <devguid.h>
#include <cstring>
#else
#include <unistd.h>
#include <limits.h>
#include <link.h>
#include <dlfcn.h>
#endif
#include <fstream>
#include <iostream>
#include <utility>
namespace kernel_selector {
std::shared_ptr<kernel_selector::TuningCache> CreateTuningCacheFromFile(std::string tuning_cache_path) {
if (tuning_cache_path.compare("cache.json") == 0) {
#ifdef _WIN32
char path[MAX_PATH];
HMODULE hm = NULL;
GetModuleHandleEx(GET_MODULE_HANDLE_EX_FLAG_FROM_ADDRESS | GET_MODULE_HANDLE_EX_FLAG_UNCHANGED_REFCOUNT,
(LPCSTR)&CreateTuningCacheFromFile,
&hm);
GetModuleFileName(hm, path, sizeof(path));
std::string bin_path(path);
tuning_cache_path = bin_path.substr(0, bin_path.find_last_of("\\")) + "\\cache.json";
#else
const char* device_info_failed_msg = "Device lookup failed";
Dl_info dl_info;
dladdr((void*)(device_info_failed_msg), &dl_info); // NOLINT
std::string bin_path(dl_info.dli_fname);
tuning_cache_path = bin_path.substr(0, bin_path.find_last_of("/")) + "/cache.json";
#endif
}
return std::make_shared<kernel_selector::TuningCache>(tuning_cache_path, false);
}
} // namespace kernel_selector

View File

@ -1,14 +0,0 @@
// Copyright (C) 2018-2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#pragma once
#include <memory>
#include <string>
namespace kernel_selector {
class TuningCache;
std::shared_ptr<kernel_selector::TuningCache> CreateTuningCacheFromFile(std::string tuning_cache_path);
} // namespace kernel_selector

View File

@ -71,15 +71,12 @@ KernelData kernel_selector_base::get_best_kernel(const Params& params, const opt
return kernels[0];
}
KernelsData kernel_selector_base::GetNaiveBestKernel(const Params& params,
const optional_params& options,
KernelType kType) const {
KernelsData kernel_selector_base::GetNaiveBestKernel(const KernelList& all_impls, const Params& params, const optional_params& options) const {
KernelsData kernelsData;
std::string kernelName;
auto allImplementations = GetAllImplementations(params, options, kType);
for (const auto& implementation : allImplementations) {
for (const auto& implementation : all_impls) {
// TODO: Unify this check with the Validate virtual method. Make
// sure that the method is called here only, not in all the
// GetKernelsData implementations.
@ -87,28 +84,14 @@ KernelsData kernel_selector_base::GetNaiveBestKernel(const Params& params,
KernelsData kds = implementation->GetKernelsData(params, options);
if (kds.size() && kds[0].kernels.size()) {
#ifdef ENABLE_ENV
const auto& it = forceKernels.find(implementation->GetName());
if (it != forceKernels.end()) {
if (it->second == true) {
ENV_PRINTF("Force: %s\n", it->first.c_str());
return kds;
} else {
ENV_PRINTF("Deny: %s\n", it->first.c_str());
}
} else {
#endif
kernelsData = kds;
kernelName = implementation->GetName();
break;
#ifdef ENABLE_ENV
}
#endif
kernelsData = kds;
kernelName = implementation->GetName();
break;
}
} catch (std::runtime_error& ex) {
// we have to handle it in order to avoid exceptions in KernelSelector as much as we can
kernelName = (implementation != nullptr)? implementation->GetName() : "[impl is null]";
GPU_DEBUG_TRACE << "layerID: " << params.layerID << " kenrel: " << kernelName << " - " << ex.what() << std::endl;
GPU_DEBUG_TRACE << "layerID: " << params.layerID << " kernel: " << kernelName << " - " << ex.what() << std::endl;
}
}
@ -120,10 +103,11 @@ KernelsData kernel_selector_base::GetNaiveBestKernel(const Params& params,
return kernelsData;
}
KernelsData kernel_selector_base::GetNaiveBestKernel(const Params& params, const optional_params& options, KernelType kType) const {
return GetNaiveBestKernel(GetAllImplementations(params, options, kType), params, options);
}
KernelsData kernel_selector_base::GetAutoTuneBestKernel(const Params& params,
const optional_params& options,
KernelType kType) const {
KernelsData kernel_selector_base::GetAutoTuneBestKernel(const Params& params, const optional_params& options, KernelType kType) const {
KernelsData kernelsData;
std::string kernelName;
@ -131,16 +115,8 @@ KernelsData kernel_selector_base::GetAutoTuneBestKernel(const Params& params,
auto kernel_params = static_cast<const base_params&>(params);
bool int8_kernel = kernel_params.inputs[0].GetDType() == Datatype::INT8 || kernel_params.inputs[0].GetDType() == Datatype::UINT8;
std::tuple<std::string, int> cachedKernelConfig;
if (options.tuningParams.mode == TuningMode::TUNING_DISABLED && !int8_kernel) { // Try to load kernel/config from offline cache
#if ENABLE_OFFLINE_TUNING_CACHE
cachedKernelConfig = autoTuner.LoadKernelOffline(params.engineInfo.deviceCache.get(), params);
#else
return GetNaiveBestKernel(params, options, kType);
#endif
} else if (UseCached(options.tuningParams.mode)) { // Try to load kernel/config from on-line cache
cachedKernelConfig = autoTuner.LoadKernelOnline(options.tuningParams.mode,
options.tuningParams.cacheFilePath,
params);
if (!int8_kernel) { // Try to load kernel/config from offline cache
cachedKernelConfig = autoTuner.LoadKernelOffline(params);
}
bool hashFoundInCache = !std::get<0>(cachedKernelConfig).empty();
@ -166,83 +142,7 @@ KernelsData kernel_selector_base::GetAutoTuneBestKernel(const Params& params,
}
}
// Cache is not valid, remove it if performing update tasks.
if (hashFoundInCache && PerformUpdates(options.tuningParams.mode)) {
autoTuner.RemoveKernel(options.tuningParams.cacheFilePath, params);
}
if (hashFoundInCache || // Cache is not valid - hash exists in cache but kernelsData was empty or kernel
// doesn't support the required key.
!PerformTuning(options.tuningParams.mode) || // On-line tuning is not allowed.
!options.tuningParams.runner) { // Runner is invalid - can't run on-line tuning
// Fall back to the default path.
return GetNaiveBestKernel(params, options, kType);
}
// Start on-line tuning
assert(options.tuningParams.runner);
for (const auto& implementation : allImplementations) {
const ParamsKey implKey = implementation->GetSupportedKey();
if (implKey.TuningSupport()) {
try {
KernelsData kds = implementation->GetKernelsDataForAutoTune(params, options);
auto runTimes = options.tuningParams.runner->run_kernels(kds);
for (size_t i = 0; i < kds.size(); i++) {
kds[i].runTime = runTimes[i].count();
if (kernelsData.size() == 0 || kds[i].runTime < kernelsData[0].runTime) {
kernelsData = {kds[i]};
kernelName = implementation->GetName();
}
}
} catch (std::runtime_error& ex) {
// we have to handle it in order to avoid exception in KernelSelector as much we can
kernelName = (implementation != nullptr)? implementation->GetName() : "[impl is null]";
GPU_DEBUG_TRACE << "layerID: " << params.layerID << " kenrel: " << kernelName << " - " << ex.what() << std::endl;
}
}
}
// try to fallback to reference kernels if no optimized were found during tuning
if (!kernelsData.size()) {
for (const auto& implementation : allImplementations) {
const ParamsKey implKey = implementation->GetSupportedKey();
// this time, check only implementations that have disabled tuning
if (!implKey.TuningSupport()) {
try {
KernelsData kds = implementation->GetKernelsDataForAutoTune(params, options);
auto runTimes = options.tuningParams.runner->run_kernels(kds);
for (size_t i = 0; i < kds.size(); i++) {
kds[i].runTime = runTimes[i].count();
if (kernelsData.size() == 0 || kds[i].runTime < kernelsData[0].runTime) {
kernelsData = {kds[i]};
kernelName = implementation->GetName();
}
}
} catch (std::runtime_error& ex) {
// we have to handle it in order to avoid exception in KernelSelector as much we can
kernelName = (implementation != nullptr)? implementation->GetName() : "[impl is null]";
GPU_DEBUG_TRACE << "layerID: " << params.layerID << " kenrel: " << kernelName << " - " << ex.what() << std::endl;
}
}
}
}
if (kernelsData.size()) {
kernelsData[0].kernelName = kernelName;
kernelsData[0].kernels[0].params.layerID = params.layerID;
autoTuner.StoreKernel(options.tuningParams.cacheFilePath,
params,
kernelName,
kernelsData[0].autoTuneIndex);
} else {
// Tuning failed, fall back to naive path
return GetNaiveBestKernel(params, options, kType);
}
return kernelsData;
return GetNaiveBestKernel(allImplementations, params, options);
}
KernelList kernel_selector_base::GetAllImplementations(const Params& params, const optional_params& options, KernelType kType) const {

View File

@ -5,7 +5,6 @@
#pragma once
#include "kernel_selector_common.h"
#include "kernel_runner_interface.h"
#include "auto_tuner.h"
#include <vector>
#include <memory>
@ -32,13 +31,17 @@ protected:
}
virtual KernelsData GetBestKernels(const Params& params, const optional_params& options) const = 0;
virtual KernelsData GetNaiveBestKernel(const Params& params,
const optional_params& options,
KernelType kType) const;
KernelsData GetNaiveBestKernel(const KernelList& all_impls,
const Params& params,
const optional_params& options) const;
virtual KernelsData GetAutoTuneBestKernel(const Params& params,
const optional_params& options,
KernelType kType) const;
KernelsData GetNaiveBestKernel(const Params& params,
const optional_params& options,
KernelType kType) const;
KernelsData GetAutoTuneBestKernel(const Params& params,
const optional_params& options,
KernelType kType) const;
KernelList GetAllImplementations(const Params& params, const optional_params& options, KernelType kType) const;

View File

@ -360,25 +360,6 @@ void ParamsKey::EnableArgMaxMinAxis(ArgMaxMinAxis a) {
}
}
void ParamsKey::EnableIndexSelectAxis(IndexSelectAxis a) {
switch (a) {
case IndexSelectAxis::X:
key.restrict.val.dedicated.idxsel.axisX = 1;
break;
case IndexSelectAxis::Y:
key.restrict.val.dedicated.idxsel.axisY = 1;
break;
case IndexSelectAxis::FEATURE:
key.restrict.val.dedicated.idxsel.axisFeature = 1;
break;
case IndexSelectAxis::BATCH:
key.restrict.val.dedicated.idxsel.axisBatch = 1;
break;
default:
break;
}
}
void ParamsKey::EnableQuantization(QuantizationType q) {
switch (q) {
case QuantizationType::NONE:

View File

@ -25,7 +25,6 @@ using DataBitField = std::bitset<DataLayout::DataLayoutCount>;
using WightsBitField = std::bitset<WeightsLayout::WeightsLayoutCount>;
class JitConstants;
class TuningCache;
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// fuse_params
@ -102,7 +101,6 @@ class ParamsKey {
public:
ParamsKey() {
key.restrict.raw = 0;
key.enableTuning = 1;
key.inputType.raw = 0;
key.outputType.raw = 0;
key.inputWeightsType.raw = 0;
@ -266,7 +264,6 @@ public:
uint32_t raw;
} DataTypesKey;
uint32_t enableTuning;
DataTypesKey inputType;
DataTypesKey outputType;
DataTypesKey inputWeightsType;
@ -345,17 +342,9 @@ public:
void EnableLSTMDyanmicOptionalHiddenOutput() { key.restrict.val.dedicated.lstm_dynamic.last_hidden = 1; }
void EnableLSTMDyanmicOptionalCellOutput() { key.restrict.val.dedicated.lstm_dynamic.last_cell = 1; }
void EnableConcatKernelPerInput() { key.restrict.val.dedicated.concat.kernelPerInput = 1; }
void DisableTuning() { key.enableTuning = 0; }
void EnableConcatOneKernel() { key.restrict.val.dedicated.concat.oneKernel = 1; }
void EnableArgMaxMinAxis(ArgMaxMinAxis a);
void EnableIndexSelectAxis(IndexSelectAxis a);
void EnableFusedConvEltwiseRWOutOpt();
bool Support(const ParamsKey& k) const;
bool TuningSupport() const {
if (key.enableTuning == 1)
return true;
return false;
}
bool isEnabledDifferentInputWeightsTypes() const {
return key.restrict.val.different_input_weights_types ? true : false;
}
@ -405,7 +394,6 @@ struct EngineInfo {
std::string deviceId = "";
std::string driverVersion = "";
std::vector<size_t> supportedSimdSizes = {};
std::shared_ptr<TuningCache> deviceCache;
DeviceFeaturesKey get_supported_device_features_key() const;
};
@ -663,18 +651,6 @@ protected:
explicit base_params(KernelType kt) : Params(kt, ""), inputs(1), outputs(1) {}
};
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Auto tuner parameters
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
class KernelRunnerInterface;
struct TuningParams {
TuningMode mode;
std::string cacheFilePath;
std::shared_ptr<KernelRunnerInterface> runner;
TuningParams() : mode(TuningMode::TUNING_DISABLED), cacheFilePath(""), runner(nullptr) {}
};
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// optional_params
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
@ -694,8 +670,6 @@ struct optional_params {
bool allowOutputReordering =
false; // allow kernel to ask graph compiler to reorder the output data before executing the next kernel
TuningParams tuningParams;
virtual ParamsKey GetSupportedKey() const;
protected:

View File

@ -18,6 +18,6 @@ binary_convolution_kernel_selector::binary_convolution_kernel_selector() {
KernelsData binary_convolution_kernel_selector::GetBestKernels(const Params& params,
const optional_params& options) const {
return GetAutoTuneBestKernel(params, options, KernelType::BINARY_CONVOLUTION);
return GetNaiveBestKernel(params, options, KernelType::BINARY_CONVOLUTION);
}
} // namespace kernel_selector

View File

@ -64,7 +64,6 @@ ParamsKey Convolution_kernel_b_fs_yx_fsv16_imad_1x1::GetSupportedKey() const {
k.EnableQuantization(QuantizationType::ASYMMETRIC_DATA);
k.EnableQuantization(QuantizationType::ASYMMETRIC_WEIGHTS);
k.EnableQuantization(QuantizationType::ASYMMETRIC_DATA_AND_WEIGHTS);
k.DisableTuning();
return k;
}

View File

@ -375,7 +375,6 @@ ParamsKey Convolution_kernel_b_fs_zyx_fsv16_imad::GetSupportedKey() const {
k.EnableQuantization(QuantizationType::ASYMMETRIC_WEIGHTS);
k.EnableQuantization(QuantizationType::ASYMMETRIC_DATA_AND_WEIGHTS);
k.EnableDilation();
k.DisableTuning();
return k;
}

View File

@ -87,7 +87,6 @@ ParamsKey ConvolutionKernel_imad::GetSupportedKey() const {
k.EnableQuantization(QuantizationType::ASYMMETRIC_DATA);
k.EnableQuantization(QuantizationType::ASYMMETRIC_WEIGHTS);
k.EnableQuantization(QuantizationType::ASYMMETRIC_DATA_AND_WEIGHTS);
k.DisableTuning();
return k;
}

View File

@ -38,7 +38,6 @@ ParamsKey Convolution_kernel_imad_bs_fs_yx_bsv16_fsv16_1x1::GetSupportedKey() co
k.EnableNonBiasTerm();
k.EnableBatching();
k.EnableQuantization(QuantizationType::SYMMETRIC);
k.DisableTuning();
return k;
}

View File

@ -38,7 +38,6 @@ ParamsKey Convolution_kernel_imad_bs_fs_yx_bsv16_fsv16_3x3::GetSupportedKey() co
k.EnableNonBiasTerm();
k.EnableBatching();
k.EnableQuantization(QuantizationType::SYMMETRIC);
k.DisableTuning();
return k;
}

View File

@ -40,7 +40,6 @@ ParamsKey ConvolutionKernel_mmad_b_fs_yx_fsv32::GetSupportedKey() const {
k.EnableQuantization(QuantizationType::ASYMMETRIC_DATA_AND_WEIGHTS);
k.EnableDifferentTypes();
k.EnableDifferentInputWeightsTypes();
k.DisableTuning();
return k;
}

View File

@ -37,7 +37,6 @@ ParamsKey ConvolutionKernel_mmad_b_fs_yx_fsv32_dw::GetSupportedKey() const {
k.EnableQuantization(QuantizationType::ASYMMETRIC_DATA);
k.EnableQuantization(QuantizationType::ASYMMETRIC_WEIGHTS);
k.EnableDifferentTypes();
k.DisableTuning();
k.EnableGroupedConvolution();
k.EnableDifferentInputWeightsTypes();
return k;

View File

@ -43,7 +43,6 @@ ParamsKey ConvolutionKernel_mmad_bfyx_to_b_fs_yx_fsv32::GetSupportedKey() const
k.EnableQuantization(QuantizationType::ASYMMETRIC_DATA_AND_WEIGHTS);
k.EnableDifferentTypes();
k.EnableDifferentInputWeightsTypes();
k.DisableTuning();
return k;
}

View File

@ -36,7 +36,6 @@ ParamsKey ConvolutionKernel_mmad_bfyx_to_b_fs_yx_fsv4::GetSupportedKey() const {
k.EnableQuantization(QuantizationType::ASYMMETRIC_DATA);
k.EnableDifferentTypes();
k.EnableDifferentInputWeightsTypes();
k.DisableTuning();
return k;
}

View File

@ -38,7 +38,6 @@ ParamsKey ConvolutionKernel_Ref::GetSupportedKey() const {
k.EnableBiasPerOutput();
k.EnableNonBiasTerm();
k.EnableBatching();
k.DisableTuning();
k.EnableGroupedConvolution();
k.EnableQuantization(QuantizationType::SYMMETRIC);

View File

@ -22,7 +22,6 @@ ParamsKey ConvolutionKernel_yxfb_Ref::GetSupportedKey() const {
k.EnableNonBiasTerm();
k.EnableBatching();
k.EnableDilation();
k.DisableTuning();
k.EnableGroupedConvolution();
return k;
}

View File

@ -23,7 +23,6 @@ ParamsKey DeformableConvolutionKernel_bfyx_conv::GetSupportedKey() const {
k.EnableBiasPerFeature();
k.EnableNonBiasTerm();
k.EnableBatching();
k.DisableTuning();
k.EnableGroupedConvolution();
k.EnableDeformableMode();
k.EnableDeformableMask();

View File

@ -23,7 +23,6 @@ ParamsKey DeformableConvolutionKernel_bfyx_interp::GetSupportedKey() const {
k.EnableBiasPerFeature();
k.EnableNonBiasTerm();
k.EnableBatching();
k.DisableTuning();
k.EnableGroupedConvolution();
k.EnableDeformableMode();
k.EnableDeformableMask();

View File

@ -27,7 +27,6 @@ ParamsKey DeformableConvolutionKernel_bfyx_Ref::GetSupportedKey() const {
k.EnableBiasPerFeature();
k.EnableNonBiasTerm();
k.EnableBatching();
k.DisableTuning();
k.EnableGroupedConvolution();
k.EnableDeformableMode();
k.EnableDeformableMask();

View File

@ -65,7 +65,6 @@ void ExecutionConfig::set_default() {
std::make_tuple(ov::intel_gpu::enable_memory_pool, true),
std::make_tuple(ov::intel_gpu::allow_static_input_reorder, false),
std::make_tuple(ov::intel_gpu::custom_outputs, std::vector<std::string>{}),
std::make_tuple(ov::intel_gpu::tuning_config, ov::intel_gpu::TuningConfig{}),
std::make_tuple(ov::intel_gpu::dump_graphs, ""),
std::make_tuple(ov::intel_gpu::force_implementations, ImplForcingMap{}),
std::make_tuple(ov::intel_gpu::partial_build_program, false),

View File

@ -1,353 +0,0 @@
// Copyright (C) 2018-2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include "test_utils.h"
#include <intel_gpu/primitives/input_layout.hpp>
#include <intel_gpu/primitives/convolution.hpp>
#include <intel_gpu/primitives/data.hpp>
#include <iostream>
#include <fstream>
#include <string>
namespace {
enum class cache_version {
version_1,
version_1_2, // version 1 cache, but version 2 file
version_2,
version_2_invalid,
version_2_from_1,
version_2_empty
};
std::string reference_impl_name = "convolution_gpu_ref";
std::string eus_marker = "__EUs__";
std::string cache_v1 =
R"__a({
"__EUs__": {
"18283230515392601293": ["convolution_gpu_ref", 0]
}
})__a";
std::string cache_v1_2 =
R"__a({
"version_2": {
},
"version_1": {
"__EUs__": {
"18283230515392601293": ["convolution_gpu_ref", 0]
}
}
})__a";
std::string cache_v2 =
R"__a({
"version_2": {
"__EUs__": {
"CONVOLUTION": {
"F32_BFYX_v3_p0_0_v3_p0_0_v16_p0_0_v1_p0_0;F32_BFYX_v3_p0_0_v3_p0_0_v16_p0_0_v1_p0_0;1_1_1;1_1_1;1_1_1;0_0_0;1;1": ["convolution_gpu_ref", 0]
}
}
}
})__a";
std::string cache_v2_from_v1 =
R"__a({
"version_2": {
"__EUs__": {
"CONVOLUTION": {
"F32_BFYX_v3_p0_0_v3_p0_0_v16_p0_0_v1_p0_0;F32_BFYX_v3_p0_0_v3_p0_0_v16_p0_0_v1_p0_0;1_1_1;1_1_1;1_1_1;0_0_0;1;1": ["convolution_gpu_ref", 0]
}
}
},
"version_1": {
"__EUs__": {}
}
})__a";
std::string cache_v2_invalid =
R"__a({
"version_2": {
"__EUs__": {
"CONVOLUTION": {
"F32_BFYX_v3_p0_0_v3_p0_0_v16_p0_0_v1_p0_0;F32_BFYX_v3_p0_0_v3_p0_0_v16_p0_0_v1_p0_0;1_1_1;1_1_1;1_1_1;0_0_0;1;1": ["non_existent", 0]
}
}
}
})__a";
std::string cache_v2_empty =
R"__a({
"version_2": {
"__EUs__": {
"CONVOLUTION": {}
}
}
})__a";
std::string get_cache_version(cache_version version) {
std::string cache;
switch (version) {
case cache_version::version_1:
cache = cache_v1;
break;
case cache_version::version_1_2:
cache = cache_v1_2;
break;
case cache_version::version_2:
cache = cache_v2;
break;
case cache_version::version_2_invalid:
cache = cache_v2_invalid;
break;
case cache_version::version_2_from_1:
cache = cache_v2_from_v1;
break;
case cache_version::version_2_empty:
cache = cache_v2_empty;
break;
default:
throw std::invalid_argument("invalid cache version");
}
return cache;
}
std::string get_temporary_cache_file() {
static int i = 0;
std::string tmp_cache_file = "tmp_cldnn_test_cache_" + std::to_string(i) + ".json";
i += 1;
return tmp_cache_file;
}
template <typename T>
void replace(std::string& text, const std::string& replaced, T replacement) {
auto it = text.find(replaced);
while (it != std::string::npos) {
text.replace(it, replaced.length(), std::to_string(replacement));
it = text.find(replaced);
}
}
void write(const std::string& filename, const std::string& text) {
std::ofstream file;
file.open(filename);
if (!file.is_open())
throw std::runtime_error("Could not open file " + filename);
file << text;
file.close();
if (!file) {
throw std::runtime_error("Failure writing to file " + filename);
}
}
std::string read(const std::string& filename) {
std::stringstream ss;
std::ifstream file;
file.open(filename);
if (!file.is_open())
throw std::runtime_error("Could not open file " + filename);
ss << file.rdbuf();
file.close();
if (!file) {
throw std::runtime_error("Failure reading from file " + filename);
}
return ss.str();
}
void remove(const std::string& filename) {
std::remove(filename.c_str());
}
class cache_test_helper {
public:
cache_test_helper(cldnn::engine& engine, cache_version v)
: _engine(engine)
, _mode(ov::intel_gpu::TuningMode::tuning_disabled)
, cache_filename(get_temporary_cache_file())
{
auto cache = get_cache_version(v);
auto eus = engine.get_device_info().execution_units_count;
replace(cache, eus_marker, eus);
write(cache_filename, cache);
}
virtual ~cache_test_helper() {
remove(cache_filename);
}
cache_test_helper& with_mode(ov::intel_gpu::TuningMode mode) {
_mode = mode;
return *this;
}
cache_test_helper& expect_cache(cache_version version) {
compare_cache = version;
return *this;
}
cache_test_helper& expect_implementation(std::string implementation) {
compare_implementation = implementation;
return *this;
}
cache_test_helper& expect_implementation_not(std::string implementation) {
compare_implementation = implementation;
compare_implementation.not_equal = true;
return *this;
}
void test() {
auto w_mem = _engine.allocate_memory(cldnn::layout(cldnn::data_types::f32, cldnn::format::bfyx, { 16, 16, 1, 1 }));
auto topology = cldnn::topology(
cldnn::input_layout("input", cldnn::layout(cldnn::data_types::f32, cldnn::format::bfyx, { 1, 16, 3, 3 })),
cldnn::data("weights", w_mem),
cldnn::convolution("conv", input_info("input"), { "weights" })
);
ov::intel_gpu::TuningConfig tune_conf;
tune_conf.cache_file_path = cache_filename;
tune_conf.mode = _mode;
ExecutionConfig config{
ov::intel_gpu::tuning_config(tune_conf),
ov::intel_gpu::optimize_data(true)
};
cldnn::network network(_engine, topology, config);
auto in_mem = _engine.allocate_memory(cldnn::layout(cldnn::data_types::f32, cldnn::format::bfyx, { 1, 16, 3, 3 }));
network.set_input_data("input", in_mem);
network.execute();
if (compare_implementation.compare) {
std::string exec_impl = network.get_implementation_info("conv");
auto precision_pos = exec_impl.find("__");
exec_impl = exec_impl.substr(0, precision_pos);
if (compare_implementation.not_equal) {
EXPECT_NE(exec_impl, compare_implementation.value);
} else {
ASSERT_EQ(exec_impl, compare_implementation.value);
}
}
if (compare_cache.compare) {
auto cache = read(cache_filename);
auto expected_cache = get_cache_version(compare_cache.value);
auto eus = _engine.get_device_info().execution_units_count;
replace(expected_cache, eus_marker, eus);
ASSERT_EQ(cache, expected_cache);
}
}
private:
template <typename T>
struct optional_compare {
bool compare;
bool not_equal;
T value;
optional_compare() : compare(false) {}
optional_compare(T v) : compare(true), not_equal(false), value(v) {}
optional_compare(T v, bool neq) : compare(true), not_equal(neq), value(v) {}
};
cldnn::engine& _engine;
ov::intel_gpu::TuningMode _mode;
std::string cache_filename;
optional_compare<cache_version> compare_cache;
optional_compare<std::string> compare_implementation;
};
} // namespace
class cache_version_test : public testing::TestWithParam<cache_version> {
public:
static std::string to_string(const testing::TestParamInfo<cache_version>& param) {
std::string result;
switch (param.param) {
case cache_version::version_1:
result = "version_1";
break;
case cache_version::version_1_2:
result = "version_1_2";
break;
case cache_version::version_2:
result = "version_2";
break;
case cache_version::version_2_invalid:
result = "version_2_invalid";
break;
case cache_version::version_2_from_1:
result = "version_2_from_1";
break;
case cache_version::version_2_empty:
result = "version_2_empty";
break;
default:
result = std::to_string(static_cast<int>(param.param));
break;
}
return result;
}
};
TEST(cache_test, no_cache_baseline) {
SCOPED_TRACE("default implementation same as reference, cache tests may provide invalid pass");
auto& engine = tests::get_test_engine();
auto helper = cache_test_helper(engine, cache_version::version_2);
helper.with_mode(ov::intel_gpu::TuningMode::tuning_disabled)
.expect_implementation_not(reference_impl_name)
.test();
}
TEST_P(cache_version_test, use_only) {
auto version = GetParam();
auto& engine = tests::get_test_engine();
cache_test_helper helper(engine, version);
helper.with_mode(ov::intel_gpu::TuningMode::tuning_use_cache)
.expect_implementation(reference_impl_name)
.expect_cache(version)
.test();
}
TEST_P(cache_version_test, update) {
auto version = GetParam();
auto ex_version = cache_version::version_2;
if (version != cache_version::version_2) {
ex_version = cache_version::version_2_from_1;
}
auto& engine = tests::get_test_engine();
cache_test_helper helper(engine, version);
helper.with_mode(ov::intel_gpu::TuningMode::tuning_use_and_update)
.expect_implementation(reference_impl_name)
.expect_cache(ex_version)
.test();
}
INSTANTIATE_TEST_SUITE_P(
smoke,
cache_version_test,
testing::Values(cache_version::version_1, cache_version::version_1_2, cache_version::version_2),
cache_version_test::to_string);
TEST(cache_test, remove_invalid) {
auto& engine = tests::get_test_engine();
cache_test_helper helper(engine, cache_version::version_2_invalid);
helper.with_mode(ov::intel_gpu::TuningMode::tuning_use_and_update)
.expect_implementation_not(reference_impl_name)
.expect_cache(cache_version::version_2_empty)
.test();
}