[GPU] Move tuning cache loading to kernel selector (#15112)
* [GPU] Move tuning cache loading to kernel selector. Remove tuning modes
* [GPU] Removed kernel runner
parent c1a9152d1c
commit 34d16b8777
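In effect, the per-program tuning-cache plumbing (program::load_tuning_cache(), engineInfo.deviceCache, the GPU_TUNING_CONFIG property, and every TuningMode except the offline path) is deleted, and the kernel selector now loads the offline cache itself through a lazy, process-wide singleton. A condensed sketch of the new lookup path, assembled from the auto_tuner.cpp hunks below; get_path_next_to_binary() is a hypothetical stand-in for the platform-specific dladdr()/GetModuleFileName() lookup, so treat this as an illustration rather than the verbatim source:

// Sketch only - condensed from the new auto_tuner.cpp code in this diff.
TuningCache* TuningCache::get() {
    static std::mutex m;
    static std::shared_ptr<TuningCache> cache_instance = nullptr;
    std::lock_guard<std::mutex> lock(m);
    if (!cache_instance) {
        try {
            // "cache.json" is resolved next to the plugin binary;
            // get_path_next_to_binary() stands in for the #ifdef'd lookup.
            cache_instance = std::make_shared<TuningCache>(get_path_next_to_binary("cache.json"), false);
        } catch (...) {
            cache_instance = std::make_shared<TuningCache>();  // empty fallback cache
        }
    }
    return cache_instance.get();
}

std::tuple<std::string, int> AutoTuner::LoadKernelOffline(const Params& params) {
    std::lock_guard<std::mutex> lock(mutex);
    TuningCache* deviceCache = TuningCache::get();  // was: a cache passed in by the program
    if (!deviceCache)
        return {};
    return deviceCache->LoadKernel(params, false);
}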
@@ -17,10 +17,6 @@
#include <utility>
#include <set>

namespace kernel_selector {
class TuningCache;
} // namespace kernel_selector

namespace cldnn {

struct topology;
@@ -248,9 +244,6 @@ public:
kernel::ptr get_kernel(kernel_id id);
kernels_cache& get_kernels_cache() const;

void load_tuning_cache();
std::shared_ptr<kernel_selector::TuningCache> get_tuning_cache() const { return tuning_cache; }

// returns {-1, -1} if it failed to estimate by allocating given batch size
std::pair<int64_t/*const alloc*/, int64_t/*general alloc*/> get_estimated_device_mem_usage();

@@ -270,7 +263,6 @@ private:
std::vector<program_node*> outputs;
nodes_ordering processing_order;
std::unique_ptr<pass_manager> pm;
std::shared_ptr<kernel_selector::TuningCache> tuning_cache;
bool is_body_program;
int8_t is_subgroup_local_block_io_supported;

@@ -48,41 +48,6 @@ static constexpr Property<bool, PropertyMutability::RW> partial_build_program{"G
static constexpr Property<bool, PropertyMutability::RW> allow_new_shape_infer{"GPU_ALLOW_NEW_SHAPE_INFER"};
static constexpr Property<std::string, PropertyMutability::RW> dump_graphs{"GPU_DUMP_GRAPHS"};
static constexpr Property<std::vector<std::string>, PropertyMutability::RW> custom_outputs{"GPU_CUSTOM_OUTPUTS"};

/// @brief Tuning mode.
enum class TuningMode {
/// @brief Tuning is disabled.
tuning_disabled,

/// @brief Tuning using the cached data (no on-line tuning for non-existing data).
tuning_use_cache,

/// @brief Tuning using the cached data if exist, tune and update cache otherwise.
tuning_tune_and_cache,

/// @brief Tuning using the cached data and update tasks.
/// @details Performs updating tasks like removal of invalid caches, promoting to new format, etc.
/// No tuning for non-existing data.
tuning_use_and_update,

/// @brief Retune the cache data even if it exists.
tuning_retune_and_cache
};

struct TuningConfig {
TuningMode mode;
std::string cache_file_path;

TuningConfig() : mode(TuningMode::tuning_disabled), cache_file_path("") {}
};

inline std::ostream& operator<<(std::ostream& os, const TuningConfig& val) {
os << val.cache_file_path;
return os;
}

static constexpr Property<TuningConfig, PropertyMutability::RW> tuning_config{"GPU_TUNING_CONFIG"};

static constexpr Property<ImplForcingMap, PropertyMutability::RW> force_implementations{"GPU_FORCE_IMPLEMENTATIONS"};
static constexpr Property<std::string, PropertyMutability::RW> config_file{"CONFIG_FILE"};
static constexpr Property<bool, PropertyMutability::RW> enable_lp_transformations{"LP_TRANSFORMS_MODE"};

@@ -9,7 +9,6 @@
#include "kernel_selector_helper.h"
#include "arg_max_min/arg_max_min_kernel_selector.h"
#include "arg_max_min/arg_max_min_kernel_base.h"
#include "kernel_runner.h"

namespace cldnn {
namespace ocl {

@@ -8,7 +8,6 @@
#include "impls/implementation_map.hpp"
#include "intel_gpu/runtime/error_handler.hpp"
#include "kernel_selector_helper.h"
#include "kernel_runner.h"
#include "kernel_selector/kernels/binary_convolution/binary_convolution_kernel_selector.h"
#include "kernel_selector/kernels/binary_convolution/binary_convolution_params.h"
#include <algorithm>
@@ -75,14 +74,6 @@ public:
uint32_t dilation_x = dilation.size() >= 1 ? dilation[dilation.size() - 1] : 1;
params.dilation = {dilation_x, dilation_y, dilation_z};

const auto& tuning_config = impl_param.get_program().get_config().get_property(ov::intel_gpu::tuning_config);

if (tuning_config.mode == ov::intel_gpu::TuningMode::tuning_tune_and_cache ||
tuning_config.mode == ov::intel_gpu::TuningMode::tuning_retune_and_cache) {
optional_params.tuningParams.runner =
std::make_shared<gpu::kernel_runner>(impl_param.get_program().get_engine(), impl_param.get_program().get_id(), true);
}

return {params, optional_params};
}
};

@@ -8,7 +8,6 @@
#include "impls/implementation_map.hpp"
#include "intel_gpu/runtime/error_handler.hpp"
#include "kernel_selector_helper.h"
#include "kernel_runner.h"
#include "convolution/convolution_kernel_selector.h"
#include "convolution/convolution_params.h"
#include <algorithm>
@@ -166,14 +165,6 @@ public:

auto& kernel_selector = kernel_selector::convolution_kernel_selector::Instance();

const auto& tuning_config = impl_param.get_program().get_config().get_property(ov::intel_gpu::tuning_config);

if (tuning_config.mode == ov::intel_gpu::TuningMode::tuning_tune_and_cache ||
tuning_config.mode == ov::intel_gpu::TuningMode::tuning_retune_and_cache) {
conv_optional_params.tuningParams.runner =
std::make_shared<gpu::kernel_runner>(arg.get_program().get_engine(), arg.get_program().get_id(), true, true);
}

auto best_kernel = kernel_selector.get_best_kernel(conv_params, conv_optional_params);

return make_unique<convolution_impl>(best_kernel);

@@ -7,7 +7,6 @@
#include "impls/implementation_map.hpp"
#include "intel_gpu/runtime/error_handler.hpp"
#include "kernel_selector_helper.h"
#include "kernel_runner.h"
#include "convolution/convolution_kernel_selector.h"
#include "convolution/convolution_params.h"
#include <algorithm>

@@ -10,7 +10,6 @@
#include "fully_connected/fully_connected_params.h"

#include "intel_gpu/runtime/error_handler.hpp"
#include "kernel_runner.h"

#include "intel_gpu/primitives/reorder.hpp"
#include "intel_gpu/primitives/input_layout.hpp"
@@ -119,7 +118,6 @@ public:
params.quantization = kernel_selector::QuantizationType::NONE;
}

optional_params.tuningParams.runner = std::make_shared<gpu::kernel_runner>(progam.get_engine(), progam.get_id(), true);
return {params, optional_params};
}

@@ -28,7 +28,6 @@ using namespace cldnn;

namespace cldnn {
enum class data_types : size_t;
enum class tuning_mode;
struct format;
struct layout;
struct program;
@@ -65,7 +64,6 @@ using softmax_dim = kernel_selector::SoftmaxDim;
using mean_subtruct_mode = kernel_selector::MeanSubtractMode;
using mean_op = kernel_selector::MeanOp;
using concat_axis = kernel_selector::ConcatAxis;
using tuning_mode = kernel_selector::TuningMode;
using sample_type = kernel_selector::ResampleType;
using coordinate_transformation_mode = kernel_selector::CoordinateTransformationMode;
using nearest_mode = kernel_selector::NearestMode;
@@ -101,7 +99,6 @@ kernel_selector::data_layout to_data_layout(format f);
cldnn::format from_data_layout(kernel_selector::data_layout l);
kernel_selector::weights_layout to_weights_layout(format f, bool is_grouped);
cldnn::format::type from_weights_layout(kernel_selector::weights_layout l);
kernel_selector::tuning_mode to_tuning_mode(ov::intel_gpu::TuningMode mode);
kernel_selector::data_tensor convert_data_tensor(const layout& l, const tensor view_offset = tensor {});
kernel_selector::weights_tensor convert_weights_tensor(const layout& l, bool is_grouped = false);
layout from_weights_tensor(const kernel_selector::weights_tensor& t);

@@ -1,249 +0,0 @@
// Copyright (C) 2018-2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#include "kernel_runner.h"
#include "runtime/kernels_cache.hpp"
#include "intel_gpu/runtime/stream.hpp"
#include "weight_bias_params.h"
#include "kernel_selector_helper.h"
#include <chrono>
#include <vector>
#include <limits>
#include <algorithm>

namespace cldnn {
namespace gpu {

kernel_runner::kernel_runner(engine& engine_ref, uint32_t program_id, bool weights_and_bias_exist, bool zero_points_exist)
: _engine(engine_ref), program_id(program_id), weights_and_bias_exist(weights_and_bias_exist), zero_points_exist(zero_points_exist) {}

void kernel_runner::prepare_kernel_args(const kernel_selector::KernelsData& kernels_data,
kernel_arguments_data& args) {
const auto& base_params = *static_cast<kernel_selector::base_params*>(kernels_data[0].params.get());
// Prepare input buffers
if (input_buffers.empty()) {
for (const auto& input : base_params.inputs) {
int num_of_input_elements = static_cast<int>(input.PhysicalSize());
input_buffers.push_back(_engine.allocate_memory(
{from_data_type(input.GetDType()), format::bfyx, tensor(1, 1, num_of_input_elements, 1)}));
}
}
for (const auto& input : input_buffers) {
args.inputs.push_back(input);
}
// Prepare fused operations buffers
if (fused_ops_buffers.empty()) {
for (auto& fused_op : base_params.fused_ops) {
for (auto& fused_ops_input : fused_op.tensors) {
auto num_of_elements = static_cast<int>(fused_ops_input.PhysicalSize());
fused_ops_buffers.push_back(_engine.allocate_memory(
{ from_data_type(fused_ops_input.GetDType()), format::bfyx, tensor(1, 1, num_of_elements, 1) }));
}
}
}
for (const auto& fused_op_input : fused_ops_buffers) {
args.fused_op_inputs.push_back(fused_op_input);
}
// Prepare output buffer
if (output_buffers.empty()) {
for (size_t i = 0; i < base_params.outputs.size(); ++i) {
int num_of_output_elements = static_cast<int>(base_params.outputs[i].PhysicalSize());
output_buffers.push_back(_engine.allocate_memory({from_data_type(base_params.outputs[0].GetDType()),
format::bfyx, tensor(1, 1, num_of_output_elements, 1)}));
}
}
for (const auto& output : output_buffers) {
args.outputs.push_back(output);
}


if (weights_and_bias_exist) {
// Prepare weight buffer
const auto& weights_bias_params =
*static_cast<kernel_selector::weight_bias_params*>(kernels_data[0].params.get());
int num_of_weight_elements_ifm = static_cast<int>(weights_bias_params.weights.IFM().v);
int num_of_weight_elements_spatial_y = static_cast<int>(weights_bias_params.weights.Y().v);
int num_of_weight_elements_spatial_x = static_cast<int>(weights_bias_params.weights.X().v);
int num_of_weight_elements_spatial = static_cast<int>(weights_bias_params.weights.PhysicalSize());
int num_of_weight_elements_ofm = 1;

cldnn::format::type fmt = cldnn::format::bfyx;

if (!cldnn::format::is_image_2d(from_weights_layout(weights_bias_params.weights.GetLayout()))) {
if (weight_buffers.empty())
weight_buffers.push_back(
_engine.allocate_memory({from_weights_type(weights_bias_params.weights.GetDType()),
fmt,
tensor(num_of_weight_elements_ofm, 1, num_of_weight_elements_spatial, 1)}));

if (weight_buffers[0]->get_layout().format != fmt)
weight_buffers[0] =
_engine.allocate_memory({from_weights_type(weights_bias_params.weights.GetDType()),
fmt,
tensor(num_of_weight_elements_ofm, 1, num_of_weight_elements_spatial, 1)});

while (weight_buffers[0]->get_layout().bytes_count() < weights_bias_params.weights.PhysicalSizeInBytes()) {
// Weights layout depends on the kernel. Multiply the buffer size by 2 until it is big enough
// (to avoid complex computations of the exact buffer size according to the chosen layout).
weight_buffers.clear();
num_of_weight_elements_spatial *= 2;
weight_buffers.push_back(
_engine.allocate_memory({from_weights_type(weights_bias_params.weights.GetDType()),
fmt,
tensor(num_of_weight_elements_ofm, 1, num_of_weight_elements_spatial, 1)}));
}
} else {
weight_buffers.clear();
fmt = from_weights_layout(weights_bias_params.weights.GetLayout());
num_of_weight_elements_ofm = static_cast<int>(weights_bias_params.weights.OFM().v);
weight_buffers.push_back(_engine.allocate_memory({from_weights_type(weights_bias_params.weights.GetDType()),
fmt,
tensor(num_of_weight_elements_ofm,
num_of_weight_elements_ifm,
num_of_weight_elements_spatial_x,
num_of_weight_elements_spatial_y)}));
}
args.weights = weight_buffers[0];

// Prepare bias buffer
if (!weights_bias_params.bias.empty()) {
if (bias_buffers.empty()) {
int num_of_bias_elements = static_cast<int>(weights_bias_params.bias[0].PhysicalSize());
bias_buffers.push_back(_engine.allocate_memory({from_data_type(weights_bias_params.bias[0].GetDType()),
format::bfyx,
tensor(1, num_of_bias_elements, 1, 1)}));
}
args.bias = bias_buffers[0];
}
if (zero_points_exist) {
const auto& zero_point_params =
static_cast<const kernel_selector::weight_bias_zero_point_params&>(weights_bias_params);
if (!zero_point_params.weights_zero_points.empty()) {
if (weight_zero_point_buffers.empty()) {
auto& weight_zero_point = zero_point_params.weights_zero_points[0];
auto num_of_elements = static_cast<int>(weight_zero_point.PhysicalSize());
weight_zero_point_buffers.push_back(
_engine.allocate_memory({
from_data_type(weight_zero_point.GetDType()),
format::bfyx,
tensor(1, num_of_elements, 1, 1) }));
}
args.weights_zero_points = weight_zero_point_buffers[0];
}
if (!zero_point_params.activations_zero_points.empty()) {
if (activation_zero_point_buffers.empty()) {
auto& activation_zero_point = zero_point_params.activations_zero_points[0];
auto num_of_elements = static_cast<int>(activation_zero_point.PhysicalSize());
activation_zero_point_buffers.push_back(
_engine.allocate_memory({
from_data_type(activation_zero_point.GetDType()),
format::bfyx,
tensor(1, num_of_elements, 1, 1) }));
}
args.activations_zero_points = activation_zero_point_buffers[0];
}
if (!zero_point_params.compensation.empty()) {
if (compensation_buffers.empty()) {
auto& compensation = zero_point_params.compensation[0];
auto num_of_elements = static_cast<int>(compensation.PhysicalSize());
compensation_buffers.push_back(
_engine.allocate_memory({
from_data_type(compensation.GetDType()),
format::bfyx,
tensor(1, num_of_elements, 1, 1) }));
}
args.compensation = compensation_buffers[0];
}
}
}
}

std::vector<std::chrono::nanoseconds> kernel_runner::run_kernels(const kernel_selector::KernelsData& kernels_data) {
std::vector<std::chrono::nanoseconds> run_times;

stream::ptr stream = _engine.create_stream({});

int num_of_kernels_to_run = static_cast<int>(kernels_data.size());
int num_of_kernels_run = 0;

kernel_selector::KernelsData::const_iterator batch_start = kernels_data.begin();
kernel_selector::KernelsData::const_iterator batch_end;
while (num_of_kernels_to_run > 0) {
int current_compilation_batch = std::min(num_of_kernels_to_run, compilation_batch_size);
batch_end = batch_start + current_compilation_batch;

std::vector<kernel::ptr> kernels;
kernels_cache cache(_engine, {}, program_id);

for (auto it = batch_start; it < batch_end; it++) {
auto kernel_id = cache.set_kernel_source(it->kernels[0].code.kernelString, false);

kernels.push_back(cache.get_kernel(kernel_id));
}

kernel_arguments_data args;

prepare_kernel_args(kernels_data, args);
stream->finish();

int i = 0;
for (auto it = batch_start; it < batch_end; it++) {
std::vector<event::ptr> events;
auto kernel_run_time = std::chrono::nanoseconds::max();
int num_of_runs = 0;

for (int iteration = 0; iteration < runs_per_kernel; iteration++) {
event::ptr event;
try {
stream->set_arguments(*kernels[i], it->kernels[0].params, args);
event = stream->enqueue_kernel(*kernels[i], it->kernels[0].params, args, {});
} catch (std::exception& e) {
std::cout << "[clDNN] Could not run kernel for auto-tune: " << it->kernelName
<< " with auto-tune index " << it->autoTuneIndex << std::endl
<< ", error message:" << e.what();
} catch (...) {
// Could not run this kernel. Push back NULL event (will be ignored later).
std::cout << "[clDNN] Could not run kernel for auto-tune: " << it->kernelName
<< " with auto-tune index " << it->autoTuneIndex << std::endl;
}
events.push_back(event);
}
stream->finish();

for (auto& event : events) {
if (event.get() != NULL) {
auto profiling_intervals = event->get_profiling_info();
for (auto const& profiling_interval : profiling_intervals) {
if (profiling_interval.stage == instrumentation::profiling_stage::executing) {
kernel_run_time = std::min(profiling_interval.value->value(), kernel_run_time);
num_of_runs++;
break;
}
}
}
}

if (num_of_runs > 0) {
run_times.push_back(kernel_run_time);
num_of_kernels_run += 1;
} else {
run_times.push_back(std::chrono::nanoseconds::max());
}
i++;
}

num_of_kernels_to_run -= current_compilation_batch;
batch_start += current_compilation_batch;
}

if (num_of_kernels_run == 0) {
// If all kernels failed to run throw to avoid corrupting cache
throw std::runtime_error("kernel_runner::run_kernels - could not run any of provided kernels");
}

return run_times;
}

} // namespace gpu
} // namespace cldnn
@@ -1,45 +0,0 @@
// Copyright (C) 2018-2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma once

#include "intel_gpu/runtime/engine.hpp"
#include "kernel_selector_common.h"
#include "kernel_selector_helper.h"
#include "kernel_runner_interface.h"
#include <vector>

namespace cldnn {
namespace gpu {

class kernel_runner : public kernel_selector::KernelRunnerInterface {
public:
kernel_runner(engine& engine_ref, uint32_t program_id, bool weights_and_bias_exist = false, bool zero_points_exist = false);

std::vector<std::chrono::nanoseconds> run_kernels(const kernel_selector::KernelsData& kernelsData) override;

private:
const int compilation_batch_size = 50;
const int runs_per_kernel = 15;

void prepare_kernel_args(const kernel_selector::KernelsData& kernels_data,
kernel_arguments_data& args);

engine& _engine;
uint32_t program_id;
bool weights_and_bias_exist;
bool zero_points_exist;
std::vector<memory::cptr> input_buffers;
std::vector<memory::cptr> fused_ops_buffers;
std::vector<memory::ptr> output_buffers;
std::vector<memory::cptr> weight_buffers;
std::vector<memory::cptr> bias_buffers;
std::vector<memory::cptr> weight_zero_point_buffers;
std::vector<memory::cptr> activation_zero_point_buffers;
std::vector<memory::cptr> compensation_buffers;
};

//////////////////////////////////////////////////////////////////////////////////////////////////////////
} // namespace gpu
} // namespace cldnn
@@ -837,23 +837,6 @@ cldnn::format::type from_weights_layout(kernel_selector::weights_layout l) {
}
}

kernel_selector::tuning_mode to_tuning_mode(ov::intel_gpu::TuningMode mode) {
switch (mode) {
case ov::intel_gpu::TuningMode::tuning_disabled:
return kernel_selector::tuning_mode::TUNING_DISABLED;
case ov::intel_gpu::TuningMode::tuning_use_cache:
return kernel_selector::tuning_mode::TUNING_USE_CACHE;
case ov::intel_gpu::TuningMode::tuning_tune_and_cache:
return kernel_selector::tuning_mode::TUNING_TUNE_AND_CACHE;
case ov::intel_gpu::TuningMode::tuning_use_and_update:
return kernel_selector::tuning_mode::TUNING_USE_AND_UPDATE;
case ov::intel_gpu::TuningMode::tuning_retune_and_cache:
return kernel_selector::tuning_mode::TUNING_RETUNE_AND_CACHE;
default:
return kernel_selector::tuning_mode::TUNING_DISABLED;
}
}

kernel_selector::data_tensor convert_data_tensor(const layout& l, const tensor view_offset) {
const auto& pad = l.data_padding;
const auto& vals_original = l.get_partial_shape();
@@ -1103,7 +1086,6 @@ void set_params(const kernel_impl_params& param_info, kernel_selector::params& p
params.engineInfo.computeUnitsCount = device_info.execution_units_count;
params.engineInfo.maxThreadsPerExecutionUnit = device_info.num_threads_per_eu > 0 ? device_info.num_threads_per_eu : 7;
params.engineInfo.maxThreadsPerDevice = params.engineInfo.maxThreadsPerExecutionUnit * device_info.execution_units_count;
params.engineInfo.deviceCache = program->get_tuning_cache();
params.engineInfo.driverVersion = device_info.driver_version;
params.engineInfo.supportedSimdSizes = device_info.supported_simd_sizes;
params.engineInfo.vendor_id = device_info.vendor_id;
@@ -1121,10 +1103,6 @@ void set_optional_params(const program& program, kernel_selector::optional_param
program.get_config().get_property(ov::intel_gpu::allow_static_input_reorder);
params.allowInputReordering = false;
params.allowOutputReordering = false;

const auto& tuning_config = program.get_config().get_property(ov::intel_gpu::tuning_config);
params.tuningParams.mode = to_tuning_mode(tuning_config.mode);
params.tuningParams.cacheFilePath = tuning_config.cache_file_path;
}

void kernel_impl_params::save(BinaryOutputBuffer& ob) const {

@@ -11,7 +11,6 @@
#include <ie_system_conf.h>

#include "kernel_selector_helper.h"
#include "device_cache_reader.h"
#include "auto_tuner.h"
#include "layout_optimizer.h"
#include "pass_manager.h"
@@ -108,7 +107,6 @@ program::program(engine& engine_ref,
_stream(_engine.create_stream(config)),
_config(config),
processing_order(),
tuning_cache(nullptr),
is_body_program(is_body_program),
is_subgroup_local_block_io_supported(-1) {
init_primitives();
@@ -141,7 +139,6 @@ program::program(engine& engine_ref,
_config(config),
_task_executor(task_executor),
processing_order(),
tuning_cache(nullptr),
is_subgroup_local_block_io_supported(-1) {
init_primitives();
set_options();
@@ -161,7 +158,6 @@ program::program(engine& engine)
_stream(_engine.create_stream({})),
_config(),
processing_order(),
tuning_cache(nullptr),
is_subgroup_local_block_io_supported(-1) { }
program::~program() {
query_local_block_io_supported();
@@ -231,16 +227,6 @@ void program::init_kernels() {
}
}

void program::load_tuning_cache() {
OV_ITT_SCOPED_TASK(ov::intel_gpu::itt::domains::intel_gpu_plugin, "ProgramImpl::LoadTuningCache");
GPU_DEBUG_DEFINE_MEM_LOGGER("ProgramImpl::LoadTuningCache");
try {
tuning_cache = kernel_selector::CreateTuningCacheFromFile("cache.json");
} catch (...) {
tuning_cache = std::make_shared<kernel_selector::TuningCache>();
}
}

kernel_id program::add_kernel(const std::shared_ptr<kernel_string>& kernelSring) {
return _kernels_cache->set_kernel_source(kernelSring, false);
}
@@ -597,9 +583,6 @@ void program::run_graph_compilation() { apply_opt_pass<compile_graph>(); }
void program::pre_optimize_graph(bool is_internal) {
OV_ITT_SCOPED_TASK(ov::intel_gpu::itt::domains::intel_gpu_plugin, "ProgramImpl::PreOptimizeGraph");

if (!is_internal)
load_tuning_cache();

// trim to outputs
apply_opt_pass<trim_to_outputs>(); // ToDo remove hidden dependencies from trimm pass

@@ -15,6 +15,20 @@
#include <utility>
#include <tuple>

#ifdef _WIN32
#define WIN32_LEAN_AND_MEAN
#define NOMINMAX
#include <windows.h>
#include <SetupAPI.h>
#include <devguid.h>
#include <cstring>
#else
#include <unistd.h>
#include <limits.h>
#include <link.h>
#include <dlfcn.h>
#endif

namespace kernel_selector {

TuningCache::TuningCache(const std::string& cacheFilePath, bool createMode)
@@ -273,52 +287,10 @@ void TuningCache::Save(const std::string& cacheFilePath) {
needsSave = false;
}

std::tuple<std::string, int> AutoTuner::LoadKernelOnline(const TuningMode tuningMode,
const std::string& cacheFilePath,
const Params& params) {
std::lock_guard<std::mutex> lock(mutex);
if (!onlineCache || lastCachePath != cacheFilePath) {
onlineCache = std::make_shared<TuningCache>(cacheFilePath, PerformTuning(tuningMode));
lastCachePath = cacheFilePath;
}
auto result = onlineCache->LoadKernel(params, PerformUpdates(tuningMode));

if (onlineCache->NeedsSave() && PerformUpdates(tuningMode)) {
onlineCache->Save(cacheFilePath);
}
return result;
}

void AutoTuner::StoreKernel(const std::string& cacheFilePath,
const Params& params,
std::string implementationName,
const int tuneIndex) {
std::lock_guard<std::mutex> lock(mutex);
if (!onlineCache || lastCachePath != cacheFilePath) {
onlineCache = std::make_shared<TuningCache>(cacheFilePath, true);
lastCachePath = cacheFilePath;
}
onlineCache->StoreKernel(params, implementationName, tuneIndex);
onlineCache->Save(cacheFilePath);
}

void AutoTuner::RemoveKernel(const std::string& cacheFilePath,
const Params& params) {
std::lock_guard<std::mutex> lock(mutex);
if (!onlineCache || lastCachePath != cacheFilePath) {
onlineCache = std::make_shared<TuningCache>(cacheFilePath, false);
lastCachePath = cacheFilePath;
}
onlineCache->RemoveKernel(params);
if (onlineCache->NeedsSave()) {
onlineCache->Save(cacheFilePath);
}
}

std::tuple<std::string, int> AutoTuner::LoadKernelOffline(TuningCache* deviceCache,
const Params& params) {
std::tuple<std::string, int> AutoTuner::LoadKernelOffline(const Params& params) {
std::lock_guard<std::mutex> lock(mutex);
static const uint32_t defaultComputeUnits = 24;
TuningCache* deviceCache = TuningCache::get();
if (!deviceCache)
return {};
auto result = deviceCache->LoadKernel(params, false);
@@ -328,4 +300,37 @@ std::tuple<std::string, int> AutoTuner::LoadKernelOffline(TuningCache* deviceCac
return result;
}

TuningCache* TuningCache::get() {
static std::mutex m;
static std::shared_ptr<TuningCache> cache_instance = nullptr;
std::lock_guard<std::mutex> lock(m);
std::string path = "cache.json";
#ifdef _WIN32
char module_path[MAX_PATH];
HMODULE hm = NULL;
GetModuleHandleEx(GET_MODULE_HANDLE_EX_FLAG_FROM_ADDRESS | GET_MODULE_HANDLE_EX_FLAG_UNCHANGED_REFCOUNT,
(LPCSTR)&TuningCache::get,
&hm);
GetModuleFileName(hm, module_path, sizeof(module_path));
std::string bin_path(module_path);
path = bin_path.substr(0, bin_path.find_last_of("\\")) + "\\cache.json";
#else
const char* device_info_failed_msg = "Device lookup failed";
Dl_info dl_info;
dladdr((void*)(device_info_failed_msg), &dl_info);  // NOLINT
std::string bin_path(dl_info.dli_fname);
path = bin_path.substr(0, bin_path.find_last_of("/")) + "/cache.json";
#endif

if (!cache_instance) {
try {
cache_instance = std::make_shared<kernel_selector::TuningCache>(path, false);
} catch (...) {
cache_instance = std::make_shared<kernel_selector::TuningCache>();
}
}

return cache_instance.get();
}

} // namespace kernel_selector
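For readers outside the codebase, here is a self-contained toy of the lazy, mutex-guarded singleton-with-fallback pattern that TuningCache::get() above uses (Cache is a hypothetical stand-in type; the snippet compiles and runs on its own):

#include <iostream>
#include <memory>
#include <mutex>

struct Cache {
    explicit Cache(bool loaded) : loaded(loaded) {}
    bool loaded;
};

Cache* get_cache() {
    static std::mutex m;
    static std::shared_ptr<Cache> instance;      // created on first use only
    std::lock_guard<std::mutex> lock(m);
    if (!instance) {
        try {
            instance = std::make_shared<Cache>(true);   // e.g. parse cache.json
        } catch (...) {
            instance = std::make_shared<Cache>(false);  // fall back to an empty cache
        }
    }
    return instance.get();                       // raw pointer; ownership stays inside
}

int main() {
    std::cout << (get_cache()->loaded ? "loaded" : "empty") << "\n";
    return 0;
}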
@@ -47,6 +47,8 @@ public:

bool NeedsSave() const { return needsSave; }

static TuningCache* get();

private:
Entry LoadKernel_v1(const Params& params, uint32_t computeUnitsCount);
Entry LoadKernel_v2(const Params& params, uint32_t computeUnitsCount);
@@ -65,21 +67,9 @@ private:
class AutoTuner {
public:
AutoTuner() = default;
std::tuple<std::string, int> LoadKernelOnline(const TuningMode tuningMode,
const std::string& cacheFilePath,
const Params& params);
void StoreKernel(const std::string& cacheFilePath,
const Params& params,
std::string implementationName,
const int tuneIndex);
void RemoveKernel(const std::string& cacheFilePath,
const Params& params);
std::tuple<std::string, int> LoadKernelOffline(TuningCache* cache,
const Params& params);
std::tuple<std::string, int> LoadKernelOffline(const Params& params);

private:
std::string lastCachePath;
std::shared_ptr<TuningCache> onlineCache;
std::mutex mutex; // Mutex to synchronize cache updates

/*

@@ -443,36 +443,6 @@ struct DimTensor {
DimTensor(T b, T f, T w, T z, T y, T x) : b(b), f(f), w(w), z(z), y(y), x(x) {}
};

////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// AutoTunerMode
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
enum class TuningMode {
TUNING_DISABLED, // Tuning is disabled.
TUNING_USE_CACHE, // Tuning using the cached data (no on-line tuning for non-existing data).
TUNING_TUNE_AND_CACHE, // Tuning using the cached data if exist, tune and update cache otherwise.
TUNING_USE_AND_UPDATE, // Tuning using the cached data and other updating tasks.
// Performs updating tasks like removal of invalid caches, promoting to new formats, etc.
// No tuning for non-existing data.
TUNING_RETUNE_AND_CACHE // Perform tuning even if the cached data exists.
};

inline bool UseCached(const TuningMode& mode) {
return mode == TuningMode::TUNING_USE_CACHE
|| mode == TuningMode::TUNING_TUNE_AND_CACHE
|| mode == TuningMode::TUNING_USE_AND_UPDATE;
}

inline bool PerformTuning(const TuningMode& mode) {
return mode == TuningMode::TUNING_TUNE_AND_CACHE
|| mode == TuningMode::TUNING_RETUNE_AND_CACHE;
}

inline bool PerformUpdates(const TuningMode& mode) {
return mode == TuningMode::TUNING_TUNE_AND_CACHE
|| mode == TuningMode::TUNING_USE_AND_UPDATE
|| mode == TuningMode::TUNING_RETUNE_AND_CACHE;
}

////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Aliases:
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

@@ -1,52 +0,0 @@
// Copyright (C) 2018-2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#include "device_cache_reader.h"
#include "auto_tuner.h"
#include <limits>
#include "istreamwrapper.h"

#ifdef _WIN32
#define WIN32_LEAN_AND_MEAN
#define NOMINMAX
#include <windows.h>
#include <SetupAPI.h>
#include <devguid.h>
#include <cstring>
#else
#include <unistd.h>
#include <limits.h>
#include <link.h>
#include <dlfcn.h>
#endif

#include <fstream>
#include <iostream>
#include <utility>

namespace kernel_selector {

std::shared_ptr<kernel_selector::TuningCache> CreateTuningCacheFromFile(std::string tuning_cache_path) {
if (tuning_cache_path.compare("cache.json") == 0) {
#ifdef _WIN32
char path[MAX_PATH];
HMODULE hm = NULL;
GetModuleHandleEx(GET_MODULE_HANDLE_EX_FLAG_FROM_ADDRESS | GET_MODULE_HANDLE_EX_FLAG_UNCHANGED_REFCOUNT,
(LPCSTR)&CreateTuningCacheFromFile,
&hm);
GetModuleFileName(hm, path, sizeof(path));
std::string bin_path(path);
tuning_cache_path = bin_path.substr(0, bin_path.find_last_of("\\")) + "\\cache.json";
#else
const char* device_info_failed_msg = "Device lookup failed";
Dl_info dl_info;
dladdr((void*)(device_info_failed_msg), &dl_info);  // NOLINT
std::string bin_path(dl_info.dli_fname);
tuning_cache_path = bin_path.substr(0, bin_path.find_last_of("/")) + "/cache.json";
#endif
}

return std::make_shared<kernel_selector::TuningCache>(tuning_cache_path, false);
}
} // namespace kernel_selector

@@ -1,14 +0,0 @@
// Copyright (C) 2018-2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma once
#include <memory>
#include <string>

namespace kernel_selector {
class TuningCache;

std::shared_ptr<kernel_selector::TuningCache> CreateTuningCacheFromFile(std::string tuning_cache_path);

} // namespace kernel_selector
@@ -71,15 +71,12 @@ KernelData kernel_selector_base::get_best_kernel(const Params& params, const opt
return kernels[0];
}

KernelsData kernel_selector_base::GetNaiveBestKernel(const Params& params,
const optional_params& options,
KernelType kType) const {
KernelsData kernel_selector_base::GetNaiveBestKernel(const KernelList& all_impls, const Params& params, const optional_params& options) const {
KernelsData kernelsData;
std::string kernelName;

auto allImplementations = GetAllImplementations(params, options, kType);

for (const auto& implementation : allImplementations) {
for (const auto& implementation : all_impls) {
// TODO: Unify this check with the Validate virtual method. Make
// sure that the method is called here only, not in all the
// GetKernelsData implementations.
@@ -87,28 +84,14 @@ KernelsData kernel_selector_base::GetNaiveBestKernel(const Params& params,
KernelsData kds = implementation->GetKernelsData(params, options);

if (kds.size() && kds[0].kernels.size()) {
#ifdef ENABLE_ENV
const auto& it = forceKernels.find(implementation->GetName());
if (it != forceKernels.end()) {
if (it->second == true) {
ENV_PRINTF("Force: %s\n", it->first.c_str());
return kds;
} else {
ENV_PRINTF("Deny: %s\n", it->first.c_str());
}
} else {
#endif
kernelsData = kds;
kernelName = implementation->GetName();
break;
#ifdef ENABLE_ENV
}
#endif
}
} catch (std::runtime_error& ex) {
// we have to handle it in order to avoid exception in KernelSelector as much we can
kernelName = (implementation != nullptr)? implementation->GetName() : "[impl is null]";
GPU_DEBUG_TRACE << "layerID: " << params.layerID << " kenrel: " << kernelName << " - " << ex.what() << std::endl;
GPU_DEBUG_TRACE << "layerID: " << params.layerID << " kernel: " << kernelName << " - " << ex.what() << std::endl;
}
}

@@ -120,10 +103,11 @@ KernelsData kernel_selector_base::GetNaiveBestKernel(const Params& params,

return kernelsData;
}
KernelsData kernel_selector_base::GetNaiveBestKernel(const Params& params, const optional_params& options, KernelType kType) const {
return GetNaiveBestKernel(GetAllImplementations(params, options, kType), params, options);
}

KernelsData kernel_selector_base::GetAutoTuneBestKernel(const Params& params,
const optional_params& options,
KernelType kType) const {
KernelsData kernel_selector_base::GetAutoTuneBestKernel(const Params& params, const optional_params& options, KernelType kType) const {
KernelsData kernelsData;
std::string kernelName;

@@ -131,16 +115,8 @@ KernelsData kernel_selector_base::GetAutoTuneBestKernel(const Params& params,
auto kernel_params = static_cast<const base_params&>(params);
bool int8_kernel = kernel_params.inputs[0].GetDType() == Datatype::INT8 || kernel_params.inputs[0].GetDType() == Datatype::UINT8;
std::tuple<std::string, int> cachedKernelConfig;
if (options.tuningParams.mode == TuningMode::TUNING_DISABLED && !int8_kernel) { // Try to load kernel/config from offline cache
#if ENABLE_OFFLINE_TUNING_CACHE
cachedKernelConfig = autoTuner.LoadKernelOffline(params.engineInfo.deviceCache.get(), params);
#else
return GetNaiveBestKernel(params, options, kType);
#endif
} else if (UseCached(options.tuningParams.mode)) { // Try to load kernel/config from on-line cache
cachedKernelConfig = autoTuner.LoadKernelOnline(options.tuningParams.mode,
options.tuningParams.cacheFilePath,
params);
if (!int8_kernel) { // Try to load kernel/config from offline cache
cachedKernelConfig = autoTuner.LoadKernelOffline(params);
}
bool hashFoundInCache = !std::get<0>(cachedKernelConfig).empty();

@@ -166,83 +142,7 @@ KernelsData kernel_selector_base::GetAutoTuneBestKernel(const Params& params,
}
}

// Cache is not valid, remove it if performing update tasks.
if (hashFoundInCache && PerformUpdates(options.tuningParams.mode)) {
autoTuner.RemoveKernel(options.tuningParams.cacheFilePath, params);
}

if (hashFoundInCache || // Cache is not valid - hash exists in cache but kernelsData was empty or kernel
// doesn't support the required key.
!PerformTuning(options.tuningParams.mode) || // On-line tuning is not allowed.
!options.tuningParams.runner) { // Runner is invalid - can't run on-line tuning
// Fall back to the default path.
return GetNaiveBestKernel(params, options, kType);
}

// Start on-line tuning
assert(options.tuningParams.runner);

for (const auto& implementation : allImplementations) {
const ParamsKey implKey = implementation->GetSupportedKey();
if (implKey.TuningSupport()) {
try {
KernelsData kds = implementation->GetKernelsDataForAutoTune(params, options);
auto runTimes = options.tuningParams.runner->run_kernels(kds);

for (size_t i = 0; i < kds.size(); i++) {
kds[i].runTime = runTimes[i].count();
if (kernelsData.size() == 0 || kds[i].runTime < kernelsData[0].runTime) {
kernelsData = {kds[i]};
kernelName = implementation->GetName();
}
}
} catch (std::runtime_error& ex) {
// we have to handle it in order to avoid exception in KernelSelector as much we can
kernelName = (implementation != nullptr)? implementation->GetName() : "[impl is null]";
GPU_DEBUG_TRACE << "layerID: " << params.layerID << " kenrel: " << kernelName << " - " << ex.what() << std::endl;
}
}
}

// try to fallback to reference kernels if no optimized were found during tuning
if (!kernelsData.size()) {
for (const auto& implementation : allImplementations) {
const ParamsKey implKey = implementation->GetSupportedKey();
// this time, check only implementations that have disabled tuning
if (!implKey.TuningSupport()) {
try {
KernelsData kds = implementation->GetKernelsDataForAutoTune(params, options);
auto runTimes = options.tuningParams.runner->run_kernels(kds);

for (size_t i = 0; i < kds.size(); i++) {
kds[i].runTime = runTimes[i].count();
if (kernelsData.size() == 0 || kds[i].runTime < kernelsData[0].runTime) {
kernelsData = {kds[i]};
kernelName = implementation->GetName();
}
}
} catch (std::runtime_error& ex) {
// we have to handle it in order to avoid exception in KernelSelector as much we can
kernelName = (implementation != nullptr)? implementation->GetName() : "[impl is null]";
GPU_DEBUG_TRACE << "layerID: " << params.layerID << " kenrel: " << kernelName << " - " << ex.what() << std::endl;
}
}
}
}

if (kernelsData.size()) {
kernelsData[0].kernelName = kernelName;
kernelsData[0].kernels[0].params.layerID = params.layerID;
autoTuner.StoreKernel(options.tuningParams.cacheFilePath,
params,
kernelName,
kernelsData[0].autoTuneIndex);
} else {
// Tuning failed, fall back to naive path
return GetNaiveBestKernel(params, options, kType);
}

return kernelsData;
return GetNaiveBestKernel(allImplementations, params, options);
}

KernelList kernel_selector_base::GetAllImplementations(const Params& params, const optional_params& options, KernelType kType) const {
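With on-line tuning removed, GetAutoTuneBestKernel above reduces to: consult the offline cache, prefer the cached implementation if it is usable, otherwise fall through to the naive first-fit path. A self-contained toy model of that control flow (Impl and load_offline() are hypothetical stand-ins for the KernelList and AutoTuner::LoadKernelOffline(); this sketches the shape of the logic, not the real selector):

#include <iostream>
#include <optional>
#include <string>
#include <vector>

struct Impl { std::string name; bool valid; };

// Stand-in for AutoTuner::LoadKernelOffline(params): returns a cached
// implementation name on a hit, nothing on a miss.
std::optional<std::string> load_offline(const std::string& params_hash) {
    if (params_hash == "known")
        return std::string("convolution_gpu_ref");
    return std::nullopt;
}

std::string get_best(const std::vector<Impl>& impls, const std::string& params_hash) {
    if (auto cached = load_offline(params_hash)) {   // offline cache hit wins
        for (const auto& i : impls)
            if (i.valid && i.name == *cached)
                return i.name;
    }
    for (const auto& i : impls)                      // naive fallback: first valid impl
        if (i.valid)
            return i.name;
    return "<none>";
}

int main() {
    std::vector<Impl> impls{{"convolution_gpu_imad", true}, {"convolution_gpu_ref", true}};
    std::cout << get_best(impls, "known") << "\n";    // -> convolution_gpu_ref
    std::cout << get_best(impls, "unknown") << "\n";  // -> convolution_gpu_imad
    return 0;
}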
@@ -5,7 +5,6 @@
#pragma once

#include "kernel_selector_common.h"
#include "kernel_runner_interface.h"
#include "auto_tuner.h"
#include <vector>
#include <memory>
@@ -32,11 +31,15 @@ protected:
}
virtual KernelsData GetBestKernels(const Params& params, const optional_params& options) const = 0;

virtual KernelsData GetNaiveBestKernel(const Params& params,
KernelsData GetNaiveBestKernel(const KernelList& all_impls,
const Params& params,
const optional_params& options) const;

KernelsData GetNaiveBestKernel(const Params& params,
const optional_params& options,
KernelType kType) const;

virtual KernelsData GetAutoTuneBestKernel(const Params& params,
KernelsData GetAutoTuneBestKernel(const Params& params,
const optional_params& options,
KernelType kType) const;

@@ -360,25 +360,6 @@ void ParamsKey::EnableArgMaxMinAxis(ArgMaxMinAxis a) {
}
}

void ParamsKey::EnableIndexSelectAxis(IndexSelectAxis a) {
switch (a) {
case IndexSelectAxis::X:
key.restrict.val.dedicated.idxsel.axisX = 1;
break;
case IndexSelectAxis::Y:
key.restrict.val.dedicated.idxsel.axisY = 1;
break;
case IndexSelectAxis::FEATURE:
key.restrict.val.dedicated.idxsel.axisFeature = 1;
break;
case IndexSelectAxis::BATCH:
key.restrict.val.dedicated.idxsel.axisBatch = 1;
break;
default:
break;
}
}

void ParamsKey::EnableQuantization(QuantizationType q) {
switch (q) {
case QuantizationType::NONE:

@@ -25,7 +25,6 @@ using DataBitField = std::bitset<DataLayout::DataLayoutCount>;
using WightsBitField = std::bitset<WeightsLayout::WeightsLayoutCount>;

class JitConstants;
class TuningCache;

////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// fuse_params
@@ -102,7 +101,6 @@ class ParamsKey {
public:
ParamsKey() {
key.restrict.raw = 0;
key.enableTuning = 1;
key.inputType.raw = 0;
key.outputType.raw = 0;
key.inputWeightsType.raw = 0;
@@ -266,7 +264,6 @@ public:
uint32_t raw;
} DataTypesKey;

uint32_t enableTuning;
DataTypesKey inputType;
DataTypesKey outputType;
DataTypesKey inputWeightsType;
@@ -345,17 +342,9 @@ public:
void EnableLSTMDyanmicOptionalHiddenOutput() { key.restrict.val.dedicated.lstm_dynamic.last_hidden = 1; }
void EnableLSTMDyanmicOptionalCellOutput() { key.restrict.val.dedicated.lstm_dynamic.last_cell = 1; }
void EnableConcatKernelPerInput() { key.restrict.val.dedicated.concat.kernelPerInput = 1; }
void DisableTuning() { key.enableTuning = 0; }
void EnableConcatOneKernel() { key.restrict.val.dedicated.concat.oneKernel = 1; }
void EnableArgMaxMinAxis(ArgMaxMinAxis a);
void EnableIndexSelectAxis(IndexSelectAxis a);
void EnableFusedConvEltwiseRWOutOpt();
bool Support(const ParamsKey& k) const;
bool TuningSupport() const {
if (key.enableTuning == 1)
return true;
return false;
}
bool isEnabledDifferentInputWeightsTypes() const {
return key.restrict.val.different_input_weights_types ? true : false;
}
@@ -405,7 +394,6 @@ struct EngineInfo {
std::string deviceId = "";
std::string driverVersion = "";
std::vector<size_t> supportedSimdSizes = {};
std::shared_ptr<TuningCache> deviceCache;

DeviceFeaturesKey get_supported_device_features_key() const;
};
@@ -663,18 +651,6 @@ protected:
explicit base_params(KernelType kt) : Params(kt, ""), inputs(1), outputs(1) {}
};

////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Auto tuner parameters
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
class KernelRunnerInterface;
struct TuningParams {
TuningMode mode;
std::string cacheFilePath;
std::shared_ptr<KernelRunnerInterface> runner;

TuningParams() : mode(TuningMode::TUNING_DISABLED), cacheFilePath(""), runner(nullptr) {}
};

////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// optional_params
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -694,8 +670,6 @@ struct optional_params {
bool allowOutputReordering =
false; // allow kernel to ask graph compiler to reorder the output data before executing the next kernel

TuningParams tuningParams;

virtual ParamsKey GetSupportedKey() const;

protected:

@@ -18,6 +18,6 @@ binary_convolution_kernel_selector::binary_convolution_kernel_selector() {

KernelsData binary_convolution_kernel_selector::GetBestKernels(const Params& params,
const optional_params& options) const {
return GetAutoTuneBestKernel(params, options, KernelType::BINARY_CONVOLUTION);
return GetNaiveBestKernel(params, options, KernelType::BINARY_CONVOLUTION);
}
} // namespace kernel_selector
@@ -64,7 +64,6 @@ ParamsKey Convolution_kernel_b_fs_yx_fsv16_imad_1x1::GetSupportedKey() const {
k.EnableQuantization(QuantizationType::ASYMMETRIC_DATA);
k.EnableQuantization(QuantizationType::ASYMMETRIC_WEIGHTS);
k.EnableQuantization(QuantizationType::ASYMMETRIC_DATA_AND_WEIGHTS);
k.DisableTuning();
return k;
}

@@ -375,7 +375,6 @@ ParamsKey Convolution_kernel_b_fs_zyx_fsv16_imad::GetSupportedKey() const {
k.EnableQuantization(QuantizationType::ASYMMETRIC_WEIGHTS);
k.EnableQuantization(QuantizationType::ASYMMETRIC_DATA_AND_WEIGHTS);
k.EnableDilation();
k.DisableTuning();
return k;
}

@@ -87,7 +87,6 @@ ParamsKey ConvolutionKernel_imad::GetSupportedKey() const {
k.EnableQuantization(QuantizationType::ASYMMETRIC_DATA);
k.EnableQuantization(QuantizationType::ASYMMETRIC_WEIGHTS);
k.EnableQuantization(QuantizationType::ASYMMETRIC_DATA_AND_WEIGHTS);
k.DisableTuning();
return k;
}

@@ -38,7 +38,6 @@ ParamsKey Convolution_kernel_imad_bs_fs_yx_bsv16_fsv16_1x1::GetSupportedKey() co
k.EnableNonBiasTerm();
k.EnableBatching();
k.EnableQuantization(QuantizationType::SYMMETRIC);
k.DisableTuning();
return k;
}

@@ -38,7 +38,6 @@ ParamsKey Convolution_kernel_imad_bs_fs_yx_bsv16_fsv16_3x3::GetSupportedKey() co
k.EnableNonBiasTerm();
k.EnableBatching();
k.EnableQuantization(QuantizationType::SYMMETRIC);
k.DisableTuning();
return k;
}

@@ -40,7 +40,6 @@ ParamsKey ConvolutionKernel_mmad_b_fs_yx_fsv32::GetSupportedKey() const {
k.EnableQuantization(QuantizationType::ASYMMETRIC_DATA_AND_WEIGHTS);
k.EnableDifferentTypes();
k.EnableDifferentInputWeightsTypes();
k.DisableTuning();
return k;
}

@@ -37,7 +37,6 @@ ParamsKey ConvolutionKernel_mmad_b_fs_yx_fsv32_dw::GetSupportedKey() const {
k.EnableQuantization(QuantizationType::ASYMMETRIC_DATA);
k.EnableQuantization(QuantizationType::ASYMMETRIC_WEIGHTS);
k.EnableDifferentTypes();
k.DisableTuning();
k.EnableGroupedConvolution();
k.EnableDifferentInputWeightsTypes();
return k;

@@ -43,7 +43,6 @@ ParamsKey ConvolutionKernel_mmad_bfyx_to_b_fs_yx_fsv32::GetSupportedKey() const
k.EnableQuantization(QuantizationType::ASYMMETRIC_DATA_AND_WEIGHTS);
k.EnableDifferentTypes();
k.EnableDifferentInputWeightsTypes();
k.DisableTuning();
return k;
}

@@ -36,7 +36,6 @@ ParamsKey ConvolutionKernel_mmad_bfyx_to_b_fs_yx_fsv4::GetSupportedKey() const {
k.EnableQuantization(QuantizationType::ASYMMETRIC_DATA);
k.EnableDifferentTypes();
k.EnableDifferentInputWeightsTypes();
k.DisableTuning();
return k;
}

@@ -38,7 +38,6 @@ ParamsKey ConvolutionKernel_Ref::GetSupportedKey() const {
k.EnableBiasPerOutput();
k.EnableNonBiasTerm();
k.EnableBatching();
k.DisableTuning();
k.EnableGroupedConvolution();

k.EnableQuantization(QuantizationType::SYMMETRIC);

@@ -22,7 +22,6 @@ ParamsKey ConvolutionKernel_yxfb_Ref::GetSupportedKey() const {
k.EnableNonBiasTerm();
k.EnableBatching();
k.EnableDilation();
k.DisableTuning();
k.EnableGroupedConvolution();
return k;
}

@@ -23,7 +23,6 @@ ParamsKey DeformableConvolutionKernel_bfyx_conv::GetSupportedKey() const {
k.EnableBiasPerFeature();
k.EnableNonBiasTerm();
k.EnableBatching();
k.DisableTuning();
k.EnableGroupedConvolution();
k.EnableDeformableMode();
k.EnableDeformableMask();

@@ -23,7 +23,6 @@ ParamsKey DeformableConvolutionKernel_bfyx_interp::GetSupportedKey() const {
k.EnableBiasPerFeature();
k.EnableNonBiasTerm();
k.EnableBatching();
k.DisableTuning();
k.EnableGroupedConvolution();
k.EnableDeformableMode();
k.EnableDeformableMask();

@@ -27,7 +27,6 @@ ParamsKey DeformableConvolutionKernel_bfyx_Ref::GetSupportedKey() const {
k.EnableBiasPerFeature();
k.EnableNonBiasTerm();
k.EnableBatching();
k.DisableTuning();
k.EnableGroupedConvolution();
k.EnableDeformableMode();
k.EnableDeformableMask();

@@ -65,7 +65,6 @@ void ExecutionConfig::set_default() {
std::make_tuple(ov::intel_gpu::enable_memory_pool, true),
std::make_tuple(ov::intel_gpu::allow_static_input_reorder, false),
std::make_tuple(ov::intel_gpu::custom_outputs, std::vector<std::string>{}),
std::make_tuple(ov::intel_gpu::tuning_config, ov::intel_gpu::TuningConfig{}),
std::make_tuple(ov::intel_gpu::dump_graphs, ""),
std::make_tuple(ov::intel_gpu::force_implementations, ImplForcingMap{}),
std::make_tuple(ov::intel_gpu::partial_build_program, false),

@ -1,353 +0,0 @@
|
||||
// Copyright (C) 2018-2023 Intel Corporation
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#include "test_utils.h"
|
||||
|
||||
#include <intel_gpu/primitives/input_layout.hpp>
|
||||
#include <intel_gpu/primitives/convolution.hpp>
|
||||
#include <intel_gpu/primitives/data.hpp>
|
||||
|
||||
#include <iostream>
|
||||
#include <fstream>
|
||||
#include <string>
|
||||
|
||||
namespace {
|
||||
|
||||
enum class cache_version {
|
||||
version_1,
|
||||
version_1_2, // version 1 cache, but version 2 file
|
||||
version_2,
|
||||
version_2_invalid,
|
||||
version_2_from_1,
|
||||
version_2_empty
|
||||
};
|
||||
|
||||
std::string reference_impl_name = "convolution_gpu_ref";
|
||||
std::string eus_marker = "__EUs__";
|
||||
|
||||
std::string cache_v1 =
|
||||
R"__a({
|
||||
"__EUs__": {
|
||||
"18283230515392601293": ["convolution_gpu_ref", 0]
|
||||
}
|
||||
})__a";
|
||||
|
||||
std::string cache_v1_2 =
|
||||
R"__a({
|
||||
"version_2": {
|
||||
},
|
||||
"version_1": {
|
||||
"__EUs__": {
|
||||
"18283230515392601293": ["convolution_gpu_ref", 0]
|
||||
}
|
||||
}
|
||||
})__a";
|
||||
|
||||
std::string cache_v2 =
|
||||
R"__a({
|
||||
"version_2": {
|
||||
"__EUs__": {
|
||||
"CONVOLUTION": {
|
||||
"F32_BFYX_v3_p0_0_v3_p0_0_v16_p0_0_v1_p0_0;F32_BFYX_v3_p0_0_v3_p0_0_v16_p0_0_v1_p0_0;1_1_1;1_1_1;1_1_1;0_0_0;1;1": ["convolution_gpu_ref", 0]
|
||||
}
|
||||
}
|
||||
}
|
||||
})__a";
|
||||
|
||||
std::string cache_v2_from_v1 =
|
||||
R"__a({
|
||||
"version_2": {
|
||||
"__EUs__": {
|
||||
"CONVOLUTION": {
|
||||
"F32_BFYX_v3_p0_0_v3_p0_0_v16_p0_0_v1_p0_0;F32_BFYX_v3_p0_0_v3_p0_0_v16_p0_0_v1_p0_0;1_1_1;1_1_1;1_1_1;0_0_0;1;1": ["convolution_gpu_ref", 0]
|
||||
}
|
||||
}
|
||||
},
|
||||
"version_1": {
|
||||
"__EUs__": {}
|
||||
}
|
||||
})__a";
|
||||
|
||||
std::string cache_v2_invalid =
|
||||
R"__a({
|
||||
"version_2": {
|
||||
"__EUs__": {
|
||||
"CONVOLUTION": {
|
||||
"F32_BFYX_v3_p0_0_v3_p0_0_v16_p0_0_v1_p0_0;F32_BFYX_v3_p0_0_v3_p0_0_v16_p0_0_v1_p0_0;1_1_1;1_1_1;1_1_1;0_0_0;1;1": ["non_existent", 0]
|
||||
}
|
||||
}
|
||||
}
|
||||
})__a";
|
||||
|
||||
std::string cache_v2_empty =
|
||||
R"__a({
|
||||
"version_2": {
|
||||
"__EUs__": {
|
||||
"CONVOLUTION": {}
|
||||
}
|
||||
}
|
||||
})__a";
|
||||
|
||||
std::string get_cache_version(cache_version version) {
|
||||
std::string cache;
|
||||
switch (version) {
|
||||
case cache_version::version_1:
|
||||
cache = cache_v1;
|
||||
break;
|
||||
case cache_version::version_1_2:
|
||||
cache = cache_v1_2;
|
||||
break;
|
||||
case cache_version::version_2:
|
||||
cache = cache_v2;
|
||||
break;
|
||||
case cache_version::version_2_invalid:
|
||||
cache = cache_v2_invalid;
|
||||
break;
|
||||
case cache_version::version_2_from_1:
|
||||
cache = cache_v2_from_v1;
|
||||
break;
|
||||
case cache_version::version_2_empty:
|
||||
cache = cache_v2_empty;
|
||||
break;
|
||||
default:
|
||||
throw std::invalid_argument("invalid cache version");
|
||||
}
|
||||
return cache;
|
||||
}

std::string get_temporary_cache_file() {
    static int i = 0;
    std::string tmp_cache_file = "tmp_cldnn_test_cache_" + std::to_string(i) + ".json";
    i += 1;
    return tmp_cache_file;
}

template <typename T>
void replace(std::string& text, const std::string& replaced, T replacement) {
    auto it = text.find(replaced);
    while (it != std::string::npos) {
        text.replace(it, replaced.length(), std::to_string(replacement));
        it = text.find(replaced);
    }
}
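
// File helpers; each throws so that an unreadable or unwritable cache file
// fails the test immediately rather than producing a spurious comparison.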
void write(const std::string& filename, const std::string& text) {
    std::ofstream file;
    file.open(filename);
    if (!file.is_open())
        throw std::runtime_error("Could not open file " + filename);
    file << text;
    file.close();
    if (!file) {
        throw std::runtime_error("Failure writing to file " + filename);
    }
}

std::string read(const std::string& filename) {
    std::stringstream ss;
    std::ifstream file;
    file.open(filename);
    if (!file.is_open())
        throw std::runtime_error("Could not open file " + filename);

    ss << file.rdbuf();
    file.close();
    if (!file) {
        throw std::runtime_error("Failure reading from file " + filename);
    }
    return ss.str();
}

void remove(const std::string& filename) {
    std::remove(filename.c_str());
}
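
// Writes a cache fixture for the given version, builds a single-convolution
// network with the configured tuning mode, then checks which implementation
// was chosen and what the cache file contains afterwards.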
class cache_test_helper {
public:
    cache_test_helper(cldnn::engine& engine, cache_version v)
        : _engine(engine)
        , _mode(ov::intel_gpu::TuningMode::tuning_disabled)
        , cache_filename(get_temporary_cache_file())
    {
        auto cache = get_cache_version(v);
        auto eus = engine.get_device_info().execution_units_count;
        replace(cache, eus_marker, eus);

        write(cache_filename, cache);
    }

    virtual ~cache_test_helper() {
        remove(cache_filename);
    }

    cache_test_helper& with_mode(ov::intel_gpu::TuningMode mode) {
        _mode = mode;
        return *this;
    }

    cache_test_helper& expect_cache(cache_version version) {
        compare_cache = version;
        return *this;
    }

    cache_test_helper& expect_implementation(std::string implementation) {
        compare_implementation = implementation;
        return *this;
    }

    cache_test_helper& expect_implementation_not(std::string implementation) {
        compare_implementation = implementation;
        compare_implementation.not_equal = true;
        return *this;
    }
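
    // Runs the network and applies whichever expectations were configured.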
    void test() {
        auto w_mem = _engine.allocate_memory(cldnn::layout(cldnn::data_types::f32, cldnn::format::bfyx, { 16, 16, 1, 1 }));
        auto topology = cldnn::topology(
            cldnn::input_layout("input", cldnn::layout(cldnn::data_types::f32, cldnn::format::bfyx, { 1, 16, 3, 3 })),
            cldnn::data("weights", w_mem),
            cldnn::convolution("conv", input_info("input"), { "weights" })
        );

        ov::intel_gpu::TuningConfig tune_conf;
        tune_conf.cache_file_path = cache_filename;
        tune_conf.mode = _mode;
        ExecutionConfig config{
            ov::intel_gpu::tuning_config(tune_conf),
            ov::intel_gpu::optimize_data(true)
        };
        cldnn::network network(_engine, topology, config);
        auto in_mem = _engine.allocate_memory(cldnn::layout(cldnn::data_types::f32, cldnn::format::bfyx, { 1, 16, 3, 3 }));
        network.set_input_data("input", in_mem);
        network.execute();

        if (compare_implementation.compare) {
            std::string exec_impl = network.get_implementation_info("conv");
            auto precision_pos = exec_impl.find("__");
            exec_impl = exec_impl.substr(0, precision_pos);

            if (compare_implementation.not_equal) {
                EXPECT_NE(exec_impl, compare_implementation.value);
            } else {
                ASSERT_EQ(exec_impl, compare_implementation.value);
            }
        }

        if (compare_cache.compare) {
            auto cache = read(cache_filename);
            auto expected_cache = get_cache_version(compare_cache.value);
            auto eus = _engine.get_device_info().execution_units_count;
            replace(expected_cache, eus_marker, eus);

            ASSERT_EQ(cache, expected_cache);
        }
    }

private:
    template <typename T>
    struct optional_compare {
        bool compare;
        bool not_equal;
        T value;

        optional_compare() : compare(false) {}
        optional_compare(T v) : compare(true), not_equal(false), value(v) {}
        optional_compare(T v, bool neq) : compare(true), not_equal(neq), value(v) {}
    };

    cldnn::engine& _engine;
    ov::intel_gpu::TuningMode _mode;
    std::string cache_filename;

    optional_compare<cache_version> compare_cache;
    optional_compare<std::string> compare_implementation;
};

} // namespace
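
// Parameterized over the cache-file formats above; to_string produces
// readable test names for each variant.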
class cache_version_test : public testing::TestWithParam<cache_version> {
public:
    static std::string to_string(const testing::TestParamInfo<cache_version>& param) {
        std::string result;
        switch (param.param) {
        case cache_version::version_1:
            result = "version_1";
            break;
        case cache_version::version_1_2:
            result = "version_1_2";
            break;
        case cache_version::version_2:
            result = "version_2";
            break;
        case cache_version::version_2_invalid:
            result = "version_2_invalid";
            break;
        case cache_version::version_2_from_1:
            result = "version_2_from_1";
            break;
        case cache_version::version_2_empty:
            result = "version_2_empty";
            break;
        default:
            result = std::to_string(static_cast<int>(param.param));
            break;
        }
        return result;
    }
};

TEST(cache_test, no_cache_baseline) {
    SCOPED_TRACE("if the default implementation matches the reference one, the cache tests may pass spuriously");
    auto& engine = tests::get_test_engine();
    auto helper = cache_test_helper(engine, cache_version::version_2);

    helper.with_mode(ov::intel_gpu::TuningMode::tuning_disabled)
        .expect_implementation_not(reference_impl_name)
        .test();
}

TEST_P(cache_version_test, use_only) {
    auto version = GetParam();
    auto& engine = tests::get_test_engine();

    cache_test_helper helper(engine, version);
    helper.with_mode(ov::intel_gpu::TuningMode::tuning_use_cache)
        .expect_implementation(reference_impl_name)
        .expect_cache(version)
        .test();
}

TEST_P(cache_version_test, update) {
    auto version = GetParam();
    auto ex_version = cache_version::version_2;
    if (version != cache_version::version_2) {
        ex_version = cache_version::version_2_from_1;
    }

    auto& engine = tests::get_test_engine();

    cache_test_helper helper(engine, version);
    helper.with_mode(ov::intel_gpu::TuningMode::tuning_use_and_update)
        .expect_implementation(reference_impl_name)
        .expect_cache(ex_version)
        .test();
}

INSTANTIATE_TEST_SUITE_P(
    smoke,
    cache_version_test,
    testing::Values(cache_version::version_1, cache_version::version_1_2, cache_version::version_2),
    cache_version_test::to_string);
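
// A cache entry that names a non-existent kernel must be ignored and removed
// from the file on update, leaving an empty version_2 cache.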
TEST(cache_test, remove_invalid) {
    auto& engine = tests::get_test_engine();

    cache_test_helper helper(engine, cache_version::version_2_invalid);
    helper.with_mode(ov::intel_gpu::TuningMode::tuning_use_and_update)
        .expect_implementation_not(reference_impl_name)
        .expect_cache(cache_version::version_2_empty)
        .test();
}