[GPU] Reorder weights refactoring (#17787)

This commit is contained in:
Roman Lyamin 2023-06-23 16:01:55 +04:00 committed by GitHub
parent cca8cf15ef
commit d00c7d30f9
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
40 changed files with 382 additions and 661 deletions

View File

@ -125,7 +125,7 @@ struct kernel_impl_params {
void save(BinaryOutputBuffer& ob) const;
void load(BinaryInputBuffer& ib);
const program& get_program() const {
OPENVINO_ASSERT(prog != nullptr, "[GPU] Program pointer in kernel_impl_params in not initialized");
OPENVINO_ASSERT(prog != nullptr, "[GPU] Program pointer in kernel_impl_params is not initialized");
return *prog;
}
stream& get_stream() const { return *strm; }

View File

@ -43,6 +43,7 @@ struct program {
friend class prepare_conv_eltw_fusing; // to be removed when possible
friend class reorder_inputs; // to be removed when possible
friend class remove_redundant_reorders; // to be removed when possible
friend class post_optimize_weights; // to be removed when possible
friend class program_wrapper; // this class is intended to extend the interface of program for
// the usage within tests_core_internal project only
friend class prepare_primitive_fusing_through; // to be removed when possible

View File

@ -1,104 +0,0 @@
// Copyright (C) 2018-2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#pragma once
#include "intel_gpu/primitives/primitive.hpp"
#include "intel_gpu/runtime/memory.hpp"
#include <vector>
namespace cldnn {
// Describes a weights reorder: the source and destination memory layouts.
// Polymorphic base so that backend-specific parameter types (e.g. the OCL
// variant carrying a precompiled kernel) can extend it with extra state.
struct WeightsReorderParams {
WeightsReorderParams(layout in_layout, layout out_layout) : _in_layout(in_layout), _out_layout(out_layout) {}
// Combines the hashes of both layouts; derived classes are expected to
// fold their additional state into this seed.
virtual size_t hash() const {
return hash_combine(_in_layout.hash(), _out_layout.hash());
}
// Equality first requires identical dynamic type (typeid check), so a base
// object never compares equal to a derived object that carries extra state.
virtual bool operator==(const WeightsReorderParams& rhs) const {
if (typeid(*this) != typeid(rhs))
return false;
return _in_layout == rhs._in_layout &&
_out_layout == rhs._out_layout;
}
layout get_input_layout() const { return _in_layout; }
layout get_output_layout() const { return _out_layout; }
virtual ~WeightsReorderParams() = default;
protected:
layout _in_layout;
layout _out_layout;
};
/// @brief Changes how data is ordered in memory. Value type is not changed & all information is preserved.
/// @details Corresponding values are bitwise equal before/after reorder.
/// @brief Changes how data is ordered in memory. Value type is not changed & all information is preserved.
/// @details Corresponding values are bitwise equal before/after reorder.
struct generic_layer : public primitive_base<generic_layer> {
    CLDNN_DECLARE_PRIMITIVE(generic_layer)

    // Default constructor is used by deserialization only; `params` stays
    // null until load() reconstructs it from the stream.
    generic_layer() : primitive_base("", {}) {}

    DECLARE_OBJECT_TYPE_SERIALIZATION

    /// @brief Constructs generic_layer primitive (a weights reorder).
    /// @param id This primitive id.
    /// @param input Input primitive id.
    /// @param params Weights reorder parameters (input/output layouts).
    /// @param output_padding Requested output padding.
    generic_layer(const primitive_id& id,
                  const primitive_id& input,
                  std::shared_ptr<WeightsReorderParams> params,
                  const padding& output_padding = padding())
        : primitive_base(id, {input}, {output_padding}), params(params) {}

    /// Reorder parameters; may be null for a default-constructed primitive before load().
    std::shared_ptr<WeightsReorderParams> params;

    size_t hash() const override {
        size_t seed = primitive::hash();
        if (params)
            seed = hash_combine(seed, params->hash());
        return seed;
    }

    bool operator==(const primitive& rhs) const override {
        if (!compare_common_params(rhs))
            return false;

        auto rhs_casted = downcast<const generic_layer>(rhs);
        // Both null -> equal; one null -> unequal; both set -> deep compare.
        if ((params == nullptr) != (rhs_casted.params == nullptr))
            return false;
        if (params != nullptr)
            return *params == *rhs_casted.params;
        return true;
    }

    void save(BinaryOutputBuffer& ob) const override {
        primitive_base<generic_layer>::save(ob);
        // hash()/operator== tolerate a null params, but serialization cannot:
        // fail loudly instead of dereferencing a null pointer.
        OPENVINO_ASSERT(params != nullptr, "[GPU] Can't save generic_layer primitive: params is nullptr");
        ob << params->get_input_layout();
        ob << params->get_output_layout();
    }

    void load(BinaryInputBuffer& ib) override {
        primitive_base<generic_layer>::load(ib);
        layout input_layout, output_layout;
        ib >> input_layout;
        ib >> output_layout;
        params = std::make_shared<WeightsReorderParams>(input_layout, output_layout);
    }

protected:
    // A weights reorder depends only on its single input primitive.
    std::vector<std::reference_wrapper<const primitive_id>> get_dependencies() const override { return {}; }
};
/// @}
/// @}
/// @}
} // namespace cldnn

View File

@ -19,6 +19,39 @@ enum class reorder_mean_mode {
div, // val/mean
};
// Parameters of a weights reorder carried by the reorder primitive:
// source/destination layouts plus a flag telling whether the weights must
// additionally be rotated/transposed while being reordered.
struct WeightsReorderParams {
WeightsReorderParams(const layout& in_layout, const layout& out_layout, bool transposed)
: _in_layout(in_layout),
_out_layout(out_layout),
_transposed(transposed) {}
// Hash covers both layouts and the transpose flag.
size_t hash() const {
size_t seed = hash_combine(_in_layout.hash(), _out_layout.hash());
seed = hash_combine(seed, _transposed);
return seed;
}
// typeid check keeps equality symmetric if this type is ever subclassed.
bool operator==(const WeightsReorderParams& rhs) const {
if (typeid(*this) != typeid(rhs))
return false;
return _in_layout == rhs._in_layout &&
_out_layout == rhs._out_layout &&
_transposed == rhs._transposed;
}
layout get_input_layout() const { return _in_layout; }
layout get_output_layout() const { return _out_layout; }
bool should_be_transposed() const { return _transposed; }
// Only the input side is mutable: callers refine it when fusing this reorder
// with a preceding precision-change reorder (see post_optimize_weights).
void set_input_layout(const layout& layout) { _in_layout = layout; }
protected:
layout _in_layout;
layout _out_layout;
bool _transposed;
};
/// @brief Changes how data is ordered in memory. Value type is not changed & all information is preserved.
/// @details Corresponding values are bitwise equal before/after reorder.
/// Also merged with subtraction layer, which can subtract, multiply or divide values based on mean_mode value, while doing reordering.
@ -144,16 +177,32 @@ struct reorder : public primitive_base<reorder> {
mean(mean),
mean_mode(mode) {}
/// @brief Constructs weights reorder primitive.
/// @param id This primitive id.
/// @param input Input primitive id.
/// @param weights_reorder_params Parameters required for reorder weights.
reorder(const primitive_id& id,
const input_info& input,
std::shared_ptr<WeightsReorderParams> weights_reorder_params)
: primitive_base(id, {input}),
output_format(weights_reorder_params->get_output_layout().format),
mean(""),
subtract_per_feature({}),
mean_mode(reorder_mean_mode::none),
weights_reorder_params(weights_reorder_params) {}
/// @brief Requested memory format.
format output_format;
/// @brief Primitive id to get mean subtract values. Ignored if subtract_per_featrue is set.
/// @brief Primitive id to get mean subtract values. Ignored if subtract_per_feature is set.
primitive_id mean;
/// @brief Array of mean subtract values.
std::vector<float> subtract_per_feature;
/// @brief Mode of mean execution
/// @brief Mode of mean execution.
reorder_mean_mode mean_mode;
/// @brief Input memory type
/// @brief Input memory type.
memory_type input_mem_type = memory_type::buffer;
/// @brief Parameters required for reorder weights.
std::shared_ptr<WeightsReorderParams> weights_reorder_params = {};
inline bool has_surface_input() const {
return input.size() == 1 &&
@ -170,6 +219,10 @@ struct reorder : public primitive_base<reorder> {
seed = hash_combine(seed, truncate);
seed = hash_range(seed, subtract_per_feature.begin(), subtract_per_feature.end());
seed = hash_combine(seed, mean.empty());
if (weights_reorder_params) {
seed = hash_combine(seed, weights_reorder_params->hash());
}
return seed;
}
@ -179,11 +232,18 @@ struct reorder : public primitive_base<reorder> {
auto rhs_casted = downcast<const reorder>(rhs);
bool reorder_weights_eq = (weights_reorder_params == nullptr) == (rhs_casted.weights_reorder_params == nullptr);
if (reorder_weights_eq && weights_reorder_params) {
reorder_weights_eq = *weights_reorder_params == *rhs_casted.weights_reorder_params;
}
return subtract_per_feature == rhs_casted.subtract_per_feature &&
mean_mode == rhs_casted.mean_mode &&
input_mem_type == rhs_casted.input_mem_type &&
truncate == rhs_casted.truncate &&
mean.empty() == rhs_casted.mean.empty();
output_format == rhs_casted.output_format &&
mean.empty() == rhs_casted.mean.empty() &&
reorder_weights_eq;
}
void save(BinaryOutputBuffer& ob) const override {

View File

@ -217,6 +217,7 @@ struct format {
iy_xs_os_xsv2_osv16__ao32,
i_yxs_os_yxsv2_osv16,
os_i_yxs_osv4_yxsv4,
os_i_osv16, ///< format used only for fully connected weights
os_i_osv16__ai8, ///< format used only for fully connected weights
os_i_osv8__ai8, ///< format used only for fully connected weights
os_y_is_x_osv8_isv2,

View File

@ -471,13 +471,12 @@ public:
* @endcode
*/
tensor transform(cldnn::format new_fmt, value_type default_size) const {
cldnn::format format = cldnn::format::bfvuwzyx;
auto val_order = format.internal_order();
cldnn::format default_fmt = cldnn::format::bfvuwzyx;
auto val_order = default_fmt.internal_order();
auto new_order = new_fmt.internal_order();
std::vector<value_type> old_sizes = sizes();
std::vector<value_type> new_sizes(old_sizes.size(), default_size);
const auto& new_traits = format::traits(new_fmt);
const cldnn::format default_fmt = cldnn::format::bfvuwzyx;
static const std::map<char, char> flatten_mapping = {
{ 'v', 'u'},
{ 'u', 'w'},

View File

@ -1,39 +0,0 @@
// Copyright (C) 2018-2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include "generic_layer_inst.h"
#include "primitive_type_base.h"
#include "json_object.h"
#include <algorithm>
#include <string>
#include <memory>
namespace cldnn {
GPU_DEFINE_PRIMITIVE_TYPE_ID(generic_layer)
// Program-node for the weights reorder primitive. Buffer sharing is disabled
// — presumably so the reordered weights keep a dedicated buffer; confirm
// against memory-pool usage before relying on this.
generic_layer_node::typed_program_node(const std::shared_ptr<generic_layer> prim, program& prog)
: parent(prim, prog) {
can_share_buffer(false);
}
// Instance built from a program node; all state comes from the parent class.
generic_layer_inst::typed_primitive_inst(network& network, generic_layer_node const& node)
: parent(network, node) {}
// Node-less instance — presumably used when a network is restored without
// program nodes (deserialization path); only the type id needs to be set.
generic_layer_inst::typed_primitive_inst(network& network)
: parent(network) {
_type = generic_layer::type_id();
}
// Renders the node description (JSON dump) as a plain string for logging.
std::string generic_layer_inst::to_string(generic_layer_node const& node) {
    std::stringstream description;
    node.desc_to_json()->dump(description);
    return description.str();
}
} // namespace cldnn

View File

@ -4,6 +4,8 @@
#include "pass_manager.h"
#include "program_helpers.h"
#include "implementation_map.hpp"
#include "convolution_inst.h"
#include "binary_convolution_inst.h"
#include "deconvolution_inst.h"
@ -38,53 +40,82 @@ void post_optimize_weights::optimize_weights(T& node, program& p) {
if (impl->is_dynamic())
return;
// Don't run impl selection to avoid double compilation of reorder kernels
// in main program and internal program for constant propagation
auto set_implementation = [&p, &impl](program_node& weights_reorder_node) {
if (!weights_reorder_node.is_constant()) {
auto factory = WeightsReordersFactory::get(impl_types::ocl, shape_types::static_shape);
auto reorder_kernel_params = impl->get_weights_reorder_kernel_params();
reorder_kernel_params->prog = &p;
auto reorder_impl = factory(*reorder_kernel_params);
weights_reorder_node.set_selected_impl(reorder_impl->clone());
if (auto impl = weights_reorder_node.get_selected_impl()) {
auto params = weights_reorder_node.get_kernel_impl_params();
p.get_kernels_cache().add_kernels_source(*params, impl->get_kernels_source());
}
}
};
auto output_layout = node.get_output_layout();
auto weights_reorder_params = impl->get_weights_reorder_params();
for (auto i = offsets.weights_offset; i < offsets.bias_offset; i++) {
auto& weights_node = node.get_dependency(i);
program_node& prev_node = node.get_dependency(i);
auto reorder = _rf.get_weights_reorder(weights_node.id(), weights_reorder_params);
if (weights_reorder_params != nullptr) {
bool can_be_fused = prev_node.is_type<reorder>() &&
prev_node.get_users().size() == 1 &&
prev_node.get_dependencies().size() == 1 &&
!prev_node.has_fused_primitives() &&
!prev_node.as<reorder>().has_mean() &&
prev_node.as<reorder>().get_primitive()->subtract_per_feature.empty();
if (can_be_fused) {
// Need to update input data_type for correct merging format reorder with precision reorder
data_types input_dtype = prev_node.get_input_layouts()[0].data_type;
auto updated_input_layout = weights_reorder_params->get_input_layout();
updated_input_layout.data_type = input_dtype;
weights_reorder_params->set_input_layout(updated_input_layout);
if (reorder.first) {
// insert new generic_layer node to topology
p.add_intermediate(reorder.first, node, i, !reorder.second);
// set generic_layer's node output layout and implementation
auto& g_node = node.get_dependency(i);
g_node.get_output_layout(false);
auto weights_reorder = _rf.get_weights_reorder(prev_node.get_primitive()->input[0].pid,
weights_reorder_params);
auto& weights_reorder_node = p.get_or_create(weights_reorder.first);
p.replace(prev_node, weights_reorder_node);
weights_reorder_node.recalc_output_layout(false);
// Don't run impl selection to avoid double compilation of reorder kernels
// in main program and internal program for constant propagation
if ((!g_node.is_constant()) && (!reorder.second)) {
g_node.set_selected_impl(g_node.type()->choose_impl(g_node));
if (auto impl = g_node.get_selected_impl()) {
auto params = g_node.get_kernel_impl_params();
p.get_kernels_cache().add_kernels_source(*params, impl->get_kernels_source());
if (!weights_reorder.second) {
set_implementation(weights_reorder_node);
}
} else {
auto weights_reorder = _rf.get_weights_reorder(prev_node.id(), weights_reorder_params);
// insert new weights reorder node to topology
p.add_intermediate(weights_reorder.first, node, i, !weights_reorder.second);
// set weights reorder's node output layout and implementation
auto& weights_reorder_node = node.get_dependency(i);
weights_reorder_node.get_output_layout(false);
if (!weights_reorder.second) {
set_implementation(weights_reorder_node);
}
}
}
}
// Reset weights reorder params to not keep source code pointer
impl->reset_weights_reorder_params();
// set the old output layout and do not invalidate users as change of weights will not affect output layout
node.set_output_layout(output_layout, false);
}
void post_optimize_weights::run(program& p) {
for (auto& node : p.get_processing_order()) {
if (node->type() == convolution::type_id()) {
if (node->is_type<convolution>()) {
optimize_weights(node->as<convolution>(), p);
}
if (node->type() == binary_convolution::type_id()) {
} else if (node->is_type<binary_convolution>()) {
optimize_weights(node->as<binary_convolution>(), p);
} else if (node->type() == deconvolution::type_id()) {
} else if (node->is_type<deconvolution>()) {
optimize_weights(node->as<deconvolution>(), p);
} else if (node->type() == deformable_conv::type_id()) {
} else if (node->is_type<deformable_conv>()) {
optimize_weights(node->as<deformable_conv>(), p);
} else if (node->type() == fully_connected::type_id()) {
} else if (node->is_type<fully_connected>()) {
optimize_weights(node->as<fully_connected>(), p);
} else if (node->type() == lstm_dynamic_input::type_id()) {
} else if (node->is_type<lstm_dynamic_input>()) {
optimize_weights(node->as<lstm_dynamic_input>(), p);
}
}

View File

@ -25,12 +25,10 @@ void pre_replace_deconv::run(program& p) {
while (itr != p.nodes_map.end()) {
auto node_itr = itr++;
auto& node = (*node_itr).second;
// find deconvolution primitives with stride 1 and change them to convolution with trasposed weights
// find deconvolution primitives with stride 1 and change them to convolution with transposed weights
if (node->is_type<deconvolution>()) {
if (node->is_dynamic())
continue;
if (!p.get_config().get_property(ov::intel_gpu::optimize_data))
continue;
auto& deconv_node = node->as<deconvolution>();
auto& weights_node = deconv_node.weights();
@ -61,7 +59,6 @@ void pre_replace_deconv::run(program& p) {
if (!perform_opt)
continue;
// setting convolution parameters based on deconvolution params
auto output_layout = deconv_node.get_output_layout();
auto output_pshape = output_layout.get_partial_shape();
@ -73,8 +70,7 @@ void pre_replace_deconv::run(program& p) {
auto output_padding = deconv_prim->output_paddings[0];
auto grouped_weights_shape = deconv_prim->grouped_weights_shape;
// remove deconvolution node and its connections to weights and biases, rename it and move to the optimized
// list
// remove deconvolution node and its connections to weights and biases, rename it and move to the optimized list
p.remove_connection(input_node, deconv_node);
std::vector<std::shared_ptr<program_node>> weight_connections;
for (auto& weights_id : weights_nodes_id) {

View File

@ -166,7 +166,7 @@ void propagate_constants::add_deps_to_tpl(program& prog, const std::vector<std::
if (dep.first->is_type<data>()) {
auto dep_ptr = prog.get_node_ptr(dep.first->get_primitive()->id);
if (nodes.find(dep_ptr) == nodes.end()) {
nodes.insert(prog.get_node_ptr(dep.first->get_primitive()->id));
nodes.insert(dep_ptr);
const_inputs.push_back(&dep.first->as<data>());
}
}

View File

@ -275,7 +275,9 @@ void remove_redundant_reorders::run(program& p) {
!r_node.get_primitive()->subtract_per_feature.empty() ||
no_output_optimization ||
r_node.has_fused_primitives() ||
r_node.get_primitive()->has_surface_input())
r_node.get_primitive()->has_surface_input() ||
(r_node.get_primitive()->weights_reorder_params &&
r_node.get_primitive()->weights_reorder_params->should_be_transposed()))
continue;
auto o_layout = r_node.get_output_layout();

View File

@ -1,137 +0,0 @@
// Copyright (C) 2018-2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include "primitive_base.hpp"
#include "generic_layer_inst.h"
namespace cldnn {
namespace ocl {
// OCL implementation of the generic_layer (weights reorder) primitive.
// Unlike regular impls it does not run kernel selection: it executes exactly
// one precompiled kernel taken from WeightsReorderParamsOCL.
struct generic_layer_impl : typed_primitive_impl<generic_layer> {
using parent = typed_primitive_impl<generic_layer>;
using parent::parent;
// Full kernel description (source, work-group sizes, arguments, scalars).
kernel_selector::cl_kernel_data _cl_kernel_data;
// Compiled kernel handle; populated by init_kernels()/init_by_cached_kernels().
kernel::ptr _kernel;
// Id used to look the kernel up in the persistent kernels cache.
kernel_id _cached_kernel_id;
DECLARE_OBJECT_TYPE_SERIALIZATION
std::unique_ptr<primitive_impl> clone() const override {
return make_unique<generic_layer_impl>(*this);
}
generic_layer_impl() : parent() {}
// Copy requires an already-initialized source: the kernel handle is cloned,
// never shared.
generic_layer_impl(const generic_layer_impl& other)
: _cl_kernel_data(other._cl_kernel_data)
, _kernel(nullptr)
, _cached_kernel_id(other._cached_kernel_id) {
OPENVINO_ASSERT(other._kernel, "[GPU] Can't copy generic_layer_impl node: kernel is nullptr");
_kernel = other._kernel->clone();
}
// Builds the impl from the primitive's reorder params, which must be the
// OCL-specific subtype carrying a clKernelData.
generic_layer_impl(const kernel_impl_params& params)
: _cl_kernel_data()
, _kernel(nullptr)
, _cached_kernel_id() {
auto reorder_params = params.typed_desc<generic_layer>()->params;
auto casted_params = std::dynamic_pointer_cast<WeightsReorderParamsOCL>(reorder_params);
OPENVINO_ASSERT(casted_params, "[GPU] Invalid weights reorder parameters type for ", params.desc->id, " node");
_cl_kernel_data = *casted_params->get_cl_kernel();
}
// Exactly one kernel source string — this impl always wraps a single kernel.
std::vector<std::shared_ptr<cldnn::kernel_string>> get_kernels_source() override {
std::vector<std::shared_ptr<cldnn::kernel_string>> kernel_strings;
kernel_strings.push_back(_cl_kernel_data.code.kernelString);
return kernel_strings;
}
std::vector<kernel::ptr> get_kernels() const override {
return {_kernel};
}
// Serialization stores the kernel description and cache id, not the compiled
// kernel itself; load() restores them and init_by_cached_kernels() rebinds.
void save(BinaryOutputBuffer& ob) const override {
ob << _cl_kernel_data;
ob << _cached_kernel_id;
}
void load(BinaryInputBuffer& ib) override {
ib >> _cl_kernel_data;
ib >> _cached_kernel_id;
}
void init_kernels(const kernels_cache& kernels_cache, const kernel_impl_params& params) override {
_kernel = nullptr;
auto compiled_kernels = kernels_cache.get_kernels(params);
OPENVINO_ASSERT(compiled_kernels.size() == 1, "[GPU] Unexpected number of kernels for generic_layer during init_kernels() call");
_kernel = compiled_kernels.front();
}
void init_by_cached_kernels(const kernels_cache& kernels_cache) override {
_kernel = kernels_cache.get_kernel_from_cached_kernels(_cached_kernel_id);
}
void set_cached_kernel_ids(const kernels_cache& kernels_cache) override {
_cached_kernel_id = kernels_cache.get_cached_kernel_id(_kernel);
}
void set_kernels(cldnn::kernels_cache::compiled_kernels kernels) override {
OPENVINO_ASSERT(kernels.size() == 1 &&
kernels.begin()->second.size() == 1, "[GPU] Unexpected number of kernels for generic_layer");
_kernel = kernels.begin()->second[0].first;
}
// Binds scalars, all input memories and the single output memory as kernel args.
void set_arguments_impl(generic_layer_inst& instance) override {
kernel_arguments_data args;
args.scalars = &_cl_kernel_data.params.scalars;
for (size_t i = 0; i < instance.inputs_memory_count(); i++) {
args.inputs.push_back(instance.input_memory_ptr(i));
}
args.outputs.push_back(instance.output_memory_ptr());
set_arguments_impl(instance, args);
}
void set_arguments_impl(generic_layer_inst& instance, kernel_arguments_data& args) override {
stream& stream = instance.get_network().get_stream();
stream.set_arguments(*_kernel, _cl_kernel_data.params, args);
}
// Enqueues the kernel; rebuilds the argument list the same way as
// set_arguments_impl() above.
event::ptr execute_impl(const std::vector<event::ptr>& events, generic_layer_inst& instance) override {
stream& stream = instance.get_network().get_stream();
kernel_arguments_data args;
args.scalars = &_cl_kernel_data.params.scalars;
for (size_t i = 0; i < instance.inputs_memory_count(); i++) {
args.inputs.push_back(instance.input_memory_ptr(i));
}
args.outputs.push_back(instance.output_memory_ptr());
return stream.enqueue_kernel(*_kernel, _cl_kernel_data.params, args, events, true);
}
// Factory entry point registered in WeightsReordersFactory.
static std::unique_ptr<primitive_impl> create(const kernel_impl_params& params) {
return make_unique<generic_layer_impl>(params);
}
};
// implementation_map-style factory; the node argument is unused because the
// impl is fully described by the kernel_impl_params.
static std::unique_ptr<primitive_impl> create(const generic_layer_node& arg, const kernel_impl_params& params) {
return make_unique<generic_layer_impl>(params);
}
namespace detail {
// Registers the OCL generic_layer impl: once in the regular implementation map
// (empty type/format list — presumably matches any; confirm against
// implementation_map semantics) and once as a static-shape weights-reorder factory.
attach_generic_layer_impl::attach_generic_layer_impl() {
implementation_map<generic_layer>::add(cldnn::impl_types::ocl, create, {});
WeightsReordersFactory::add(cldnn::impl_types::ocl, shape_types::static_shape, generic_layer_impl::create);
}
} // namespace detail
} // namespace ocl
} // namespace cldnn
BIND_BINARY_BUFFER_WITH_TYPE(cldnn::ocl::generic_layer_impl)
BIND_BINARY_BUFFER_WITH_TYPE(cldnn::generic_layer)

View File

@ -387,6 +387,7 @@ kernel_selector::weights_layout to_weights_layout(format f, bool is_grouped) {
case format::iyxo:
case format::fyxb:
return kernel_selector::weights_layout::iyxo;
case format::oyxi:
case format::byxf:
return kernel_selector::weights_layout::oyxi;
case format::byfx:
@ -408,6 +409,8 @@ kernel_selector::weights_layout to_weights_layout(format f, bool is_grouped) {
return kernel_selector::weights_layout::os_is_yx_osv16_isv16;
case format::os_iyx_osv32:
return kernel_selector::weights_layout::os_iyx_osv32;
case format::os_iyx_osv32__ai32:
return kernel_selector::weights_layout::os_iyx_osv32__ai32;
case format::os_iyx_osv64:
return kernel_selector::weights_layout::os_iyx_osv64;
case format::image_2d_weights_c4_fyx_b:
@ -509,18 +512,26 @@ kernel_selector::weights_layout to_weights_layout(format f, bool is_grouped) {
return kernel_selector::weights_layout::os_i_osv8__ai8;
case format::os_i_osv16__ai8:
return kernel_selector::weights_layout::os_i_osv16__ai8;
case format::bs_f_bsv16:
case format::os_i_osv16:
return kernel_selector::weights_layout::os_i_osv16;
case format::os_is_zyx_isv16_osv16:
return kernel_selector::weights_layout::os_is_zyx_isv16_osv16;
case format::is_os_zyx_isv16_osv16:
return kernel_selector::weights_layout::is_os_zyx_isv16_osv16;
case format::os_is_zyx_osv32_isv16:
return kernel_selector::weights_layout::os_is_zyx_osv32_isv16;
case format::is_os_yx_isv16_osv16:
return kernel_selector::weights_layout::is_os_yx_isv16_osv16;
case format::is_os_yx_isv16_osv8:
return kernel_selector::weights_layout::is_os_yx_isv16_osv8;
case format::i_yxs_os_yxsv2_osv16:
return kernel_selector::weights_layout::i_yxs_os_yxsv2_osv16;
case format::is_os_yx_osa4_isa8_osv8_isv4:
return kernel_selector::weights_layout::is_os_yx_osa4_isa8_osv8_isv4;
case format::iy_xs_os_xsv2_osv8__ao32:
return kernel_selector::weights_layout::iy_xs_os_xsv2_osv8__ao32;
case format::iy_xs_os_xsv2_osv16__ao32:
return kernel_selector::weights_layout::iy_xs_os_xsv2_osv16__ao32;
case format::os_is_osv32_isv32_swizzled_by_4:
return kernel_selector::weights_layout::os_is_osv32_isv32_swizzled_by_4;
case format::os_is_zyx_isv8_osv16_isv2:
@ -551,6 +562,12 @@ kernel_selector::weights_layout to_weights_layout(format f, bool is_grouped) {
return kernel_selector::weights_layout::gs_oizyx_gsv16;
case format::gs_oiyx_gsv32:
return kernel_selector::weights_layout::gs_oiyx_gsv32;
case format::gs_oi_yxs_gsv4_yxsv4:
return kernel_selector::weights_layout::gs_oi_yxs_gsv4_yxsv4;
case format::gs_oi_yxs_gsv16_yxsv4:
return kernel_selector::weights_layout::gs_oi_yxs_gsv16_yxsv4;
case format::gs_oi_yxs_gsv32_yxsv4:
return kernel_selector::weights_layout::gs_oi_yxs_gsv32_yxsv4;
case format::gs_oizyx_gsv32:
return kernel_selector::weights_layout::gs_oizyx_gsv32;
case format::gyxio:
@ -647,6 +664,10 @@ kernel_selector::weights_layout to_weights_layout(format f, bool is_grouped) {
return kernel_selector::weights_layout::g_os_y_is_x_osv8_isv2;
case format::g_os_y_is_x_osv8_isv4:
return kernel_selector::weights_layout::g_os_y_is_x_osv8_isv4;
case format::g_os_is_yx_isv16_osv16:
return kernel_selector::weights_layout::g_os_is_yx_isv16_osv16;
case format::lstm_weights_dio:
return kernel_selector::weights_layout::dlstm_dir_io;
default:
throw std::invalid_argument("Unable to convert tensor layout " + fmt_to_str(f) + " to weights layout");
}
@ -686,7 +707,7 @@ cldnn::format::type from_weights_layout(kernel_selector::weights_layout l) {
case kernel_selector::weights_layout::os_iyx_osv64:
return cldnn::format::os_iyx_osv64;
case kernel_selector::weights_layout::os_i_osv16:
return cldnn::format::bs_f_bsv16;
return cldnn::format::os_i_osv16;
case kernel_selector::weights_layout::os_i_osv8__ai8:
return cldnn::format::os_i_osv8__ai8;
case kernel_selector::weights_layout::os_i_osv16__ai8:

View File

@ -14,7 +14,7 @@
#include "intel_gpu/primitives/eltwise.hpp"
#include "intel_gpu/primitives/quantize.hpp"
#include "intel_gpu/primitives/activation.hpp"
#include "intel_gpu/primitives/generic_layer.hpp"
#include "intel_gpu/primitives/reorder.hpp"
#include "intel_gpu/primitives/primitive.hpp"
#include "kernel_selector_params.h"
@ -80,7 +80,6 @@ using multi_data_tensor = kernel_selector::MultiDataTensor;
using params = kernel_selector::Params;
using weights_reorder_params = kernel_selector::WeightsReorderParams;
using generic_kernel_params = kernel_selector::GenericKernelParams;
} // namespace kernel_selector
@ -272,106 +271,12 @@ inline kernel_impl_params canonicalize_fused_shapes(const kernel_impl_params& im
return updated_impl_params;
}
// OCL-specific weights reorder parameters: extends the base layout pair with
// the precompiled clKernelData produced by the kernel selector. hash() and
// operator== additionally cover the kernel's dispatch configuration so that
// two reorders with identical layouts but different kernels don't collide.
class WeightsReorderParamsOCL : public WeightsReorderParams {
public:
explicit WeightsReorderParamsOCL(const kernel_selector::WeightsReorderParams& params)
: WeightsReorderParams(from_weights_tensor(params.src), from_weights_tensor(params.dest)) {
cl_kernel = params.clKernel;
}
size_t hash() const override {
size_t seed = WeightsReorderParams::hash();
// A null kernel contributes nothing beyond the base layouts.
if (cl_kernel == nullptr)
return seed;
seed = hash_combine(seed, cl_kernel->skip_execution);
auto& gws = cl_kernel->params.workGroups.global;
seed = hash_range(seed, gws.begin(), gws.end());
auto& lws = cl_kernel->params.workGroups.local;
seed = hash_range(seed, lws.begin(), lws.end());
auto& arguments = cl_kernel->params.arguments;
for (auto& args : arguments) {
seed = hash_combine(seed, args.index);
seed = hash_combine(seed, args.t);
}
// NOTE(review): only the scalar's type tag is hashed, not its value —
// looks intentional (values may not be comparable) but worth confirming.
auto& scalars = cl_kernel->params.scalars;
for (auto& s : scalars) {
seed = hash_combine(seed, s.t);
}
return seed;
}
bool operator==(const WeightsReorderParams& rhs) const override {
if (typeid(*this) != typeid(rhs))
return false;
if (!WeightsReorderParams::operator==(rhs))
return false;
auto rhs_casted = downcast<const WeightsReorderParamsOCL>(rhs);
// NOTE(review): kernels are compared only when both sides have one; a
// null-vs-non-null pair falls through to `true` — confirm this is intended.
if (cl_kernel != nullptr && rhs_casted.cl_kernel != nullptr) {
auto& clKernel_rhs = rhs_casted.cl_kernel;
if (cl_kernel->skip_execution != clKernel_rhs->skip_execution)
return false;
auto& gws = cl_kernel->params.workGroups.global;
auto& gws_rhs = clKernel_rhs->params.workGroups.global;
if (gws != gws_rhs)
return false;
auto& lws = cl_kernel->params.workGroups.local;
auto& lws_rhs = clKernel_rhs->params.workGroups.local;
if (lws != lws_rhs)
return false;
auto& arguments = cl_kernel->params.arguments;
auto& arguments_rhs = clKernel_rhs->params.arguments;
if (arguments.size() != arguments_rhs.size())
return false;
for (size_t idx = 0; idx < arguments.size(); idx++) {
if (arguments[idx].index != arguments_rhs[idx].index)
return false;
if (arguments[idx].t != arguments_rhs[idx].t)
return false;
}
auto& scalars = cl_kernel->params.scalars;
auto& scalars_rhs = clKernel_rhs->params.scalars;
if (scalars.size() != scalars_rhs.size())
return false;
// Matches hash(): scalar comparison is by type tag only, not value.
for (size_t idx = 0; idx < scalars.size(); idx++) {
if (scalars[idx].t != scalars_rhs[idx].t)
return false;
}
}
return true;
}
std::shared_ptr<kernel_selector::clKernelData> get_cl_kernel() {
return cl_kernel;
}
private:
std::shared_ptr<kernel_selector::clKernelData> cl_kernel;
};
inline std::shared_ptr<WeightsReorderParams> create_weights_reorder_params(const kernel_selector::WeightsReorderParams& params) {
if (params.engine == kernel_selector::generic_kernel_params::Engine::NONE) {
if (!params.is_initialized) {
return nullptr;
}
return std::make_shared<WeightsReorderParamsOCL>(params);
return std::make_shared<WeightsReorderParams>(from_weights_tensor(params.src), from_weights_tensor(params.dest), params.rotate);
}
} // namespace cldnn

View File

@ -38,10 +38,7 @@ struct typed_primitive_impl_ocl : public typed_primitive_impl<PType> {
// a pair of batch program hash and kernel entry hash of each ocl impl.
std::pair<std::string, std::string> kernel_dump_info;
typed_primitive_impl_ocl() : _kernel_data({}), _cached_kernel_ids({}), _kernels({}) {
_kernel_data.weightsReorderParams.engine = kernel_selector::generic_kernel_params::Engine::NONE;
_kernel_data.weightsReorderParams.clKernel = nullptr;
}
typed_primitive_impl_ocl() : _kernel_data({}), _cached_kernel_ids({}), _kernels({}) {}
typed_primitive_impl_ocl(const typed_primitive_impl_ocl<PType>& other)
: typed_primitive_impl<PType>(other._weights_reorder_params, other._kernel_name, other._is_dynamic)
@ -58,10 +55,6 @@ struct typed_primitive_impl_ocl : public typed_primitive_impl<PType> {
typed_primitive_impl_ocl(const kernel_selector::kernel_data& kd)
: typed_primitive_impl<PType>(create_weights_reorder_params(kd.weightsReorderParams), kd.kernelName),
_kernel_data(kd) {
// weights reorder params got copied to parent, clear in _kernel_data to release shared ptr
_kernel_data.weightsReorderParams.engine = kernel_selector::generic_kernel_params::Engine::NONE;
_kernel_data.weightsReorderParams.clKernel = nullptr;
this->can_reuse_memory = _kernel_data.can_reuse_memory;
}

View File

@ -81,7 +81,6 @@ void register_implementations() {
REGISTER_OCL(tile);
REGISTER_OCL(lstm_dynamic_input);
REGISTER_OCL(lstm_dynamic_timeloop);
REGISTER_OCL(generic_layer);
REGISTER_OCL(gather_tree);
REGISTER_OCL(resample);
REGISTER_OCL(grn);

View File

@ -28,7 +28,6 @@
#include "intel_gpu/primitives/experimental_detectron_topk_rois.hpp"
#include "intel_gpu/primitives/eye.hpp"
#include "intel_gpu/primitives/fully_connected.hpp"
#include "intel_gpu/primitives/generic_layer.hpp"
#include "intel_gpu/primitives/gather.hpp"
#include "intel_gpu/primitives/gather_elements.hpp"
#include "intel_gpu/primitives/gather_nd.hpp"
@ -162,7 +161,6 @@ REGISTER_OCL(strided_slice);
REGISTER_OCL(tile);
REGISTER_OCL(lstm_dynamic_input);
REGISTER_OCL(lstm_dynamic_timeloop);
REGISTER_OCL(generic_layer);
REGISTER_OCL(gather_tree);
REGISTER_OCL(resample);
REGISTER_OCL(grn);

View File

@ -7,6 +7,7 @@
#include "reorder_inst.h"
#include "reorder/reorder_kernel_selector.h"
#include "reorder/reorder_kernel_base.h"
#include "reorder/reorder_weights_kernel_selector.h"
namespace cldnn {
namespace ocl {
@ -26,9 +27,10 @@ struct reorder_impl : typed_primitive_impl_ocl<reorder> {
protected:
kernel_arguments_data get_arguments(const reorder_inst& instance) const override {
kernel_arguments_data args = parent::get_arguments(instance);
auto input = &instance.input_memory();
auto input_layout = input->get_layout();
if (instance.has_mean()) {
if (instance.has_node() && instance.has_mean()) {
auto input = &instance.input_memory();
auto input_layout = input->get_layout();
if (input_layout.format == cldnn::format::nv12) {
args.bias = instance.mean_nv12_memory();
} else {
@ -108,12 +110,45 @@ public:
auto kernel_params = get_kernel_params(impl_param, true);
(_kernel_data.update_dispatch_data_func)(kernel_params.first, _kernel_data);
}
static std::unique_ptr<primitive_impl> create(const reorder_node& arg, const kernel_impl_params& impl_param) {
bool is_reorder_weights = format::is_weights_format(impl_param.get_input_layout().format) ||
format::is_weights_format(impl_param.get_output_layout().format);
if (is_reorder_weights) {
return create_reorder_weights(impl_param);
} else {
return typed_primitive_impl_ocl<reorder>::create<reorder_impl>(arg, impl_param);
}
}
static std::unique_ptr<primitive_impl> create_reorder_weights(const kernel_impl_params& impl_param) {
const auto& prim = impl_param.typed_desc<reorder>();
const auto& weights_params = prim->weights_reorder_params;
auto& kernel_selector = kernel_selector::ReorderWeightsKernelSelector::Instance();
OPENVINO_ASSERT(impl_param.get_input_layout().bytes_count() == weights_params->get_input_layout().bytes_count(),
"[GPU] Input layout doesn't match required reorder weights layout");
kernel_selector::reorder_weights_params r_params;
set_params(impl_param, r_params);
r_params.input = convert_weights_tensor(weights_params->get_input_layout());
r_params.output = convert_weights_tensor(weights_params->get_output_layout());
r_params.layerID = impl_param.desc->id + "_reorder_weigths";
r_params.uniqueID = std::to_string(impl_param.unique_id) + "_weight";
r_params.rotate_180 = weights_params->should_be_transposed();
kernel_selector::reorder_optional_params optional_params;
auto best_kernel = kernel_selector.get_best_kernel(r_params, optional_params);
return make_unique<reorder_impl>(best_kernel);
}
};
namespace detail {
attach_reorder_impl::attach_reorder_impl() {
implementation_map<reorder>::add(impl_types::ocl, shape_types::static_shape, typed_primitive_impl_ocl<reorder>::create<reorder_impl>, {});
implementation_map<reorder>::add(impl_types::ocl, shape_types::static_shape, reorder_impl::create, {});
auto types = {
data_types::f32,
@ -129,7 +164,9 @@ attach_reorder_impl::attach_reorder_impl() {
format::bfzyx,
format::bfwzyx,
};
implementation_map<reorder>::add(impl_types::ocl, shape_types::dynamic_shape, typed_primitive_impl_ocl<reorder>::create<reorder_impl>, types, formats);
implementation_map<reorder>::add(impl_types::ocl, shape_types::dynamic_shape, reorder_impl::create, types, formats);
WeightsReordersFactory::add(cldnn::impl_types::ocl, shape_types::static_shape, reorder_impl::create_reorder_weights);
}
} // namespace detail

View File

@ -130,38 +130,17 @@ protected:
return attrs;
}
static kernel_selector::WeightsReorderParams get_weights_reorder(const kernel_impl_params& impl_params, const dnnl::primitive_desc& pd, bool rotate) {
kernel_selector::WeightsReorderParams weights_reorder_params;
auto& reorderKS = kernel_selector::ReorderWeightsKernelSelctor::Instance();
kernel_selector::reorder_weights_params r_params;
static std::shared_ptr<WeightsReorderParams> get_weights_reorder(const kernel_impl_params& impl_params, const dnnl::primitive_desc& pd, bool rotate) {
auto cldnn_prim = impl_params.typed_desc<convolution>();
auto weights_layout = impl_params.get_input_layout(1);
auto grouped_weights = format::is_grouped(weights_layout.format) || cldnn_prim->grouped_weights_shape;
cldnn::format out_fmt = onednn::find_format(pd.weights_desc(0), grouped_weights);
kernel_selector::WeightsLayout reqLayout = to_weights_layout(out_fmt, cldnn_prim->grouped_weights_shape);
set_params(impl_params, r_params);
r_params.layerID = cldnn_prim->id + "_reorder_";
r_params.input = convert_weights_tensor(weights_layout, cldnn_prim->grouped_weights_shape);
r_params.output = r_params.input.TransformIgnorePadding(reqLayout, r_params.input.GetDType(), cldnn_prim->groups, false);
r_params.rotate_180 = rotate;
auto input_weights_layout = impl_params.get_input_layout(1);
auto grouped_weights = format::is_grouped(input_weights_layout.format) || cldnn_prim->grouped_weights_shape;
format out_fmt = onednn::find_format(pd.weights_desc(0), grouped_weights);
kernel_selector::reorder_optional_params op;
kernel_selector::KernelsData kernels_data = reorderKS.GetBestKernels(r_params, op);
auto output_weights_layout = input_weights_layout;
output_weights_layout.format = out_fmt;
if (kernels_data.empty()) {
throw std::runtime_error("No suitable kernel found for weights reorder from " +
kernel_selector::toString(r_params.input.GetLayout()) + " to " +
kernel_selector::toString(r_params.output.GetLayout()));
}
weights_reorder_params.engine = kernel_selector::WeightsReorderParams::Engine::GPU;
weights_reorder_params.clKernel = std::make_shared<kernel_selector::clKernelData>(kernels_data[0].kernels[0]);
weights_reorder_params.src = r_params.input;
weights_reorder_params.dest = r_params.output;
return weights_reorder_params;
return std::make_shared<WeightsReorderParams>(input_weights_layout, output_weights_layout, rotate);
}
public:

View File

@ -51,38 +51,17 @@ protected:
return arg.get_onednn_primitive_attributes();
}
static kernel_selector::WeightsReorderParams get_weights_reorder(const kernel_impl_params& impl_params, const dnnl::primitive_desc& pd) {
kernel_selector::WeightsReorderParams weights_reorder_params;
auto& reorderKS = kernel_selector::ReorderWeightsKernelSelctor::Instance();
kernel_selector::reorder_weights_params r_params;
static std::shared_ptr<WeightsReorderParams> get_weights_reorder(const kernel_impl_params& impl_params, const dnnl::primitive_desc& pd) {
auto cldnn_prim = impl_params.typed_desc<deconvolution>();
auto weights_layout = impl_params.get_input_layout(1);
auto grouped_weights = format::is_grouped(weights_layout.format) || cldnn_prim->grouped_weights_shape;
cldnn::format out_fmt = onednn::find_format(pd.weights_desc(0), grouped_weights);
kernel_selector::WeightsLayout reqLayout = to_weights_layout(out_fmt, cldnn_prim->grouped_weights_shape);
set_params(impl_params, r_params);
r_params.layerID = cldnn_prim->id + "_reorder_";
r_params.input = convert_weights_tensor(weights_layout, cldnn_prim->grouped_weights_shape);
r_params.output = r_params.input.TransformIgnorePadding(reqLayout, r_params.input.GetDType(), cldnn_prim->groups, false);
r_params.rotate_180 = false;
auto input_weights_layout = impl_params.get_input_layout(1);
auto grouped_weights = format::is_grouped(input_weights_layout.format) || cldnn_prim->grouped_weights_shape;
format out_fmt = onednn::find_format(pd.weights_desc(0), grouped_weights);
kernel_selector::reorder_optional_params op;
kernel_selector::KernelsData kernels_data = reorderKS.GetBestKernels(r_params, op);
auto output_weights_layout = input_weights_layout;
output_weights_layout.format = out_fmt;
if (kernels_data.empty()) {
throw std::runtime_error("No suitable kernel found for weights reorder from " +
kernel_selector::toString(r_params.input.GetLayout()) + " to " +
kernel_selector::toString(r_params.output.GetLayout()));
}
weights_reorder_params.engine = kernel_selector::WeightsReorderParams::Engine::GPU;
weights_reorder_params.clKernel = std::make_shared<kernel_selector::clKernelData>(kernels_data[0].kernels[0]);
weights_reorder_params.src = r_params.input;
weights_reorder_params.dest = r_params.output;
return weights_reorder_params;
return std::make_shared<WeightsReorderParams>(input_weights_layout, output_weights_layout, false);
}
public:

View File

@ -51,7 +51,7 @@ protected:
return args;
}
static kernel_selector::WeightsReorderParams get_weights_reorder(const kernel_impl_params& impl_params, const dnnl::primitive_desc& pd) {
static std::shared_ptr<WeightsReorderParams> get_weights_reorder(const kernel_impl_params& impl_params, const dnnl::primitive_desc& pd) {
auto input_layout = impl_params.get_input_layout(0);
auto weights_layout = impl_params.get_input_layout(1);
auto cldnn_prim = impl_params.typed_desc<fully_connected>();
@ -66,35 +66,12 @@ protected:
weights_layout.set_partial_shape(reshape_to_2d(weights_pshape, feature));
}
kernel_selector::WeightsReorderParams weights_reorder_params;
auto& reorderKS = kernel_selector::ReorderWeightsKernelSelctor::Instance();
kernel_selector::reorder_weights_params r_params;
format out_fmt = onednn::find_format(pd.weights_desc(0));
cldnn::format out_fmt = onednn::find_format(pd.weights_desc(0));
kernel_selector::WeightsLayout req_layout = to_weights_layout(out_fmt, false);
auto output_weights_layout = weights_layout;
output_weights_layout.format = out_fmt;
// set engine info & forcing
set_params(impl_params, r_params);
r_params.layerID = cldnn_prim->id + "_reorder_";
r_params.input = convert_weights_tensor(weights_layout, false);
r_params.output = r_params.input.TransformIgnorePadding(req_layout, r_params.input.GetDType(), 1, false);
r_params.rotate_180 = false;
kernel_selector::reorder_optional_params op;
kernel_selector::KernelsData kernels_data = reorderKS.GetBestKernels(r_params, op);
if (kernels_data.empty()) {
throw std::runtime_error("No suitable kernel found for weights reorder from " +
kernel_selector::toString(r_params.input.GetLayout()) + " to " +
kernel_selector::toString(r_params.output.GetLayout()));
}
weights_reorder_params.engine = kernel_selector::WeightsReorderParams::Engine::GPU;
weights_reorder_params.clKernel = std::make_shared<kernel_selector::clKernelData>(kernels_data[0].kernels[0]);
weights_reorder_params.src = r_params.input;
weights_reorder_params.dest = r_params.output;
return weights_reorder_params;
return std::make_shared<WeightsReorderParams>(weights_layout, output_weights_layout, false);
}
static std::shared_ptr<dnnl::inner_product_forward::primitive_desc> get_fully_connected_primitive_descriptor(const kernel_impl_params& impl_params,

View File

@ -46,8 +46,8 @@ struct typed_primitive_onednn_impl : public typed_primitive_impl<PType> {
const ExecutionConfig& config,
std::shared_ptr<dnnl::primitive_attr> attrs,
const PrimDescType& pd,
kernel_selector::WeightsReorderParams weights_reorder = {})
: typed_primitive_impl<PType>(create_weights_reorder_params(weights_reorder), pd.impl_info_str()),
std::shared_ptr<WeightsReorderParams> weights_reorder = {})
: typed_primitive_impl<PType>(weights_reorder, pd.impl_info_str()),
_engine(&engine),
_attrs(attrs),
_pd(pd) {

View File

@ -1,45 +0,0 @@
// Copyright (C) 2018-2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#pragma once
#include "intel_gpu/primitives/generic_layer.hpp"
#include "primitive_inst.h"
#include <string>
#include <memory>
namespace cldnn {
template <>
struct typed_program_node<generic_layer> : public typed_program_node_base<generic_layer> {
using parent = typed_program_node_base<generic_layer>;
typed_program_node(const std::shared_ptr<generic_layer> prim, program& prog);
public:
using parent::parent;
program_node& input() const { return get_dependency(0); }
};
using generic_layer_node = typed_program_node<generic_layer>;
template <>
class typed_primitive_inst<generic_layer> : public typed_primitive_inst_base<generic_layer> {
using parent = typed_primitive_inst_base<generic_layer>;
using parent::parent;
public:
static layout calc_output_layout(generic_layer_node const& node, kernel_impl_params const& impl_param) {
return impl_param.typed_desc<generic_layer>()->params->get_output_layout();
}
static std::string to_string(generic_layer_node const& node);
typed_primitive_inst(network& network, generic_layer_node const& node);
typed_primitive_inst(network& network);
};
using generic_layer_inst = typed_primitive_inst<generic_layer>;
} // namespace cldnn

View File

@ -10,7 +10,6 @@
#include "intel_gpu/runtime/lru_cache.hpp"
#include "data_inst.h"
#include "generic_layer_inst.h"
#include "reorder_inst.h"
#include "convolution_inst.h"
#include "deconvolution_inst.h"
@ -73,7 +72,6 @@ private:
};
std::map<cache_key, std::shared_ptr<reorder>> _cached_reorders;
std::map<cache_key, std::shared_ptr<generic_layer>> _cached_generic_reorders;
};
class layout_optimizer {

View File

@ -5,7 +5,6 @@
#pragma once
#include "intel_gpu/primitives/primitive.hpp"
#include "intel_gpu/primitives/concatenation.hpp"
#include "intel_gpu/primitives/generic_layer.hpp"
#include "intel_gpu/runtime/event.hpp"
#include "intel_gpu/runtime/memory.hpp"
#include "intel_gpu/runtime/lru_cache.hpp"
@ -99,7 +98,6 @@ struct primitive_impl {
bool need_weights_reorder() const { return _weights_reorder_params != nullptr; }
std::shared_ptr<WeightsReorderParams> get_weights_reorder_params() const { return _weights_reorder_params; }
void reset_weights_reorder_params() { _weights_reorder_params = nullptr; }
std::shared_ptr<kernel_impl_params> get_weights_reorder_kernel_params() const;
@ -232,6 +230,7 @@ public:
bool is_constant() const { return _is_constant; }
bool needs_completion_event() const { return _needs_completion_event; }
bool has_unfused_subgraph() const { return (_unfused_subgraph != nullptr); }
bool has_node() const { return _node != nullptr; }
void allocate_internal_buffers();
static memory::ptr allocate_output(engine& engine, memory_pool& pool, const program_node& _node,

View File

@ -56,7 +56,6 @@ struct program_node {
friend class prepare_conv_eltw_fusing; // to be removed when possible
friend class prepare_conv_eltw_read_write_opt; // to be removed when possible
friend class propagate_constants; // to be removed when possible
friend class post_optimize_weights; // to be removed when possible - requires an access to selected_impl
template <class PType>
friend struct typed_program_node;

View File

@ -77,7 +77,9 @@ public:
static std::string to_string(reorder_node const& node);
public:
typed_primitive_inst(network& network);
typed_primitive_inst(network& network, reorder_node const& node);
memory::ptr mean_nv12_memory() const { return dep_memory_ptr(2); }
memory::ptr mean_memory() const { return dep_memory_ptr(1); }

View File

@ -158,22 +158,18 @@ std::pair<std::shared_ptr<reorder>, bool> reorder_factory::get_reorder(primitive
std::pair<std::shared_ptr<primitive>, bool> reorder_factory::get_weights_reorder(primitive_id input_id,
std::shared_ptr<WeightsReorderParams> reorder_params) {
if (reorder_params == nullptr)
return {};
OPENVINO_ASSERT(reorder_params != nullptr, "[GPU] WeightsReorderParams is not initialized.");
layout expected_layout = reorder_params->get_output_layout();
cache_key ckey{ input_id, expected_layout, false };
auto itr = _cached_generic_reorders.find(ckey);
if (itr != _cached_generic_reorders.end()) {
cache_key ckey{ input_id, reorder_params->get_output_layout(), false };
auto itr = _cached_reorders.find(ckey);
if (itr != _cached_reorders.end()) {
return std::make_pair(itr->second, true);
} else {
auto count = _cached_generic_reorders.size();
std::stringstream ss;
ss << input_id << "_generic_layer_" << count;
auto count = _cached_reorders.size();
std::string reorder_id = input_id + "_weights_reorder_" + std::to_string(count);
auto reorder = std::make_shared<cldnn::generic_layer>(ss.str(), input_id, reorder_params);
_cached_generic_reorders[ckey] = reorder;
auto reorder = std::make_shared<cldnn::reorder>(reorder_id, input_id, reorder_params);
_cached_reorders[ckey] = reorder;
return std::make_pair(reorder, false);
}
}
@ -941,8 +937,8 @@ bool layout_optimizer::deps_for_convolution_byxf_opt(program_node const& node, u
return true;
for (auto& dep : node.get_dependencies()) {
// skip data and generic_layers
if (dep.first->is_type<data>() || dep.first->is_type<generic_layer>())
// skip data layers
if (dep.first->is_type<data>())
continue;
if (dep.first->is_type<convolution>()) {

View File

@ -5,7 +5,7 @@
#include "primitive_inst.h"
#include "data_inst.h"
#include "mutable_data_inst.h"
#include "generic_layer_inst.h"
#include "reorder_inst.h"
#include "input_layout_inst.h"
#include "arg_max_min_inst.h"
#include "fully_connected_inst.h"
@ -115,7 +115,7 @@ std::shared_ptr<kernel_impl_params> primitive_impl::get_weights_reorder_kernel_p
return nullptr;
auto reorder_kernel_params = std::make_shared<kernel_impl_params>();
auto prim = std::make_shared<generic_layer>("", "", _weights_reorder_params);
auto prim = std::make_shared<reorder>("", input_info(), _weights_reorder_params);
reorder_kernel_params->desc = prim;
reorder_kernel_params->unique_id = _weights_reorder_params->hash();
reorder_kernel_params->input_layouts.push_back(_weights_reorder_params->get_input_layout());
@ -955,6 +955,9 @@ event::ptr primitive_inst::update_weights() {
auto& engine = _network.get_engine();
auto reorder_kernel_params = _impl->get_weights_reorder_kernel_params();
if (reorder_kernel_params)
reorder_kernel_params->prog = get_network().get_program().get();
auto weights_idx = _node->get_primitive()->input.size();
auto original_weights_memory = dep_memory_ptr(weights_idx);
auto original_layout = original_weights_memory->get_layout();
@ -983,7 +986,7 @@ event::ptr primitive_inst::update_weights() {
} else {
GPU_DEBUG_PROFILED_STAGE_CACHE_HIT(false);
auto& cache = get_network().get_program()->get_implementations_cache();
auto reorder_inst = std::make_shared<generic_layer_inst>(get_network());
auto reorder_inst = std::make_shared<cldnn::reorder_inst>(get_network());
if (auto cached_impl = cache.get(*reorder_kernel_params)) {
GPU_DEBUG_TRACE_DETAIL << id() << ": reorder weights (cached) from " << original_layout.to_short_string()
@ -996,7 +999,7 @@ event::ptr primitive_inst::update_weights() {
auto factory = WeightsReordersFactory::get(impl_types::ocl, shape_types::static_shape);
auto reorder_impl = factory(*reorder_kernel_params);
auto& kernels_cache = get_network().get_program()->get_kernels_cache();
auto kernels = kernels_cache.compile(*_impl_params, reorder_impl->get_kernels_source());
auto kernels = kernels_cache.compile(*reorder_kernel_params, reorder_impl->get_kernels_source());
OPENVINO_ASSERT(kernels.size() == 1, "[GPU] Expected number of compiled kernels is 1, but got ", kernels.size());
reorder_impl->set_kernels(kernels);
@ -1106,10 +1109,11 @@ memory::ptr primitive_inst::allocate_output(engine& _engine, memory_pool& pool,
: !usm_device_allocatable ? lockable_mem_type : allocation_type::usm_device;
if (is_internal) {
if (_node.can_be_optimized() || _node.is_type<generic_layer>()) {
bool is_reorder_weights = _node.is_type<reorder>() && _node.as<reorder>().get_primitive()->weights_reorder_params;
if (_node.can_be_optimized() || is_reorder_weights) {
GPU_DEBUG_LOG << "[" << _node.id() << ": output]" << std::endl;
// Use usm_device memory for weights reordering
if (is_internal && _node.is_type<generic_layer>() &&
if (is_internal && is_reorder_weights &&
_engine.supports_allocation(allocation_type::usm_device))
alloc_type = allocation_type::usm_device;
return get_memory_from_pool(_engine,
@ -1120,7 +1124,7 @@ memory::ptr primitive_inst::allocate_output(engine& _engine, memory_pool& pool,
false,
reset);
} else {
if ((_node.is_output() && _node.is_type<generic_layer>()) || (!_node.is_output() && _node.is_type<input_layout>()))
if ((_node.is_output() && is_reorder_weights) || (!_node.is_output() && _node.is_type<input_layout>()))
reset = false;
GPU_DEBUG_LOG << "[" << _node.id() << ": constant]" << std::endl;
return _engine.allocate_memory(layout, alloc_type, reset);

View File

@ -1469,7 +1469,6 @@ void program::set_layout_optimizer_attributes(layout_optimizer& lo) {
prim.type() != cldnn::grid_sample::type_id() &&
prim.type() != cldnn::softmax::type_id() &&
prim.type() != cldnn::fully_connected::type_id() &&
prim.type() != cldnn::generic_layer::type_id() &&
prim.type() != cldnn::scatter_nd_update::type_id() &&
prim.type() != cldnn::broadcast::type_id() &&
prim.type() != cldnn::quantize::type_id() &&
@ -1608,10 +1607,7 @@ std::pair<int64_t, int64_t> program::get_estimated_device_mem_usage() {
if (node->can_be_optimized())
continue;
if (node->is_type<data>() && node->get_users().size() == 1 && node->have_user_with_type<generic_layer>()) {
continue;
}
if (node->is_type<data>() || (node->is_type<generic_layer>() && node->get_dependency(0).is_type<data>())) {
if (node->is_type<data>()) {
const_sum += out_size;
} else if (node->have_user_with_type<concatenation>() && node->get_users().size() == 1 && node->get_users().front()->can_be_optimized()) {
continue;

View File

@ -19,7 +19,7 @@ layout reorder_inst::calc_output_layout(reorder_node const& node, kernel_impl_pa
auto ifmt = input_layout.format;
auto desc = impl_param.typed_desc<reorder>();
auto odt = *desc->output_data_types[0];
auto odt = desc->output_data_types[0].value_or(input_layout.data_type);
auto ofmt = desc->output_format;
auto op = desc->output_paddings[0];
@ -146,7 +146,11 @@ layout reorder_inst::calc_output_layout(reorder_node const& node, kernel_impl_pa
"Conversion of weights from winograd to standard domain is currently unsupported");
}
if ((ofmt == format::bs_fs_fsv8_bsv8 || ofmt == format::os_i_osv8__ai8 || ofmt == format::os_i_osv16__ai8 || ofmt == format::bs_f_bsv16 ||
if (desc->weights_reorder_params) {
return desc->weights_reorder_params->get_output_layout();
}
if ((ofmt == format::bs_fs_fsv8_bsv8 || ofmt == format::os_i_osv8__ai8 || ofmt == format::os_i_osv16__ai8 || ofmt == format::os_i_osv16 ||
ofmt == format::bfzyx || ifmt == format::bfzyx || ofmt == format::b_fs_zyx_fsv16 || ifmt == format::b_fs_zyx_fsv16 ||
ofmt == format::bs_fs_zyx_bsv16_fsv16 || ifmt == format::bs_fs_zyx_bsv16_fsv16 ||
ofmt == format::bs_fs_zyx_bsv16_fsv32 || ifmt == format::bs_fs_zyx_bsv16_fsv32 ||
@ -169,7 +173,11 @@ std::vector<layout> reorder_inst::calc_output_layouts(reorder_node const& /*node
auto ifmt = input_layout.format;
auto ofmt = desc->output_format == format::any ? ifmt : desc->output_format;
return { layout(input_layout.get<ShapeType>(), desc->output_data_types[0].value(), ofmt, desc->output_paddings[0]) };
if (desc->weights_reorder_params) {
return { desc->weights_reorder_params->get_output_layout() };
} else {
return { layout(input_layout.get<ShapeType>(), desc->output_data_types[0].value(), ofmt, desc->output_paddings[0]) };
}
}
std::string reorder_inst::to_string(reorder_node const& node) {
@ -197,6 +205,10 @@ std::string reorder_inst::to_string(reorder_node const& node) {
return primitive_description.str();
}
reorder_inst::typed_primitive_inst(network& network) : parent(network) {
_type = reorder::type_id();
}
reorder_inst::typed_primitive_inst(network& network, reorder_node const& node)
: parent(network, node, (!node.can_be_optimized() && node.get_output_layout().is_static()) ? true : false)
, _req_reinterpr(node.requires_reinterpret()) {

View File

@ -67,22 +67,14 @@ struct clKernelData {
bool skip_execution = false;
};
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// GenericKernelParams
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
struct GenericKernelParams {
enum class Engine { NONE, GPU };
Engine engine = Engine::NONE;
std::shared_ptr<clKernelData> clKernel;
};
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// WeightsReorderParams
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
struct WeightsReorderParams : public GenericKernelParams {
struct WeightsReorderParams {
WeightsTensor src;
WeightsTensor dest;
bool rotate;
bool is_initialized = false;
};
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

View File

@ -4,6 +4,7 @@
#include "kernel_selector_utils.h"
#include "reorder/reorder_weights_kernel_selector.h"
#include "reorder/reorder_kernel_selector.h"
#include "reorder/reorder_kernel_base.h"
#include "convolution/convolution_params.h"
#include <vector>
@ -110,31 +111,10 @@ bool UpdateWeightsParams(weight_bias_params& newParams,
if (!optParams.allowStaticInputReordering) {
return false;
}
auto& reorderKS = ReorderWeightsKernelSelctor::Instance();
reorder_weights_params r_params;
r_params.layerID = newParams.layerID + "_reorder_";
r_params.input = newParams.weights;
r_params.output = newParams.weights.TransformIgnorePadding(reqLayout, dtype, groups, false);
r_params.rotate_180 = rotate;
r_params.engineInfo = newParams.engineInfo;
r_params.uniqueID = newParams.uniqueID + "_weight";
reorder_optional_params op;
KernelsData kernels_data = reorderKS.GetBestKernels(r_params, op);
if (kernels_data.empty()) {
throw std::runtime_error("No suitable kernel found for weights reorder from " +
toString(r_params.input.GetLayout()) + " to " +
toString(r_params.output.GetLayout()) +
(rotate ? " with rotate" : ""));
}
weightsReorderParams.engine = WeightsReorderParams::Engine::GPU;
weightsReorderParams.clKernel = std::make_shared<clKernelData>(kernels_data[0].kernels[0]);
weightsReorderParams.src = r_params.input;
weightsReorderParams.dest = r_params.output;
weightsReorderParams.is_initialized = true;
weightsReorderParams.src = newParams.weights;
weightsReorderParams.dest = newParams.weights.TransformIgnorePadding(reqLayout, dtype, groups, false);
weightsReorderParams.rotate = rotate;
newParams.weights = newParams.weights.TransformIgnorePadding(reqLayout, dtype, groups);
return true;

View File

@ -13,7 +13,7 @@
namespace kernel_selector {
ReorderWeightsKernelSelctor::ReorderWeightsKernelSelctor() {
ReorderWeightsKernelSelector::ReorderWeightsKernelSelector() {
Attach<ReorderWeightsKernel>();
Attach<ReorderWeightsWinograd2x3Kernel>();
Attach<ReorderWeightsWinograd6x3Kernel>();
@ -23,7 +23,7 @@ ReorderWeightsKernelSelctor::ReorderWeightsKernelSelctor() {
Attach<ReorderWeightsOpt>();
}
KernelsData ReorderWeightsKernelSelctor::GetBestKernels(const Params& params, const optional_params& options) const {
KernelsData ReorderWeightsKernelSelector::GetBestKernels(const Params& params, const optional_params& options) const {
return GetNaiveBestKernel(params, options, KernelType::REORDER);
}
} // namespace kernel_selector

View File

@ -7,16 +7,16 @@
#include "kernel_selector.h"
namespace kernel_selector {
class ReorderWeightsKernelSelctor : public kernel_selector_base {
class ReorderWeightsKernelSelector : public kernel_selector_base {
public:
static ReorderWeightsKernelSelctor& Instance() {
static ReorderWeightsKernelSelctor instance_;
static ReorderWeightsKernelSelector& Instance() {
static ReorderWeightsKernelSelector instance_;
return instance_;
}
ReorderWeightsKernelSelctor();
ReorderWeightsKernelSelector();
virtual ~ReorderWeightsKernelSelctor() {}
virtual ~ReorderWeightsKernelSelector() {}
KernelsData GetBestKernels(const Params& params, const optional_params& options) const override;
};

View File

@ -109,7 +109,7 @@ enum WeightsLayout {
os_i_osv16__ai8,
os_i_osv16,
os_is_yx_osv16_isv2,
os_is_yx_osv16_isv16, // wieghts for int8 blocked conv
os_is_yx_osv16_isv16, // weights for int8 blocked conv
os_is_zyx_osv16_isv16,
os_is_zyx_osv32_isv16,
os_is_zyx_osv64_isv16,

View File

@ -152,6 +152,7 @@ static const std::map<format::type, format_traits> format_traits_map {
FMT_TRAITS(iy_xs_os_xsv2_osv8__ao32, 1, 1, 2, 0, {1, 2, 3, 0}, "iyxo", "oixy", {{2, 2}, {0, 8}}),
FMT_TRAITS(iy_xs_os_xsv2_osv16__ao32, 1, 1, 2, 0, {1, 2, 3, 0}, "iyxo", "oixy", {{2, 2}, {0, 16}}),
FMT_TRAITS(os_i_yxs_osv4_yxsv4, 1, 1, 2, 0, {0, 1, 2, 3}, "oiyx", "oixy", {{0, 4}}),
FMT_TRAITS(os_i_osv16, 1, 1, 0, 0, {0, 1}, "oi", "oi", {{0, 16}}),
FMT_TRAITS(os_i_osv16__ai8, 1, 1, 0, 0, {0, 1}, "oi", "oi", {{1, 8}, {0, 16}}),
FMT_TRAITS(os_i_osv8__ai8, 1, 1, 0, 0, {0, 1}, "oi", "oi", {{1, 8}, {0, 8}}),
FMT_TRAITS(os_y_is_x_osv8_isv2, 1, 1, 2, 0, {0, 2, 1, 3}, "oyix", "oixy", {{0, 8}, {1, 2}}),

View File

@ -10,7 +10,6 @@
#include <intel_gpu/primitives/fully_connected.hpp>
#include <intel_gpu/primitives/gather.hpp>
#include <intel_gpu/primitives/permute.hpp>
#include <intel_gpu/primitives/generic_layer.hpp>
using namespace cldnn;
using namespace ::tests;
@ -111,18 +110,20 @@ TEST(primitive_comparison, permute) {
ASSERT_NE(permute_prim, permute_prim_order);
}
TEST(primitive_comparison, generic_layer) {
TEST(primitive_comparison, reorder_weights) {
auto shape = ov::PartialShape{1, 2, 3, 4};
auto data_type = data_types::f32;
auto format_in = format::bfyx;
auto format_out = format::os_iyx_osv16;
auto input_layout = layout{shape, data_type, format_in};
auto output_layout = layout{shape, data_type, format_out};
auto generic_layer_prim = generic_layer("generic_layer", "", std::make_shared<WeightsReorderParams>(input_layout, output_layout));
auto generic_layer_eq_prim = generic_layer("generic_layer_eq", "", std::make_shared<WeightsReorderParams>(input_layout, output_layout));
auto generic_layer_different_prim = generic_layer("generic_layer", "", std::make_shared<WeightsReorderParams>(output_layout, input_layout));
auto format_osv16 = format::os_iyx_osv16;
auto format_osv32 = format::os_iyx_osv32;
ASSERT_EQ(generic_layer_prim, generic_layer_eq_prim);
ASSERT_NE(generic_layer_prim, generic_layer_different_prim);
auto layout_osv16 = layout{shape, data_type, format_osv16};
auto layout_osv32 = layout{shape, data_type, format_osv32};
auto reorder_weights_prim = reorder("reorder_weights", input_info("input"), layout_osv16);
auto reorder_weights_eq_prim = reorder("reorder_weights_eq", input_info("input"), layout_osv16);
auto reorder_weights_diff_prim = reorder("reorder_weights_neq", input_info("input"), layout_osv32);
ASSERT_EQ(reorder_weights_prim, reorder_weights_eq_prim);
ASSERT_NE(reorder_weights_prim, reorder_weights_diff_prim);
}

View File

@ -9,7 +9,7 @@
#include "intel_gpu/primitives/input_layout.hpp"
#include "intel_gpu/primitives/data.hpp"
#include "generic_layer_inst.h"
#include "reorder_inst.h"
#include "fully_connected_inst.h"
#include "implementation_map.hpp"
#include "graph/impls/ocl/register.hpp"
@ -67,10 +67,11 @@ TEST(weights_factory, reorder_test) {
// Constuct kernel_impl_params for weights reorder based requested WeightsReorderParams
auto reorder_kernel_params = std::make_shared<kernel_impl_params>();
reorder_kernel_params->desc = std::make_shared<generic_layer>("weights_reorder", "", weights_reorder_params);
reorder_kernel_params->desc = std::make_shared<reorder>("weights_reorder", input_info(), weights_reorder_params);
reorder_kernel_params->unique_id = weights_reorder_params->hash();
reorder_kernel_params->input_layouts.push_back(weights_reorder_params->get_input_layout());
reorder_kernel_params->output_layouts.push_back(weights_reorder_params->get_output_layout());
reorder_kernel_params->prog = network.get_program().get();
// Create new generic_layer_impl
auto factory = WeightsReordersFactory::get(impl_types::ocl, shape_types::static_shape);
@ -91,7 +92,7 @@ TEST(weights_factory, reorder_test) {
args.inputs.push_back(weights_data_input);
args.outputs.push_back(weights_data_output);
auto reorder_inst = std::make_shared<generic_layer_inst>(network);
auto reorder_inst = std::make_shared<cldnn::reorder_inst>(network);
reorder_inst->set_impl(reorder_impl->clone());
reorder_inst->get_impl()->set_arguments(*reorder_inst, args);

View File

@ -0,0 +1,87 @@
// Copyright (C) 2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include "test_utils.h"
#include "program_wrapper.h"
#include "fully_connected_inst.h"
using namespace cldnn;
using namespace ::tests;
// Checks that post_optimize_weights fuses a plain data-type reorder on the
// FC weights input into a weights reorder: the node must survive the pass
// and its output layout must become a weights format.
TEST(post_optimize_weights, fuse_reorder_to_weights_reorder_test) {
    auto& engine = get_test_engine();

    auto input_mem   = engine.allocate_memory({ { 2, 32 }, data_types::f16, format::bfyx });
    auto weights_mem = engine.allocate_memory({ { 2, 32 }, data_types::f32, format::bfyx });

    topology topo(
        input_layout("input", input_mem->get_layout()),
        input_layout("weights", weights_mem->get_layout()),
        reorder("reorder_dt", input_info("weights"), format::bfyx, data_types::f16),
        fully_connected("fc", input_info("input"), { "reorder_dt" }, "", data_types::f16)
    );

    ExecutionConfig config = get_test_default_config(engine);
    config.set_property(ov::intel_gpu::optimize_data(true));
    config.set_property(ov::intel_gpu::allow_new_shape_infer(true));

    auto prog = program::build_program(engine, topo, config, false, true);

    reorder_factory rf;
    program_wrapper::apply_opt_pass<compile_graph>(*prog);
    program_wrapper::apply_opt_pass<post_optimize_weights>(*prog, rf);

    // The reorder node is kept, but now produces a weights layout.
    ASSERT_TRUE(has_node(*prog, "reorder_dt"));
    ASSERT_TRUE(format::is_weights_format(prog->get_node("reorder_dt").get_output_layout().format));
}
// Verifies that the weights reorder inserted by post_optimize_weights for a
// constant weights input is constant-folded by propagate_constants into a
// data node whose memory already holds the reordered (os_iyx_osv16) weights.
TEST(post_optimize_weights, weights_reorder_constant_folding_test) {
    auto& engine = get_test_engine();

    ov::Shape pshape = { 4, 16 };
    auto input   = engine.allocate_memory({ pshape, data_types::f32, format::bfyx });
    auto weights = engine.allocate_memory({ pshape, data_types::f32, format::bfyx });

    // Fill the weights with 0, 1, 2, ... so each source element is unique.
    std::vector<float> weights_data(pshape[0] * pshape[1]);
    std::iota(weights_data.begin(), weights_data.end(), 0.f);
    set_values(weights, weights_data);

    topology topo(
        input_layout("input", input->get_layout()),
        data("weights", weights),
        fully_connected("fc", input_info("input"), { "weights" })
    );

    ExecutionConfig config = get_test_default_config(engine);
    config.set_property(ov::intel_gpu::optimize_data(true));
    config.set_property(ov::intel_gpu::allow_new_shape_infer(true));

    auto prog = program::build_program(engine, topo, config, false, true);

    reorder_factory rf;
    program_wrapper::apply_opt_pass<compile_graph>(*prog);
    program_wrapper::apply_opt_pass<post_optimize_weights>(*prog, rf);
    program_wrapper::apply_opt_pass<propagate_constants>(*prog);

    // After folding, the generated reorder must be a constant data node.
    ASSERT_TRUE(has_node(*prog, "weights_weights_reorder_0"));
    auto& weights_node = prog->get_node("weights_weights_reorder_0");
    ASSERT_TRUE(weights_node.is_type<data>());

    // os_iyx_osv16: the output-channel dim is padded up to a multiple of 16.
    // With b = 4 (< one osv slice), source element (o, i) is expected at
    // offset i * 16 + o; padded slots stay zero.
    const size_t align = 16;
    const size_t aligned_b_size = (pshape[0] + align - 1) / align * align;

    std::vector<float> expected(aligned_b_size * pshape[1], 0.f);
    size_t src_idx = 0;
    for (size_t o = 0; o < pshape[0]; ++o) {
        for (size_t i = 0; i < pshape[1]; ++i) {
            expected[i * align + o] = weights_data[src_idx++];
        }
    }

    auto weights_mem_ptr = weights_node.as<data>().get_attached_memory_ptr();
    cldnn::mem_lock<float, mem_lock_type::read> weights_mem(weights_mem_ptr, get_test_stream());
    for (size_t idx = 0; idx < expected.size(); ++idx) {
        ASSERT_EQ(weights_mem[idx], expected[idx]);
    }
}