[GPU] Reorder weights refactoring (#17787)

* [GPU] Fix DG2 with weights optimization
* [GPU] Fix DG2 with weights optimization
* [GPU] Fix DG2 with weights optimization
* [GPU] Fix DG2 with weights optimization
* [GPU] Fix inner order description for some of formats
* [GPU] Fix expected number of primitives in test

---------

Co-authored-by: Roman Lyamin <Roman.Lyamin@intel.com>
Co-authored-by: Sergey Shlyapnikov <sergey.shlyapnikov@intel.com>
parent d363660e67
commit 9d28dfd79d
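In short, this refactoring drops the dedicated generic_layer primitive and expresses weights reorders as ordinary reorder primitives parameterized by a WeightsReorderParams descriptor. A minimal sketch of the new construction path (C++, based on the constructors visible in the diff below; the layout values and primitive ids are placeholders, not taken from the commit):

    // Describe the weights transformation: source layout, target layout,
    // whether the weights must be transposed and whether they are grouped.
    auto w_params = std::make_shared<cldnn::WeightsReorderParams>(
        src_layout,   // placeholder: layout the weights currently have
        dst_layout,   // placeholder: layout required by the selected kernel
        false,        // transposed
        false);       // grouped

    // A weights reorder is now a plain reorder primitive instead of a generic_layer:
    cldnn::reorder w_reorder("conv_weights_reorder", cldnn::input_info("conv_weights"), w_params);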
@@ -136,7 +136,7 @@ virtual primitive_type_id type() const { return desc->type; }
void save(BinaryOutputBuffer& ob) const;
void load(BinaryInputBuffer& ib);
const program& get_program() const {
OPENVINO_ASSERT(prog != nullptr, "[GPU] Program pointer in kernel_impl_params in not initialized");
OPENVINO_ASSERT(prog != nullptr, "[GPU] Program pointer in kernel_impl_params is not initialized");
return *prog;
}
stream& get_stream() const { return *strm; }
@@ -45,6 +45,7 @@ struct program {
friend class prepare_conv_eltw_fusing; // to be removed when possible
friend class reorder_inputs; // to be removed when possible
friend class remove_redundant_reorders; // to be removed when possible
friend class post_optimize_weights; // to be removed when possible
friend class program_wrapper; // this class is intended to extend the interface of program for
// the usage within tests_core_internal project only
friend class prepare_primitive_fusing_through; // to be removed when possible
@@ -1,104 +0,0 @@
// Copyright (C) 2018-2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma once
#include "intel_gpu/primitives/primitive.hpp"
#include "intel_gpu/runtime/memory.hpp"

#include <vector>

namespace cldnn {

struct WeightsReorderParams {
WeightsReorderParams(layout in_layout, layout out_layout) : _in_layout(in_layout), _out_layout(out_layout) {}

virtual size_t hash() const {
return hash_combine(_in_layout.hash(), _out_layout.hash());
}

virtual bool operator==(const WeightsReorderParams& rhs) const {
if (typeid(*this) != typeid(rhs))
return false;

return _in_layout == rhs._in_layout &&
_out_layout == rhs._out_layout;
}

layout get_input_layout() const { return _in_layout; }
layout get_output_layout() const { return _out_layout; }

virtual ~WeightsReorderParams() = default;

protected:
layout _in_layout;
layout _out_layout;
};

/// @brief Changes how data is ordered in memory. Value type is not changed & all information is preserved.
/// @details Corresponding values are bitwise equal before/after reorder.
struct generic_layer : public primitive_base<generic_layer> {
CLDNN_DECLARE_PRIMITIVE(generic_layer)

generic_layer() : primitive_base("", {}) {}

DECLARE_OBJECT_TYPE_SERIALIZATION

/// @brief Constructs generic_layer primitive which takes mean subtract values from another primitive.
/// @param id This primitive id.
/// @param input Input primitive id.
/// @param output_layout Requested memory layout.
/// @param mean Primitive id to get mean subtract values.
generic_layer(const primitive_id& id,
const primitive_id& input,
std::shared_ptr<WeightsReorderParams> params,
const padding& output_padding = padding())
: primitive_base(id, {input}, {output_padding}), params(params) {}

std::shared_ptr<WeightsReorderParams> params;

size_t hash() const override {
size_t seed = primitive::hash();

if (params)
seed = hash_combine(seed, params->hash());

return seed;
}

bool operator==(const primitive& rhs) const override {
if (!compare_common_params(rhs))
return false;

auto rhs_casted = downcast<const generic_layer>(rhs);

if ((params == nullptr) != (rhs_casted.params == nullptr))
return false;

if (params != nullptr)
return *params == *rhs_casted.params;

return true;
}

void save(BinaryOutputBuffer& ob) const override {
primitive_base<generic_layer>::save(ob);
ob << params->get_input_layout();
ob << params->get_output_layout();
}

void load(BinaryInputBuffer& ib) override {
primitive_base<generic_layer>::load(ib);
layout input_layout, output_layout;
ib >> input_layout;
ib >> output_layout;
params = std::make_shared<WeightsReorderParams>(input_layout, output_layout);
}

protected:
std::vector<std::reference_wrapper<const primitive_id>> get_dependencies() const override { return {}; }
};
/// @}
/// @}
/// @}
} // namespace cldnn
@@ -19,6 +19,44 @@ enum class reorder_mean_mode {
div, // val/mean
};

struct WeightsReorderParams {
WeightsReorderParams(const layout& in_layout, const layout& out_layout, bool transposed, bool grouped = false)
: _in_layout(in_layout),
_out_layout(out_layout),
_transposed(transposed),
_grouped(grouped) {}

size_t hash() const {
size_t seed = hash_combine(_in_layout.hash(), _out_layout.hash());
seed = hash_combine(seed, _transposed);
seed = hash_combine(seed, _grouped);
return seed;
}

bool operator==(const WeightsReorderParams& rhs) const {
if (typeid(*this) != typeid(rhs))
return false;

return _in_layout == rhs._in_layout &&
_out_layout == rhs._out_layout &&
_transposed == rhs._transposed &&
_grouped == rhs._grouped;
}

layout get_input_layout() const { return _in_layout; }
layout get_output_layout() const { return _out_layout; }
bool should_be_transposed() const { return _transposed; }
bool get_grouped() const { return _grouped; }

void set_input_layout(const layout& layout) { _in_layout = layout; }

protected:
layout _in_layout;
layout _out_layout;
bool _transposed;
bool _grouped;
};

/// @brief Changes how data is ordered in memory. Value type is not changed & all information is preserved.
/// @details Corresponding values are bitwise equal before/after reorder.
/// Also merged with subtraction layer, which can subtract, multiply or divide values based on mean_mode value, while doing reordering.
@@ -144,16 +182,32 @@ struct reorder : public primitive_base<reorder> {
mean(mean),
mean_mode(mode) {}

/// @brief Constructs weights reorder primitive.
/// @param id This primitive id.
/// @param input Input primitive id.
/// @param weights_reorder_params Parameters required for reorder weights.
reorder(const primitive_id& id,
const input_info& input,
std::shared_ptr<WeightsReorderParams> weights_reorder_params)
: primitive_base(id, {input}),
output_format(weights_reorder_params->get_output_layout().format),
mean(""),
subtract_per_feature({}),
mean_mode(reorder_mean_mode::none),
weights_reorder_params(weights_reorder_params) {}

/// @brief Requested memory format.
format output_format;
/// @brief Primitive id to get mean subtract values. Ignored if subtract_per_featrue is set.
/// @brief Primitive id to get mean subtract values. Ignored if subtract_per_feature is set.
primitive_id mean;
/// @brief Array of mean subtract values.
std::vector<float> subtract_per_feature;
/// @brief Mode of mean execution
/// @brief Mode of mean execution.
reorder_mean_mode mean_mode;
/// @brief Input memory type
/// @brief Input memory type.
memory_type input_mem_type = memory_type::buffer;
/// @brief Parameters required for reorder weights.
std::shared_ptr<WeightsReorderParams> weights_reorder_params = {};

inline bool has_surface_input() const {
return input.size() == 1 &&
@@ -170,6 +224,10 @@ struct reorder : public primitive_base<reorder> {
seed = hash_combine(seed, truncate);
seed = hash_range(seed, subtract_per_feature.begin(), subtract_per_feature.end());
seed = hash_combine(seed, mean.empty());

if (weights_reorder_params) {
seed = hash_combine(seed, weights_reorder_params->hash());
}
return seed;
}

@@ -179,11 +237,18 @@ struct reorder : public primitive_base<reorder> {

auto rhs_casted = downcast<const reorder>(rhs);

bool reorder_weights_eq = (weights_reorder_params == nullptr) == (rhs_casted.weights_reorder_params == nullptr);
if (reorder_weights_eq && weights_reorder_params) {
reorder_weights_eq = *weights_reorder_params == *rhs_casted.weights_reorder_params;
}

return subtract_per_feature == rhs_casted.subtract_per_feature &&
mean_mode == rhs_casted.mean_mode &&
input_mem_type == rhs_casted.input_mem_type &&
truncate == rhs_casted.truncate &&
mean.empty() == rhs_casted.mean.empty();
output_format == rhs_casted.output_format &&
mean.empty() == rhs_casted.mean.empty() &&
reorder_weights_eq;
}

void save(BinaryOutputBuffer& ob) const override {
@@ -217,6 +217,7 @@ struct format {
iy_xs_os_xsv2_osv16__ao32,
i_yxs_os_yxsv2_osv16,
os_i_yxs_osv4_yxsv4,
os_i_osv16, ///< format used only for fully connected weights
os_i_osv16__ai8, ///< format used only for fully connected weights
os_i_osv8__ai8, ///< format used only for fully connected weights
os_y_is_x_osv8_isv2,
@@ -471,13 +471,12 @@ public:
* @endcode
*/
tensor transform(cldnn::format new_fmt, value_type default_size) const {
cldnn::format format = cldnn::format::bfvuwzyx;
auto val_order = format.internal_order();
cldnn::format default_fmt = cldnn::format::bfvuwzyx;
auto val_order = default_fmt.internal_order();
auto new_order = new_fmt.internal_order();
std::vector<value_type> old_sizes = sizes();
std::vector<value_type> new_sizes(old_sizes.size(), default_size);
const auto& new_traits = format::traits(new_fmt);
const cldnn::format default_fmt = cldnn::format::bfvuwzyx;
static const std::map<char, char> flatten_mapping = {
{ 'v', 'u'},
{ 'u', 'w'},
@@ -1,39 +0,0 @@
// Copyright (C) 2018-2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include "generic_layer_inst.h"
#include "primitive_type_base.h"

#include "json_object.h"

#include <algorithm>
#include <string>
#include <memory>

namespace cldnn {
GPU_DEFINE_PRIMITIVE_TYPE_ID(generic_layer)

generic_layer_node::typed_program_node(const std::shared_ptr<generic_layer> prim, program& prog)
: parent(prim, prog) {
can_share_buffer(false);
}

generic_layer_inst::typed_primitive_inst(network& network, generic_layer_node const& node)
: parent(network, node) {}

generic_layer_inst::typed_primitive_inst(network& network)
: parent(network) {
_type = generic_layer::type_id();
}

std::string generic_layer_inst::to_string(generic_layer_node const& node) {
auto node_info = node.desc_to_json();

std::stringstream primitive_description;

node_info->dump(primitive_description);

return primitive_description.str();
}

} // namespace cldnn
@@ -4,6 +4,8 @@

#include "pass_manager.h"
#include "program_helpers.h"
#include "implementation_map.hpp"

#include "convolution_inst.h"
#include "binary_convolution_inst.h"
#include "deconvolution_inst.h"
@@ -38,53 +40,82 @@ void post_optimize_weights::optimize_weights(T& node, program& p) {
if (impl->is_dynamic())
return;

auto output_layout = node.get_output_layout();
auto weights_reorder_params = impl->get_weights_reorder_params();
for (auto i = offsets.weights_offset; i < offsets.bias_offset; i++) {
auto& weights_node = node.get_dependency(i);

auto reorder = _rf.get_weights_reorder(weights_node.id(), weights_reorder_params);

if (reorder.first) {
// insert new generic_layer node to topology
p.add_intermediate(reorder.first, node, i, !reorder.second);
// set generic_layer's node output layout and implementation
auto& g_node = node.get_dependency(i);
g_node.get_output_layout(false);

// Don't run impl selection to avoid double compilation of reorder kernels
// in main program and internal program for constant propagation
if ((!g_node.is_constant()) && (!reorder.second)) {
g_node.set_selected_impl(g_node.type()->choose_impl(g_node));
if (auto impl = g_node.get_selected_impl()) {
auto params = g_node.get_kernel_impl_params();
auto set_implementation = [&p, &impl](program_node& weights_reorder_node) {
if (!weights_reorder_node.is_constant()) {
auto factory = WeightsReordersFactory::get(impl_types::ocl, shape_types::static_shape);
auto reorder_kernel_params = impl->get_weights_reorder_kernel_params();
reorder_kernel_params->prog = &p;
auto reorder_impl = factory(*reorder_kernel_params);

weights_reorder_node.set_selected_impl(reorder_impl->clone());
if (auto impl = weights_reorder_node.get_selected_impl()) {
auto params = weights_reorder_node.get_kernel_impl_params();
p.get_kernels_cache().add_kernels_source(*params, impl->get_kernels_source());
}
}
};

auto output_layout = node.get_output_layout();
auto weights_reorder_params = impl->get_weights_reorder_params();
for (auto i = offsets.weights_offset; i < offsets.bias_offset; i++) {
program_node& prev_node = node.get_dependency(i);

if (weights_reorder_params != nullptr) {
bool can_be_fused = prev_node.is_type<reorder>() &&
prev_node.get_users().size() == 1 &&
prev_node.get_dependencies().size() == 1 &&
!prev_node.has_fused_primitives() &&
!prev_node.as<reorder>().has_mean() &&
prev_node.as<reorder>().get_primitive()->subtract_per_feature.empty();
if (can_be_fused) {
// Need to update input data_type for correct merging format reorder with precision reorder
data_types input_dtype = prev_node.get_input_layouts()[0].data_type;
auto updated_input_layout = weights_reorder_params->get_input_layout();
updated_input_layout.data_type = input_dtype;
weights_reorder_params->set_input_layout(updated_input_layout);

auto weights_reorder = _rf.get_weights_reorder(prev_node.get_primitive()->input[0].pid,
weights_reorder_params);
auto& weights_reorder_node = p.get_or_create(weights_reorder.first);
p.replace(prev_node, weights_reorder_node);
weights_reorder_node.recalc_output_layout(false);

if (!weights_reorder.second) {
set_implementation(weights_reorder_node);
}
} else {
auto weights_reorder = _rf.get_weights_reorder(prev_node.id(), weights_reorder_params);
// insert new weights reorder node to topology
p.add_intermediate(weights_reorder.first, node, i, !weights_reorder.second);
// set weights reorder's node output layout and implementation
auto& weights_reorder_node = node.get_dependency(i);
weights_reorder_node.get_output_layout(false);

if (!weights_reorder.second) {
set_implementation(weights_reorder_node);
}
}
}
}

// Reset weights reorder params to not keep source code pointer
impl->reset_weights_reorder_params();

// set the old output layout and do not invalidate users as change of weights will not affect output layout
node.set_output_layout(output_layout, false);
}

void post_optimize_weights::run(program& p) {
for (auto& node : p.get_processing_order()) {
if (node->type() == convolution::type_id()) {
if (node->is_type<convolution>()) {
optimize_weights(node->as<convolution>(), p);
}
if (node->type() == binary_convolution::type_id()) {
} else if (node->is_type<binary_convolution>()) {
optimize_weights(node->as<binary_convolution>(), p);
} else if (node->type() == deconvolution::type_id()) {
} else if (node->is_type<deconvolution>()) {
optimize_weights(node->as<deconvolution>(), p);
} else if (node->type() == deformable_conv::type_id()) {
} else if (node->is_type<deformable_conv>()) {
optimize_weights(node->as<deformable_conv>(), p);
} else if (node->type() == fully_connected::type_id()) {
} else if (node->is_type<fully_connected>()) {
optimize_weights(node->as<fully_connected>(), p);
} else if (node->type() == lstm_dynamic_input::type_id()) {
} else if (node->is_type<lstm_dynamic_input>()) {
optimize_weights(node->as<lstm_dynamic_input>(), p);
}
}
@@ -25,12 +25,10 @@ void pre_replace_deconv::run(program& p) {
while (itr != p.nodes_map.end()) {
auto node_itr = itr++;
auto& node = (*node_itr).second;
// find deconvolution primitives with stride 1 and change them to convolution with trasposed weights
// find deconvolution primitives with stride 1 and change them to convolution with transposed weights
if (node->is_type<deconvolution>()) {
if (node->is_dynamic())
continue;
if (!p.get_config().get_property(ov::intel_gpu::optimize_data))
continue;

auto& deconv_node = node->as<deconvolution>();
auto& weights_node = deconv_node.weights();
@@ -61,7 +59,6 @@ void pre_replace_deconv::run(program& p) {
if (!perform_opt)
continue;

// setting convolution parameters based on deconvolution params
auto output_layout = deconv_node.get_output_layout();
auto output_pshape = output_layout.get_partial_shape();
@@ -73,8 +70,7 @@ void pre_replace_deconv::run(program& p) {
auto output_padding = deconv_prim->output_paddings[0];
auto grouped_weights_shape = deconv_prim->grouped_weights_shape;

// remove deconvolution node and its connections to weights and biases, rename it and move to the optimized
// list
// remove deconvolution node and its connections to weights and biases, rename it and move to the optimized list
p.remove_connection(input_node, deconv_node);
std::vector<std::shared_ptr<program_node>> weight_connections;
for (auto& weights_id : weights_nodes_id) {
@@ -166,7 +166,7 @@ void propagate_constants::add_deps_to_tpl(program& prog, const std::vector<std::
if (dep.first->is_type<data>()) {
auto dep_ptr = prog.get_node_ptr(dep.first->get_primitive()->id);
if (nodes.find(dep_ptr) == nodes.end()) {
nodes.insert(prog.get_node_ptr(dep.first->get_primitive()->id));
nodes.insert(dep_ptr);
const_inputs.push_back(&dep.first->as<data>());
}
}
@@ -275,7 +275,9 @@ void remove_redundant_reorders::run(program& p) {
!r_node.get_primitive()->subtract_per_feature.empty() ||
no_output_optimization ||
r_node.has_fused_primitives() ||
r_node.get_primitive()->has_surface_input())
r_node.get_primitive()->has_surface_input() ||
(r_node.get_primitive()->weights_reorder_params &&
r_node.get_primitive()->weights_reorder_params->should_be_transposed()))
continue;

auto o_layout = r_node.get_output_layout();
@@ -60,13 +60,14 @@ public:
feature = input0_pshape[primitive->input_size - 1ul];
}

// TO DO, to remove WA
if (primitive->input_size > 3) {
input0_layout.set_partial_shape(reshape_to_2d(input0_pshape, feature, primitive->input_size));
input0_layout.format = format::bfyx;
}
if (input1_pshape.size() != 2) {
input1_layout.set_partial_shape(reshape_to_2d(input1_pshape, feature, primitive->weights_rank));
input1_layout.format = format::bfyx;
// input1_layout.format = format::bfyx;
}

std::vector<layout> layouts{input0_layout, input1_layout};
@@ -1,137 +0,0 @@
// Copyright (C) 2018-2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#include "primitive_base.hpp"

#include "generic_layer_inst.h"

namespace cldnn {
namespace ocl {

struct generic_layer_impl : typed_primitive_impl<generic_layer> {
using parent = typed_primitive_impl<generic_layer>;
using parent::parent;

kernel_selector::cl_kernel_data _cl_kernel_data;
kernel::ptr _kernel;
kernel_id _cached_kernel_id;

DECLARE_OBJECT_TYPE_SERIALIZATION

std::unique_ptr<primitive_impl> clone() const override {
return make_unique<generic_layer_impl>(*this);
}

generic_layer_impl() : parent() {}

generic_layer_impl(const generic_layer_impl& other)
: _cl_kernel_data(other._cl_kernel_data)
, _kernel(nullptr)
, _cached_kernel_id(other._cached_kernel_id) {
OPENVINO_ASSERT(other._kernel, "[GPU] Can't copy generic_layer_impl node: kernel is nullptr");
_kernel = other._kernel->clone();
}

generic_layer_impl(const kernel_impl_params& params)
: _cl_kernel_data()
, _kernel(nullptr)
, _cached_kernel_id() {
auto reorder_params = params.typed_desc<generic_layer>()->params;
auto casted_params = std::dynamic_pointer_cast<WeightsReorderParamsOCL>(reorder_params);
OPENVINO_ASSERT(casted_params, "[GPU] Invalid weights reorder parameters type for ", params.desc->id, " node");
_cl_kernel_data = *casted_params->get_cl_kernel();
}

std::vector<std::shared_ptr<cldnn::kernel_string>> get_kernels_source() override {
std::vector<std::shared_ptr<cldnn::kernel_string>> kernel_strings;
kernel_strings.push_back(_cl_kernel_data.code.kernelString);
return kernel_strings;
}

std::vector<kernel::ptr> get_kernels() const override {
return {_kernel};
}

void save(BinaryOutputBuffer& ob) const override {
ob << _cl_kernel_data;
ob << _cached_kernel_id;
}

void load(BinaryInputBuffer& ib) override {
ib >> _cl_kernel_data;
ib >> _cached_kernel_id;
}

void init_kernels(const kernels_cache& kernels_cache, const kernel_impl_params& params) override {
_kernel = nullptr;
auto compiled_kernels = kernels_cache.get_kernels(params);
OPENVINO_ASSERT(compiled_kernels.size() == 1, "[GPU] Unexpected number of kernels for generic_layer during init_kernels() call");
_kernel = compiled_kernels.front();
}

void init_by_cached_kernels(const kernels_cache& kernels_cache) override {
_kernel = kernels_cache.get_kernel_from_cached_kernels(_cached_kernel_id);
}

void set_cached_kernel_ids(const kernels_cache& kernels_cache) override {
_cached_kernel_id = kernels_cache.get_cached_kernel_id(_kernel);
}

void set_kernels(cldnn::kernels_cache::compiled_kernels kernels) override {
OPENVINO_ASSERT(kernels.size() == 1 &&
kernels.begin()->second.size() == 1, "[GPU] Unexpected number of kernels for generic_layer");
_kernel = kernels.begin()->second[0].first;
}

void set_arguments_impl(generic_layer_inst& instance) override {
kernel_arguments_data args;
args.scalars = &_cl_kernel_data.params.scalars;

for (size_t i = 0; i < instance.inputs_memory_count(); i++) {
args.inputs.push_back(instance.input_memory_ptr(i));
}
args.outputs.push_back(instance.output_memory_ptr());

set_arguments_impl(instance, args);
}

void set_arguments_impl(generic_layer_inst& instance, kernel_arguments_data& args) override {
stream& stream = instance.get_network().get_stream();
stream.set_arguments(*_kernel, _cl_kernel_data.params, args);
}

event::ptr execute_impl(const std::vector<event::ptr>& events, generic_layer_inst& instance) override {
stream& stream = instance.get_network().get_stream();
kernel_arguments_data args;
args.scalars = &_cl_kernel_data.params.scalars;

for (size_t i = 0; i < instance.inputs_memory_count(); i++) {
args.inputs.push_back(instance.input_memory_ptr(i));
}
args.outputs.push_back(instance.output_memory_ptr());
return stream.enqueue_kernel(*_kernel, _cl_kernel_data.params, args, events, true);
}

static std::unique_ptr<primitive_impl> create(const kernel_impl_params& params) {
return make_unique<generic_layer_impl>(params);
}
};

static std::unique_ptr<primitive_impl> create(const generic_layer_node& arg, const kernel_impl_params& params) {
return make_unique<generic_layer_impl>(params);
}

namespace detail {
attach_generic_layer_impl::attach_generic_layer_impl() {
implementation_map<generic_layer>::add(cldnn::impl_types::ocl, create, {});

WeightsReordersFactory::add(cldnn::impl_types::ocl, shape_types::static_shape, generic_layer_impl::create);
}

} // namespace detail
} // namespace ocl
} // namespace cldnn

BIND_BINARY_BUFFER_WITH_TYPE(cldnn::ocl::generic_layer_impl)
BIND_BINARY_BUFFER_WITH_TYPE(cldnn::generic_layer)
@@ -387,6 +387,7 @@ kernel_selector::weights_layout to_weights_layout(format f, bool is_grouped) {
case format::iyxo:
case format::fyxb:
return kernel_selector::weights_layout::iyxo;
case format::oyxi:
case format::byxf:
return kernel_selector::weights_layout::oyxi;
case format::byfx:
@@ -408,6 +409,8 @@ kernel_selector::weights_layout to_weights_layout(format f, bool is_grouped) {
return kernel_selector::weights_layout::os_is_yx_osv16_isv16;
case format::os_iyx_osv32:
return kernel_selector::weights_layout::os_iyx_osv32;
case format::os_iyx_osv32__ai32:
return kernel_selector::weights_layout::os_iyx_osv32__ai32;
case format::os_iyx_osv64:
return kernel_selector::weights_layout::os_iyx_osv64;
case format::image_2d_weights_c4_fyx_b:
@@ -509,18 +512,26 @@ kernel_selector::weights_layout to_weights_layout(format f, bool is_grouped) {
return kernel_selector::weights_layout::os_i_osv8__ai8;
case format::os_i_osv16__ai8:
return kernel_selector::weights_layout::os_i_osv16__ai8;
case format::bs_f_bsv16:
case format::os_i_osv16:
return kernel_selector::weights_layout::os_i_osv16;
case format::os_is_zyx_isv16_osv16:
return kernel_selector::weights_layout::os_is_zyx_isv16_osv16;
case format::is_os_zyx_isv16_osv16:
return kernel_selector::weights_layout::is_os_zyx_isv16_osv16;
case format::os_is_zyx_osv32_isv16:
return kernel_selector::weights_layout::os_is_zyx_osv32_isv16;
case format::is_os_yx_isv16_osv16:
return kernel_selector::weights_layout::is_os_yx_isv16_osv16;
case format::is_os_yx_isv16_osv8:
return kernel_selector::weights_layout::is_os_yx_isv16_osv8;
case format::i_yxs_os_yxsv2_osv16:
return kernel_selector::weights_layout::i_yxs_os_yxsv2_osv16;
case format::is_os_yx_osa4_isa8_osv8_isv4:
return kernel_selector::weights_layout::is_os_yx_osa4_isa8_osv8_isv4;
case format::iy_xs_os_xsv2_osv8__ao32:
return kernel_selector::weights_layout::iy_xs_os_xsv2_osv8__ao32;
case format::iy_xs_os_xsv2_osv16__ao32:
return kernel_selector::weights_layout::iy_xs_os_xsv2_osv16__ao32;
case format::os_is_osv32_isv32_swizzled_by_4:
return kernel_selector::weights_layout::os_is_osv32_isv32_swizzled_by_4;
case format::os_is_zyx_isv8_osv16_isv2:
@@ -551,6 +562,12 @@ kernel_selector::weights_layout to_weights_layout(format f, bool is_grouped) {
return kernel_selector::weights_layout::gs_oizyx_gsv16;
case format::gs_oiyx_gsv32:
return kernel_selector::weights_layout::gs_oiyx_gsv32;
case format::gs_oi_yxs_gsv4_yxsv4:
return kernel_selector::weights_layout::gs_oi_yxs_gsv4_yxsv4;
case format::gs_oi_yxs_gsv16_yxsv4:
return kernel_selector::weights_layout::gs_oi_yxs_gsv16_yxsv4;
case format::gs_oi_yxs_gsv32_yxsv4:
return kernel_selector::weights_layout::gs_oi_yxs_gsv32_yxsv4;
case format::gs_oizyx_gsv32:
return kernel_selector::weights_layout::gs_oizyx_gsv32;
case format::gyxio:
@@ -647,6 +664,12 @@ kernel_selector::weights_layout to_weights_layout(format f, bool is_grouped) {
return kernel_selector::weights_layout::g_os_y_is_x_osv8_isv2;
case format::g_os_y_is_x_osv8_isv4:
return kernel_selector::weights_layout::g_os_y_is_x_osv8_isv4;
case format::g_os_is_yx_isv16_osv16:
return kernel_selector::weights_layout::g_os_is_yx_isv16_osv16;
case format::lstm_weights_dio:
return kernel_selector::weights_layout::dlstm_dir_io;
case format::os_i_yxs_osv4_yxsv4:
return kernel_selector::weights_layout::os_i_yxs_osv4_yxsv4;
default:
throw std::invalid_argument("Unable to convert tensor layout " + fmt_to_str(f) + " to weights layout");
}
@@ -686,7 +709,7 @@ cldnn::format::type from_weights_layout(kernel_selector::weights_layout l) {
case kernel_selector::weights_layout::os_iyx_osv64:
return cldnn::format::os_iyx_osv64;
case kernel_selector::weights_layout::os_i_osv16:
return cldnn::format::bs_f_bsv16;
return cldnn::format::os_i_osv16;
case kernel_selector::weights_layout::os_i_osv8__ai8:
return cldnn::format::os_i_osv8__ai8;
case kernel_selector::weights_layout::os_i_osv16__ai8:
@@ -775,6 +798,8 @@ cldnn::format::type from_weights_layout(kernel_selector::weights_layout l) {
return cldnn::format::os_is_yx_osv32_isv32p;
case kernel_selector::weights_layout::oizyx:
return cldnn::format::oizyx;
case kernel_selector::weights_layout::iozyx:
return cldnn::format::iozyx;
case kernel_selector::weights_layout::os_is_zyx_isv16_osv16:
return cldnn::format::os_is_zyx_isv16_osv16;
case kernel_selector::weights_layout::is_os_zyx_isv16_osv16:
@@ -939,6 +964,8 @@ cldnn::format::type from_weights_layout(kernel_selector::weights_layout l) {
return cldnn::format::g_os_y_is_x_osv8_isv2;
case kernel_selector::weights_layout::g_os_y_is_x_osv8_isv4:
return cldnn::format::g_os_y_is_x_osv8_isv4;
case kernel_selector::weights_layout::giozyx:
return cldnn::format::giozyx;
default:
throw std::invalid_argument("Unable to convert kernel selector Weights layout " +
std::to_string(static_cast<int>(l)) + " to cldnn format");
@@ -14,7 +14,7 @@
#include "intel_gpu/primitives/eltwise.hpp"
#include "intel_gpu/primitives/quantize.hpp"
#include "intel_gpu/primitives/activation.hpp"
#include "intel_gpu/primitives/generic_layer.hpp"
#include "intel_gpu/primitives/reorder.hpp"
#include "intel_gpu/primitives/primitive.hpp"

#include "kernel_selector_params.h"
@@ -80,7 +80,6 @@ using multi_data_tensor = kernel_selector::MultiDataTensor;

using params = kernel_selector::Params;
using weights_reorder_params = kernel_selector::WeightsReorderParams;
using generic_kernel_params = kernel_selector::GenericKernelParams;

} // namespace kernel_selector
@@ -272,106 +271,12 @@ inline kernel_impl_params canonicalize_fused_shapes(const kernel_impl_params& im
return updated_impl_params;
}

class WeightsReorderParamsOCL : public WeightsReorderParams {
public:
explicit WeightsReorderParamsOCL(const kernel_selector::WeightsReorderParams& params)
: WeightsReorderParams(from_weights_tensor(params.src), from_weights_tensor(params.dest)) {
cl_kernel = params.clKernel;
}

size_t hash() const override {
size_t seed = WeightsReorderParams::hash();

if (cl_kernel == nullptr)
return seed;

seed = hash_combine(seed, cl_kernel->skip_execution);

auto& gws = cl_kernel->params.workGroups.global;
seed = hash_range(seed, gws.begin(), gws.end());

auto& lws = cl_kernel->params.workGroups.local;
seed = hash_range(seed, lws.begin(), lws.end());

auto& arguments = cl_kernel->params.arguments;
for (auto& args : arguments) {
seed = hash_combine(seed, args.index);
seed = hash_combine(seed, args.t);
}

auto& scalars = cl_kernel->params.scalars;
for (auto& s : scalars) {
seed = hash_combine(seed, s.t);
}

return seed;
}

bool operator==(const WeightsReorderParams& rhs) const override {
if (typeid(*this) != typeid(rhs))
return false;

if (!WeightsReorderParams::operator==(rhs))
return false;

auto rhs_casted = downcast<const WeightsReorderParamsOCL>(rhs);

if (cl_kernel != nullptr && rhs_casted.cl_kernel != nullptr) {
auto& clKernel_rhs = rhs_casted.cl_kernel;
if (cl_kernel->skip_execution != clKernel_rhs->skip_execution)
return false;

auto& gws = cl_kernel->params.workGroups.global;
auto& gws_rhs = clKernel_rhs->params.workGroups.global;
if (gws != gws_rhs)
return false;

auto& lws = cl_kernel->params.workGroups.local;
auto& lws_rhs = clKernel_rhs->params.workGroups.local;
if (lws != lws_rhs)
return false;

auto& arguments = cl_kernel->params.arguments;
auto& arguments_rhs = clKernel_rhs->params.arguments;
if (arguments.size() != arguments_rhs.size())
return false;

for (size_t idx = 0; idx < arguments.size(); idx++) {
if (arguments[idx].index != arguments_rhs[idx].index)
return false;

if (arguments[idx].t != arguments_rhs[idx].t)
return false;
}

auto& scalars = cl_kernel->params.scalars;
auto& scalars_rhs = clKernel_rhs->params.scalars;
if (scalars.size() != scalars_rhs.size())
return false;

for (size_t idx = 0; idx < scalars.size(); idx++) {
if (scalars[idx].t != scalars_rhs[idx].t)
return false;
}
}

return true;
}

std::shared_ptr<kernel_selector::clKernelData> get_cl_kernel() {
return cl_kernel;
}

private:
std::shared_ptr<kernel_selector::clKernelData> cl_kernel;
};

inline std::shared_ptr<WeightsReorderParams> create_weights_reorder_params(const kernel_selector::WeightsReorderParams& params) {
if (params.engine == kernel_selector::generic_kernel_params::Engine::NONE) {
if (!params.is_initialized) {
return nullptr;
}

return std::make_shared<WeightsReorderParamsOCL>(params);
return std::make_shared<WeightsReorderParams>(from_weights_tensor(params.src), from_weights_tensor(params.dest), params.rotate);
}

} // namespace cldnn
@@ -39,10 +39,7 @@ struct typed_primitive_impl_ocl : public typed_primitive_impl<PType> {
// a pair of batch program hash and kernel entry hash of each ocl impl.
std::pair<std::string, std::string> kernel_dump_info;

typed_primitive_impl_ocl() : _kernel_data({}), _cached_kernel_ids({}), _kernels({}) {
_kernel_data.weightsReorderParams.engine = kernel_selector::generic_kernel_params::Engine::NONE;
_kernel_data.weightsReorderParams.clKernel = nullptr;
}
typed_primitive_impl_ocl() : _kernel_data({}), _cached_kernel_ids({}), _kernels({}) {}

typed_primitive_impl_ocl(const typed_primitive_impl_ocl<PType>& other)
: typed_primitive_impl<PType>(other._weights_reorder_params, other._kernel_name, other._is_dynamic)
@@ -59,10 +56,6 @@ struct typed_primitive_impl_ocl : public typed_primitive_impl<PType> {
typed_primitive_impl_ocl(const kernel_selector::kernel_data& kd)
: typed_primitive_impl<PType>(create_weights_reorder_params(kd.weightsReorderParams), kd.kernelName),
_kernel_data(kd) {
// weights reorder params got copied to parent, clear in _kernel_data to release shared ptr
_kernel_data.weightsReorderParams.engine = kernel_selector::generic_kernel_params::Engine::NONE;
_kernel_data.weightsReorderParams.clKernel = nullptr;

this->can_reuse_memory = _kernel_data.can_reuse_memory;
}
@@ -81,7 +81,6 @@ void register_implementations() {
REGISTER_OCL(tile);
REGISTER_OCL(lstm_dynamic_input);
REGISTER_OCL(lstm_dynamic_timeloop);
REGISTER_OCL(generic_layer);
REGISTER_OCL(gather_tree);
REGISTER_OCL(resample);
REGISTER_OCL(grn);
@@ -28,7 +28,6 @@
#include "intel_gpu/primitives/experimental_detectron_topk_rois.hpp"
#include "intel_gpu/primitives/eye.hpp"
#include "intel_gpu/primitives/fully_connected.hpp"
#include "intel_gpu/primitives/generic_layer.hpp"
#include "intel_gpu/primitives/gather.hpp"
#include "intel_gpu/primitives/gather_elements.hpp"
#include "intel_gpu/primitives/gather_nd.hpp"
@@ -162,7 +161,6 @@ REGISTER_OCL(strided_slice);
REGISTER_OCL(tile);
REGISTER_OCL(lstm_dynamic_input);
REGISTER_OCL(lstm_dynamic_timeloop);
REGISTER_OCL(generic_layer);
REGISTER_OCL(gather_tree);
REGISTER_OCL(resample);
REGISTER_OCL(grn);
@@ -7,6 +7,7 @@
#include "reorder_inst.h"
#include "reorder/reorder_kernel_selector.h"
#include "reorder/reorder_kernel_base.h"
#include "reorder/reorder_weights_kernel_selector.h"

namespace cldnn {
namespace ocl {
@@ -26,9 +27,10 @@ struct reorder_impl : typed_primitive_impl_ocl<reorder> {
protected:
kernel_arguments_data get_arguments(const reorder_inst& instance) const override {
kernel_arguments_data args = parent::get_arguments(instance);
if (instance.has_node() && instance.has_mean()) {
auto input = &instance.input_memory();
auto input_layout = input->get_layout();
if (instance.has_mean()) {

if (input_layout.format == cldnn::format::nv12) {
args.bias = instance.mean_nv12_memory();
} else {
@@ -108,12 +110,45 @@ public:
auto kernel_params = get_kernel_params(impl_param, true);
(_kernel_data.update_dispatch_data_func)(kernel_params.first, _kernel_data);
}

static std::unique_ptr<primitive_impl> create(const reorder_node& arg, const kernel_impl_params& impl_param) {
bool is_reorder_weights = format::is_weights_format(impl_param.get_input_layout().format) ||
format::is_weights_format(impl_param.get_output_layout().format);
if (is_reorder_weights) {
return create_reorder_weights(impl_param);
} else {
return typed_primitive_impl_ocl<reorder>::create<reorder_impl>(arg, impl_param);
}
}

static std::unique_ptr<primitive_impl> create_reorder_weights(const kernel_impl_params& impl_param) {
const auto& prim = impl_param.typed_desc<reorder>();
const auto& weights_params = prim->weights_reorder_params;
auto& kernel_selector = kernel_selector::ReorderWeightsKernelSelector::Instance();

OPENVINO_ASSERT(impl_param.get_input_layout().bytes_count() == weights_params->get_input_layout().bytes_count(),
"[GPU] Input layout doesn't match required reorder weights layout");

kernel_selector::reorder_weights_params r_params;
set_params(impl_param, r_params);

r_params.input = convert_weights_tensor(weights_params->get_input_layout(), weights_params->get_grouped());
r_params.output = convert_weights_tensor(weights_params->get_output_layout());
r_params.layerID = impl_param.desc->id + "_reorder_weigths";
r_params.uniqueID = std::to_string(impl_param.unique_id) + "_weight";
r_params.rotate_180 = weights_params->should_be_transposed();

kernel_selector::reorder_optional_params optional_params;
auto best_kernel = kernel_selector.get_best_kernel(r_params, optional_params);

return make_unique<reorder_impl>(best_kernel);
}
};

namespace detail {

attach_reorder_impl::attach_reorder_impl() {
implementation_map<reorder>::add(impl_types::ocl, shape_types::static_shape, typed_primitive_impl_ocl<reorder>::create<reorder_impl>, {});
implementation_map<reorder>::add(impl_types::ocl, shape_types::static_shape, reorder_impl::create, {});

auto types = {
data_types::f32,
@@ -129,7 +164,9 @@ attach_reorder_impl::attach_reorder_impl() {
format::bfzyx,
format::bfwzyx,
};
implementation_map<reorder>::add(impl_types::ocl, shape_types::dynamic_shape, typed_primitive_impl_ocl<reorder>::create<reorder_impl>, types, formats);
implementation_map<reorder>::add(impl_types::ocl, shape_types::dynamic_shape, reorder_impl::create, types, formats);

WeightsReordersFactory::add(cldnn::impl_types::ocl, shape_types::static_shape, reorder_impl::create_reorder_weights);
}

} // namespace detail
@@ -132,38 +132,17 @@ protected:
return attrs;
}

static kernel_selector::WeightsReorderParams get_weights_reorder(const kernel_impl_params& impl_params, const dnnl::primitive_desc& pd, bool rotate) {
kernel_selector::WeightsReorderParams weights_reorder_params;
auto& reorderKS = kernel_selector::ReorderWeightsKernelSelctor::Instance();
kernel_selector::reorder_weights_params r_params;

static std::shared_ptr<WeightsReorderParams> get_weights_reorder(const kernel_impl_params& impl_params, const dnnl::primitive_desc& pd, bool rotate) {
auto cldnn_prim = impl_params.typed_desc<convolution>();
auto weights_layout = impl_params.get_input_layout(1);
auto grouped_weights = format::is_grouped(weights_layout.format) || cldnn_prim->grouped_weights_shape;
cldnn::format out_fmt = onednn::find_format(pd.weights_desc(0), grouped_weights);
kernel_selector::WeightsLayout reqLayout = to_weights_layout(out_fmt, cldnn_prim->grouped_weights_shape);

set_params(impl_params, r_params);
r_params.layerID = cldnn_prim->id + "_reorder_";
r_params.input = convert_weights_tensor(weights_layout, cldnn_prim->grouped_weights_shape);
r_params.output = r_params.input.TransformIgnorePadding(reqLayout, r_params.input.GetDType(), cldnn_prim->groups, false);
r_params.rotate_180 = rotate;
auto input_weights_layout = impl_params.get_input_layout(1);
auto grouped_weights = format::is_grouped(input_weights_layout.format) || cldnn_prim->grouped_weights_shape;
format out_fmt = onednn::find_format(pd.weights_desc(0), grouped_weights);

kernel_selector::reorder_optional_params op;
kernel_selector::KernelsData kernels_data = reorderKS.GetBestKernels(r_params, op);
auto output_weights_layout = input_weights_layout;
output_weights_layout.format = out_fmt;

if (kernels_data.empty()) {
throw std::runtime_error("No suitable kernel found for weights reorder from " +
kernel_selector::toString(r_params.input.GetLayout()) + " to " +
kernel_selector::toString(r_params.output.GetLayout()));
}

weights_reorder_params.engine = kernel_selector::WeightsReorderParams::Engine::GPU;
weights_reorder_params.clKernel = std::make_shared<kernel_selector::clKernelData>(kernels_data[0].kernels[0]);
weights_reorder_params.src = r_params.input;
weights_reorder_params.dest = r_params.output;

return weights_reorder_params;
return std::make_shared<WeightsReorderParams>(input_weights_layout, output_weights_layout, rotate, grouped_weights);
}

public:
@@ -53,38 +53,17 @@ protected:
return arg.get_onednn_primitive_attributes();
}

static kernel_selector::WeightsReorderParams get_weights_reorder(const kernel_impl_params& impl_params, const dnnl::primitive_desc& pd) {
kernel_selector::WeightsReorderParams weights_reorder_params;
auto& reorderKS = kernel_selector::ReorderWeightsKernelSelctor::Instance();
kernel_selector::reorder_weights_params r_params;

static std::shared_ptr<WeightsReorderParams> get_weights_reorder(const kernel_impl_params& impl_params, const dnnl::primitive_desc& pd) {
auto cldnn_prim = impl_params.typed_desc<deconvolution>();
auto weights_layout = impl_params.get_input_layout(1);
auto grouped_weights = format::is_grouped(weights_layout.format) || cldnn_prim->grouped_weights_shape;
cldnn::format out_fmt = onednn::find_format(pd.weights_desc(0), grouped_weights);
kernel_selector::WeightsLayout reqLayout = to_weights_layout(out_fmt, cldnn_prim->grouped_weights_shape);

set_params(impl_params, r_params);
r_params.layerID = cldnn_prim->id + "_reorder_";
r_params.input = convert_weights_tensor(weights_layout, cldnn_prim->grouped_weights_shape);
r_params.output = r_params.input.TransformIgnorePadding(reqLayout, r_params.input.GetDType(), cldnn_prim->groups, false);
r_params.rotate_180 = false;
auto input_weights_layout = impl_params.get_input_layout(1);
auto grouped_weights = format::is_grouped(input_weights_layout.format) || cldnn_prim->grouped_weights_shape;
format out_fmt = onednn::find_format(pd.weights_desc(0), grouped_weights);

kernel_selector::reorder_optional_params op;
kernel_selector::KernelsData kernels_data = reorderKS.GetBestKernels(r_params, op);
auto output_weights_layout = input_weights_layout;
output_weights_layout.format = out_fmt;

if (kernels_data.empty()) {
throw std::runtime_error("No suitable kernel found for weights reorder from " +
kernel_selector::toString(r_params.input.GetLayout()) + " to " +
kernel_selector::toString(r_params.output.GetLayout()));
}

weights_reorder_params.engine = kernel_selector::WeightsReorderParams::Engine::GPU;
weights_reorder_params.clKernel = std::make_shared<kernel_selector::clKernelData>(kernels_data[0].kernels[0]);
weights_reorder_params.src = r_params.input;
weights_reorder_params.dest = r_params.output;

return weights_reorder_params;
return std::make_shared<WeightsReorderParams>(input_weights_layout, output_weights_layout, false, grouped_weights);
}

public:
@@ -53,7 +53,7 @@ protected:
return args;
}

static kernel_selector::WeightsReorderParams get_weights_reorder(const kernel_impl_params& impl_params, const dnnl::primitive_desc& pd) {
static std::shared_ptr<WeightsReorderParams> get_weights_reorder(const kernel_impl_params& impl_params, const dnnl::primitive_desc& pd) {
auto input_layout = impl_params.get_input_layout(0);
auto weights_layout = impl_params.get_input_layout(1);
auto cldnn_prim = impl_params.typed_desc<fully_connected>();
@@ -68,35 +68,12 @@ protected:
weights_layout.set_partial_shape(reshape_to_2d(weights_pshape, feature));
}

kernel_selector::WeightsReorderParams weights_reorder_params;
auto& reorderKS = kernel_selector::ReorderWeightsKernelSelctor::Instance();
kernel_selector::reorder_weights_params r_params;
format out_fmt = onednn::find_format(pd.weights_desc(0));

cldnn::format out_fmt = onednn::find_format(pd.weights_desc(0));
kernel_selector::WeightsLayout req_layout = to_weights_layout(out_fmt, false);
auto output_weights_layout = weights_layout;
output_weights_layout.format = out_fmt;

// set engine info & forcing
set_params(impl_params, r_params);
r_params.layerID = cldnn_prim->id + "_reorder_";
r_params.input = convert_weights_tensor(weights_layout, false);
r_params.output = r_params.input.TransformIgnorePadding(req_layout, r_params.input.GetDType(), 1, false);
r_params.rotate_180 = false;

kernel_selector::reorder_optional_params op;
kernel_selector::KernelsData kernels_data = reorderKS.GetBestKernels(r_params, op);

if (kernels_data.empty()) {
throw std::runtime_error("No suitable kernel found for weights reorder from " +
kernel_selector::toString(r_params.input.GetLayout()) + " to " +
kernel_selector::toString(r_params.output.GetLayout()));
}

weights_reorder_params.engine = kernel_selector::WeightsReorderParams::Engine::GPU;
weights_reorder_params.clKernel = std::make_shared<kernel_selector::clKernelData>(kernels_data[0].kernels[0]);
weights_reorder_params.src = r_params.input;
weights_reorder_params.dest = r_params.output;

return weights_reorder_params;
return std::make_shared<WeightsReorderParams>(weights_layout, output_weights_layout, false);
}

static std::shared_ptr<dnnl::inner_product_forward::primitive_desc> get_fully_connected_primitive_descriptor(const kernel_impl_params& impl_params,
@@ -47,8 +47,8 @@ struct typed_primitive_onednn_impl : public typed_primitive_impl<PType> {
const ExecutionConfig& config,
std::shared_ptr<dnnl::primitive_attr> attrs,
const PrimDescType& pd,
kernel_selector::WeightsReorderParams weights_reorder = {})
: typed_primitive_impl<PType>(create_weights_reorder_params(weights_reorder), pd.impl_info_str()),
std::shared_ptr<WeightsReorderParams> weights_reorder = {})
: typed_primitive_impl<PType>(weights_reorder, pd.impl_info_str()),
_engine(&engine),
_attrs(attrs),
_pd(pd) {
@@ -1,45 +0,0 @@
// Copyright (C) 2018-2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma once
#include "intel_gpu/primitives/generic_layer.hpp"
#include "primitive_inst.h"

#include <string>
#include <memory>

namespace cldnn {

template <>
struct typed_program_node<generic_layer> : public typed_program_node_base<generic_layer> {
using parent = typed_program_node_base<generic_layer>;
typed_program_node(const std::shared_ptr<generic_layer> prim, program& prog);

public:
using parent::parent;

program_node& input() const { return get_dependency(0); }
};

using generic_layer_node = typed_program_node<generic_layer>;

template <>
class typed_primitive_inst<generic_layer> : public typed_primitive_inst_base<generic_layer> {
using parent = typed_primitive_inst_base<generic_layer>;
using parent::parent;

public:
static layout calc_output_layout(generic_layer_node const& node, kernel_impl_params const& impl_param) {
return impl_param.typed_desc<generic_layer>()->params->get_output_layout();
}

static std::string to_string(generic_layer_node const& node);

typed_primitive_inst(network& network, generic_layer_node const& node);
typed_primitive_inst(network& network);
};

using generic_layer_inst = typed_primitive_inst<generic_layer>;

} // namespace cldnn
@@ -10,7 +10,6 @@
#include "intel_gpu/runtime/lru_cache.hpp"

#include "data_inst.h"
#include "generic_layer_inst.h"
#include "reorder_inst.h"
#include "convolution_inst.h"
#include "deconvolution_inst.h"
@@ -73,7 +72,6 @@ private:
};

std::map<cache_key, std::shared_ptr<reorder>> _cached_reorders;
std::map<cache_key, std::shared_ptr<generic_layer>> _cached_generic_reorders;
};

class layout_optimizer {
@@ -5,7 +5,6 @@
#pragma once
#include "intel_gpu/primitives/primitive.hpp"
#include "intel_gpu/primitives/concatenation.hpp"
#include "intel_gpu/primitives/generic_layer.hpp"
#include "intel_gpu/runtime/event.hpp"
#include "intel_gpu/runtime/memory.hpp"
#include "intel_gpu/runtime/lru_cache.hpp"
@@ -99,7 +98,6 @@ struct primitive_impl {

bool need_weights_reorder() const { return _weights_reorder_params != nullptr; }
std::shared_ptr<WeightsReorderParams> get_weights_reorder_params() const { return _weights_reorder_params; }
void reset_weights_reorder_params() { _weights_reorder_params = nullptr; }

std::shared_ptr<kernel_impl_params> get_weights_reorder_kernel_params() const;
@@ -232,6 +230,7 @@ public:
bool is_constant() const { return _is_constant; }
bool needs_completion_event() const { return _needs_completion_event; }
bool has_unfused_subgraph() const { return (_unfused_subgraph != nullptr); }
bool has_node() const { return _node != nullptr; }
bool has_inner_networks() const;
void allocate_internal_buffers(bool reset = true);
static memory::ptr allocate_output(engine& engine, memory_pool& pool, const program_node& _node, const kernel_impl_params& impl_params, uint32_t net_id,
@@ -56,7 +56,6 @@ struct program_node {
friend class prepare_conv_eltw_fusing; // to be removed when possible
friend class prepare_conv_eltw_read_write_opt; // to be removed when possible
friend class propagate_constants; // to be removed when possible
friend class post_optimize_weights; // to be removed when possible - requires an access to selected_impl

template <class PType>
friend struct typed_program_node;
@@ -77,7 +77,9 @@ public:
static std::string to_string(reorder_node const& node);

public:
typed_primitive_inst(network& network);
typed_primitive_inst(network& network, reorder_node const& node);

memory::ptr mean_nv12_memory() const { return dep_memory_ptr(2); }
memory::ptr mean_memory() const { return dep_memory_ptr(1); }
@ -159,22 +159,18 @@ std::pair<std::shared_ptr<reorder>, bool> reorder_factory::get_reorder(primitive

std::pair<std::shared_ptr<primitive>, bool> reorder_factory::get_weights_reorder(primitive_id input_id,
std::shared_ptr<WeightsReorderParams> reorder_params) {
if (reorder_params == nullptr)
return {};
OPENVINO_ASSERT(reorder_params != nullptr, "[GPU] WeightsReorderParams is not initialized.");

layout expected_layout = reorder_params->get_output_layout();

cache_key ckey{ input_id, expected_layout, false };
auto itr = _cached_generic_reorders.find(ckey);
if (itr != _cached_generic_reorders.end()) {
cache_key ckey{ input_id, reorder_params->get_output_layout(), false };
auto itr = _cached_reorders.find(ckey);
if (itr != _cached_reorders.end()) {
return std::make_pair(itr->second, true);
} else {
auto count = _cached_generic_reorders.size();
std::stringstream ss;
ss << input_id << "_generic_layer_" << count;
auto count = _cached_reorders.size();
std::string reorder_id = input_id + "_weights_reorder_" + std::to_string(count);

auto reorder = std::make_shared<cldnn::generic_layer>(ss.str(), input_id, reorder_params);
_cached_generic_reorders[ckey] = reorder;
auto reorder = std::make_shared<cldnn::reorder>(reorder_id, input_id, reorder_params);
_cached_reorders[ckey] = reorder;
return std::make_pair(reorder, false);
}
}
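Editor's note, not part of the diff: a minimal, hedged sketch of the pattern this hunk introduces. A weights reorder is now expressed as a plain cldnn::reorder primitive carrying WeightsReorderParams instead of a dedicated generic_layer; the shapes, formats and ids below are hypothetical, and the reorder overload taking a WeightsReorderParams pointer is assumed from the factory code above.

// Hypothetical source and requested weights layouts.
auto in_layout  = cldnn::layout{ov::PartialShape{32, 32, 3, 3}, cldnn::data_types::f32, cldnn::format::bfyx};
auto out_layout = cldnn::layout{ov::PartialShape{32, 32, 3, 3}, cldnn::data_types::f32, cldnn::format::os_iyx_osv16};
auto params = std::make_shared<cldnn::WeightsReorderParams>(in_layout, out_layout);
// Id scheme mirrors the factory above: <input_id> + "_weights_reorder_" + <count>.
auto weights_reorder = cldnn::reorder("conv1_weights_weights_reorder_0", cldnn::input_info("conv1_weights"), params);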
@ -942,8 +938,8 @@ bool layout_optimizer::deps_for_convolution_byxf_opt(program_node const& node, u
return true;

for (auto& dep : node.get_dependencies()) {
// skip data and generic_layers
if (dep.first->is_type<data>() || dep.first->is_type<generic_layer>())
// skip data layers
if (dep.first->is_type<data>())
continue;

if (dep.first->is_type<convolution>()) {
@ -5,7 +5,7 @@
#include "primitive_inst.h"
#include "data_inst.h"
#include "mutable_data_inst.h"
#include "generic_layer_inst.h"
#include "reorder_inst.h"
#include "input_layout_inst.h"
#include "arg_max_min_inst.h"
#include "fully_connected_inst.h"

@ -116,7 +116,7 @@ std::shared_ptr<kernel_impl_params> primitive_impl::get_weights_reorder_kernel_p
return nullptr;

auto reorder_kernel_params = std::make_shared<kernel_impl_params>();
auto prim = std::make_shared<generic_layer>("", "", _weights_reorder_params);
auto prim = std::make_shared<reorder>("", input_info(), _weights_reorder_params);
reorder_kernel_params->desc = prim;
reorder_kernel_params->unique_id = _weights_reorder_params->hash();
reorder_kernel_params->input_layouts.push_back(_weights_reorder_params->get_input_layout());
@ -1000,6 +1000,9 @@ event::ptr primitive_inst::update_weights() {
auto& engine = _network.get_engine();
auto reorder_kernel_params = _impl->get_weights_reorder_kernel_params();

if (reorder_kernel_params)
reorder_kernel_params->prog = get_network().get_program().get();

auto weights_idx = _node->get_primitive()->input.size();
auto original_weights_memory = dep_memory_ptr(weights_idx);
auto original_layout = original_weights_memory->get_layout();
@ -1028,7 +1031,7 @@ event::ptr primitive_inst::update_weights() {
} else {
GPU_DEBUG_PROFILED_STAGE_CACHE_HIT(false);
auto& cache = get_network().get_program()->get_implementations_cache();
auto reorder_inst = std::make_shared<generic_layer_inst>(get_network());
auto reorder_inst = std::make_shared<cldnn::reorder_inst>(get_network());

if (auto cached_impl = cache.get(*reorder_kernel_params)) {
GPU_DEBUG_TRACE_DETAIL << id() << ": reorder weights (cached) from " << original_layout.to_short_string()
@ -1041,7 +1044,7 @@ event::ptr primitive_inst::update_weights() {
auto factory = WeightsReordersFactory::get(impl_types::ocl, shape_types::static_shape);
auto reorder_impl = factory(*reorder_kernel_params);
auto& kernels_cache = get_network().get_program()->get_kernels_cache();
auto kernels = kernels_cache.compile(*_impl_params, reorder_impl->get_kernels_source());
auto kernels = kernels_cache.compile(*reorder_kernel_params, reorder_impl->get_kernels_source());
OPENVINO_ASSERT(kernels.size() == 1, "[GPU] Expected number of compiled kernels is 1, but got ", kernels.size());
reorder_impl->set_kernels(kernels);

@ -1152,10 +1155,11 @@ memory::ptr primitive_inst::allocate_output(engine& _engine, memory_pool& pool,
: !usm_device_allocatable ? lockable_mem_type : allocation_type::usm_device;

if (is_internal) {
if (_node.can_be_optimized() || _node.is_type<generic_layer>()) {
bool is_reorder_weights = _node.is_type<reorder>() && _node.as<reorder>().get_primitive()->weights_reorder_params;
if (_node.can_be_optimized() || is_reorder_weights) {
GPU_DEBUG_LOG << "[" << _node.id() << ": output]" << std::endl;
// Use usm_device memory for weights reordering
if (is_internal && _node.is_type<generic_layer>() &&
if (is_internal && is_reorder_weights &&
_engine.supports_allocation(allocation_type::usm_device))
alloc_type = allocation_type::usm_device;
return get_memory_from_pool(_engine,
@ -1167,7 +1171,7 @@ memory::ptr primitive_inst::allocate_output(engine& _engine, memory_pool& pool,
reset,
curr_memory);
} else {
if ((_node.is_output() && _node.is_type<generic_layer>()) || (!_node.is_output() && _node.is_type<input_layout>()))
if ((_node.is_output() && is_reorder_weights) || (!_node.is_output() && _node.is_type<input_layout>()))
reset = false;
GPU_DEBUG_LOG << "[" << _node.id() << ": constant]" << std::endl;
return _engine.allocate_memory(layout, alloc_type, reset);
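Editor's note, not part of the diff: a condensed, hedged sketch of the update_weights() flow after this change, assembled from the hunks above; error handling is elided, variable names follow the diff, and the cache-hit handling is assumed since this hunk only shows the lookup.

auto reorder_kernel_params = _impl->get_weights_reorder_kernel_params();
if (reorder_kernel_params) {
    // The reorder kernel params need a program pointer before they can be compiled.
    reorder_kernel_params->prog = get_network().get_program().get();
    // The helper instance is now a plain reorder_inst rather than a generic_layer_inst.
    auto reorder_inst = std::make_shared<cldnn::reorder_inst>(get_network());
    auto& cache = get_network().get_program()->get_implementations_cache();
    if (auto cached_impl = cache.get(*reorder_kernel_params)) {
        // Cache hit: reuse the stored implementation (assumed; the diff only shows the lookup and a debug trace).
        reorder_inst->set_impl(cached_impl->clone());
    } else {
        // Cache miss: build and compile a new implementation through the weights reorder factory.
        auto factory = WeightsReordersFactory::get(impl_types::ocl, shape_types::static_shape);
        auto reorder_impl = factory(*reorder_kernel_params);
        auto& kernels_cache = get_network().get_program()->get_kernels_cache();
        auto kernels = kernels_cache.compile(*reorder_kernel_params, reorder_impl->get_kernels_source());
        reorder_impl->set_kernels(kernels);
        reorder_inst->set_impl(reorder_impl->clone());
    }
}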
@ -1489,7 +1489,6 @@ void program::set_layout_optimizer_attributes(layout_optimizer& lo) {
prim.type() != cldnn::grid_sample::type_id() &&
prim.type() != cldnn::softmax::type_id() &&
prim.type() != cldnn::fully_connected::type_id() &&
prim.type() != cldnn::generic_layer::type_id() &&
prim.type() != cldnn::scatter_nd_update::type_id() &&
prim.type() != cldnn::broadcast::type_id() &&
prim.type() != cldnn::quantize::type_id() &&
@ -1628,10 +1627,7 @@ std::pair<int64_t, int64_t> program::get_estimated_device_mem_usage() {

if (node->can_be_optimized())
continue;
if (node->is_type<data>() && node->get_users().size() == 1 && node->have_user_with_type<generic_layer>()) {
continue;
}
if (node->is_type<data>() || (node->is_type<generic_layer>() && node->get_dependency(0).is_type<data>())) {
if (node->is_type<data>()) {
const_sum += out_size;
} else if (node->have_user_with_type<concatenation>() && node->get_users().size() == 1 && node->get_users().front()->can_be_optimized()) {
continue;
@ -19,7 +19,7 @@ layout reorder_inst::calc_output_layout(reorder_node const& node, kernel_impl_pa
auto ifmt = input_layout.format;

auto desc = impl_param.typed_desc<reorder>();
auto odt = *desc->output_data_types[0];
auto odt = desc->output_data_types[0].value_or(input_layout.data_type);
auto ofmt = desc->output_format;
auto op = desc->output_paddings[0];

@ -146,7 +146,11 @@ layout reorder_inst::calc_output_layout(reorder_node const& node, kernel_impl_pa
"Conversion of weights from winograd to standard domain is currently unsupported");
}

if ((ofmt == format::bs_fs_fsv8_bsv8 || ofmt == format::os_i_osv8__ai8 || ofmt == format::os_i_osv16__ai8 || ofmt == format::bs_f_bsv16 ||
if (desc->weights_reorder_params) {
return desc->weights_reorder_params->get_output_layout();
}

if ((ofmt == format::bs_fs_fsv8_bsv8 || ofmt == format::os_i_osv8__ai8 || ofmt == format::os_i_osv16__ai8 || ofmt == format::os_i_osv16 ||
ofmt == format::bfzyx || ifmt == format::bfzyx || ofmt == format::b_fs_zyx_fsv16 || ifmt == format::b_fs_zyx_fsv16 ||
ofmt == format::bs_fs_zyx_bsv16_fsv16 || ifmt == format::bs_fs_zyx_bsv16_fsv16 ||
ofmt == format::bs_fs_zyx_bsv16_fsv32 || ifmt == format::bs_fs_zyx_bsv16_fsv32 ||
@ -169,7 +173,11 @@ std::vector<layout> reorder_inst::calc_output_layouts(reorder_node const& /*node
auto ifmt = input_layout.format;
auto ofmt = desc->output_format == format::any ? ifmt : desc->output_format;

if (desc->weights_reorder_params) {
return { desc->weights_reorder_params->get_output_layout() };
} else {
return { layout(input_layout.get<ShapeType>(), desc->output_data_types[0].value(), ofmt, desc->output_paddings[0]) };
}
}

std::string reorder_inst::to_string(reorder_node const& node) {
@ -197,6 +205,10 @@ std::string reorder_inst::to_string(reorder_node const& node) {
return primitive_description.str();
}

reorder_inst::typed_primitive_inst(network& network) : parent(network) {
_type = reorder::type_id();
}

reorder_inst::typed_primitive_inst(network& network, reorder_node const& node)
: parent(network, node, (!node.can_be_optimized() && node.get_output_layout().is_static()) ? true : false)
, _req_reinterpr(node.requires_reinterpret()) {
@ -67,22 +67,14 @@ struct clKernelData {
bool skip_execution = false;
};

////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// GenericKernelParams
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
struct GenericKernelParams {
enum class Engine { NONE, GPU };

Engine engine = Engine::NONE;
std::shared_ptr<clKernelData> clKernel;
};

////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// WeightsReorderParams
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
struct WeightsReorderParams : public GenericKernelParams {
struct WeightsReorderParams {
WeightsTensor src;
WeightsTensor dest;
bool rotate;
bool is_initialized = false;
};

////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
@ -4,6 +4,7 @@

#include "kernel_selector_utils.h"
#include "reorder/reorder_weights_kernel_selector.h"
#include "reorder/reorder_kernel_selector.h"
#include "reorder/reorder_kernel_base.h"
#include "convolution/convolution_params.h"
#include <vector>
@ -110,31 +111,10 @@ bool UpdateWeightsParams(weight_bias_params& newParams,
if (!optParams.allowStaticInputReordering) {
return false;
}

auto& reorderKS = ReorderWeightsKernelSelctor::Instance();
reorder_weights_params r_params;

r_params.layerID = newParams.layerID + "_reorder_";
r_params.input = newParams.weights;
r_params.output = newParams.weights.TransformIgnorePadding(reqLayout, dtype, groups, false);
r_params.rotate_180 = rotate;
r_params.engineInfo = newParams.engineInfo;
r_params.uniqueID = newParams.uniqueID + "_weight";

reorder_optional_params op;
KernelsData kernels_data = reorderKS.GetBestKernels(r_params, op);

if (kernels_data.empty()) {
throw std::runtime_error("No suitable kernel found for weights reorder from " +
toString(r_params.input.GetLayout()) + " to " +
toString(r_params.output.GetLayout()) +
(rotate ? " with rotate" : ""));
}

weightsReorderParams.engine = WeightsReorderParams::Engine::GPU;
weightsReorderParams.clKernel = std::make_shared<clKernelData>(kernels_data[0].kernels[0]);
weightsReorderParams.src = r_params.input;
weightsReorderParams.dest = r_params.output;
weightsReorderParams.is_initialized = true;
weightsReorderParams.src = newParams.weights;
weightsReorderParams.dest = newParams.weights.TransformIgnorePadding(reqLayout, dtype, groups, false);
weightsReorderParams.rotate = rotate;

newParams.weights = newParams.weights.TransformIgnorePadding(reqLayout, dtype, groups);
return true;
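Editor's note, not part of the diff: with the kernel selection removed from UpdateWeightsParams, the kernel_selector-side WeightsReorderParams now only describes the reorder (src, dest, rotate), and the concrete kernel is expected to be picked later on the plugin side through the renamed selector. A hedged sketch of such a query, reusing only names that appear in this diff; the id and the calling context are hypothetical.

auto& selector = kernel_selector::ReorderWeightsKernelSelector::Instance();
kernel_selector::reorder_weights_params r_params;
r_params.layerID = "fc_weights_reorder";                 // hypothetical layer id
r_params.input = weightsReorderParams.src;               // original weights tensor
r_params.output = weightsReorderParams.dest;             // tensor in the requested layout
r_params.rotate_180 = weightsReorderParams.rotate;
r_params.engineInfo = newParams.engineInfo;              // assumed to be available from the enclosing params
kernel_selector::reorder_optional_params op;
kernel_selector::KernelsData kernels_data = selector.GetBestKernels(r_params, op);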
@ -13,7 +13,7 @@

namespace kernel_selector {

ReorderWeightsKernelSelctor::ReorderWeightsKernelSelctor() {
ReorderWeightsKernelSelector::ReorderWeightsKernelSelector() {
Attach<ReorderWeightsKernel>();
Attach<ReorderWeightsWinograd2x3Kernel>();
Attach<ReorderWeightsWinograd6x3Kernel>();
@ -23,7 +23,7 @@ ReorderWeightsKernelSelctor::ReorderWeightsKernelSelctor() {
Attach<ReorderWeightsOpt>();
}

KernelsData ReorderWeightsKernelSelctor::GetBestKernels(const Params& params, const optional_params& options) const {
KernelsData ReorderWeightsKernelSelector::GetBestKernels(const Params& params, const optional_params& options) const {
return GetNaiveBestKernel(params, options, KernelType::REORDER);
}
} // namespace kernel_selector
@ -7,16 +7,16 @@
#include "kernel_selector.h"

namespace kernel_selector {
class ReorderWeightsKernelSelctor : public kernel_selector_base {
class ReorderWeightsKernelSelector : public kernel_selector_base {
public:
static ReorderWeightsKernelSelctor& Instance() {
static ReorderWeightsKernelSelctor instance_;
static ReorderWeightsKernelSelector& Instance() {
static ReorderWeightsKernelSelector instance_;
return instance_;
}

ReorderWeightsKernelSelctor();
ReorderWeightsKernelSelector();

virtual ~ReorderWeightsKernelSelctor() {}
virtual ~ReorderWeightsKernelSelector() {}

KernelsData GetBestKernels(const Params& params, const optional_params& options) const override;
};
@ -109,7 +109,7 @@ enum WeightsLayout {
os_i_osv16__ai8,
os_i_osv16,
os_is_yx_osv16_isv2,
os_is_yx_osv16_isv16, // wieghts for int8 blocked conv
os_is_yx_osv16_isv16, // weights for int8 blocked conv
os_is_zyx_osv16_isv16,
os_is_zyx_osv32_isv16,
os_is_zyx_osv64_isv16,
@ -108,9 +108,9 @@ static const std::map<format::type, format_traits> format_traits_map {
FMT_TRAITS(os_is_zyx_isa8_osv16_isv4, 1, 1, 3, 0, {0, 1, 2, 3, 4}, "oizyx", "oixyz", {{1, 8}, {0, 16}, {1, 4}}),
FMT_TRAITS(os_is_yx_osa4_isa8_osv8_isv4_swizzled_by_4, 1, 1, 2, 0, {0, 1, 2, 3}, "oiyx", "oixy", {{0, 32}, {1, 32}}),
FMT_TRAITS(os_is_zyx_osa4_isa8_osv8_isv4_swizzled_by_4, 1, 1, 3, 0, {0, 1, 2, 3, 4}, "oizyx", "oixyz", {{0, 32}, {1, 32}}),
FMT_TRAITS(is_os_yx_osa4_isa8_osv8_isv4, 1, 1, 2, 0, {1, 0, 2, 3}, "ioyx", "ioxy", {{0, 4}, {1, 8}, {0, 8}, {1, 4}}),
FMT_TRAITS(is_os_yx_isa2_osa8_isv8_osv2, 1, 1, 2, 0, {1, 0, 2, 3}, "ioyx", "ioxy", {{1, 2}, {0, 8}, {1, 8}, {0, 2}}),
FMT_TRAITS(is_os_yx_isa4_osa8_isv8_osv4, 1, 1, 2, 0, {1, 0, 2, 3}, "ioyx", "ioxy", {{1, 4}, {0, 8}, {1, 8}, {0, 4}}),
FMT_TRAITS(is_os_yx_osa4_isa8_osv8_isv4, 1, 1, 2, 0, {1, 0, 2, 3}, "ioyx", "oixy", {{0, 4}, {1, 8}, {0, 8}, {1, 4}}),
FMT_TRAITS(is_os_yx_isa2_osa8_isv8_osv2, 1, 1, 2, 0, {1, 0, 2, 3}, "ioyx", "oixy", {{1, 2}, {0, 8}, {1, 8}, {0, 2}}),
FMT_TRAITS(is_os_yx_isa4_osa8_isv8_osv4, 1, 1, 2, 0, {1, 0, 2, 3}, "ioyx", "oixy", {{1, 4}, {0, 8}, {1, 8}, {0, 4}}),
FMT_TRAITS(is_o_yx_isv32, 1, 1, 2, 0, {1, 0, 2, 3}, "oyxi", "oixy", {{1, 32}}),
FMT_TRAITS(is_o32_yx_isv32_swizzled_by_4, 1, 1, 2, 0, {0, 1, 2, 3}, "oyxi", "oixy", {}),
FMT_TRAITS(os_is_y_x8_osv8_isv4, 1, 1, 2, 0, {0, 1, 2, 3}, "oyxi", "oixy", {}),
@ -129,13 +129,13 @@ static const std::map<format::type, format_traits> format_traits_map {
FMT_TRAITS(is_os_zyx_isv16_osv16, 1, 1, 3, 0, {1, 0, 2, 3, 4}, "iozyx", "oixyz", {{1, 16}, {0, 16}}),
FMT_TRAITS(is_os_yx_isv16_osv16, 1, 1, 2, 0, {1, 0, 2, 3}, "ioyx", "oixy", {{1, 16}, {0, 16}}),
FMT_TRAITS(is_os_yx_isv16_osv8, 1, 1, 2, 0, {1, 0, 2, 3}, "ioyx", "oixy", {{1, 16}, {0, 8}}),
FMT_TRAITS(is_os_zyx_isa8_osv8_isv2, 1, 1, 3, 0, {1, 0, 2, 3, 4}, "iozyx", "ioxyz", {{1, 8}, {0, 8}, {1, 2}}),
FMT_TRAITS(is_os_zyx_isa8_osv8_isv4, 1, 1, 3, 0, {1, 0, 2, 3, 4}, "iozyx", "ioxyz", {{1, 8}, {0, 8}, {1, 4}}),
FMT_TRAITS(is_os_zyx_isa8_osv8_isv2, 1, 1, 3, 0, {1, 0, 2, 3, 4}, "iozyx", "oixyz", {{1, 8}, {0, 8}, {1, 2}}),
FMT_TRAITS(is_os_zyx_isa8_osv8_isv4, 1, 1, 3, 0, {1, 0, 2, 3, 4}, "iozyx", "oixyz", {{1, 8}, {0, 8}, {1, 4}}),
FMT_TRAITS(os_is_zyx_isa8_osv8_isv4, 1, 1, 3, 0, {0, 1, 2, 3, 4}, "oizyx", "oixyz", {{1, 8}, {0, 8}, {1, 4}}),
FMT_TRAITS(os_is_zyx_isa8_osv8_isv2, 1, 1, 3, 0, {0, 1, 2, 3, 4}, "oizyx", "oixyz", {{1, 8}, {0, 8}, {1, 2}}),
FMT_TRAITS(os_is_zyx_isa8_osv16_isv4, 1, 1, 3, 0, {0, 1, 2, 3, 4}, "oizyx", "oixyz", {{1, 8}, {0, 16}, {1, 4}}),
FMT_TRAITS(is_os_yx_isa8_osv8_isv2, 1, 1, 2, 0, {1, 0, 2, 3}, "ioyx", "ioxy", {{1, 8}, {0, 8}, {1, 2}}),
FMT_TRAITS(is_os_yx_isa8_osv8_isv4, 1, 1, 2, 0, {1, 0, 2, 3}, "ioyx", "ioxy", {{1, 8}, {0, 8}, {1, 4}}),
FMT_TRAITS(is_os_yx_isa8_osv8_isv2, 1, 1, 2, 0, {1, 0, 2, 3}, "ioyx", "oixy", {{1, 8}, {0, 8}, {1, 2}}),
FMT_TRAITS(is_os_yx_isa8_osv8_isv4, 1, 1, 2, 0, {1, 0, 2, 3}, "ioyx", "oixy", {{1, 8}, {0, 8}, {1, 4}}),
FMT_TRAITS(os_is_yx_isa8_osv8_isv4, 1, 1, 2, 0, {0, 1, 2, 3}, "oiyx", "oixy", {{1, 8}, {0, 8}, {1, 4}}),
FMT_TRAITS(os_is_yx_isa8_osv8_isv2, 1, 1, 2, 0, {0, 1, 2, 3}, "oiyx", "oixy", {{1, 8}, {0, 8}, {1, 2}}),
FMT_TRAITS(os_is_osv32_isv32_swizzled_by_4, 1, 1, 0, 0, {0, 1, 2, 3}, "oiyx", "oixy", {{0, 32}, {1, 32}}),
@ -152,6 +152,7 @@ static const std::map<format::type, format_traits> format_traits_map {
FMT_TRAITS(iy_xs_os_xsv2_osv8__ao32, 1, 1, 2, 0, {1, 2, 3, 0}, "iyxo", "oixy", {{2, 2}, {0, 8}}),
FMT_TRAITS(iy_xs_os_xsv2_osv16__ao32, 1, 1, 2, 0, {1, 2, 3, 0}, "iyxo", "oixy", {{2, 2}, {0, 16}}),
FMT_TRAITS(os_i_yxs_osv4_yxsv4, 1, 1, 2, 0, {0, 1, 2, 3}, "oiyx", "oixy", {{0, 4}}),
FMT_TRAITS(os_i_osv16, 1, 1, 0, 0, {0, 1}, "oi", "oi", {{0, 16}}),
FMT_TRAITS(os_i_osv16__ai8, 1, 1, 0, 0, {0, 1}, "oi", "oi", {{1, 8}, {0, 16}}),
FMT_TRAITS(os_i_osv8__ai8, 1, 1, 0, 0, {0, 1}, "oi", "oi", {{1, 8}, {0, 8}}),
FMT_TRAITS(os_y_is_x_osv8_isv2, 1, 1, 2, 0, {0, 2, 1, 3}, "oyix", "oixy", {{0, 8}, {1, 2}}),
@ -4551,7 +4551,7 @@ TEST_P(onednn_replace_full_tensor_sum_to_binary_add, basic) {
#define CASE_CONV_ELTW_SUM_TO_BINARY_ADD { 1, 32, 4, 4 }, { 1, 32, 2, 2 }, { 32, 32, 3, 3 }, { 1, 1 }, { 0, 0 }, { 1, 1 }, 1, data_types::f16, format::bfyx, data_types::f16, format::bfyx, data_types::f16, format::b_fs_yx_fsv16, data_types::f16, format::b_fs_yx_fsv16, data_types::f32, format::bfyx

INSTANTIATE_TEST_SUITE_P(eltwise_sum_fusings_gpu, onednn_replace_full_tensor_sum_to_binary_add, ::testing::ValuesIn(std::vector<convolution_eltw_sum_test_params>{
convolution_eltw_sum_test_params{ CASE_CONV_ELTW_SUM_TO_BINARY_ADD, 2, 3, 4 },
convolution_eltw_sum_test_params{ CASE_CONV_ELTW_SUM_TO_BINARY_ADD, 2, 2, 3 },
}));

#endif // ENABLE_ONEDNN_FOR_GPU
@ -5,6 +5,7 @@
#include "test_utils.h"

#include "intel_gpu/runtime/layout.hpp"
#include "impls/ocl/kernel_selector_helper.h"

using namespace cldnn;
using namespace ::tests;
@ -305,3 +306,67 @@ INSTANTIATE_TEST_SUITE_P(smoke, layout_transform_test,

{format::bfzyx, format::bfyx, ov::PartialShape{1, 2, 3, 4, 5}, ov::PartialShape{1, 2, 3*4, 5}},
}));

struct layouts_convert_params {
format::type in_format;
ov::PartialShape in_shape;
bool is_grouped;
};

class layout_convert_test : public testing::TestWithParam<layouts_convert_params> { };

TEST_P(layout_convert_test, basic) {
auto p = GetParam();

auto test_layout = layout(p.in_shape, data_types::f32, p.in_format);
auto weights_tensor = convert_weights_tensor(test_layout, p.is_grouped);
auto converted_layout = from_weights_tensor(weights_tensor);

if (p.in_format == format::bfzyx && p.is_grouped) {
ASSERT_EQ(converted_layout, layout(p.in_shape, data_types::f32, format::goiyx));
} else if (p.in_format == format::bfwzyx && p.is_grouped) {
ASSERT_EQ(converted_layout, layout(p.in_shape, data_types::f32, format::goizyx));
} else if (p.in_format == format::os_i_osv16__ai8) {
auto ref_shape = p.in_shape;
for (size_t i = ref_shape.size(); i < converted_layout.get_dims().size(); ++i)
ref_shape.push_back(1);
test_layout.set_partial_shape(ref_shape);
ASSERT_EQ(test_layout, converted_layout);
} else {
ASSERT_EQ(test_layout, converted_layout);
}
}

INSTANTIATE_TEST_SUITE_P(smoke, layout_convert_test,
testing::ValuesIn(std::vector<layouts_convert_params>{
// 4D formats
{format::oiyx, ov::PartialShape{1, 2, 3, 4}, false},
{format::ioyx, ov::PartialShape{1, 2, 3, 4}, false},
{format::os_i_osv16__ai8, ov::PartialShape{1, 2}, false},
{format::os_iyx_osv16, ov::PartialShape{1, 2, 3, 4}, false},
{format::os_i_yxs_osv4_yxsv4, ov::PartialShape{1, 2, 3, 4}, false},
{format::os_is_yx_isa8_osv8_isv2, ov::PartialShape{1, 2, 3, 4}, false},
{format::is_os_yx_isa2_osa8_isv8_osv2, ov::PartialShape{1, 2, 3, 4}, false},
{format::is_o32_yx_isv32_swizzled_by_4, ov::PartialShape{1, 2, 3, 4}, false},
// 4D formats grouped
{format::bfzyx, ov::PartialShape{1, 2, 3, 4, 5}, true},
{format::goiyx, ov::PartialShape{1, 2, 3, 4, 5}, false},
{format::g_os_iyx_osv32, ov::PartialShape{1, 2, 3, 4, 5}, false},
{format::g_os_is_yx_isv8_osv16_isv2, ov::PartialShape{1, 2, 3, 4, 5}, false},
{format::g_os_is_yx_osv16_isv4, ov::PartialShape{1, 2, 3, 4, 5}, false},
// {format::gs_oi_yxs_gsv32_yxsv4, ov::PartialShape{1, 2, 3, 4, 5}, false},
// 5D formats
{format::oizyx, ov::PartialShape{1, 2, 3, 4, 5}, false},
{format::iozyx, ov::PartialShape{1, 2, 3, 4, 5}, false},
{format::os_is_zyx_isa8_osv16_isv4, ov::PartialShape{1, 2, 3, 4, 5}, false},
{format::os_is_zyx_osa4_isa8_osv8_isv4, ov::PartialShape{1, 2, 3, 4, 5}, false},
{format::is_os_zyx_isa8_osv8_isv2, ov::PartialShape{1, 2, 3, 4, 5}, false},
{format::is_os_zyx_isv16_osv16, ov::PartialShape{1, 2, 3, 4, 5}, false},
// 5D formats grouped
{format::bfwzyx, ov::PartialShape{1, 2, 3, 4, 5, 6}, true},
{format::giozyx, ov::PartialShape{1, 2, 3, 4, 5, 6}, false},
{format::gs_oizyx_gsv32, ov::PartialShape{1, 2, 3, 4, 5, 6}, false},
{format::g_os_zyx_is_osv32_isv32, ov::PartialShape{1, 2, 3, 4, 5, 6}, false},
{format::g_is_os_zyx_isv16_osv16, ov::PartialShape{1, 2, 3, 4, 5, 6}, false},
{format::g_os_is_zyx_osa4_isa8_osv8_isv2, ov::PartialShape{1, 2, 3, 4, 5, 6}, false},
}));
@ -10,7 +10,6 @@
#include <intel_gpu/primitives/fully_connected.hpp>
#include <intel_gpu/primitives/gather.hpp>
#include <intel_gpu/primitives/permute.hpp>
#include <intel_gpu/primitives/generic_layer.hpp>

using namespace cldnn;
using namespace ::tests;
@ -111,18 +110,20 @@ TEST(primitive_comparison, permute) {
ASSERT_NE(permute_prim, permute_prim_order);
}

TEST(primitive_comparison, generic_layer) {
TEST(primitive_comparison, reorder_weights) {
auto shape = ov::PartialShape{1, 2, 3, 4};
auto data_type = data_types::f32;
auto format_in = format::bfyx;
auto format_out = format::os_iyx_osv16;

auto input_layout = layout{shape, data_type, format_in};
auto output_layout = layout{shape, data_type, format_out};
auto generic_layer_prim = generic_layer("generic_layer", "", std::make_shared<WeightsReorderParams>(input_layout, output_layout));
auto generic_layer_eq_prim = generic_layer("generic_layer_eq", "", std::make_shared<WeightsReorderParams>(input_layout, output_layout));
auto generic_layer_different_prim = generic_layer("generic_layer", "", std::make_shared<WeightsReorderParams>(output_layout, input_layout));
auto format_osv16 = format::os_iyx_osv16;
auto format_osv32 = format::os_iyx_osv32;

ASSERT_EQ(generic_layer_prim, generic_layer_eq_prim);
ASSERT_NE(generic_layer_prim, generic_layer_different_prim);
auto layout_osv16 = layout{shape, data_type, format_osv16};
auto layout_osv32 = layout{shape, data_type, format_osv32};

auto reorder_weights_prim = reorder("reorder_weights", input_info("input"), layout_osv16);
auto reorder_weights_eq_prim = reorder("reorder_weights_eq", input_info("input"), layout_osv16);
auto reorder_weights_diff_prim = reorder("reorder_weights_neq", input_info("input"), layout_osv32);

ASSERT_EQ(reorder_weights_prim, reorder_weights_eq_prim);
ASSERT_NE(reorder_weights_prim, reorder_weights_diff_prim);
}
@ -10,7 +10,7 @@
#include "intel_gpu/primitives/input_layout.hpp"
#include "intel_gpu/primitives/data.hpp"

#include "generic_layer_inst.h"
#include "reorder_inst.h"
#include "fully_connected_inst.h"
#include "implementation_map.hpp"
#include "graph/impls/ocl/register.hpp"
@ -41,6 +41,7 @@ TEST(weights_factory, reorder_test) {
tests::random_generator rg(GET_SUITE_NAME);
const int input_f = 32, output_f = 32;

auto weights_layout = layout(ov::PartialShape{ output_f, input_f }, data_types::f32, format::bfyx);
auto weights_data_input = engine.allocate_memory(weights_layout);
auto weights_data_vec = rg.generate_random_1d<float>(output_f * input_f, -1, 1);
@ -69,10 +70,11 @@ TEST(weights_factory, reorder_test) {

// Construct kernel_impl_params for the weights reorder based on the requested WeightsReorderParams
auto reorder_kernel_params = std::make_shared<kernel_impl_params>();
reorder_kernel_params->desc = std::make_shared<generic_layer>("weights_reorder", "", weights_reorder_params);
reorder_kernel_params->desc = std::make_shared<reorder>("weights_reorder", input_info(), weights_reorder_params);
reorder_kernel_params->unique_id = weights_reorder_params->hash();
reorder_kernel_params->input_layouts.push_back(weights_reorder_params->get_input_layout());
reorder_kernel_params->output_layouts.push_back(weights_reorder_params->get_output_layout());
reorder_kernel_params->prog = network.get_program().get();

// Create new generic_layer_impl
auto factory = WeightsReordersFactory::get(impl_types::ocl, shape_types::static_shape);
@ -93,7 +95,8 @@ TEST(weights_factory, reorder_test) {
args.inputs.push_back(weights_data_input);
args.outputs.push_back(weights_data_output);

auto reorder_inst = std::make_shared<generic_layer_inst>(network);
auto reorder_inst = std::make_shared<cldnn::reorder_inst>(network);

reorder_inst->set_impl(reorder_impl->clone());

reorder_inst->get_impl()->set_arguments(*reorder_inst, args);
@ -0,0 +1,87 @@
// Copyright (C) 2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#include "test_utils.h"
#include "program_wrapper.h"
#include "fully_connected_inst.h"

using namespace cldnn;
using namespace ::tests;

TEST(post_optimize_weights, fuse_reorder_to_weights_reorder_test) {
auto& engine = get_test_engine();

auto input = engine.allocate_memory({ { 2, 32 }, data_types::f16, format::bfyx });
auto weights = engine.allocate_memory({{ 2, 32 }, data_types::f32, format::bfyx });

topology topology(
input_layout("input", input->get_layout()),
input_layout("weights", weights->get_layout()),
reorder("reorder_dt", input_info("weights"), format::bfyx, data_types::f16),
fully_connected("fc", input_info("input"), { "reorder_dt" }, "", data_types::f16)
);

ExecutionConfig config = get_test_default_config(engine);
config.set_property(ov::intel_gpu::optimize_data(true));
config.set_property(ov::intel_gpu::allow_new_shape_infer(true));
auto prog = program::build_program(engine, topology, config, false, true);

reorder_factory rf;
program_wrapper::apply_opt_pass<compile_graph>(*prog);
program_wrapper::apply_opt_pass<post_optimize_weights>(*prog, rf);

ASSERT_TRUE(has_node(*prog, "reorder_dt"));
ASSERT_TRUE(format::is_weights_format(prog->get_node("reorder_dt").get_output_layout().format));
}

TEST(post_optimize_weights, weights_reorder_constant_folding_test) {
auto& engine = get_test_engine();

ov::Shape pshape = { 4, 16 };
auto input = engine.allocate_memory({ pshape, data_types::f32, format::bfyx });
auto weights = engine.allocate_memory({ pshape, data_types::f32, format::bfyx });

std::vector<float> weights_data(pshape[0] * pshape[1]);
std::iota(weights_data.begin(), weights_data.end(), 0.f);
set_values(weights, weights_data);

topology topology(
input_layout("input", input->get_layout()),
data("weights", weights),
fully_connected("fc", input_info("input"), { "weights" })
);

ExecutionConfig config = get_test_default_config(engine);
config.set_property(ov::intel_gpu::optimize_data(true));
config.set_property(ov::intel_gpu::allow_new_shape_infer(true));

auto prog = program::build_program(engine, topology, config, false, true);

reorder_factory rf;
program_wrapper::apply_opt_pass<compile_graph>(*prog);
program_wrapper::apply_opt_pass<post_optimize_weights>(*prog, rf);
program_wrapper::apply_opt_pass<propagate_constants>(*prog);

ASSERT_TRUE(has_node(*prog, "weights_weights_reorder_0"));
auto& weights_node = prog->get_node("weights_weights_reorder_0");
ASSERT_TRUE(weights_node.is_type<data>());

size_t align = 16; // os_iyx_osv16 format
size_t aligned_b_size = pshape[0] % align == 0 ? pshape[0]
: pshape[0] - pshape[0] % align + align;
std::vector<float> expected(aligned_b_size * pshape[1], 0.f);
size_t input_idx = 0;
for (size_t i = 0; i < pshape[0]; ++i) {
for (size_t j = 0; j < pshape[1]; ++j) {
expected[j * align + i] = weights_data[input_idx++];
}
}

auto weights_mem_ptr = weights_node.as<data>().get_attached_memory_ptr();
cldnn::mem_lock<float, mem_lock_type::read> weights_mem(weights_mem_ptr, get_test_stream());

for (size_t i = 0; i < expected.size(); ++i) {
ASSERT_EQ(weights_mem[i], expected[i]);
}
}