diff --git a/src/plugins/intel_gpu/include/intel_gpu/graph/kernel_impl_params.hpp b/src/plugins/intel_gpu/include/intel_gpu/graph/kernel_impl_params.hpp index 321e594dc66..a922153b010 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/graph/kernel_impl_params.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/graph/kernel_impl_params.hpp @@ -136,7 +136,7 @@ virtual primitive_type_id type() const { return desc->type; } void save(BinaryOutputBuffer& ob) const; void load(BinaryInputBuffer& ib); const program& get_program() const { - OPENVINO_ASSERT(prog != nullptr, "[GPU] Program pointer in kernel_impl_params in not initialized"); + OPENVINO_ASSERT(prog != nullptr, "[GPU] Program pointer in kernel_impl_params is not initialized"); return *prog; } stream& get_stream() const { return *strm; } diff --git a/src/plugins/intel_gpu/include/intel_gpu/graph/program.hpp b/src/plugins/intel_gpu/include/intel_gpu/graph/program.hpp index c576185ae88..c68d5de243c 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/graph/program.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/graph/program.hpp @@ -45,6 +45,7 @@ struct program { friend class prepare_conv_eltw_fusing; // to be removed when possible friend class reorder_inputs; // to be removed when possible friend class remove_redundant_reorders; // to be removed when possible + friend class post_optimize_weights; // to be removed when possible friend class program_wrapper; // this class is intended to extend the interface of program for // the usage within tests_core_internal project only friend class prepare_primitive_fusing_through; // to be removed when possible diff --git a/src/plugins/intel_gpu/include/intel_gpu/primitives/generic_layer.hpp b/src/plugins/intel_gpu/include/intel_gpu/primitives/generic_layer.hpp deleted file mode 100644 index 1638d3fa534..00000000000 --- a/src/plugins/intel_gpu/include/intel_gpu/primitives/generic_layer.hpp +++ /dev/null @@ -1,104 +0,0 @@ -// Copyright (C) 2018-2023 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// - -#pragma once -#include "intel_gpu/primitives/primitive.hpp" -#include "intel_gpu/runtime/memory.hpp" - -#include - -namespace cldnn { - -struct WeightsReorderParams { - WeightsReorderParams(layout in_layout, layout out_layout) : _in_layout(in_layout), _out_layout(out_layout) {} - - virtual size_t hash() const { - return hash_combine(_in_layout.hash(), _out_layout.hash()); - } - - virtual bool operator==(const WeightsReorderParams& rhs) const { - if (typeid(*this) != typeid(rhs)) - return false; - - return _in_layout == rhs._in_layout && - _out_layout == rhs._out_layout; - } - - layout get_input_layout() const { return _in_layout; } - layout get_output_layout() const { return _out_layout; } - - virtual ~WeightsReorderParams() = default; - -protected: - layout _in_layout; - layout _out_layout; -}; - -/// @brief Changes how data is ordered in memory. Value type is not changed & all information is preserved. -/// @details Corresponding values are bitwise equal before/after reorder. -struct generic_layer : public primitive_base { - CLDNN_DECLARE_PRIMITIVE(generic_layer) - - generic_layer() : primitive_base("", {}) {} - - DECLARE_OBJECT_TYPE_SERIALIZATION - - /// @brief Constructs generic_layer primitive which takes mean subtract values from another primitive. - /// @param id This primitive id. - /// @param input Input primitive id. - /// @param output_layout Requested memory layout. - /// @param mean Primitive id to get mean subtract values. 
-    generic_layer(const primitive_id& id,
-                  const primitive_id& input,
-                  std::shared_ptr<WeightsReorderParams> params,
-                  const padding& output_padding = padding())
-        : primitive_base(id, {input}, {output_padding}), params(params) {}
-
-    std::shared_ptr<WeightsReorderParams> params;
-
-    size_t hash() const override {
-        size_t seed = primitive::hash();
-
-        if (params)
-            seed = hash_combine(seed, params->hash());
-
-        return seed;
-    }
-
-    bool operator==(const primitive& rhs) const override {
-        if (!compare_common_params(rhs))
-            return false;
-
-        auto rhs_casted = downcast<const generic_layer&>(rhs);
-
-        if ((params == nullptr) != (rhs_casted.params == nullptr))
-            return false;
-
-        if (params != nullptr)
-            return *params == *rhs_casted.params;
-
-        return true;
-    }
-
-    void save(BinaryOutputBuffer& ob) const override {
-        primitive_base<generic_layer>::save(ob);
-        ob << params->get_input_layout();
-        ob << params->get_output_layout();
-    }
-
-    void load(BinaryInputBuffer& ib) override {
-        primitive_base<generic_layer>::load(ib);
-        layout input_layout, output_layout;
-        ib >> input_layout;
-        ib >> output_layout;
-        params = std::make_shared<WeightsReorderParams>(input_layout, output_layout);
-    }
-
-protected:
-    std::vector<std::reference_wrapper<const primitive_id>> get_dependencies() const override { return {}; }
-};
-/// @}
-/// @}
-/// @}
-}  // namespace cldnn
diff --git a/src/plugins/intel_gpu/include/intel_gpu/primitives/reorder.hpp b/src/plugins/intel_gpu/include/intel_gpu/primitives/reorder.hpp
index df30fdd9920..0e18063c443 100644
--- a/src/plugins/intel_gpu/include/intel_gpu/primitives/reorder.hpp
+++ b/src/plugins/intel_gpu/include/intel_gpu/primitives/reorder.hpp
@@ -19,6 +19,44 @@ enum class reorder_mean_mode {
     div,     // val/mean
 };
 
+struct WeightsReorderParams {
+    WeightsReorderParams(const layout& in_layout, const layout& out_layout, bool transposed, bool grouped = false)
+        : _in_layout(in_layout),
+          _out_layout(out_layout),
+          _transposed(transposed),
+          _grouped(grouped) {}
+
+    size_t hash() const {
+        size_t seed = hash_combine(_in_layout.hash(), _out_layout.hash());
+        seed = hash_combine(seed, _transposed);
+        seed = hash_combine(seed, _grouped);
+        return seed;
+    }
+
+    bool operator==(const WeightsReorderParams& rhs) const {
+        if (typeid(*this) != typeid(rhs))
+            return false;
+
+        return _in_layout == rhs._in_layout &&
+               _out_layout == rhs._out_layout &&
+               _transposed == rhs._transposed &&
+               _grouped == rhs._grouped;
+    }
+
+    layout get_input_layout() const { return _in_layout; }
+    layout get_output_layout() const { return _out_layout; }
+    bool should_be_transposed() const { return _transposed; }
+    bool get_grouped() const { return _grouped; }
+
+    void set_input_layout(const layout& layout) { _in_layout = layout; }
+
+protected:
+    layout _in_layout;
+    layout _out_layout;
+    bool _transposed;
+    bool _grouped;
+};
+
 /// @brief Changes how data is ordered in memory. Value type is not changed & all information is preserved.
 /// @details Corresponding values are bitwise equal before/after reorder.
 ///          Also merged with subtraction layer, which can subtract, multiply or divide values based on mean_mode value, while doing reordering.
@@ -144,16 +182,32 @@ struct reorder : public primitive_base<reorder> {
           mean(mean),
           mean_mode(mode) {}
 
+    /// @brief Constructs weights reorder primitive.
+    /// @param id This primitive id.
+    /// @param input Input primitive id.
+    /// @param weights_reorder_params Parameters required for weights reordering.
+    reorder(const primitive_id& id,
+            const input_info& input,
+            std::shared_ptr<WeightsReorderParams> weights_reorder_params)
+        : primitive_base(id, {input}),
+          output_format(weights_reorder_params->get_output_layout().format),
+          mean(""),
+          subtract_per_feature({}),
+          mean_mode(reorder_mean_mode::none),
+          weights_reorder_params(weights_reorder_params) {}
+
     /// @brief Requested memory format.
     format output_format;
-    /// @brief Primitive id to get mean subtract values. Ignored if subtract_per_featrue is set.
+    /// @brief Primitive id to get mean subtract values. Ignored if subtract_per_feature is set.
     primitive_id mean;
     /// @brief Array of mean subtract values.
     std::vector<float> subtract_per_feature;
-    /// @brief Mode of mean execution
+    /// @brief Mode of mean execution.
     reorder_mean_mode mean_mode;
-    /// @brief Input memory type
+    /// @brief Input memory type.
     memory_type input_mem_type = memory_type::buffer;
+    /// @brief Parameters required for weights reordering.
+    std::shared_ptr<WeightsReorderParams> weights_reorder_params = {};
 
     inline bool has_surface_input() const {
         return input.size() == 1 &&
@@ -170,6 +224,10 @@ struct reorder : public primitive_base<reorder> {
         seed = hash_combine(seed, truncate);
         seed = hash_range(seed, subtract_per_feature.begin(), subtract_per_feature.end());
         seed = hash_combine(seed, mean.empty());
+
+        if (weights_reorder_params) {
+            seed = hash_combine(seed, weights_reorder_params->hash());
+        }
 
         return seed;
     }
@@ -179,11 +237,18 @@ struct reorder : public primitive_base<reorder> {
         auto rhs_casted = downcast<const reorder&>(rhs);
 
+        bool reorder_weights_eq = (weights_reorder_params == nullptr) == (rhs_casted.weights_reorder_params == nullptr);
+        if (reorder_weights_eq && weights_reorder_params) {
+            reorder_weights_eq = *weights_reorder_params == *rhs_casted.weights_reorder_params;
+        }
+
         return subtract_per_feature == rhs_casted.subtract_per_feature &&
                mean_mode == rhs_casted.mean_mode &&
                input_mem_type == rhs_casted.input_mem_type &&
                truncate == rhs_casted.truncate &&
-               mean.empty() == rhs_casted.mean.empty();
+               output_format == rhs_casted.output_format &&
+               mean.empty() == rhs_casted.mean.empty() &&
+               reorder_weights_eq;
     }
 
     void save(BinaryOutputBuffer& ob) const override {
diff --git a/src/plugins/intel_gpu/include/intel_gpu/runtime/format.hpp b/src/plugins/intel_gpu/include/intel_gpu/runtime/format.hpp
index 5403d89ee1f..d6fcc41c0db 100644
--- a/src/plugins/intel_gpu/include/intel_gpu/runtime/format.hpp
+++ b/src/plugins/intel_gpu/include/intel_gpu/runtime/format.hpp
@@ -217,6 +217,7 @@ struct format {
         iy_xs_os_xsv2_osv16__ao32,
         i_yxs_os_yxsv2_osv16,
         os_i_yxs_osv4_yxsv4,
+        os_i_osv16,                                   ///< format used only for fully connected weights
         os_i_osv16__ai8,                              ///< format used only for fully connected weights
         os_i_osv8__ai8,                               ///< format used only for fully connected weights
         os_y_is_x_osv8_isv2,
diff --git a/src/plugins/intel_gpu/include/intel_gpu/runtime/tensor.hpp b/src/plugins/intel_gpu/include/intel_gpu/runtime/tensor.hpp
index f2c915c7f2e..36189a26d85 100644
--- a/src/plugins/intel_gpu/include/intel_gpu/runtime/tensor.hpp
+++ b/src/plugins/intel_gpu/include/intel_gpu/runtime/tensor.hpp
@@ -471,13 +471,12 @@ public:
      * @endcode
      */
     tensor transform(cldnn::format new_fmt, value_type default_size) const {
-        cldnn::format format = cldnn::format::bfvuwzyx;
-        auto val_order = format.internal_order();
+        cldnn::format default_fmt = cldnn::format::bfvuwzyx;
+        auto val_order = default_fmt.internal_order();
         auto new_order = new_fmt.internal_order();
         std::vector<value_type> old_sizes = sizes();
         std::vector<value_type> new_sizes(old_sizes.size(), default_size);
         const auto& new_traits = format::traits(new_fmt);
-        const cldnn::format default_fmt = cldnn::format::bfvuwzyx;
         static const std::map<char, char> flatten_mapping = {
             { 'v', 'u'},
             { 'u', 'w'},
diff --git a/src/plugins/intel_gpu/src/graph/generic_layer.cpp b/src/plugins/intel_gpu/src/graph/generic_layer.cpp
deleted file mode 100644
index 88c1da6c58b..00000000000
--- a/src/plugins/intel_gpu/src/graph/generic_layer.cpp
+++ /dev/null
@@ -1,39 +0,0 @@
-// Copyright (C) 2018-2023 Intel Corporation
-// SPDX-License-Identifier: Apache-2.0
-//
-#include "generic_layer_inst.h"
-#include "primitive_type_base.h"
-
-#include "json_object.h"
-
-#include
-#include
-#include
-
-namespace cldnn {
-GPU_DEFINE_PRIMITIVE_TYPE_ID(generic_layer)
-
-generic_layer_node::typed_program_node(const std::shared_ptr<generic_layer> prim, program& prog)
-    : parent(prim, prog) {
-    can_share_buffer(false);
-}
-
-generic_layer_inst::typed_primitive_inst(network& network, generic_layer_node const& node)
-    : parent(network, node) {}
-
-generic_layer_inst::typed_primitive_inst(network& network)
-    : parent(network) {
-    _type = generic_layer::type_id();
-}
-
-std::string generic_layer_inst::to_string(generic_layer_node const& node) {
-    auto node_info = node.desc_to_json();
-
-    std::stringstream primitive_description;
-
-    node_info->dump(primitive_description);
-
-    return primitive_description.str();
-}
-
-}  // namespace cldnn
diff --git a/src/plugins/intel_gpu/src/graph/graph_optimizer/post_optimize_weights.cpp b/src/plugins/intel_gpu/src/graph/graph_optimizer/post_optimize_weights.cpp
index d75663aa45f..06b50782588 100644
--- a/src/plugins/intel_gpu/src/graph/graph_optimizer/post_optimize_weights.cpp
+++ b/src/plugins/intel_gpu/src/graph/graph_optimizer/post_optimize_weights.cpp
@@ -4,6 +4,8 @@
 #include "pass_manager.h"
 #include "program_helpers.h"
 
+#include "implementation_map.hpp"
+
 #include "convolution_inst.h"
 #include "binary_convolution_inst.h"
 #include "deconvolution_inst.h"
@@ -38,53 +40,82 @@ void post_optimize_weights::optimize_weights(T& node, program& p) {
     if (impl->is_dynamic()) return;
 
+    // Don't run impl selection to avoid double compilation of reorder kernels
+    // in main program and internal program for constant propagation
+    auto set_implementation = [&p, &impl](program_node& weights_reorder_node) {
+        if (!weights_reorder_node.is_constant()) {
+            auto factory = WeightsReordersFactory::get(impl_types::ocl, shape_types::static_shape);
+            auto reorder_kernel_params = impl->get_weights_reorder_kernel_params();
+            reorder_kernel_params->prog = &p;
+            auto reorder_impl = factory(*reorder_kernel_params);
+
+            weights_reorder_node.set_selected_impl(reorder_impl->clone());
+            if (auto impl = weights_reorder_node.get_selected_impl()) {
+                auto params = weights_reorder_node.get_kernel_impl_params();
+                p.get_kernels_cache().add_kernels_source(*params, impl->get_kernels_source());
+            }
+        }
+    };
+
     auto output_layout = node.get_output_layout();
     auto weights_reorder_params = impl->get_weights_reorder_params();
     for (auto i = offsets.weights_offset; i < offsets.bias_offset; i++) {
-        auto& weights_node = node.get_dependency(i);
+        program_node& prev_node = node.get_dependency(i);
 
-        auto reorder = _rf.get_weights_reorder(weights_node.id(), weights_reorder_params);
+        if (weights_reorder_params != nullptr) {
+            bool can_be_fused = prev_node.is_type<reorder>() &&
+                                prev_node.get_users().size() == 1 &&
+                                prev_node.get_dependencies().size() == 1 &&
+                                !prev_node.has_fused_primitives() &&
+                                !prev_node.as<reorder>().has_mean() &&
+                                prev_node.as<reorder>().get_primitive()->subtract_per_feature.empty();
+            if (can_be_fused) {
+                // Need to update the input data_type so that the format reorder can be correctly merged with the precision reorder
+                data_types input_dtype = prev_node.get_input_layouts()[0].data_type;
+                auto updated_input_layout = weights_reorder_params->get_input_layout();
+                updated_input_layout.data_type = input_dtype;
+                weights_reorder_params->set_input_layout(updated_input_layout);
 
-        if (reorder.first) {
-            // insert new generic_layer node to topology
-            p.add_intermediate(reorder.first, node, i, !reorder.second);
-            // set generic_layer's node output layout and implementation
-            auto& g_node = node.get_dependency(i);
-            g_node.get_output_layout(false);
+                auto weights_reorder = _rf.get_weights_reorder(prev_node.get_primitive()->input[0].pid,
+                                                               weights_reorder_params);
+                auto& weights_reorder_node = p.get_or_create(weights_reorder.first);
+                p.replace(prev_node, weights_reorder_node);
+                weights_reorder_node.recalc_output_layout(false);
 
-            // Don't run impl selection to avoid double compilation of reorder kernels
-            // in main program and internal program for constant propagation
-            if ((!g_node.is_constant()) && (!reorder.second)) {
-                g_node.set_selected_impl(g_node.type()->choose_impl(g_node));
-                if (auto impl = g_node.get_selected_impl()) {
-                    auto params = g_node.get_kernel_impl_params();
-                    p.get_kernels_cache().add_kernels_source(*params, impl->get_kernels_source());
+                if (!weights_reorder.second) {
+                    set_implementation(weights_reorder_node);
+                }
+            } else {
+                auto weights_reorder = _rf.get_weights_reorder(prev_node.id(), weights_reorder_params);
+                // insert new weights reorder node to topology
+                p.add_intermediate(weights_reorder.first, node, i, !weights_reorder.second);
+                // set weights reorder's node output layout and implementation
+                auto& weights_reorder_node = node.get_dependency(i);
+                weights_reorder_node.get_output_layout(false);
+
+                if (!weights_reorder.second) {
+                    set_implementation(weights_reorder_node);
+                }
             }
         }
     }
-
-    // Reset weights reorder params to not keep source code pointer
-    impl->reset_weights_reorder_params();
-
     // set the old output layout and do not invalidate users as change of weights will not affect output layout
     node.set_output_layout(output_layout, false);
 }
 
 void post_optimize_weights::run(program& p) {
     for (auto& node : p.get_processing_order()) {
-        if (node->type() == convolution::type_id()) {
+        if (node->is_type<convolution>()) {
             optimize_weights(node->as<convolution>(), p);
-        }
-        if (node->type() == binary_convolution::type_id()) {
+        } else if (node->is_type<binary_convolution>()) {
             optimize_weights(node->as<binary_convolution>(), p);
-        } else if (node->type() == deconvolution::type_id()) {
+        } else if (node->is_type<deconvolution>()) {
             optimize_weights(node->as<deconvolution>(), p);
-        } else if (node->type() == deformable_conv::type_id()) {
+        } else if (node->is_type<deformable_conv>()) {
             optimize_weights(node->as<deformable_conv>(), p);
-        } else if (node->type() == fully_connected::type_id()) {
+        } else if (node->is_type<fully_connected>()) {
             optimize_weights(node->as<fully_connected>(), p);
-        } else if (node->type() == lstm_dynamic_input::type_id()) {
+        } else if (node->is_type<lstm_dynamic_input>()) {
             optimize_weights(node->as<lstm_dynamic_input>(), p);
         }
     }
diff --git a/src/plugins/intel_gpu/src/graph/graph_optimizer/pre_replace_deconv.cpp b/src/plugins/intel_gpu/src/graph/graph_optimizer/pre_replace_deconv.cpp
index f9a6978f065..32c1c4ba12f 100644
--- a/src/plugins/intel_gpu/src/graph/graph_optimizer/pre_replace_deconv.cpp
+++ b/src/plugins/intel_gpu/src/graph/graph_optimizer/pre_replace_deconv.cpp
@@ -25,12 +25,10 @@ void pre_replace_deconv::run(program& p) {
     while (itr != p.nodes_map.end()) {
         auto node_itr = itr++;
         auto& node = (*node_itr).second;
-        // find deconvolution primitives with stride
1 and change them to convolution with trasposed weights + // find deconvolution primitives with stride 1 and change them to convolution with transposed weights if (node->is_type()) { if (node->is_dynamic()) continue; - if (!p.get_config().get_property(ov::intel_gpu::optimize_data)) - continue; auto& deconv_node = node->as(); auto& weights_node = deconv_node.weights(); @@ -61,7 +59,6 @@ void pre_replace_deconv::run(program& p) { if (!perform_opt) continue; - // setting convolution parameters based on deconvolution params auto output_layout = deconv_node.get_output_layout(); auto output_pshape = output_layout.get_partial_shape(); @@ -73,8 +70,7 @@ void pre_replace_deconv::run(program& p) { auto output_padding = deconv_prim->output_paddings[0]; auto grouped_weights_shape = deconv_prim->grouped_weights_shape; - // remove deconvolution node and its connections to weights and biases, rename it and move to the optimized - // list + // remove deconvolution node and its connections to weights and biases, rename it and move to the optimized list p.remove_connection(input_node, deconv_node); std::vector> weight_connections; for (auto& weights_id : weights_nodes_id) { diff --git a/src/plugins/intel_gpu/src/graph/graph_optimizer/propagate_constants.cpp b/src/plugins/intel_gpu/src/graph/graph_optimizer/propagate_constants.cpp index 82d01394e88..a7eab103455 100644 --- a/src/plugins/intel_gpu/src/graph/graph_optimizer/propagate_constants.cpp +++ b/src/plugins/intel_gpu/src/graph/graph_optimizer/propagate_constants.cpp @@ -166,7 +166,7 @@ void propagate_constants::add_deps_to_tpl(program& prog, const std::vectoris_type()) { auto dep_ptr = prog.get_node_ptr(dep.first->get_primitive()->id); if (nodes.find(dep_ptr) == nodes.end()) { - nodes.insert(prog.get_node_ptr(dep.first->get_primitive()->id)); + nodes.insert(dep_ptr); const_inputs.push_back(&dep.first->as()); } } diff --git a/src/plugins/intel_gpu/src/graph/graph_optimizer/remove_redundant_reorders.cpp b/src/plugins/intel_gpu/src/graph/graph_optimizer/remove_redundant_reorders.cpp index 92e0044f923..280b99c527a 100644 --- a/src/plugins/intel_gpu/src/graph/graph_optimizer/remove_redundant_reorders.cpp +++ b/src/plugins/intel_gpu/src/graph/graph_optimizer/remove_redundant_reorders.cpp @@ -275,7 +275,9 @@ void remove_redundant_reorders::run(program& p) { !r_node.get_primitive()->subtract_per_feature.empty() || no_output_optimization || r_node.has_fused_primitives() || - r_node.get_primitive()->has_surface_input()) + r_node.get_primitive()->has_surface_input() || + (r_node.get_primitive()->weights_reorder_params && + r_node.get_primitive()->weights_reorder_params->should_be_transposed())) continue; auto o_layout = r_node.get_output_layout(); diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/fully_connected.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/fully_connected.cpp index 68f56f2379e..f6c4e19133e 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl/fully_connected.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/ocl/fully_connected.cpp @@ -60,13 +60,14 @@ public: feature = input0_pshape[primitive->input_size - 1ul]; } + // TO DO, to remove WA if (primitive->input_size > 3) { input0_layout.set_partial_shape(reshape_to_2d(input0_pshape, feature, primitive->input_size)); input0_layout.format = format::bfyx; } if (input1_pshape.size() != 2) { input1_layout.set_partial_shape(reshape_to_2d(input1_pshape, feature, primitive->weights_rank)); - input1_layout.format = format::bfyx; + // input1_layout.format = format::bfyx; } std::vector 
layouts{input0_layout, input1_layout}; diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/generic_layer.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/generic_layer.cpp deleted file mode 100644 index 93fe0d42b7e..00000000000 --- a/src/plugins/intel_gpu/src/graph/impls/ocl/generic_layer.cpp +++ /dev/null @@ -1,137 +0,0 @@ -// Copyright (C) 2018-2023 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// - -#include "primitive_base.hpp" - -#include "generic_layer_inst.h" - -namespace cldnn { -namespace ocl { - -struct generic_layer_impl : typed_primitive_impl { - using parent = typed_primitive_impl; - using parent::parent; - - kernel_selector::cl_kernel_data _cl_kernel_data; - kernel::ptr _kernel; - kernel_id _cached_kernel_id; - - DECLARE_OBJECT_TYPE_SERIALIZATION - - std::unique_ptr clone() const override { - return make_unique(*this); - } - - generic_layer_impl() : parent() {} - - generic_layer_impl(const generic_layer_impl& other) - : _cl_kernel_data(other._cl_kernel_data) - , _kernel(nullptr) - , _cached_kernel_id(other._cached_kernel_id) { - OPENVINO_ASSERT(other._kernel, "[GPU] Can't copy generic_layer_impl node: kernel is nullptr"); - _kernel = other._kernel->clone(); - } - - generic_layer_impl(const kernel_impl_params& params) - : _cl_kernel_data() - , _kernel(nullptr) - , _cached_kernel_id() { - auto reorder_params = params.typed_desc()->params; - auto casted_params = std::dynamic_pointer_cast(reorder_params); - OPENVINO_ASSERT(casted_params, "[GPU] Invalid weights reorder parameters type for ", params.desc->id, " node"); - _cl_kernel_data = *casted_params->get_cl_kernel(); - } - - std::vector> get_kernels_source() override { - std::vector> kernel_strings; - kernel_strings.push_back(_cl_kernel_data.code.kernelString); - return kernel_strings; - } - - std::vector get_kernels() const override { - return {_kernel}; - } - - void save(BinaryOutputBuffer& ob) const override { - ob << _cl_kernel_data; - ob << _cached_kernel_id; - } - - void load(BinaryInputBuffer& ib) override { - ib >> _cl_kernel_data; - ib >> _cached_kernel_id; - } - - void init_kernels(const kernels_cache& kernels_cache, const kernel_impl_params& params) override { - _kernel = nullptr; - auto compiled_kernels = kernels_cache.get_kernels(params); - OPENVINO_ASSERT(compiled_kernels.size() == 1, "[GPU] Unexpected number of kernels for generic_layer during init_kernels() call"); - _kernel = compiled_kernels.front(); - } - - void init_by_cached_kernels(const kernels_cache& kernels_cache) override { - _kernel = kernels_cache.get_kernel_from_cached_kernels(_cached_kernel_id); - } - - void set_cached_kernel_ids(const kernels_cache& kernels_cache) override { - _cached_kernel_id = kernels_cache.get_cached_kernel_id(_kernel); - } - - void set_kernels(cldnn::kernels_cache::compiled_kernels kernels) override { - OPENVINO_ASSERT(kernels.size() == 1 && - kernels.begin()->second.size() == 1, "[GPU] Unexpected number of kernels for generic_layer"); - _kernel = kernels.begin()->second[0].first; - } - - void set_arguments_impl(generic_layer_inst& instance) override { - kernel_arguments_data args; - args.scalars = &_cl_kernel_data.params.scalars; - - for (size_t i = 0; i < instance.inputs_memory_count(); i++) { - args.inputs.push_back(instance.input_memory_ptr(i)); - } - args.outputs.push_back(instance.output_memory_ptr()); - - set_arguments_impl(instance, args); - } - - void set_arguments_impl(generic_layer_inst& instance, kernel_arguments_data& args) override { - stream& stream = instance.get_network().get_stream(); - 
stream.set_arguments(*_kernel, _cl_kernel_data.params, args); - } - - event::ptr execute_impl(const std::vector& events, generic_layer_inst& instance) override { - stream& stream = instance.get_network().get_stream(); - kernel_arguments_data args; - args.scalars = &_cl_kernel_data.params.scalars; - - for (size_t i = 0; i < instance.inputs_memory_count(); i++) { - args.inputs.push_back(instance.input_memory_ptr(i)); - } - args.outputs.push_back(instance.output_memory_ptr()); - return stream.enqueue_kernel(*_kernel, _cl_kernel_data.params, args, events, true); - } - - static std::unique_ptr create(const kernel_impl_params& params) { - return make_unique(params); - } -}; - -static std::unique_ptr create(const generic_layer_node& arg, const kernel_impl_params& params) { - return make_unique(params); -} - -namespace detail { -attach_generic_layer_impl::attach_generic_layer_impl() { - implementation_map::add(cldnn::impl_types::ocl, create, {}); - - WeightsReordersFactory::add(cldnn::impl_types::ocl, shape_types::static_shape, generic_layer_impl::create); -} - -} // namespace detail -} // namespace ocl -} // namespace cldnn - -BIND_BINARY_BUFFER_WITH_TYPE(cldnn::ocl::generic_layer_impl) -BIND_BINARY_BUFFER_WITH_TYPE(cldnn::generic_layer) diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/kernel_selector_helper.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/kernel_selector_helper.cpp index 1c2d3486a55..b2ab89d3b11 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl/kernel_selector_helper.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/ocl/kernel_selector_helper.cpp @@ -387,6 +387,7 @@ kernel_selector::weights_layout to_weights_layout(format f, bool is_grouped) { case format::iyxo: case format::fyxb: return kernel_selector::weights_layout::iyxo; + case format::oyxi: case format::byxf: return kernel_selector::weights_layout::oyxi; case format::byfx: @@ -408,6 +409,8 @@ kernel_selector::weights_layout to_weights_layout(format f, bool is_grouped) { return kernel_selector::weights_layout::os_is_yx_osv16_isv16; case format::os_iyx_osv32: return kernel_selector::weights_layout::os_iyx_osv32; + case format::os_iyx_osv32__ai32: + return kernel_selector::weights_layout::os_iyx_osv32__ai32; case format::os_iyx_osv64: return kernel_selector::weights_layout::os_iyx_osv64; case format::image_2d_weights_c4_fyx_b: @@ -509,18 +512,26 @@ kernel_selector::weights_layout to_weights_layout(format f, bool is_grouped) { return kernel_selector::weights_layout::os_i_osv8__ai8; case format::os_i_osv16__ai8: return kernel_selector::weights_layout::os_i_osv16__ai8; - case format::bs_f_bsv16: + case format::os_i_osv16: return kernel_selector::weights_layout::os_i_osv16; case format::os_is_zyx_isv16_osv16: return kernel_selector::weights_layout::os_is_zyx_isv16_osv16; case format::is_os_zyx_isv16_osv16: return kernel_selector::weights_layout::is_os_zyx_isv16_osv16; + case format::os_is_zyx_osv32_isv16: + return kernel_selector::weights_layout::os_is_zyx_osv32_isv16; case format::is_os_yx_isv16_osv16: return kernel_selector::weights_layout::is_os_yx_isv16_osv16; case format::is_os_yx_isv16_osv8: return kernel_selector::weights_layout::is_os_yx_isv16_osv8; + case format::i_yxs_os_yxsv2_osv16: + return kernel_selector::weights_layout::i_yxs_os_yxsv2_osv16; case format::is_os_yx_osa4_isa8_osv8_isv4: return kernel_selector::weights_layout::is_os_yx_osa4_isa8_osv8_isv4; + case format::iy_xs_os_xsv2_osv8__ao32: + return kernel_selector::weights_layout::iy_xs_os_xsv2_osv8__ao32; + case format::iy_xs_os_xsv2_osv16__ao32: + 
return kernel_selector::weights_layout::iy_xs_os_xsv2_osv16__ao32; case format::os_is_osv32_isv32_swizzled_by_4: return kernel_selector::weights_layout::os_is_osv32_isv32_swizzled_by_4; case format::os_is_zyx_isv8_osv16_isv2: @@ -551,6 +562,12 @@ kernel_selector::weights_layout to_weights_layout(format f, bool is_grouped) { return kernel_selector::weights_layout::gs_oizyx_gsv16; case format::gs_oiyx_gsv32: return kernel_selector::weights_layout::gs_oiyx_gsv32; + case format::gs_oi_yxs_gsv4_yxsv4: + return kernel_selector::weights_layout::gs_oi_yxs_gsv4_yxsv4; + case format::gs_oi_yxs_gsv16_yxsv4: + return kernel_selector::weights_layout::gs_oi_yxs_gsv16_yxsv4; + case format::gs_oi_yxs_gsv32_yxsv4: + return kernel_selector::weights_layout::gs_oi_yxs_gsv32_yxsv4; case format::gs_oizyx_gsv32: return kernel_selector::weights_layout::gs_oizyx_gsv32; case format::gyxio: @@ -647,6 +664,12 @@ kernel_selector::weights_layout to_weights_layout(format f, bool is_grouped) { return kernel_selector::weights_layout::g_os_y_is_x_osv8_isv2; case format::g_os_y_is_x_osv8_isv4: return kernel_selector::weights_layout::g_os_y_is_x_osv8_isv4; + case format::g_os_is_yx_isv16_osv16: + return kernel_selector::weights_layout::g_os_is_yx_isv16_osv16; + case format::lstm_weights_dio: + return kernel_selector::weights_layout::dlstm_dir_io; + case format::os_i_yxs_osv4_yxsv4: + return kernel_selector::weights_layout::os_i_yxs_osv4_yxsv4; default: throw std::invalid_argument("Unable to convert tensor layout " + fmt_to_str(f) + " to weights layout"); } @@ -686,7 +709,7 @@ cldnn::format::type from_weights_layout(kernel_selector::weights_layout l) { case kernel_selector::weights_layout::os_iyx_osv64: return cldnn::format::os_iyx_osv64; case kernel_selector::weights_layout::os_i_osv16: - return cldnn::format::bs_f_bsv16; + return cldnn::format::os_i_osv16; case kernel_selector::weights_layout::os_i_osv8__ai8: return cldnn::format::os_i_osv8__ai8; case kernel_selector::weights_layout::os_i_osv16__ai8: @@ -775,6 +798,8 @@ cldnn::format::type from_weights_layout(kernel_selector::weights_layout l) { return cldnn::format::os_is_yx_osv32_isv32p; case kernel_selector::weights_layout::oizyx: return cldnn::format::oizyx; + case kernel_selector::weights_layout::iozyx: + return cldnn::format::iozyx; case kernel_selector::weights_layout::os_is_zyx_isv16_osv16: return cldnn::format::os_is_zyx_isv16_osv16; case kernel_selector::weights_layout::is_os_zyx_isv16_osv16: @@ -939,6 +964,8 @@ cldnn::format::type from_weights_layout(kernel_selector::weights_layout l) { return cldnn::format::g_os_y_is_x_osv8_isv2; case kernel_selector::weights_layout::g_os_y_is_x_osv8_isv4: return cldnn::format::g_os_y_is_x_osv8_isv4; + case kernel_selector::weights_layout::giozyx: + return cldnn::format::giozyx; default: throw std::invalid_argument("Unable to convert kernel selector Weights layout " + std::to_string(static_cast(l)) + " to cldnn format"); diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/kernel_selector_helper.h b/src/plugins/intel_gpu/src/graph/impls/ocl/kernel_selector_helper.h index 75dd3dcc2d4..4fa1181aaa7 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl/kernel_selector_helper.h +++ b/src/plugins/intel_gpu/src/graph/impls/ocl/kernel_selector_helper.h @@ -14,7 +14,7 @@ #include "intel_gpu/primitives/eltwise.hpp" #include "intel_gpu/primitives/quantize.hpp" #include "intel_gpu/primitives/activation.hpp" -#include "intel_gpu/primitives/generic_layer.hpp" +#include "intel_gpu/primitives/reorder.hpp" #include 
"intel_gpu/primitives/primitive.hpp" #include "kernel_selector_params.h" @@ -80,7 +80,6 @@ using multi_data_tensor = kernel_selector::MultiDataTensor; using params = kernel_selector::Params; using weights_reorder_params = kernel_selector::WeightsReorderParams; -using generic_kernel_params = kernel_selector::GenericKernelParams; } // namespace kernel_selector @@ -272,106 +271,12 @@ inline kernel_impl_params canonicalize_fused_shapes(const kernel_impl_params& im return updated_impl_params; } -class WeightsReorderParamsOCL : public WeightsReorderParams { -public: - explicit WeightsReorderParamsOCL(const kernel_selector::WeightsReorderParams& params) - : WeightsReorderParams(from_weights_tensor(params.src), from_weights_tensor(params.dest)) { - cl_kernel = params.clKernel; - } - - size_t hash() const override { - size_t seed = WeightsReorderParams::hash(); - - if (cl_kernel == nullptr) - return seed; - - seed = hash_combine(seed, cl_kernel->skip_execution); - - auto& gws = cl_kernel->params.workGroups.global; - seed = hash_range(seed, gws.begin(), gws.end()); - - auto& lws = cl_kernel->params.workGroups.local; - seed = hash_range(seed, lws.begin(), lws.end()); - - auto& arguments = cl_kernel->params.arguments; - for (auto& args : arguments) { - seed = hash_combine(seed, args.index); - seed = hash_combine(seed, args.t); - } - - auto& scalars = cl_kernel->params.scalars; - for (auto& s : scalars) { - seed = hash_combine(seed, s.t); - } - - return seed; - } - - bool operator==(const WeightsReorderParams& rhs) const override { - if (typeid(*this) != typeid(rhs)) - return false; - - if (!WeightsReorderParams::operator==(rhs)) - return false; - - auto rhs_casted = downcast(rhs); - - if (cl_kernel != nullptr && rhs_casted.cl_kernel != nullptr) { - auto& clKernel_rhs = rhs_casted.cl_kernel; - if (cl_kernel->skip_execution != clKernel_rhs->skip_execution) - return false; - - auto& gws = cl_kernel->params.workGroups.global; - auto& gws_rhs = clKernel_rhs->params.workGroups.global; - if (gws != gws_rhs) - return false; - - auto& lws = cl_kernel->params.workGroups.local; - auto& lws_rhs = clKernel_rhs->params.workGroups.local; - if (lws != lws_rhs) - return false; - - auto& arguments = cl_kernel->params.arguments; - auto& arguments_rhs = clKernel_rhs->params.arguments; - if (arguments.size() != arguments_rhs.size()) - return false; - - for (size_t idx = 0; idx < arguments.size(); idx++) { - if (arguments[idx].index != arguments_rhs[idx].index) - return false; - - if (arguments[idx].t != arguments_rhs[idx].t) - return false; - } - - auto& scalars = cl_kernel->params.scalars; - auto& scalars_rhs = clKernel_rhs->params.scalars; - if (scalars.size() != scalars_rhs.size()) - return false; - - for (size_t idx = 0; idx < scalars.size(); idx++) { - if (scalars[idx].t != scalars_rhs[idx].t) - return false; - } - } - - return true; - } - - std::shared_ptr get_cl_kernel() { - return cl_kernel; - } - -private: - std::shared_ptr cl_kernel; -}; - inline std::shared_ptr create_weights_reorder_params(const kernel_selector::WeightsReorderParams& params) { - if (params.engine == kernel_selector::generic_kernel_params::Engine::NONE) { + if (!params.is_initialized) { return nullptr; } - return std::make_shared(params); + return std::make_shared(from_weights_tensor(params.src), from_weights_tensor(params.dest), params.rotate); } } // namespace cldnn diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/primitive_base.hpp b/src/plugins/intel_gpu/src/graph/impls/ocl/primitive_base.hpp index 253d003ac22..55526e8ec4e 100644 --- 
a/src/plugins/intel_gpu/src/graph/impls/ocl/primitive_base.hpp +++ b/src/plugins/intel_gpu/src/graph/impls/ocl/primitive_base.hpp @@ -39,10 +39,7 @@ struct typed_primitive_impl_ocl : public typed_primitive_impl { // a pair of batch program hash and kernel entry hash of each ocl impl. std::pair kernel_dump_info; - typed_primitive_impl_ocl() : _kernel_data({}), _cached_kernel_ids({}), _kernels({}) { - _kernel_data.weightsReorderParams.engine = kernel_selector::generic_kernel_params::Engine::NONE; - _kernel_data.weightsReorderParams.clKernel = nullptr; - } + typed_primitive_impl_ocl() : _kernel_data({}), _cached_kernel_ids({}), _kernels({}) {} typed_primitive_impl_ocl(const typed_primitive_impl_ocl& other) : typed_primitive_impl(other._weights_reorder_params, other._kernel_name, other._is_dynamic) @@ -59,10 +56,6 @@ struct typed_primitive_impl_ocl : public typed_primitive_impl { typed_primitive_impl_ocl(const kernel_selector::kernel_data& kd) : typed_primitive_impl(create_weights_reorder_params(kd.weightsReorderParams), kd.kernelName), _kernel_data(kd) { - // weights reorder params got copied to parent, clear in _kernel_data to release shared ptr - _kernel_data.weightsReorderParams.engine = kernel_selector::generic_kernel_params::Engine::NONE; - _kernel_data.weightsReorderParams.clKernel = nullptr; - this->can_reuse_memory = _kernel_data.can_reuse_memory; } diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/register.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/register.cpp index e6e0d575b29..3a287bdeda4 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl/register.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/ocl/register.cpp @@ -81,7 +81,6 @@ void register_implementations() { REGISTER_OCL(tile); REGISTER_OCL(lstm_dynamic_input); REGISTER_OCL(lstm_dynamic_timeloop); - REGISTER_OCL(generic_layer); REGISTER_OCL(gather_tree); REGISTER_OCL(resample); REGISTER_OCL(grn); diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/register.hpp b/src/plugins/intel_gpu/src/graph/impls/ocl/register.hpp index 13fffc8b786..a5fb5a5817e 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl/register.hpp +++ b/src/plugins/intel_gpu/src/graph/impls/ocl/register.hpp @@ -28,7 +28,6 @@ #include "intel_gpu/primitives/experimental_detectron_topk_rois.hpp" #include "intel_gpu/primitives/eye.hpp" #include "intel_gpu/primitives/fully_connected.hpp" -#include "intel_gpu/primitives/generic_layer.hpp" #include "intel_gpu/primitives/gather.hpp" #include "intel_gpu/primitives/gather_elements.hpp" #include "intel_gpu/primitives/gather_nd.hpp" @@ -162,7 +161,6 @@ REGISTER_OCL(strided_slice); REGISTER_OCL(tile); REGISTER_OCL(lstm_dynamic_input); REGISTER_OCL(lstm_dynamic_timeloop); -REGISTER_OCL(generic_layer); REGISTER_OCL(gather_tree); REGISTER_OCL(resample); REGISTER_OCL(grn); diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/reorder.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/reorder.cpp index 47661b13b91..7ae0271fcd3 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl/reorder.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/ocl/reorder.cpp @@ -7,6 +7,7 @@ #include "reorder_inst.h" #include "reorder/reorder_kernel_selector.h" #include "reorder/reorder_kernel_base.h" +#include "reorder/reorder_weights_kernel_selector.h" namespace cldnn { namespace ocl { @@ -26,9 +27,10 @@ struct reorder_impl : typed_primitive_impl_ocl { protected: kernel_arguments_data get_arguments(const reorder_inst& instance) const override { kernel_arguments_data args = parent::get_arguments(instance); - auto input = 
&instance.input_memory(); - auto input_layout = input->get_layout(); - if (instance.has_mean()) { + if (instance.has_node() && instance.has_mean()) { + auto input = &instance.input_memory(); + auto input_layout = input->get_layout(); + if (input_layout.format == cldnn::format::nv12) { args.bias = instance.mean_nv12_memory(); } else { @@ -108,12 +110,45 @@ public: auto kernel_params = get_kernel_params(impl_param, true); (_kernel_data.update_dispatch_data_func)(kernel_params.first, _kernel_data); } + + static std::unique_ptr create(const reorder_node& arg, const kernel_impl_params& impl_param) { + bool is_reorder_weights = format::is_weights_format(impl_param.get_input_layout().format) || + format::is_weights_format(impl_param.get_output_layout().format); + if (is_reorder_weights) { + return create_reorder_weights(impl_param); + } else { + return typed_primitive_impl_ocl::create(arg, impl_param); + } + } + + static std::unique_ptr create_reorder_weights(const kernel_impl_params& impl_param) { + const auto& prim = impl_param.typed_desc(); + const auto& weights_params = prim->weights_reorder_params; + auto& kernel_selector = kernel_selector::ReorderWeightsKernelSelector::Instance(); + + OPENVINO_ASSERT(impl_param.get_input_layout().bytes_count() == weights_params->get_input_layout().bytes_count(), + "[GPU] Input layout doesn't match required reorder weights layout"); + + kernel_selector::reorder_weights_params r_params; + set_params(impl_param, r_params); + + r_params.input = convert_weights_tensor(weights_params->get_input_layout(), weights_params->get_grouped()); + r_params.output = convert_weights_tensor(weights_params->get_output_layout()); + r_params.layerID = impl_param.desc->id + "_reorder_weigths"; + r_params.uniqueID = std::to_string(impl_param.unique_id) + "_weight"; + r_params.rotate_180 = weights_params->should_be_transposed(); + + kernel_selector::reorder_optional_params optional_params; + auto best_kernel = kernel_selector.get_best_kernel(r_params, optional_params); + + return make_unique(best_kernel); + } }; namespace detail { attach_reorder_impl::attach_reorder_impl() { - implementation_map::add(impl_types::ocl, shape_types::static_shape, typed_primitive_impl_ocl::create, {}); + implementation_map::add(impl_types::ocl, shape_types::static_shape, reorder_impl::create, {}); auto types = { data_types::f32, @@ -129,7 +164,9 @@ attach_reorder_impl::attach_reorder_impl() { format::bfzyx, format::bfwzyx, }; - implementation_map::add(impl_types::ocl, shape_types::dynamic_shape, typed_primitive_impl_ocl::create, types, formats); + implementation_map::add(impl_types::ocl, shape_types::dynamic_shape, reorder_impl::create, types, formats); + + WeightsReordersFactory::add(cldnn::impl_types::ocl, shape_types::static_shape, reorder_impl::create_reorder_weights); } } // namespace detail diff --git a/src/plugins/intel_gpu/src/graph/impls/onednn/convolution_onednn.cpp b/src/plugins/intel_gpu/src/graph/impls/onednn/convolution_onednn.cpp index 82b2e04b1fb..7b7a344d8d9 100644 --- a/src/plugins/intel_gpu/src/graph/impls/onednn/convolution_onednn.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/onednn/convolution_onednn.cpp @@ -132,38 +132,17 @@ protected: return attrs; } - static kernel_selector::WeightsReorderParams get_weights_reorder(const kernel_impl_params& impl_params, const dnnl::primitive_desc& pd, bool rotate) { - kernel_selector::WeightsReorderParams weights_reorder_params; - auto& reorderKS = kernel_selector::ReorderWeightsKernelSelctor::Instance(); - 
kernel_selector::reorder_weights_params r_params; - + static std::shared_ptr get_weights_reorder(const kernel_impl_params& impl_params, const dnnl::primitive_desc& pd, bool rotate) { auto cldnn_prim = impl_params.typed_desc(); - auto weights_layout = impl_params.get_input_layout(1); - auto grouped_weights = format::is_grouped(weights_layout.format) || cldnn_prim->grouped_weights_shape; - cldnn::format out_fmt = onednn::find_format(pd.weights_desc(0), grouped_weights); - kernel_selector::WeightsLayout reqLayout = to_weights_layout(out_fmt, cldnn_prim->grouped_weights_shape); - set_params(impl_params, r_params); - r_params.layerID = cldnn_prim->id + "_reorder_"; - r_params.input = convert_weights_tensor(weights_layout, cldnn_prim->grouped_weights_shape); - r_params.output = r_params.input.TransformIgnorePadding(reqLayout, r_params.input.GetDType(), cldnn_prim->groups, false); - r_params.rotate_180 = rotate; + auto input_weights_layout = impl_params.get_input_layout(1); + auto grouped_weights = format::is_grouped(input_weights_layout.format) || cldnn_prim->grouped_weights_shape; + format out_fmt = onednn::find_format(pd.weights_desc(0), grouped_weights); - kernel_selector::reorder_optional_params op; - kernel_selector::KernelsData kernels_data = reorderKS.GetBestKernels(r_params, op); + auto output_weights_layout = input_weights_layout; + output_weights_layout.format = out_fmt; - if (kernels_data.empty()) { - throw std::runtime_error("No suitable kernel found for weights reorder from " + - kernel_selector::toString(r_params.input.GetLayout()) + " to " + - kernel_selector::toString(r_params.output.GetLayout())); - } - - weights_reorder_params.engine = kernel_selector::WeightsReorderParams::Engine::GPU; - weights_reorder_params.clKernel = std::make_shared(kernels_data[0].kernels[0]); - weights_reorder_params.src = r_params.input; - weights_reorder_params.dest = r_params.output; - - return weights_reorder_params; + return std::make_shared(input_weights_layout, output_weights_layout, rotate, grouped_weights); } public: diff --git a/src/plugins/intel_gpu/src/graph/impls/onednn/deconvolution_onednn.cpp b/src/plugins/intel_gpu/src/graph/impls/onednn/deconvolution_onednn.cpp index ee9c2009f34..6aaad001012 100644 --- a/src/plugins/intel_gpu/src/graph/impls/onednn/deconvolution_onednn.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/onednn/deconvolution_onednn.cpp @@ -53,38 +53,17 @@ protected: return arg.get_onednn_primitive_attributes(); } - static kernel_selector::WeightsReorderParams get_weights_reorder(const kernel_impl_params& impl_params, const dnnl::primitive_desc& pd) { - kernel_selector::WeightsReorderParams weights_reorder_params; - auto& reorderKS = kernel_selector::ReorderWeightsKernelSelctor::Instance(); - kernel_selector::reorder_weights_params r_params; - + static std::shared_ptr get_weights_reorder(const kernel_impl_params& impl_params, const dnnl::primitive_desc& pd) { auto cldnn_prim = impl_params.typed_desc(); - auto weights_layout = impl_params.get_input_layout(1); - auto grouped_weights = format::is_grouped(weights_layout.format) || cldnn_prim->grouped_weights_shape; - cldnn::format out_fmt = onednn::find_format(pd.weights_desc(0), grouped_weights); - kernel_selector::WeightsLayout reqLayout = to_weights_layout(out_fmt, cldnn_prim->grouped_weights_shape); - set_params(impl_params, r_params); - r_params.layerID = cldnn_prim->id + "_reorder_"; - r_params.input = convert_weights_tensor(weights_layout, cldnn_prim->grouped_weights_shape); - r_params.output = 
r_params.input.TransformIgnorePadding(reqLayout, r_params.input.GetDType(), cldnn_prim->groups, false); - r_params.rotate_180 = false; + auto input_weights_layout = impl_params.get_input_layout(1); + auto grouped_weights = format::is_grouped(input_weights_layout.format) || cldnn_prim->grouped_weights_shape; + format out_fmt = onednn::find_format(pd.weights_desc(0), grouped_weights); - kernel_selector::reorder_optional_params op; - kernel_selector::KernelsData kernels_data = reorderKS.GetBestKernels(r_params, op); + auto output_weights_layout = input_weights_layout; + output_weights_layout.format = out_fmt; - if (kernels_data.empty()) { - throw std::runtime_error("No suitable kernel found for weights reorder from " + - kernel_selector::toString(r_params.input.GetLayout()) + " to " + - kernel_selector::toString(r_params.output.GetLayout())); - } - - weights_reorder_params.engine = kernel_selector::WeightsReorderParams::Engine::GPU; - weights_reorder_params.clKernel = std::make_shared(kernels_data[0].kernels[0]); - weights_reorder_params.src = r_params.input; - weights_reorder_params.dest = r_params.output; - - return weights_reorder_params; + return std::make_shared(input_weights_layout, output_weights_layout, false, grouped_weights); } public: diff --git a/src/plugins/intel_gpu/src/graph/impls/onednn/fully_connected_onednn.cpp b/src/plugins/intel_gpu/src/graph/impls/onednn/fully_connected_onednn.cpp index 4d3faf1e9ac..44dab5cbc8f 100644 --- a/src/plugins/intel_gpu/src/graph/impls/onednn/fully_connected_onednn.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/onednn/fully_connected_onednn.cpp @@ -53,7 +53,7 @@ protected: return args; } - static kernel_selector::WeightsReorderParams get_weights_reorder(const kernel_impl_params& impl_params, const dnnl::primitive_desc& pd) { + static std::shared_ptr get_weights_reorder(const kernel_impl_params& impl_params, const dnnl::primitive_desc& pd) { auto input_layout = impl_params.get_input_layout(0); auto weights_layout = impl_params.get_input_layout(1); auto cldnn_prim = impl_params.typed_desc(); @@ -68,35 +68,12 @@ protected: weights_layout.set_partial_shape(reshape_to_2d(weights_pshape, feature)); } - kernel_selector::WeightsReorderParams weights_reorder_params; - auto& reorderKS = kernel_selector::ReorderWeightsKernelSelctor::Instance(); - kernel_selector::reorder_weights_params r_params; + format out_fmt = onednn::find_format(pd.weights_desc(0)); - cldnn::format out_fmt = onednn::find_format(pd.weights_desc(0)); - kernel_selector::WeightsLayout req_layout = to_weights_layout(out_fmt, false); + auto output_weights_layout = weights_layout; + output_weights_layout.format = out_fmt; - // set engine info & forcing - set_params(impl_params, r_params); - r_params.layerID = cldnn_prim->id + "_reorder_"; - r_params.input = convert_weights_tensor(weights_layout, false); - r_params.output = r_params.input.TransformIgnorePadding(req_layout, r_params.input.GetDType(), 1, false); - r_params.rotate_180 = false; - - kernel_selector::reorder_optional_params op; - kernel_selector::KernelsData kernels_data = reorderKS.GetBestKernels(r_params, op); - - if (kernels_data.empty()) { - throw std::runtime_error("No suitable kernel found for weights reorder from " + - kernel_selector::toString(r_params.input.GetLayout()) + " to " + - kernel_selector::toString(r_params.output.GetLayout())); - } - - weights_reorder_params.engine = kernel_selector::WeightsReorderParams::Engine::GPU; - weights_reorder_params.clKernel = std::make_shared(kernels_data[0].kernels[0]); - 
weights_reorder_params.src = r_params.input; - weights_reorder_params.dest = r_params.output; - - return weights_reorder_params; + return std::make_shared(weights_layout, output_weights_layout, false); } static std::shared_ptr get_fully_connected_primitive_descriptor(const kernel_impl_params& impl_params, diff --git a/src/plugins/intel_gpu/src/graph/impls/onednn/primitive_onednn_base.h b/src/plugins/intel_gpu/src/graph/impls/onednn/primitive_onednn_base.h index d7120ccd53f..17e1398ecb0 100644 --- a/src/plugins/intel_gpu/src/graph/impls/onednn/primitive_onednn_base.h +++ b/src/plugins/intel_gpu/src/graph/impls/onednn/primitive_onednn_base.h @@ -47,8 +47,8 @@ struct typed_primitive_onednn_impl : public typed_primitive_impl { const ExecutionConfig& config, std::shared_ptr attrs, const PrimDescType& pd, - kernel_selector::WeightsReorderParams weights_reorder = {}) - : typed_primitive_impl(create_weights_reorder_params(weights_reorder), pd.impl_info_str()), + std::shared_ptr weights_reorder = {}) + : typed_primitive_impl(weights_reorder, pd.impl_info_str()), _engine(&engine), _attrs(attrs), _pd(pd) { diff --git a/src/plugins/intel_gpu/src/graph/include/generic_layer_inst.h b/src/plugins/intel_gpu/src/graph/include/generic_layer_inst.h deleted file mode 100644 index 9022f9aadaf..00000000000 --- a/src/plugins/intel_gpu/src/graph/include/generic_layer_inst.h +++ /dev/null @@ -1,45 +0,0 @@ -// Copyright (C) 2018-2023 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// - -#pragma once -#include "intel_gpu/primitives/generic_layer.hpp" -#include "primitive_inst.h" - -#include -#include - -namespace cldnn { - -template <> -struct typed_program_node : public typed_program_node_base { - using parent = typed_program_node_base; - typed_program_node(const std::shared_ptr prim, program& prog); - -public: - using parent::parent; - - program_node& input() const { return get_dependency(0); } -}; - -using generic_layer_node = typed_program_node; - -template <> -class typed_primitive_inst : public typed_primitive_inst_base { - using parent = typed_primitive_inst_base; - using parent::parent; - -public: - static layout calc_output_layout(generic_layer_node const& node, kernel_impl_params const& impl_param) { - return impl_param.typed_desc()->params->get_output_layout(); - } - - static std::string to_string(generic_layer_node const& node); - - typed_primitive_inst(network& network, generic_layer_node const& node); - typed_primitive_inst(network& network); -}; - -using generic_layer_inst = typed_primitive_inst; - -} // namespace cldnn diff --git a/src/plugins/intel_gpu/src/graph/include/layout_optimizer.h b/src/plugins/intel_gpu/src/graph/include/layout_optimizer.h index de382fe3d51..fd048838c70 100644 --- a/src/plugins/intel_gpu/src/graph/include/layout_optimizer.h +++ b/src/plugins/intel_gpu/src/graph/include/layout_optimizer.h @@ -10,7 +10,6 @@ #include "intel_gpu/runtime/lru_cache.hpp" #include "data_inst.h" -#include "generic_layer_inst.h" #include "reorder_inst.h" #include "convolution_inst.h" #include "deconvolution_inst.h" @@ -73,7 +72,6 @@ private: }; std::map> _cached_reorders; - std::map> _cached_generic_reorders; }; class layout_optimizer { diff --git a/src/plugins/intel_gpu/src/graph/include/primitive_inst.h b/src/plugins/intel_gpu/src/graph/include/primitive_inst.h index a5583ae7d90..cefdfee4503 100644 --- a/src/plugins/intel_gpu/src/graph/include/primitive_inst.h +++ b/src/plugins/intel_gpu/src/graph/include/primitive_inst.h @@ -5,7 +5,6 @@ #pragma once #include 
"intel_gpu/primitives/primitive.hpp" #include "intel_gpu/primitives/concatenation.hpp" -#include "intel_gpu/primitives/generic_layer.hpp" #include "intel_gpu/runtime/event.hpp" #include "intel_gpu/runtime/memory.hpp" #include "intel_gpu/runtime/lru_cache.hpp" @@ -99,7 +98,6 @@ struct primitive_impl { bool need_weights_reorder() const { return _weights_reorder_params != nullptr; } std::shared_ptr get_weights_reorder_params() const { return _weights_reorder_params; } - void reset_weights_reorder_params() { _weights_reorder_params = nullptr; } std::shared_ptr get_weights_reorder_kernel_params() const; @@ -232,6 +230,7 @@ public: bool is_constant() const { return _is_constant; } bool needs_completion_event() const { return _needs_completion_event; } bool has_unfused_subgraph() const { return (_unfused_subgraph != nullptr); } + bool has_node() const { return _node != nullptr; } bool has_inner_networks() const; void allocate_internal_buffers(bool reset = true); static memory::ptr allocate_output(engine& engine, memory_pool& pool, const program_node& _node, const kernel_impl_params& impl_params, uint32_t net_id, diff --git a/src/plugins/intel_gpu/src/graph/include/program_node.h b/src/plugins/intel_gpu/src/graph/include/program_node.h index 21937f94b65..a840467a63d 100644 --- a/src/plugins/intel_gpu/src/graph/include/program_node.h +++ b/src/plugins/intel_gpu/src/graph/include/program_node.h @@ -56,7 +56,6 @@ struct program_node { friend class prepare_conv_eltw_fusing; // to be removed when possible friend class prepare_conv_eltw_read_write_opt; // to be removed when possible friend class propagate_constants; // to be removed when possible - friend class post_optimize_weights; // to be removed when possible - requires an access to selected_impl template friend struct typed_program_node; diff --git a/src/plugins/intel_gpu/src/graph/include/reorder_inst.h b/src/plugins/intel_gpu/src/graph/include/reorder_inst.h index c7b881ada90..f04cb7e7f5b 100644 --- a/src/plugins/intel_gpu/src/graph/include/reorder_inst.h +++ b/src/plugins/intel_gpu/src/graph/include/reorder_inst.h @@ -77,7 +77,9 @@ public: static std::string to_string(reorder_node const& node); public: + typed_primitive_inst(network& network); typed_primitive_inst(network& network, reorder_node const& node); + memory::ptr mean_nv12_memory() const { return dep_memory_ptr(2); } memory::ptr mean_memory() const { return dep_memory_ptr(1); } diff --git a/src/plugins/intel_gpu/src/graph/layout_optimizer.cpp b/src/plugins/intel_gpu/src/graph/layout_optimizer.cpp index e57aa5118fc..0a10658419d 100644 --- a/src/plugins/intel_gpu/src/graph/layout_optimizer.cpp +++ b/src/plugins/intel_gpu/src/graph/layout_optimizer.cpp @@ -159,22 +159,18 @@ std::pair, bool> reorder_factory::get_reorder(primitive std::pair, bool> reorder_factory::get_weights_reorder(primitive_id input_id, std::shared_ptr reorder_params) { - if (reorder_params == nullptr) - return {}; + OPENVINO_ASSERT(reorder_params != nullptr, "[GPU] WeightsReorderParams is not initialized."); - layout expected_layout = reorder_params->get_output_layout(); - - cache_key ckey{ input_id, expected_layout, false }; - auto itr = _cached_generic_reorders.find(ckey); - if (itr != _cached_generic_reorders.end()) { + cache_key ckey{ input_id, reorder_params->get_output_layout(), false }; + auto itr = _cached_reorders.find(ckey); + if (itr != _cached_reorders.end()) { return std::make_pair(itr->second, true); } else { - auto count = _cached_generic_reorders.size(); - std::stringstream ss; - ss << input_id << 
"_generic_layer_" << count; + auto count = _cached_reorders.size(); + std::string reorder_id = input_id + "_weights_reorder_" + std::to_string(count); - auto reorder = std::make_shared(ss.str(), input_id, reorder_params); - _cached_generic_reorders[ckey] = reorder; + auto reorder = std::make_shared(reorder_id, input_id, reorder_params); + _cached_reorders[ckey] = reorder; return std::make_pair(reorder, false); } } @@ -942,8 +938,8 @@ bool layout_optimizer::deps_for_convolution_byxf_opt(program_node const& node, u return true; for (auto& dep : node.get_dependencies()) { - // skip data and generic_layers - if (dep.first->is_type() || dep.first->is_type()) + // skip data layers + if (dep.first->is_type()) continue; if (dep.first->is_type()) { diff --git a/src/plugins/intel_gpu/src/graph/primitive_inst.cpp b/src/plugins/intel_gpu/src/graph/primitive_inst.cpp index 91d6e389a87..bb20afe1e3d 100644 --- a/src/plugins/intel_gpu/src/graph/primitive_inst.cpp +++ b/src/plugins/intel_gpu/src/graph/primitive_inst.cpp @@ -5,7 +5,7 @@ #include "primitive_inst.h" #include "data_inst.h" #include "mutable_data_inst.h" -#include "generic_layer_inst.h" +#include "reorder_inst.h" #include "input_layout_inst.h" #include "arg_max_min_inst.h" #include "fully_connected_inst.h" @@ -116,7 +116,7 @@ std::shared_ptr primitive_impl::get_weights_reorder_kernel_p return nullptr; auto reorder_kernel_params = std::make_shared(); - auto prim = std::make_shared("", "", _weights_reorder_params); + auto prim = std::make_shared("", input_info(), _weights_reorder_params); reorder_kernel_params->desc = prim; reorder_kernel_params->unique_id = _weights_reorder_params->hash(); reorder_kernel_params->input_layouts.push_back(_weights_reorder_params->get_input_layout()); @@ -1000,6 +1000,9 @@ event::ptr primitive_inst::update_weights() { auto& engine = _network.get_engine(); auto reorder_kernel_params = _impl->get_weights_reorder_kernel_params(); + if (reorder_kernel_params) + reorder_kernel_params->prog = get_network().get_program().get(); + auto weights_idx = _node->get_primitive()->input.size(); auto original_weights_memory = dep_memory_ptr(weights_idx); auto original_layout = original_weights_memory->get_layout(); @@ -1028,7 +1031,7 @@ event::ptr primitive_inst::update_weights() { } else { GPU_DEBUG_PROFILED_STAGE_CACHE_HIT(false); auto& cache = get_network().get_program()->get_implementations_cache(); - auto reorder_inst = std::make_shared(get_network()); + auto reorder_inst = std::make_shared(get_network()); if (auto cached_impl = cache.get(*reorder_kernel_params)) { GPU_DEBUG_TRACE_DETAIL << id() << ": reorder weights (cached) from " << original_layout.to_short_string() @@ -1041,7 +1044,7 @@ event::ptr primitive_inst::update_weights() { auto factory = WeightsReordersFactory::get(impl_types::ocl, shape_types::static_shape); auto reorder_impl = factory(*reorder_kernel_params); auto& kernels_cache = get_network().get_program()->get_kernels_cache(); - auto kernels = kernels_cache.compile(*_impl_params, reorder_impl->get_kernels_source()); + auto kernels = kernels_cache.compile(*reorder_kernel_params, reorder_impl->get_kernels_source()); OPENVINO_ASSERT(kernels.size() == 1, "[GPU] Expected number of compiled kernels is 1, but got ", kernels.size()); reorder_impl->set_kernels(kernels); @@ -1152,10 +1155,11 @@ memory::ptr primitive_inst::allocate_output(engine& _engine, memory_pool& pool, : !usm_device_allocatable ? 
                                      lockable_mem_type : allocation_type::usm_device;
 
     if (is_internal) {
-        if (_node.can_be_optimized() || _node.is_type()) {
+        bool is_reorder_weights = _node.is_type() && _node.as().get_primitive()->weights_reorder_params;
+        if (_node.can_be_optimized() || is_reorder_weights) {
             GPU_DEBUG_LOG << "[" << _node.id() << ": output]" << std::endl;
             // Use usm_device memory for weights reordering
-            if (is_internal && _node.is_type() &&
+            if (is_internal && is_reorder_weights &&
                 _engine.supports_allocation(allocation_type::usm_device))
                 alloc_type = allocation_type::usm_device;
             return get_memory_from_pool(_engine,
@@ -1167,7 +1171,7 @@ memory::ptr primitive_inst::allocate_output(engine& _engine, memory_pool& pool,
                                         reset,
                                         curr_memory);
         } else {
-            if ((_node.is_output() && _node.is_type()) || (!_node.is_output() && _node.is_type()))
+            if ((_node.is_output() && is_reorder_weights) || (!_node.is_output() && _node.is_type()))
                 reset = false;
             GPU_DEBUG_LOG << "[" << _node.id() << ": constant]" << std::endl;
             return _engine.allocate_memory(layout, alloc_type, reset);
diff --git a/src/plugins/intel_gpu/src/graph/program.cpp b/src/plugins/intel_gpu/src/graph/program.cpp
index 28ca7e15118..66261fe9c13 100644
--- a/src/plugins/intel_gpu/src/graph/program.cpp
+++ b/src/plugins/intel_gpu/src/graph/program.cpp
@@ -1489,7 +1489,6 @@ void program::set_layout_optimizer_attributes(layout_optimizer& lo) {
             prim.type() != cldnn::grid_sample::type_id() &&
             prim.type() != cldnn::softmax::type_id() &&
             prim.type() != cldnn::fully_connected::type_id() &&
-            prim.type() != cldnn::generic_layer::type_id() &&
             prim.type() != cldnn::scatter_nd_update::type_id() &&
             prim.type() != cldnn::broadcast::type_id() &&
             prim.type() != cldnn::quantize::type_id() &&
@@ -1628,10 +1627,7 @@ std::pair program::get_estimated_device_mem_usage() {
         if (node->can_be_optimized())
             continue;
-        if (node->is_type() && node->get_users().size() == 1 && node->have_user_with_type()) {
-            continue;
-        }
-        if (node->is_type() || (node->is_type() && node->get_dependency(0).is_type())) {
+        if (node->is_type()) {
             const_sum += out_size;
         } else if (node->have_user_with_type() && node->get_users().size() == 1 && node->get_users().front()->can_be_optimized()) {
             continue;
diff --git a/src/plugins/intel_gpu/src/graph/reorder.cpp b/src/plugins/intel_gpu/src/graph/reorder.cpp
index bcd98cfc12c..bae4e428aba 100644
--- a/src/plugins/intel_gpu/src/graph/reorder.cpp
+++ b/src/plugins/intel_gpu/src/graph/reorder.cpp
@@ -19,7 +19,7 @@ layout reorder_inst::calc_output_layout(reorder_node const& node, kernel_impl_pa
     auto ifmt = input_layout.format;
     auto desc = impl_param.typed_desc();
-    auto odt = *desc->output_data_types[0];
+    auto odt = desc->output_data_types[0].value_or(input_layout.data_type);
     auto ofmt = desc->output_format;
     auto op = desc->output_paddings[0];
@@ -146,7 +146,11 @@ layout reorder_inst::calc_output_layout(reorder_node const& node, kernel_impl_pa
                       "Conversion of weights from winograd to standard domain is currently unsupported");
     }
 
-    if ((ofmt == format::bs_fs_fsv8_bsv8 || ofmt == format::os_i_osv8__ai8 || ofmt == format::os_i_osv16__ai8 || ofmt == format::bs_f_bsv16 ||
+    if (desc->weights_reorder_params) {
+        return desc->weights_reorder_params->get_output_layout();
+    }
+
+    if ((ofmt == format::bs_fs_fsv8_bsv8 || ofmt == format::os_i_osv8__ai8 || ofmt == format::os_i_osv16__ai8 || ofmt == format::os_i_osv16 ||
         ofmt == format::bfzyx || ifmt == format::bfzyx ||
         ofmt == format::b_fs_zyx_fsv16 || ifmt == format::b_fs_zyx_fsv16 ||
         ofmt == format::bs_fs_zyx_bsv16_fsv16 || ifmt == format::bs_fs_zyx_bsv16_fsv16 ||
         ofmt == format::bs_fs_zyx_bsv16_fsv32 || ifmt == format::bs_fs_zyx_bsv16_fsv32 ||
@@ -169,7 +173,11 @@ std::vector reorder_inst::calc_output_layouts(reorder_node const& /*node
     auto ifmt = input_layout.format;
     auto ofmt = desc->output_format == format::any ? ifmt : desc->output_format;
 
-    return { layout(input_layout.get(), desc->output_data_types[0].value(), ofmt, desc->output_paddings[0]) };
+    if (desc->weights_reorder_params) {
+        return { desc->weights_reorder_params->get_output_layout() };
+    } else {
+        return { layout(input_layout.get(), desc->output_data_types[0].value(), ofmt, desc->output_paddings[0]) };
+    }
 }
 
 std::string reorder_inst::to_string(reorder_node const& node) {
@@ -197,6 +205,10 @@ std::string reorder_inst::to_string(reorder_node const& node) {
     return primitive_description.str();
 }
 
+reorder_inst::typed_primitive_inst(network& network) : parent(network) {
+    _type = reorder::type_id();
+}
+
 reorder_inst::typed_primitive_inst(network& network, reorder_node const& node) :
         parent(network, node, (!node.can_be_optimized() && node.get_output_layout().is_static()) ? true : false)
         , _req_reinterpr(node.requires_reinterpret()) {
diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernel_selector_common.h b/src/plugins/intel_gpu/src/kernel_selector/kernel_selector_common.h
index 4d00949e44b..c08e1b78e7b 100644
--- a/src/plugins/intel_gpu/src/kernel_selector/kernel_selector_common.h
+++ b/src/plugins/intel_gpu/src/kernel_selector/kernel_selector_common.h
@@ -67,22 +67,14 @@ struct clKernelData {
     bool skip_execution = false;
 };
 
-////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-// GenericKernelParams
-////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-struct GenericKernelParams {
-    enum class Engine { NONE, GPU };
-
-    Engine engine = Engine::NONE;
-    std::shared_ptr clKernel;
-};
-
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 // WeightsReorderParams
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-struct WeightsReorderParams : public GenericKernelParams {
+struct WeightsReorderParams {
     WeightsTensor src;
     WeightsTensor dest;
+    bool rotate;
+    bool is_initialized = false;
 };
 
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernel_selector_utils.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernel_selector_utils.cpp
index b7d9cc13599..f1914ddfc78 100644
--- a/src/plugins/intel_gpu/src/kernel_selector/kernel_selector_utils.cpp
+++ b/src/plugins/intel_gpu/src/kernel_selector/kernel_selector_utils.cpp
@@ -4,6 +4,7 @@
 #include "kernel_selector_utils.h"
 #include "reorder/reorder_weights_kernel_selector.h"
+#include "reorder/reorder_kernel_selector.h"
 #include "reorder/reorder_kernel_base.h"
 #include "convolution/convolution_params.h"
 #include 
@@ -110,31 +111,10 @@ bool UpdateWeightsParams(weight_bias_params& newParams,
         if (!optParams.allowStaticInputReordering) {
             return false;
         }
-
-        auto& reorderKS = ReorderWeightsKernelSelctor::Instance();
-        reorder_weights_params r_params;
-
-        r_params.layerID = newParams.layerID + "_reorder_";
-        r_params.input = newParams.weights;
-        r_params.output = newParams.weights.TransformIgnorePadding(reqLayout, dtype, groups, false);
-        r_params.rotate_180 = rotate;
-        r_params.engineInfo = newParams.engineInfo;
-        r_params.uniqueID = newParams.uniqueID + "_weight";
-
-        reorder_optional_params op;
-        KernelsData kernels_data = reorderKS.GetBestKernels(r_params, op);
-
-        if (kernels_data.empty()) {
-            throw std::runtime_error("No suitable kernel found for weights reorder from " +
-                                     toString(r_params.input.GetLayout()) + " to " +
-                                     toString(r_params.output.GetLayout()) +
-                                     (rotate ? " with rotate" : ""));
-        }
-
-        weightsReorderParams.engine = WeightsReorderParams::Engine::GPU;
-        weightsReorderParams.clKernel = std::make_shared(kernels_data[0].kernels[0]);
-        weightsReorderParams.src = r_params.input;
-        weightsReorderParams.dest = r_params.output;
+        weightsReorderParams.is_initialized = true;
+        weightsReorderParams.src = newParams.weights;
+        weightsReorderParams.dest = newParams.weights.TransformIgnorePadding(reqLayout, dtype, groups, false);
+        weightsReorderParams.rotate = rotate;
 
         newParams.weights = newParams.weights.TransformIgnorePadding(reqLayout, dtype, groups);
         return true;
diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/reorder/reorder_weights_kernel_selector.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/reorder/reorder_weights_kernel_selector.cpp
index 9b0eab72110..3006063045f 100644
--- a/src/plugins/intel_gpu/src/kernel_selector/kernels/reorder/reorder_weights_kernel_selector.cpp
+++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/reorder/reorder_weights_kernel_selector.cpp
@@ -13,7 +13,7 @@
 namespace kernel_selector {
 
-ReorderWeightsKernelSelctor::ReorderWeightsKernelSelctor() {
+ReorderWeightsKernelSelector::ReorderWeightsKernelSelector() {
     Attach();
     Attach();
     Attach();
@@ -23,7 +23,7 @@ ReorderWeightsKernelSelctor::ReorderWeightsKernelSelctor() {
     Attach();
 }
 
-KernelsData ReorderWeightsKernelSelctor::GetBestKernels(const Params& params, const optional_params& options) const {
+KernelsData ReorderWeightsKernelSelector::GetBestKernels(const Params& params, const optional_params& options) const {
     return GetNaiveBestKernel(params, options, KernelType::REORDER);
 }
 } // namespace kernel_selector
diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/reorder/reorder_weights_kernel_selector.h b/src/plugins/intel_gpu/src/kernel_selector/kernels/reorder/reorder_weights_kernel_selector.h
index 5157c596faf..580aef24709 100644
--- a/src/plugins/intel_gpu/src/kernel_selector/kernels/reorder/reorder_weights_kernel_selector.h
+++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/reorder/reorder_weights_kernel_selector.h
@@ -7,16 +7,16 @@
 #include "kernel_selector.h"
 
 namespace kernel_selector {
-class ReorderWeightsKernelSelctor : public kernel_selector_base {
+class ReorderWeightsKernelSelector : public kernel_selector_base {
 public:
-    static ReorderWeightsKernelSelctor& Instance() {
-        static ReorderWeightsKernelSelctor instance_;
+    static ReorderWeightsKernelSelector& Instance() {
+        static ReorderWeightsKernelSelector instance_;
         return instance_;
     }
 
-    ReorderWeightsKernelSelctor();
+    ReorderWeightsKernelSelector();
 
-    virtual ~ReorderWeightsKernelSelctor() {}
+    virtual ~ReorderWeightsKernelSelector() {}
 
     KernelsData GetBestKernels(const Params& params, const optional_params& options) const override;
 };
diff --git a/src/plugins/intel_gpu/src/kernel_selector/tensor_type.h b/src/plugins/intel_gpu/src/kernel_selector/tensor_type.h
index e70e92a0e38..1a01d917e18 100644
--- a/src/plugins/intel_gpu/src/kernel_selector/tensor_type.h
+++ b/src/plugins/intel_gpu/src/kernel_selector/tensor_type.h
@@ -109,7 +109,7 @@ enum WeightsLayout {
     os_i_osv16__ai8,
    os_i_osv16,
     os_is_yx_osv16_isv2,
-    os_is_yx_osv16_isv16,  // wieghts for int8 blocked conv
+    os_is_yx_osv16_isv16,  // weights for int8 blocked conv
     os_is_zyx_osv16_isv16,
     os_is_zyx_osv32_isv16,
     os_is_zyx_osv64_isv16,
diff --git a/src/plugins/intel_gpu/src/runtime/format.cpp b/src/plugins/intel_gpu/src/runtime/format.cpp
index 1f9f127ba37..ca94a647e1a 100644
--- a/src/plugins/intel_gpu/src/runtime/format.cpp
+++ b/src/plugins/intel_gpu/src/runtime/format.cpp
@@ -108,9 +108,9 @@ static const std::map format_traits_map {
     FMT_TRAITS(os_is_zyx_isa8_osv16_isv4, 1, 1, 3, 0, {0, 1, 2, 3, 4}, "oizyx", "oixyz", {{1, 8}, {0, 16}, {1, 4}}),
     FMT_TRAITS(os_is_yx_osa4_isa8_osv8_isv4_swizzled_by_4, 1, 1, 2, 0, {0, 1, 2, 3}, "oiyx", "oixy", {{0, 32}, {1, 32}}),
     FMT_TRAITS(os_is_zyx_osa4_isa8_osv8_isv4_swizzled_by_4, 1, 1, 3, 0, {0, 1, 2, 3, 4}, "oizyx", "oixyz", {{0, 32}, {1, 32}}),
-    FMT_TRAITS(is_os_yx_osa4_isa8_osv8_isv4, 1, 1, 2, 0, {1, 0, 2, 3}, "ioyx", "ioxy", {{0, 4}, {1, 8}, {0, 8}, {1, 4}}),
-    FMT_TRAITS(is_os_yx_isa2_osa8_isv8_osv2, 1, 1, 2, 0, {1, 0, 2, 3}, "ioyx", "ioxy", {{1, 2}, {0, 8}, {1, 8}, {0, 2}}),
-    FMT_TRAITS(is_os_yx_isa4_osa8_isv8_osv4, 1, 1, 2, 0, {1, 0, 2, 3}, "ioyx", "ioxy", {{1, 4}, {0, 8}, {1, 8}, {0, 4}}),
+    FMT_TRAITS(is_os_yx_osa4_isa8_osv8_isv4, 1, 1, 2, 0, {1, 0, 2, 3}, "ioyx", "oixy", {{0, 4}, {1, 8}, {0, 8}, {1, 4}}),
+    FMT_TRAITS(is_os_yx_isa2_osa8_isv8_osv2, 1, 1, 2, 0, {1, 0, 2, 3}, "ioyx", "oixy", {{1, 2}, {0, 8}, {1, 8}, {0, 2}}),
+    FMT_TRAITS(is_os_yx_isa4_osa8_isv8_osv4, 1, 1, 2, 0, {1, 0, 2, 3}, "ioyx", "oixy", {{1, 4}, {0, 8}, {1, 8}, {0, 4}}),
     FMT_TRAITS(is_o_yx_isv32, 1, 1, 2, 0, {1, 0, 2, 3}, "oyxi", "oixy", {{1, 32}}),
     FMT_TRAITS(is_o32_yx_isv32_swizzled_by_4, 1, 1, 2, 0, {0, 1, 2, 3}, "oyxi", "oixy", {}),
     FMT_TRAITS(os_is_y_x8_osv8_isv4, 1, 1, 2, 0, {0, 1, 2, 3}, "oyxi", "oixy", {}),
@@ -129,13 +129,13 @@ static const std::map format_traits_map {
     FMT_TRAITS(is_os_zyx_isv16_osv16, 1, 1, 3, 0, {1, 0, 2, 3, 4}, "iozyx", "oixyz", {{1, 16}, {0, 16}}),
     FMT_TRAITS(is_os_yx_isv16_osv16, 1, 1, 2, 0, {1, 0, 2, 3}, "ioyx", "oixy", {{1, 16}, {0, 16}}),
     FMT_TRAITS(is_os_yx_isv16_osv8, 1, 1, 2, 0, {1, 0, 2, 3}, "ioyx", "oixy", {{1, 16}, {0, 8}}),
-    FMT_TRAITS(is_os_zyx_isa8_osv8_isv2, 1, 1, 3, 0, {1, 0, 2, 3, 4}, "iozyx", "ioxyz", {{1, 8}, {0, 8}, {1, 2}}),
-    FMT_TRAITS(is_os_zyx_isa8_osv8_isv4, 1, 1, 3, 0, {1, 0, 2, 3, 4}, "iozyx", "ioxyz", {{1, 8}, {0, 8}, {1, 4}}),
+    FMT_TRAITS(is_os_zyx_isa8_osv8_isv2, 1, 1, 3, 0, {1, 0, 2, 3, 4}, "iozyx", "oixyz", {{1, 8}, {0, 8}, {1, 2}}),
+    FMT_TRAITS(is_os_zyx_isa8_osv8_isv4, 1, 1, 3, 0, {1, 0, 2, 3, 4}, "iozyx", "oixyz", {{1, 8}, {0, 8}, {1, 4}}),
     FMT_TRAITS(os_is_zyx_isa8_osv8_isv4, 1, 1, 3, 0, {0, 1, 2, 3, 4}, "oizyx", "oixyz", {{1, 8}, {0, 8}, {1, 4}}),
     FMT_TRAITS(os_is_zyx_isa8_osv8_isv2, 1, 1, 3, 0, {0, 1, 2, 3, 4}, "oizyx", "oixyz", {{1, 8}, {0, 8}, {1, 2}}),
     FMT_TRAITS(os_is_zyx_isa8_osv16_isv4, 1, 1, 3, 0, {0, 1, 2, 3, 4}, "oizyx", "oixyz", {{1, 8}, {0, 16}, {1, 4}}),
-    FMT_TRAITS(is_os_yx_isa8_osv8_isv2, 1, 1, 2, 0, {1, 0, 2, 3}, "ioyx", "ioxy", {{1, 8}, {0, 8}, {1, 2}}),
-    FMT_TRAITS(is_os_yx_isa8_osv8_isv4, 1, 1, 2, 0, {1, 0, 2, 3}, "ioyx", "ioxy", {{1, 8}, {0, 8}, {1, 4}}),
+    FMT_TRAITS(is_os_yx_isa8_osv8_isv2, 1, 1, 2, 0, {1, 0, 2, 3}, "ioyx", "oixy", {{1, 8}, {0, 8}, {1, 2}}),
+    FMT_TRAITS(is_os_yx_isa8_osv8_isv4, 1, 1, 2, 0, {1, 0, 2, 3}, "ioyx", "oixy", {{1, 8}, {0, 8}, {1, 4}}),
     FMT_TRAITS(os_is_yx_isa8_osv8_isv4, 1, 1, 2, 0, {0, 1, 2, 3}, "oiyx", "oixy", {{1, 8}, {0, 8}, {1, 4}}),
     FMT_TRAITS(os_is_yx_isa8_osv8_isv2, 1, 1, 2, 0, {0, 1, 2, 3}, "oiyx", "oixy", {{1, 8}, {0, 8}, {1, 2}}),
     FMT_TRAITS(os_is_osv32_isv32_swizzled_by_4, 1, 1, 0, 0, {0, 1, 2, 3}, "oiyx", "oixy", {{0, 32}, {1, 32}}),
@@ -152,6 +152,7 @@ static const std::map format_traits_map {
     FMT_TRAITS(iy_xs_os_xsv2_osv8__ao32, 1, 1, 2, 0, {1, 2, 3, 0}, "iyxo", "oixy", {{2, 2}, {0, 8}}),
     FMT_TRAITS(iy_xs_os_xsv2_osv16__ao32, 1, 1, 2, 0, {1, 2, 3, 0}, "iyxo", "oixy", {{2, 2}, {0, 16}}),
     FMT_TRAITS(os_i_yxs_osv4_yxsv4, 1, 1, 2, 0, {0, 1, 2, 3}, "oiyx", "oixy", {{0, 4}}),
+    FMT_TRAITS(os_i_osv16, 1, 1, 0, 0, {0, 1}, "oi", "oi", {{0, 16}}),
     FMT_TRAITS(os_i_osv16__ai8, 1, 1, 0, 0, {0, 1}, "oi", "oi", {{1, 8}, {0, 16}}),
     FMT_TRAITS(os_i_osv8__ai8, 1, 1, 0, 0, {0, 1}, "oi", "oi", {{1, 8}, {0, 8}}),
     FMT_TRAITS(os_y_is_x_osv8_isv2, 1, 1, 2, 0, {0, 2, 1, 3}, "oyix", "oixy", {{0, 8}, {1, 2}}),
diff --git a/src/plugins/intel_gpu/tests/unit/fusions/convolution_fusion_test.cpp b/src/plugins/intel_gpu/tests/unit/fusions/convolution_fusion_test.cpp
index fbd497f9d9a..aa5934a710f 100644
--- a/src/plugins/intel_gpu/tests/unit/fusions/convolution_fusion_test.cpp
+++ b/src/plugins/intel_gpu/tests/unit/fusions/convolution_fusion_test.cpp
@@ -4551,7 +4551,7 @@ TEST_P(onednn_replace_full_tensor_sum_to_binary_add, basic) {
 #define CASE_CONV_ELTW_SUM_TO_BINARY_ADD { 1, 32, 4, 4 }, { 1, 32, 2, 2 }, { 32, 32, 3, 3 }, { 1, 1 }, { 0, 0 }, { 1, 1 }, 1, data_types::f16, format::bfyx, data_types::f16, format::bfyx, data_types::f16, format::b_fs_yx_fsv16, data_types::f16, format::b_fs_yx_fsv16, data_types::f32, format::bfyx
 
 INSTANTIATE_TEST_SUITE_P(eltwise_sum_fusings_gpu, onednn_replace_full_tensor_sum_to_binary_add, ::testing::ValuesIn(std::vector{
-    convolution_eltw_sum_test_params{ CASE_CONV_ELTW_SUM_TO_BINARY_ADD, 2, 3, 4 },
+    convolution_eltw_sum_test_params{ CASE_CONV_ELTW_SUM_TO_BINARY_ADD, 2, 2, 3 },
 }));
 
 #endif // ENABLE_ONEDNN_FOR_GPU
diff --git a/src/plugins/intel_gpu/tests/unit/module_tests/layout_test.cpp b/src/plugins/intel_gpu/tests/unit/module_tests/layout_test.cpp
index ac6fd4b6784..c2474516330 100644
--- a/src/plugins/intel_gpu/tests/unit/module_tests/layout_test.cpp
+++ b/src/plugins/intel_gpu/tests/unit/module_tests/layout_test.cpp
@@ -5,6 +5,7 @@
 #include "test_utils.h"
 
 #include "intel_gpu/runtime/layout.hpp"
+#include "impls/ocl/kernel_selector_helper.h"
 
 using namespace cldnn;
 using namespace ::tests;
@@ -305,3 +306,67 @@ INSTANTIATE_TEST_SUITE_P(smoke, layout_transform_test,
         {format::bfzyx, format::bfyx, ov::PartialShape{1, 2, 3, 4, 5}, ov::PartialShape{1, 2, 3*4, 5}},
     }));
+
+struct layouts_convert_params {
+    format::type in_format;
+    ov::PartialShape in_shape;
+    bool is_grouped;
+};
+
+class layout_convert_test : public testing::TestWithParam { };
+
+TEST_P(layout_convert_test, basic) {
+    auto p = GetParam();
+
+    auto test_layout = layout(p.in_shape, data_types::f32, p.in_format);
+    auto weights_tensor = convert_weights_tensor(test_layout, p.is_grouped);
+    auto converted_layout = from_weights_tensor(weights_tensor);
+
+    if (p.in_format == format::bfzyx && p.is_grouped) {
+        ASSERT_EQ(converted_layout, layout(p.in_shape, data_types::f32, format::goiyx));
+    } else if (p.in_format == format::bfwzyx && p.is_grouped) {
+        ASSERT_EQ(converted_layout, layout(p.in_shape, data_types::f32, format::goizyx));
+    } else if (p.in_format == format::os_i_osv16__ai8) {
+        auto ref_shape = p.in_shape;
+        for (size_t i = ref_shape.size(); i < converted_layout.get_dims().size(); ++i)
+            ref_shape.push_back(1);
+        test_layout.set_partial_shape(ref_shape);
+        ASSERT_EQ(test_layout, converted_layout);
+    } else {
+        ASSERT_EQ(test_layout, converted_layout);
+    }
+}
+
+INSTANTIATE_TEST_SUITE_P(smoke, layout_convert_test,
+    testing::ValuesIn(std::vector{
+        // 4D formats
+        {format::oiyx, ov::PartialShape{1, 2, 3, 4}, false},
+        {format::ioyx, ov::PartialShape{1, 2, 3, 4}, false},
+        {format::os_i_osv16__ai8, ov::PartialShape{1, 2}, false},
+        {format::os_iyx_osv16, ov::PartialShape{1, 2, 3, 4}, false},
+        {format::os_i_yxs_osv4_yxsv4, ov::PartialShape{1, 2, 3, 4}, false},
+        {format::os_is_yx_isa8_osv8_isv2, ov::PartialShape{1, 2, 3, 4}, false},
+        {format::is_os_yx_isa2_osa8_isv8_osv2, ov::PartialShape{1, 2, 3, 4}, false},
+        {format::is_o32_yx_isv32_swizzled_by_4, ov::PartialShape{1, 2, 3, 4}, false},
+        // 4D formats grouped
+        {format::bfzyx, ov::PartialShape{1, 2, 3, 4, 5}, true},
+        {format::goiyx, ov::PartialShape{1, 2, 3, 4, 5}, false},
+        {format::g_os_iyx_osv32, ov::PartialShape{1, 2, 3, 4, 5}, false},
+        {format::g_os_is_yx_isv8_osv16_isv2, ov::PartialShape{1, 2, 3, 4, 5}, false},
+        {format::g_os_is_yx_osv16_isv4, ov::PartialShape{1, 2, 3, 4, 5}, false},
+        // {format::gs_oi_yxs_gsv32_yxsv4, ov::PartialShape{1, 2, 3, 4, 5}, false},
+        // 5D formats
+        {format::oizyx, ov::PartialShape{1, 2, 3, 4, 5}, false},
+        {format::iozyx, ov::PartialShape{1, 2, 3, 4, 5}, false},
+        {format::os_is_zyx_isa8_osv16_isv4, ov::PartialShape{1, 2, 3, 4, 5}, false},
+        {format::os_is_zyx_osa4_isa8_osv8_isv4, ov::PartialShape{1, 2, 3, 4, 5}, false},
+        {format::is_os_zyx_isa8_osv8_isv2, ov::PartialShape{1, 2, 3, 4, 5}, false},
+        {format::is_os_zyx_isv16_osv16, ov::PartialShape{1, 2, 3, 4, 5}, false},
+        // 5D formats grouped
+        {format::bfwzyx, ov::PartialShape{1, 2, 3, 4, 5, 6}, true},
+        {format::giozyx, ov::PartialShape{1, 2, 3, 4, 5, 6}, false},
+        {format::gs_oizyx_gsv32, ov::PartialShape{1, 2, 3, 4, 5, 6}, false},
+        {format::g_os_zyx_is_osv32_isv32, ov::PartialShape{1, 2, 3, 4, 5, 6}, false},
+        {format::g_is_os_zyx_isv16_osv16, ov::PartialShape{1, 2, 3, 4, 5, 6}, false},
+        {format::g_os_is_zyx_osa4_isa8_osv8_isv2, ov::PartialShape{1, 2, 3, 4, 5, 6}, false},
+    }));
diff --git a/src/plugins/intel_gpu/tests/unit/module_tests/primitive_comparison_test.cpp b/src/plugins/intel_gpu/tests/unit/module_tests/primitive_comparison_test.cpp
index 300e6b5064b..11d769f322b 100644
--- a/src/plugins/intel_gpu/tests/unit/module_tests/primitive_comparison_test.cpp
+++ b/src/plugins/intel_gpu/tests/unit/module_tests/primitive_comparison_test.cpp
@@ -10,7 +10,6 @@
 #include 
 #include 
 #include 
-#include 
 
 using namespace cldnn;
 using namespace ::tests;
@@ -111,18 +110,20 @@ TEST(primitive_comparison, permute) {
     ASSERT_NE(permute_prim, permute_prim_order);
 }
 
-TEST(primitive_comparison, generic_layer) {
+TEST(primitive_comparison, reorder_weights) {
     auto shape = ov::PartialShape{1, 2, 3, 4};
     auto data_type = data_types::f32;
-    auto format_in = format::bfyx;
-    auto format_out = format::os_iyx_osv16;
-    auto input_layout = layout{shape, data_type, format_in};
-    auto output_layout = layout{shape, data_type, format_out};
-    auto generic_layer_prim = generic_layer("generic_layer", "", std::make_shared(input_layout, output_layout));
-    auto generic_layer_eq_prim = generic_layer("generic_layer_eq", "", std::make_shared(input_layout, output_layout));
-    auto generic_layer_different_prim = generic_layer("generic_layer", "", std::make_shared(output_layout, input_layout));
+    auto format_osv16 = format::os_iyx_osv16;
+    auto format_osv32 = format::os_iyx_osv32;
 
-    ASSERT_EQ(generic_layer_prim, generic_layer_eq_prim);
-    ASSERT_NE(generic_layer_prim, generic_layer_different_prim);
+    auto layout_osv16 = layout{shape, data_type, format_osv16};
+    auto layout_osv32 = layout{shape, data_type, format_osv32};
+
+    auto reorder_weights_prim = reorder("reorder_weights", input_info("input"), layout_osv16);
+    auto reorder_weights_eq_prim = reorder("reorder_weights_eq", input_info("input"), layout_osv16);
+    auto reorder_weights_diff_prim = reorder("reorder_weights_neq", input_info("input"), layout_osv32);
+
+    ASSERT_EQ(reorder_weights_prim, reorder_weights_eq_prim);
+    ASSERT_NE(reorder_weights_prim, reorder_weights_diff_prim);
 }
diff --git a/src/plugins/intel_gpu/tests/unit/module_tests/weights_reorder_factory_test.cpp b/src/plugins/intel_gpu/tests/unit/module_tests/weights_reorder_factory_test.cpp
index 90f6f7e8f74..3f8da1fc6a7 100644
--- a/src/plugins/intel_gpu/tests/unit/module_tests/weights_reorder_factory_test.cpp
+++ b/src/plugins/intel_gpu/tests/unit/module_tests/weights_reorder_factory_test.cpp
@@ -10,7 +10,7 @@
 #include "intel_gpu/primitives/input_layout.hpp"
 #include "intel_gpu/primitives/data.hpp"
 
-#include "generic_layer_inst.h"
+#include "reorder_inst.h"
 #include "fully_connected_inst.h"
 #include "implementation_map.hpp"
 #include "graph/impls/ocl/register.hpp"
@@ -41,6 +41,7 @@ TEST(weights_factory, reorder_test) {
     tests::random_generator rg(GET_SUITE_NAME);
     const int input_f = 32, output_f = 32;
+
     auto weights_layout = layout(ov::PartialShape{ output_f, input_f }, data_types::f32, format::bfyx);
     auto weights_data_input = engine.allocate_memory(weights_layout);
     auto weights_data_vec = rg.generate_random_1d(output_f * input_f, -1, 1);
@@ -69,10 +70,11 @@ TEST(weights_factory, reorder_test) {
     // Constuct kernel_impl_params for weights reorder based requested WeightsReorderParams
     auto reorder_kernel_params = std::make_shared();
-    reorder_kernel_params->desc = std::make_shared("weights_reorder", "", weights_reorder_params);
+    reorder_kernel_params->desc = std::make_shared("weights_reorder", input_info(), weights_reorder_params);
     reorder_kernel_params->unique_id = weights_reorder_params->hash();
     reorder_kernel_params->input_layouts.push_back(weights_reorder_params->get_input_layout());
     reorder_kernel_params->output_layouts.push_back(weights_reorder_params->get_output_layout());
+    reorder_kernel_params->prog = network.get_program().get();
 
     // Create new generic_layer_impl
     auto factory = WeightsReordersFactory::get(impl_types::ocl, shape_types::static_shape);
@@ -93,7 +95,8 @@ TEST(weights_factory, reorder_test) {
     args.inputs.push_back(weights_data_input);
     args.outputs.push_back(weights_data_output);
 
-    auto reorder_inst = std::make_shared(network);
+    auto reorder_inst = std::make_shared(network);
+
     reorder_inst->set_impl(reorder_impl->clone());
     reorder_inst->get_impl()->set_arguments(*reorder_inst, args);
diff --git a/src/plugins/intel_gpu/tests/unit/passes/post_optimize_weights.cpp b/src/plugins/intel_gpu/tests/unit/passes/post_optimize_weights.cpp
new file mode 100644
index 00000000000..b205f21938f
--- /dev/null
+++ b/src/plugins/intel_gpu/tests/unit/passes/post_optimize_weights.cpp
@@ -0,0 +1,87 @@
+// Copyright (C) 2023 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "test_utils.h"
+#include "program_wrapper.h"
+#include "fully_connected_inst.h"
+
+using namespace cldnn;
+using namespace ::tests;
+
+TEST(post_optimize_weights, fuse_reorder_to_weights_reorder_test) {
+    auto& engine = get_test_engine();
+
+    auto input = engine.allocate_memory({ { 2, 32 }, data_types::f16, format::bfyx });
+    auto weights = engine.allocate_memory({{ 2, 32 }, data_types::f32, format::bfyx });
+
+    topology topology(
+        input_layout("input", input->get_layout()),
+        input_layout("weights", weights->get_layout()),
+        reorder("reorder_dt", input_info("weights"), format::bfyx, data_types::f16),
+        fully_connected("fc", input_info("input"), { "reorder_dt" }, "", data_types::f16)
+    );
+
+    ExecutionConfig config = get_test_default_config(engine);
+    config.set_property(ov::intel_gpu::optimize_data(true));
+    config.set_property(ov::intel_gpu::allow_new_shape_infer(true));
+    auto prog = program::build_program(engine, topology, config, false, true);
+
+    reorder_factory rf;
+    program_wrapper::apply_opt_pass(*prog);
+    program_wrapper::apply_opt_pass(*prog, rf);
+
+    ASSERT_TRUE(has_node(*prog, "reorder_dt"));
+    ASSERT_TRUE(format::is_weights_format(prog->get_node("reorder_dt").get_output_layout().format));
+}
+
+TEST(post_optimize_weights, weights_reorder_constant_folding_test) {
+    auto& engine = get_test_engine();
+
+    ov::Shape pshape = { 4, 16 };
+    auto input = engine.allocate_memory({ pshape, data_types::f32, format::bfyx });
+    auto weights = engine.allocate_memory({ pshape, data_types::f32, format::bfyx });
+
+    std::vector weights_data(pshape[0] * pshape[1]);
+    std::iota(weights_data.begin(), weights_data.end(), 0.f);
+    set_values(weights, weights_data);
+
+    topology topology(
+        input_layout("input", input->get_layout()),
+        data("weights", weights),
+        fully_connected("fc", input_info("input"), { "weights" })
+    );
+
+    ExecutionConfig config = get_test_default_config(engine);
+    config.set_property(ov::intel_gpu::optimize_data(true));
+    config.set_property(ov::intel_gpu::allow_new_shape_infer(true));
+
+    auto prog = program::build_program(engine, topology, config, false, true);
+
+    reorder_factory rf;
+    program_wrapper::apply_opt_pass(*prog);
+    program_wrapper::apply_opt_pass(*prog, rf);
+    program_wrapper::apply_opt_pass(*prog);
+
+    ASSERT_TRUE(has_node(*prog, "weights_weights_reorder_0"));
+    auto& weights_node = prog->get_node("weights_weights_reorder_0");
+    ASSERT_TRUE(weights_node.is_type());
+
+    size_t align = 16;  // os_iyx_osv16 format
+    size_t aligned_b_size = pshape[0] % align == 0 ? pshape[0]
+                                                   : pshape[0] - pshape[0] % align + align;
+    std::vector expected(aligned_b_size * pshape[1], 0.f);
+    size_t input_idx = 0;
+    for (size_t i = 0; i < pshape[0]; ++i) {
+        for (size_t j = 0; j < pshape[1]; ++j) {
+            expected[j * align + i] = weights_data[input_idx++];
+        }
+    }
+
+    auto weights_mem_ptr = weights_node.as().get_attached_memory_ptr();
+    cldnn::mem_lock weights_mem(weights_mem_ptr, get_test_stream());
+
+    for (size_t i = 0; i < expected.size(); ++i) {
+        ASSERT_EQ(weights_mem[i], expected[i]);
+    }
+}
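
The expected-value loop in weights_reorder_constant_folding_test above hand-encodes the os_iyx_osv16 weights layout. As a standalone illustration (not part of the patch), the host-side sketch below reproduces the same mapping; the shape (O = 4, I = 16), the 16-element block size, and the main() harness are assumptions mirroring that test rather than intel_gpu plugin API.

// Host-side sketch of the os_iyx_osv16 mapping asserted by the test above.
// Assumed sizes mirror the test: O = 4 output channels, I = 16 input channels,
// output channels blocked by 16 and padded up to a multiple of 16.
#include <cstddef>
#include <iostream>
#include <numeric>
#include <vector>

int main() {
    const std::size_t O = 4, I = 16;   // assumed weights shape, as in the test
    const std::size_t align = 16;      // osv16 block size
    const std::size_t O_padded = (O + align - 1) / align * align;

    std::vector<float> src(O * I);
    std::iota(src.begin(), src.end(), 0.f);   // same iota fill as the test

    // Element (o, i) of a plain [O x I] blob lands at
    // block_o * I * align + i * align + (o % align) in os_iyx_osv16.
    // With O <= 16 this collapses to dst[i * align + o], i.e. the
    // expected[j * align + i] indexing used in the test.
    std::vector<float> dst(O_padded * I, 0.f);
    for (std::size_t o = 0; o < O; ++o) {
        for (std::size_t i = 0; i < I; ++i) {
            const std::size_t block_o = o / align;
            dst[block_o * I * align + i * align + (o % align)] = src[o * I + i];
        }
    }

    std::cout << "dst[0..3] = " << dst[0] << ", " << dst[1] << ", "
              << dst[2] << ", " << dst[3] << "\n";   // prints 0, 16, 32, 48
    return 0;
}

The padded rows O..O_padded-1 stay zero, matching the zero-initialized expected vector the test compares against.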