diff --git a/src/plugins/intel_gpu/include/intel_gpu/graph/network.hpp b/src/plugins/intel_gpu/include/intel_gpu/graph/network.hpp
index b797747f39b..2ae57fc82aa 100644
--- a/src/plugins/intel_gpu/include/intel_gpu/graph/network.hpp
+++ b/src/plugins/intel_gpu/include/intel_gpu/graph/network.hpp
@@ -225,10 +225,6 @@ public:
     /// Returns memory state @p variable_id of stateful network
     VariableState& get_variable_memory(const std::string &variable_id);

-    /// Return in_mem_kernels_cache
-    KernelsCache& get_in_mem_kernels_cache() const { return *_in_mem_kernels_cache; }
-    std::mutex& get_impl_cache_mutex() const { return _in_mem_cache_mutex; }
-
     const ExecutionConfig& get_config() const { return _config; }

 private:
@@ -260,8 +256,6 @@ private:
     std::unordered_map<primitive_id, event::ptr> _events;
     output_chains_map _output_chains;

-    mutable std::mutex _in_mem_cache_mutex;
-
     void build_exec_order();
     void allocate_primitive_instance(program_node const& node);
     void transfer_memory_to_device(std::shared_ptr<primitive_inst> instance, program_node const& node);
@@ -273,10 +267,6 @@ private:
     void calculate_weights_cache_capacity();
     output_chains_map::iterator add_output_chain(std::shared_ptr<primitive_inst>& p_inst);

-    // Move from cldnn::program to cldnn::network for multi-threads issue.
-    std::unique_ptr<KernelsCache> _in_mem_kernels_cache;
-    const size_t _in_mem_kernels_cache_capacity = 10000;
-
 #ifdef GPU_DEBUG_CONFIG
     int64_t iteration = 0;
 #endif
diff --git a/src/plugins/intel_gpu/include/intel_gpu/primitives/generic_layer.hpp b/src/plugins/intel_gpu/include/intel_gpu/primitives/generic_layer.hpp
new file mode 100644
index 00000000000..3fdfee37caf
--- /dev/null
+++ b/src/plugins/intel_gpu/include/intel_gpu/primitives/generic_layer.hpp
@@ -0,0 +1,86 @@
+// Copyright (C) 2018-2023 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+#include "intel_gpu/primitives/primitive.hpp"
+#include "intel_gpu/runtime/memory.hpp"
+
+#include
+
+namespace cldnn {
+
+struct WeightsReorderParams {
+    WeightsReorderParams(layout in_layout, layout out_layout) : _in_layout(in_layout), _out_layout(out_layout) {}
+
+    virtual size_t hash() const {
+        return hash_combine(_in_layout.hash(), _out_layout.hash());
+    }
+
+    virtual bool operator==(const WeightsReorderParams& rhs) const {
+        if (typeid(*this) != typeid(rhs))
+            return false;
+
+        return _in_layout == rhs._in_layout &&
+               _out_layout == rhs._out_layout;
+    }
+
+    layout get_input_layout() const { return _in_layout; }
+    layout get_output_layout() const { return _out_layout; }
+
+    virtual ~WeightsReorderParams() = default;
+
+protected:
+    layout _in_layout;
+    layout _out_layout;
+};
+
+/// @brief Changes how data is ordered in memory. Value type is not changed & all information is preserved.
+/// @details Corresponding values are bitwise equal before/after reorder.
+struct generic_layer : public primitive_base<generic_layer> {
+    CLDNN_DECLARE_PRIMITIVE(generic_layer)
+
+    /// @brief Constructs generic_layer primitive that reorders weights according to the provided parameters.
+    /// @param id This primitive id.
+    /// @param input Input primitive id.
+    /// @param params Weights reorder parameters describing the input and output layouts.
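+    ///
+    /// A minimal usage sketch (the ids and layouts here are illustrative, not part of this patch):
+    /// @code
+    ///     auto params  = std::make_shared<WeightsReorderParams>(weights_layout, expected_layout);
+    ///     auto reorder = generic_layer("fc_weights_reorder", "fc_weights", params);
+    /// @endcode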
+ generic_layer(const primitive_id& id, + const primitive_id& input, + std::shared_ptr params, + const padding& output_padding = padding()) + : primitive_base(id, {input}, {output_padding}), params(params) {} + + std::shared_ptr params; + + size_t hash() const override { + size_t seed = primitive::hash(); + + if (params) + seed = hash_combine(seed, params->hash()); + + return seed; + } + + bool operator==(const primitive& rhs) const override { + if (!compare_common_params(rhs)) + return false; + + auto rhs_casted = downcast(rhs); + + if ((params == nullptr) != (rhs_casted.params == nullptr)) + return false; + + if (params != nullptr) + return *params == *rhs_casted.params; + + return true; + } + +protected: + std::vector> get_dependencies() const override { return {}; } +}; +/// @} +/// @} +/// @} +} // namespace cldnn diff --git a/src/plugins/intel_gpu/src/graph/generic_layer.cpp b/src/plugins/intel_gpu/src/graph/generic_layer.cpp index e446de5394a..88c1da6c58b 100644 --- a/src/plugins/intel_gpu/src/graph/generic_layer.cpp +++ b/src/plugins/intel_gpu/src/graph/generic_layer.cpp @@ -21,6 +21,11 @@ generic_layer_node::typed_program_node(const std::shared_ptr prim generic_layer_inst::typed_primitive_inst(network& network, generic_layer_node const& node) : parent(network, node) {} +generic_layer_inst::typed_primitive_inst(network& network) + : parent(network) { + _type = generic_layer::type_id(); +} + std::string generic_layer_inst::to_string(generic_layer_node const& node) { auto node_info = node.desc_to_json(); diff --git a/src/plugins/intel_gpu/src/graph/graph_optimizer/add_required_reorders.cpp b/src/plugins/intel_gpu/src/graph/graph_optimizer/add_required_reorders.cpp index 6d6f5f8ca8e..e21d8571712 100644 --- a/src/plugins/intel_gpu/src/graph/graph_optimizer/add_required_reorders.cpp +++ b/src/plugins/intel_gpu/src/graph/graph_optimizer/add_required_reorders.cpp @@ -2,14 +2,16 @@ // SPDX-License-Identifier: Apache-2.0 // -#include #include "pass_manager.h" #include "program_node.h" #include "mutable_data_inst.h" #include "convert_color_inst.h" +#include "fully_connected_inst.h" #include "assign_inst.h" #include "tensor_type.h" + +#include #include #include #include diff --git a/src/plugins/intel_gpu/src/graph/graph_optimizer/compile_graph.cpp b/src/plugins/intel_gpu/src/graph/graph_optimizer/compile_graph.cpp index 459d4be84e1..1938c76e1ed 100644 --- a/src/plugins/intel_gpu/src/graph/graph_optimizer/compile_graph.cpp +++ b/src/plugins/intel_gpu/src/graph/graph_optimizer/compile_graph.cpp @@ -2,15 +2,18 @@ // SPDX-License-Identifier: Apache-2.0 // +#include "intel_gpu/runtime/engine.hpp" +#include "intel_gpu/runtime/itt.hpp" + #include "pass_manager.h" #include "data_inst.h" #include "mutable_data_inst.h" #include "reshape_inst.h" #include "quantize_inst.h" #include "arg_max_min_inst.h" +#include "fully_connected_inst.h" #include "program_node.h" -#include "intel_gpu/runtime/engine.hpp" -#include "intel_gpu/runtime/itt.hpp" + #include #include #include diff --git a/src/plugins/intel_gpu/src/graph/graph_optimizer/handle_reshape.cpp b/src/plugins/intel_gpu/src/graph/graph_optimizer/handle_reshape.cpp index 1a5d9287994..95ea2410ad7 100644 --- a/src/plugins/intel_gpu/src/graph/graph_optimizer/handle_reshape.cpp +++ b/src/plugins/intel_gpu/src/graph/graph_optimizer/handle_reshape.cpp @@ -9,6 +9,7 @@ #include "gemm_inst.h" #include "pooling_inst.h" +#include "fully_connected_inst.h" #include #include diff --git a/src/plugins/intel_gpu/src/graph/graph_optimizer/post_input_reorder.cpp 
b/src/plugins/intel_gpu/src/graph/graph_optimizer/post_input_reorder.cpp index 592a3c811fa..505718deaa5 100644 --- a/src/plugins/intel_gpu/src/graph/graph_optimizer/post_input_reorder.cpp +++ b/src/plugins/intel_gpu/src/graph/graph_optimizer/post_input_reorder.cpp @@ -4,6 +4,7 @@ #include "pass_manager.h" #include "impls/ocl/primitive_base.hpp" +#include "fully_connected_inst.h" #include "fully_connected/fully_connected_params.h" #include #include diff --git a/src/plugins/intel_gpu/src/graph/graph_optimizer/post_optimize_weights.cpp b/src/plugins/intel_gpu/src/graph/graph_optimizer/post_optimize_weights.cpp index 44b6d0088e2..d75663aa45f 100644 --- a/src/plugins/intel_gpu/src/graph/graph_optimizer/post_optimize_weights.cpp +++ b/src/plugins/intel_gpu/src/graph/graph_optimizer/post_optimize_weights.cpp @@ -4,8 +4,11 @@ #include "pass_manager.h" #include "program_helpers.h" -#include "include/binary_convolution_inst.h" -#include "include/deformable_convolution_inst.h" +#include "convolution_inst.h" +#include "binary_convolution_inst.h" +#include "deconvolution_inst.h" +#include "deformable_convolution_inst.h" +#include "fully_connected_inst.h" #include "lstm_dynamic_input_inst.h" namespace cldnn { @@ -13,7 +16,6 @@ namespace cldnn { post_optimize_weights::post_optimize_weights(reorder_factory& rf_ref) : base_pass("post_optimize_weights"), _rf(rf_ref) {} -// function which prepares given primitive for weights optimization template post_optimize_weights::weights_bias_offset post_optimize_weights::get_weights_bias_offset(const T& node) { return weights_bias_offset(node.get_primitive()->input.size(), program_helpers::wrap_if_single(node.get_primitive()->weights).size()); } @@ -37,15 +39,13 @@ void post_optimize_weights::optimize_weights(T& node, program& p) { return; auto output_layout = node.get_output_layout(); - auto& weights_reorder_params = impl->_weights_reorder_params; - + auto weights_reorder_params = impl->get_weights_reorder_params(); for (auto i = offsets.weights_offset; i < offsets.bias_offset; i++) { auto& weights_node = node.get_dependency(i); - auto weights_layout = weights_node.get_output_layout(); - auto reorders = _rf.get_weights_reorder(weights_node.id(), weights_layout, weights_reorder_params); + auto reorder = _rf.get_weights_reorder(weights_node.id(), weights_reorder_params); - for (auto& reorder : reorders) { + if (reorder.first) { // insert new generic_layer node to topology p.add_intermediate(reorder.first, node, i, !reorder.second); // set generic_layer's node output layout and implementation @@ -65,9 +65,7 @@ void post_optimize_weights::optimize_weights(T& node, program& p) { } // Reset weights reorder params to not keep source code pointer - weights_reorder_params.engine = kernel_selector::generic_kernel_params::Engine::NONE; - weights_reorder_params.clKernel = nullptr; - weights_reorder_params.cpuKernel = nullptr; + impl->reset_weights_reorder_params(); // set the old output layout and do not invalidate users as change of weights will not affect output layout node.set_output_layout(output_layout, false); diff --git a/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_primitive_fusing.cpp b/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_primitive_fusing.cpp index bf959581ae7..5a1e8ecc413 100644 --- a/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_primitive_fusing.cpp +++ b/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_primitive_fusing.cpp @@ -25,6 +25,7 @@ #include "softmax_inst.h" #include "resample_inst.h" #include 
"depth_to_space_inst.h" +#include "fully_connected_inst.h" #include "space_to_depth_inst.h" #include "gather_inst.h" #include "gather_nd_inst.h" diff --git a/src/plugins/intel_gpu/src/graph/graph_optimizer/remove_redundant_reorders.cpp b/src/plugins/intel_gpu/src/graph/graph_optimizer/remove_redundant_reorders.cpp index 8d9c0a1a9a8..d21275c5b71 100644 --- a/src/plugins/intel_gpu/src/graph/graph_optimizer/remove_redundant_reorders.cpp +++ b/src/plugins/intel_gpu/src/graph/graph_optimizer/remove_redundant_reorders.cpp @@ -2,13 +2,12 @@ // SPDX-License-Identifier: Apache-2.0 // +#include "intel_gpu/runtime/debug_configuration.hpp" + #include "pass_manager.h" #include "program_helpers.h" -#include "binary_convolution_inst.h" -#include -#include -#include +#include "binary_convolution_inst.h" #include "reshape_inst.h" #include "convert_color_inst.h" #include "one_hot_inst.h" @@ -16,7 +15,11 @@ #include "depth_to_space_inst.h" #include "concatenation_inst.h" #include "region_yolo_inst.h" -#include "intel_gpu/runtime/debug_configuration.hpp" +#include "fully_connected_inst.h" + +#include +#include +#include using namespace cldnn; diff --git a/src/plugins/intel_gpu/src/graph/graph_optimizer/reorder_inputs.cpp b/src/plugins/intel_gpu/src/graph/graph_optimizer/reorder_inputs.cpp index 06954cab552..011b41f3cde 100644 --- a/src/plugins/intel_gpu/src/graph/graph_optimizer/reorder_inputs.cpp +++ b/src/plugins/intel_gpu/src/graph/graph_optimizer/reorder_inputs.cpp @@ -13,6 +13,8 @@ #include "mvn_inst.h" #include "to_string_utils.h" #include "pooling_inst.h" +#include "reshape_inst.h" +#include "fully_connected_inst.h" #ifdef ENABLE_ONEDNN_FOR_GPU #include "gemm_inst.h" diff --git a/src/plugins/intel_gpu/src/graph/graph_optimizer/select_preferred_formats.cpp b/src/plugins/intel_gpu/src/graph/graph_optimizer/select_preferred_formats.cpp index 77b9154081a..136f64b9edf 100644 --- a/src/plugins/intel_gpu/src/graph/graph_optimizer/select_preferred_formats.cpp +++ b/src/plugins/intel_gpu/src/graph/graph_optimizer/select_preferred_formats.cpp @@ -5,6 +5,7 @@ #include "pass_manager.h" #include "data_inst.h" #include "mutable_data_inst.h" +#include "fully_connected_inst.h" #include "gemm_inst.h" #include "program_node.h" #include "intel_gpu/runtime/engine.hpp" diff --git a/src/plugins/intel_gpu/src/graph/impls/common/wait_for_events.cpp b/src/plugins/intel_gpu/src/graph/impls/common/wait_for_events.cpp index 4815477ddf1..2e767f6fe7e 100644 --- a/src/plugins/intel_gpu/src/graph/impls/common/wait_for_events.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/common/wait_for_events.cpp @@ -31,6 +31,7 @@ public: void init_kernels(const kernels_cache&, const kernel_impl_params&) override {} void set_arguments(primitive_inst& /*instance*/) override {} + void set_arguments(primitive_inst& /*instance*/, kernel_arguments_data& /*args*/) override {} kernel_arguments_data get_arguments(const primitive_inst& /*instance*/) const override { kernel_arguments_data args; return args; diff --git a/src/plugins/intel_gpu/src/graph/impls/cpu/non_max_suppression.cpp b/src/plugins/intel_gpu/src/graph/impls/cpu/non_max_suppression.cpp index bc6483fd9e1..15fe1df65e8 100644 --- a/src/plugins/intel_gpu/src/graph/impls/cpu/non_max_suppression.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/cpu/non_max_suppression.cpp @@ -400,7 +400,7 @@ struct non_max_suppression_impl : typed_primitive_impl { return make_unique(*this); } - non_max_suppression_impl() : parent(kernel_selector::weights_reorder_params(), "non_max_suppression_impl") {} + 
non_max_suppression_impl() : parent("non_max_suppression_impl") {} event::ptr execute_impl(const std::vector& event, typed_primitive_inst& instance) override { for (auto e : event) { diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/generic_layer.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/generic_layer.cpp index 3a0dea449bd..7da3d0c60d2 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl/generic_layer.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/ocl/generic_layer.cpp @@ -14,8 +14,8 @@ struct generic_layer_impl : typed_primitive_impl { using parent::parent; kernel_selector::cl_kernel_data _cl_kernel_data; - std::vector _kernels; - std::string _cached_kernel_id; + kernel::ptr _kernel; + kernel_id _cached_kernel_id; DECLARE_OBJECT_TYPE_SERIALIZATION @@ -27,18 +27,21 @@ struct generic_layer_impl : typed_primitive_impl { generic_layer_impl(const generic_layer_impl& other) : _cl_kernel_data(other._cl_kernel_data) - , _kernels({}) + , _kernel(nullptr) , _cached_kernel_id(other._cached_kernel_id) { - if (other._kernels.empty()) { - throw std::runtime_error("Can't copy generic_layer_impl node: kernels vector is empty"); - } - _kernels.push_back(other._kernels.front()->clone()); + OPENVINO_ASSERT(other._kernel, "[GPU] Can't copy generic_layer_impl node: kernel is nullptr"); + _kernel = other._kernel->clone(); } - generic_layer_impl(const generic_layer_node& arg) - : _cl_kernel_data(*arg.get_primitive()->generic_params.clKernel.get()) - , _kernels() - , _cached_kernel_id() { } + generic_layer_impl(const kernel_impl_params& params) + : _cl_kernel_data() + , _kernel(nullptr) + , _cached_kernel_id() { + auto reorder_params = params.typed_desc()->params; + auto casted_params = std::dynamic_pointer_cast(reorder_params); + OPENVINO_ASSERT(casted_params, "[GPU] Invalid weights reorder parameters type for ", params.desc->id, " node"); + _cl_kernel_data = *casted_params->get_cl_kernel(); + } std::vector> get_kernels_source() override { std::vector> kernel_strings; @@ -47,11 +50,11 @@ struct generic_layer_impl : typed_primitive_impl { } std::vector get_kernels() const override { - return _kernels; + return {_kernel}; } void save(BinaryOutputBuffer& ob) const override { - ob <<_cl_kernel_data; + ob << _cl_kernel_data; ob << _cached_kernel_id; } @@ -61,21 +64,27 @@ struct generic_layer_impl : typed_primitive_impl { } void init_kernels(const kernels_cache& kernels_cache, const kernel_impl_params& params) override { - _kernels.clear(); + _kernel = nullptr; auto compiled_kernels = kernels_cache.get_kernels(params); - _kernels.insert(_kernels.begin(), compiled_kernels.begin(), compiled_kernels.end()); + OPENVINO_ASSERT(compiled_kernels.size() == 1, "[GPU] Unexpected number of kernels for generic_layer during init_kernels() call"); + _kernel = compiled_kernels.front(); } void init_by_cached_kernels(const kernels_cache& kernels_cache) override { - _kernels.emplace_back(kernels_cache.get_kernel_from_cached_kernels(_cached_kernel_id)); + _kernel = kernels_cache.get_kernel_from_cached_kernels(_cached_kernel_id); } void set_cached_kernel_ids(const kernels_cache& kernels_cache) override { - _cached_kernel_id = kernels_cache.get_cached_kernel_id(_kernels[0]); + _cached_kernel_id = kernels_cache.get_cached_kernel_id(_kernel); + } + + void set_kernels(cldnn::kernels_cache::compiled_kernels kernels) override { + OPENVINO_ASSERT(kernels.size() == 1 && + kernels.begin()->second.size() == 1, "[GPU] Unexpected number of kernels for generic_layer"); + _kernel = kernels.begin()->second[0].first; } void 
set_arguments_impl(generic_layer_inst& instance) override { - stream& stream = instance.get_network().get_stream(); kernel_arguments_data args; args.scalars = &_cl_kernel_data.params.scalars; @@ -83,7 +92,13 @@ struct generic_layer_impl : typed_primitive_impl { args.inputs.push_back(instance.input_memory_ptr(i)); } args.outputs.push_back(instance.output_memory_ptr()); - stream.set_arguments(*_kernels.front(), _cl_kernel_data.params, args); + + set_arguments_impl(instance, args); + } + + void set_arguments_impl(generic_layer_inst& instance, kernel_arguments_data& args) override { + stream& stream = instance.get_network().get_stream(); + stream.set_arguments(*_kernel, _cl_kernel_data.params, args); } event::ptr execute_impl(const std::vector& events, generic_layer_inst& instance) override { @@ -95,58 +110,23 @@ struct generic_layer_impl : typed_primitive_impl { args.inputs.push_back(instance.input_memory_ptr(i)); } args.outputs.push_back(instance.output_memory_ptr()); - return stream.enqueue_kernel(*_kernels.front(), _cl_kernel_data.params, args, events, true); + return stream.enqueue_kernel(*_kernel, _cl_kernel_data.params, args, events, true); + } + + static std::unique_ptr create(const kernel_impl_params& params) { + return make_unique(params); } }; -// TODO: move this file to cpu folder and add a new traget to 'cldnn::engine_types' -struct generic_layer_cpu : typed_primitive_impl { - const generic_layer_node& outer; - DECLARE_OBJECT_TYPE_SERIALIZATION - - std::unique_ptr clone() const override { - return make_unique(*this); - } - - explicit generic_layer_cpu(const generic_layer_node& arg) : outer(arg) {} - - event::ptr execute_impl(const std::vector& events, generic_layer_inst& instance) override { - stream& stream = instance.get_network().get_stream(); - auto input_mem = instance.input_memory_ptr(); - auto output_mem = instance.output_memory_ptr(); - - auto ev = stream.create_user_event(false); - std::vector tmp_events(events); - - for (auto& a : events) { - a->wait(); - } - - mem_lock old_pointer(input_mem, stream); - mem_lock new_pointer(output_mem, stream); - - const auto& cpu_kernel = *outer.get_primitive()->generic_params.cpuKernel.get(); - - cpu_kernel.Execute(old_pointer.data(), old_pointer.size(), new_pointer.data(), new_pointer.size()); - - ev->set(); - return ev; - } - - void init_kernels(const kernels_cache&, const kernel_impl_params&) override {} -}; - -static std::unique_ptr create(const generic_layer_node& arg, const kernel_impl_params&) { - if (arg.get_primitive()->generic_params.engine == kernel_selector::generic_kernel_params::Engine::GPU) { - return make_unique(arg); - } else { - return make_unique(arg); - } +static std::unique_ptr create(const generic_layer_node& arg, const kernel_impl_params& params) { + return make_unique(params); } namespace detail { attach_generic_layer_impl::attach_generic_layer_impl() { implementation_map::add(cldnn::impl_types::ocl, create, {}); + + WeightsReordersFactory::add(cldnn::impl_types::ocl, shape_types::static_shape, generic_layer_impl::create); } } // namespace detail @@ -154,4 +134,3 @@ attach_generic_layer_impl::attach_generic_layer_impl() { } // namespace cldnn BIND_BINARY_BUFFER_WITH_TYPE(cldnn::ocl::generic_layer_impl) -ASSIGN_TYPE_NAME(cldnn::ocl::generic_layer_cpu) diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/kernel_selector_helper.h b/src/plugins/intel_gpu/src/graph/impls/ocl/kernel_selector_helper.h index 7d3fe4c301e..0ccbc466f50 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl/kernel_selector_helper.h +++ 
b/src/plugins/intel_gpu/src/graph/impls/ocl/kernel_selector_helper.h @@ -14,6 +14,7 @@ #include "intel_gpu/primitives/eltwise.hpp" #include "intel_gpu/primitives/quantize.hpp" #include "intel_gpu/primitives/activation.hpp" +#include "intel_gpu/primitives/generic_layer.hpp" #include "intel_gpu/primitives/primitive.hpp" #include "kernel_selector_params.h" @@ -166,7 +167,7 @@ inline optional_params_t get_default_weights_bias_optional_params(const program& } inline kernel_selector::eltwise_mode convert_to_eltwise_mode(eltwise_mode mode) { -switch (mode) { + switch (mode) { case eltwise_mode::sum: return kernel_selector::eltwise_mode::ADD; case eltwise_mode::sub: @@ -269,4 +270,106 @@ inline kernel_impl_params canonicalize_fused_shapes(const kernel_impl_params& im return updated_impl_params; } +class WeightsReorderParamsOCL : public WeightsReorderParams { +public: + explicit WeightsReorderParamsOCL(const kernel_selector::WeightsReorderParams& params) + : WeightsReorderParams(from_weights_tensor(params.src), from_weights_tensor(params.dest)) { + cl_kernel = params.clKernel; + } + + size_t hash() const override { + size_t seed = WeightsReorderParams::hash(); + + if (cl_kernel == nullptr) + return seed; + + seed = hash_combine(seed, cl_kernel->skip_execution); + + auto& gws = cl_kernel->params.workGroups.global; + seed = hash_range(seed, gws.begin(), gws.end()); + + auto& lws = cl_kernel->params.workGroups.local; + seed = hash_range(seed, lws.begin(), lws.end()); + + auto& arguments = cl_kernel->params.arguments; + for (auto& args : arguments) { + seed = hash_combine(seed, args.index); + seed = hash_combine(seed, args.t); + } + + auto& scalars = cl_kernel->params.scalars; + for (auto& s : scalars) { + seed = hash_combine(seed, s.t); + } + + return seed; + } + + bool operator==(const WeightsReorderParams& rhs) const override { + if (typeid(*this) != typeid(rhs)) + return false; + + if (!WeightsReorderParams::operator==(rhs)) + return false; + + auto rhs_casted = downcast(rhs); + + if (cl_kernel != nullptr && rhs_casted.cl_kernel != nullptr) { + auto& clKernel_rhs = rhs_casted.cl_kernel; + if (cl_kernel->skip_execution != clKernel_rhs->skip_execution) + return false; + + auto& gws = cl_kernel->params.workGroups.global; + auto& gws_rhs = clKernel_rhs->params.workGroups.global; + if (gws != gws_rhs) + return false; + + auto& lws = cl_kernel->params.workGroups.local; + auto& lws_rhs = clKernel_rhs->params.workGroups.local; + if (lws != lws_rhs) + return false; + + auto& arguments = cl_kernel->params.arguments; + auto& arguments_rhs = clKernel_rhs->params.arguments; + if (arguments.size() != arguments_rhs.size()) + return false; + + for (size_t idx = 0; idx < arguments.size(); idx++) { + if (arguments[idx].index != arguments_rhs[idx].index) + return false; + + if (arguments[idx].t != arguments_rhs[idx].t) + return false; + } + + auto& scalars = cl_kernel->params.scalars; + auto& scalars_rhs = clKernel_rhs->params.scalars; + if (scalars.size() != scalars_rhs.size()) + return false; + + for (size_t idx = 0; idx < scalars.size(); idx++) { + if (scalars[idx].t != scalars_rhs[idx].t) + return false; + } + } + + return true; + } + + std::shared_ptr get_cl_kernel() { + return cl_kernel; + } + +private: + std::shared_ptr cl_kernel; +}; + +inline std::shared_ptr create_weights_reorder_params(const kernel_selector::WeightsReorderParams& params) { + if (params.engine == kernel_selector::generic_kernel_params::Engine::NONE) { + return nullptr; + } + + return std::make_shared(params); +} + } // namespace cldnn diff 
--git a/src/plugins/intel_gpu/src/graph/impls/ocl/primitive_base.hpp b/src/plugins/intel_gpu/src/graph/impls/ocl/primitive_base.hpp index 85d26ed4b68..9909ec61d35 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl/primitive_base.hpp +++ b/src/plugins/intel_gpu/src/graph/impls/ocl/primitive_base.hpp @@ -40,7 +40,6 @@ struct typed_primitive_impl_ocl : public typed_primitive_impl { typed_primitive_impl_ocl() : _kernel_data({}), _cached_kernel_ids({}), _kernels({}) { _kernel_data.weightsReorderParams.engine = kernel_selector::generic_kernel_params::Engine::NONE; - _kernel_data.weightsReorderParams.cpuKernel = nullptr; _kernel_data.weightsReorderParams.clKernel = nullptr; } @@ -57,11 +56,10 @@ struct typed_primitive_impl_ocl : public typed_primitive_impl { } typed_primitive_impl_ocl(const kernel_selector::kernel_data& kd) - : typed_primitive_impl(kd.weightsReorderParams, kd.kernelName), + : typed_primitive_impl(create_weights_reorder_params(kd.weightsReorderParams), kd.kernelName), _kernel_data(kd) { // weights reorder params got copied to parent, clear in _kernel_data to release shared ptr _kernel_data.weightsReorderParams.engine = kernel_selector::generic_kernel_params::Engine::NONE; - _kernel_data.weightsReorderParams.cpuKernel = nullptr; _kernel_data.weightsReorderParams.clKernel = nullptr; this->can_reuse_memory = _kernel_data.can_reuse_memory; @@ -214,6 +212,21 @@ protected: } } + void set_arguments_impl(typed_primitive_inst& instance, kernel_arguments_data& args) override { + if (instance.can_be_optimized()) { + return; + } + + stream& stream = instance.get_network().get_stream(); + + for (size_t k = 0; k < _kernels.size(); ++k) { + if (_kernel_data.kernels[k].skip_execution) + continue; + + stream.set_arguments(*_kernels[k], _kernel_data.kernels[k].params, args); + } + } + kernel_arguments_data get_arguments_impl(const typed_primitive_inst& instance) const override { for (size_t k = 0; k < _kernels.size(); ++k) { auto args = get_arguments(instance); diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/register.hpp b/src/plugins/intel_gpu/src/graph/impls/ocl/register.hpp index 35a608be5c4..54098c2bce8 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl/register.hpp +++ b/src/plugins/intel_gpu/src/graph/impls/ocl/register.hpp @@ -4,7 +4,6 @@ #pragma once -#include "generic_layer.hpp" #include "intel_gpu/primitives/activation.hpp" #include "intel_gpu/primitives/arg_max_min.hpp" #include "intel_gpu/primitives/batch_to_space.hpp" @@ -29,6 +28,7 @@ #include "intel_gpu/primitives/experimental_detectron_topk_rois.hpp" #include "intel_gpu/primitives/eye.hpp" #include "intel_gpu/primitives/fully_connected.hpp" +#include "intel_gpu/primitives/generic_layer.hpp" #include "intel_gpu/primitives/gather.hpp" #include "intel_gpu/primitives/gather_elements.hpp" #include "intel_gpu/primitives/gather_nd.hpp" diff --git a/src/plugins/intel_gpu/src/graph/impls/onednn/convolution_onednn.cpp b/src/plugins/intel_gpu/src/graph/impls/onednn/convolution_onednn.cpp index 4186cc33075..c908f1dcc10 100644 --- a/src/plugins/intel_gpu/src/graph/impls/onednn/convolution_onednn.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/onednn/convolution_onednn.cpp @@ -8,7 +8,7 @@ #include "primitive_onednn_base.h" #include "implementation_map.hpp" -#include "kernel_selector_common.h" +#include "impls/ocl/kernel_selector_helper.h" #include "utils.hpp" @@ -158,6 +158,7 @@ protected: weights_reorder_params.engine = kernel_selector::WeightsReorderParams::Engine::GPU; weights_reorder_params.clKernel = 
std::make_shared(kernels_data[0].kernels[0]); + weights_reorder_params.src = r_params.input; weights_reorder_params.dest = r_params.output; return weights_reorder_params; diff --git a/src/plugins/intel_gpu/src/graph/impls/onednn/deconvolution_onednn.cpp b/src/plugins/intel_gpu/src/graph/impls/onednn/deconvolution_onednn.cpp index b13dfbf16a2..9e4a91d809a 100644 --- a/src/plugins/intel_gpu/src/graph/impls/onednn/deconvolution_onednn.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/onednn/deconvolution_onednn.cpp @@ -8,7 +8,7 @@ #include "primitive_onednn_base.h" #include "implementation_map.hpp" -#include "kernel_selector_common.h" +#include "impls/ocl/kernel_selector_helper.h" #include @@ -79,6 +79,7 @@ protected: weights_reorder_params.engine = kernel_selector::WeightsReorderParams::Engine::GPU; weights_reorder_params.clKernel = std::make_shared(kernels_data[0].kernels[0]); + weights_reorder_params.src = r_params.input; weights_reorder_params.dest = r_params.output; return weights_reorder_params; diff --git a/src/plugins/intel_gpu/src/graph/impls/onednn/fully_connected_onednn.cpp b/src/plugins/intel_gpu/src/graph/impls/onednn/fully_connected_onednn.cpp index 74945255f87..6eaf5ad7948 100644 --- a/src/plugins/intel_gpu/src/graph/impls/onednn/fully_connected_onednn.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/onednn/fully_connected_onednn.cpp @@ -6,7 +6,7 @@ #include "primitive_onednn_base.h" #include "implementation_map.hpp" -#include "kernel_selector_common.h" +#include "impls/ocl/kernel_selector_helper.h" #include @@ -91,6 +91,7 @@ protected: weights_reorder_params.engine = kernel_selector::WeightsReorderParams::Engine::GPU; weights_reorder_params.clKernel = std::make_shared(kernels_data[0].kernels[0]); + weights_reorder_params.src = r_params.input; weights_reorder_params.dest = r_params.output; return weights_reorder_params; diff --git a/src/plugins/intel_gpu/src/graph/impls/onednn/primitive_onednn_base.h b/src/plugins/intel_gpu/src/graph/impls/onednn/primitive_onednn_base.h index c06594580cf..f5f6f15c909 100644 --- a/src/plugins/intel_gpu/src/graph/impls/onednn/primitive_onednn_base.h +++ b/src/plugins/intel_gpu/src/graph/impls/onednn/primitive_onednn_base.h @@ -20,6 +20,7 @@ #include "reorder/reorder_weights_kernel_selector.h" #include "reorder/reorder_kernel_base.h" +#include "impls/ocl/kernel_selector_helper.h" #include #include @@ -46,7 +47,7 @@ struct typed_primitive_onednn_impl : public typed_primitive_impl { std::shared_ptr attrs, const PrimDescType& pd, kernel_selector::WeightsReorderParams weights_reorder = {}) - : typed_primitive_impl(weights_reorder, pd.impl_info_str()), + : typed_primitive_impl(create_weights_reorder_params(weights_reorder), pd.impl_info_str()), _engine(&engine), _attrs(attrs), _pd(pd) { diff --git a/src/plugins/intel_gpu/src/graph/include/generic_layer.hpp b/src/plugins/intel_gpu/src/graph/include/generic_layer.hpp deleted file mode 100644 index 405a64e9045..00000000000 --- a/src/plugins/intel_gpu/src/graph/include/generic_layer.hpp +++ /dev/null @@ -1,143 +0,0 @@ -// Copyright (C) 2018-2023 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// - -#pragma once -#include "intel_gpu/primitives/primitive.hpp" -#include "intel_gpu/runtime/memory.hpp" - -// TODO: Remove OCL impl dependency here or move to OCL folder -#include "impls/ocl/kernel_selector_helper.h" - -#include - -namespace cldnn { - -/// @brief Changes how data is ordered in memory. Value type is not changed & all information is preserved. 
-/// @details Corresponding values are bitwise equal before/after reorder. -/// Also merged with subtraction layer, which can subtract values while doing reordering. -/// NOTE THAT THIS WILL SUBTRACT THE SAME VALUES FROM EACH BATCH. -struct generic_layer : public primitive_base { - CLDNN_DECLARE_PRIMITIVE(generic_layer) - - /// @brief Constructs generic_layer primitive which takes mean subtract values from another primitive. - /// @param id This primitive id. - /// @param input Input primitive id. - /// @param output_layout Requested memory layout. - /// @param mean Primitive id to get mean subtract values. - generic_layer(const primitive_id& id, - const primitive_id& input, - const layout& output_layout, - const kernel_selector::generic_kernel_params& generic_params, - const padding& output_padding = padding()) - : primitive_base(id, {input}, {output_padding}), output_layout(output_layout), generic_params(generic_params) {} - - /// @brief Requested memory layout. - layout output_layout; - const kernel_selector::generic_kernel_params generic_params; - - size_t hash() const override { - size_t seed = primitive::hash(); - seed = hash_combine(seed, generic_params.engine); - - if (generic_params.cpuKernel != nullptr) { - auto& cpuKernel = generic_params.cpuKernel; - seed = hash_combine(seed, cpuKernel->GetExpectedInputLayout()); - seed = hash_combine(seed, cpuKernel->GetExpectedInputType()); - } - - if (generic_params.clKernel != nullptr) { - auto& clKernel = generic_params.clKernel; - seed = hash_combine(seed, clKernel->skip_execution); - - auto& gws = clKernel->params.workGroups.global; - seed = hash_range(seed, gws.begin(), gws.end()); - - auto& lws = clKernel->params.workGroups.local; - seed = hash_range(seed, lws.begin(), lws.end()); - - auto& arguments = clKernel->params.arguments; - for (auto& args : arguments) { - seed = hash_combine(seed, args.index); - seed = hash_combine(seed, args.t); - } - - auto& scalars = clKernel->params.scalars; - for (auto& s : scalars) { - seed = hash_combine(seed, s.t); - } - - seed = hash_combine(seed, clKernel->code.kernelString->get_hash()); - } - return seed; - } - - bool operator==(const primitive& rhs) const override { - if (!compare_common_params(rhs)) - return false; - - auto rhs_casted = downcast(rhs); - - if (generic_params.engine != rhs_casted.generic_params.engine) - return false; - - if (generic_params.cpuKernel != nullptr) { - if (generic_params.cpuKernel->GetExpectedInputLayout() != rhs_casted.generic_params.cpuKernel->GetExpectedInputLayout()) - return false; - - if (generic_params.cpuKernel->GetExpectedInputType() != rhs_casted.generic_params.cpuKernel->GetExpectedInputType()) - return false; - } - - if (generic_params.clKernel != nullptr) { - auto& clKernel = generic_params.clKernel; - auto& clKernel_rhs = rhs_casted.generic_params.clKernel; - if (clKernel->skip_execution != clKernel_rhs->skip_execution) - return false; - - auto& gws = clKernel->params.workGroups.global; - auto& gws_rhs = clKernel_rhs->params.workGroups.global; - if (gws != gws_rhs) - return false; - - auto& lws = clKernel->params.workGroups.local; - auto& lws_rhs = clKernel_rhs->params.workGroups.local; - if (lws != lws_rhs) - return false; - - auto& arguments = clKernel->params.arguments; - auto& arguments_rhs = clKernel_rhs->params.arguments; - if (arguments.size() != arguments_rhs.size()) - return false; - - for (size_t idx = 0; idx < arguments.size(); idx++) { - if (arguments[idx].index != arguments_rhs[idx].index) - return false; - - if (arguments[idx].t != 
arguments_rhs[idx].t) - return false; - } - - auto& scalars = clKernel->params.scalars; - auto& scalars_rhs = clKernel_rhs->params.scalars; - if (scalars.size() != scalars_rhs.size()) - return false; - - for (size_t idx = 0; idx < scalars.size(); idx++) { - if (scalars[idx].t != scalars_rhs[idx].t) - return false; - } - - if (clKernel->code.kernelString->get_str() != clKernel_rhs->code.kernelString->get_str()) - return false; - } - return true; - } - -protected: - std::vector> get_dependencies() const override { return {}; } -}; -/// @} -/// @} -/// @} -} // namespace cldnn diff --git a/src/plugins/intel_gpu/src/graph/include/generic_layer_inst.h b/src/plugins/intel_gpu/src/graph/include/generic_layer_inst.h index 3016cb6ad8a..9022f9aadaf 100644 --- a/src/plugins/intel_gpu/src/graph/include/generic_layer_inst.h +++ b/src/plugins/intel_gpu/src/graph/include/generic_layer_inst.h @@ -3,7 +3,7 @@ // #pragma once -#include "generic_layer.hpp" +#include "intel_gpu/primitives/generic_layer.hpp" #include "primitive_inst.h" #include @@ -31,12 +31,13 @@ class typed_primitive_inst : public typed_primitive_inst_base()->output_layout; + return impl_param.typed_desc()->params->get_output_layout(); } static std::string to_string(generic_layer_node const& node); typed_primitive_inst(network& network, generic_layer_node const& node); + typed_primitive_inst(network& network); }; using generic_layer_inst = typed_primitive_inst; diff --git a/src/plugins/intel_gpu/src/graph/include/implementation_map.hpp b/src/plugins/intel_gpu/src/graph/include/implementation_map.hpp index 6a79e4391ab..3be2041bd8e 100644 --- a/src/plugins/intel_gpu/src/graph/include/implementation_map.hpp +++ b/src/plugins/intel_gpu/src/graph/include/implementation_map.hpp @@ -130,4 +130,28 @@ public: return keys; } }; + +struct WeightsReordersFactory { + using factory_type = std::function(const kernel_impl_params&)>; + using map_type = singleton_map, factory_type>; + static void add(impl_types impl_type, shape_types shape_type, factory_type factory) { + OPENVINO_ASSERT(impl_type != impl_types::any, "[GPU] Can't register WeightsReordersFactory with type any"); + map_type::instance().insert({{impl_type, shape_type}, factory}); + } + + static factory_type get(impl_types preferred_impl_type, shape_types target_shape_type) { + for (auto& kv : map_type::instance()) { + impl_types impl_type = kv.first.first; + shape_types supported_shape_type = kv.first.second; + if ((preferred_impl_type & impl_type) != impl_type) + continue; + if ((target_shape_type & supported_shape_type) != target_shape_type) + continue; + + return kv.second; + } + OPENVINO_THROW("[GPU] WeightsReordersFactory doesn't have any implementation for " + " impl_type: ", preferred_impl_type, ", shape_type: ", target_shape_type); + } +}; } // namespace cldnn diff --git a/src/plugins/intel_gpu/src/graph/include/layout_optimizer.h b/src/plugins/intel_gpu/src/graph/include/layout_optimizer.h index 17710b3a412..739788a570f 100644 --- a/src/plugins/intel_gpu/src/graph/include/layout_optimizer.h +++ b/src/plugins/intel_gpu/src/graph/include/layout_optimizer.h @@ -7,21 +7,15 @@ #include "intel_gpu/runtime/memory.hpp" #include "intel_gpu/runtime/engine.hpp" #include "intel_gpu/runtime/utils.hpp" +#include "intel_gpu/runtime/lru_cache.hpp" #include "data_inst.h" +#include "generic_layer_inst.h" #include "reorder_inst.h" #include "convolution_inst.h" #include "deconvolution_inst.h" -#include "fully_connected_inst.h" #include "detection_output_inst.h" #include "binary_convolution_inst.h" -#include 
"lstm_gemm_inst.h" -#include "generic_layer.hpp" -#include "non_max_suppression_inst.h" -#include "region_yolo_inst.h" - -// TODO: add generic interface for weights_reorder_params and get rid of this dependency -#include "impls/ocl/kernel_selector_helper.h" #include #include @@ -52,10 +46,8 @@ public: const layout& in_layout, const layout& out_layout); - std::vector, bool>> get_weights_reorder( - primitive_id input_id, - const layout& old_layout, - const kernel_selector::weights_reorder_params& reorder_params); + std::pair, bool> get_weights_reorder(primitive_id input_id, + std::shared_ptr reorder_params); private: struct cache_key { diff --git a/src/plugins/intel_gpu/src/graph/include/primitive_inst.h b/src/plugins/intel_gpu/src/graph/include/primitive_inst.h index f2972898c57..52887994dc7 100644 --- a/src/plugins/intel_gpu/src/graph/include/primitive_inst.h +++ b/src/plugins/intel_gpu/src/graph/include/primitive_inst.h @@ -5,6 +5,7 @@ #pragma once #include "intel_gpu/primitives/primitive.hpp" #include "intel_gpu/primitives/concatenation.hpp" +#include "intel_gpu/primitives/generic_layer.hpp" #include "intel_gpu/runtime/event.hpp" #include "intel_gpu/runtime/memory.hpp" #include "intel_gpu/runtime/lru_cache.hpp" @@ -43,21 +44,22 @@ class typed_primitive_inst; */ struct primitive_impl { primitive_impl() = default; - explicit primitive_impl(const kernel_selector::weights_reorder_params& params, std::string kernel_name = "", bool is_dynamic = false) - : _weights_reorder_params(params), _kernel_name(kernel_name), _is_dynamic(is_dynamic) {} + explicit primitive_impl(std::shared_ptr params, std::string kernel_name = "", bool is_dynamic = false) + : _weights_reorder_params(params), _kernel_name(kernel_name), _is_dynamic(is_dynamic) { + } explicit primitive_impl(std::string kernel_name, bool is_dynamic = false) : - primitive_impl(kernel_selector::weights_reorder_params{}, kernel_name, is_dynamic) {} + primitive_impl(nullptr, kernel_name, is_dynamic) {} virtual ~primitive_impl() = default; virtual std::vector get_internal_buffer_layouts() const = 0; virtual void set_node_params(const program_node&) {} virtual std::string get_type() const = 0; virtual void set_arguments(primitive_inst& instance) = 0; + virtual void set_arguments(primitive_inst& instance, kernel_arguments_data& args) = 0; virtual kernel_arguments_data get_arguments(const primitive_inst& instance) const = 0; virtual event::ptr execute(const std::vector& events, primitive_inst& instance) = 0; std::string get_kernel_name() const { return _kernel_name; } - // TODO: added a derived class for weights reordering (maybe for all static data reordering) - kernel_selector::weights_reorder_params _weights_reorder_params; + // class typed_primitive_gpu_impl override this with return false; virtual bool is_cpu() const { return true; } virtual void init_kernels(const kernels_cache& kernels_cache, const kernel_impl_params& params) = 0; @@ -94,7 +96,14 @@ struct primitive_impl { virtual void set_kernels(cldnn::kernels_cache::compiled_kernels kernels) {} virtual std::vector get_kernels() { return {}; } + bool need_weights_reorder() const { return _weights_reorder_params != nullptr; } + std::shared_ptr get_weights_reorder_params() const { return _weights_reorder_params; } + void reset_weights_reorder_params() { _weights_reorder_params = nullptr; } + + std::shared_ptr get_weights_reorder_kernel_params() const; + protected: + std::shared_ptr _weights_reorder_params = nullptr; std::string _kernel_name; bool _is_dynamic = false; }; @@ -151,6 +160,8 @@ 
public: const kernel_impl_params* get_impl_params() const { return _impl_params.get(); } // return pointer to const to prevent arbitrary 'execute' call -> use primitive_inst.execute() instead const primitive_impl* get_impl() const { return _impl.get(); } + primitive_impl* get_impl() { return _impl.get(); } + void set_impl(std::unique_ptr impl) { _impl = std::move(impl); } memory& input_memory(size_t index = 0) const { if (index >= inputs_memory_count()) @@ -418,11 +429,22 @@ private: return set_arguments_impl(reinterpret_cast&>(instance)); } + void set_arguments(primitive_inst& instance, kernel_arguments_data& args) override { + OPENVINO_ASSERT(instance.type() == PType::type_id(), "[GPU] Implementation type ", instance.type(), + " does not match primitive type ", PType::type_id()); + if (instance.get_impl() != this) + throw std::invalid_argument( + "Trying to set_arguments for primitive implementation with mismatching primitive instance"); + + return set_arguments_impl(reinterpret_cast&>(instance), args); + } + kernel_arguments_data get_arguments(const primitive_inst& instance) const override { return get_arguments_impl(reinterpret_cast&>(instance)); } virtual void set_arguments_impl(typed_primitive_inst& /*instance*/) {} + virtual void set_arguments_impl(typed_primitive_inst& /*instance*/, kernel_arguments_data& /*args*/) {} virtual kernel_arguments_data get_arguments_impl(const typed_primitive_inst& /*instance*/) const { kernel_arguments_data args; return args; diff --git a/src/plugins/intel_gpu/src/graph/layout_optimizer.cpp b/src/plugins/intel_gpu/src/graph/layout_optimizer.cpp index e3ea869b8d1..f9b8255748b 100644 --- a/src/plugins/intel_gpu/src/graph/layout_optimizer.cpp +++ b/src/plugins/intel_gpu/src/graph/layout_optimizer.cpp @@ -13,11 +13,12 @@ #include "reshape_inst.h" #include "arg_max_min_inst.h" #include "shape_of_inst.h" -#include "generic_layer.hpp" #include #include "gemm_inst.h" #include "deconvolution_inst.h" +#include "fully_connected_inst.h" +#include "non_max_suppression_inst.h" #include "eltwise_inst.h" #include "pooling_inst.h" #include "reduce_inst.h" @@ -155,50 +156,26 @@ std::pair, bool> reorder_factory::get_reorder(primitive return std::make_pair(reorder, false); } -std::vector, bool>> reorder_factory::get_weights_reorder( - primitive_id input_id, - const layout& old_layout, - const kernel_selector::weights_reorder_params& reorder_params) { - - if (reorder_params.engine == kernel_selector::weights_reorder_params::Engine::NONE) +std::pair, bool> reorder_factory::get_weights_reorder(primitive_id input_id, + std::shared_ptr reorder_params) { + if (reorder_params == nullptr) return {}; - std::vector, bool>> ret; - - if (reorder_params.engine == kernel_selector::weights_reorder_params::Engine::CPU && - reorder_params.cpuKernel != nullptr) { - const auto intermediate_format = from_weights_layout(reorder_params.cpuKernel->GetExpectedInputLayout()); - const auto intermediate_type = from_weights_type(reorder_params.cpuKernel->GetExpectedInputType()); - if (intermediate_format != old_layout.format || intermediate_type != old_layout.data_type) { - const layout intermediate_layout = { intermediate_type, - intermediate_format, - old_layout.get_tensor().transform(intermediate_format, 1) }; - - auto reorder = get_reorder(input_id, old_layout, intermediate_layout); - if (reorder.first) { - ret.push_back(reorder); - input_id = reorder.first->id; - } - } - } - - layout expected_layout = from_weights_tensor(reorder_params.dest); + layout expected_layout = 
reorder_params->get_output_layout(); cache_key ckey{ input_id, expected_layout, false }; auto itr = _cached_generic_reorders.find(ckey); if (itr != _cached_generic_reorders.end()) { - ret.push_back(std::make_pair(itr->second, true)); + return std::make_pair(itr->second, true); } else { auto count = _cached_generic_reorders.size(); std::stringstream ss; ss << input_id << "_generic_layer_" << count; - auto reorder = std::make_shared(ss.str(), input_id, expected_layout, reorder_params); + auto reorder = std::make_shared(ss.str(), input_id, reorder_params); _cached_generic_reorders[ckey] = reorder; - ret.push_back(std::make_pair(reorder, false)); + return std::make_pair(reorder, false); } - - return ret; } bool layout_optimizer::is_format_supported(program_node& node, format::type fmt) { diff --git a/src/plugins/intel_gpu/src/graph/network.cpp b/src/plugins/intel_gpu/src/graph/network.cpp index 5f3299b622c..936b4bb4d78 100644 --- a/src/plugins/intel_gpu/src/graph/network.cpp +++ b/src/plugins/intel_gpu/src/graph/network.cpp @@ -337,11 +337,6 @@ network::network(program::ptr program, const ExecutionConfig& config, stream::pt build_exec_order(); validate_primitives(); add_default_output_chains(); - - if (is_dynamic()) { - GPU_DEBUG_DEFINE_MEM_LOGGER("dynamic_network_initialization"); - _in_mem_kernels_cache = std::unique_ptr(new KernelsCache(_in_mem_kernels_cache_capacity)); - } } network::network(engine& engine, @@ -537,7 +532,8 @@ void network::save(cldnn::BinaryOutputBuffer& ob) { kernels_cache.reset(); for (const auto& p_inst : _exec_order) { if (p_inst->get_impl() != nullptr) { - kernels_cache.add_to_cached_kernels(p_inst->get_impl()->get_kernels()); + auto const_impl = static_cast(p_inst->get_impl()); + kernels_cache.add_to_cached_kernels(const_impl->get_kernels()); } } ob << kernels_cache; diff --git a/src/plugins/intel_gpu/src/graph/primitive_inst.cpp b/src/plugins/intel_gpu/src/graph/primitive_inst.cpp index 804a10219bf..c30a0074180 100644 --- a/src/plugins/intel_gpu/src/graph/primitive_inst.cpp +++ b/src/plugins/intel_gpu/src/graph/primitive_inst.cpp @@ -11,11 +11,13 @@ #include "fully_connected_inst.h" #include "convolution_inst.h" #include "crop_inst.h" +#include "eltwise_inst.h" #include "deconvolution_inst.h" #include "shape_of_inst.h" #include "gemm_inst.h" #include "experimental_detectron_roi_feature_extractor_inst.hpp" #include "compilation_context.hpp" +#include "implementation_map.hpp" #include "intel_gpu/plugin/common_utils.hpp" #include "intel_gpu/graph/network.hpp" @@ -93,6 +95,19 @@ bool is_any_user_cpu(const std::list& users) { return false; } +std::shared_ptr primitive_impl::get_weights_reorder_kernel_params() const { + if (!need_weights_reorder()) + return nullptr; + + auto reorder_kernel_params = std::make_shared(); + auto prim = std::make_shared("", "", _weights_reorder_params); + reorder_kernel_params->desc = prim; + reorder_kernel_params->unique_id = _weights_reorder_params->hash(); + reorder_kernel_params->input_layouts.push_back(_weights_reorder_params->get_input_layout()); + reorder_kernel_params->output_layouts.push_back(_weights_reorder_params->get_output_layout()); + return reorder_kernel_params; +} + kernel_impl_params primitive_impl::static_canonicalize_shapes(const kernel_impl_params& impl_params) { auto updated_impl_params = canonicalize_fused_shapes(impl_params); @@ -787,19 +802,19 @@ event::ptr primitive_inst::update_weights() { return nullptr; auto& engine = _network.get_engine(); - auto& weights_params = _impl->_weights_reorder_params; + auto 
reorder_kernel_params = _impl->get_weights_reorder_kernel_params();
     auto weights_idx = _node->get_primitive()->input.size();
     auto original_weights_memory = dep_memory_ptr(weights_idx);
     auto original_layout = original_weights_memory->get_layout();

-    if (weights_params.engine == kernel_selector::GenericKernelParams::Engine::NONE) {
+    if (!reorder_kernel_params) {
         // If the kernel says that it doesn't require weights reorder, but weights were reordered previously, then
         // an incorrect memory buffer may be assigned, so reset cached weights for such a case
         _reordered_weights_cache.add(original_layout, original_weights_memory);
         _impl_params->weights_layout = optional_layout(original_layout);
     } else {
-        auto expected_layout = from_weights_tensor(weights_params.dest);
+        auto expected_layout = reorder_kernel_params->get_output_layout();
         // Set original partial shape, because it may be lost during kernel_selector::weights_tensor -> layout conversion
         expected_layout.set_partial_shape(original_layout.get_partial_shape());
         _impl_params->weights_layout = optional_layout(expected_layout);
@@ -816,30 +831,27 @@ event::ptr primitive_inst::update_weights() {
             return nullptr;
         } else {
             GPU_DEBUG_PROFILED_STAGE_CACHE_HIT(false);
-            auto get_kernel_key = [&]() -> size_t {
-                auto seed = _node->get_primitive()->hash();
-                seed = hash_combine(seed, expected_layout.hash());
-                seed = hash_combine(seed, original_layout.hash());
-                return seed;
-            };
+            auto& cache = get_network().get_program()->get_implementations_cache();
+            auto reorder_inst = std::make_shared<generic_layer_inst>(get_network());

-            cldnn::kernel::ptr kernel = nullptr;
-            auto kernel_key = get_kernel_key();
-            auto& cache = get_network().get_in_mem_kernels_cache();
-            if (cache.has(kernel_key)) {
+            if (auto cached_impl = cache.get(*reorder_kernel_params)) {
                 GPU_DEBUG_TRACE_DETAIL << id() << ": reorder weights (cached) from " << original_layout.to_short_string()
                                        << " to " << expected_layout.to_short_string() << std::endl;
-                GPU_DEBUG_PROFILED_STAGE_CACHE_HIT(true);
-                kernel = cache.get(kernel_key);
+                reorder_inst->set_impl(cached_impl->clone());
             } else {
                 GPU_DEBUG_TRACE_DETAIL << id() << ": reorder weights from " << original_layout.to_short_string()
                                        << " to " << expected_layout.to_short_string() << std::endl;
+
+                auto factory = WeightsReordersFactory::get(impl_types::ocl, shape_types::static_shape);
+                auto reorder_impl = factory(*reorder_kernel_params);
                 auto& kernels_cache = get_network().get_program()->get_kernels_cache();
-                auto kernels = kernels_cache.compile(*_impl_params, {weights_params.clKernel->code.kernelString});
-                OPENVINO_ASSERT(kernels.size() == 1, "The output of kernel compile has issue");
-                auto& kernel_data = kernels.begin()->second;
-                kernel = kernel_data[0].first;
-                cache.add(kernel_key, kernel);
+                auto kernels = kernels_cache.compile(*_impl_params, reorder_impl->get_kernels_source());
+                OPENVINO_ASSERT(kernels.size() == 1, "[GPU] Expected number of compiled kernels is 1, but got ", kernels.size());
+                reorder_impl->set_kernels(kernels);
+
+                reorder_inst->set_impl(reorder_impl->clone());
+
+                cache.add(*reorder_kernel_params, reorder_impl->clone());
             }

             auto& stream = get_network().get_stream();
@@ -867,8 +879,10 @@ event::ptr primitive_inst::update_weights() {
             kernel_arguments_data args;
             args.inputs.push_back(original_weights_memory);
             args.outputs.push_back(weights_memory);
-            stream.set_arguments(*kernel, weights_params.clKernel->params, args);
-            auto ev = stream.enqueue_kernel(*kernel, weights_params.clKernel->params, args, {}, true);
+
+            auto reorder_impl =
reorder_inst->get_impl(); + reorder_impl->set_arguments(*reorder_inst, args); + auto ev = reorder_impl->execute({}, *reorder_inst); GPU_DEBUG_GET_INSTANCE(debug_config); GPU_DEBUG_IF(!debug_config->dump_profiling_data.empty()) { diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernel_selector_common.h b/src/plugins/intel_gpu/src/kernel_selector/kernel_selector_common.h index f01b4634a9a..4d00949e44b 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernel_selector_common.h +++ b/src/plugins/intel_gpu/src/kernel_selector/kernel_selector_common.h @@ -67,30 +67,21 @@ struct clKernelData { bool skip_execution = false; }; -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -// CPUKernel -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// -struct CPUKernel { - virtual WeightsType GetExpectedInputType() = 0; - virtual WeightsLayout GetExpectedInputLayout() const { return WeightsLayout::oiyx; } - virtual void Execute(void* input, size_t input_size, void* output, size_t output_size) const = 0; -}; - //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// // GenericKernelParams //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// struct GenericKernelParams { - enum class Engine { NONE, CPU, GPU }; + enum class Engine { NONE, GPU }; Engine engine = Engine::NONE; std::shared_ptr clKernel; - std::shared_ptr cpuKernel; }; //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// // WeightsReorderParams //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// struct WeightsReorderParams : public GenericKernelParams { + WeightsTensor src; WeightsTensor dest; }; diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernel_selector_utils.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernel_selector_utils.cpp index c6832515597..4741ebcb71d 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernel_selector_utils.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/kernel_selector_utils.cpp @@ -133,6 +133,7 @@ bool UpdateWeightsParams(weight_bias_params& newParams, weightsReorderParams.engine = WeightsReorderParams::Engine::GPU; weightsReorderParams.clKernel = std::make_shared(kernels_data[0].kernels[0]); + weightsReorderParams.src = r_params.input; weightsReorderParams.dest = r_params.output; newParams.weights = newParams.weights.TransformIgnorePadding(reqLayout, dtype, groups); diff --git a/src/plugins/intel_gpu/tests/module_tests/weights_reorder_factory_test.cpp b/src/plugins/intel_gpu/tests/module_tests/weights_reorder_factory_test.cpp new file mode 100644 index 00000000000..b5c13c37de6 --- /dev/null +++ b/src/plugins/intel_gpu/tests/module_tests/weights_reorder_factory_test.cpp @@ -0,0 +1,111 @@ +// Copyright (C) 2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "test_utils.h" + +#include "intel_gpu/graph/network.hpp" +#include "intel_gpu/graph/program.hpp" +#include "intel_gpu/primitives/input_layout.hpp" +#include "intel_gpu/primitives/data.hpp" + +#include "generic_layer_inst.h" +#include "fully_connected_inst.h" +#include "implementation_map.hpp" +#include "graph/impls/ocl/register.hpp" + +#include + +using namespace cldnn; +using namespace ::tests; + 
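+// The registration/lookup contract of WeightsReordersFactory exercised by the tests
+// below, in short (a sketch; the registration itself is performed in
+// attach_generic_layer_impl, see generic_layer.cpp above):
+//
+//     WeightsReordersFactory::add(impl_types::ocl, shape_types::static_shape,
+//                                 generic_layer_impl::create);               // registration
+//     auto factory = WeightsReordersFactory::get(impl_types::ocl,
+//                                                shape_types::static_shape); // lookup
+//     std::unique_ptr<primitive_impl> impl = factory(reorder_kernel_params); // creation
+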
+TEST(weights_factory, impl_types) {
+    program::init_primitives();
+    ASSERT_NO_THROW(WeightsReordersFactory::get(impl_types::ocl, shape_types::static_shape));
+    ASSERT_NO_THROW(WeightsReordersFactory::get(impl_types::any, shape_types::static_shape));
+
+    ASSERT_ANY_THROW(WeightsReordersFactory::get(impl_types::cpu, shape_types::static_shape));
+    ASSERT_ANY_THROW(WeightsReordersFactory::get(impl_types::onednn, shape_types::static_shape));
+}
+
+TEST(weights_factory, shape_types) {
+    program::init_primitives();
+    ASSERT_NO_THROW(WeightsReordersFactory::get(impl_types::ocl, shape_types::static_shape));
+
+    ASSERT_ANY_THROW(WeightsReordersFactory::get(impl_types::ocl, shape_types::dynamic_shape));
+}
+
+TEST(weights_factory, reorder_test) {
+    auto& engine = get_test_engine();
+    const int input_f = 32, output_f = 32;
+
+    auto weights_layout = layout(ov::PartialShape{ output_f, input_f }, data_types::f32, format::bfyx);
+    auto weights_data_input = engine.allocate_memory(weights_layout);
+    auto weights_data_vec = generate_random_1d<float>(output_f * input_f, -1, 1);
+    set_values(weights_data_input, weights_data_vec);
+
+    cldnn::topology topology {
+        input_layout("input", layout{ ov::PartialShape{ -1, input_f }, data_types::f32, format::bfyx }),
+        data("weights", weights_data_input),
+        fully_connected("fc", input_info("input"), "weights")
+    };
+
+    ov::intel_gpu::ImplementationDesc fc_impl_desc = { format::bfyx, "fully_connected_gpu_bf_tiled", impl_types::ocl };
+    ExecutionConfig config = get_test_default_config(engine);
+    config.set_property(ov::intel_gpu::optimize_data(true));
+    config.set_property(ov::intel_gpu::force_implementations(ov::intel_gpu::ImplForcingMap{ {"fc", fc_impl_desc} }));
+    config.set_property(ov::intel_gpu::allow_new_shape_infer(true));
+    cldnn::network network(engine, topology, config);
+
+    auto inst = network.get_primitive("fc");
+    auto impl = inst->get_impl();
+    ASSERT_TRUE(impl != nullptr);
+
+    // Get required WeightsReorderParams
+    auto weights_reorder_params = impl->get_weights_reorder_params();
+    ASSERT_TRUE(weights_reorder_params != nullptr);
+
+    // Construct kernel_impl_params for the weights reorder based on the requested WeightsReorderParams
+    auto reorder_kernel_params = std::make_shared<kernel_impl_params>();
+    reorder_kernel_params->desc = std::make_shared<generic_layer>("weights_reorder", "", weights_reorder_params);
+    reorder_kernel_params->unique_id = weights_reorder_params->hash();
+    reorder_kernel_params->input_layouts.push_back(weights_reorder_params->get_input_layout());
+    reorder_kernel_params->output_layouts.push_back(weights_reorder_params->get_output_layout());
+
+    // Create new generic_layer_impl
+    auto factory = WeightsReordersFactory::get(impl_types::ocl, shape_types::static_shape);
+    auto reorder_impl = factory(*reorder_kernel_params);
+    ASSERT_TRUE(reorder_impl != nullptr);
+
+    // Compile kernel
+    auto& kernel_cache = network.get_program()->get_kernels_cache();
+    auto kernels = kernel_cache.compile(*reorder_kernel_params, reorder_impl->get_kernels_source());
+    ASSERT_TRUE(kernels.size() == 1);
+    reorder_impl->set_kernels(kernels);
+
+    // Allocate memory and execute generic_layer
+    auto output_weights_layout = weights_reorder_params->get_output_layout();
+    auto weights_data_output = engine.allocate_memory({ output_weights_layout });
+
+    kernel_arguments_data args;
+    args.inputs.push_back(weights_data_input);
+    args.outputs.push_back(weights_data_output);
+
+    auto reorder_inst = std::make_shared<generic_layer_inst>(network);
+    reorder_inst->set_impl(reorder_impl->clone());
+
+ reorder_inst->get_impl()->set_arguments(*reorder_inst, args);
+ reorder_inst->get_impl()->execute({}, *reorder_inst);
+
+ network.get_stream().finish();
+
+ // Compare with expected results
+ cldnn::mem_lock<float> output_ptr(weights_data_output, get_test_stream());
+ for (int o = 0; o < output_f; o++) {
+ for (int i = 0; i < input_f; i++) {
+ auto tensor_coord = tensor(std::vector<tensor::value_type>{o, i}, 0);
+ size_t input_idx = output_weights_layout.get_linear_offset(tensor_coord);
+ ASSERT_EQ(weights_data_vec[o * input_f + i], output_ptr[input_idx]);
+ }
+ }
+}
diff --git a/src/plugins/intel_gpu/tests/unit/module_tests/kernel_impl_params_relevance_test.cpp b/src/plugins/intel_gpu/tests/unit/module_tests/kernel_impl_params_relevance_test.cpp
index db3038f5000..c2c8e7d295b 100644
--- a/src/plugins/intel_gpu/tests/unit/module_tests/kernel_impl_params_relevance_test.cpp
+++ b/src/plugins/intel_gpu/tests/unit/module_tests/kernel_impl_params_relevance_test.cpp
@@ -8,6 +8,7 @@
 #include
 #include
+#include "fully_connected_inst.h"
 #include "compilation_context.hpp"
 #include "program_wrapper.h"
diff --git a/src/plugins/intel_gpu/tests/unit/module_tests/primitive_comparison_test.cpp b/src/plugins/intel_gpu/tests/unit/module_tests/primitive_comparison_test.cpp
index a047bb36b13..300e6b5064b 100644
--- a/src/plugins/intel_gpu/tests/unit/module_tests/primitive_comparison_test.cpp
+++ b/src/plugins/intel_gpu/tests/unit/module_tests/primitive_comparison_test.cpp
@@ -10,6 +10,7 @@
 #include
 #include
 #include
+#include <intel_gpu/primitives/generic_layer.hpp>

 using namespace cldnn;
 using namespace ::tests;
@@ -109,3 +110,19 @@ TEST(primitive_comparison, permute) {
 ASSERT_EQ(permute_prim, permute_prim_eq);
 ASSERT_NE(permute_prim, permute_prim_order);
 }
+
+TEST(primitive_comparison, generic_layer) {
+ auto shape = ov::PartialShape{1, 2, 3, 4};
+ auto data_type = data_types::f32;
+ auto format_in = format::bfyx;
+ auto format_out = format::os_iyx_osv16;
+
+ auto input_layout = layout{shape, data_type, format_in};
+ auto output_layout = layout{shape, data_type, format_out};
+ auto generic_layer_prim = generic_layer("generic_layer", "", std::make_shared<WeightsReorderParams>(input_layout, output_layout));
+ auto generic_layer_eq_prim = generic_layer("generic_layer_eq", "", std::make_shared<WeightsReorderParams>(input_layout, output_layout));
+ auto generic_layer_different_prim = generic_layer("generic_layer", "", std::make_shared<WeightsReorderParams>(output_layout, input_layout));
+
+ ASSERT_EQ(generic_layer_prim, generic_layer_eq_prim);
+ ASSERT_NE(generic_layer_prim, generic_layer_different_prim);
+}
diff --git a/src/plugins/intel_gpu/tests/unit/passes/handle_reshape.cpp b/src/plugins/intel_gpu/tests/unit/passes/handle_reshape.cpp
index 41b553fcf62..d28437d6e3a 100644
--- a/src/plugins/intel_gpu/tests/unit/passes/handle_reshape.cpp
+++ b/src/plugins/intel_gpu/tests/unit/passes/handle_reshape.cpp
@@ -13,6 +13,7 @@
 #include "reshape_inst.h"
 #include "reorder_inst.h"
 #include "broadcast_inst.h"
+#include "fully_connected_inst.h"
 #include "pass_manager.h"
 #include "to_string_utils.h"
diff --git a/src/plugins/intel_gpu/tests/unit/passes/prepare_primitive_fusing_test.cpp b/src/plugins/intel_gpu/tests/unit/passes/prepare_primitive_fusing_test.cpp
index 62a3f891db8..3030f4a2d96 100644
--- a/src/plugins/intel_gpu/tests/unit/passes/prepare_primitive_fusing_test.cpp
+++ b/src/plugins/intel_gpu/tests/unit/passes/prepare_primitive_fusing_test.cpp
@@ -12,6 +12,7 @@
 #include "eltwise_inst.h"
 #include "reduce_inst.h"
 #include "reshape_inst.h"
+#include "fully_connected_inst.h"
 #include "gemm_inst.h"
 #include "convolution_inst.h"
 #include "pass_manager.h"
diff --git a/src/plugins/intel_gpu/tests/unit/test_cases/fully_connected_gpu_test.cpp b/src/plugins/intel_gpu/tests/unit/test_cases/fully_connected_gpu_test.cpp
index ac4a7e8351f..a172ed7a0e6 100644
--- a/src/plugins/intel_gpu/tests/unit/test_cases/fully_connected_gpu_test.cpp
+++ b/src/plugins/intel_gpu/tests/unit/test_cases/fully_connected_gpu_test.cpp
@@ -2351,3 +2351,60 @@ INSTANTIATE_TEST_SUITE_P(
 ::testing::Values(2, 9, 16, 32, 64, 128),
 ::testing::Values(false, true))
 );
+
+TEST(fully_connected_gpu, has_cached_weights_reorder) {
+ auto& engine = get_test_engine();
+
+ const int32_t input_f = 3, input_b = 1, weight_b = 4;
+
+ auto input_dyn_layout = layout{ ov::PartialShape{ ov::Dimension(1, 10), input_f }, data_types::f32, format::bfyx };
+ auto input_data = engine.allocate_memory(layout{ ov::PartialShape{ input_b, input_f }, data_types::f32, format::bfyx });
+ auto weights_data = engine.allocate_memory({ ov::PartialShape{ weight_b, input_f }, data_types::f32, format::bfyx });
+
+ set_values(input_data, { -0.5f, 2.0f, 0.5f });
+ set_values(weights_data, { 1.5f, 1.0f, 0.5f, -1.0f, 0.0f, 0.5f, 0.5f, -0.5f, -2.0f, -0.5f, 1.0f, 1.5f });
+
+ cldnn::topology topology{
+ input_layout("input", input_dyn_layout),
+ data("weights", weights_data),
+ fully_connected("fc", input_info("input"), "weights")
+ };
+
+ ov::intel_gpu::ImplementationDesc fc_impl_desc = { format::bfyx, "fully_connected_gpu_bf_tiled", impl_types::ocl };
+ ExecutionConfig config = get_test_default_config(engine);
+ config.set_property(ov::intel_gpu::optimize_data(true));
+ config.set_property(ov::intel_gpu::force_implementations(ov::intel_gpu::ImplForcingMap{ {"fc", fc_impl_desc} }));
+ config.set_property(ov::intel_gpu::allow_new_shape_infer(true));
+ network network(engine, topology, config);
+ network.set_input_data("input", input_data);
+
+ auto outputs = network.execute();
+ ASSERT_EQ(outputs.size(), size_t(1));
+ ASSERT_EQ(outputs.begin()->first, "fc");
+
+ auto output_prim_mem = outputs.begin()->second.get_memory();
+
+ auto inst = network.get_primitive("fc");
+ auto impl = inst->get_impl();
+ ASSERT_TRUE(impl != nullptr);
+ ASSERT_TRUE(impl->is_dynamic());
+
+ // The weights reorder implementation should have been cached during execution
+ auto reorder_kernel_params = impl->get_weights_reorder_kernel_params();
+ ASSERT_TRUE(reorder_kernel_params != nullptr);
+ auto reorder_impl = network.get_program()->get_implementations_cache().get(*reorder_kernel_params);
+ ASSERT_TRUE(reorder_impl != nullptr);
+
+ auto out_l = network.get_output_layout(outputs.begin()->first);
+ ASSERT_EQ(output_prim_mem->get_layout().batch(), align_to(input_b, 8)); // fake_alignment
+ ASSERT_EQ(out_l.batch(), input_b);
+ ASSERT_EQ(out_l.feature(), weight_b);
+ ASSERT_EQ(out_l.spatial(0), 1);
+ ASSERT_EQ(out_l.spatial(1), 1);
+
+ cldnn::mem_lock<float> output_ptr(output_prim_mem, get_test_stream());
+
+ ASSERT_EQ(1.5f, output_ptr[0]);
+ ASSERT_EQ(0.75f, output_ptr[1]);
+ ASSERT_EQ(-2.25f, output_ptr[2]);
+ ASSERT_EQ(3.0f, output_ptr[3]);
+}
diff --git a/src/plugins/intel_gpu/tests/unit/test_cases/hash_key_gpu_test.cpp b/src/plugins/intel_gpu/tests/unit/test_cases/hash_key_gpu_test.cpp
index 4bdff59bf1c..41d7966929f 100644
--- a/src/plugins/intel_gpu/tests/unit/test_cases/hash_key_gpu_test.cpp
+++ b/src/plugins/intel_gpu/tests/unit/test_cases/hash_key_gpu_test.cpp
@@ -14,10 +14,10 @@
 #include
 #include
 #include
+#include

-#include "eltwise_inst.h"
-// #include "fully_connected_inst.h"
+#include "primitive_inst.h"

 using namespace cldnn;
 using namespace tests;