[GPU] Weights reorders primitive caching (#16638)

Co-authored-by: Vladimir Paramuzov <vladimir.paramuzov@intel.com>
Sergey Shlyapnikov 2023-05-15 09:57:04 +04:00 committed by GitHub
parent 42ef81a9e6
commit 808647dfb3
39 changed files with 594 additions and 338 deletions

View File

@@ -225,10 +225,6 @@ public:
     /// Returns memory state @p variable_id of stateful network
     VariableState& get_variable_memory(const std::string &variable_id);
-    /// Return in_mem_kernels_cache
-    KernelsCache& get_in_mem_kernels_cache() const { return *_in_mem_kernels_cache; }
-    std::mutex& get_impl_cache_mutex() const { return _in_mem_cache_mutex; }
     const ExecutionConfig& get_config() const { return _config; }

 private:
@@ -260,8 +256,6 @@ private:
     std::unordered_map<primitive_id, event::ptr> _events;
     output_chains_map _output_chains;
-    mutable std::mutex _in_mem_cache_mutex;
     void build_exec_order();
     void allocate_primitive_instance(program_node const& node);
     void transfer_memory_to_device(std::shared_ptr<primitive_inst> instance, program_node const& node);
@@ -273,10 +267,6 @@ private:
     void calculate_weights_cache_capacity();
     output_chains_map::iterator add_output_chain(std::shared_ptr<primitive_inst>& p_inst);
-    // Move from cldnn::program to cldnn::network for multi-threads issue.
-    std::unique_ptr<KernelsCache> _in_mem_kernels_cache;
-    const size_t _in_mem_kernels_cache_capacity = 10000;
 #ifdef GPU_DEBUG_CONFIG
     int64_t iteration = 0;
 #endif

View File

@@ -0,0 +1,86 @@
// Copyright (C) 2018-2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#pragma once
#include "intel_gpu/primitives/primitive.hpp"
#include "intel_gpu/runtime/memory.hpp"
#include <vector>
namespace cldnn {
struct WeightsReorderParams {
WeightsReorderParams(layout in_layout, layout out_layout) : _in_layout(in_layout), _out_layout(out_layout) {}
virtual size_t hash() const {
return hash_combine(_in_layout.hash(), _out_layout.hash());
}
virtual bool operator==(const WeightsReorderParams& rhs) const {
if (typeid(*this) != typeid(rhs))
return false;
return _in_layout == rhs._in_layout &&
_out_layout == rhs._out_layout;
}
layout get_input_layout() const { return _in_layout; }
layout get_output_layout() const { return _out_layout; }
virtual ~WeightsReorderParams() = default;
protected:
layout _in_layout;
layout _out_layout;
};
/// @brief Changes how data is ordered in memory. Value type is not changed & all information is preserved.
/// @details Corresponding values are bitwise equal before/after reorder.
struct generic_layer : public primitive_base<generic_layer> {
CLDNN_DECLARE_PRIMITIVE(generic_layer)
/// @brief Constructs generic_layer primitive which performs the weights reorder described by @p params.
/// @param id This primitive id.
/// @param input Input primitive id.
/// @param params Weights reorder parameters (input/output layouts).
generic_layer(const primitive_id& id,
const primitive_id& input,
std::shared_ptr<WeightsReorderParams> params,
const padding& output_padding = padding())
: primitive_base(id, {input}, {output_padding}), params(params) {}
std::shared_ptr<WeightsReorderParams> params;
size_t hash() const override {
size_t seed = primitive::hash();
if (params)
seed = hash_combine(seed, params->hash());
return seed;
}
bool operator==(const primitive& rhs) const override {
if (!compare_common_params(rhs))
return false;
auto rhs_casted = downcast<const generic_layer>(rhs);
if ((params == nullptr) != (rhs_casted.params == nullptr))
return false;
if (params != nullptr)
return *params == *rhs_casted.params;
return true;
}
protected:
std::vector<std::reference_wrapper<const primitive_id>> get_dependencies() const override { return {}; }
};
/// @}
/// @}
/// @}
} // namespace cldnn
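The virtual hash()/operator== pair, together with the params-only constructor, is what makes these primitives cacheable: two generic_layer instances wrapping equal WeightsReorderParams hash to the same key and compare equal, regardless of which node requested them. A minimal sketch of that property (illustrative only; the ids and layouts below are made up):

// Sketch: WeightsReorderParams as a cache key (hypothetical ids/layouts).
#include "intel_gpu/primitives/generic_layer.hpp"
#include <cassert>
#include <memory>

void sketch_params_as_key(const cldnn::layout& in, const cldnn::layout& out) {
    auto a = std::make_shared<cldnn::WeightsReorderParams>(in, out);
    auto b = std::make_shared<cldnn::WeightsReorderParams>(in, out);
    assert(a->hash() == b->hash());  // same layouts -> same hash
    assert(*a == *b);                // typeid() check guards derived param types

    cldnn::generic_layer r1("w_reorder_1", "weights", a);
    cldnn::generic_layer r2("w_reorder_2", "weights", b);
    // r1/r2 wrap equal params, so their compiled impls can be shared.
    assert(r1.params->hash() == r2.params->hash());
}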

View File

@@ -21,6 +21,11 @@ generic_layer_node::typed_program_node(const std::shared_ptr<generic_layer> prim
 generic_layer_inst::typed_primitive_inst(network& network, generic_layer_node const& node)
     : parent(network, node) {}

+generic_layer_inst::typed_primitive_inst(network& network)
+    : parent(network) {
+    _type = generic_layer::type_id();
+}
+
 std::string generic_layer_inst::to_string(generic_layer_node const& node) {
     auto node_info = node.desc_to_json();

View File

@@ -2,14 +2,16 @@
 // SPDX-License-Identifier: Apache-2.0
 //
+#include <algorithm>
 #include "pass_manager.h"
 #include "program_node.h"
 #include "mutable_data_inst.h"
 #include "convert_color_inst.h"
+#include "fully_connected_inst.h"
 #include "assign_inst.h"
 #include "tensor_type.h"
-#include <algorithm>
 #include <memory>
 #include <vector>
 #include <stdexcept>

View File

@@ -2,15 +2,18 @@
 // SPDX-License-Identifier: Apache-2.0
 //
+#include "intel_gpu/runtime/engine.hpp"
+#include "intel_gpu/runtime/itt.hpp"
 #include "pass_manager.h"
 #include "data_inst.h"
 #include "mutable_data_inst.h"
 #include "reshape_inst.h"
 #include "quantize_inst.h"
 #include "arg_max_min_inst.h"
+#include "fully_connected_inst.h"
 #include "program_node.h"
-#include "intel_gpu/runtime/engine.hpp"
-#include "intel_gpu/runtime/itt.hpp"
 #include <iostream>
 #include <cmath>
 #include <iomanip>

View File

@@ -9,6 +9,7 @@
 #include "gemm_inst.h"
 #include "pooling_inst.h"
+#include "fully_connected_inst.h"
 #include <iterator>
 #include <vector>

View File

@@ -4,6 +4,7 @@
 #include "pass_manager.h"
 #include "impls/ocl/primitive_base.hpp"
+#include "fully_connected_inst.h"
 #include "fully_connected/fully_connected_params.h"
 #include <memory>
 #include <stdexcept>

View File

@@ -4,8 +4,11 @@
 #include "pass_manager.h"
 #include "program_helpers.h"
-#include "include/binary_convolution_inst.h"
-#include "include/deformable_convolution_inst.h"
+#include "convolution_inst.h"
+#include "binary_convolution_inst.h"
+#include "deconvolution_inst.h"
+#include "deformable_convolution_inst.h"
+#include "fully_connected_inst.h"
 #include "lstm_dynamic_input_inst.h"

 namespace cldnn {
@@ -13,7 +16,6 @@ namespace cldnn {
 post_optimize_weights::post_optimize_weights(reorder_factory& rf_ref)
     : base_pass("post_optimize_weights"), _rf(rf_ref) {}

-// function which prepares given primitive for weights optimization
 template<typename T> post_optimize_weights::weights_bias_offset post_optimize_weights::get_weights_bias_offset(const T& node) {
     return weights_bias_offset(node.get_primitive()->input.size(), program_helpers::wrap_if_single(node.get_primitive()->weights).size());
 }
@@ -37,15 +39,13 @@ void post_optimize_weights::optimize_weights(T& node, program& p) {
         return;

     auto output_layout = node.get_output_layout();
-    auto& weights_reorder_params = impl->_weights_reorder_params;
+    auto weights_reorder_params = impl->get_weights_reorder_params();

     for (auto i = offsets.weights_offset; i < offsets.bias_offset; i++) {
         auto& weights_node = node.get_dependency(i);
-        auto weights_layout = weights_node.get_output_layout();
-        auto reorders = _rf.get_weights_reorder(weights_node.id(), weights_layout, weights_reorder_params);
-        for (auto& reorder : reorders) {
+        auto reorder = _rf.get_weights_reorder(weights_node.id(), weights_reorder_params);
+        if (reorder.first) {
             // insert new generic_layer node to topology
             p.add_intermediate(reorder.first, node, i, !reorder.second);
             // set generic_layer's node output layout and implementation
@@ -65,9 +65,7 @@ void post_optimize_weights::optimize_weights(T& node, program& p) {
     }

     // Reset weights reorder params to not keep source code pointer
-    weights_reorder_params.engine = kernel_selector::generic_kernel_params::Engine::NONE;
-    weights_reorder_params.clKernel = nullptr;
-    weights_reorder_params.cpuKernel = nullptr;
+    impl->reset_weights_reorder_params();

     // set the old output layout and do not invalidate users as change of weights will not affect output layout
     node.set_output_layout(output_layout, false);

View File

@@ -25,6 +25,7 @@
 #include "softmax_inst.h"
 #include "resample_inst.h"
 #include "depth_to_space_inst.h"
+#include "fully_connected_inst.h"
 #include "space_to_depth_inst.h"
 #include "gather_inst.h"
 #include "gather_nd_inst.h"

View File

@@ -2,13 +2,12 @@
 // SPDX-License-Identifier: Apache-2.0
 //
+#include "intel_gpu/runtime/debug_configuration.hpp"
 #include "pass_manager.h"
 #include "program_helpers.h"
-#include <vector>
-#include <list>
-#include <utility>
-#include "binary_convolution_inst.h"
+#include "binary_convolution_inst.h"
 #include "reshape_inst.h"
 #include "convert_color_inst.h"
 #include "one_hot_inst.h"
@@ -16,7 +15,11 @@
 #include "depth_to_space_inst.h"
 #include "concatenation_inst.h"
 #include "region_yolo_inst.h"
-#include "intel_gpu/runtime/debug_configuration.hpp"
+#include "fully_connected_inst.h"
+#include <vector>
+#include <list>
+#include <utility>

 using namespace cldnn;

View File

@@ -13,6 +13,8 @@
 #include "mvn_inst.h"
 #include "to_string_utils.h"
 #include "pooling_inst.h"
+#include "reshape_inst.h"
+#include "fully_connected_inst.h"

 #ifdef ENABLE_ONEDNN_FOR_GPU
 #include "gemm_inst.h"

View File

@@ -5,6 +5,7 @@
 #include "pass_manager.h"
 #include "data_inst.h"
 #include "mutable_data_inst.h"
+#include "fully_connected_inst.h"
 #include "gemm_inst.h"
 #include "program_node.h"
 #include "intel_gpu/runtime/engine.hpp"

View File

@@ -31,6 +31,7 @@ public:
     void init_kernels(const kernels_cache&, const kernel_impl_params&) override {}
     void set_arguments(primitive_inst& /*instance*/) override {}
+    void set_arguments(primitive_inst& /*instance*/, kernel_arguments_data& /*args*/) override {}
     kernel_arguments_data get_arguments(const primitive_inst& /*instance*/) const override {
         kernel_arguments_data args;
         return args;

View File

@@ -400,7 +400,7 @@ struct non_max_suppression_impl : typed_primitive_impl<non_max_suppression> {
         return make_unique<non_max_suppression_impl>(*this);
     }

-    non_max_suppression_impl() : parent(kernel_selector::weights_reorder_params(), "non_max_suppression_impl") {}
+    non_max_suppression_impl() : parent("non_max_suppression_impl") {}

     event::ptr execute_impl(const std::vector<event::ptr>& event, typed_primitive_inst<non_max_suppression>& instance) override {
         for (auto e : event) {

View File

@@ -14,8 +14,8 @@ struct generic_layer_impl : typed_primitive_impl<generic_layer> {
     using parent::parent;

     kernel_selector::cl_kernel_data _cl_kernel_data;
-    std::vector<kernel::ptr> _kernels;
-    std::string _cached_kernel_id;
+    kernel::ptr _kernel;
+    kernel_id _cached_kernel_id;

     DECLARE_OBJECT_TYPE_SERIALIZATION
@@ -27,18 +27,21 @@ struct generic_layer_impl : typed_primitive_impl<generic_layer> {
     generic_layer_impl(const generic_layer_impl& other)
     : _cl_kernel_data(other._cl_kernel_data)
-    , _kernels({})
+    , _kernel(nullptr)
     , _cached_kernel_id(other._cached_kernel_id) {
-        if (other._kernels.empty()) {
-            throw std::runtime_error("Can't copy generic_layer_impl node: kernels vector is empty");
-        }
-        _kernels.push_back(other._kernels.front()->clone());
+        OPENVINO_ASSERT(other._kernel, "[GPU] Can't copy generic_layer_impl node: kernel is nullptr");
+        _kernel = other._kernel->clone();
     }

-    generic_layer_impl(const generic_layer_node& arg)
-    : _cl_kernel_data(*arg.get_primitive()->generic_params.clKernel.get())
-    , _kernels()
-    , _cached_kernel_id() { }
+    generic_layer_impl(const kernel_impl_params& params)
+    : _cl_kernel_data()
+    , _kernel(nullptr)
+    , _cached_kernel_id() {
+        auto reorder_params = params.typed_desc<generic_layer>()->params;
+        auto casted_params = std::dynamic_pointer_cast<WeightsReorderParamsOCL>(reorder_params);
+        OPENVINO_ASSERT(casted_params, "[GPU] Invalid weights reorder parameters type for ", params.desc->id, " node");
+        _cl_kernel_data = *casted_params->get_cl_kernel();
+    }

     std::vector<std::shared_ptr<cldnn::kernel_string>> get_kernels_source() override {
         std::vector<std::shared_ptr<cldnn::kernel_string>> kernel_strings;
@@ -47,11 +50,11 @@ struct generic_layer_impl : typed_primitive_impl<generic_layer> {
     }

     std::vector<kernel::ptr> get_kernels() const override {
-        return _kernels;
+        return {_kernel};
     }

     void save(BinaryOutputBuffer& ob) const override {
-        ob <<_cl_kernel_data;
+        ob << _cl_kernel_data;
         ob << _cached_kernel_id;
     }
@@ -61,21 +64,27 @@ struct generic_layer_impl : typed_primitive_impl<generic_layer> {
     }

     void init_kernels(const kernels_cache& kernels_cache, const kernel_impl_params& params) override {
-        _kernels.clear();
+        _kernel = nullptr;
         auto compiled_kernels = kernels_cache.get_kernels(params);
-        _kernels.insert(_kernels.begin(), compiled_kernels.begin(), compiled_kernels.end());
+        OPENVINO_ASSERT(compiled_kernels.size() == 1, "[GPU] Unexpected number of kernels for generic_layer during init_kernels() call");
+        _kernel = compiled_kernels.front();
     }

     void init_by_cached_kernels(const kernels_cache& kernels_cache) override {
-        _kernels.emplace_back(kernels_cache.get_kernel_from_cached_kernels(_cached_kernel_id));
+        _kernel = kernels_cache.get_kernel_from_cached_kernels(_cached_kernel_id);
     }

     void set_cached_kernel_ids(const kernels_cache& kernels_cache) override {
-        _cached_kernel_id = kernels_cache.get_cached_kernel_id(_kernels[0]);
+        _cached_kernel_id = kernels_cache.get_cached_kernel_id(_kernel);
+    }
+
+    void set_kernels(cldnn::kernels_cache::compiled_kernels kernels) override {
+        OPENVINO_ASSERT(kernels.size() == 1 &&
+                        kernels.begin()->second.size() == 1, "[GPU] Unexpected number of kernels for generic_layer");
+        _kernel = kernels.begin()->second[0].first;
     }

     void set_arguments_impl(generic_layer_inst& instance) override {
-        stream& stream = instance.get_network().get_stream();
         kernel_arguments_data args;
         args.scalars = &_cl_kernel_data.params.scalars;
@@ -83,7 +92,13 @@ struct generic_layer_impl : typed_primitive_impl<generic_layer> {
             args.inputs.push_back(instance.input_memory_ptr(i));
         }
         args.outputs.push_back(instance.output_memory_ptr());
-        stream.set_arguments(*_kernels.front(), _cl_kernel_data.params, args);
+        set_arguments_impl(instance, args);
+    }
+
+    void set_arguments_impl(generic_layer_inst& instance, kernel_arguments_data& args) override {
+        stream& stream = instance.get_network().get_stream();
+        stream.set_arguments(*_kernel, _cl_kernel_data.params, args);
     }

     event::ptr execute_impl(const std::vector<event::ptr>& events, generic_layer_inst& instance) override {
@@ -95,58 +110,23 @@ struct generic_layer_impl : typed_primitive_impl<generic_layer> {
             args.inputs.push_back(instance.input_memory_ptr(i));
         }
         args.outputs.push_back(instance.output_memory_ptr());
-        return stream.enqueue_kernel(*_kernels.front(), _cl_kernel_data.params, args, events, true);
+        return stream.enqueue_kernel(*_kernel, _cl_kernel_data.params, args, events, true);
+    }
+
+    static std::unique_ptr<primitive_impl> create(const kernel_impl_params& params) {
+        return make_unique<generic_layer_impl>(params);
     }
 };

-// TODO: move this file to cpu folder and add a new traget to 'cldnn::engine_types'
-struct generic_layer_cpu : typed_primitive_impl<generic_layer> {
-    const generic_layer_node& outer;
-
-    DECLARE_OBJECT_TYPE_SERIALIZATION
-
-    std::unique_ptr<primitive_impl> clone() const override {
-        return make_unique<generic_layer_cpu>(*this);
-    }
-
-    explicit generic_layer_cpu(const generic_layer_node& arg) : outer(arg) {}
-
-    event::ptr execute_impl(const std::vector<event::ptr>& events, generic_layer_inst& instance) override {
-        stream& stream = instance.get_network().get_stream();
-        auto input_mem = instance.input_memory_ptr();
-        auto output_mem = instance.output_memory_ptr();
-        auto ev = stream.create_user_event(false);
-        std::vector<event::ptr> tmp_events(events);
-        for (auto& a : events) {
-            a->wait();
-        }
-        mem_lock<uint8_t, mem_lock_type::read> old_pointer(input_mem, stream);
-        mem_lock<uint8_t, mem_lock_type::write> new_pointer(output_mem, stream);
-        const auto& cpu_kernel = *outer.get_primitive()->generic_params.cpuKernel.get();
-        cpu_kernel.Execute(old_pointer.data(), old_pointer.size(), new_pointer.data(), new_pointer.size());
-        ev->set();
-        return ev;
-    }
-
-    void init_kernels(const kernels_cache&, const kernel_impl_params&) override {}
-};
-
-static std::unique_ptr<primitive_impl> create(const generic_layer_node& arg, const kernel_impl_params&) {
-    if (arg.get_primitive()->generic_params.engine == kernel_selector::generic_kernel_params::Engine::GPU) {
-        return make_unique<generic_layer_impl>(arg);
-    } else {
-        return make_unique<generic_layer_cpu>(arg);
-    }
-}
+static std::unique_ptr<primitive_impl> create(const generic_layer_node& arg, const kernel_impl_params& params) {
+    return make_unique<generic_layer_impl>(params);
+}

 namespace detail {
 attach_generic_layer_impl::attach_generic_layer_impl() {
     implementation_map<generic_layer>::add(cldnn::impl_types::ocl, create, {});
+
+    WeightsReordersFactory::add(cldnn::impl_types::ocl, shape_types::static_shape, generic_layer_impl::create);
 }
 } // namespace detail
@@ -154,4 +134,3 @@ attach_generic_layer_impl::attach_generic_layer_impl() {
 } // namespace cldnn

 BIND_BINARY_BUFFER_WITH_TYPE(cldnn::ocl::generic_layer_impl)
-ASSIGN_TYPE_NAME(cldnn::ocl::generic_layer_cpu)

View File

@@ -14,6 +14,7 @@
 #include "intel_gpu/primitives/eltwise.hpp"
 #include "intel_gpu/primitives/quantize.hpp"
 #include "intel_gpu/primitives/activation.hpp"
+#include "intel_gpu/primitives/generic_layer.hpp"
 #include "intel_gpu/primitives/primitive.hpp"

 #include "kernel_selector_params.h"
@@ -166,7 +167,7 @@ inline optional_params_t get_default_weights_bias_optional_params(const program&
 }

 inline kernel_selector::eltwise_mode convert_to_eltwise_mode(eltwise_mode mode) {
     switch (mode) {
         case eltwise_mode::sum:
             return kernel_selector::eltwise_mode::ADD;
         case eltwise_mode::sub:
@@ -269,4 +270,106 @@ inline kernel_impl_params canonicalize_fused_shapes(const kernel_impl_params& im
     return updated_impl_params;
 }

class WeightsReorderParamsOCL : public WeightsReorderParams {
public:
explicit WeightsReorderParamsOCL(const kernel_selector::WeightsReorderParams& params)
: WeightsReorderParams(from_weights_tensor(params.src), from_weights_tensor(params.dest)) {
cl_kernel = params.clKernel;
}
size_t hash() const override {
size_t seed = WeightsReorderParams::hash();
if (cl_kernel == nullptr)
return seed;
seed = hash_combine(seed, cl_kernel->skip_execution);
auto& gws = cl_kernel->params.workGroups.global;
seed = hash_range(seed, gws.begin(), gws.end());
auto& lws = cl_kernel->params.workGroups.local;
seed = hash_range(seed, lws.begin(), lws.end());
auto& arguments = cl_kernel->params.arguments;
for (auto& args : arguments) {
seed = hash_combine(seed, args.index);
seed = hash_combine(seed, args.t);
}
auto& scalars = cl_kernel->params.scalars;
for (auto& s : scalars) {
seed = hash_combine(seed, s.t);
}
return seed;
}
bool operator==(const WeightsReorderParams& rhs) const override {
if (typeid(*this) != typeid(rhs))
return false;
if (!WeightsReorderParams::operator==(rhs))
return false;
auto rhs_casted = downcast<const WeightsReorderParamsOCL>(rhs);
if (cl_kernel != nullptr && rhs_casted.cl_kernel != nullptr) {
auto& clKernel_rhs = rhs_casted.cl_kernel;
if (cl_kernel->skip_execution != clKernel_rhs->skip_execution)
return false;
auto& gws = cl_kernel->params.workGroups.global;
auto& gws_rhs = clKernel_rhs->params.workGroups.global;
if (gws != gws_rhs)
return false;
auto& lws = cl_kernel->params.workGroups.local;
auto& lws_rhs = clKernel_rhs->params.workGroups.local;
if (lws != lws_rhs)
return false;
auto& arguments = cl_kernel->params.arguments;
auto& arguments_rhs = clKernel_rhs->params.arguments;
if (arguments.size() != arguments_rhs.size())
return false;
for (size_t idx = 0; idx < arguments.size(); idx++) {
if (arguments[idx].index != arguments_rhs[idx].index)
return false;
if (arguments[idx].t != arguments_rhs[idx].t)
return false;
}
auto& scalars = cl_kernel->params.scalars;
auto& scalars_rhs = clKernel_rhs->params.scalars;
if (scalars.size() != scalars_rhs.size())
return false;
for (size_t idx = 0; idx < scalars.size(); idx++) {
if (scalars[idx].t != scalars_rhs[idx].t)
return false;
}
}
return true;
}
std::shared_ptr<kernel_selector::clKernelData> get_cl_kernel() {
return cl_kernel;
}
private:
std::shared_ptr<kernel_selector::clKernelData> cl_kernel;
};
inline std::shared_ptr<WeightsReorderParams> create_weights_reorder_params(const kernel_selector::WeightsReorderParams& params) {
if (params.engine == kernel_selector::generic_kernel_params::Engine::NONE) {
return nullptr;
}
return std::make_shared<WeightsReorderParamsOCL>(params);
}
} // namespace cldnn
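create_weights_reorder_params is the bridge between the kernel-selector world and the new generic reorder description: Engine::NONE becomes nullptr ("no reorder needed"), anything else becomes a WeightsReorderParamsOCL whose hash()/operator== also cover the attached clKernelData (work-group sizes, arguments, scalars). A hedged usage sketch, where kd stands for a kernel_data already chosen for some node:

// Sketch: converting kernel-selector reorder info into a cacheable params object.
// Names come from this patch ("impls/ocl/kernel_selector_helper.h").
void sketch_create_params(const kernel_selector::kernel_data& kd) {
    std::shared_ptr<cldnn::WeightsReorderParams> params =
        cldnn::create_weights_reorder_params(kd.weightsReorderParams);
    if (!params)
        return;  // Engine::NONE: the kernel consumes weights as-is

    // Two impls needing the same OCL reorder collide on this key in the cache.
    size_t key = params->hash();
    (void)key;
}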

View File

@@ -40,7 +40,6 @@ struct typed_primitive_impl_ocl : public typed_primitive_impl<PType> {
     typed_primitive_impl_ocl() : _kernel_data({}), _cached_kernel_ids({}), _kernels({}) {
         _kernel_data.weightsReorderParams.engine = kernel_selector::generic_kernel_params::Engine::NONE;
-        _kernel_data.weightsReorderParams.cpuKernel = nullptr;
         _kernel_data.weightsReorderParams.clKernel = nullptr;
     }
@@ -57,11 +56,10 @@
     }

     typed_primitive_impl_ocl(const kernel_selector::kernel_data& kd)
-        : typed_primitive_impl<PType>(kd.weightsReorderParams, kd.kernelName),
+        : typed_primitive_impl<PType>(create_weights_reorder_params(kd.weightsReorderParams), kd.kernelName),
           _kernel_data(kd) {
         // weights reorder params got copied to parent, clear in _kernel_data to release shared ptr
         _kernel_data.weightsReorderParams.engine = kernel_selector::generic_kernel_params::Engine::NONE;
-        _kernel_data.weightsReorderParams.cpuKernel = nullptr;
         _kernel_data.weightsReorderParams.clKernel = nullptr;

         this->can_reuse_memory = _kernel_data.can_reuse_memory;
@@ -214,6 +212,21 @@ protected:
         }
     }

+    void set_arguments_impl(typed_primitive_inst<PType>& instance, kernel_arguments_data& args) override {
+        if (instance.can_be_optimized()) {
+            return;
+        }
+        stream& stream = instance.get_network().get_stream();
+        for (size_t k = 0; k < _kernels.size(); ++k) {
+            if (_kernel_data.kernels[k].skip_execution)
+                continue;
+            stream.set_arguments(*_kernels[k], _kernel_data.kernels[k].params, args);
+        }
+    }
+
     kernel_arguments_data get_arguments_impl(const typed_primitive_inst<PType>& instance) const override {
         for (size_t k = 0; k < _kernels.size(); ++k) {
             auto args = get_arguments(instance);

View File

@@ -4,7 +4,6 @@
 #pragma once

-#include "generic_layer.hpp"
 #include "intel_gpu/primitives/activation.hpp"
 #include "intel_gpu/primitives/arg_max_min.hpp"
 #include "intel_gpu/primitives/batch_to_space.hpp"
@@ -29,6 +28,7 @@
 #include "intel_gpu/primitives/experimental_detectron_topk_rois.hpp"
 #include "intel_gpu/primitives/eye.hpp"
 #include "intel_gpu/primitives/fully_connected.hpp"
+#include "intel_gpu/primitives/generic_layer.hpp"
 #include "intel_gpu/primitives/gather.hpp"
 #include "intel_gpu/primitives/gather_elements.hpp"
 #include "intel_gpu/primitives/gather_nd.hpp"

View File

@@ -8,7 +8,7 @@
 #include "primitive_onednn_base.h"
 #include "implementation_map.hpp"

-#include "kernel_selector_common.h"
+#include "impls/ocl/kernel_selector_helper.h"

 #include "utils.hpp"
@@ -158,6 +158,7 @@ protected:
         weights_reorder_params.engine = kernel_selector::WeightsReorderParams::Engine::GPU;
         weights_reorder_params.clKernel = std::make_shared<kernel_selector::clKernelData>(kernels_data[0].kernels[0]);
+        weights_reorder_params.src = r_params.input;
         weights_reorder_params.dest = r_params.output;

         return weights_reorder_params;

View File

@@ -8,7 +8,7 @@
 #include "primitive_onednn_base.h"
 #include "implementation_map.hpp"

-#include "kernel_selector_common.h"
+#include "impls/ocl/kernel_selector_helper.h"

 #include <oneapi/dnnl/dnnl.hpp>
@@ -79,6 +79,7 @@ protected:
         weights_reorder_params.engine = kernel_selector::WeightsReorderParams::Engine::GPU;
         weights_reorder_params.clKernel = std::make_shared<kernel_selector::clKernelData>(kernels_data[0].kernels[0]);
+        weights_reorder_params.src = r_params.input;
         weights_reorder_params.dest = r_params.output;

         return weights_reorder_params;

View File

@@ -6,7 +6,7 @@
 #include "primitive_onednn_base.h"
 #include "implementation_map.hpp"

-#include "kernel_selector_common.h"
+#include "impls/ocl/kernel_selector_helper.h"

 #include <oneapi/dnnl/dnnl.hpp>
@@ -91,6 +91,7 @@ protected:
         weights_reorder_params.engine = kernel_selector::WeightsReorderParams::Engine::GPU;
         weights_reorder_params.clKernel = std::make_shared<kernel_selector::clKernelData>(kernels_data[0].kernels[0]);
+        weights_reorder_params.src = r_params.input;
         weights_reorder_params.dest = r_params.output;

         return weights_reorder_params;

View File

@@ -20,6 +20,7 @@
 #include "reorder/reorder_weights_kernel_selector.h"
 #include "reorder/reorder_kernel_base.h"
+#include "impls/ocl/kernel_selector_helper.h"

 #include <vector>
 #include <list>
@@ -46,7 +47,7 @@ struct typed_primitive_onednn_impl : public typed_primitive_impl<PType> {
                                std::shared_ptr<dnnl::primitive_attr> attrs,
                                const PrimDescType& pd,
                                kernel_selector::WeightsReorderParams weights_reorder = {})
-        : typed_primitive_impl<PType>(weights_reorder, pd.impl_info_str()),
+        : typed_primitive_impl<PType>(create_weights_reorder_params(weights_reorder), pd.impl_info_str()),
           _engine(&engine),
           _attrs(attrs),
           _pd(pd) {

View File

@@ -1,143 +0,0 @@
// Copyright (C) 2018-2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#pragma once
#include "intel_gpu/primitives/primitive.hpp"
#include "intel_gpu/runtime/memory.hpp"
// TODO: Remove OCL impl dependency here or move to OCL folder
#include "impls/ocl/kernel_selector_helper.h"
#include <vector>
namespace cldnn {
/// @brief Changes how data is ordered in memory. Value type is not changed & all information is preserved.
/// @details Corresponding values are bitwise equal before/after reorder.
/// Also merged with subtraction layer, which can subtract values while doing reordering.
/// NOTE THAT THIS WILL SUBTRACT THE SAME VALUES FROM EACH BATCH.
struct generic_layer : public primitive_base<generic_layer> {
CLDNN_DECLARE_PRIMITIVE(generic_layer)
/// @brief Constructs generic_layer primitive which takes mean subtract values from another primitive.
/// @param id This primitive id.
/// @param input Input primitive id.
/// @param output_layout Requested memory layout.
/// @param mean Primitive id to get mean subtract values.
generic_layer(const primitive_id& id,
const primitive_id& input,
const layout& output_layout,
const kernel_selector::generic_kernel_params& generic_params,
const padding& output_padding = padding())
: primitive_base(id, {input}, {output_padding}), output_layout(output_layout), generic_params(generic_params) {}
/// @brief Requested memory layout.
layout output_layout;
const kernel_selector::generic_kernel_params generic_params;
size_t hash() const override {
size_t seed = primitive::hash();
seed = hash_combine(seed, generic_params.engine);
if (generic_params.cpuKernel != nullptr) {
auto& cpuKernel = generic_params.cpuKernel;
seed = hash_combine(seed, cpuKernel->GetExpectedInputLayout());
seed = hash_combine(seed, cpuKernel->GetExpectedInputType());
}
if (generic_params.clKernel != nullptr) {
auto& clKernel = generic_params.clKernel;
seed = hash_combine(seed, clKernel->skip_execution);
auto& gws = clKernel->params.workGroups.global;
seed = hash_range(seed, gws.begin(), gws.end());
auto& lws = clKernel->params.workGroups.local;
seed = hash_range(seed, lws.begin(), lws.end());
auto& arguments = clKernel->params.arguments;
for (auto& args : arguments) {
seed = hash_combine(seed, args.index);
seed = hash_combine(seed, args.t);
}
auto& scalars = clKernel->params.scalars;
for (auto& s : scalars) {
seed = hash_combine(seed, s.t);
}
seed = hash_combine(seed, clKernel->code.kernelString->get_hash());
}
return seed;
}
bool operator==(const primitive& rhs) const override {
if (!compare_common_params(rhs))
return false;
auto rhs_casted = downcast<const generic_layer>(rhs);
if (generic_params.engine != rhs_casted.generic_params.engine)
return false;
if (generic_params.cpuKernel != nullptr) {
if (generic_params.cpuKernel->GetExpectedInputLayout() != rhs_casted.generic_params.cpuKernel->GetExpectedInputLayout())
return false;
if (generic_params.cpuKernel->GetExpectedInputType() != rhs_casted.generic_params.cpuKernel->GetExpectedInputType())
return false;
}
if (generic_params.clKernel != nullptr) {
auto& clKernel = generic_params.clKernel;
auto& clKernel_rhs = rhs_casted.generic_params.clKernel;
if (clKernel->skip_execution != clKernel_rhs->skip_execution)
return false;
auto& gws = clKernel->params.workGroups.global;
auto& gws_rhs = clKernel_rhs->params.workGroups.global;
if (gws != gws_rhs)
return false;
auto& lws = clKernel->params.workGroups.local;
auto& lws_rhs = clKernel_rhs->params.workGroups.local;
if (lws != lws_rhs)
return false;
auto& arguments = clKernel->params.arguments;
auto& arguments_rhs = clKernel_rhs->params.arguments;
if (arguments.size() != arguments_rhs.size())
return false;
for (size_t idx = 0; idx < arguments.size(); idx++) {
if (arguments[idx].index != arguments_rhs[idx].index)
return false;
if (arguments[idx].t != arguments_rhs[idx].t)
return false;
}
auto& scalars = clKernel->params.scalars;
auto& scalars_rhs = clKernel_rhs->params.scalars;
if (scalars.size() != scalars_rhs.size())
return false;
for (size_t idx = 0; idx < scalars.size(); idx++) {
if (scalars[idx].t != scalars_rhs[idx].t)
return false;
}
if (clKernel->code.kernelString->get_str() != clKernel_rhs->code.kernelString->get_str())
return false;
}
return true;
}
protected:
std::vector<std::reference_wrapper<const primitive_id>> get_dependencies() const override { return {}; }
};
/// @}
/// @}
/// @}
} // namespace cldnn

View File

@@ -3,7 +3,7 @@
 //
 #pragma once

-#include "generic_layer.hpp"
+#include "intel_gpu/primitives/generic_layer.hpp"
 #include "primitive_inst.h"

 #include <string>
@@ -31,12 +31,13 @@ class typed_primitive_inst<generic_layer> : public typed_primitive_inst_base<gen
 public:
     static layout calc_output_layout(generic_layer_node const& node, kernel_impl_params const& impl_param) {
-        return impl_param.typed_desc<generic_layer>()->output_layout;
+        return impl_param.typed_desc<generic_layer>()->params->get_output_layout();
     }

     static std::string to_string(generic_layer_node const& node);
     typed_primitive_inst(network& network, generic_layer_node const& node);
+    typed_primitive_inst(network& network);
 };

 using generic_layer_inst = typed_primitive_inst<generic_layer>;

View File

@@ -130,4 +130,28 @@ public:
         return keys;
     }
 };

struct WeightsReordersFactory {
using factory_type = std::function<std::unique_ptr<primitive_impl>(const kernel_impl_params&)>;
using map_type = singleton_map<std::pair<impl_types, shape_types>, factory_type>;
static void add(impl_types impl_type, shape_types shape_type, factory_type factory) {
OPENVINO_ASSERT(impl_type != impl_types::any, "[GPU] Can't register WeightsReordersFactory with type any");
map_type::instance().insert({{impl_type, shape_type}, factory});
}
static factory_type get(impl_types preferred_impl_type, shape_types target_shape_type) {
for (auto& kv : map_type::instance()) {
impl_types impl_type = kv.first.first;
shape_types supported_shape_type = kv.first.second;
if ((preferred_impl_type & impl_type) != impl_type)
continue;
if ((target_shape_type & supported_shape_type) != target_shape_type)
continue;
return kv.second;
}
OPENVINO_THROW("[GPU] WeightsReordersFactory doesn't have any implementation for "
" impl_type: ", preferred_impl_type, ", shape_type: ", target_shape_type);
}
};
} // namespace cldnn
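WeightsReordersFactory deliberately mirrors implementation_map, but its factories take only kernel_impl_params, so a reorder implementation can be built at runtime without a program_node. Both sides of the contract in one sketch (the registration is the call added to attach_generic_layer_impl earlier in this commit; the lookup mirrors update_weights()):

// Registration, done once at plugin load, inside cldnn::ocl:
//   WeightsReordersFactory::add(impl_types::ocl, shape_types::static_shape,
//                               generic_layer_impl::create);
//
// Lookup + instantiation on a cache miss:
std::unique_ptr<cldnn::primitive_impl>
sketch_make_reorder_impl(const cldnn::kernel_impl_params& reorder_kernel_params) {
    auto factory = cldnn::WeightsReordersFactory::get(cldnn::impl_types::ocl,
                                                      cldnn::shape_types::static_shape);
    return factory(reorder_kernel_params);  // a fresh generic_layer_impl
}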

View File

@@ -7,21 +7,15 @@
 #include "intel_gpu/runtime/memory.hpp"
 #include "intel_gpu/runtime/engine.hpp"
 #include "intel_gpu/runtime/utils.hpp"
-#include "intel_gpu/runtime/lru_cache.hpp"
 #include "data_inst.h"
-#include "generic_layer_inst.h"
 #include "reorder_inst.h"
 #include "convolution_inst.h"
 #include "deconvolution_inst.h"
-#include "fully_connected_inst.h"
 #include "detection_output_inst.h"
 #include "binary_convolution_inst.h"
-#include "lstm_gemm_inst.h"
-#include "generic_layer.hpp"
-#include "non_max_suppression_inst.h"
-#include "region_yolo_inst.h"
-// TODO: add generic interface for weights_reorder_params and get rid of this dependency
-#include "impls/ocl/kernel_selector_helper.h"

 #include <vector>
 #include <memory>
@@ -52,10 +46,8 @@ public:
                                                       const layout& in_layout,
                                                       const layout& out_layout);

-    std::vector<std::pair<std::shared_ptr<primitive>, bool>> get_weights_reorder(
-        primitive_id input_id,
-        const layout& old_layout,
-        const kernel_selector::weights_reorder_params& reorder_params);
+    std::pair<std::shared_ptr<primitive>, bool> get_weights_reorder(primitive_id input_id,
+                                                                    std::shared_ptr<WeightsReorderParams> reorder_params);

 private:
     struct cache_key {
View File

@@ -5,6 +5,7 @@
 #include "intel_gpu/primitives/primitive.hpp"
 #include "intel_gpu/primitives/concatenation.hpp"
+#include "intel_gpu/primitives/generic_layer.hpp"
 #include "intel_gpu/runtime/event.hpp"
 #include "intel_gpu/runtime/memory.hpp"
 #include "intel_gpu/runtime/lru_cache.hpp"
@@ -43,21 +44,22 @@ class typed_primitive_inst;
  */
 struct primitive_impl {
     primitive_impl() = default;
-    explicit primitive_impl(const kernel_selector::weights_reorder_params& params, std::string kernel_name = "", bool is_dynamic = false)
-        : _weights_reorder_params(params), _kernel_name(kernel_name), _is_dynamic(is_dynamic) {}
+    explicit primitive_impl(std::shared_ptr<WeightsReorderParams> params, std::string kernel_name = "", bool is_dynamic = false)
+        : _weights_reorder_params(params), _kernel_name(kernel_name), _is_dynamic(is_dynamic) {
+    }
     explicit primitive_impl(std::string kernel_name, bool is_dynamic = false) :
-        primitive_impl(kernel_selector::weights_reorder_params{}, kernel_name, is_dynamic) {}
+        primitive_impl(nullptr, kernel_name, is_dynamic) {}
     virtual ~primitive_impl() = default;

     virtual std::vector<layout> get_internal_buffer_layouts() const = 0;
     virtual void set_node_params(const program_node&) {}
     virtual std::string get_type() const = 0;
     virtual void set_arguments(primitive_inst& instance) = 0;
+    virtual void set_arguments(primitive_inst& instance, kernel_arguments_data& args) = 0;
     virtual kernel_arguments_data get_arguments(const primitive_inst& instance) const = 0;
     virtual event::ptr execute(const std::vector<event::ptr>& events, primitive_inst& instance) = 0;
     std::string get_kernel_name() const { return _kernel_name; }
-    // TODO: added a derived class for weights reordering (maybe for all static data reordering)
-    kernel_selector::weights_reorder_params _weights_reorder_params;
     // class typed_primitive_gpu_impl override this with return false;
     virtual bool is_cpu() const { return true; }
     virtual void init_kernels(const kernels_cache& kernels_cache, const kernel_impl_params& params) = 0;
@@ -94,7 +96,14 @@
     virtual void set_kernels(cldnn::kernels_cache::compiled_kernels kernels) {}
     virtual std::vector<kernel::ptr> get_kernels() { return {}; }

+    bool need_weights_reorder() const { return _weights_reorder_params != nullptr; }
+    std::shared_ptr<WeightsReorderParams> get_weights_reorder_params() const { return _weights_reorder_params; }
+    void reset_weights_reorder_params() { _weights_reorder_params = nullptr; }
+
+    std::shared_ptr<kernel_impl_params> get_weights_reorder_kernel_params() const;
+
 protected:
+    std::shared_ptr<WeightsReorderParams> _weights_reorder_params = nullptr;
     std::string _kernel_name;
     bool _is_dynamic = false;
 };
@@ -151,6 +160,8 @@ public:
     const kernel_impl_params* get_impl_params() const { return _impl_params.get(); }
     // return pointer to const to prevent arbitrary 'execute' call -> use primitive_inst.execute() instead
     const primitive_impl* get_impl() const { return _impl.get(); }
+    primitive_impl* get_impl() { return _impl.get(); }
+    void set_impl(std::unique_ptr<primitive_impl> impl) { _impl = std::move(impl); }

     memory& input_memory(size_t index = 0) const {
         if (index >= inputs_memory_count())
@@ -418,11 +429,22 @@ private:
         return set_arguments_impl(reinterpret_cast<typed_primitive_inst<PType>&>(instance));
     }

+    void set_arguments(primitive_inst& instance, kernel_arguments_data& args) override {
+        OPENVINO_ASSERT(instance.type() == PType::type_id(), "[GPU] Implementation type ", instance.type(),
+                        " does not match primitive type ", PType::type_id());
+        if (instance.get_impl() != this)
+            throw std::invalid_argument(
+                "Trying to set_arguments for primitive implementation with mismatching primitive instance");
+        return set_arguments_impl(reinterpret_cast<typed_primitive_inst<PType>&>(instance), args);
+    }
+
     kernel_arguments_data get_arguments(const primitive_inst& instance) const override {
         return get_arguments_impl(reinterpret_cast<const typed_primitive_inst<PType>&>(instance));
     }

     virtual void set_arguments_impl(typed_primitive_inst<PType>& /*instance*/) {}
+    virtual void set_arguments_impl(typed_primitive_inst<PType>& /*instance*/, kernel_arguments_data& /*args*/) {}
     virtual kernel_arguments_data get_arguments_impl(const typed_primitive_inst<PType>& /*instance*/) const {
         kernel_arguments_data args;
         return args;
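The raw kernel_selector::weights_reorder_params member is gone; callers now go through a small, null-safe API, and get_weights_reorder_kernel_params() (defined in primitive_inst.cpp later in this diff) wraps the params into a kernel_impl_params that can key the implementations cache. A sketch of a typical caller (hypothetical function, mirroring post_optimize_weights and update_weights):

// Sketch: the new weights-reorder surface of primitive_impl.
void sketch_handle_weights(cldnn::primitive_impl& impl) {
    if (!impl.need_weights_reorder())
        return;                                           // params == nullptr
    auto params = impl.get_weights_reorder_params();      // shared WeightsReorderParams
    auto key = impl.get_weights_reorder_kernel_params();  // cache-key wrapper
    (void)params; (void)key;
    impl.reset_weights_reorder_params();                  // drop kernel-source pointers
}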

View File

@@ -13,11 +13,12 @@
 #include "reshape_inst.h"
 #include "arg_max_min_inst.h"
 #include "shape_of_inst.h"
-#include "generic_layer.hpp"
 #include <sstream>

 #include "gemm_inst.h"
 #include "deconvolution_inst.h"
+#include "fully_connected_inst.h"
+#include "non_max_suppression_inst.h"
 #include "eltwise_inst.h"
 #include "pooling_inst.h"
 #include "reduce_inst.h"
@@ -155,50 +156,26 @@ std::pair<std::shared_ptr<reorder>, bool> reorder_factory::get_reorder(primitive
     return std::make_pair(reorder, false);
 }

-std::vector<std::pair<std::shared_ptr<primitive>, bool>> reorder_factory::get_weights_reorder(
-    primitive_id input_id,
-    const layout& old_layout,
-    const kernel_selector::weights_reorder_params& reorder_params) {
-    if (reorder_params.engine == kernel_selector::weights_reorder_params::Engine::NONE)
+std::pair<std::shared_ptr<primitive>, bool> reorder_factory::get_weights_reorder(primitive_id input_id,
+                                                                                 std::shared_ptr<WeightsReorderParams> reorder_params) {
+    if (reorder_params == nullptr)
         return {};

-    std::vector<std::pair<std::shared_ptr<primitive>, bool>> ret;
-
-    if (reorder_params.engine == kernel_selector::weights_reorder_params::Engine::CPU &&
-        reorder_params.cpuKernel != nullptr) {
-        const auto intermediate_format = from_weights_layout(reorder_params.cpuKernel->GetExpectedInputLayout());
-        const auto intermediate_type = from_weights_type(reorder_params.cpuKernel->GetExpectedInputType());
-        if (intermediate_format != old_layout.format || intermediate_type != old_layout.data_type) {
-            const layout intermediate_layout = { intermediate_type,
-                                                 intermediate_format,
-                                                 old_layout.get_tensor().transform(intermediate_format, 1) };
-            auto reorder = get_reorder(input_id, old_layout, intermediate_layout);
-            if (reorder.first) {
-                ret.push_back(reorder);
-                input_id = reorder.first->id;
-            }
-        }
-    }
-
-    layout expected_layout = from_weights_tensor(reorder_params.dest);
+    layout expected_layout = reorder_params->get_output_layout();

     cache_key ckey{ input_id, expected_layout, false };
     auto itr = _cached_generic_reorders.find(ckey);
     if (itr != _cached_generic_reorders.end()) {
-        ret.push_back(std::make_pair(itr->second, true));
+        return std::make_pair(itr->second, true);
     } else {
         auto count = _cached_generic_reorders.size();
         std::stringstream ss;
         ss << input_id << "_generic_layer_" << count;
-        auto reorder = std::make_shared<cldnn::generic_layer>(ss.str(), input_id, expected_layout, reorder_params);
+        auto reorder = std::make_shared<cldnn::generic_layer>(ss.str(), input_id, reorder_params);
         _cached_generic_reorders[ckey] = reorder;
-        ret.push_back(std::make_pair(reorder, false));
+        return std::make_pair(reorder, false);
     }
-
-    return ret;
 }

 bool layout_optimizer::is_format_supported(program_node& node, format::type fmt) {
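The factory now hands back at most one generic_layer per weights input; the old CPU-side intermediate-reorder chain is gone entirely. The bool in the returned pair tells the caller whether the primitive was reused from _cached_generic_reorders, which is exactly what post_optimize_weights feeds into add_intermediate earlier in this diff. A sketch of the contract (the arguments are assumed to exist at the call site):

// Sketch: single-reorder contract of the rewritten factory.
void sketch_get_weights_reorder(cldnn::reorder_factory& rf,
                                const cldnn::primitive_id& weights_id,
                                std::shared_ptr<cldnn::WeightsReorderParams> params) {
    auto r = rf.get_weights_reorder(weights_id, params);
    if (r.first) {
        // r.second == true:  reused from _cached_generic_reorders
        // r.second == false: fresh "<input>_generic_layer_<n>" primitive
    }
}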

View File

@@ -337,11 +337,6 @@ network::network(program::ptr program, const ExecutionConfig& config, stream::pt
     build_exec_order();
     validate_primitives();
     add_default_output_chains();
-
-    if (is_dynamic()) {
-        GPU_DEBUG_DEFINE_MEM_LOGGER("dynamic_network_initialization");
-        _in_mem_kernels_cache = std::unique_ptr<KernelsCache>(new KernelsCache(_in_mem_kernels_cache_capacity));
-    }
 }

 network::network(engine& engine,
@@ -537,7 +532,8 @@ void network::save(cldnn::BinaryOutputBuffer& ob) {
     kernels_cache.reset();
     for (const auto& p_inst : _exec_order) {
         if (p_inst->get_impl() != nullptr) {
-            kernels_cache.add_to_cached_kernels(p_inst->get_impl()->get_kernels());
+            auto const_impl = static_cast<const primitive_impl*>(p_inst->get_impl());
+            kernels_cache.add_to_cached_kernels(const_impl->get_kernels());
         }
     }
     ob << kernels_cache;

View File

@@ -11,11 +11,13 @@
 #include "fully_connected_inst.h"
 #include "convolution_inst.h"
 #include "crop_inst.h"
+#include "eltwise_inst.h"
 #include "deconvolution_inst.h"
 #include "shape_of_inst.h"
 #include "gemm_inst.h"
 #include "experimental_detectron_roi_feature_extractor_inst.hpp"
 #include "compilation_context.hpp"
+#include "implementation_map.hpp"
 #include "intel_gpu/plugin/common_utils.hpp"
 #include "intel_gpu/graph/network.hpp"
@@ -93,6 +95,19 @@ bool is_any_user_cpu(const std::list<const program_node*>& users) {
     return false;
 }

std::shared_ptr<kernel_impl_params> primitive_impl::get_weights_reorder_kernel_params() const {
if (!need_weights_reorder())
return nullptr;
auto reorder_kernel_params = std::make_shared<kernel_impl_params>();
auto prim = std::make_shared<generic_layer>("", "", _weights_reorder_params);
reorder_kernel_params->desc = prim;
reorder_kernel_params->unique_id = _weights_reorder_params->hash();
reorder_kernel_params->input_layouts.push_back(_weights_reorder_params->get_input_layout());
reorder_kernel_params->output_layouts.push_back(_weights_reorder_params->get_output_layout());
return reorder_kernel_params;
}

 kernel_impl_params primitive_impl::static_canonicalize_shapes(const kernel_impl_params& impl_params) {
     auto updated_impl_params = canonicalize_fused_shapes(impl_params);
@@ -787,19 +802,19 @@ event::ptr primitive_inst::update_weights() {
         return nullptr;

     auto& engine = _network.get_engine();
-    auto& weights_params = _impl->_weights_reorder_params;
+    auto reorder_kernel_params = _impl->get_weights_reorder_kernel_params();
     auto weights_idx = _node->get_primitive()->input.size();
     auto original_weights_memory = dep_memory_ptr(weights_idx);
     auto original_layout = original_weights_memory->get_layout();

-    if (weights_params.engine == kernel_selector::GenericKernelParams::Engine::NONE) {
+    if (!reorder_kernel_params) {
         // If kernel doesn't says that it doesn't require weights reorder, but weights were reordered previously, then
         // incorrect memory buffer may be assigned, so reset cached weights for such case
         _reordered_weights_cache.add(original_layout, original_weights_memory);
         _impl_params->weights_layout = optional_layout(original_layout);
     } else {
-        auto expected_layout = from_weights_tensor(weights_params.dest);
+        auto expected_layout = reorder_kernel_params->get_output_layout();
         // Set original patrial shape, because it may be lost during kernel_selector::weights_tensor -> layout conversion
         expected_layout.set_partial_shape(original_layout.get_partial_shape());
         _impl_params->weights_layout = optional_layout(expected_layout);
@@ -816,30 +831,27 @@ event::ptr primitive_inst::update_weights() {
             return nullptr;
         } else {
             GPU_DEBUG_PROFILED_STAGE_CACHE_HIT(false);
-            auto get_kernel_key = [&]() -> size_t {
-                auto seed = _node->get_primitive()->hash();
-                seed = hash_combine(seed, expected_layout.hash());
-                seed = hash_combine(seed, original_layout.hash());
-                return seed;
-            };
-
-            cldnn::kernel::ptr kernel = nullptr;
-            auto kernel_key = get_kernel_key();
-            auto& cache = get_network().get_in_mem_kernels_cache();
-            if (cache.has(kernel_key)) {
+            auto& cache = get_network().get_program()->get_implementations_cache();
+            auto reorder_inst = std::make_shared<generic_layer_inst>(get_network());
+
+            if (auto cached_impl = cache.get(*reorder_kernel_params)) {
                 GPU_DEBUG_TRACE_DETAIL << id() << ": reorder weights (cached) from " << original_layout.to_short_string()
                                        << " to " << expected_layout.to_short_string() << std::endl;
-                GPU_DEBUG_PROFILED_STAGE_CACHE_HIT(true);
-                kernel = cache.get(kernel_key);
+                reorder_inst->set_impl(cached_impl->clone());
             } else {
                 GPU_DEBUG_TRACE_DETAIL << id() << ": reorder weights from " << original_layout.to_short_string()
                                        << " to " << expected_layout.to_short_string() << std::endl;
+
+                auto factory = WeightsReordersFactory::get(impl_types::ocl, shape_types::static_shape);
+                auto reorder_impl = factory(*reorder_kernel_params);
                 auto& kernels_cache = get_network().get_program()->get_kernels_cache();
-                auto kernels = kernels_cache.compile(*_impl_params, {weights_params.clKernel->code.kernelString});
-                OPENVINO_ASSERT(kernels.size() == 1, "The output of kernel compile has issue");
-                auto& kernel_data = kernels.begin()->second;
-                kernel = kernel_data[0].first;
-                cache.add(kernel_key, kernel);
+                auto kernels = kernels_cache.compile(*_impl_params, reorder_impl->get_kernels_source());
+                OPENVINO_ASSERT(kernels.size() == 1, "[GPU] Expected number of compiled kernels is 1, but got ", kernels.size());
+                reorder_impl->set_kernels(kernels);
+
+                reorder_inst->set_impl(reorder_impl->clone());
+                cache.add(*reorder_kernel_params, reorder_impl->clone());
             }

             auto& stream = get_network().get_stream();
@@ -867,8 +879,10 @@ event::ptr primitive_inst::update_weights() {
             kernel_arguments_data args;
             args.inputs.push_back(original_weights_memory);
             args.outputs.push_back(weights_memory);
-            stream.set_arguments(*kernel, weights_params.clKernel->params, args);
-            auto ev = stream.enqueue_kernel(*kernel, weights_params.clKernel->params, args, {}, true);
+
+            auto reorder_impl = reorder_inst->get_impl();
+            reorder_impl->set_arguments(*reorder_inst, args);
+            auto ev = reorder_impl->execute({}, *reorder_inst);

             GPU_DEBUG_GET_INSTANCE(debug_config);
             GPU_DEBUG_IF(!debug_config->dump_profiling_data.empty()) {

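The update_weights() changes above replace the network-local kernel cache with the shared implementations cache: on a hit the cached impl is cloned onto a transient generic_layer_inst; on a miss WeightsReordersFactory builds an impl, its kernels are compiled once, and a clone is stored for future calls. A condensed, self-contained model of that hit/miss flow, assuming made-up FakeCache/FakeImpl types (not the cldnn API):
#include <cstddef>
#include <iostream>
#include <memory>
#include <unordered_map>
// Stand-in for a compiled reorder implementation (not the real generic_layer_impl).
struct FakeImpl {
    std::size_t key;
    std::shared_ptr<FakeImpl> clone() const { return std::make_shared<FakeImpl>(*this); }
};
// Stand-in for the program-wide implementations cache keyed by params hash.
class FakeCache {
    std::unordered_map<std::size_t, std::shared_ptr<FakeImpl>> entries;
public:
    std::shared_ptr<FakeImpl> get(std::size_t key) const {
        auto it = entries.find(key);
        return it == entries.end() ? nullptr : it->second;
    }
    void add(std::size_t key, std::shared_ptr<FakeImpl> impl) { entries.emplace(key, std::move(impl)); }
};
std::shared_ptr<FakeImpl> update_weights_flow(FakeCache& cache, std::size_t params_key) {
    if (auto cached = cache.get(params_key))
        return cached->clone();                                    // hit: reuse without recompiling
    auto built = std::make_shared<FakeImpl>(FakeImpl{params_key}); // miss: "compile" a new impl
    cache.add(params_key, built->clone());                         // store a clone for future lookups
    return built;
}
int main() {
    FakeCache cache;
    auto first = update_weights_flow(cache, 7);   // builds and caches
    auto second = update_weights_flow(cache, 7);  // served from the cache
    std::cout << (first && second ? "ok" : "fail") << std::endl;
}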
View File

@ -67,30 +67,21 @@ struct clKernelData {
bool skip_execution = false; bool skip_execution = false;
}; };
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// CPUKernel
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
struct CPUKernel {
virtual WeightsType GetExpectedInputType() = 0;
virtual WeightsLayout GetExpectedInputLayout() const { return WeightsLayout::oiyx; }
virtual void Execute(void* input, size_t input_size, void* output, size_t output_size) const = 0;
};
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// GenericKernelParams // GenericKernelParams
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
struct GenericKernelParams { struct GenericKernelParams {
enum class Engine { NONE, CPU, GPU }; enum class Engine { NONE, GPU };
Engine engine = Engine::NONE; Engine engine = Engine::NONE;
std::shared_ptr<clKernelData> clKernel; std::shared_ptr<clKernelData> clKernel;
std::shared_ptr<CPUKernel> cpuKernel;
}; };
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// WeightsReorderParams // WeightsReorderParams
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
struct WeightsReorderParams : public GenericKernelParams { struct WeightsReorderParams : public GenericKernelParams {
WeightsTensor src;
WeightsTensor dest; WeightsTensor dest;
}; };
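With the CPU reorder path deleted, kernel_selector's WeightsReorderParams shrinks to a src/dest tensor pair plus an optional GPU kernel, so consumers no longer branch on a CPU engine kind. A tiny self-contained illustration of the slimmed struct shape; WeightsTensor and clKernelData below are opaque placeholders, not the real kernel_selector types:
#include <cassert>
#include <memory>
// Opaque placeholders, not the real kernel_selector types.
struct WeightsTensor { int id = 0; };
struct clKernelData { int id = 0; };
struct GenericKernelParams {
    enum class Engine { NONE, GPU };   // the CPU variant no longer exists
    Engine engine = Engine::NONE;
    std::shared_ptr<clKernelData> clKernel;
};
struct WeightsReorderParams : GenericKernelParams {
    WeightsTensor src;                 // the input tensor is now tracked alongside dest
    WeightsTensor dest;
};
int main() {
    WeightsReorderParams p;
    p.src = WeightsTensor{1};
    p.dest = WeightsTensor{2};
    assert(p.engine == GenericKernelParams::Engine::NONE); // default until a kernel is attached
    p.engine = GenericKernelParams::Engine::GPU;
    p.clKernel = std::make_shared<clKernelData>();
    assert(p.clKernel != nullptr);
    return 0;
}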

View File

@ -133,6 +133,7 @@ bool UpdateWeightsParams(weight_bias_params& newParams,
weightsReorderParams.engine = WeightsReorderParams::Engine::GPU; weightsReorderParams.engine = WeightsReorderParams::Engine::GPU;
weightsReorderParams.clKernel = std::make_shared<clKernelData>(kernels_data[0].kernels[0]); weightsReorderParams.clKernel = std::make_shared<clKernelData>(kernels_data[0].kernels[0]);
weightsReorderParams.src = r_params.input;
weightsReorderParams.dest = r_params.output; weightsReorderParams.dest = r_params.output;
newParams.weights = newParams.weights.TransformIgnorePadding(reqLayout, dtype, groups); newParams.weights = newParams.weights.TransformIgnorePadding(reqLayout, dtype, groups);

View File

@ -0,0 +1,111 @@
// Copyright (C) 2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include "test_utils.h"
#include "intel_gpu/graph/network.hpp"
#include "intel_gpu/graph/program.hpp"
#include "intel_gpu/primitives/input_layout.hpp"
#include "intel_gpu/primitives/data.hpp"
#include "generic_layer_inst.h"
#include "fully_connected_inst.h"
#include "implementation_map.hpp"
#include "graph/impls/ocl/register.hpp"
#include <memory>
using namespace cldnn;
using namespace ::tests;
TEST(weights_factory, impl_types) {
program::init_primitives();
ASSERT_NO_THROW(WeightsReordersFactory::get(impl_types::ocl, shape_types::static_shape));
ASSERT_NO_THROW(WeightsReordersFactory::get(impl_types::any, shape_types::static_shape));
ASSERT_ANY_THROW(WeightsReordersFactory::get(impl_types::cpu, shape_types::static_shape));
ASSERT_ANY_THROW(WeightsReordersFactory::get(impl_types::onednn, shape_types::static_shape));
}
TEST(weights_factory, shape_types) {
program::init_primitives();
ASSERT_NO_THROW(WeightsReordersFactory::get(impl_types::ocl, shape_types::static_shape));
ASSERT_ANY_THROW(WeightsReordersFactory::get(impl_types::ocl, shape_types::dynamic_shape));
}
TEST(weights_factory, reorder_test) {
auto& engine = get_test_engine();
const int input_f = 32, output_f = 32;
auto weights_layout = layout(ov::PartialShape{ output_f, input_f }, data_types::f32, format::bfyx);
auto weights_data_input = engine.allocate_memory(weights_layout);
auto weights_data_vec = generate_random_1d<float>(output_f * input_f, -1, 1);
set_values(weights_data_input, weights_data_vec);
cldnn::topology topology {
input_layout("input", layout{ ov::PartialShape{ -1, input_f }, data_types::f32, format::bfyx }),
data("weights", weights_data_input),
fully_connected("fc", input_info("input"), "weights")
};
ov::intel_gpu::ImplementationDesc fc_impl_desc = { format::bfyx, "fully_connected_gpu_bf_tiled", impl_types::ocl };
ExecutionConfig config = get_test_default_config(engine);
config.set_property(ov::intel_gpu::optimize_data(true));
config.set_property(ov::intel_gpu::force_implementations(ov::intel_gpu::ImplForcingMap{ {"fc", fc_impl_desc} }));
config.set_property(ov::intel_gpu::allow_new_shape_infer(true));
cldnn::network network(engine, topology, config);
auto inst = network.get_primitive("fc");
auto impl = inst->get_impl();
ASSERT_TRUE(impl != nullptr);
// Get required WeightsReorderParams
auto weights_reorder_params = impl->get_weights_reorder_params();
ASSERT_TRUE(weights_reorder_params != nullptr);
// Construct kernel_impl_params for the weights reorder based on the requested WeightsReorderParams
auto reorder_kernel_params = std::make_shared<kernel_impl_params>();
reorder_kernel_params->desc = std::make_shared<generic_layer>("weights_reorder", "", weights_reorder_params);
reorder_kernel_params->unique_id = weights_reorder_params->hash();
reorder_kernel_params->input_layouts.push_back(weights_reorder_params->get_input_layout());
reorder_kernel_params->output_layouts.push_back(weights_reorder_params->get_output_layout());
// Create new generic_layer_impl
auto factory = WeightsReordersFactory::get(impl_types::ocl, shape_types::static_shape);
auto reorder_impl = factory(*reorder_kernel_params);
ASSERT_TRUE(reorder_impl != nullptr);
// Compile kernel
auto& kernel_cache = network.get_program()->get_kernels_cache();
auto kernels = kernel_cache.compile(*reorder_kernel_params, reorder_impl->get_kernels_source());
ASSERT_TRUE(kernels.size() == 1);
reorder_impl->set_kernels(kernels);
// Allocate memory and execute generic_layer
auto output_weights_layout = weights_reorder_params->get_output_layout();
auto weights_data_output = engine.allocate_memory({ output_weights_layout });
kernel_arguments_data args;
args.inputs.push_back(weights_data_input);
args.outputs.push_back(weights_data_output);
auto reorder_inst = std::make_shared<generic_layer_inst>(network);
reorder_inst->set_impl(reorder_impl->clone());
reorder_inst->get_impl()->set_arguments(*reorder_inst, args);
reorder_inst->get_impl()->execute({}, *reorder_inst);
network.get_stream().finish();
// Compare with expected results
cldnn::mem_lock<float> output_ptr(weights_data_output, get_test_stream());
for (int o = 0; o < output_f; o++) {
for (int i = 0; i < input_f; i++) {
auto tensor_coord = tensor(std::vector<tensor::value_type>{o, i}, 0);
size_t output_idx = output_weights_layout.get_linear_offset(tensor_coord);
ASSERT_EQ(weights_data_vec[o * input_f + i], output_ptr[output_idx]);
}
}
}

View File

@ -8,6 +8,7 @@
#include <intel_gpu/primitives/fully_connected.hpp> #include <intel_gpu/primitives/fully_connected.hpp>
#include <intel_gpu/primitives/data.hpp> #include <intel_gpu/primitives/data.hpp>
#include "fully_connected_inst.h"
#include "compilation_context.hpp" #include "compilation_context.hpp"
#include "program_wrapper.h" #include "program_wrapper.h"

View File

@ -10,6 +10,7 @@
#include <intel_gpu/primitives/fully_connected.hpp> #include <intel_gpu/primitives/fully_connected.hpp>
#include <intel_gpu/primitives/gather.hpp> #include <intel_gpu/primitives/gather.hpp>
#include <intel_gpu/primitives/permute.hpp> #include <intel_gpu/primitives/permute.hpp>
#include <intel_gpu/primitives/generic_layer.hpp>
using namespace cldnn; using namespace cldnn;
using namespace ::tests; using namespace ::tests;
@ -109,3 +110,19 @@ TEST(primitive_comparison, permute) {
ASSERT_EQ(permute_prim, permute_prim_eq); ASSERT_EQ(permute_prim, permute_prim_eq);
ASSERT_NE(permute_prim, permute_prim_order); ASSERT_NE(permute_prim, permute_prim_order);
} }
TEST(primitive_comparison, generic_layer) {
auto shape = ov::PartialShape{1, 2, 3, 4};
auto data_type = data_types::f32;
auto format_in = format::bfyx;
auto format_out = format::os_iyx_osv16;
auto input_layout = layout{shape, data_type, format_in};
auto output_layout = layout{shape, data_type, format_out};
auto generic_layer_prim = generic_layer("generic_layer", "", std::make_shared<WeightsReorderParams>(input_layout, output_layout));
auto generic_layer_eq_prim = generic_layer("generic_layer_eq", "", std::make_shared<WeightsReorderParams>(input_layout, output_layout));
auto generic_layer_different_prim = generic_layer("generic_layer", "", std::make_shared<WeightsReorderParams>(output_layout, input_layout));
ASSERT_EQ(generic_layer_prim, generic_layer_eq_prim);
ASSERT_NE(generic_layer_prim, generic_layer_different_prim);
}

View File

@ -13,6 +13,7 @@
#include "reshape_inst.h" #include "reshape_inst.h"
#include "reorder_inst.h" #include "reorder_inst.h"
#include "broadcast_inst.h" #include "broadcast_inst.h"
#include "fully_connected_inst.h"
#include "pass_manager.h" #include "pass_manager.h"
#include "to_string_utils.h" #include "to_string_utils.h"

View File

@ -12,6 +12,7 @@
#include "eltwise_inst.h" #include "eltwise_inst.h"
#include "reduce_inst.h" #include "reduce_inst.h"
#include "reshape_inst.h" #include "reshape_inst.h"
#include "fully_connected_inst.h"
#include "gemm_inst.h" #include "gemm_inst.h"
#include "convolution_inst.h" #include "convolution_inst.h"
#include "pass_manager.h" #include "pass_manager.h"

View File

@ -2351,3 +2351,60 @@ INSTANTIATE_TEST_SUITE_P(
::testing::Values(2, 9, 16, 32, 64, 128), ::testing::Values(2, 9, 16, 32, 64, 128),
::testing::Values(false, true)) ::testing::Values(false, true))
); );
TEST(fully_connected_gpu, has_cached_weights_reorder) {
auto& engine = get_test_engine();
const int32_t input_f = 3, input_b = 1, weight_b = 4;
auto input_dyn_layout = layout{ ov::PartialShape{ ov::Dimension(1, 10), input_f }, data_types::f32, format::bfyx };
auto input_data = engine.allocate_memory(layout{ ov::PartialShape{ input_b, input_f }, data_types::f32, format::bfyx });
auto weights_data = engine.allocate_memory({ ov::PartialShape{ weight_b, input_f }, data_types::f32, format::bfyx });
set_values(input_data, { -0.5f, 2.0f, 0.5f });
set_values(weights_data, { 1.5f, 1.0f, 0.5f, -1.0f, 0.0f, 0.5f, 0.5f, -0.5f, -2.0f, -0.5f, 1.0f, 1.5f });
cldnn::topology topology{
input_layout("input", input_dyn_layout),
data("weights", weights_data),
fully_connected("fc", input_info("input"), "weights")
};
ov::intel_gpu::ImplementationDesc fc_impl_desc = { format::bfyx, "fully_connected_gpu_bf_tiled", impl_types::ocl };
ExecutionConfig config = get_test_default_config(engine);
config.set_property(ov::intel_gpu::optimize_data(true));
config.set_property(ov::intel_gpu::force_implementations(ov::intel_gpu::ImplForcingMap{ {"fc", fc_impl_desc} }));
config.set_property(ov::intel_gpu::allow_new_shape_infer(true));
network network(engine, topology, config);
network.set_input_data("input", input_data);
auto outputs = network.execute();
ASSERT_EQ(outputs.size(), size_t(1));
ASSERT_EQ(outputs.begin()->first, "fc");
auto output_prim_mem = outputs.begin()->second.get_memory();
auto inst = network.get_primitive("fc");
auto impl = inst->get_impl();
ASSERT_TRUE(impl != nullptr);
ASSERT_TRUE(impl->is_dynamic());
auto reorder_kernel_params = impl->get_weights_reorder_kernel_params();
ASSERT_TRUE(reorder_kernel_params != nullptr);
auto reorder_impl = network.get_program()->get_implementations_cache().get(*reorder_kernel_params);
ASSERT_TRUE(reorder_impl != nullptr);
auto out_l = network.get_output_layout(outputs.begin()->first);
ASSERT_EQ(output_prim_mem->get_layout().batch(), align_to(input_b, 8)); // fake_alignment
ASSERT_EQ(out_l.batch(), input_b);
ASSERT_EQ(out_l.feature(), weight_b);
ASSERT_EQ(out_l.spatial(0), 1);
ASSERT_EQ(out_l.spatial(1), 1);
cldnn::mem_lock<float> output_ptr (output_prim_mem, get_test_stream());
ASSERT_EQ(1.5f, output_ptr[0]);
ASSERT_EQ(0.75f, output_ptr[1]);
ASSERT_EQ(-2.25f, output_ptr[2]);
ASSERT_EQ(3.0f, output_ptr[3]);
}

View File

@ -14,10 +14,10 @@
#include <intel_gpu/primitives/mvn.hpp> #include <intel_gpu/primitives/mvn.hpp>
#include <intel_gpu/primitives/permute.hpp> #include <intel_gpu/primitives/permute.hpp>
#include <intel_gpu/primitives/reshape.hpp> #include <intel_gpu/primitives/reshape.hpp>
#include <intel_gpu/primitives/quantize.hpp>
#include "eltwise_inst.h" #include "primitive_inst.h"
using namespace cldnn; using namespace cldnn;
using namespace tests; using namespace tests;