[GPU] Weights reorders primitive caching (#16638)
Co-authored-by: Vladimir Paramuzov <vladimir.paramuzov@intel.com>
parent 42ef81a9e6
commit 808647dfb3
@@ -225,10 +225,6 @@ public:
     /// Returns memory state @p variable_id of stateful network
    VariableState& get_variable_memory(const std::string &variable_id);
 
-    /// Return in_mem_kernels_cache
-    KernelsCache& get_in_mem_kernels_cache() const { return *_in_mem_kernels_cache; }
-    std::mutex& get_impl_cache_mutex() const { return _in_mem_cache_mutex; }
-
    const ExecutionConfig& get_config() const { return _config; }
 
private:
@@ -260,8 +256,6 @@ private:
    std::unordered_map<primitive_id, event::ptr> _events;
    output_chains_map _output_chains;
 
-    mutable std::mutex _in_mem_cache_mutex;
-
    void build_exec_order();
    void allocate_primitive_instance(program_node const& node);
    void transfer_memory_to_device(std::shared_ptr<primitive_inst> instance, program_node const& node);
@@ -273,10 +267,6 @@ private:
    void calculate_weights_cache_capacity();
    output_chains_map::iterator add_output_chain(std::shared_ptr<primitive_inst>& p_inst);
 
-    // Move from cldnn::program to cldnn::network for multi-threads issue.
-    std::unique_ptr<KernelsCache> _in_mem_kernels_cache;
-    const size_t _in_mem_kernels_cache_capacity = 10000;
-
#ifdef GPU_DEBUG_CONFIG
    int64_t iteration = 0;
#endif
@@ -0,0 +1,86 @@
+// Copyright (C) 2018-2023 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+#include "intel_gpu/primitives/primitive.hpp"
+#include "intel_gpu/runtime/memory.hpp"
+
+#include <vector>
+
+namespace cldnn {
+
+struct WeightsReorderParams {
+    WeightsReorderParams(layout in_layout, layout out_layout) : _in_layout(in_layout), _out_layout(out_layout) {}
+
+    virtual size_t hash() const {
+        return hash_combine(_in_layout.hash(), _out_layout.hash());
+    }
+
+    virtual bool operator==(const WeightsReorderParams& rhs) const {
+        if (typeid(*this) != typeid(rhs))
+            return false;
+
+        return _in_layout == rhs._in_layout &&
+               _out_layout == rhs._out_layout;
+    }
+
+    layout get_input_layout() const { return _in_layout; }
+    layout get_output_layout() const { return _out_layout; }
+
+    virtual ~WeightsReorderParams() = default;
+
+protected:
+    layout _in_layout;
+    layout _out_layout;
+};
+
+/// @brief Changes how data is ordered in memory. Value type is not changed & all information is preserved.
+/// @details Corresponding values are bitwise equal before/after reorder.
+struct generic_layer : public primitive_base<generic_layer> {
+    CLDNN_DECLARE_PRIMITIVE(generic_layer)
+
+    /// @brief Constructs generic_layer primitive which performs the weights reorder described by params.
+    /// @param id This primitive id.
+    /// @param input Input primitive id.
+    /// @param params Parameters of the weights reorder to perform.
+    generic_layer(const primitive_id& id,
+                  const primitive_id& input,
+                  std::shared_ptr<WeightsReorderParams> params,
+                  const padding& output_padding = padding())
+        : primitive_base(id, {input}, {output_padding}), params(params) {}
+
+    std::shared_ptr<WeightsReorderParams> params;
+
+    size_t hash() const override {
+        size_t seed = primitive::hash();
+
+        if (params)
+            seed = hash_combine(seed, params->hash());
+
+        return seed;
+    }
+
+    bool operator==(const primitive& rhs) const override {
+        if (!compare_common_params(rhs))
+            return false;
+
+        auto rhs_casted = downcast<const generic_layer>(rhs);
+
+        if ((params == nullptr) != (rhs_casted.params == nullptr))
+            return false;
+
+        if (params != nullptr)
+            return *params == *rhs_casted.params;
+
+        return true;
+    }
+
+protected:
+    std::vector<std::reference_wrapper<const primitive_id>> get_dependencies() const override { return {}; }
+};
+/// @}
+/// @}
+/// @}
+}  // namespace cldnn
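The virtual hash() and operator== above are what make WeightsReorderParams usable as a cache key: a compiled weights-reorder kernel can be looked up and reused whenever another primitive needs the same input/output layout conversion. A minimal sketch of that lookup pattern, with simplified stand-in types (layout_t and the boost-style hash_combine body are assumptions, not the real cldnn definitions):

    #include <cstddef>
    #include <functional>
    #include <memory>
    #include <string>
    #include <unordered_map>

    // Simplified stand-ins for the cldnn types (hypothetical, for illustration only).
    struct layout_t {
        std::string format;
        size_t hash() const { return std::hash<std::string>{}(format); }
        bool operator==(const layout_t& rhs) const { return format == rhs.format; }
    };

    inline size_t hash_combine(size_t seed, size_t v) {
        return seed ^ (v + 0x9e3779b9 + (seed << 6) + (seed >> 2));  // boost-style mix
    }

    struct ReorderKey {
        layout_t in, out;
        size_t hash() const { return hash_combine(in.hash(), out.hash()); }
        bool operator==(const ReorderKey& r) const { return in == r.in && out == r.out; }
    };

    struct KeyHash { size_t operator()(const ReorderKey& k) const { return k.hash(); } };

    int main() {
        // Compiled reorder "kernels" cached by (in_layout, out_layout).
        std::unordered_map<ReorderKey, std::shared_ptr<int>, KeyHash> cache;
        ReorderKey key{{"oiyx"}, {"os_iyx_osv16"}};
        if (cache.find(key) == cache.end())
            cache.emplace(key, std::make_shared<int>(42));  // compile once, reuse afterwards
        return 0;
    }

The map consults hash() first and falls back to operator== on collisions, which is why the two must stay consistent for every subclass.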
@@ -21,6 +21,11 @@ generic_layer_node::typed_program_node(const std::shared_ptr<generic_layer> prim
 generic_layer_inst::typed_primitive_inst(network& network, generic_layer_node const& node)
     : parent(network, node) {}
 
+generic_layer_inst::typed_primitive_inst(network& network)
+    : parent(network) {
+    _type = generic_layer::type_id();
+}
+
 std::string generic_layer_inst::to_string(generic_layer_node const& node) {
     auto node_info = node.desc_to_json();
 
@@ -2,14 +2,16 @@
 // SPDX-License-Identifier: Apache-2.0
 //
 
-#include <algorithm>
-
 #include "pass_manager.h"
 #include "program_node.h"
 #include "mutable_data_inst.h"
 #include "convert_color_inst.h"
+#include "fully_connected_inst.h"
 #include "assign_inst.h"
 #include "tensor_type.h"
 
+#include <algorithm>
 #include <memory>
 #include <vector>
 #include <stdexcept>
@@ -2,15 +2,18 @@
 // SPDX-License-Identifier: Apache-2.0
 //
 
+#include "intel_gpu/runtime/engine.hpp"
+#include "intel_gpu/runtime/itt.hpp"
+
 #include "pass_manager.h"
 #include "data_inst.h"
 #include "mutable_data_inst.h"
 #include "reshape_inst.h"
 #include "quantize_inst.h"
 #include "arg_max_min_inst.h"
+#include "fully_connected_inst.h"
 #include "program_node.h"
-#include "intel_gpu/runtime/engine.hpp"
-#include "intel_gpu/runtime/itt.hpp"
 #include <iostream>
 #include <cmath>
 #include <iomanip>
@@ -9,6 +9,7 @@
 
 #include "gemm_inst.h"
 #include "pooling_inst.h"
+#include "fully_connected_inst.h"
 
 #include <iterator>
 #include <vector>
@@ -4,6 +4,7 @@
 
 #include "pass_manager.h"
 #include "impls/ocl/primitive_base.hpp"
+#include "fully_connected_inst.h"
 #include "fully_connected/fully_connected_params.h"
 #include <memory>
 #include <stdexcept>
@@ -4,8 +4,11 @@
 
 #include "pass_manager.h"
 #include "program_helpers.h"
-#include "include/binary_convolution_inst.h"
-#include "include/deformable_convolution_inst.h"
+#include "convolution_inst.h"
+#include "binary_convolution_inst.h"
+#include "deconvolution_inst.h"
+#include "deformable_convolution_inst.h"
+#include "fully_connected_inst.h"
 #include "lstm_dynamic_input_inst.h"
 
 namespace cldnn {
@@ -13,7 +16,6 @@ namespace cldnn {
 post_optimize_weights::post_optimize_weights(reorder_factory& rf_ref)
     : base_pass("post_optimize_weights"), _rf(rf_ref) {}
 
-// function which prepares given primitive for weights optimization
 template<typename T> post_optimize_weights::weights_bias_offset post_optimize_weights::get_weights_bias_offset(const T& node) {
     return weights_bias_offset(node.get_primitive()->input.size(), program_helpers::wrap_if_single(node.get_primitive()->weights).size());
 }
@@ -37,15 +39,13 @@ void post_optimize_weights::optimize_weights(T& node, program& p) {
         return;
 
     auto output_layout = node.get_output_layout();
-    auto& weights_reorder_params = impl->_weights_reorder_params;
+    auto weights_reorder_params = impl->get_weights_reorder_params();
 
     for (auto i = offsets.weights_offset; i < offsets.bias_offset; i++) {
         auto& weights_node = node.get_dependency(i);
-        auto weights_layout = weights_node.get_output_layout();
-
-        auto reorders = _rf.get_weights_reorder(weights_node.id(), weights_layout, weights_reorder_params);
+        auto reorder = _rf.get_weights_reorder(weights_node.id(), weights_reorder_params);
 
-        for (auto& reorder : reorders) {
+        if (reorder.first) {
             // insert new generic_layer node to topology
             p.add_intermediate(reorder.first, node, i, !reorder.second);
             // set generic_layer's node output layout and implementation
@@ -65,9 +65,7 @@ void post_optimize_weights::optimize_weights(T& node, program& p) {
     }
 
     // Reset weights reorder params to not keep source code pointer
-    weights_reorder_params.engine = kernel_selector::generic_kernel_params::Engine::NONE;
-    weights_reorder_params.clKernel = nullptr;
-    weights_reorder_params.cpuKernel = nullptr;
+    impl->reset_weights_reorder_params();
 
     // set the old output layout and do not invalidate users as change of weights will not affect output layout
     node.set_output_layout(output_layout, false);
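The reset above matters for memory: the reorder params can keep a shared_ptr to generated kernel source alive, and dropping them once the reorder node is in the graph releases that source. A self-contained illustration of the release (cl_kernel_data here is a simplified stand-in, not the real kernel_selector type):

    #include <memory>
    #include <string>

    struct cl_kernel_data { std::string source; };

    int main() {
        auto k = std::make_shared<cl_kernel_data>();
        k->source = std::string(1 << 20, 'x');   // large generated kernel source
        std::weak_ptr<cl_kernel_data> watch = k;
        k.reset();                                // analogous to reset_weights_reorder_params()
        return watch.expired() ? 0 : 1;           // source memory has been released
    }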
@@ -25,6 +25,7 @@
 #include "softmax_inst.h"
 #include "resample_inst.h"
 #include "depth_to_space_inst.h"
+#include "fully_connected_inst.h"
 #include "space_to_depth_inst.h"
 #include "gather_inst.h"
 #include "gather_nd_inst.h"
|
@ -2,13 +2,12 @@
|
|||||||
// SPDX-License-Identifier: Apache-2.0
|
// SPDX-License-Identifier: Apache-2.0
|
||||||
//
|
//
|
||||||
|
|
||||||
|
#include "intel_gpu/runtime/debug_configuration.hpp"
|
||||||
|
|
||||||
#include "pass_manager.h"
|
#include "pass_manager.h"
|
||||||
#include "program_helpers.h"
|
#include "program_helpers.h"
|
||||||
#include "binary_convolution_inst.h"
|
|
||||||
#include <vector>
|
|
||||||
#include <list>
|
|
||||||
#include <utility>
|
|
||||||
|
|
||||||
|
#include "binary_convolution_inst.h"
|
||||||
#include "reshape_inst.h"
|
#include "reshape_inst.h"
|
||||||
#include "convert_color_inst.h"
|
#include "convert_color_inst.h"
|
||||||
#include "one_hot_inst.h"
|
#include "one_hot_inst.h"
|
||||||
@@ -16,7 +15,11 @@
 #include "depth_to_space_inst.h"
 #include "concatenation_inst.h"
 #include "region_yolo_inst.h"
-#include "intel_gpu/runtime/debug_configuration.hpp"
+#include "fully_connected_inst.h"
+
+#include <vector>
+#include <list>
+#include <utility>
 
 using namespace cldnn;
 
@@ -13,6 +13,8 @@
 #include "mvn_inst.h"
 #include "to_string_utils.h"
 #include "pooling_inst.h"
+#include "reshape_inst.h"
+#include "fully_connected_inst.h"
 
 #ifdef ENABLE_ONEDNN_FOR_GPU
 #include "gemm_inst.h"
@@ -5,6 +5,7 @@
 #include "pass_manager.h"
 #include "data_inst.h"
 #include "mutable_data_inst.h"
+#include "fully_connected_inst.h"
 #include "gemm_inst.h"
 #include "program_node.h"
 #include "intel_gpu/runtime/engine.hpp"
@@ -31,6 +31,7 @@ public:
 
     void init_kernels(const kernels_cache&, const kernel_impl_params&) override {}
     void set_arguments(primitive_inst& /*instance*/) override {}
+    void set_arguments(primitive_inst& /*instance*/, kernel_arguments_data& /*args*/) override {}
     kernel_arguments_data get_arguments(const primitive_inst& /*instance*/) const override {
         kernel_arguments_data args;
         return args;
@@ -400,7 +400,7 @@ struct non_max_suppression_impl : typed_primitive_impl<non_max_suppression> {
         return make_unique<non_max_suppression_impl>(*this);
     }
 
-    non_max_suppression_impl() : parent(kernel_selector::weights_reorder_params(), "non_max_suppression_impl") {}
+    non_max_suppression_impl() : parent("non_max_suppression_impl") {}
 
     event::ptr execute_impl(const std::vector<event::ptr>& event, typed_primitive_inst<non_max_suppression>& instance) override {
         for (auto e : event) {
@@ -14,8 +14,8 @@ struct generic_layer_impl : typed_primitive_impl<generic_layer> {
     using parent::parent;
 
     kernel_selector::cl_kernel_data _cl_kernel_data;
-    std::vector<kernel::ptr> _kernels;
-    std::string _cached_kernel_id;
+    kernel::ptr _kernel;
+    kernel_id _cached_kernel_id;
 
     DECLARE_OBJECT_TYPE_SERIALIZATION
 
@@ -27,18 +27,21 @@ struct generic_layer_impl : typed_primitive_impl<generic_layer> {
 
     generic_layer_impl(const generic_layer_impl& other)
     : _cl_kernel_data(other._cl_kernel_data)
-    , _kernels({})
+    , _kernel(nullptr)
     , _cached_kernel_id(other._cached_kernel_id) {
-        if (other._kernels.empty()) {
-            throw std::runtime_error("Can't copy generic_layer_impl node: kernels vector is empty");
-        }
-        _kernels.push_back(other._kernels.front()->clone());
+        OPENVINO_ASSERT(other._kernel, "[GPU] Can't copy generic_layer_impl node: kernel is nullptr");
+        _kernel = other._kernel->clone();
     }
 
-    generic_layer_impl(const generic_layer_node& arg)
-    : _cl_kernel_data(*arg.get_primitive()->generic_params.clKernel.get())
-    , _kernels()
-    , _cached_kernel_id() { }
+    generic_layer_impl(const kernel_impl_params& params)
+    : _cl_kernel_data()
+    , _kernel(nullptr)
+    , _cached_kernel_id() {
+        auto reorder_params = params.typed_desc<generic_layer>()->params;
+        auto casted_params = std::dynamic_pointer_cast<WeightsReorderParamsOCL>(reorder_params);
+        OPENVINO_ASSERT(casted_params, "[GPU] Invalid weights reorder parameters type for ", params.desc->id, " node");
+        _cl_kernel_data = *casted_params->get_cl_kernel();
+    }
 
     std::vector<std::shared_ptr<cldnn::kernel_string>> get_kernels_source() override {
         std::vector<std::shared_ptr<cldnn::kernel_string>> kernel_strings;
@@ -47,11 +50,11 @@ struct generic_layer_impl : typed_primitive_impl<generic_layer> {
     }
 
     std::vector<kernel::ptr> get_kernels() const override {
-        return _kernels;
+        return {_kernel};
     }
 
     void save(BinaryOutputBuffer& ob) const override {
-        ob <<_cl_kernel_data;
+        ob << _cl_kernel_data;
         ob << _cached_kernel_id;
     }
 
@@ -61,21 +64,27 @@ struct generic_layer_impl : typed_primitive_impl<generic_layer> {
     }
 
     void init_kernels(const kernels_cache& kernels_cache, const kernel_impl_params& params) override {
-        _kernels.clear();
+        _kernel = nullptr;
         auto compiled_kernels = kernels_cache.get_kernels(params);
-        _kernels.insert(_kernels.begin(), compiled_kernels.begin(), compiled_kernels.end());
+        OPENVINO_ASSERT(compiled_kernels.size() == 1, "[GPU] Unexpected number of kernels for generic_layer during init_kernels() call");
+        _kernel = compiled_kernels.front();
     }
 
     void init_by_cached_kernels(const kernels_cache& kernels_cache) override {
-        _kernels.emplace_back(kernels_cache.get_kernel_from_cached_kernels(_cached_kernel_id));
+        _kernel = kernels_cache.get_kernel_from_cached_kernels(_cached_kernel_id);
     }
 
     void set_cached_kernel_ids(const kernels_cache& kernels_cache) override {
-        _cached_kernel_id = kernels_cache.get_cached_kernel_id(_kernels[0]);
+        _cached_kernel_id = kernels_cache.get_cached_kernel_id(_kernel);
+    }
+
+    void set_kernels(cldnn::kernels_cache::compiled_kernels kernels) override {
+        OPENVINO_ASSERT(kernels.size() == 1 &&
+                        kernels.begin()->second.size() == 1, "[GPU] Unexpected number of kernels for generic_layer");
+        _kernel = kernels.begin()->second[0].first;
     }
 
     void set_arguments_impl(generic_layer_inst& instance) override {
-        stream& stream = instance.get_network().get_stream();
         kernel_arguments_data args;
         args.scalars = &_cl_kernel_data.params.scalars;
 
@@ -83,7 +92,13 @@ struct generic_layer_impl : typed_primitive_impl<generic_layer> {
             args.inputs.push_back(instance.input_memory_ptr(i));
         }
         args.outputs.push_back(instance.output_memory_ptr());
-        stream.set_arguments(*_kernels.front(), _cl_kernel_data.params, args);
+        set_arguments_impl(instance, args);
+    }
+
+    void set_arguments_impl(generic_layer_inst& instance, kernel_arguments_data& args) override {
+        stream& stream = instance.get_network().get_stream();
+        stream.set_arguments(*_kernel, _cl_kernel_data.params, args);
     }
 
     event::ptr execute_impl(const std::vector<event::ptr>& events, generic_layer_inst& instance) override {
@@ -95,58 +110,23 @@ struct generic_layer_impl : typed_primitive_impl<generic_layer> {
             args.inputs.push_back(instance.input_memory_ptr(i));
         }
         args.outputs.push_back(instance.output_memory_ptr());
-        return stream.enqueue_kernel(*_kernels.front(), _cl_kernel_data.params, args, events, true);
+        return stream.enqueue_kernel(*_kernel, _cl_kernel_data.params, args, events, true);
+    }
+
+    static std::unique_ptr<primitive_impl> create(const kernel_impl_params& params) {
+        return make_unique<generic_layer_impl>(params);
     }
 };
 
-// TODO: move this file to cpu folder and add a new traget to 'cldnn::engine_types'
-struct generic_layer_cpu : typed_primitive_impl<generic_layer> {
-    const generic_layer_node& outer;
-    DECLARE_OBJECT_TYPE_SERIALIZATION
-
-    std::unique_ptr<primitive_impl> clone() const override {
-        return make_unique<generic_layer_cpu>(*this);
-    }
-
-    explicit generic_layer_cpu(const generic_layer_node& arg) : outer(arg) {}
-
-    event::ptr execute_impl(const std::vector<event::ptr>& events, generic_layer_inst& instance) override {
-        stream& stream = instance.get_network().get_stream();
-        auto input_mem = instance.input_memory_ptr();
-        auto output_mem = instance.output_memory_ptr();
-
-        auto ev = stream.create_user_event(false);
-        std::vector<event::ptr> tmp_events(events);
-
-        for (auto& a : events) {
-            a->wait();
-        }
-
-        mem_lock<uint8_t, mem_lock_type::read> old_pointer(input_mem, stream);
-        mem_lock<uint8_t, mem_lock_type::write> new_pointer(output_mem, stream);
-
-        const auto& cpu_kernel = *outer.get_primitive()->generic_params.cpuKernel.get();
-
-        cpu_kernel.Execute(old_pointer.data(), old_pointer.size(), new_pointer.data(), new_pointer.size());
-
-        ev->set();
-        return ev;
-    }
-
-    void init_kernels(const kernels_cache&, const kernel_impl_params&) override {}
-};
-
-static std::unique_ptr<primitive_impl> create(const generic_layer_node& arg, const kernel_impl_params&) {
-    if (arg.get_primitive()->generic_params.engine == kernel_selector::generic_kernel_params::Engine::GPU) {
-        return make_unique<generic_layer_impl>(arg);
-    } else {
-        return make_unique<generic_layer_cpu>(arg);
-    }
+static std::unique_ptr<primitive_impl> create(const generic_layer_node& arg, const kernel_impl_params& params) {
+    return make_unique<generic_layer_impl>(params);
 }
 
 namespace detail {
 attach_generic_layer_impl::attach_generic_layer_impl() {
     implementation_map<generic_layer>::add(cldnn::impl_types::ocl, create, {});
 
+    WeightsReordersFactory::add(cldnn::impl_types::ocl, shape_types::static_shape, generic_layer_impl::create);
 }
 
 } // namespace detail
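After this rewrite generic_layer_impl owns a single kernel::ptr and deep-clones it on copy instead of managing a one-element vector. A stripped-down sketch of that ownership model (the kernel type below is a hypothetical stand-in; the real runtime kernel also exposes clone()):

    #include <memory>
    #include <stdexcept>

    struct kernel {
        using ptr = std::shared_ptr<kernel>;
        virtual ~kernel() = default;
        virtual ptr clone() const { return std::make_shared<kernel>(*this); }
    };

    struct reorder_impl {
        kernel::ptr _kernel;

        reorder_impl() = default;
        // Copying an impl deep-clones the kernel, so each copy owns its own handle.
        reorder_impl(const reorder_impl& other) : _kernel(nullptr) {
            if (!other._kernel)
                throw std::runtime_error("can't copy impl: kernel is nullptr");
            _kernel = other._kernel->clone();
        }
    };

    int main() {
        reorder_impl a;
        a._kernel = std::make_shared<kernel>();
        reorder_impl b(a);                       // deep copy via clone()
        return b._kernel != a._kernel ? 0 : 1;   // distinct handles after copy
    }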
@@ -154,4 +134,3 @@ attach_generic_layer_impl::attach_generic_layer_impl() {
 } // namespace cldnn
 
 BIND_BINARY_BUFFER_WITH_TYPE(cldnn::ocl::generic_layer_impl)
-ASSIGN_TYPE_NAME(cldnn::ocl::generic_layer_cpu)
@@ -14,6 +14,7 @@
 #include "intel_gpu/primitives/eltwise.hpp"
 #include "intel_gpu/primitives/quantize.hpp"
 #include "intel_gpu/primitives/activation.hpp"
+#include "intel_gpu/primitives/generic_layer.hpp"
 #include "intel_gpu/primitives/primitive.hpp"
 
 #include "kernel_selector_params.h"
@@ -166,7 +167,7 @@ inline optional_params_t get_default_weights_bias_optional_params(const program&
 }
 
 inline kernel_selector::eltwise_mode convert_to_eltwise_mode(eltwise_mode mode) {
     switch (mode) {
         case eltwise_mode::sum:
             return kernel_selector::eltwise_mode::ADD;
         case eltwise_mode::sub:
@@ -269,4 +270,106 @@ inline kernel_impl_params canonicalize_fused_shapes(const kernel_impl_params& im
     return updated_impl_params;
 }
 
+class WeightsReorderParamsOCL : public WeightsReorderParams {
+public:
+    explicit WeightsReorderParamsOCL(const kernel_selector::WeightsReorderParams& params)
+        : WeightsReorderParams(from_weights_tensor(params.src), from_weights_tensor(params.dest)) {
+        cl_kernel = params.clKernel;
+    }
+
+    size_t hash() const override {
+        size_t seed = WeightsReorderParams::hash();
+
+        if (cl_kernel == nullptr)
+            return seed;
+
+        seed = hash_combine(seed, cl_kernel->skip_execution);
+
+        auto& gws = cl_kernel->params.workGroups.global;
+        seed = hash_range(seed, gws.begin(), gws.end());
+
+        auto& lws = cl_kernel->params.workGroups.local;
+        seed = hash_range(seed, lws.begin(), lws.end());
+
+        auto& arguments = cl_kernel->params.arguments;
+        for (auto& args : arguments) {
+            seed = hash_combine(seed, args.index);
+            seed = hash_combine(seed, args.t);
+        }
+
+        auto& scalars = cl_kernel->params.scalars;
+        for (auto& s : scalars) {
+            seed = hash_combine(seed, s.t);
+        }
+
+        return seed;
+    }
+
+    bool operator==(const WeightsReorderParams& rhs) const override {
+        if (typeid(*this) != typeid(rhs))
+            return false;
+
+        if (!WeightsReorderParams::operator==(rhs))
+            return false;
+
+        auto rhs_casted = downcast<const WeightsReorderParamsOCL>(rhs);
+
+        if (cl_kernel != nullptr && rhs_casted.cl_kernel != nullptr) {
+            auto& clKernel_rhs = rhs_casted.cl_kernel;
+            if (cl_kernel->skip_execution != clKernel_rhs->skip_execution)
+                return false;
+
+            auto& gws = cl_kernel->params.workGroups.global;
+            auto& gws_rhs = clKernel_rhs->params.workGroups.global;
+            if (gws != gws_rhs)
+                return false;
+
+            auto& lws = cl_kernel->params.workGroups.local;
+            auto& lws_rhs = clKernel_rhs->params.workGroups.local;
+            if (lws != lws_rhs)
+                return false;
+
+            auto& arguments = cl_kernel->params.arguments;
+            auto& arguments_rhs = clKernel_rhs->params.arguments;
+            if (arguments.size() != arguments_rhs.size())
+                return false;
+
+            for (size_t idx = 0; idx < arguments.size(); idx++) {
+                if (arguments[idx].index != arguments_rhs[idx].index)
+                    return false;
+
+                if (arguments[idx].t != arguments_rhs[idx].t)
+                    return false;
+            }
+
+            auto& scalars = cl_kernel->params.scalars;
+            auto& scalars_rhs = clKernel_rhs->params.scalars;
+            if (scalars.size() != scalars_rhs.size())
+                return false;
+
+            for (size_t idx = 0; idx < scalars.size(); idx++) {
+                if (scalars[idx].t != scalars_rhs[idx].t)
+                    return false;
+            }
+        }
+
+        return true;
+    }
+
+    std::shared_ptr<kernel_selector::clKernelData> get_cl_kernel() {
+        return cl_kernel;
+    }
+
+private:
+    std::shared_ptr<kernel_selector::clKernelData> cl_kernel;
+};
+
+inline std::shared_ptr<WeightsReorderParams> create_weights_reorder_params(const kernel_selector::WeightsReorderParams& params) {
+    if (params.engine == kernel_selector::generic_kernel_params::Engine::NONE) {
+        return nullptr;
+    }
+
+    return std::make_shared<WeightsReorderParamsOCL>(params);
+}
+
 } // namespace cldnn
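WeightsReorderParamsOCL::hash() folds the kernel's global/local work-group sizes, argument descriptors, and scalar types into the seed via hash_range. The idiom can be sketched as follows (the hash_combine body is an assumed boost-style mix, not necessarily the cldnn one):

    #include <cstddef>
    #include <vector>

    inline size_t hash_combine(size_t seed, size_t v) {
        return seed ^ (v + 0x9e3779b9 + (seed << 6) + (seed >> 2));
    }

    // Fold every element of a range into the running seed.
    template <typename It>
    size_t hash_range(size_t seed, It first, It last) {
        for (; first != last; ++first)
            seed = hash_combine(seed, static_cast<size_t>(*first));
        return seed;
    }

    int main() {
        std::vector<size_t> gws{256, 16, 1};  // e.g. a global work-group size
        std::vector<size_t> lws{16, 16, 1};   // e.g. a local work-group size
        size_t seed = 0;
        seed = hash_range(seed, gws.begin(), gws.end());
        seed = hash_range(seed, lws.begin(), lws.end());
        return seed != 0 ? 0 : 1;
    }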
@@ -40,7 +40,6 @@ struct typed_primitive_impl_ocl : public typed_primitive_impl<PType> {
 
     typed_primitive_impl_ocl() : _kernel_data({}), _cached_kernel_ids({}), _kernels({}) {
         _kernel_data.weightsReorderParams.engine = kernel_selector::generic_kernel_params::Engine::NONE;
-        _kernel_data.weightsReorderParams.cpuKernel = nullptr;
         _kernel_data.weightsReorderParams.clKernel = nullptr;
     }
 
@@ -57,11 +56,10 @@ struct typed_primitive_impl_ocl : public typed_primitive_impl<PType> {
     }
 
     typed_primitive_impl_ocl(const kernel_selector::kernel_data& kd)
-        : typed_primitive_impl<PType>(kd.weightsReorderParams, kd.kernelName),
+        : typed_primitive_impl<PType>(create_weights_reorder_params(kd.weightsReorderParams), kd.kernelName),
         _kernel_data(kd) {
         // weights reorder params got copied to parent, clear in _kernel_data to release shared ptr
         _kernel_data.weightsReorderParams.engine = kernel_selector::generic_kernel_params::Engine::NONE;
-        _kernel_data.weightsReorderParams.cpuKernel = nullptr;
         _kernel_data.weightsReorderParams.clKernel = nullptr;
 
         this->can_reuse_memory = _kernel_data.can_reuse_memory;
@@ -214,6 +212,21 @@ protected:
         }
     }
 
+    void set_arguments_impl(typed_primitive_inst<PType>& instance, kernel_arguments_data& args) override {
+        if (instance.can_be_optimized()) {
+            return;
+        }
+
+        stream& stream = instance.get_network().get_stream();
+
+        for (size_t k = 0; k < _kernels.size(); ++k) {
+            if (_kernel_data.kernels[k].skip_execution)
+                continue;
+
+            stream.set_arguments(*_kernels[k], _kernel_data.kernels[k].params, args);
+        }
+    }
+
     kernel_arguments_data get_arguments_impl(const typed_primitive_inst<PType>& instance) const override {
         for (size_t k = 0; k < _kernels.size(); ++k) {
             auto args = get_arguments(instance);
@@ -4,7 +4,6 @@
 
 #pragma once
 
-#include "generic_layer.hpp"
 #include "intel_gpu/primitives/activation.hpp"
 #include "intel_gpu/primitives/arg_max_min.hpp"
 #include "intel_gpu/primitives/batch_to_space.hpp"
@@ -29,6 +28,7 @@
 #include "intel_gpu/primitives/experimental_detectron_topk_rois.hpp"
 #include "intel_gpu/primitives/eye.hpp"
 #include "intel_gpu/primitives/fully_connected.hpp"
+#include "intel_gpu/primitives/generic_layer.hpp"
 #include "intel_gpu/primitives/gather.hpp"
 #include "intel_gpu/primitives/gather_elements.hpp"
 #include "intel_gpu/primitives/gather_nd.hpp"
@@ -8,7 +8,7 @@
 #include "primitive_onednn_base.h"
 #include "implementation_map.hpp"
 
-#include "kernel_selector_common.h"
+#include "impls/ocl/kernel_selector_helper.h"
 
 #include "utils.hpp"
 
@@ -158,6 +158,7 @@ protected:
 
         weights_reorder_params.engine = kernel_selector::WeightsReorderParams::Engine::GPU;
         weights_reorder_params.clKernel = std::make_shared<kernel_selector::clKernelData>(kernels_data[0].kernels[0]);
+        weights_reorder_params.src = r_params.input;
         weights_reorder_params.dest = r_params.output;
 
         return weights_reorder_params;
@@ -8,7 +8,7 @@
 #include "primitive_onednn_base.h"
 #include "implementation_map.hpp"
 
-#include "kernel_selector_common.h"
+#include "impls/ocl/kernel_selector_helper.h"
 
 #include <oneapi/dnnl/dnnl.hpp>
 
@@ -79,6 +79,7 @@ protected:
 
         weights_reorder_params.engine = kernel_selector::WeightsReorderParams::Engine::GPU;
         weights_reorder_params.clKernel = std::make_shared<kernel_selector::clKernelData>(kernels_data[0].kernels[0]);
+        weights_reorder_params.src = r_params.input;
         weights_reorder_params.dest = r_params.output;
 
         return weights_reorder_params;
@@ -6,7 +6,7 @@
 #include "primitive_onednn_base.h"
 #include "implementation_map.hpp"
 
-#include "kernel_selector_common.h"
+#include "impls/ocl/kernel_selector_helper.h"
 
 #include <oneapi/dnnl/dnnl.hpp>
 
@@ -91,6 +91,7 @@ protected:
 
         weights_reorder_params.engine = kernel_selector::WeightsReorderParams::Engine::GPU;
         weights_reorder_params.clKernel = std::make_shared<kernel_selector::clKernelData>(kernels_data[0].kernels[0]);
+        weights_reorder_params.src = r_params.input;
         weights_reorder_params.dest = r_params.output;
 
         return weights_reorder_params;
@@ -20,6 +20,7 @@
 
 #include "reorder/reorder_weights_kernel_selector.h"
 #include "reorder/reorder_kernel_base.h"
+#include "impls/ocl/kernel_selector_helper.h"
 
 #include <vector>
 #include <list>
@@ -46,7 +47,7 @@ struct typed_primitive_onednn_impl : public typed_primitive_impl<PType> {
                                std::shared_ptr<dnnl::primitive_attr> attrs,
                                const PrimDescType& pd,
                                kernel_selector::WeightsReorderParams weights_reorder = {})
-        : typed_primitive_impl<PType>(weights_reorder, pd.impl_info_str()),
+        : typed_primitive_impl<PType>(create_weights_reorder_params(weights_reorder), pd.impl_info_str()),
         _engine(&engine),
         _attrs(attrs),
         _pd(pd) {
@@ -1,143 +0,0 @@
-// Copyright (C) 2018-2023 Intel Corporation
-// SPDX-License-Identifier: Apache-2.0
-//
-
-#pragma once
-#include "intel_gpu/primitives/primitive.hpp"
-#include "intel_gpu/runtime/memory.hpp"
-
-// TODO: Remove OCL impl dependency here or move to OCL folder
-#include "impls/ocl/kernel_selector_helper.h"
-
-#include <vector>
-
-namespace cldnn {
-
-/// @brief Changes how data is ordered in memory. Value type is not changed & all information is preserved.
-/// @details Corresponding values are bitwise equal before/after reorder.
-/// Also merged with subtraction layer, which can subtract values while doing reordering.
-/// NOTE THAT THIS WILL SUBTRACT THE SAME VALUES FROM EACH BATCH.
-struct generic_layer : public primitive_base<generic_layer> {
-    CLDNN_DECLARE_PRIMITIVE(generic_layer)
-
-    /// @brief Constructs generic_layer primitive which takes mean subtract values from another primitive.
-    /// @param id This primitive id.
-    /// @param input Input primitive id.
-    /// @param output_layout Requested memory layout.
-    /// @param mean Primitive id to get mean subtract values.
-    generic_layer(const primitive_id& id,
-                  const primitive_id& input,
-                  const layout& output_layout,
-                  const kernel_selector::generic_kernel_params& generic_params,
-                  const padding& output_padding = padding())
-        : primitive_base(id, {input}, {output_padding}), output_layout(output_layout), generic_params(generic_params) {}
-
-    /// @brief Requested memory layout.
-    layout output_layout;
-    const kernel_selector::generic_kernel_params generic_params;
-
-    size_t hash() const override {
-        size_t seed = primitive::hash();
-        seed = hash_combine(seed, generic_params.engine);
-
-        if (generic_params.cpuKernel != nullptr) {
-            auto& cpuKernel = generic_params.cpuKernel;
-            seed = hash_combine(seed, cpuKernel->GetExpectedInputLayout());
-            seed = hash_combine(seed, cpuKernel->GetExpectedInputType());
-        }
-
-        if (generic_params.clKernel != nullptr) {
-            auto& clKernel = generic_params.clKernel;
-            seed = hash_combine(seed, clKernel->skip_execution);
-
-            auto& gws = clKernel->params.workGroups.global;
-            seed = hash_range(seed, gws.begin(), gws.end());
-
-            auto& lws = clKernel->params.workGroups.local;
-            seed = hash_range(seed, lws.begin(), lws.end());
-
-            auto& arguments = clKernel->params.arguments;
-            for (auto& args : arguments) {
-                seed = hash_combine(seed, args.index);
-                seed = hash_combine(seed, args.t);
-            }
-
-            auto& scalars = clKernel->params.scalars;
-            for (auto& s : scalars) {
-                seed = hash_combine(seed, s.t);
-            }
-
-            seed = hash_combine(seed, clKernel->code.kernelString->get_hash());
-        }
-        return seed;
-    }
-
-    bool operator==(const primitive& rhs) const override {
-        if (!compare_common_params(rhs))
-            return false;
-
-        auto rhs_casted = downcast<const generic_layer>(rhs);
-
-        if (generic_params.engine != rhs_casted.generic_params.engine)
-            return false;
-
-        if (generic_params.cpuKernel != nullptr) {
-            if (generic_params.cpuKernel->GetExpectedInputLayout() != rhs_casted.generic_params.cpuKernel->GetExpectedInputLayout())
-                return false;
-
-            if (generic_params.cpuKernel->GetExpectedInputType() != rhs_casted.generic_params.cpuKernel->GetExpectedInputType())
-                return false;
-        }
-
-        if (generic_params.clKernel != nullptr) {
-            auto& clKernel = generic_params.clKernel;
-            auto& clKernel_rhs = rhs_casted.generic_params.clKernel;
-            if (clKernel->skip_execution != clKernel_rhs->skip_execution)
-                return false;
-
-            auto& gws = clKernel->params.workGroups.global;
-            auto& gws_rhs = clKernel_rhs->params.workGroups.global;
-            if (gws != gws_rhs)
-                return false;
-
-            auto& lws = clKernel->params.workGroups.local;
-            auto& lws_rhs = clKernel_rhs->params.workGroups.local;
-            if (lws != lws_rhs)
-                return false;
-
-            auto& arguments = clKernel->params.arguments;
-            auto& arguments_rhs = clKernel_rhs->params.arguments;
-            if (arguments.size() != arguments_rhs.size())
-                return false;
-
-            for (size_t idx = 0; idx < arguments.size(); idx++) {
-                if (arguments[idx].index != arguments_rhs[idx].index)
-                    return false;
-
-                if (arguments[idx].t != arguments_rhs[idx].t)
-                    return false;
-            }
-
-            auto& scalars = clKernel->params.scalars;
-            auto& scalars_rhs = clKernel_rhs->params.scalars;
-            if (scalars.size() != scalars_rhs.size())
-                return false;
-
-            for (size_t idx = 0; idx < scalars.size(); idx++) {
-                if (scalars[idx].t != scalars_rhs[idx].t)
-                    return false;
-            }
-
-            if (clKernel->code.kernelString->get_str() != clKernel_rhs->code.kernelString->get_str())
-                return false;
-        }
-        return true;
-    }
-
-protected:
-    std::vector<std::reference_wrapper<const primitive_id>> get_dependencies() const override { return {}; }
-};
-/// @}
-/// @}
-/// @}
-} // namespace cldnn
@@ -3,7 +3,7 @@
 //
 
 #pragma once
-#include "generic_layer.hpp"
+#include "intel_gpu/primitives/generic_layer.hpp"
 #include "primitive_inst.h"
 
 #include <string>
@@ -31,12 +31,13 @@ class typed_primitive_inst<generic_layer> : public typed_primitive_inst_base<gen
 
 public:
     static layout calc_output_layout(generic_layer_node const& node, kernel_impl_params const& impl_param) {
-        return impl_param.typed_desc<generic_layer>()->output_layout;
+        return impl_param.typed_desc<generic_layer>()->params->get_output_layout();
     }
 
     static std::string to_string(generic_layer_node const& node);
 
     typed_primitive_inst(network& network, generic_layer_node const& node);
+    typed_primitive_inst(network& network);
 };
 
 using generic_layer_inst = typed_primitive_inst<generic_layer>;
@@ -130,4 +130,28 @@ public:
         return keys;
     }
 };
 
+struct WeightsReordersFactory {
+    using factory_type = std::function<std::unique_ptr<primitive_impl>(const kernel_impl_params&)>;
+    using map_type = singleton_map<std::pair<impl_types, shape_types>, factory_type>;
+    static void add(impl_types impl_type, shape_types shape_type, factory_type factory) {
+        OPENVINO_ASSERT(impl_type != impl_types::any, "[GPU] Can't register WeightsReordersFactory with type any");
+        map_type::instance().insert({{impl_type, shape_type}, factory});
+    }
+
+    static factory_type get(impl_types preferred_impl_type, shape_types target_shape_type) {
+        for (auto& kv : map_type::instance()) {
+            impl_types impl_type = kv.first.first;
+            shape_types supported_shape_type = kv.first.second;
+            if ((preferred_impl_type & impl_type) != impl_type)
+                continue;
+            if ((target_shape_type & supported_shape_type) != target_shape_type)
+                continue;
+
+            return kv.second;
+        }
+        OPENVINO_THROW("[GPU] WeightsReordersFactory doesn't have any implementation for "
+                       " impl_type: ", preferred_impl_type, ", shape_type: ", target_shape_type);
+    }
+};
 } // namespace cldnn
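WeightsReordersFactory is a plain registry keyed by (impl type, shape type): implementations register a creation function at static-initialization time (see the attach_generic_layer_impl hunk earlier, which registers generic_layer_impl::create), and callers fetch the first compatible entry. A simplified, self-contained sketch of the same pattern (every name below is an illustrative stand-in, not a cldnn type):

    #include <functional>
    #include <map>
    #include <memory>
    #include <stdexcept>
    #include <utility>

    enum class impl_t { ocl = 1, cpu = 2 };
    enum class shape_t { static_shape = 1, dynamic_shape = 2 };

    struct impl_base { virtual ~impl_base() = default; };
    struct params {};

    struct reorder_factory {
        using factory_fn = std::function<std::unique_ptr<impl_base>(const params&)>;
        static std::map<std::pair<impl_t, shape_t>, factory_fn>& registry() {
            static std::map<std::pair<impl_t, shape_t>, factory_fn> m;  // singleton map
            return m;
        }
        static void add(impl_t i, shape_t s, factory_fn f) { registry()[{i, s}] = std::move(f); }
        static factory_fn get(impl_t i, shape_t s) {
            auto it = registry().find({i, s});
            if (it == registry().end())
                throw std::runtime_error("no factory registered");
            return it->second;
        }
    };

    struct ocl_reorder_impl : impl_base {
        static std::unique_ptr<impl_base> create(const params&) {
            return std::make_unique<ocl_reorder_impl>();
        }
    };

    int main() {
        reorder_factory::add(impl_t::ocl, shape_t::static_shape, ocl_reorder_impl::create);
        auto impl = reorder_factory::get(impl_t::ocl, shape_t::static_shape)(params{});
        return impl ? 0 : 1;
    }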
@@ -7,21 +7,15 @@
 #include "intel_gpu/runtime/memory.hpp"
 #include "intel_gpu/runtime/engine.hpp"
 #include "intel_gpu/runtime/utils.hpp"
+#include "intel_gpu/runtime/lru_cache.hpp"
 
 #include "data_inst.h"
+#include "generic_layer_inst.h"
 #include "reorder_inst.h"
 #include "convolution_inst.h"
 #include "deconvolution_inst.h"
-#include "fully_connected_inst.h"
 #include "detection_output_inst.h"
 #include "binary_convolution_inst.h"
-#include "lstm_gemm_inst.h"
-#include "generic_layer.hpp"
-#include "non_max_suppression_inst.h"
-#include "region_yolo_inst.h"
-
-// TODO: add generic interface for weights_reorder_params and get rid of this dependency
-#include "impls/ocl/kernel_selector_helper.h"
 
 #include <vector>
 #include <memory>
@@ -52,10 +46,8 @@ public:
                 const layout& in_layout,
                 const layout& out_layout);
 
-    std::vector<std::pair<std::shared_ptr<primitive>, bool>> get_weights_reorder(
-        primitive_id input_id,
-        const layout& old_layout,
-        const kernel_selector::weights_reorder_params& reorder_params);
+    std::pair<std::shared_ptr<primitive>, bool> get_weights_reorder(primitive_id input_id,
+                                                                    std::shared_ptr<WeightsReorderParams> reorder_params);
 
private:
    struct cache_key {
@@ -5,6 +5,7 @@
 #pragma once
 #include "intel_gpu/primitives/primitive.hpp"
 #include "intel_gpu/primitives/concatenation.hpp"
+#include "intel_gpu/primitives/generic_layer.hpp"
 #include "intel_gpu/runtime/event.hpp"
 #include "intel_gpu/runtime/memory.hpp"
 #include "intel_gpu/runtime/lru_cache.hpp"
@@ -43,21 +44,22 @@ class typed_primitive_inst;
  */
 struct primitive_impl {
     primitive_impl() = default;
-    explicit primitive_impl(const kernel_selector::weights_reorder_params& params, std::string kernel_name = "", bool is_dynamic = false)
-        : _weights_reorder_params(params), _kernel_name(kernel_name), _is_dynamic(is_dynamic) {}
+    explicit primitive_impl(std::shared_ptr<WeightsReorderParams> params, std::string kernel_name = "", bool is_dynamic = false)
+        : _weights_reorder_params(params), _kernel_name(kernel_name), _is_dynamic(is_dynamic) {
+    }
     explicit primitive_impl(std::string kernel_name, bool is_dynamic = false) :
-        primitive_impl(kernel_selector::weights_reorder_params{}, kernel_name, is_dynamic) {}
+        primitive_impl(nullptr, kernel_name, is_dynamic) {}
     virtual ~primitive_impl() = default;
 
     virtual std::vector<layout> get_internal_buffer_layouts() const = 0;
     virtual void set_node_params(const program_node&) {}
     virtual std::string get_type() const = 0;
     virtual void set_arguments(primitive_inst& instance) = 0;
+    virtual void set_arguments(primitive_inst& instance, kernel_arguments_data& args) = 0;
     virtual kernel_arguments_data get_arguments(const primitive_inst& instance) const = 0;
     virtual event::ptr execute(const std::vector<event::ptr>& events, primitive_inst& instance) = 0;
     std::string get_kernel_name() const { return _kernel_name; }
-    // TODO: added a derived class for weights reordering (maybe for all static data reordering)
-    kernel_selector::weights_reorder_params _weights_reorder_params;
     // class typed_primitive_gpu_impl override this with return false;
     virtual bool is_cpu() const { return true; }
     virtual void init_kernels(const kernels_cache& kernels_cache, const kernel_impl_params& params) = 0;
@@ -94,7 +96,14 @@ struct primitive_impl {
     virtual void set_kernels(cldnn::kernels_cache::compiled_kernels kernels) {}
     virtual std::vector<kernel::ptr> get_kernels() { return {}; }
 
+    bool need_weights_reorder() const { return _weights_reorder_params != nullptr; }
+    std::shared_ptr<WeightsReorderParams> get_weights_reorder_params() const { return _weights_reorder_params; }
+    void reset_weights_reorder_params() { _weights_reorder_params = nullptr; }
+
+    std::shared_ptr<kernel_impl_params> get_weights_reorder_kernel_params() const;
+
 protected:
+    std::shared_ptr<WeightsReorderParams> _weights_reorder_params = nullptr;
     std::string _kernel_name;
     bool _is_dynamic = false;
 };
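These accessors replace direct access to the old public _weights_reorder_params field, with a null shared_ptr now meaning "no reorder needed". A hedged sketch of the consumer-side flow (stub types only; the real call sites live in post_optimize_weights above):

    #include <memory>

    // Stub mirroring the accessor surface added above (not the full primitive_impl).
    struct WeightsReorderParams {};

    struct primitive_impl_stub {
        bool need_weights_reorder() const { return _weights_reorder_params != nullptr; }
        std::shared_ptr<WeightsReorderParams> get_weights_reorder_params() const { return _weights_reorder_params; }
        void reset_weights_reorder_params() { _weights_reorder_params = nullptr; }
        std::shared_ptr<WeightsReorderParams> _weights_reorder_params = std::make_shared<WeightsReorderParams>();
    };

    int main() {
        primitive_impl_stub impl;
        if (impl.need_weights_reorder()) {
            auto params = impl.get_weights_reorder_params();
            // ... create or fetch a cached generic_layer reorder for `params` ...
            impl.reset_weights_reorder_params();  // drop the reference once the node exists
        }
        return impl.need_weights_reorder() ? 1 : 0;
    }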
@@ -151,6 +160,8 @@ public:
     const kernel_impl_params* get_impl_params() const { return _impl_params.get(); }
     // return pointer to const to prevent arbitrary 'execute' call -> use primitive_inst.execute() instead
     const primitive_impl* get_impl() const { return _impl.get(); }
+    primitive_impl* get_impl() { return _impl.get(); }
+    void set_impl(std::unique_ptr<primitive_impl> impl) { _impl = std::move(impl); }
 
     memory& input_memory(size_t index = 0) const {
         if (index >= inputs_memory_count())
@@ -418,11 +429,22 @@ private:
         return set_arguments_impl(reinterpret_cast<typed_primitive_inst<PType>&>(instance));
     }

+    void set_arguments(primitive_inst& instance, kernel_arguments_data& args) override {
+        OPENVINO_ASSERT(instance.type() == PType::type_id(), "[GPU] Implementation type ", instance.type(),
+                        " does not match primitive type ", PType::type_id());
+        if (instance.get_impl() != this)
+            throw std::invalid_argument(
+                "Trying to set_arguments for primitive implementation with mismatching primitive instance");
+
+        return set_arguments_impl(reinterpret_cast<typed_primitive_inst<PType>&>(instance), args);
+    }
+
     kernel_arguments_data get_arguments(const primitive_inst& instance) const override {
         return get_arguments_impl(reinterpret_cast<const typed_primitive_inst<PType>&>(instance));
     }

     virtual void set_arguments_impl(typed_primitive_inst<PType>& /*instance*/) {}
+    virtual void set_arguments_impl(typed_primitive_inst<PType>& /*instance*/, kernel_arguments_data& /*args*/) {}
     virtual kernel_arguments_data get_arguments_impl(const typed_primitive_inst<PType>& /*instance*/) const {
         kernel_arguments_data args;
         return args;
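The typed set_arguments overload added above guards its reinterpret_cast with a type check. A self-contained sketch of the same guarded-downcast pattern in plain C++ (all names hypothetical); the guard has to run first, because the cast itself can never fail at runtime:

```cpp
#include <stdexcept>
#include <string>

struct instance_base {
    virtual std::string type() const = 0;
    virtual ~instance_base() = default;
};

template <typename P>
struct typed_instance : instance_base {
    std::string type() const override { return P::type_id(); }
};

template <typename P>
void set_arguments(instance_base& inst) {
    if (inst.type() != P::type_id())  // validate before the unchecked downcast
        throw std::invalid_argument("implementation/instance type mismatch");
    auto& typed = static_cast<typed_instance<P>&>(inst);
    (void)typed;  // the real code forwards to set_arguments_impl(typed, args)
}

struct fc { static std::string type_id() { return "fully_connected"; } };

int main() {
    typed_instance<fc> inst;
    set_arguments<fc>(inst);  // passes the guard; a foreign instance type would throw
}
```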
@@ -13,11 +13,12 @@
 #include "reshape_inst.h"
 #include "arg_max_min_inst.h"
 #include "shape_of_inst.h"
-#include "generic_layer.hpp"
 #include <sstream>

 #include "gemm_inst.h"
 #include "deconvolution_inst.h"
+#include "fully_connected_inst.h"
+#include "non_max_suppression_inst.h"
 #include "eltwise_inst.h"
 #include "pooling_inst.h"
 #include "reduce_inst.h"
@@ -155,50 +156,26 @@ std::pair<std::shared_ptr<reorder>, bool> reorder_factory::get_reorder(primitive
     return std::make_pair(reorder, false);
 }

-std::vector<std::pair<std::shared_ptr<primitive>, bool>> reorder_factory::get_weights_reorder(
-    primitive_id input_id,
-    const layout& old_layout,
-    const kernel_selector::weights_reorder_params& reorder_params) {
-
-    if (reorder_params.engine == kernel_selector::weights_reorder_params::Engine::NONE)
+std::pair<std::shared_ptr<primitive>, bool> reorder_factory::get_weights_reorder(primitive_id input_id,
+                                                                                 std::shared_ptr<WeightsReorderParams> reorder_params) {
+    if (reorder_params == nullptr)
         return {};

-    std::vector<std::pair<std::shared_ptr<primitive>, bool>> ret;
-
-    if (reorder_params.engine == kernel_selector::weights_reorder_params::Engine::CPU &&
-        reorder_params.cpuKernel != nullptr) {
-        const auto intermediate_format = from_weights_layout(reorder_params.cpuKernel->GetExpectedInputLayout());
-        const auto intermediate_type = from_weights_type(reorder_params.cpuKernel->GetExpectedInputType());
-        if (intermediate_format != old_layout.format || intermediate_type != old_layout.data_type) {
-            const layout intermediate_layout = { intermediate_type,
-                                                 intermediate_format,
-                                                 old_layout.get_tensor().transform(intermediate_format, 1) };
-
-            auto reorder = get_reorder(input_id, old_layout, intermediate_layout);
-            if (reorder.first) {
-                ret.push_back(reorder);
-                input_id = reorder.first->id;
-            }
-        }
-    }
-
-    layout expected_layout = from_weights_tensor(reorder_params.dest);
+    layout expected_layout = reorder_params->get_output_layout();

     cache_key ckey{ input_id, expected_layout, false };
     auto itr = _cached_generic_reorders.find(ckey);
     if (itr != _cached_generic_reorders.end()) {
-        ret.push_back(std::make_pair(itr->second, true));
+        return std::make_pair(itr->second, true);
     } else {
         auto count = _cached_generic_reorders.size();
         std::stringstream ss;
         ss << input_id << "_generic_layer_" << count;

-        auto reorder = std::make_shared<cldnn::generic_layer>(ss.str(), input_id, expected_layout, reorder_params);
+        auto reorder = std::make_shared<cldnn::generic_layer>(ss.str(), input_id, reorder_params);
         _cached_generic_reorders[ckey] = reorder;
-        ret.push_back(std::make_pair(reorder, false));
+        return std::make_pair(reorder, false);
     }
-
-    return ret;
 }

 bool layout_optimizer::is_format_supported(program_node& node, format::type fmt) {
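get_weights_reorder now returns a single (primitive, was_reused) pair instead of a vector that could also carry an intermediate CPU-side reorder. What remains is a plain find-or-create memoization; a self-contained sketch under simplified stand-in types (not cldnn's):

```cpp
#include <map>
#include <memory>
#include <string>
#include <utility>

struct reorder_prim { std::string id; };

// The key mirrors the role of cache_key above: input id plus a layout signature.
using key_t = std::pair<std::string, std::string>;

struct reorder_factory_sketch {
    std::map<key_t, std::shared_ptr<reorder_prim>> cached;

    std::pair<std::shared_ptr<reorder_prim>, bool> get_weights_reorder(const std::string& input_id,
                                                                       const std::string& layout_sig) {
        key_t key{input_id, layout_sig};
        auto it = cached.find(key);
        if (it != cached.end())
            return {it->second, true};  // reused: the caller must not add the primitive again
        auto prim = std::make_shared<reorder_prim>();
        prim->id = input_id + "_generic_layer_" + std::to_string(cached.size());
        cached[key] = prim;
        return {prim, false};           // newly created
    }
};
```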
@@ -337,11 +337,6 @@ network::network(program::ptr program, const ExecutionConfig& config, stream::pt
     build_exec_order();
     validate_primitives();
     add_default_output_chains();
-
-    if (is_dynamic()) {
-        GPU_DEBUG_DEFINE_MEM_LOGGER("dynamic_network_initialization");
-        _in_mem_kernels_cache = std::unique_ptr<KernelsCache>(new KernelsCache(_in_mem_kernels_cache_capacity));
-    }
 }

 network::network(engine& engine,
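With reorder implementations now living in the program-wide implementations cache, the per-network kernels cache that this constructor used to allocate for dynamic networks became redundant. Both caches share one idea, a bounded map from a hashed key to a compiled object; a minimal LRU sketch of that idea (illustrative, not the cldnn implementation):

```cpp
#include <cstddef>
#include <list>
#include <unordered_map>
#include <utility>

template <typename V>
class lru_cache {
    using entry = std::pair<size_t, V>;
    size_t _capacity;
    std::list<entry> _order;  // most recently used at the front
    std::unordered_map<size_t, typename std::list<entry>::iterator> _index;

public:
    explicit lru_cache(size_t capacity) : _capacity(capacity) {}

    bool has(size_t key) const { return _index.count(key) != 0; }

    void add(size_t key, V value) {
        auto it = _index.find(key);
        if (it != _index.end()) {      // drop a stale entry for the same key
            _order.erase(it->second);
            _index.erase(it);
        }
        _order.emplace_front(key, std::move(value));
        _index[key] = _order.begin();
        if (_order.size() > _capacity) {  // evict the least recently used entry
            _index.erase(_order.back().first);
            _order.pop_back();
        }
    }

    V* get(size_t key) {
        auto it = _index.find(key);
        if (it == _index.end())
            return nullptr;
        _order.splice(_order.begin(), _order, it->second);  // refresh recency
        return &it->second->second;
    }
};
```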
@@ -537,7 +532,8 @@ void network::save(cldnn::BinaryOutputBuffer& ob) {
     kernels_cache.reset();
     for (const auto& p_inst : _exec_order) {
         if (p_inst->get_impl() != nullptr) {
-            kernels_cache.add_to_cached_kernels(p_inst->get_impl()->get_kernels());
+            auto const_impl = static_cast<const primitive_impl*>(p_inst->get_impl());
+            kernels_cache.add_to_cached_kernels(const_impl->get_kernels());
         }
     }
     ob << kernels_cache;
@@ -11,11 +11,13 @@
 #include "fully_connected_inst.h"
 #include "convolution_inst.h"
 #include "crop_inst.h"
+#include "eltwise_inst.h"
 #include "deconvolution_inst.h"
 #include "shape_of_inst.h"
 #include "gemm_inst.h"
 #include "experimental_detectron_roi_feature_extractor_inst.hpp"
 #include "compilation_context.hpp"
+#include "implementation_map.hpp"

 #include "intel_gpu/plugin/common_utils.hpp"
 #include "intel_gpu/graph/network.hpp"

@@ -93,6 +95,19 @@ bool is_any_user_cpu(const std::list<const program_node*>& users) {
     return false;
 }

+std::shared_ptr<kernel_impl_params> primitive_impl::get_weights_reorder_kernel_params() const {
+    if (!need_weights_reorder())
+        return nullptr;
+
+    auto reorder_kernel_params = std::make_shared<kernel_impl_params>();
+    auto prim = std::make_shared<generic_layer>("", "", _weights_reorder_params);
+    reorder_kernel_params->desc = prim;
+    reorder_kernel_params->unique_id = _weights_reorder_params->hash();
+    reorder_kernel_params->input_layouts.push_back(_weights_reorder_params->get_input_layout());
+    reorder_kernel_params->output_layouts.push_back(_weights_reorder_params->get_output_layout());
+    return reorder_kernel_params;
+}
+
 kernel_impl_params primitive_impl::static_canonicalize_shapes(const kernel_impl_params& impl_params) {
     auto updated_impl_params = canonicalize_fused_shapes(impl_params);
|
|||||||
return nullptr;
|
return nullptr;
|
||||||
|
|
||||||
auto& engine = _network.get_engine();
|
auto& engine = _network.get_engine();
|
||||||
auto& weights_params = _impl->_weights_reorder_params;
|
auto reorder_kernel_params = _impl->get_weights_reorder_kernel_params();
|
||||||
|
|
||||||
auto weights_idx = _node->get_primitive()->input.size();
|
auto weights_idx = _node->get_primitive()->input.size();
|
||||||
auto original_weights_memory = dep_memory_ptr(weights_idx);
|
auto original_weights_memory = dep_memory_ptr(weights_idx);
|
||||||
auto original_layout = original_weights_memory->get_layout();
|
auto original_layout = original_weights_memory->get_layout();
|
||||||
|
|
||||||
if (weights_params.engine == kernel_selector::GenericKernelParams::Engine::NONE) {
|
if (!reorder_kernel_params) {
|
||||||
// If kernel doesn't says that it doesn't require weights reorder, but weights were reordered previously, then
|
// If kernel doesn't says that it doesn't require weights reorder, but weights were reordered previously, then
|
||||||
// incorrect memory buffer may be assigned, so reset cached weights for such case
|
// incorrect memory buffer may be assigned, so reset cached weights for such case
|
||||||
_reordered_weights_cache.add(original_layout, original_weights_memory);
|
_reordered_weights_cache.add(original_layout, original_weights_memory);
|
||||||
_impl_params->weights_layout = optional_layout(original_layout);
|
_impl_params->weights_layout = optional_layout(original_layout);
|
||||||
} else {
|
} else {
|
||||||
auto expected_layout = from_weights_tensor(weights_params.dest);
|
auto expected_layout = reorder_kernel_params->get_output_layout();
|
||||||
// Set original patrial shape, because it may be lost during kernel_selector::weights_tensor -> layout conversion
|
// Set original patrial shape, because it may be lost during kernel_selector::weights_tensor -> layout conversion
|
||||||
expected_layout.set_partial_shape(original_layout.get_partial_shape());
|
expected_layout.set_partial_shape(original_layout.get_partial_shape());
|
||||||
_impl_params->weights_layout = optional_layout(expected_layout);
|
_impl_params->weights_layout = optional_layout(expected_layout);
|
||||||
@ -816,30 +831,27 @@ event::ptr primitive_inst::update_weights() {
|
|||||||
return nullptr;
|
return nullptr;
|
||||||
} else {
|
} else {
|
||||||
GPU_DEBUG_PROFILED_STAGE_CACHE_HIT(false);
|
GPU_DEBUG_PROFILED_STAGE_CACHE_HIT(false);
|
||||||
auto get_kernel_key = [&]() -> size_t {
|
auto& cache = get_network().get_program()->get_implementations_cache();
|
||||||
auto seed = _node->get_primitive()->hash();
|
auto reorder_inst = std::make_shared<generic_layer_inst>(get_network());
|
||||||
seed = hash_combine(seed, expected_layout.hash());
|
|
||||||
seed = hash_combine(seed, original_layout.hash());
|
|
||||||
return seed;
|
|
||||||
};
|
|
||||||
|
|
||||||
cldnn::kernel::ptr kernel = nullptr;
|
if (auto cached_impl = cache.get(*reorder_kernel_params)) {
|
||||||
auto kernel_key = get_kernel_key();
|
|
||||||
auto& cache = get_network().get_in_mem_kernels_cache();
|
|
||||||
if (cache.has(kernel_key)) {
|
|
||||||
GPU_DEBUG_TRACE_DETAIL << id() << ": reorder weights (cached) from " << original_layout.to_short_string()
|
GPU_DEBUG_TRACE_DETAIL << id() << ": reorder weights (cached) from " << original_layout.to_short_string()
|
||||||
<< " to " << expected_layout.to_short_string() << std::endl;
|
<< " to " << expected_layout.to_short_string() << std::endl;
|
||||||
GPU_DEBUG_PROFILED_STAGE_CACHE_HIT(true);
|
reorder_inst->set_impl(cached_impl->clone());
|
||||||
kernel = cache.get(kernel_key);
|
|
||||||
} else {
|
} else {
|
||||||
GPU_DEBUG_TRACE_DETAIL << id() << ": reorder weights from " << original_layout.to_short_string()
|
GPU_DEBUG_TRACE_DETAIL << id() << ": reorder weights from " << original_layout.to_short_string()
|
||||||
<< " to " << expected_layout.to_short_string() << std::endl;
|
<< " to " << expected_layout.to_short_string() << std::endl;
|
||||||
|
|
||||||
|
auto factory = WeightsReordersFactory::get(impl_types::ocl, shape_types::static_shape);
|
||||||
|
auto reorder_impl = factory(*reorder_kernel_params);
|
||||||
auto& kernels_cache = get_network().get_program()->get_kernels_cache();
|
auto& kernels_cache = get_network().get_program()->get_kernels_cache();
|
||||||
auto kernels = kernels_cache.compile(*_impl_params, {weights_params.clKernel->code.kernelString});
|
auto kernels = kernels_cache.compile(*_impl_params, reorder_impl->get_kernels_source());
|
||||||
OPENVINO_ASSERT(kernels.size() == 1, "The output of kernel compile has issue");
|
OPENVINO_ASSERT(kernels.size() == 1, "[GPU] Expected number of compiled kernels is 1, but got ", kernels.size());
|
||||||
auto& kernel_data = kernels.begin()->second;
|
reorder_impl->set_kernels(kernels);
|
||||||
kernel = kernel_data[0].first;
|
|
||||||
cache.add(kernel_key, kernel);
|
reorder_inst->set_impl(reorder_impl->clone());
|
||||||
|
|
||||||
|
cache.add(*reorder_kernel_params, reorder_impl->clone());
|
||||||
}
|
}
|
||||||
|
|
||||||
auto& stream = get_network().get_stream();
|
auto& stream = get_network().get_stream();
|
||||||
@ -867,8 +879,10 @@ event::ptr primitive_inst::update_weights() {
|
|||||||
kernel_arguments_data args;
|
kernel_arguments_data args;
|
||||||
args.inputs.push_back(original_weights_memory);
|
args.inputs.push_back(original_weights_memory);
|
||||||
args.outputs.push_back(weights_memory);
|
args.outputs.push_back(weights_memory);
|
||||||
stream.set_arguments(*kernel, weights_params.clKernel->params, args);
|
|
||||||
auto ev = stream.enqueue_kernel(*kernel, weights_params.clKernel->params, args, {}, true);
|
auto reorder_impl = reorder_inst->get_impl();
|
||||||
|
reorder_impl->set_arguments(*reorder_inst, args);
|
||||||
|
auto ev = reorder_impl->execute({}, *reorder_inst);
|
||||||
|
|
||||||
GPU_DEBUG_GET_INSTANCE(debug_config);
|
GPU_DEBUG_GET_INSTANCE(debug_config);
|
||||||
GPU_DEBUG_IF(!debug_config->dump_profiling_data.empty()) {
|
GPU_DEBUG_IF(!debug_config->dump_profiling_data.empty()) {
|
||||||
|
@@ -67,30 +67,21 @@ struct clKernelData {
     bool skip_execution = false;
 };

-////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-// CPUKernel
-////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-struct CPUKernel {
-    virtual WeightsType GetExpectedInputType() = 0;
-    virtual WeightsLayout GetExpectedInputLayout() const { return WeightsLayout::oiyx; }
-    virtual void Execute(void* input, size_t input_size, void* output, size_t output_size) const = 0;
-};
-
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 // GenericKernelParams
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 struct GenericKernelParams {
-    enum class Engine { NONE, CPU, GPU };
+    enum class Engine { NONE, GPU };

     Engine engine = Engine::NONE;
     std::shared_ptr<clKernelData> clKernel;
-    std::shared_ptr<CPUKernel> cpuKernel;
 };

 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 // WeightsReorderParams
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 struct WeightsReorderParams : public GenericKernelParams {
+    WeightsTensor src;
     WeightsTensor dest;
 };
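Editor's note: with the generic CPU reorder path deleted above, GenericKernelParams::Engine collapses to { NONE, GPU }, and the new src tensor records the reorder's expected input explicitly, where the removed CPUKernel used to supply it indirectly via GetExpectedInputLayout()/GetExpectedInputType().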
@@ -133,6 +133,7 @@ bool UpdateWeightsParams(weight_bias_params& newParams,

     weightsReorderParams.engine = WeightsReorderParams::Engine::GPU;
     weightsReorderParams.clKernel = std::make_shared<clKernelData>(kernels_data[0].kernels[0]);
+    weightsReorderParams.src = r_params.input;
     weightsReorderParams.dest = r_params.output;

     newParams.weights = newParams.weights.TransformIgnorePadding(reqLayout, dtype, groups);
@@ -0,0 +1,111 @@
+// Copyright (C) 2023 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "test_utils.h"
+
+#include "intel_gpu/graph/network.hpp"
+#include "intel_gpu/graph/program.hpp"
+#include "intel_gpu/primitives/input_layout.hpp"
+#include "intel_gpu/primitives/data.hpp"
+
+#include "generic_layer_inst.h"
+#include "fully_connected_inst.h"
+#include "implementation_map.hpp"
+#include "graph/impls/ocl/register.hpp"
+
+#include <memory>
+
+using namespace cldnn;
+using namespace ::tests;
+
+TEST(weights_factory, impl_types) {
+    program::init_primitives();
+    ASSERT_NO_THROW(WeightsReordersFactory::get(impl_types::ocl, shape_types::static_shape));
+    ASSERT_NO_THROW(WeightsReordersFactory::get(impl_types::any, shape_types::static_shape));
+
+    ASSERT_ANY_THROW(WeightsReordersFactory::get(impl_types::cpu, shape_types::static_shape));
+    ASSERT_ANY_THROW(WeightsReordersFactory::get(impl_types::onednn, shape_types::static_shape));
+}
+
+TEST(weights_factory, shape_types) {
+    program::init_primitives();
+    ASSERT_NO_THROW(WeightsReordersFactory::get(impl_types::ocl, shape_types::static_shape));
+
+    ASSERT_ANY_THROW(WeightsReordersFactory::get(impl_types::ocl, shape_types::dynamic_shape));
+}
+
+TEST(weights_factory, reorder_test) {
+    auto& engine = get_test_engine();
+    const int input_f = 32, output_f = 32;
+
+    auto weights_layout = layout(ov::PartialShape{ output_f, input_f }, data_types::f32, format::bfyx);
+    auto weights_data_input = engine.allocate_memory(weights_layout);
+    auto weights_data_vec = generate_random_1d<float>(output_f * input_f, -1, 1);
+    set_values(weights_data_input, weights_data_vec);
+
+    cldnn::topology topology {
+        input_layout("input", layout{ ov::PartialShape{ -1, input_f }, data_types::f32, format::bfyx }),
+        data("weights", weights_data_input),
+        fully_connected("fc", input_info("input"), "weights")
+    };
+
+    ov::intel_gpu::ImplementationDesc fc_impl_desc = { format::bfyx, "fully_connected_gpu_bf_tiled", impl_types::ocl };
+    ExecutionConfig config = get_test_default_config(engine);
+    config.set_property(ov::intel_gpu::optimize_data(true));
+    config.set_property(ov::intel_gpu::force_implementations(ov::intel_gpu::ImplForcingMap{ {"fc", fc_impl_desc} }));
+    config.set_property(ov::intel_gpu::allow_new_shape_infer(true));
+    cldnn::network network(engine, topology, config);
+
+    auto inst = network.get_primitive("fc");
+    auto impl = inst->get_impl();
+    ASSERT_TRUE(impl != nullptr);
+
+    // Get the required WeightsReorderParams
+    auto weights_reorder_params = impl->get_weights_reorder_params();
+    ASSERT_TRUE(weights_reorder_params != nullptr);
+
+    // Construct kernel_impl_params for the weights reorder based on the requested WeightsReorderParams
+    auto reorder_kernel_params = std::make_shared<kernel_impl_params>();
+    reorder_kernel_params->desc = std::make_shared<generic_layer>("weights_reorder", "", weights_reorder_params);
+    reorder_kernel_params->unique_id = weights_reorder_params->hash();
+    reorder_kernel_params->input_layouts.push_back(weights_reorder_params->get_input_layout());
+    reorder_kernel_params->output_layouts.push_back(weights_reorder_params->get_output_layout());
+
+    // Create a new generic_layer_impl
+    auto factory = WeightsReordersFactory::get(impl_types::ocl, shape_types::static_shape);
+    auto reorder_impl = factory(*reorder_kernel_params);
+    ASSERT_TRUE(reorder_impl != nullptr);
+
+    // Compile the kernel
+    auto& kernel_cache = network.get_program()->get_kernels_cache();
+    auto kernels = kernel_cache.compile(*reorder_kernel_params, reorder_impl->get_kernels_source());
+    ASSERT_TRUE(kernels.size() == 1);
+    reorder_impl->set_kernels(kernels);
+
+    // Allocate memory and execute the generic_layer
+    auto output_weights_layout = weights_reorder_params->get_output_layout();
+    auto weights_data_output = engine.allocate_memory({ output_weights_layout });
+
+    kernel_arguments_data args;
+    args.inputs.push_back(weights_data_input);
+    args.outputs.push_back(weights_data_output);
+
+    auto reorder_inst = std::make_shared<generic_layer_inst>(network);
+    reorder_inst->set_impl(reorder_impl->clone());
+
+    reorder_inst->get_impl()->set_arguments(*reorder_inst, args);
+    reorder_inst->get_impl()->execute({}, *reorder_inst);
+
+    network.get_stream().finish();
+
+    // Compare with the expected results
+    cldnn::mem_lock<float> output_ptr(weights_data_output, get_test_stream());
+    for (int o = 0; o < output_f; o++) {
+        for (int i = 0; i < input_f; i++) {
+            auto tensor_coord = tensor(std::vector<tensor::value_type>{o, i}, 0);
+            size_t input_idx = output_weights_layout.get_linear_offset(tensor_coord);
+            ASSERT_EQ(weights_data_vec[o * input_f + i], output_ptr[input_idx]);
+        }
+    }
+}
|
|||||||
#include <intel_gpu/primitives/fully_connected.hpp>
|
#include <intel_gpu/primitives/fully_connected.hpp>
|
||||||
#include <intel_gpu/primitives/data.hpp>
|
#include <intel_gpu/primitives/data.hpp>
|
||||||
|
|
||||||
|
#include "fully_connected_inst.h"
|
||||||
#include "compilation_context.hpp"
|
#include "compilation_context.hpp"
|
||||||
|
|
||||||
#include "program_wrapper.h"
|
#include "program_wrapper.h"
|
||||||
|
@ -10,6 +10,7 @@
|
|||||||
#include <intel_gpu/primitives/fully_connected.hpp>
|
#include <intel_gpu/primitives/fully_connected.hpp>
|
||||||
#include <intel_gpu/primitives/gather.hpp>
|
#include <intel_gpu/primitives/gather.hpp>
|
||||||
#include <intel_gpu/primitives/permute.hpp>
|
#include <intel_gpu/primitives/permute.hpp>
|
||||||
|
#include <intel_gpu/primitives/generic_layer.hpp>
|
||||||
|
|
||||||
using namespace cldnn;
|
using namespace cldnn;
|
||||||
using namespace ::tests;
|
using namespace ::tests;
|
||||||
@ -109,3 +110,19 @@ TEST(primitive_comparison, permute) {
|
|||||||
ASSERT_EQ(permute_prim, permute_prim_eq);
|
ASSERT_EQ(permute_prim, permute_prim_eq);
|
||||||
ASSERT_NE(permute_prim, permute_prim_order);
|
ASSERT_NE(permute_prim, permute_prim_order);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
TEST(primitive_comparison, generic_layer) {
|
||||||
|
auto shape = ov::PartialShape{1, 2, 3, 4};
|
||||||
|
auto data_type = data_types::f32;
|
||||||
|
auto format_in = format::bfyx;
|
||||||
|
auto format_out = format::os_iyx_osv16;
|
||||||
|
|
||||||
|
auto input_layout = layout{shape, data_type, format_in};
|
||||||
|
auto output_layout = layout{shape, data_type, format_out};
|
||||||
|
auto generic_layer_prim = generic_layer("generic_layer", "", std::make_shared<WeightsReorderParams>(input_layout, output_layout));
|
||||||
|
auto generic_layer_eq_prim = generic_layer("generic_layer_eq", "", std::make_shared<WeightsReorderParams>(input_layout, output_layout));
|
||||||
|
auto generic_layer_different_prim = generic_layer("generic_layer", "", std::make_shared<WeightsReorderParams>(output_layout, input_layout));
|
||||||
|
|
||||||
|
ASSERT_EQ(generic_layer_prim, generic_layer_eq_prim);
|
||||||
|
ASSERT_NE(generic_layer_prim, generic_layer_different_prim);
|
||||||
|
}
|
||||||
|
@@ -13,6 +13,7 @@
 #include "reshape_inst.h"
 #include "reorder_inst.h"
 #include "broadcast_inst.h"
+#include "fully_connected_inst.h"
 #include "pass_manager.h"
 #include "to_string_utils.h"

@@ -12,6 +12,7 @@
 #include "eltwise_inst.h"
 #include "reduce_inst.h"
 #include "reshape_inst.h"
+#include "fully_connected_inst.h"
 #include "gemm_inst.h"
 #include "convolution_inst.h"
 #include "pass_manager.h"
@@ -2351,3 +2351,60 @@ INSTANTIATE_TEST_SUITE_P(
         ::testing::Values(2, 9, 16, 32, 64, 128),
         ::testing::Values(false, true))
 );
+
+TEST(fully_connected_gpu, has_cached_weights_reorder) {
+    auto& engine = get_test_engine();
+
+    const int32_t input_f = 3, input_b = 1, weight_b = 4;
+
+    auto input_dyn_layout = layout{ ov::PartialShape{ ov::Dimension(1, 10), input_f }, data_types::f32, format::bfyx };
+    auto input_data = engine.allocate_memory(layout{ ov::PartialShape{ input_b, input_f }, data_types::f32, format::bfyx });
+    auto weights_data = engine.allocate_memory({ ov::PartialShape{ weight_b, input_f }, data_types::f32, format::bfyx });
+
+    set_values(input_data, { -0.5f, 2.0f, 0.5f });
+    set_values(weights_data, { 1.5f, 1.0f, 0.5f, -1.0f, 0.0f, 0.5f, 0.5f, -0.5f, -2.0f, -0.5f, 1.0f, 1.5f });
+
+    cldnn::topology topology{
+        input_layout("input", input_dyn_layout),
+        data("weights", weights_data),
+        fully_connected("fc", input_info("input"), "weights")
+    };
+
+    ov::intel_gpu::ImplementationDesc fc_impl_desc = { format::bfyx, "fully_connected_gpu_bf_tiled", impl_types::ocl };
+    ExecutionConfig config = get_test_default_config(engine);
+    config.set_property(ov::intel_gpu::optimize_data(true));
+    config.set_property(ov::intel_gpu::force_implementations(ov::intel_gpu::ImplForcingMap{ {"fc", fc_impl_desc} }));
+    config.set_property(ov::intel_gpu::allow_new_shape_infer(true));
+    network network(engine, topology, config);
+    network.set_input_data("input", input_data);
+
+    auto outputs = network.execute();
+    ASSERT_EQ(outputs.size(), size_t(1));
+    ASSERT_EQ(outputs.begin()->first, "fc");
+
+    auto output_prim_mem = outputs.begin()->second.get_memory();
+
+    auto inst = network.get_primitive("fc");
+    auto impl = inst->get_impl();
+    ASSERT_TRUE(impl != nullptr);
+    ASSERT_TRUE(impl->is_dynamic());
+
+    auto reorder_kernel_params = impl->get_weights_reorder_kernel_params();
+    ASSERT_TRUE(reorder_kernel_params != nullptr);
+    auto reorder_impl = network.get_program()->get_implementations_cache().get(*reorder_kernel_params);
+    ASSERT_TRUE(reorder_impl != nullptr);
+
+    auto out_l = network.get_output_layout(outputs.begin()->first);
+    ASSERT_EQ(output_prim_mem->get_layout().batch(), align_to(input_b, 8)); // fake alignment
+    ASSERT_EQ(out_l.batch(), input_b);
+    ASSERT_EQ(out_l.feature(), weight_b);
+    ASSERT_EQ(out_l.spatial(0), 1);
+    ASSERT_EQ(out_l.spatial(1), 1);
+
+    cldnn::mem_lock<float> output_ptr(output_prim_mem, get_test_stream());
+
+    ASSERT_EQ(1.5f, output_ptr[0]);
+    ASSERT_EQ(0.75f, output_ptr[1]);
+    ASSERT_EQ(-2.25f, output_ptr[2]);
+    ASSERT_EQ(3.0f, output_ptr[3]);
+}
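This test closes the loop on the caching itself: after one execution of a dynamic fully_connected network, the weights reorder implementation must be retrievable from the program-level implementations cache via the impl's own reorder kernel params, and the numeric results must be unaffected.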
@@ -14,10 +14,10 @@
 #include <intel_gpu/primitives/mvn.hpp>
 #include <intel_gpu/primitives/permute.hpp>
 #include <intel_gpu/primitives/reshape.hpp>
+#include <intel_gpu/primitives/quantize.hpp>


-#include "eltwise_inst.h"
-// #include "fully_connected_inst.h"
+#include "primitive_inst.h"

 using namespace cldnn;
 using namespace tests;