[GPU] Dynamic FC unit test and several fixes (#12563)

commit 382028e9c2 (parent 5a8f5b630b)
@@ -101,6 +101,12 @@ layout fully_connected_inst::calc_output_layout(fully_connected_node const& node
         output_type = impl_param.get_fused_output_layout().data_type;
     }
 
+    if (input_layout.is_dynamic()) {
+        auto rank = input_layout.get_rank();
+        format output_format = format::get_default_format(rank);
+        return layout(ov::PartialShape::dynamic(rank), output_type, output_format);
+    }
+
     auto output_size = tensor(input_layout.batch(), weights_layout.batch(), 1, 1);
     if (desc->input_size == 3) {
         output_size = tensor(input_layout.batch(), input_layout.feature(), 1, weights_layout.batch());
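The hunk above is the heart of the dynamic-shape handling in this change: when the fully connected input layout is dynamic, calc_output_layout stops trying to compute concrete dimensions and returns a layout that pins down only the rank, element type, and default format. A minimal standalone sketch of the same early-exit pattern, using only ov::PartialShape from the OpenVINO core (illustrative, not part of the commit):

    #include <openvino/core/partial_shape.hpp>
    #include <iostream>

    int main() {
        // An input whose rank is known (2) but whose dimensions are not.
        ov::PartialShape input = ov::PartialShape::dynamic(2);

        if (input.is_dynamic()) {
            // Propagate only the rank; concrete dims are resolved at execution
            // time, once real input memory is bound (see update_shape below).
            ov::PartialShape output = ov::PartialShape::dynamic(input.rank());
            std::cout << output << std::endl;  // prints [?,?]
        }
        return 0;
    }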
@@ -108,9 +108,9 @@ void add_required_reorders::run(program& p) {
         ToDo: Here we should handle also the situation where primitive usr has data inputs in different
         formats
         */
-        layout current_layout(original_layout.data_type,
-                              node->get_output_layout().format,
-                              original_layout.get_tensor());
+        layout current_layout(original_layout.get_partial_shape(),
+                              original_layout.data_type,
+                              node->get_output_layout().format);
         usr->set_output_layout(current_layout, false);
         if (usr->type()->does_possible_implementation_exist(*usr)) {
             correct_layout_selected = true;
@@ -57,9 +57,9 @@ void post_input_reorder::run(program& p) {
 
         if (input_layout.format != layout_format) {
             auto previous_layout = node->get_output_layout();
-            layout current_layout(input_layout.data_type,
+            layout current_layout(input_layout.get_partial_shape(),
+                                  input_layout.data_type,
                                   layout_format,
-                                  input_layout.get_tensor(),
                                   input_layout.data_padding);
             auto& reorder = add_reorder(p, input, node, current_layout);
             reorder.set_unique_id();
@@ -540,9 +540,9 @@ void prepare_primitive_fusing::fuse_simple_primitives(program &p) {
             continue;
 
         auto is_grouped_conv = [](convolution_node& node) -> bool {
-            auto in_size = node.get_dependency(0).get_output_layout().get_tensor();
-            return (node.get_split() > 1 && node.get_split() != in_size.feature[0]) ||
-                   (node.get_groups() > 1 && node.get_groups() != static_cast<uint32_t>(in_size.feature[0]));
+            auto in_layout = node.get_dependency(0).get_output_layout();
+            return (node.get_split() > 1 && node.get_split() != in_layout.feature()) ||
+                   (node.get_groups() > 1 && node.get_groups() != static_cast<uint32_t>(in_layout.feature()));
        };
 
        auto conv_supports_fusings = [&](convolution_node& node) -> bool {
@@ -556,7 +556,7 @@ void remove_redundant_reorders::run(program& p) {
         bool remove_dep = reshape_input_node.get_users().size() == 1 && !reshape_input_node.is_output() &&
                           reshape_input_node.get_fused_activations_funcs().empty() && reshape_input_node.get_fused_primitives().empty();
         bool remove_current = remove_dep && !reshape_input_node.get_dependencies().empty() &&
-                              reshape_input_node.get_dependency(0).get_output_layout().get_tensor() == reshape_node.get_output_layout().get_tensor() &&
+                              reshape_input_node.get_dependency(0).get_output_layout() == reshape_node.get_output_layout() &&
                               reshape_node.get_fused_activations_funcs().empty() && reshape_node.get_fused_primitives().empty();
 
         if (remove_dep) {
@@ -274,7 +274,9 @@ reorder_cnt count_reorders_in_dir(const std::map<program_node*, format::type>& f
                                            travel_direction_wrapper<dir>::first(sel_fmt, next_fmt),
                                            travel_direction_wrapper<dir>::second(sel_fmt, next_fmt)))) {
             cnt += 1;
-            size += travel_direction_wrapper<dir>::first(node, next)->get_output_layout().count();
+            auto l = travel_direction_wrapper<dir>::first(node, next)->get_output_layout();
+            if (l.is_static())
+                size += l.count();
         }
     }
 
@@ -780,7 +782,7 @@ void reorder_inputs::run(program& p, layout_optimizer& lo, reorder_factory& rf)
     }
 
     // Change input data of fully-connected node from bx to bf
-    if (format::is_simple_data_format(input_layout.format) && weights.is_constant() && input_layout.format.dimension() == 4 &&
+    if (input_layout.is_static() && format::is_simple_data_format(input_layout.format) && weights.is_constant() && input_layout.format.dimension() == 4 &&
         input_layout.feature() == 1 && input_layout.spatial(0) != 1 && input_layout.spatial(1) == 1) {
         auto new_tensor = input_layout.get_tensor();
         new_tensor.feature[0] = input_layout.spatial(0);
@@ -19,7 +19,7 @@ void strided_slice_optimize::run(program& p) {
     auto node_itr = p.get_processing_order().begin();
     while (node_itr != p.get_processing_order().end()) {
         auto& node = (*node_itr++);
-        if (node->is_type<strided_slice>()) {
+        if (node->is_type<strided_slice>() && node->get_output_layout().is_static()) {
             auto& strided_slice_node = node->as<strided_slice>();
             auto& new_axis_mask = strided_slice_node.get_primitive()->new_axis_mask;
 
@@ -15,7 +15,8 @@ namespace common {
 
 class wait_for_events_impl : public primitive_impl {
 public:
-    explicit wait_for_events_impl(const program_node& /*node*/) {}
+    explicit wait_for_events_impl(const program_node& /*node*/)
+    : primitive_impl(kernel_selector::weights_reorder_params{}, "wait_for_events") { }
 
     std::unique_ptr<primitive_impl> clone() const override {
         return make_unique<wait_for_events_impl>(*this);
@@ -33,7 +33,7 @@ public:
     }
     static std::string to_string(input_layout_node const& node);
 
 public:
+    void update_shape() override;
     typed_primitive_inst(network& network, input_layout_node const& node);
 
     void set_data(memory::ptr mem);
@@ -157,7 +157,7 @@ struct kernel_impl_params {
 
     layout get_non_padded_input_layout(size_t idx = 0) const {
         auto input_layout = get_input_layout(idx);
-        auto result = layout({input_layout.data_type, input_layout.format, input_layout.get_tensor()});
+        auto result = layout({input_layout.get_partial_shape(), input_layout.data_type, input_layout.format});
         return result;
     }
 
@@ -218,7 +218,7 @@ inline params_t get_default_params(const kernel_impl_params& param_info, uint32_
 
     set_params(param_info, params);
 
-    const auto& input_layout = param_info.input_layouts[0];
+    const auto& input_layout = param_info.get_input_layout(0);
     const auto& output_layout = param_info.output_layout;
 
     params.inputs[0] = convert_data_tensor(input_layout, split);
@@ -244,7 +244,7 @@ inline params_t get_default_params(const kernel_impl_params& param_info, uint32_
         prim_id_type_map[fused_prim.desc->id] = std::make_pair(desc.op_id, desc.output_tensor.GetDType());
 
         for (size_t i = desc.dep_idx_start; i < desc.dep_idx_start + desc.dep_size; i++) {
-            desc.tensors.push_back(convert_data_tensor(param_info.input_layouts[i]));
+            desc.tensors.push_back(convert_data_tensor(param_info.get_input_layout(i)));
         }
 
         if (fused_prim.total_num_deps > 0) {
@@ -156,7 +156,7 @@ public:
     }
 
     bool is_dynamic() const {
-        return _node.is_dynamic() || _node.generates_dynamic_output();
+        return _is_dynamic;
     }
 
     void allocate_internal_buffers();
@@ -200,6 +200,7 @@ protected:
         true; // by default all primitives has valid inputs, exception is input_layout (see input_layout_inst)
     bool _has_mutable_input = false;
     bool _mem_allocated = false;
+    bool _is_dynamic = false;
 
     size_t max_output_layout_size = 0;
 
@@ -145,7 +145,7 @@ public:
         auto params = std::unique_ptr<kernel_impl_params>(new kernel_impl_params(get_program(), get_primitive(), get_unique_id(), in_layouts, out_layout,
                                                                                  get_fused_primitives(),
                                                                                  get_fused_activations_funcs(), get_fused_activations_params()));
+        params->memory_deps = get_const_memory_deps();
         return params;
     }
 
@@ -36,7 +36,7 @@ input_layout_node::typed_program_node(const std::shared_ptr<input_layout> dprim,
 }
 
 input_layout_inst::typed_primitive_inst(network& network, input_layout_node const& node)
-    : parent(network, node, !network.is_internal() || has_optimized_users(node)) {
+    : parent(network, node, !node.is_dynamic() && (!network.is_internal() || has_optimized_users(node))) {
     _has_valid_input = false; // by default input for 'input_layout' is invalid as long as user doesn't call set_data
 }
 
@@ -57,6 +57,15 @@ void input_layout_inst::set_data(memory::ptr mem) {
     _output_changed = true;
 }
 
+void input_layout_inst::update_shape() {
+    OPENVINO_ASSERT(_output != nullptr, "[GPU] input memory is not set");
+    auto mem_layout = _output->get_layout();
+    if (_impl_params->output_layout != mem_layout) {
+        set_shape_change();
+    }
+    _impl_params->output_layout = mem_layout;
+}
+
 std::string input_layout_inst::to_string(input_layout_node const& node) {
     auto node_info = node.desc_to_json();
 
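The new input_layout_inst::update_shape override replaces regular shape inference for input nodes: the authoritative layout is whatever layout the user-bound memory carries, so updating the shape reduces to comparing that layout against the cached one and raising the change flag when they differ. A toy sketch of the compare-and-flag pattern (the types here are simplified stand-ins, not the cldnn classes):

    #include <cassert>
    #include <string>

    // Simplified stand-in for cldnn::layout; equality is all this pattern needs.
    struct toy_layout {
        std::string desc;
        bool operator!=(const toy_layout& other) const { return desc != other.desc; }
    };

    struct toy_input_inst {
        toy_layout cached_layout;   // plays the role of _impl_params->output_layout
        bool shape_changed = false; // plays the role of set_shape_change()

        void update_shape(const toy_layout& bound_memory_layout) {
            // Flag downstream shape recalculation only when the bound memory
            // actually disagrees with what was recorded last time.
            if (cached_layout != bound_memory_layout)
                shape_changed = true;
            cached_layout = bound_memory_layout;
        }
    };

    int main() {
        toy_input_inst inst{ {"f32:bfyx:[1,3]"} };
        inst.update_shape({"f32:bfyx:[1,3]"});
        assert(!inst.shape_changed);   // same shape: nothing to redo
        inst.update_shape({"f32:bfyx:[2,3]"});
        assert(inst.shape_changed);    // new batch: downstream must update
        return 0;
    }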
@@ -320,11 +320,12 @@ void network::validate_primitives() {
 }
 
 void network::set_arguments() {
-    if (!_reset_arguments || is_dynamic())
+    if (!_reset_arguments)
         return;
 
     for (auto const& prim : _exec_order) {
-        prim->set_arguments();
+        if (!prim->is_dynamic())
+            prim->set_arguments();
     }
     _reset_arguments = false;
 }
@@ -11,6 +11,7 @@
 #include "fully_connected_inst.h"
 #include "convolution_inst.h"
 #include "deconvolution_inst.h"
+#include "shape_of_inst.h"
 #include "experimental_detectron_roi_feature_extractor_inst.hpp"
 
 #include "intel_gpu/graph/network.hpp"
@@ -148,6 +149,13 @@ void primitive_inst::update_shape() {
         }
     }
 
+    if (input_shape_changed)
+        set_shape_change();
+
+    // We assume that tensor ranks are static, thus shape_of doesn't need to update anything even if input shape is dynamic
+    if (_node.is_type<shape_of>())
+        return;
+
     if (!input_shape_changed && !_node.generates_dynamic_output() && _impl_params->output_layout.is_static())
         return;
 
@@ -173,7 +181,8 @@
         _network.get_stream().wait_for_events(dependencies_events);
 
     _impl_params->memory_deps = memory_deps;
-    layout new_layout = _node.type()->calc_output_layout(_node, *_impl_params);
+    auto new_layouts = _node.type()->calc_output_layouts(_node, *_impl_params);
+    auto new_layout = new_layouts.empty() ? _node.type()->calc_output_layout(_node, *_impl_params) : new_layouts[0];
     new_layout.data_padding = padding::max(_node.get_primitive()->output_padding, new_layout.data_padding);
 
     if (_impl_params->output_layout != new_layout) {
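update_shape now prefers the newer calc_output_layouts entry point and falls back to the legacy single-layout calc_output_layout only when a primitive type returns an empty vector, which here signals that no shape-agnostic implementation exists yet. The dispatch is just an empty-vector check; a sketch with hypothetical free functions standing in for the two type methods:

    #include <iostream>
    #include <string>
    #include <vector>

    // Hypothetical stand-ins for the two inference entry points.
    std::vector<std::string> calc_output_layouts() { return {}; }  // empty: unsupported
    std::string calc_output_layout() { return "legacy layout"; }

    int main() {
        auto new_layouts = calc_output_layouts();
        auto layout = new_layouts.empty() ? calc_output_layout() : new_layouts[0];
        std::cout << layout << std::endl;  // falls back to "legacy layout"
        return 0;
    }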
@@ -192,17 +201,26 @@ void primitive_inst::realloc_if_needed() {
     auto actual_layout = _impl_params->output_layout;
     OPENVINO_ASSERT(actual_layout.is_static(), "[GPU] Can't realloc mem for dynamic layout");
 
-    if (!_output
-        || ((_output->get_layout().count() < actual_layout.count())
-        && (max_output_layout_size < actual_layout.count()))) {
+    // input_layout node is supposed to always use external memory in dynamic case
+    if (_node.is_type<input_layout>())
+        return;
+
+    bool can_reuse_buffer = _output && actual_layout.count() <= max_output_layout_size;
+
+    if (can_reuse_buffer) {
         GPU_DEBUG_IF(debug_config->verbose >= 4) {
-            GPU_DEBUG_COUT << id() << ": realloc output memory" << std::endl;
+            GPU_DEBUG_COUT << id() << ": reuse previously allocated output buffer" << std::endl;
         }
+        _output = _network.get_engine().reinterpret_buffer(*_output, actual_layout);
+    } else {
+        GPU_DEBUG_IF(debug_config->verbose >= 4) {
+            GPU_DEBUG_COUT << id() << ": realloc output memory. "
+                           << " Current buffer_size=" << max_output_layout_size
+                           << " Requested buffer_size=" << actual_layout.count() << std::endl;
+        }
         _output = allocate_output();
-    } else {
-        _output = _network.get_engine().reinterpret_buffer(*_output, actual_layout);
-        max_output_layout_size = _output->get_layout().count();
     }
+    max_output_layout_size = std::max(_output->get_layout().count(), max_output_layout_size);
 }
 
 void primitive_inst::update_impl() {
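The rewritten realloc_if_needed turns output allocation into a grow-only policy: max_output_layout_size remembers the largest element count ever allocated, any new shape that fits is served by reinterpreting the existing buffer, and only a genuinely larger shape triggers a fresh allocation. A self-contained sketch of that policy, with std::vector standing in for device memory and assign() playing the role of allocate_output():

    #include <algorithm>
    #include <cstddef>
    #include <iostream>
    #include <vector>

    struct grow_only_buffer {
        std::vector<float> storage;  // stands in for the device allocation
        std::size_t max_count = 0;   // mirrors max_output_layout_size

        void realloc_if_needed(std::size_t requested_count) {
            bool can_reuse_buffer = !storage.empty() && requested_count <= max_count;
            if (can_reuse_buffer) {
                std::cout << "reuse previously allocated output buffer\n";
                // The real code reinterprets the same memory with the new layout.
            } else {
                std::cout << "realloc output memory. Current buffer_size=" << max_count
                          << " Requested buffer_size=" << requested_count << "\n";
                storage.assign(requested_count, 0.0f);
            }
            max_count = std::max(storage.size(), max_count);
        }
    };

    int main() {
        grow_only_buffer buf;
        buf.realloc_if_needed(64);   // allocates
        buf.realloc_if_needed(16);   // reuses: 16 <= 64
        buf.realloc_if_needed(128);  // reallocates and raises the high-water mark
        return 0;
    }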
@@ -214,10 +232,8 @@ void primitive_inst::update_impl() {
         layout_key_str = id() + "_" + std::to_string(_node.get_unique_id());
         layout_key_str += "_" + _impl_params->output_layout.to_string();
 
-        for (auto in : _node.get_dependencies()) {
-            if (!in->is_constant()) {
-                layout_key_str += "_" + in->get_output_layout().to_string();
-            }
+        for (auto in : _impl_params->input_layouts) {
+            layout_key_str += "_" + in.to_string();
         }
     }
     return layout_key_str;
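With dynamic shapes a single program node can execute under many concrete shapes, so the implementation cache key is now built from the runtime input layouts recorded in _impl_params instead of the node's static dependencies; every distinct shape combination maps to its own compiled kernel. A sketch of the key construction over plain strings (the cache itself is only hinted at, and all names here are illustrative):

    #include <iostream>
    #include <string>
    #include <unordered_map>
    #include <vector>

    int main() {
        // Stand-ins for the node id, its unique id, and the *runtime* layouts.
        std::string id = "fc";
        std::size_t unique_id = 42;
        std::string output_layout = "f32:bfyx:[1,4]";
        std::vector<std::string> input_layouts = { "f32:bfyx:[1,3]" };

        std::string layout_key_str = id + "_" + std::to_string(unique_id);
        layout_key_str += "_" + output_layout;
        for (const auto& in : input_layouts)
            layout_key_str += "_" + in;

        // One compiled implementation per distinct shape combination.
        std::unordered_map<std::string, int> impl_cache;
        impl_cache.emplace(layout_key_str, 1 /* compiled impl */);
        std::cout << layout_key_str << std::endl;
        return 0;
    }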
@@ -301,12 +317,12 @@ event::ptr primitive_inst::execute(const std::vector<event::ptr>& events) {
                        << out_ptr << ")" << std::endl;
     }
 
-    if (_exec_deps.empty())
+    if (_exec_deps.empty() && dependencies.empty())
         return _impl->execute(events, *this);
 
     auto queue_type = get_network().get_stream().get_queue_type();
     if (queue_type == queue_types::out_of_order) {
-        dependencies.reserve(_exec_deps.size());
+        dependencies.reserve(dependencies.size() + _exec_deps.size());
         for (auto& input : _exec_deps) {
             auto id = input->id();
             try {
@@ -315,9 +331,7 @@
                 auto ev = get_network().get_primitive_event(id);
                 dependencies.emplace_back(ev);
             } catch (const std::out_of_range& oor) {
-                std::string temp = std::string("internal CLDNN error: execution order corrupted.") + std::string("\n") +
-                                   std::string(oor.what() + std::string("\n"));
-                CLDNN_ERROR_MESSAGE(id, temp);
+                OPENVINO_ASSERT(false, "[GPU] execution order corrupted: ", oor.what());
             }
         }
     }
@@ -343,7 +357,8 @@ primitive_inst::primitive_inst(network& network, program_node const& node, bool
     , _impl(node.get_selected_impl() ? node.get_selected_impl()->clone() : nullptr)
     , _output()
     , _output_changed(false)
-    , _mem_allocated(allocate_memory) {
+    , _mem_allocated(allocate_memory)
+    , _is_dynamic(_node.is_dynamic() || _node.generates_dynamic_output()) {
     if (allocate_memory) {
         // In case when output is mutable_data primitive, and other users dependencies are only used for
         // suychronization, The output memory of such primitive will be fused with mutable_data
@@ -435,7 +450,6 @@ event::ptr primitive_inst::update_weights() {
     if (!weightable_node)
         return nullptr;
 
-
     GPU_DEBUG_GET_INSTANCE(debug_config);
 
     auto& weights_params = _impl->_weights_reorder_params;
@@ -480,7 +494,12 @@ memory::ptr primitive_inst::allocate_output(engine& _engine, memory_pool& pool,
     auto layout = impl_params.output_layout;
     OPENVINO_ASSERT(layout.is_static(), "[GPU] Can't allocate output for dynamic layout");
     auto device_mem_acc = [&](size_t a, const cldnn::layout& l) {
-        return a + l.bytes_count();
+        // Input shape may be dynamic is some cases (shape_of). It means that output shape of node doesn't depend on input shape
+        // and out memory can be allocated on program build stage.
+        if (l.is_static())
+            return a + l.bytes_count();
+
+        return a;
     };
 
     bool usm_device_allocatable = true;
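The device_mem_acc change matters for nodes like shape_of, whose output can be allocated at build time even while their inputs are still dynamic: when folding input sizes to decide whether the allocation fits in device memory, dynamic layouts simply contribute nothing. The same fold with std::accumulate over a mixed static/dynamic list (toy types, not cldnn's):

    #include <cstddef>
    #include <iostream>
    #include <numeric>
    #include <vector>

    // Stand-in for cldnn::layout: a byte count is only known for static shapes.
    struct toy_layout {
        bool is_static;
        std::size_t bytes;
    };

    int main() {
        std::vector<toy_layout> inputs = { {true, 1024}, {false, 0}, {true, 512} };
        auto device_mem_acc = [](std::size_t a, const toy_layout& l) {
            // Dynamic inputs have no known size yet, so they are skipped.
            return l.is_static ? a + l.bytes : a;
        };
        std::size_t total = std::accumulate(inputs.begin(), inputs.end(),
                                            std::size_t(0), device_mem_acc);
        std::cout << total << std::endl;  // 1536
        return 0;
    }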
@@ -564,7 +583,7 @@ std::vector<std::shared_ptr<primitive_inst>> primitive_inst::build_exec_deps(
     std::vector<std::shared_ptr<primitive_inst>> exec_deps;
     exec_deps.reserve(deps.size());
     for (auto& dep : deps)
-        if (dep->get_impl() != nullptr)
+        if (dep->get_impl() != nullptr || dep->is_dynamic())
             exec_deps.push_back(dep);
 
     return exec_deps;
@@ -96,7 +96,7 @@ std::vector<layout> reshape_inst::calc_output_layouts(reshape_node const& /*node
         }
     };
 
-    if (!memory_deps.empty()) {
+    if (memory_deps.count(1) > 0) {
         auto pattern_mem = memory_deps.at(1);
 
         cldnn::mem_lock<uint8_t, mem_lock_type::read> pattern_lock(pattern_mem, impl_param.prog.get_stream());
@@ -47,5 +47,5 @@ std::string shape_of_inst::to_string(shape_of_node const& node) {
     return primitive_description.str();
 }
 
-shape_of_inst::typed_primitive_inst(network& network, shape_of_node const& node) : parent(network, node) { }
+shape_of_inst::typed_primitive_inst(network& network, shape_of_node const& node) : parent(network, node, true) { }
 } // namespace cldnn
@@ -20,6 +20,10 @@ namespace {
 std::pair<bool, bool> are_layouts_identical(layout const& l1, layout const& l2) {
     const auto& l1_pad = l1.data_padding;
     const auto& l2_pad = l2.data_padding;
+
+    if (l1.is_dynamic() || l2.is_dynamic())
+        return {false, false};
+
     auto l1_size = l1.get_tensor();
     auto l2_size = l2.get_tensor();
     int64_t offset_last_element_l1 = l1.get_linear_offset(l1_size - tensor{1});
@@ -19,13 +19,9 @@ using namespace ::tests;
 
 namespace {
 struct fully_connected_test_params {
-    tensor in_shape;
-    tensor out_shape;
-    tensor kernel;
-    tensor stride;
-    tensor pad;
-    tensor dilation;
-    uint32_t groups;
+    ov::PartialShape in_shape;
+    ov::PartialShape out_shape;
+    ov::PartialShape weights_shape;
     data_types data_type;
     format input_format;
     data_types weights_type;
@@ -50,45 +46,24 @@ public:
     }
 
     layout get_input_layout(fully_connected_test_params& p) {
-        auto pad = p.pad;
-        std::vector<int> pad_ = { 0, 0, pad.spatial[0], pad.spatial[1] };
-        return layout{ p.data_type, p.input_format, p.in_shape, padding{ pad_ } };
+        return layout{ p.in_shape, p.data_type, p.input_format };
     }
 
     layout get_per_channel_layout(fully_connected_test_params& p) {
-        return layout{ p.default_type, p.default_format, tensor{ 1, p.out_shape.feature[0], 1, 1 } };
+        return layout{ ov::PartialShape{1, p.out_shape[1]}, p.default_type, p.default_format };
     }
 
     size_t get_output_dim_size(fully_connected_test_params& p) {
-        size_t size = 2;
-        for (auto i : p.out_shape.spatial) {
-            if (i > 1)
-                size++;
-        }
-        return size;
+        return p.out_shape.size();
     }
 
     layout get_weights_layout(fully_connected_test_params& p) {
-        cldnn::tensor weights_tensor;
-        if (p.out_shape.spatial[1] > 1) {
-            // 3d case
-            weights_tensor = cldnn::tensor(p.kernel.batch[0], p.kernel.feature[0], 1, 1);
-        }
-        else {
-            weights_tensor = cldnn::tensor(batch(p.out_shape.feature[0]), feature(p.in_shape.feature[0]),
-                                           spatial(p.kernel.spatial[0], p.kernel.spatial[1], p.kernel.spatial[2]));
-        }
-        return layout{ p.weights_type, p.weights_format, weights_tensor };
+        return layout{ p.weights_shape, p.weights_type, p.weights_format };
     }
 
     layout get_bias_layout(fully_connected_test_params& p) {
-        if (p.out_shape.spatial[1] > 1) {
-            // 3d case
-            return layout{ p.default_type, format::bfyx, tensor{ 1, 1, 1, p.out_shape.spatial[1] } };
-        }
-        else {
-            return layout{ p.default_type, format::bfyx, tensor{ 1, p.out_shape.feature[0], 1, 1 } };
-        }
+        auto bias_shape = p.out_shape.size() == 3 ? ov::PartialShape{1, 1, p.out_shape[2]} : ov::PartialShape{1, p.out_shape[1]};
+        return layout{ bias_shape, p.default_type, p.default_format };
     }
 };
@@ -103,75 +78,57 @@ public:
 
         auto input_prim = p.data_type == data_types::u8 ? get_mem(get_input_layout(p), 0, 10) : get_mem(get_input_layout(p));
 
         auto impl_forcing_bo = bo_fused.get<build_option_type::force_implementations>();
         const auto& impl_forcing = impl_forcing_bo->forcing;
 
         auto forcing_format = p.input_format;
         for (auto& forcing : impl_forcing) {
             if (forcing.first == "conv_prim") {
                 forcing_format = forcing.second.output_format;
             }
         }
 
         implementation_desc conv_impl = { forcing_format, "", impl_types::onednn };
         bo_fused.set_option(build_option::force_implementations({ { "conv_prim", conv_impl } }));
 
         network network_not_fused(this->engine, this->topology_non_fused, bo_not_fused);
         network network_fused(this->engine, this->topology_fused, bo_fused);
         network_fused.set_input_data("input", input_prim);
         network_not_fused.set_input_data("input", input_prim);
 
         compare(network_not_fused, network_fused, p);
         auto find_conv = [](primitive_info& p) -> bool {
             if (p.original_id == "conv_prim")
                 return true;
             return false;
         };
 
         auto pi_fused = network_fused.get_primitives_info();
         auto info_fused = std::find_if(pi_fused.begin(), pi_fused.end(), find_conv);
         if (info_fused != pi_fused.end())
             std::cout << "kernel: " << info_fused->kernel_id << std::endl;
     }
 
     layout get_input_layout(fully_connected_test_params& p) {
-        auto pad = p.pad;
-        std::vector<int> pad_ = { 0, 0, pad.spatial[0], pad.spatial[1] };
-        return layout{ p.data_type, p.input_format, p.in_shape, padding{ pad_ } };
+        return layout{ p.in_shape, p.data_type, p.input_format };
     }
 
     layout get_per_channel_layout(fully_connected_test_params& p) {
-        return layout{ p.default_type, p.default_format, tensor{1, p.out_shape.feature[0], 1, 1} };
+        return layout{ ov::PartialShape{1, p.out_shape[1]}, p.default_type, p.default_format };
     }
 
     size_t get_output_dim_size(fully_connected_test_params& p) {
-        size_t size = 2;
-        for (auto i : p.out_shape.spatial) {
-            if (i > 1)
-                size++;
-        }
-        return size;
+        return p.out_shape.size();
    }
 
+    layout get_weights_layout(fully_connected_test_params& p) {
+        return layout{ p.weights_shape, p.weights_type, p.weights_format };
+    }
+
+    layout get_bias_layout(fully_connected_test_params& p) {
+        return get_per_channel_layout(p);
+    }
+
+    layout get_output_layout(fully_connected_test_params& p) {
+        return layout{ p.out_shape, p.data_type, p.input_format };
+    }
 };
 #endif // ENABLE_ONEDNN_FOR_GPU
 
 } // namespace
 
-// in_shape; out_shape; kernel; stride; pad; dilation; groups; data_type; input_format; weights_type; weights_format; default_type; default_format;
-#define CASE_FC_FP32_1 { 1, 1, 3, 1 }, { 1, 4, 1, 1 }, { 4, 1, 3, 1 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::f32, format::bfyx, data_types::f32, format::oiyx, data_types::f32, format::bfyx
-#define CASE_FC_FP32_2 { 2, 1, 3, 1 }, { 2, 4, 1, 1 }, { 4, 1, 3, 1 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::f32, format::yxfb, data_types::f32, format::oiyx, data_types::f32, format::bfyx
-#define CASE_FC_FP32_3 { 2, 32, 1, 1 }, { 2, 16, 1, 1 }, { 16, 32, 1, 1 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::f32, format::bfyx, data_types::i8, format::oiyx, data_types::f32, format::bfyx
-#define CASE_FC_FP32_3D_1 { 5, 3, 1, 3 }, { 5, 3, 1, 5 }, { 5, 3, 1, 1 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::f32, format::bfyx, data_types::f32, format::os_iyx_osv16, data_types::f32, format::bfyx
-#define CASE_FC_FP32_3D_2 { 2, 1, 1, 1 }, { 2, 1, 1, 32 }, { 32, 1, 1, 1 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::f32, format::bfyx, data_types::f32, format::os_iyx_osv16, data_types::f32, format::bfyx
-#define CASE_FC_FP32_3D_3 { 2, 32, 1, 32 }, { 2, 32, 1, 16 }, { 16, 32, 1, 1 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::f32, format::bfyx, data_types::f32, format::os_iyx_osv16, data_types::f32, format::bfyx
+// in_shape; out_shape; kernel; data_type; input_format; weights_type; weights_format; default_type; default_format;
+#define CASE_FC_FP32_1 { 1, 3 }, { 1, 4 }, { 4, 3 }, data_types::f32, format::bfyx, data_types::f32, format::oiyx, data_types::f32, format::bfyx
+#define CASE_FC_FP32_2 { 2, 3 }, { 2, 4 }, { 4, 3 }, data_types::f32, format::yxfb, data_types::f32, format::oiyx, data_types::f32, format::bfyx
+#define CASE_FC_FP32_3 { 2, 32 }, { 2, 16 }, { 16, 32 }, data_types::f32, format::bfyx, data_types::i8, format::oiyx, data_types::f32, format::bfyx
+#define CASE_FC_FP32_3D_1 { 5, 3, 3 }, { 5, 3, 5 }, { 5, 3, 1 }, data_types::f32, format::bfyx, data_types::f32, format::os_iyx_osv16, data_types::f32, format::bfyx
+#define CASE_FC_FP32_3D_2 { 2, 1, 1 }, { 2, 1, 32 }, { 32, 1, 1 }, data_types::f32, format::bfyx, data_types::f32, format::os_iyx_osv16, data_types::f32, format::bfyx
+#define CASE_FC_FP32_3D_3 { 2, 32, 32 }, { 2, 32, 16 }, { 16, 32, 1 }, data_types::f32, format::bfyx, data_types::f32, format::os_iyx_osv16, data_types::f32, format::bfyx
 
-#define CASE_FC_U8S8_1 { 1, 1, 3, 1 }, { 1, 4, 1, 1 }, { 4, 1, 3, 1 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::u8, format::bfyx, data_types::i8, format::oiyx, data_types::f32, format::bfyx
-#define CASE_FC_U8S8_2 { 2, 1, 3, 1 }, { 2, 4, 1, 1 }, { 4, 1, 3, 1 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::u8, format::b_fs_yx_fsv4, data_types::i8, format::oiyx, data_types::f32, format::bfyx
-#define CASE_FC_U8S8_3 { 2, 32, 1, 1 }, { 2, 16, 1, 1 }, { 16, 32, 1, 1 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::u8, format::b_fs_yx_fsv4, data_types::i8, format::oiyx, data_types::f32, format::bfyx
-#define CASE_FC_U8S8_3D_1 { 2, 32, 1, 3 }, { 2, 32, 1, 16 }, { 16, 3, 1, 1 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::u8, format::bfyx, data_types::i8, format::oiyx, data_types::f32, format::bfyx
-#define CASE_FC_U8S8_3D_2 { 1, 1, 1, 3 }, { 1, 1, 1, 32 }, { 32, 3, 1, 1 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::u8, format::bfyx, data_types::i8, format::oiyx, data_types::f32, format::bfyx
-#define CASE_FC_U8S8_3D_3 { 2, 3, 1, 1 }, { 2, 3, 1, 15 }, { 15, 1, 1, 1 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::u8, format::bfyx, data_types::i8, format::oiyx, data_types::f32, format::bfyx
-#define CASE_FC_U8S8_3D_4 { 1, 512, 1, 1024 }, { 1, 384, 1, 1024 }, { 1024, 1024, 1, 1 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::u8, format::bfyx, data_types::i8, format::oiyx, data_types::f32, format::bfyx
+#define CASE_FC_U8S8_1 { 1, 3 }, { 1, 4 }, { 4, 3 }, data_types::u8, format::bfyx, data_types::i8, format::oiyx, data_types::f32, format::bfyx
+#define CASE_FC_U8S8_2 { 2, 3 }, { 2, 4 }, { 4, 3 }, data_types::u8, format::b_fs_yx_fsv4, data_types::i8, format::oiyx, data_types::f32, format::bfyx
+#define CASE_FC_U8S8_3 { 2, 32 }, { 2, 16 }, { 16, 32 }, data_types::u8, format::b_fs_yx_fsv4, data_types::i8, format::oiyx, data_types::f32, format::bfyx
+#define CASE_FC_U8S8_3D_1 { 2, 32, 3 }, { 2, 32, 16 }, { 16, 3, 1 }, data_types::u8, format::bfyx, data_types::i8, format::oiyx, data_types::f32, format::bfyx
+#define CASE_FC_U8S8_3D_2 { 1, 1, 3 }, { 1, 1, 32 }, { 32, 3, 1 }, data_types::u8, format::bfyx, data_types::i8, format::oiyx, data_types::f32, format::bfyx
+#define CASE_FC_U8S8_3D_3 { 2, 3, 1 }, { 2, 3, 15 }, { 15, 1, 1 }, data_types::u8, format::bfyx, data_types::i8, format::oiyx, data_types::f32, format::bfyx
+#define CASE_FC_U8S8_3D_4 { 1, 512, 1024 }, { 1, 384, 1024 }, { 1024, 1024, 1 }, data_types::u8, format::bfyx, data_types::i8, format::oiyx, data_types::f32, format::bfyx
 
 /* ----------------------------------------------------------------------------------------------------- */
 /* ---------------------------------------- FC cases --------------------------------------------------- */
@@ -225,48 +182,6 @@ INSTANTIATE_TEST_SUITE_P(fusings_gpu, fc_fp32_bias, ::testing::ValuesIn(std::vec
     fully_connected_test_params{ CASE_FC_FP32_3D_3, 2, 3 },
 }));
 
-class fc_int8_scale : public FullyConnectedFusingTest {};
-TEST_P(fc_int8_scale, basic) {
-    auto p = GetParam();
-    create_topologies(
-        input_layout("input", get_input_layout(p)),
-        data("weights", get_mem(get_weights_layout(p))),
-        data("bias", get_mem(get_bias_layout(p))),
-        data("scale_data", get_mem(get_per_channel_layout(p), 1.0f / p.kernel.count())),
-        fully_connected("fc_prim", "input", "weights", "bias", data_types::f32, padding(), get_output_dim_size(p)),
-        scale("scale", "fc_prim", "scale_data"),
-        reorder("reorder_bfyx", "scale", p.default_format, data_types::f32)
-    );
-
-    tolerance = 1e-5f;
-    execute(p);
-}
-
-TEST_P(fc_int8_scale, fp16_scale_out) {
-    auto p = GetParam();
-    create_topologies(
-        input_layout("input", get_input_layout(p)),
-        data("weights", get_mem(get_weights_layout(p))),
-        data("bias", get_mem(get_bias_layout(p))),
-        data("scale_data", get_mem(get_per_channel_layout(p), 1.0f / p.kernel.count())),
-        fully_connected("fc_prim", "input", "weights", "bias", data_types::f32, padding(), get_output_dim_size(p)),
-        scale("scale", "fc_prim", "scale_data", optional_data_type{ data_types::f16 }),
-        reorder("reorder_bfyx", "scale", p.default_format, data_types::f32)
-    );
-
-    tolerance = 1e-5f;
-    execute(p);
-}
-
-INSTANTIATE_TEST_SUITE_P(fusings_gpu, fc_int8_scale, ::testing::ValuesIn(std::vector<fully_connected_test_params>{
-    fully_connected_test_params{ CASE_FC_U8S8_1, 2, 3 },
-    fully_connected_test_params{ CASE_FC_U8S8_2, 2, 3 },
-    fully_connected_test_params{ CASE_FC_U8S8_3, 2, 3 },
-    fully_connected_test_params{ CASE_FC_U8S8_3D_1, 2, 3 },
-    fully_connected_test_params{ CASE_FC_U8S8_3D_2, 2, 3 },
-    fully_connected_test_params{ CASE_FC_U8S8_3D_3, 2, 3 },
-}));
-
 class fc_int8_quantize_u8 : public FullyConnectedFusingTest {};
 TEST_P(fc_int8_quantize_u8, basic) {
     auto p = GetParam();
@@ -296,8 +211,8 @@ INSTANTIATE_TEST_SUITE_P(fusings_gpu_fc, fc_int8_quantize_u8, ::testing::ValuesI
     fully_connected_test_params{ CASE_FC_U8S8_3D_3, 2, 3 },
 }));
 
-class fc_int8_scale_quantize_i8 : public FullyConnectedFusingTest {};
-TEST_P(fc_int8_scale_quantize_i8, basic) {
+class fc_int8_eltwise_quantize_i8 : public FullyConnectedFusingTest {};
+TEST_P(fc_int8_eltwise_quantize_i8, basic) {
     auto p = GetParam();
     create_topologies(
         input_layout("input", get_input_layout(p)),
@@ -307,10 +222,10 @@ TEST_P(fc_int8_scale_quantize_i8, basic) {
         data("in_hi", get_mem(get_per_channel_layout(p), 1, max_random)),
         data("out_lo", get_mem(get_single_element_layout(p), -127)),
         data("out_hi", get_mem(get_single_element_layout(p), 127)),
-        data("scale_data", get_mem(get_per_channel_layout(p), 1.0f / p.kernel.count() / 255)),
+        data("eltwise_data", get_mem(get_per_channel_layout(p), 1.0f / get_weights_layout(p).count() / 255)),
         fully_connected("fc_prim", "input", "weights", "bias", data_types::f32, padding(), get_output_dim_size(p)),
-        scale("scale", "fc_prim", "scale_data"),
-        quantize("quantize", "scale", "in_lo", "in_hi", "out_lo", "out_hi", 255, data_types::i8),
+        eltwise("eltwise", {"fc_prim", "eltwise_data"}, eltwise_mode::prod),
+        quantize("quantize", "eltwise", "in_lo", "in_hi", "out_lo", "out_hi", 255, data_types::i8),
         reorder("reorder_bfyx", "quantize", p.default_format, data_types::f32)
     );
 
@@ -318,7 +233,7 @@ TEST_P(fc_int8_scale_quantize_i8, basic) {
     execute(p);
 }
 
-INSTANTIATE_TEST_SUITE_P(fusings_gpu, fc_int8_scale_quantize_i8, ::testing::ValuesIn(std::vector<fully_connected_test_params>{
+INSTANTIATE_TEST_SUITE_P(fusings_gpu, fc_int8_eltwise_quantize_i8, ::testing::ValuesIn(std::vector<fully_connected_test_params>{
     fully_connected_test_params{ CASE_FC_U8S8_1, 2, 4 },
     fully_connected_test_params{ CASE_FC_U8S8_2, 2, 4 },
     fully_connected_test_params{ CASE_FC_U8S8_3, 2, 4 },
@@ -327,8 +242,8 @@ INSTANTIATE_TEST_SUITE_P(fusings_gpu, fc_int8_scale_quantize_i8, ::testing::Valu
     fully_connected_test_params{ CASE_FC_U8S8_3D_3, 2, 4 },
 }));
 
-class fc_int8_scale_activation_quantize_i8 : public FullyConnectedFusingTest {};
-TEST_P(fc_int8_scale_activation_quantize_i8, basic) {
+class fc_int8_eltwise_activation_quantize_i8 : public FullyConnectedFusingTest {};
+TEST_P(fc_int8_eltwise_activation_quantize_i8, basic) {
     auto p = GetParam();
     create_topologies(
         input_layout("input", get_input_layout(p)),
@@ -338,11 +253,11 @@ TEST_P(fc_int8_scale_activation_quantize_i8, basic) {
         data("in_hi", get_mem(get_per_channel_layout(p), 1, max_random)),
         data("out_lo", get_mem(get_single_element_layout(p), -127)),
         data("out_hi", get_mem(get_single_element_layout(p), 127)),
-        data("scale_data", get_mem(get_per_channel_layout(p), 1.0f / p.kernel.count() / 255)),
+        data("eltwise_data", get_mem(get_per_channel_layout(p), 1.0f / get_weights_layout(p).count() / 255)),
         fully_connected("fc_prim", "input", "weights", "bias", data_types::f32, padding(), get_output_dim_size(p)),
-        scale("scale", "fc_prim", "scale_data"),
-        activation("activation_scale", "scale", activation_func::exp),
-        quantize("quantize", "activation_scale", "in_lo", "in_hi", "out_lo", "out_hi", 255, data_types::i8),
+        eltwise("eltwise", {"fc_prim", "eltwise_data"}, eltwise_mode::prod),
+        activation("activation_eltwise", "eltwise", activation_func::exp),
+        quantize("quantize", "activation_eltwise", "in_lo", "in_hi", "out_lo", "out_hi", 255, data_types::i8),
         reorder("reorder_bfyx", "quantize", p.default_format, data_types::f32)
     );
 
@@ -350,7 +265,7 @@ TEST_P(fc_int8_scale_activation_quantize_i8, basic) {
     execute(p);
 }
 
-INSTANTIATE_TEST_SUITE_P(fusings_gpu, fc_int8_scale_activation_quantize_i8, ::testing::ValuesIn(std::vector<fully_connected_test_params>{
+INSTANTIATE_TEST_SUITE_P(fusings_gpu, fc_int8_eltwise_activation_quantize_i8, ::testing::ValuesIn(std::vector<fully_connected_test_params>{
     fully_connected_test_params{ CASE_FC_U8S8_1, 2, 5 },
     fully_connected_test_params{ CASE_FC_U8S8_2, 2, 5 },
     fully_connected_test_params{ CASE_FC_U8S8_3, 2, 5 },
@@ -370,7 +285,7 @@ INSTANTIATE_TEST_SUITE_P(fusings_gpu, fc_int8_scale_activation_quantize_i8, ::te
 class fc_int8_inputs_fused_fp32_sum : public FullyConnectedFusingTestOneDNN {};
 TEST_P(fc_int8_inputs_fused_fp32_sum, basic) {
     auto p = GetParam();
-    auto shift_layout = layout{ p.default_type, p.default_format, tensor{ 1, 1, 1, p.kernel.batch[0] } };
+    auto shift_layout = layout{ ov::PartialShape{p.weights_shape[0]}, p.default_type, p.default_format };
 
     create_topologies(
         input_layout("input", get_input_layout(p)),
@@ -0,0 +1,44 @@
+// Copyright (C) 2018-2022 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "test_utils.h"
+
+#include "intel_gpu/runtime/engine.hpp"
+
+#include "intel_gpu/graph/program.hpp"
+#include "data_inst.h"
+#include "eltwise_inst.h"
+#include "intel_gpu/graph/network.hpp"
+#include "pass_manager.h"
+#include "to_string_utils.h"
+
+#include "program_wrapper.h"
+
+#include <memory>
+
+using namespace cldnn;
+using namespace ::tests;
+
+TEST(prepare_primitive_fusing, fuse_to_fc_dyn) {
+    auto& engine = get_test_engine();
+    auto weights = engine.allocate_memory({ ov::PartialShape{ 16, 32 }, data_types::u8, format::bfyx });
+    auto in_layout = layout{ ov::PartialShape::dynamic(2), data_types::u8, format::bfyx };
+
+    topology topology;
+    topology.add(data("weights", weights));
+    topology.add(input_layout("input", in_layout));
+    topology.add(fully_connected("fc", "input", { "weights" }));
+    topology.add(activation("act", "fc", activation_func::relu));
+    topology.add(reorder("reorder", "act", format::bfyx, data_types::f32));
+
+    build_options build_opts;
+    auto prog = program::build_program(engine, topology, build_opts, false, true);
+
+    layout_optimizer lo(true);
+
+    program_wrapper::apply_opt_pass<prepare_primitive_fusing>(*prog, lo);
+
+    ASSERT_NE(prog, nullptr);
+    ASSERT_FALSE(has_node_with_type<activation>(*prog));
+}
@@ -1770,3 +1770,202 @@ TEST(fully_connected_3d_onednn_gpu, no_biases_int8) {
     }
 }
 #endif
+
+TEST(fully_connected_gpu, dynamic) {
+    auto& engine = get_test_engine();
+
+    const int32_t input_f = 3, input_b = 1, weight_b = 4;
+
+    auto input_dyn_layout = layout{ ov::PartialShape{ ov::Dimension(1, 10), input_f, 1, 1 }, data_types::f32, format::bfyx };
+    auto input_data = engine.allocate_memory(layout{ ov::PartialShape{ input_b, input_f, 1, 1 }, data_types::f32, format::bfyx });
+    auto weights_data = engine.allocate_memory({ ov::PartialShape{ weight_b, input_f, 1, 1 }, data_types::f32, format::bfyx });
+
+    set_values(input_data, { -0.5f, 2.0f, 0.5f });
+    set_values(weights_data, { 1.5f, 1.0f, 0.5f, -1.0f, 0.0f, 0.5f, 0.5f, -0.5f, -2.0f, -0.5f, 1.0f, 1.5f });
+
+    cldnn::topology topology{
+        input_layout("input", input_dyn_layout),
+        data("weights", weights_data),
+        fully_connected("fc", "input", "weights")
+    };
+
+    build_options options;
+    options.set_option(build_option::optimize_data(true));
+    network network(engine, topology, options);
+    network.set_input_data("input", input_data);
+
+    auto outputs = network.execute();
+    ASSERT_EQ(outputs.size(), size_t(1));
+    ASSERT_EQ(outputs.begin()->first, "fc");
+
+    auto output_prim = outputs.begin()->second.get_memory();
+
+    auto out_l = output_prim->get_layout();
+    ASSERT_EQ(out_l.batch(), input_b);
+    ASSERT_EQ(out_l.feature(), weight_b);
+    ASSERT_EQ(out_l.spatial(0), 1);
+    ASSERT_EQ(out_l.spatial(1), 1);
+
+    cldnn::mem_lock<float> output_ptr(output_prim, get_test_stream());
+
+    ASSERT_EQ(1.5f, output_ptr[0]);
+    ASSERT_EQ(0.75f, output_ptr[1]);
+    ASSERT_EQ(-2.25f, output_ptr[2]);
+    ASSERT_EQ(3.0f, output_ptr[3]);
+}
+
+TEST(fully_connected_gpu, dynamic_multi_inference_same_shape) {
+    auto& engine = get_test_engine();
+    const int32_t input_f = 3, input_b = 1, weight_b = 4;
+
+    auto input_dyn_layout = layout{ ov::PartialShape{ ov::Dimension(1, 10), input_f, 1, 1 }, data_types::f32, format::bfyx };
+    auto input_actual_layout = layout{ ov::PartialShape{ input_b, input_f, 1, 1 }, data_types::f32, format::bfyx };
+    auto input_data1 = engine.allocate_memory(input_actual_layout);
+    auto input_data2 = engine.allocate_memory(input_actual_layout);
+    auto weights_data = engine.allocate_memory({ ov::PartialShape{ weight_b, input_f, 1, 1 }, data_types::f32, format::bfyx });
+
+    set_values(input_data1, { 0.5f, -2.0f, -0.5f });
+    set_values(input_data2, { -0.5f, 2.0f, 0.5f });
+    set_values(weights_data, { 1.5f, 1.0f, 0.5f,
+                               -1.0f, 0.0f, 0.5f,
+                               0.5f, -0.5f, -2.0f,
+                               -0.5f, 1.0f, 1.5f });
+
+    cldnn::topology topology{
+        input_layout("input", input_dyn_layout),
+        data("weights", weights_data),
+        fully_connected("fc", "input", "weights")
+    };
+
+    build_options options;
+    options.set_option(build_option::optimize_data(true));
+    network network(engine, topology, options);
+
+    {
+        network.set_input_data("input", input_data1);
+
+        auto outputs = network.execute();
+        ASSERT_EQ(outputs.size(), size_t(1));
+        ASSERT_EQ(outputs.begin()->first, "fc");
+
+        auto output_prim = outputs.begin()->second.get_memory();
+
+        auto out_l = output_prim->get_layout();
+        ASSERT_EQ(out_l.batch(), input_b);
+        ASSERT_EQ(out_l.feature(), weight_b);
+        ASSERT_EQ(out_l.spatial(0), 1);
+        ASSERT_EQ(out_l.spatial(1), 1);
+
+        cldnn::mem_lock<float> output_ptr(output_prim, get_test_stream());
+
+        ASSERT_EQ(-1.5f, output_ptr[0]);
+        ASSERT_EQ(-0.75f, output_ptr[1]);
+        ASSERT_EQ(2.25f, output_ptr[2]);
+        ASSERT_EQ(-3.0f, output_ptr[3]);
+    }
+
+    {
+        network.set_input_data("input", input_data2);
+
+        auto outputs = network.execute();
+        ASSERT_EQ(outputs.size(), size_t(1));
+        ASSERT_EQ(outputs.begin()->first, "fc");
+
+        auto output_prim = outputs.begin()->second.get_memory();
+
+        auto out_l = output_prim->get_layout();
+        ASSERT_EQ(out_l.batch(), input_b);
+        ASSERT_EQ(out_l.feature(), weight_b);
+        ASSERT_EQ(out_l.spatial(0), 1);
+        ASSERT_EQ(out_l.spatial(1), 1);
+
+        cldnn::mem_lock<float> output_ptr(output_prim, get_test_stream());
+
+        ASSERT_EQ(1.5f, output_ptr[0]);
+        ASSERT_EQ(0.75f, output_ptr[1]);
+        ASSERT_EQ(-2.25f, output_ptr[2]);
+        ASSERT_EQ(3.0f, output_ptr[3]);
+    }
+}
+
+TEST(fully_connected_gpu, dynamic_multi_inference_different_shape) {
+    auto& engine = get_test_engine();
+
+    const int32_t input_f = 3, weight_b = 4;
+
+    auto input_dyn_layout = layout{ ov::PartialShape{ ov::Dimension(1, 10), input_f, 1, 1 }, data_types::f32, format::bfyx };
+    auto input_actual_layout1 = layout{ ov::PartialShape{ 2, input_f, 1, 1 }, data_types::f32, format::bfyx };
+    auto input_actual_layout2 = layout{ ov::PartialShape{ 1, input_f, 1, 1 }, data_types::f32, format::bfyx };
+    auto input_data1 = engine.allocate_memory(input_actual_layout1);
+    auto input_data2 = engine.allocate_memory(input_actual_layout2);
+    auto weights_data = engine.allocate_memory({ ov::PartialShape{ weight_b, input_f, 1, 1 }, data_types::f32, format::bfyx });
+
+    set_values(input_data1, { 0.5f, -2.0f, -0.5f,
+                              -0.5f, 2.0f, 0.5f });
+    set_values(input_data2, { -0.5f, 2.0f, 0.5f });
+    set_values(weights_data, { 1.5f, 1.0f, 0.5f,
+                               -1.0f, 0.0f, 0.5f,
+                               0.5f, -0.5f, -2.0f,
+                               -0.5f, 1.0f, 1.5f });
+
+    cldnn::topology topology{
+        input_layout("input", input_dyn_layout),
+        data("weights", weights_data),
+        fully_connected("fc", "input", "weights")
+    };
+
+    build_options options;
+    options.set_option(build_option::optimize_data(true));
+    network network(engine, topology, options);
+
+    {
+        network.set_input_data("input", input_data1);
+
+        auto outputs = network.execute();
+        ASSERT_EQ(outputs.size(), size_t(1));
+        ASSERT_EQ(outputs.begin()->first, "fc");
+
+        auto output_prim = outputs.begin()->second.get_memory();
+
+        auto out_l = output_prim->get_layout();
+        ASSERT_EQ(out_l.batch(), 2);
+        ASSERT_EQ(out_l.feature(), weight_b);
+        ASSERT_EQ(out_l.spatial(0), 1);
+        ASSERT_EQ(out_l.spatial(1), 1);
+
+        cldnn::mem_lock<float> output_ptr(output_prim, get_test_stream());
+
+        ASSERT_EQ(-1.5f, output_ptr[0]);
+        ASSERT_EQ(-0.75f, output_ptr[1]);
+        ASSERT_EQ(2.25f, output_ptr[2]);
+        ASSERT_EQ(-3.0f, output_ptr[3]);
+
+        ASSERT_EQ(1.5f, output_ptr[4]);
+        ASSERT_EQ(0.75f, output_ptr[5]);
+        ASSERT_EQ(-2.25f, output_ptr[6]);
+        ASSERT_EQ(3.0f, output_ptr[7]);
+    }
+
+    {
+        network.set_input_data("input", input_data2);
+
+        auto outputs = network.execute();
+        ASSERT_EQ(outputs.size(), size_t(1));
+        ASSERT_EQ(outputs.begin()->first, "fc");
+
+        auto output_prim = outputs.begin()->second.get_memory();
+
+        auto out_l = output_prim->get_layout();
+        ASSERT_EQ(out_l.batch(), 1);
+        ASSERT_EQ(out_l.feature(), weight_b);
+        ASSERT_EQ(out_l.spatial(0), 1);
+        ASSERT_EQ(out_l.spatial(1), 1);
+
+        cldnn::mem_lock<float> output_ptr(output_prim, get_test_stream());
+
+        ASSERT_EQ(1.5f, output_ptr[0]);
+        ASSERT_EQ(0.75f, output_ptr[1]);
+        ASSERT_EQ(-2.25f, output_ptr[2]);
+        ASSERT_EQ(3.0f, output_ptr[3]);
+    }
+}
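The expected values asserted in the three dynamic tests above are ordinary matrix-vector products of the 4x3 weight matrix with each input row; a quick standalone check of the arithmetic:

    #include <array>
    #include <cstddef>
    #include <cstdio>

    int main() {
        std::array<float, 3> in{ -0.5f, 2.0f, 0.5f };
        std::array<std::array<float, 3>, 4> w{{ {  1.5f,  1.0f,  0.5f },
                                                { -1.0f,  0.0f,  0.5f },
                                                {  0.5f, -0.5f, -2.0f },
                                                { -0.5f,  1.0f,  1.5f } }};
        for (const auto& row : w) {
            float acc = 0.0f;
            for (std::size_t i = 0; i < in.size(); ++i)
                acc += row[i] * in[i];
            std::printf("%g\n", acc);  // 1.5, 0.75, -2.25, 3 -- matching the ASSERTs
        }
        return 0;
    }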
@@ -116,3 +116,49 @@ TEST(shape_of_gpu, bfzyx) {
         EXPECT_TRUE(are_equal(expected_results[i], output_ptr[i]));
     }
 }
+
+TEST(shape_of_gpu, dynamic) {
+    auto& engine = get_test_engine();
+
+    layout in_layout = {ov::PartialShape::dynamic(4), data_types::f32, format::bfyx};
+    layout in_mem_layout0 = {ov::PartialShape{1, 2, 3, 4}, data_types::f32, format::bfyx};
+    layout in_mem_layout1 = {ov::PartialShape{4, 3, 2, 1}, data_types::f32, format::bfyx};
+    auto input_mem0 = engine.allocate_memory(in_mem_layout0);
+    auto input_mem1 = engine.allocate_memory(in_mem_layout1);
+
+    cldnn::topology topology;
+    topology.add(input_layout("input", in_layout));
+    topology.add(shape_of("shape_of", "input", 5, data_types::i32));
+
+    network network(engine, topology);
+
+    {
+        network.set_input_data("input", input_mem0);
+
+        auto outputs = network.execute();
+
+        auto output = outputs.at("shape_of").get_memory();
+        cldnn::mem_lock<int32_t> output_ptr(output, get_test_stream());
+
+        std::vector<int32_t> expected_results = {1, 2, 3, 4};
+
+        for (size_t i = 0; i < expected_results.size(); ++i) {
+            EXPECT_TRUE(are_equal(expected_results[i], output_ptr[i]));
+        }
+    }
+
+    {
+        network.set_input_data("input", input_mem1);
+
+        auto outputs = network.execute();
+
+        auto output = outputs.at("shape_of").get_memory();
+        cldnn::mem_lock<int32_t> output_ptr(output, get_test_stream());
+
+        std::vector<int32_t> expected_results = {4, 3, 2, 1};
+
+        for (size_t i = 0; i < expected_results.size(); ++i) {
+            EXPECT_TRUE(are_equal(expected_results[i], output_ptr[i]));
+        }
+    }
+}
@@ -33,6 +33,7 @@
 #include "random_gen.h"
 #include "uniform_quantized_real_distribution.hpp"
 #include "to_string_utils.h"
+#include "program_node.h"
 
 #include <iostream>
 #include <limits>
@@ -59,6 +60,16 @@ cldnn::engine& get_onednn_test_engine();
 #endif
 cldnn::stream& get_test_stream();
 
+template<typename T>
+bool has_node_with_type(cldnn::program& prog) {
+    for (auto node : prog.get_processing_order()) {
+        if (node->is_type<T>())
+            return true;
+    }
+
+    return false;
+}
+
 #define USE_RANDOM_SEED 0
 #if USE_RANDOM_SEED
 std::random_device rnd_device;