[GPU] Prevent memory reset at runtime allocation for dynamic shape, fix wrong padding handling (#16351)
* Prevent memory reset at runtime allocation for dynamic shape
* Set default alloc to reset mem
* Additional fixes:
  - If there is any convolution/deconvolution user that requires a padded input, enqueue a buffer reset when reusing the buffer (see the sketch below).
  - Removed cl finish from gpu_buffer::fill. (It should be waited on only when needed; otherwise synchronization is handled by the returned event.)
  - Removed the buffer reset from on_execute of count_nonzero, which is no longer needed.
* Remove unused API
* Fix tensor offset to account for the padding
* Added unittest
* Applied review comment
Parent: 1ef94ec069
Commit: 6a25143045
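Why reusing a pooled buffer needs an explicit reset when a consumer reads padded input: the producer kernel never writes the pad region, so whatever the previous owner of the pooled allocation left there leaks into the convolution's halo reads. The following is a minimal, self-contained sketch of the failure mode and the fix — plain standard C++ with a hypothetical 1-D stand-in for a padded layout, not cldnn code:

    #include <algorithm>
    #include <cstdio>
    #include <cstring>
    #include <vector>

    // 1-D "conv" with kernel {1,1,1}; the input is stored with one pad element
    // on each side, and the kernel deliberately reads into the padding (halo).
    std::vector<float> conv3(const float* padded, int n) {
        std::vector<float> out(n);
        for (int i = 0; i < n; ++i)
            out[i] = padded[i] + padded[i + 1] + padded[i + 2];
        return out;
    }

    int main() {
        const int n = 4;
        std::vector<float> pool(n + 2, 99.f);  // stale values from a previous user

        // Reuse without reset: pad cells still hold 99 -> wrong border outputs.
        float fresh[n] = {1, 2, 3, 4};
        std::memcpy(pool.data() + 1, fresh, sizeof(fresh));
        auto bad = conv3(pool.data(), n);   // bad[0] = 99+1+2, bad[3] = 3+4+99

        // Reuse with reset (what this patch enqueues for padded conv users).
        std::fill(pool.begin(), pool.end(), 0.f);
        std::memcpy(pool.data() + 1, fresh, sizeof(fresh));
        auto good = conv3(pool.data(), n);  // good[0] = 0+1+2, good[3] = 3+4+0

        std::printf("without reset: %g ... %g\n", bad.front(), bad.back());
        std::printf("with reset:    %g ... %g\n", good.front(), good.back());
    }

This is the trade-off the patch implements: skip the reset by default on dynamic-shape reallocation (it is usually wasted work), but bring it back for exactly the consumers that read padding.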
@@ -214,13 +214,6 @@ public:
     bool is_primary_stream() const { return _is_primary_stream; }
     bool is_dynamic() const { return _is_dynamic; }
 
-    /// Create memory object with specified @p layout and allocation @p type for primitive with @p id
-    /// Underlying memory handle can be reused with other primitives from memory pool based on @p dependencies
-    memory_ptr get_memory_from_pool(const layout& layout,
-                                    primitive_id id,
-                                    std::set<primitive_id> dependencies,
-                                    allocation_type type,
-                                    bool reusable = true);
     memory_pool& get_memory_pool() {
         return *_memory_pool;
     }
@@ -106,13 +106,15 @@ public:
                            uint32_t network_id,
                            const std::set<primitive_id>& restrictions,
                            allocation_type type,
-                           bool reusable = true); // get from pool or create memory allocation
+                           bool reusable = true,
+                           bool reset = true); // get from pool or create memory allocation
-    memory_ptr get_memory(const layout& layout, allocation_type type);
+    memory_ptr get_memory(const layout& layout, allocation_type type, bool reset = true);
     memory_ptr get_from_non_padded_pool(const layout& layout,
                                         const primitive_id& id,
                                         uint32_t network_id,
                                         const std::set<primitive_id>&,
-                                        allocation_type type);
+                                        allocation_type type,
+                                        bool reset = true);
     memory_ptr get_from_padded_pool(const layout& layout,
                                     const primitive_id& id,
                                     uint32_t network_id,
@@ -42,6 +42,23 @@ public:
     static std::string to_string(binary_convolution_node const& node);
     typed_primitive_inst(network& network, binary_convolution_node const& node);
 
+    bool need_reset_input_memory() const override {
+        auto input_layout = _deps[0].first->_impl_params->get_output_layout(0);
+        if (input_layout.data_padding) {
+            return true;
+        }
+        return false;
+    }
+
+    bool need_reset_output_memory() const override {
+        bool res = parent::need_reset_output_memory();
+        auto output_layout = _impl_params->get_output_layout(0);
+        if (output_layout.data_padding) {
+            return true;
+        }
+        return res;
+    }
+
     memory::ptr weights_memory() const { return dep_memory_ptr(1); }
 };
@@ -127,6 +127,23 @@ public:
     static layout calc_output_layout(convolution_node const& node, kernel_impl_params const& impl_param);
     static std::string to_string(convolution_node const& node);
 
+    bool need_reset_input_memory() const override {
+        auto input_layout = _deps[0].first->_impl_params->get_output_layout(0);
+        if (input_layout.data_padding) {
+            return true;
+        }
+        return false;
+    }
+
+    bool need_reset_output_memory() const override {
+        bool res = parent::need_reset_output_memory();
+        auto output_layout = _impl_params->get_output_layout(0);
+        if (output_layout.data_padding) {
+            return true;
+        }
+        return res;
+    }
+
 public:
     typed_primitive_inst(network& network, convolution_node const& node);
@@ -60,6 +60,23 @@ public:
     static layout calc_output_layout(deconvolution_node const& node, kernel_impl_params const& impl_param);
     static std::string to_string(deconvolution_node const& node);
 
+    bool need_reset_input_memory() const override {
+        auto input_layout = _deps[0].first->_impl_params->get_output_layout(0);
+        if (input_layout.data_padding) {
+            return true;
+        }
+        return false;
+    }
+
+    bool need_reset_output_memory() const override {
+        bool res = parent::need_reset_output_memory();
+        auto output_layout = _impl_params->get_output_layout(0);
+        if (output_layout.data_padding) {
+            return true;
+        }
+        return res;
+    }
+
     typed_primitive_inst(network& network, deconvolution_node const& node);
 
     memory::ptr weights_memory() const {
@@ -40,9 +40,6 @@ public:
     static std::string to_string(count_nonzero_node const& node);
 
     typed_primitive_inst(network& network, count_nonzero_node const& node);
-
-private:
-    void on_execute() override;
 };
 
 using count_nonzero_inst = typed_primitive_inst<count_nonzero>;
@@ -192,7 +192,7 @@ public:
 
     void allocate_internal_buffers();
     static memory::ptr allocate_output(engine& engine, memory_pool& pool, const program_node& _node,
-                                       const kernel_impl_params& impl_params, uint32_t net_id, bool is_internal, size_t idx = 0);
+                                       const kernel_impl_params& impl_params, uint32_t net_id, bool is_internal, size_t idx = 0, bool reset_mem = true);
 
     std::vector<memory::cptr> get_intermediates_memories() const { return _intermediates_memory; }
@@ -284,7 +284,7 @@ protected:
     size_t max_output_layout_size = 0;
     std::vector<size_t> max_intermediates_memory_sizes;
 
-    std::vector<memory::ptr> allocate_outputs(kernel_impl_params* updated_params = nullptr);
+    std::vector<memory::ptr> allocate_outputs(kernel_impl_params* updated_params = nullptr, bool reset_mem = true);
     memory::ptr allocate_internal_buffer(size_t idx);
     static std::vector<std::shared_ptr<primitive_inst>> build_exec_deps(
         std::vector<std::pair<std::shared_ptr<primitive_inst>, int32_t>> const& mem_deps);
@@ -298,7 +298,7 @@ protected:
     virtual event::ptr update_weights();
     // if primitive_inst doesn't replace impl to new impl (static impl with opt kernel or dynamic impl), return false
     bool update_impl();
-    void realloc_if_needed();
+    event::ptr realloc_if_needed();
 
     cldnn::network::ptr get_unfused_subgraph();
@@ -332,6 +332,21 @@ protected:
         return { layout(in_layout.get<ShapeType>(), output_type, in_layout.format) };
     }
 
+    virtual bool need_reset_input_memory() const {
+        return false;
+    }
+
+    virtual bool need_reset_output_memory() const {
+        std::vector<primitive_id> users;
+        for (auto u : _node->get_users())
+            users.push_back(u->id());
+
+        for (auto u : _network.get_primitives(users)) {
+            if (u->need_reset_input_memory())
+                return true;
+        }
+        return false;
+    }
+
     // This could be implemented via single map std::unordered_map<instrumentation::perf_counter_key, std::tuple<int64_t, size_t>>
     // but the overhead on using perf_counter_key as map key is too big, thus we use hash as map key
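The base-class hunk above encodes the consumer-driven half of the rule: a primitive's output must be cleared on reuse iff at least one of its users declares that it reads uninitialized padding from its input, while conv/deconv/binary_conv override the input side to flag padded inputs. A compilable miniature of that double dispatch, with names simplified and the class hierarchy flattened (not the actual cldnn hierarchy):

    #include <cstdio>
    #include <vector>

    struct prim {
        std::vector<const prim*> users;
        virtual bool need_reset_input_memory() const { return false; }
        virtual bool need_reset_output_memory() const {
            // reset my output iff some user reads my (possibly stale) padding
            for (const prim* u : users)
                if (u->need_reset_input_memory())
                    return true;
            return false;
        }
        virtual ~prim() = default;
    };

    // conv-like primitives override the input side: padded input => reset.
    struct conv : prim {
        bool input_padded;
        explicit conv(bool padded) : input_padded(padded) {}
        bool need_reset_input_memory() const override { return input_padded; }
    };

    int main() {
        conv padded_user(true);
        prim producer;
        producer.users.push_back(&padded_user);
        std::printf("producer needs reset: %d\n", producer.need_reset_output_memory());
    }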
@@ -1322,16 +1322,6 @@ void network::transfer_memory_to_device(std::shared_ptr<primitive_inst> instance
     }
 }
 
-memory::ptr network::get_memory_from_pool(const layout& layout,
-                                          primitive_id id,
-                                          std::set<primitive_id> dependencies,
-                                          allocation_type type,
-                                          bool reusable) {
-    if (_config.get_property(ov::intel_gpu::enable_memory_pool))
-        return _memory_pool->get_memory(layout, id, get_id(), dependencies, type, reusable);
-    return _memory_pool->get_memory(layout, type);
-}
-
 network::VariableState& network::get_variable_memory(const std::string &variable_id) {
     auto it = _variables_states.find(variable_id);
     if (it == _variables_states.end()) {
@@ -48,10 +48,6 @@ std::string count_nonzero_inst::to_string(count_nonzero_node const& node) {
 
 count_nonzero_inst::typed_primitive_inst(network& network, count_nonzero_node const& node) : parent(network, node) {}
 
-void count_nonzero_inst::on_execute() {
-    output_memory().fill(_network.get_stream(), 0);
-}
-
 // -----------------------------------------------
 // gather_nonzero
 // -----------------------------------------------
@@ -242,11 +242,11 @@ void primitive_inst::update_shape() {
     }
 }
 
-void primitive_inst::realloc_if_needed() {
+event::ptr primitive_inst::realloc_if_needed() {
     GPU_DEBUG_GET_INSTANCE(debug_config);
     GPU_DEBUG_PROFILED_STAGE(instrumentation::pipeline_stage::memory_allocation);
 
+    event::ptr ev = nullptr;
     // Update param if fake_alignment is available
     auto updated_params = _node->type()->get_fake_aligned_params(*_impl_params);
     auto actual_layout = updated_params.get_output_layout();
@@ -254,28 +254,31 @@ void primitive_inst::realloc_if_needed() {
 
     // input_layout node is supposed to always use external memory in dynamic case
     if (_node->is_type<input_layout>())
-        return;
+        return ev;
 
     bool can_reuse_buffer = _outputs[0] && actual_layout.count() <= max_output_layout_size;
 
     if (can_reuse_buffer) {
         GPU_DEBUG_TRACE_DETAIL << id() << ": reuse previously allocated output buffer" << std::endl;
         _outputs[0] = _network.get_engine().reinterpret_buffer(*_outputs[0], actual_layout);
+        if (need_reset_output_memory()) {
+            ev = _outputs[0]->fill(_network.get_stream());
+        }
     } else {
         GPU_DEBUG_TRACE_DETAIL << id() << ": realloc output memory. "
                                << " Current buffer_size=" << max_output_layout_size
                                << " Requested buffer_size=" << actual_layout.count() << std::endl;
-        _outputs = allocate_outputs(&updated_params);
+        _outputs = allocate_outputs(&updated_params, need_reset_output_memory());
         // TODO : need to handle multiple outputs
         max_output_layout_size = updated_params.output_layouts[0].count();
     }
     // intermediate memory allocation is required for primitives consisting of multiple kernels in dynamic case
     {
         if (_impl == nullptr)
-            return;
+            return ev;
         const auto& ibuf_layouts = _impl->get_internal_buffer_layouts();
         if (ibuf_layouts.empty())
-            return;
+            return ev;
 
         for (size_t i = 0; i < ibuf_layouts.size(); ++i) {
             if (i < _intermediates_memory.size() && ibuf_layouts[i].bytes_count() <= max_intermediates_memory_sizes[i]) {

@@ -293,6 +296,7 @@ void primitive_inst::realloc_if_needed() {
             }
         }
     }
+    return ev;
 }
 
 bool primitive_inst::update_impl() {
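A toy model of the decision realloc_if_needed() now makes — a std::vector stands in for the pooled GPU buffer, and std::fill stands in for the enqueued zero-fill whose event the caller chains into the kernel's dependency list (a sketch under those assumptions, not the cldnn implementation):

    #include <algorithm>
    #include <cstddef>
    #include <vector>

    void request_output(std::vector<float>& buf, size_t& max_count,
                        size_t count, bool need_reset) {
        if (count <= max_count) {
            // Reuse path: the old allocation is big enough. Clear it only if
            // a padded consumer would otherwise read stale bytes.
            if (need_reset)
                std::fill(buf.begin(), buf.end(), 0.f);
        } else {
            // Realloc path: grow and remember the new maximum; whether the
            // fresh allocation is reset follows the same need_reset decision.
            buf.assign(count, 0.f);
            max_count = count;
        }
    }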
@@ -431,7 +435,9 @@ event::ptr primitive_inst::execute(const std::vector<event::ptr>& events) {
                 auto ev = update_weights();
                 if (ev)
                     dependencies.push_back(ev);
-                realloc_if_needed();
+                auto ev_reset = realloc_if_needed();
+                if (ev_reset)
+                    dependencies.push_back(ev_reset);
             }
         }
     }
@@ -763,15 +769,15 @@ static bool user_requesting_mem_reuse_false(const program_node& node) {
 }
 
 memory::ptr primitive_inst::allocate_output(engine& _engine, memory_pool& pool, const program_node& _node, const kernel_impl_params& impl_params,
-                                            uint32_t net_id, bool is_internal, size_t idx) {
+                                            uint32_t net_id, bool is_internal, size_t idx, bool reset) {
     auto get_memory_from_pool = [&](engine& _engine, const layout& layout, const primitive_id id, std::set<primitive_id> dependencies,
-                                    allocation_type type, bool reusable) {
+                                    allocation_type type, bool reusable, bool reset = true) {
         OPENVINO_ASSERT(!layout.is_dynamic() || layout.has_upper_bound(), "[GPU] Can't allocate output for dynamic layout without upper bound");
         // Use layout with max tensor for dynamic shape with upper bound
         auto static_layout = cldnn::layout(layout.data_type, layout.format, layout.get_tensor(), layout.data_padding);
         if (_node.get_program().get_config().get_property(ov::intel_gpu::enable_memory_pool))
-            return pool.get_memory(static_layout, id, net_id, dependencies, type, reusable);
-        return pool.get_memory(static_layout, type);
+            return pool.get_memory(static_layout, id, net_id, dependencies, type, reusable, reset);
+        return pool.get_memory(static_layout, type, reset);
     };
 
@@ -817,7 +823,8 @@ memory::ptr primitive_inst::allocate_output(engine& _engine, memory_pool& pool,
                                     _node.id(),
                                     _node.get_memory_dependencies(),
                                     alloc_type,
-                                    false);
+                                    false,
+                                    reset);
     } else if (is_internal && _node.is_output() && _node.is_type<generic_layer>() &&
                _engine.supports_allocation(allocation_type::usm_device) && usm_device_allocatable) {
         GPU_DEBUG_LOG << "[" << _node.id() << ": output]" << std::endl;
@@ -829,23 +836,24 @@ memory::ptr primitive_inst::allocate_output(engine& _engine, memory_pool& pool,
         return _engine.allocate_memory(layout, alloc_type, false);
     } else if (is_internal || (!_node.can_share_buffer()) || _node.can_be_optimized() || _node.is_output()) {
         GPU_DEBUG_LOG << "[" << _node.id() << ": output]" << std::endl;
-        return _engine.allocate_memory(layout, alloc_type);
+        return _engine.allocate_memory(layout, alloc_type, reset);
     } else {
         return get_memory_from_pool(_engine,
                                     layout,
                                     _node.id(),
                                     _node.get_memory_dependencies(),
                                     alloc_type,
-                                    true);
+                                    true,
+                                    reset);
     }
 }
 
-std::vector<memory::ptr> primitive_inst::allocate_outputs(kernel_impl_params* updated_params) {
+std::vector<memory::ptr> primitive_inst::allocate_outputs(kernel_impl_params* updated_params, bool reset_mem) {
     std::vector<memory::ptr> outputs;
     for (size_t i = 0; i < get_node().get_outputs_count() ; ++i) {
         outputs.push_back(allocate_output(get_network().get_engine(), _network.get_memory_pool(),
                                           *_node, (updated_params != nullptr) ? *updated_params : *_impl_params,
-                                          get_network_id(), _network.is_internal(), i));
+                                          get_network_id(), _network.is_internal(), i, reset_mem));
     }
     return outputs;
 }
@@ -245,7 +245,6 @@ public:
 
     JitDefinitions GetDefinitions(const Tensor::TensorBaseT<DType, Layout>& t) const {
         JitDefinitions definitions{
-            {_name + "_OFFSET", toCodeString(t.GetFirstElementOffset())},
            {_name + "_VIEW_OFFSET", toCodeString(t.GetViewOffset())},
            {_name + "_LENGTH", toCodeString(t.LogicalSize())},
            {_name + "_DIMS", toCodeString(t.GetDims().size())},

@@ -258,6 +257,7 @@ public:
         definitions.insert(definitions.end(), type_defs.begin(), type_defs.end());
 
         if (!t.is_dynamic()) {
+            definitions.push_back({_name + "_OFFSET", toCodeString(t.GetFirstElementOffset())});
             definitions.push_back({_name + "_SIZE", toCodeString(t.GetDims().size())});
             definitions.push_back(
                 {_name + "_SIZES_DATA",
@@ -265,13 +265,34 @@ public:
             definitions.push_back(
                 {_name + "_PITCHES",
                  toVectorString(t.GetDims(), "size_t", KERNEL_SELECTOR_TENSOR_DIM_MAX, 1, [](const Tensor::Dim& d) { return d.pitch; })});
-        }
+        } else {
+            // calculate tensor offset
+            std::vector<std::string> padded_pitches = {
+                toVectorMulString({_name + "_X_PITCH", _name + "_PAD_BEFORE_SIZE_X"}),
+                toVectorMulString({_name + "_Y_PITCH", _name + "_PAD_BEFORE_SIZE_Y"}),
+                toVectorMulString({_name + "_Z_PITCH", _name + "_PAD_BEFORE_SIZE_Z"}),
+                toVectorMulString({_name + "_W_PITCH", _name + "_PAD_BEFORE_SIZE_W"}),
+                toVectorMulString({_name + "_FEATURE_PITCH", _name + "_PAD_BEFORE_FEATURE_NUM"}),
+                toVectorMulString({_name + "_BATCH_PITCH", _name + "_PAD_BEFORE_BATCH_NUM"})};
+            std::string offset_str = "(";
+            for (size_t i = 0; i < padded_pitches.size(); ++i) {
+                offset_str += padded_pitches[i];
+                if (i < padded_pitches.size() - 1)
+                    offset_str += " + ";
+            }
+            offset_str += ")";
+            definitions.push_back({_name + "_OFFSET", offset_str});
+        }
         definitions.push_back(
             {_name + "_PAD_BEFORE",
-             toVectorString(t.GetDims(), "size_t", KERNEL_SELECTOR_TENSOR_DIM_MAX, 0, [](const Tensor::Dim& d) { return d.pad.before; })});
+             toVectorString(t.GetDims(), "size_t", KERNEL_SELECTOR_TENSOR_DIM_MAX, 0, [](const Tensor::Dim& d) {
+                 return d.pad.before;
+             })});
         definitions.push_back(
             {_name + "_PAD_AFTER",
-             toVectorString(t.GetDims(), "size_t", KERNEL_SELECTOR_TENSOR_DIM_MAX, 0, [](const Tensor::Dim& d) { return d.pad.after; })});
+             toVectorString(t.GetDims(), "size_t", KERNEL_SELECTOR_TENSOR_DIM_MAX, 0, [](const Tensor::Dim& d) {
+                 return d.pad.after;
+             })});
 
         return definitions;
     }
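For a dynamic-shape tensor the actual dimensions are unknown when the kernel is JIT-compiled, so the `_OFFSET` macro above is emitted as a runtime expression: the sum over dimensions of pad-before times pitch. A host-side equivalent of what that expression evaluates to (field names are illustrative, not kernel_selector's):

    #include <cstddef>

    struct Dim { size_t pitch; size_t pad_before; };

    // Distance from the buffer start to the first logical element of a
    // padded layout, mirroring e.g. X_PITCH * PAD_BEFORE_SIZE_X + ...
    size_t first_element_offset(const Dim* dims, size_t rank) {
        size_t off = 0;
        for (size_t d = 0; d < rank; ++d)
            off += dims[d].pad_before * dims[d].pitch;
        return off;
    }

In the static branch the same value is baked in as a constant via t.GetFirstElementOffset(), which is why the `_OFFSET` entry moved inside the `if (!t.is_dynamic())` block.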
@@ -120,7 +120,8 @@ memory::ptr memory_pool::get_from_non_padded_pool(const layout& layout,
                                                   const primitive_id& id,
                                                   uint32_t network_id,
                                                   const std::set<primitive_id>& restrictions,
-                                                  allocation_type type) {
+                                                  allocation_type type,
+                                                  bool reset) {
     auto it = _non_padded_pool.lower_bound(layout.bytes_count());
     while (it != _non_padded_pool.end()) {
         if (it->second._network_id == network_id &&

@@ -139,7 +140,7 @@ memory::ptr memory_pool::get_from_non_padded_pool(const layout& layout,
     }
     GPU_DEBUG_LOG << "[" << id << ": output]" << std::endl;
     // didn't find anything for you? create new resource
-    auto mem = alloc_memory(layout, type);
+    auto mem = alloc_memory(layout, type, reset);
     {
         _non_padded_pool.emplace(layout.bytes_count(),
                                  memory_record({{id, network_id}}, mem, network_id, type));
@@ -221,21 +222,22 @@ memory::ptr memory_pool::get_memory(const layout& layout,
                                     uint32_t network_id,
                                     const std::set<primitive_id>& restrictions,
                                     allocation_type type,
-                                    bool reusable_across_network) {
+                                    bool reusable_across_network,
+                                    bool reset) {
     if (reusable_across_network) {
         // reusable within the same network
         if (!layout.format.is_image() && layout.data_padding == padding{{0, 0, 0, 0}, 0}) {
             // non-padded buffers
-            return get_from_non_padded_pool(layout, id, network_id, restrictions, type);
+            return get_from_non_padded_pool(layout, id, network_id, restrictions, type, reset);
         } else if (!layout.format.is_image()) {
             // padded buffers
             return get_from_padded_pool(layout, id, network_id, restrictions, type);
         } else {
             // images (reuse not yet implemented)
-            return alloc_memory(layout, type);
+            return alloc_memory(layout, type, reset);
         }
     } else {
-        return alloc_memory(layout, type);
+        return alloc_memory(layout, type, reset);
     }
 }
@@ -151,7 +151,7 @@ memory::ptr ocl_engine::allocate_memory(const layout& layout, allocation_type ty
     }
 
     if (reset || res->is_memory_reset_needed(layout)) {
-        res->fill(get_service_stream());
+        get_service_stream().wait_for_events({res->fill(get_service_stream())});
     }
 
     return res;
@@ -73,9 +73,6 @@ event::ptr gpu_buffer::fill(stream& stream, unsigned char pattern) {
     cl::Event& ev_ocl = downcast<ocl_event>(ev.get())->get();
     cl_stream.get_cl_queue().enqueueFillBuffer<unsigned char>(_buffer, pattern, 0, size(), nullptr, &ev_ocl);
 
-    // TODO: do we need sync here?
-    cl_stream.finish();
-
     return ev;
 }
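With the blocking finish gone, fill() only enqueues the clEnqueueFillBuffer and returns its event; synchronization moves to the callers that actually need it. The intended pattern, as a non-standalone fragment built from calls that appear in this diff (`fill(stream)` returning an event, `wait_for_events` on a stream):

    // event::ptr ev = buf->fill(stream);   // asynchronous zero-fill
    // stream.wait_for_events({ev});        // block only where it matters

The allocate_memory hunk above waits immediately because a freshly allocated buffer must be clean before use, while realloc_if_needed() instead chains the fill event into the kernel's dependency list so nothing stalls the queue.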
@@ -6,6 +6,7 @@
 
 #include <intel_gpu/primitives/input_layout.hpp>
 #include <intel_gpu/primitives/softmax.hpp>
 #include <intel_gpu/primitives/reorder.hpp>
+#include <intel_gpu/primitives/data.hpp>
 
 #include "softmax_inst.h"
@@ -19,6 +20,106 @@ using namespace cldnn;
 using namespace ::tests;
 
 namespace memory_realloc_tests {
+TEST(memory_reuse_realloc_reset_test, basic_conv_with_padding) {
+    auto& engine = get_test_engine();
+
+    layout weight_layout = layout{ov::PartialShape{1, 3, 3, 3}, data_types::f16, format::bfyx};
+
+    auto weights = engine.allocate_memory(weight_layout);
+    set_values<FLOAT16>(weights, {
+        1.0f, 1.0f, 1.0f,
+        1.0f, 1.0f, 1.0f,
+        1.0f, 1.0f, 1.0f,
+        //
+        2.0f, 2.0f, 2.0f,
+        2.0f, 2.0f, 2.0f,
+        2.0f, 2.0f, 2.0f,
+        //
+        3.0f, 3.0f, 3.0f,
+        3.0f, 3.0f, 3.0f,
+        3.0f, 3.0f, 3.0f,
+    });
+
+    layout input_layout_1 = layout{ov::PartialShape{1, 3, 5, 5}, data_types::f32, format::bfyx};
+    auto input_mem_1 = engine.allocate_memory(input_layout_1);
+    set_values(input_mem_1, {
+        1.0f, 2.0f, 3.0f, 4.0f, 5.0f,
+        1.0f, 2.0f, 3.0f, 4.0f, 5.0f,
+        1.0f, 2.0f, 3.0f, 4.0f, 5.0f,
+        1.0f, 2.0f, 3.0f, 4.0f, 5.0f,
+        1.0f, 2.0f, 3.0f, 4.0f, 5.0f,
+        //
+        1.0f, 2.0f, 3.0f, 4.0f, 5.0f,
+        1.0f, 2.0f, 3.0f, 4.0f, 5.0f,
+        1.0f, 2.0f, 3.0f, 4.0f, 5.0f,
+        1.0f, 2.0f, 3.0f, 4.0f, 5.0f,
+        1.0f, 2.0f, 3.0f, 4.0f, 5.0f,
+        //
+        1.0f, 2.0f, 3.0f, 4.0f, 5.0f,
+        1.0f, 2.0f, 3.0f, 4.0f, 5.0f,
+        1.0f, 2.0f, 3.0f, 4.0f, 5.0f,
+        1.0f, 2.0f, 3.0f, 4.0f, 5.0f,
+        1.0f, 2.0f, 3.0f, 4.0f, 5.0f,
+    });
+
+    std::vector<float> ref_output_1 = {6,   18,  36, 54, 72, 108, 162, 216, 162, 90,  18, 54,  108,
+                                       60,  18,  54, 108, 162, 216, 162, 90,  18, 54, 108, 162, 216,
+                                       162, 90,  18, 54, 108, 162, 216, 162, 90,  12, 36, 72,  108,
+                                       144, 108, 60, 6,  18,  36,  54,  72,  54,  30};
+
+    layout input_layout_2 = layout{ov::PartialShape{1, 3, 2, 2}, data_types::f32, format::bfyx};
+    auto input_mem_2 = engine.allocate_memory(input_layout_2);
+    set_values(input_mem_2, {11.0f, 11.0f, 11.0f, 11.0f,
+                             11.0f, 11.0f, 11.0f, 11.0f,
+                             11.0f, 11.0f, 11.0f, 11.0f});
+    std::vector<float> ref_output_2 = {66, 132, 132, 66, 132, 264, 264, 132, 132, 264, 264, 132, 66, 132, 132, 66};
+    std::vector<float> values_to_subtract = {};
+    auto input_l = layout{ov::PartialShape::dynamic(4), data_types::f32, format::bfyx};
+    topology topology(input_layout("input", input_l),
+                      data("weights", weights),
+                      reorder("reorder", input_info("input"), format::bfyx, data_types::f16,
+                              values_to_subtract, reorder_mean_mode::subtract, padding{{0, 0, 2, 2}, 0}),
+                      convolution("conv",
+                                  input_info("reorder"),
+                                  {"weights"},
+                                  {},      /*bias*/
+                                  {1, 1},  /*stride*/
+                                  {2, 2},  /*pad*/
+                                  {1, 1},  /*dilation*/
+                                  {2, 2},  /*pad_above*/
+                                  {2, 2},  /*pad_below*/
+                                  padding{{0, 0, 0, 0}, 0}),  /*output padding*/
+                      reorder("output", input_info("conv"), format::bfyx, data_types::f32));
+
+    ExecutionConfig config = get_test_default_config(engine);
+    config.set_property(ov::intel_gpu::allow_new_shape_infer(true));
+
+    network network(engine, topology, config);
+    network.set_input_data("input", input_mem_1);
+    auto outputs_1 = network.execute();
+    network.set_input_data("input", input_mem_2);
+    auto outputs_2 = network.execute();
+    auto output_mem_2 = outputs_2.begin()->second.get_memory();
+    cldnn::mem_lock<float> output_mem_2_ptr(output_mem_2, get_test_stream());
+    for (size_t i = 0; i < output_mem_2->get_layout().get_buffer_size().count(); ++i) {
+        ASSERT_EQ(output_mem_2_ptr[i], ref_output_2[i]);
+    }
+    // check padding of second run of reorder
+    // 0, 0,  0,  0,  0,  0,
+    // 0, 0,  0,  0,  0,  0,
+    // 0, 0, 11, 11,  0,  0,
+    // 0, 0, 11, 11,  0,  0,
+    // 0, 0,"0","0","0","0",  // !! check pad_after
+    // 0, 0,"0","0","0","0",  // !! check pad_after
+    auto reorder_mem = network.get_primitive("reorder")->output_memory_ptr();
+    cldnn::mem_lock<FLOAT16, mem_lock_type::read> reorder_mem_ptr(reorder_mem, get_test_stream());
+    for (size_t i = 26; i < 29; ++i) {
+        ASSERT_EQ((float)reorder_mem_ptr[i], 0.f);
+    }
+    for (size_t i = 32; i < 35; ++i) {
+        ASSERT_EQ((float)reorder_mem_ptr[i], 0.f);
+    }
+}
+
 TEST(softmax_gpu_dynamic_f32_test_upper_bound, input_same_values) {
     static const int32_t