From 32bc7840fc6e0589b893fa490efa8ed7c76199c0 Mon Sep 17 00:00:00 2001 From: Alexander Zhogov Date: Mon, 27 Dec 2021 22:51:23 +0300 Subject: [PATCH 01/78] Revert "Revert "[LPT] Assign + ReadValue transformation (#8690)" (#9457)" (#9460) This reverts commit d51f33793409e6f673bed92801d12510af453dbc. --- .../backend/int_executable.cpp | 19 +- .../template_plugin/backend/opset_int_tbl.hpp | 2 + .../low_precision/assign_and_read_value.hpp | 27 +++ .../common/fake_quantize_dequantization.hpp | 1 + .../include/low_precision/fake_quantize.hpp | 5 +- .../include/low_precision/network_helper.hpp | 94 -------- .../src/assign_and_read_value.cpp | 132 ++++++++++++ .../src/fake_quantize.cpp | 22 +- .../src/fake_quantize_dequantization.cpp | 20 ++ .../src/low_precision.cpp | 2 + .../src/network_helper.cpp | 29 +-- .../assign_and_read_value_transformation.cpp | 200 ++++++++++++++++++ .../simple_low_precision_transformer.hpp | 4 + .../assign_and_read_value_transformation.cpp | 55 +++++ .../assign_and_read_value_transformation.cpp | 55 +++++ .../assign_and_read_value_transformation.hpp | 35 +++ .../assign_and_read_value_transformation.cpp | 49 +++++ .../assign_and_read_value_function.hpp | 46 ++++ .../src/assign_and_read_value_function.cpp | 185 ++++++++++++++++ 19 files changed, 847 insertions(+), 135 deletions(-) create mode 100644 src/common/low_precision_transformations/include/low_precision/assign_and_read_value.hpp create mode 100644 src/common/low_precision_transformations/src/assign_and_read_value.cpp create mode 100644 src/tests/functional/inference_engine/lp_transformations/assign_and_read_value_transformation.cpp create mode 100644 src/tests/functional/plugin/cpu/shared_tests_instances/low_precision_transformations/assign_and_read_value_transformation.cpp create mode 100644 src/tests/functional/plugin/gpu/shared_tests_instances/low_precision_transformations/assign_and_read_value_transformation.cpp create mode 100644 
src/tests/functional/plugin/shared/include/low_precision_transformations/assign_and_read_value_transformation.hpp create mode 100644 src/tests/functional/plugin/shared/src/low_precision_transformations/assign_and_read_value_transformation.cpp create mode 100644 src/tests/ngraph_helpers/lpt_ngraph_functions/include/lpt_ngraph_functions/assign_and_read_value_function.hpp create mode 100644 src/tests/ngraph_helpers/lpt_ngraph_functions/src/assign_and_read_value_function.cpp diff --git a/docs/template_plugin/backend/int_executable.cpp b/docs/template_plugin/backend/int_executable.cpp index 4c119438e3c..0628a8371df 100644 --- a/docs/template_plugin/backend/int_executable.cpp +++ b/docs/template_plugin/backend/int_executable.cpp @@ -5,6 +5,7 @@ #include "int_executable.hpp" #include +#include #include "evaluates_map.hpp" #include "ngraph/except.hpp" @@ -87,6 +88,10 @@ bool runtime::interpreter::INTExecutable::call(const vector(op) != nullptr) { @@ -143,8 +148,20 @@ bool runtime::interpreter::INTExecutable::call(const vector(cloned_node)) { + auto variable = var_extension->get_variable(); + if (!variable_context.get_variable_value(variable)) { + auto h_tensor = std::make_shared(cloned_node->get_input_element_type(0), + cloned_node->get_input_shape(0)); + std::vector data(ov::shape_size(cloned_node->get_input_shape(0)), 0); + h_tensor->write(data.data(), data.size() * sizeof(float)); + variable_context.set_variable_value(variable, std::make_shared(h_tensor)); + } + } + // Call evaluate for cloned_node with static shapes - if (!cloned_node->evaluate(op_outputs, op_inputs)) { + if (!cloned_node->evaluate(op_outputs, op_inputs, eval_context)) { evaluate_node(cloned_node, op_outputs, op_inputs); } if (m_performance_counters_enabled) { diff --git a/docs/template_plugin/backend/opset_int_tbl.hpp b/docs/template_plugin/backend/opset_int_tbl.hpp index 287bf9a0d11..4d4383da233 100644 --- a/docs/template_plugin/backend/opset_int_tbl.hpp +++ 
b/docs/template_plugin/backend/opset_int_tbl.hpp @@ -85,6 +85,7 @@ NGRAPH_OP(NonMaxSuppression, op::v5) NGRAPH_OP(RNNSequence, op::v5) NGRAPH_OP(Round, op::v5) +NGRAPH_OP(Assign, ngraph::op::v6) NGRAPH_OP(CTCGreedyDecoderSeqLen, op::v6) NGRAPH_OP(ExperimentalDetectronDetectionOutput, op::v6) NGRAPH_OP(ExperimentalDetectronGenerateProposalsSingleImage, op::v6) @@ -93,6 +94,7 @@ NGRAPH_OP(ExperimentalDetectronROIFeatureExtractor, op::v6) NGRAPH_OP(ExperimentalDetectronTopKROIs, op::v6) NGRAPH_OP(GatherElements, op::v6) NGRAPH_OP(MVN, ngraph::op::v6) +NGRAPH_OP(ReadValue, ngraph::op::v6) NGRAPH_OP(DFT, op::v7) NGRAPH_OP(Einsum, op::v7) diff --git a/src/common/low_precision_transformations/include/low_precision/assign_and_read_value.hpp b/src/common/low_precision_transformations/include/low_precision/assign_and_read_value.hpp new file mode 100644 index 00000000000..b7ea91d099e --- /dev/null +++ b/src/common/low_precision_transformations/include/low_precision/assign_and_read_value.hpp @@ -0,0 +1,27 @@ +// Copyright (C) 2018-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include "layer_transformation.hpp" + +namespace ngraph { +namespace pass { +namespace low_precision { + +class LP_TRANSFORMATIONS_API AssignAndReadValueTransformation : public LayerTransformation { +public: + NGRAPH_RTTI_DECLARATION; + AssignAndReadValueTransformation(const std::shared_ptr function, const Params& params = Params()); + bool transform(TransformationContext& context, ngraph::pattern::Matcher& m) override; + bool canBeTransformed(const TransformationContext& context, std::shared_ptr op) const override; + bool isPrecisionPreserved(std::shared_ptr layer) const noexcept override; +private: + std::shared_ptr function; +}; + +} // namespace low_precision +} // namespace pass +} // namespace ngraph \ No newline at end of file diff --git a/src/common/low_precision_transformations/include/low_precision/common/fake_quantize_dequantization.hpp 
b/src/common/low_precision_transformations/include/low_precision/common/fake_quantize_dequantization.hpp index 0da82810c97..4bc50744c5a 100644 --- a/src/common/low_precision_transformations/include/low_precision/common/fake_quantize_dequantization.hpp +++ b/src/common/low_precision_transformations/include/low_precision/common/fake_quantize_dequantization.hpp @@ -33,6 +33,7 @@ public: bool multiplyHasZeroOrDenormal() const; bool isShared() const; bool isLowPrecision() const; + std::shared_ptr copyWithNewInput(const std::shared_ptr& input) const; static bool checkElementwise(const std::shared_ptr& elementwise); diff --git a/src/common/low_precision_transformations/include/low_precision/fake_quantize.hpp b/src/common/low_precision_transformations/include/low_precision/fake_quantize.hpp index 15975782ef0..6a3f84b6b4c 100644 --- a/src/common/low_precision_transformations/include/low_precision/fake_quantize.hpp +++ b/src/common/low_precision_transformations/include/low_precision/fake_quantize.hpp @@ -22,11 +22,10 @@ public: static bool checkElementwise(const std::shared_ptr& eltwise); -private: - std::shared_ptr fuseElementwise( + static std::shared_ptr fuseElementwise( TransformationContext& context, MatcherPass* matcherPass, - const std::shared_ptr& fakeQuantize) const; + const std::shared_ptr& fakeQuantize); }; } // namespace low_precision diff --git a/src/common/low_precision_transformations/include/low_precision/network_helper.hpp b/src/common/low_precision_transformations/include/low_precision/network_helper.hpp index 5f7d6a74113..569cd79d506 100644 --- a/src/common/low_precision_transformations/include/low_precision/network_helper.hpp +++ b/src/common/low_precision_transformations/include/low_precision/network_helper.hpp @@ -41,17 +41,12 @@ public: static std::vector> consumer_inputs(std::shared_ptr node); - // returns true if at least one child is not FQ - static bool notAllChildrensAreFQ(const NodeVector& layer); - // Collect and return a vector with all nodes 
that consumes any of the `node` output static std::vector> consumers(std::shared_ptr node); // return true if op is on a constant path static bool isConstantPath(const std::shared_ptr& op); - static Shape alignShapeForChannelDim(const Shape& shape, Rank rank); - template static std::shared_ptr setOutDataPrecisionForTypeRelaxed(std::shared_ptr operation, const element::Type& precision); @@ -215,87 +210,6 @@ public: const std::shared_ptr& dequantization, const std::shared_ptr& newNode); - static void replaceAttributeInNodes( - std::shared_ptr f, - const std::string& name, - const ov::Any& newAttribute, - const ov::Any& oldAttribute, - const std::shared_ptr& initialNode) { - std::set> visited; - std::deque> nodes; - nodes.emplace_back(initialNode); - - while (!nodes.empty()) { - auto node = nodes.front(); - nodes.pop_front(); - - if (visited.count(node) || ov::is_type(node)) { - continue; - } - - visited.insert(node); - - bool handleConnectedNodes = false; - if (NetworkHelper::isPrecisionPreserved(node) || ov::is_type(node)) { - auto& rt = node->get_rt_info(); - - if (node == initialNode) { - rt[name] = newAttribute; - handleConnectedNodes = true; - } else { - auto it = rt.find(name); - if (it != rt.end()) { - const auto currentAttribute = it->second; - if (oldAttribute == currentAttribute) { - rt[name] = newAttribute; - } - handleConnectedNodes = true; - } - } - } - - if (!handleConnectedNodes) { - continue; - } - - if (!ov::is_type(node)) { - for (size_t index = 0ul; index < node->get_input_size(); ++index) { - auto getInput = [](const std::shared_ptr& node, const size_t index) { - const auto dequantization = NetworkHelper::getDequantization(node, index); - if (!dequantization.empty() && - (ov::is_type(dequantization.data.get_node())) && - ov::is_type(dequantization.data.get_node()->get_input_node_ptr(0))) { - const auto input = dequantization.data.get_node()->input(0); - return input; - } - return node->input(index); - }; - - const auto& input = getInput(node, 
index); - const auto& input_node = input.get_source_output().get_node_shared_ptr(); - - //const auto& input_node = input.get_source_output().get_node_shared_ptr(); - if (visited.count(input_node) || ov::is_type(input_node)) { - continue; - } - - nodes.push_front(input_node); - } - } - - for (auto& output : node->outputs()) { - for (auto& input_value : output.get_target_inputs()) { - const auto& output_node = input_value.get_node()->shared_from_this(); - if (visited.count(output_node) || ov::is_type(output_node)) { - continue; - } - - nodes.push_front(output_node); - } - } - } - } - template static void reassign( const std::shared_ptr& sharedValue, @@ -370,14 +284,6 @@ std::shared_ptr make_op_pattern(const ngraph::NodeVector& args) { return std::make_shared(element::undefined, PartialShape{}, [](std::shared_ptr n) {return !!ov::as_type_ptr(n); }, args); } -template -std::shared_ptr make_op_label() { - return std::make_shared( - element::undefined, - PartialShape{}, - [](std::shared_ptr n) {return !!ov::as_type_ptr(n); }); -} - template std::shared_ptr fold(Args&&... 
args) { auto node = std::make_shared(std::forward(args)...); diff --git a/src/common/low_precision_transformations/src/assign_and_read_value.cpp b/src/common/low_precision_transformations/src/assign_and_read_value.cpp new file mode 100644 index 00000000000..472b8df8eb2 --- /dev/null +++ b/src/common/low_precision_transformations/src/assign_and_read_value.cpp @@ -0,0 +1,132 @@ +// Copyright (C) 2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "low_precision/assign_and_read_value.hpp" +#include + +#include +#include "low_precision/network_helper.hpp" +#include +#include +#include +#include "low_precision/fake_quantize.hpp" + +namespace ngraph { +namespace pass { +namespace low_precision { + +NGRAPH_RTTI_DEFINITION(ngraph::pass::low_precision::AssignAndReadValueTransformation, "AssignAndReadValueTransformation", 0); + +AssignAndReadValueTransformation::AssignAndReadValueTransformation(const std::shared_ptr function, const Params& params) : + LayerTransformation(params), function(function) { + auto assign3 = pattern::wrap_type({ pattern::wrap_type() }); + auto assign6 = pattern::wrap_type({ pattern::wrap_type() }); + + ngraph::graph_rewrite_callback callback = [=](pattern::Matcher& m) { + const auto& opsMap = m.get_pattern_value_map(); + auto op = m.get_match_root(); + auto assignIt = opsMap.find(assign3); + if (assignIt == opsMap.end()) { + assignIt = opsMap.find(assign6); + } + const auto assign = assignIt->second.get_node_shared_ptr(); + // check that we have ReadValue as the first dependency + if (assign->get_control_dependencies().empty()) { + return false; + } + + if (transformation_callback(op)) { + return false; + } + return transform(*context, m); + }; + + auto m = std::make_shared( + std::make_shared(OutputVector{ assign3, assign6 }), + "AssignAndReadValueTransformation"); + this->register_matcher(m, callback); +} + +bool AssignAndReadValueTransformation::transform(TransformationContext& context, ngraph::pattern::Matcher& m) { + 
if (!canBeTransformed(context, m.get_match_root())) { + return false; + } + + const auto oldAssign = m.get_match_root(); + const auto readValue = oldAssign->get_control_dependencies()[0]; + oldAssign->remove_control_dependency(readValue); + + const auto assign = NetworkHelper::separateInStandaloneBranch(oldAssign); + const auto dequantization = NetworkHelper::getDequantization(assign); + + auto oldVar = ov::as_type_ptr(readValue)->get_variable(); + auto variableInfo = oldVar->get_info(); + // set new precision for oldVar to update precision in newReadValue + oldVar->update({variableInfo.data_shape, dequantization.data.get_element_type(), variableInfo.variable_id}); + // transform ReadValue part + const auto newConstant = foldConvert(readValue->get_input_node_shared_ptr(0), dequantization.data.get_element_type()); + const auto newReadValue = readValue->copy_with_new_inputs({newConstant}); + const auto newDequantization = dequantization.copyWithNewInput(newReadValue); + replace_node(readValue, newDequantization); + + // transform Assign part + + const auto newAssign = assign->copy_with_new_inputs({dequantization.data}); + function->remove_sink(as_type_ptr(oldAssign)); + function->add_sinks({as_type_ptr(newAssign)}); + + NetworkHelper::copyInfo(assign, newAssign); + replace_node(assign, newAssign); + newAssign->add_control_dependency(newReadValue); + + // fuse dequantization multiply with FQ after ReadValue if possible + const auto nextLayers = newDequantization->get_output_target_inputs(0); + if (nextLayers.size() > 1) { + return true; + } + const auto fakeQuantize = as_type_ptr(nextLayers.begin()->get_node()->shared_from_this()); + + if (fakeQuantize == nullptr) { + return true; + } + auto fakeQuantizeInputs = fakeQuantize->input_values(); + + const auto inputLow = as_type_ptr(fakeQuantizeInputs[1].get_node_shared_ptr()); + const auto inputHigh = as_type_ptr(fakeQuantizeInputs[2].get_node_shared_ptr()); + + if (inputLow == nullptr || inputHigh == nullptr) { + return 
true; + } + + FakeQuantizeTransformation::fuseElementwise(context, this, fakeQuantize); + + return true; +} + +bool AssignAndReadValueTransformation::canBeTransformed(const TransformationContext& context, std::shared_ptr op) const { + if (!LayerTransformation::canBeTransformed(context, op)) { + return false; + } + + const auto readValue = std::dynamic_pointer_cast(op->get_control_dependencies()[0]); + if (!readValue) { + return false; + } + + // TODO: remove this limitation and change the transformation when this constant will be accepted to be non-zero + if (!NetworkHelper::isZeroConst(readValue->get_input_node_shared_ptr(0))) { + return false; + } + + const auto dequantization = NetworkHelper::getDequantization(op); + return dequantization.subtract == nullptr && dequantization.multiply != nullptr; +} + +bool AssignAndReadValueTransformation::isPrecisionPreserved(std::shared_ptr layer) const noexcept { + return false; +} + +} // namespace low_precision +} // namespace pass +} // namespace ngraph diff --git a/src/common/low_precision_transformations/src/fake_quantize.cpp b/src/common/low_precision_transformations/src/fake_quantize.cpp index deb7ec1ea28..130ab6f5103 100644 --- a/src/common/low_precision_transformations/src/fake_quantize.cpp +++ b/src/common/low_precision_transformations/src/fake_quantize.cpp @@ -129,17 +129,15 @@ bool FakeQuantizeTransformation::checkElementwise(const std::shared_ptr& e std::shared_ptr FakeQuantizeTransformation::fuseElementwise( TransformationContext& context, MatcherPass* matcherPass, - const std::shared_ptr& fakeQuantize) const { + const std::shared_ptr& fakeQuantize) { const std::shared_ptr eltwise = fakeQuantize->get_input_node_shared_ptr(0); - std::shared_ptr inputLowConst_f32 = foldConvert(fakeQuantize->input_value(1), deqPrecision); - std::shared_ptr inputHighConst_f32 = foldConvert(fakeQuantize->input_value(2), deqPrecision); + std::shared_ptr inputLowConst_f32 = foldConvert(fakeQuantize->input_value(1), element::f32); + 
std::shared_ptr inputHighConst_f32 = foldConvert(fakeQuantize->input_value(2), element::f32); std::shared_ptr constant = fq::getConstant(eltwise); if (ov::is_type(eltwise) && checkElementwise(eltwise)) { - const auto value = constant->get_output_element_type(0) == deqPrecision ? - constant : - foldConvert(constant, deqPrecision); + const auto value = foldConvert(constant, element::f32); const auto valueVec = ov::as_type_ptr(value)->cast_vector(); @@ -159,9 +157,7 @@ std::shared_ptr FakeQuantizeTransformation::fuseElementwis inputLowConst_f32 = fq::updateShape(inputLowConst_f32, fakeQuantize->get_output_partial_shape(0)); inputHighConst_f32 = fq::updateShape(inputHighConst_f32, fakeQuantize->get_output_partial_shape(0)); } else if (ov::is_type(eltwise) && checkElementwise(eltwise)) { - const auto value = constant->get_output_element_type(0) == deqPrecision ? - constant : - foldConvert(constant, deqPrecision); + const auto value = foldConvert(constant, element::f32); inputLowConst_f32 = fq::updateShape(fold(inputLowConst_f32, value), fakeQuantize->get_output_partial_shape(0)); inputHighConst_f32 = fq::updateShape(fold(inputHighConst_f32, value), fakeQuantize->get_output_partial_shape(0)); @@ -173,9 +169,7 @@ std::shared_ptr FakeQuantizeTransformation::fuseElementwis return nullptr; } - const auto value = constant->get_output_element_type(0) == deqPrecision ? 
- constant : - foldConvert(constant, deqPrecision); + const auto value = foldConvert(constant, element::f32); inputLowConst_f32 = fq::updateShape(fold(inputLowConst_f32, value), fakeQuantize->get_output_partial_shape(0)); inputHighConst_f32 = fq::updateShape(fold(inputHighConst_f32, value), fakeQuantize->get_output_partial_shape(0)); @@ -196,8 +190,8 @@ std::shared_ptr FakeQuantizeTransformation::fuseElementwis data->output(outputIdx), inputLowConst_f32, inputHighConst_f32, - foldConvert(fakeQuantize->input_value(3), deqPrecision), - foldConvert(fakeQuantize->input_value(4), deqPrecision) })); + foldConvert(fakeQuantize->input_value(3), element::f32), + foldConvert(fakeQuantize->input_value(4), element::f32) })); matcherPass->register_new_node(newFakeQuantize); diff --git a/src/common/low_precision_transformations/src/fake_quantize_dequantization.cpp b/src/common/low_precision_transformations/src/fake_quantize_dequantization.cpp index 46b56716081..fc6ae961a24 100644 --- a/src/common/low_precision_transformations/src/fake_quantize_dequantization.cpp +++ b/src/common/low_precision_transformations/src/fake_quantize_dequantization.cpp @@ -155,6 +155,26 @@ bool FakeQuantizeDequantization::checkElementwise(const std::shared_ptr FakeQuantizeDequantization::copyWithNewInput(const std::shared_ptr& input) const { + auto lastNode = input; + if (convert) { + lastNode = convert->copy_with_new_inputs({lastNode}); + } + if (subtract) { + std::shared_ptr input1 = nullptr; + if (subtractConvert) { + input1 = subtractConvert; + } else { + input1 = subtractConstant; + } + lastNode = subtract->copy_with_new_inputs({lastNode, input1}); + } + if (multiply) { + lastNode = multiply->copy_with_new_inputs({lastNode, multiplyConstant}); + } + return lastNode; +} + int FakeQuantizeDequantization::fillDequantizationParams( const std::shared_ptr& elementwise, std::shared_ptr& convert, diff --git a/src/common/low_precision_transformations/src/low_precision.cpp 
b/src/common/low_precision_transformations/src/low_precision.cpp index 715bd2ea0b5..5d21e3d4bd3 100644 --- a/src/common/low_precision_transformations/src/low_precision.cpp +++ b/src/common/low_precision_transformations/src/low_precision.cpp @@ -38,6 +38,7 @@ // general transformations #include "low_precision/add.hpp" +#include "low_precision/assign_and_read_value.hpp" #include "low_precision/avg_pool.hpp" #include "low_precision/clamp.hpp" #include "low_precision/convolution.hpp" @@ -207,6 +208,7 @@ bool ngraph::pass::low_precision::LowPrecision::run_on_model(const std::shared_p std::shared_ptr common = manager.register_pass(); common->add_matcher(params); + common->add_matcher(f, params); common->add_matcher(params); common->add_matcher(params); common->add_matcher(params); diff --git a/src/common/low_precision_transformations/src/network_helper.cpp b/src/common/low_precision_transformations/src/network_helper.cpp index 171f841f4cc..d7b5a9890d0 100644 --- a/src/common/low_precision_transformations/src/network_helper.cpp +++ b/src/common/low_precision_transformations/src/network_helper.cpp @@ -37,17 +37,6 @@ bool NetworkHelper::is_castable_to_one_of(NodeTypeInfo type, const std::unordere return false; } -bool NetworkHelper::notAllChildrensAreFQ(const NodeVector& childrens) { - // NOTE: This check was added for models that don't have FQ after AvgPool - // They will have transparent precision as it was in old LPT. - for (const auto& child : childrens) { - if (!ov::is_type(child)) { - return true; - } - } - return false; -} - // Collect and return a vector with all nodes that consumes any of the `node` output std::vector> NetworkHelper::consumer_inputs(std::shared_ptr node) { std::vector> result; @@ -199,15 +188,6 @@ size_t NetworkHelper::getGroupsCount(std::shared_ptr layer) { } } -// Assumin tensor in NC... 
layout, append necessary number of 1s to shape to align it to a give rank -Shape NetworkHelper::alignShapeForChannelDim(const Shape& shape, Rank rank) { - assert(shape.size() == 1); - assert(rank.is_static()); - Shape result = shape; - result.resize(rank.get_length() - 1, 1); - return result; -} - void NetworkHelper::removeLayer(std::shared_ptr layer) { ngraph::replace_output_update_name(layer->output(0), layer->input_value(0)); } @@ -1359,9 +1339,12 @@ FakeQuantizeDequantization NetworkHelper::getDequantization(const std::shared_pt const std::shared_ptr convert = ov::as_type_ptr(dataNode.get_node_shared_ptr()); if (convert != nullptr) { - if ((convert->input(0).get_element_type() != element::i8) && (convert->input(0).get_element_type() != element::u8) && - (convert->input(0).get_element_type() != element::i4) && (convert->input(0).get_element_type() != element::u4) && - (convert->output(0).get_element_type() != element::f32)) { + auto defaultPrecisions = LayerTransformation::getDefaultPrecisions(); + auto el_type = convert->input(0).get_element_type(); + auto foundIt = std::find(defaultPrecisions.begin(), defaultPrecisions.end(), el_type); + if (foundIt == defaultPrecisions.end() && + el_type != element::i4 && el_type != element::u4 && + el_type != element::f32 && el_type != element::f16) { return FakeQuantizeDequantization(dataNode, nullptr, subtract, subtractConvert, subtractConstant, multiply, multiplyConstant); } dataNode = convert->get_input_source_output(0); diff --git a/src/tests/functional/inference_engine/lp_transformations/assign_and_read_value_transformation.cpp b/src/tests/functional/inference_engine/lp_transformations/assign_and_read_value_transformation.cpp new file mode 100644 index 00000000000..0a9b125dea4 --- /dev/null +++ b/src/tests/functional/inference_engine/lp_transformations/assign_and_read_value_transformation.cpp @@ -0,0 +1,200 @@ +// Copyright (C) 2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include 
"layer_transformation.hpp" + +#include +#include +#include + +#include +#include + +#include "common_test_utils/ngraph_test_utils.hpp" +#include "lpt_ngraph_functions/common/dequantization_operations.hpp" +#include "lpt_ngraph_functions/assign_and_read_value_function.hpp" +#include "simple_low_precision_transformer.hpp" +#include "low_precision/layer_transformation.hpp" + + +namespace { +using namespace testing; +using namespace ngraph::pass; +using namespace ngraph; + +class AssignTransformationTestValues { +public: + class Actual { + public: + std::vector constantValue; + ngraph::builder::subgraph::DequantizationOperations dequantization; + }; + + class Expected { + public: + std::vector constantValue; + ngraph::builder::subgraph::DequantizationOperations dequantizationBefore; + ngraph::builder::subgraph::DequantizationOperations dequantizationAfter; + }; + + TestTransformationParams params; + Actual actual; + Expected expected; + bool FQAfterReadValue; +}; + +typedef std::tuple < + ngraph::PartialShape, // input shape + element::Type, // input precision + element::Type, // precision before dequantization + size_t, // opset version + AssignTransformationTestValues // test values +> AssignTransformationParams; + +class AssignTransformation : public LayerTransformation, public testing::WithParamInterface { +public: + void SetUp() override { + const ngraph::PartialShape inputShape = std::get<0>(GetParam()); + const element::Type precision = std::get<1>(GetParam()); + const element::Type precisionBeforeDequantization = std::get<2>(GetParam()); + const size_t opsetVersion = std::get<3>(GetParam()); + const AssignTransformationTestValues testValues = std::get<4>(GetParam()); + low_precision::LayerTransformation::setDefaultPrecisions({ + ngraph::element::u8, ngraph::element::i8, + ngraph::element::u16, ngraph::element::i16, + ngraph::element::u32, ngraph::element::i32 + }); + actualFunction = ngraph::builder::subgraph::AssignAndReadValueFunction::getOriginal( + 
inputShape, + precision, + precisionBeforeDequantization, + opsetVersion, + testValues.FQAfterReadValue, + testValues.actual.constantValue, + testValues.actual.dequantization); + + SimpleLowPrecisionTransformer transformer; + transformer.add(actualFunction, testValues.params); + transformer.transform(actualFunction); + + referenceFunction = ngraph::builder::subgraph::AssignAndReadValueFunction::getReference( + inputShape, + precision, + precisionBeforeDequantization, + opsetVersion, + testValues.FQAfterReadValue, + testValues.expected.constantValue, + testValues.expected.dequantizationBefore, + testValues.expected.dequantizationAfter); + } + + static std::string getTestCaseName(testing::TestParamInfo obj) { + const ngraph::PartialShape inputShape = std::get<0>(obj.param); + const element::Type precision = std::get<1>(obj.param); + const element::Type precisionBeforeDequantization = std::get<2>(obj.param); + const size_t opsetVersion = std::get<3>(obj.param); + const AssignTransformationTestValues testValues = std::get<4>(obj.param); + + std::ostringstream result; + result << toString(testValues.params) << "_" << + inputShape << "_" << precision << "_" << + opsetVersion << "_" << testValues.FQAfterReadValue << "_" << + precisionBeforeDequantization << "_" << + testValues.actual.constantValue << "_" << + testValues.actual.dequantization; + return result.str(); + } +}; + +TEST_P(AssignTransformation, CompareFunctions) { + actualFunction->validate_nodes_and_infer_types(); + auto res = compare_functions(referenceFunction, actualFunction, true, true); + ASSERT_TRUE(res.first) << res.second; + + ASSERT_TRUE(LayerTransformation::allNamesAreUnique(actualFunction)) << "Not all names are unique"; +} + +namespace testValues1 { +const std::vector inputShapes = { + ngraph::PartialShape({ 1, 3, 224, 224 }), +}; + +const element::TypeVector precisions = { + element::f16, element::f32 +}; + +const element::TypeVector precisionsBeforeDequantizations = { + element::i8, element::u8, + 
element::i16, element::u16, + element::i32, element::u32, +}; + +const std::vector opsetVersions = { + 3, + 6 +}; + +const std::vector testValues = { + // general case, no subtract, FQ after ReadValue + { + LayerTransformation::createParamsU8I8(), + // ActualValues + { + {0}, + {{ngraph::element::f32}, {}, {3.f}} + }, + // ExpectedValues + { + {0}, + {{}, {}, {}}, + {{ngraph::element::f32}, {}, {3.f}} + }, + true + }, + // no FQ after ReadValue + { + LayerTransformation::createParamsU8I8(), + // ActualValues + { + {0}, + {{ngraph::element::f32}, {}, {3.f}} + }, + // ExpectedValues + { + {0}, + {{}, {}, {}}, + {{ngraph::element::f32}, {}, {3.f}} + }, + false + }, + // non-zero constant + { + LayerTransformation::createParamsU8I8(), + // ActualValues + { + {5}, + {{ngraph::element::f32}, {}, {3.f}} + }, + // ExpectedValues + { + {5}, + {{ngraph::element::f32}, {}, {3.f}}, + {} + }, + false + }, +}; + +INSTANTIATE_TEST_SUITE_P( + smoke_LPT, + AssignTransformation, + ::testing::Combine( + ::testing::ValuesIn(inputShapes), + ::testing::ValuesIn(precisions), + ::testing::ValuesIn(precisionsBeforeDequantizations), + ::testing::ValuesIn(opsetVersions), + ::testing::ValuesIn(testValues)), + AssignTransformation::getTestCaseName); +} // namespace testValues1 +} // namespace diff --git a/src/tests/functional/inference_engine/lp_transformations/simple_low_precision_transformer.hpp b/src/tests/functional/inference_engine/lp_transformations/simple_low_precision_transformer.hpp index c349056f6de..9f68b6d0542 100644 --- a/src/tests/functional/inference_engine/lp_transformations/simple_low_precision_transformer.hpp +++ b/src/tests/functional/inference_engine/lp_transformations/simple_low_precision_transformer.hpp @@ -24,6 +24,10 @@ public: void add(const TestTransformationParams& params) { commonGraphRewrite->add_matcher(TestTransformationParams::toParams(params)); } + template + void add(const std::shared_ptr function, const TestTransformationParams& params) { + 
commonGraphRewrite->add_matcher(function, TestTransformationParams::toParams(params)); + } void transform(std::shared_ptr& function); bool run_on_model(const std::shared_ptr& m) override; diff --git a/src/tests/functional/plugin/cpu/shared_tests_instances/low_precision_transformations/assign_and_read_value_transformation.cpp b/src/tests/functional/plugin/cpu/shared_tests_instances/low_precision_transformations/assign_and_read_value_transformation.cpp new file mode 100644 index 00000000000..0da67ab4379 --- /dev/null +++ b/src/tests/functional/plugin/cpu/shared_tests_instances/low_precision_transformations/assign_and_read_value_transformation.cpp @@ -0,0 +1,55 @@ +// Copyright (C) 2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include + +#include + +#include "low_precision_transformations/assign_and_read_value_transformation.hpp" + + +using namespace LayerTestsDefinitions; + +namespace { +const std::vector netPrecisions = { + ngraph::element::f32, + // ngraph::element::f16 +}; + +const std::vector opsetVersions = { + // 3, // no evaluate for opset 3 in ngraph + 6 +}; + +const std::vector trasformationParamValues = { + LayerTestsUtils::LayerTransformationParamsNGraphFactory::createParams(), + LayerTestsUtils::LayerTransformationParamsNGraphFactory::createParams().setUpdatePrecisions(false), +}; + +const std::vector params{ + // u8 + { + { 256ul, ngraph::Shape{ 1, 1, 1, 1 }, { 0.f }, { 25.5f }, { 0.f }, { 12.8f } }, + }, + // u16 + { + { 65536ul, ngraph::Shape{ 1, 1, 1, 1 }, { 0.f }, { 25.5f }, { 0.f }, { 12.8f } }, + }, + // u32 + { + { 4294967296ul, ngraph::Shape{ 1, 1, 1, 1 }, { 0.f }, { 25.5f }, { 0.f }, { 12.8f } }, + }, +}; + +INSTANTIATE_TEST_SUITE_P(smoke_LPT, AssignAndReadValueTransformation, + ::testing::Combine( + ::testing::ValuesIn(netPrecisions), + ::testing::Values(ngraph::PartialShape({ 1, 3, 16, 16 })), + ::testing::ValuesIn(opsetVersions), + ::testing::Values(CommonTestUtils::DEVICE_CPU), + 
::testing::ValuesIn(trasformationParamValues), + ::testing::ValuesIn(params)), + AssignAndReadValueTransformation::getTestCaseName); + +} // namespace diff --git a/src/tests/functional/plugin/gpu/shared_tests_instances/low_precision_transformations/assign_and_read_value_transformation.cpp b/src/tests/functional/plugin/gpu/shared_tests_instances/low_precision_transformations/assign_and_read_value_transformation.cpp new file mode 100644 index 00000000000..390514c1884 --- /dev/null +++ b/src/tests/functional/plugin/gpu/shared_tests_instances/low_precision_transformations/assign_and_read_value_transformation.cpp @@ -0,0 +1,55 @@ +// Copyright (C) 2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include + +#include + +#include "low_precision_transformations/assign_and_read_value_transformation.hpp" + + +using namespace LayerTestsDefinitions; + +namespace { +const std::vector netPrecisions = { + ngraph::element::f32, + // ngraph::element::f16 +}; + +const std::vector opsetVersions = { + // 3, // no evaluate for opset 3 in ngraph + // 6 // not supported on GPU +}; + +const std::vector trasformationParamValues = { + LayerTestsUtils::LayerTransformationParamsNGraphFactory::createParams(), + LayerTestsUtils::LayerTransformationParamsNGraphFactory::createParams().setUpdatePrecisions(false), +}; + +const std::vector params{ + // u8 + { + { 256ul, ngraph::Shape{ 1, 1, 1, 1 }, { 0.f }, { 25.5f }, { 0.f }, { 12.8f } }, + }, + // u16 + { + { 65536ul, ngraph::Shape{ 1, 1, 1, 1 }, { 0.f }, { 25.5f }, { 0.f }, { 12.8f } }, + }, + // u32 + { + { 4294967296ul, ngraph::Shape{ 1, 1, 1, 1 }, { 0.f }, { 25.5f }, { 0.f }, { 12.8f } }, + }, +}; + +INSTANTIATE_TEST_SUITE_P(smoke_LPT, AssignAndReadValueTransformation, + ::testing::Combine( + ::testing::ValuesIn(netPrecisions), + ::testing::Values(ngraph::PartialShape({ 1, 3, 16, 16 })), + ::testing::ValuesIn(opsetVersions), + ::testing::Values(CommonTestUtils::DEVICE_GPU), + ::testing::ValuesIn(trasformationParamValues), 
+ ::testing::ValuesIn(params)), + AssignAndReadValueTransformation::getTestCaseName); + +} // namespace diff --git a/src/tests/functional/plugin/shared/include/low_precision_transformations/assign_and_read_value_transformation.hpp b/src/tests/functional/plugin/shared/include/low_precision_transformations/assign_and_read_value_transformation.hpp new file mode 100644 index 00000000000..3a5adf331e1 --- /dev/null +++ b/src/tests/functional/plugin/shared/include/low_precision_transformations/assign_and_read_value_transformation.hpp @@ -0,0 +1,35 @@ +// Copyright (C) 2018-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "shared_test_classes/base/low_precision_transformations/layer_transformation.hpp" +#include "lpt_ngraph_functions/common/fake_quantize_on_data.hpp" +#include "lpt_ngraph_functions/common/dequantization_operations.hpp" + +namespace LayerTestsDefinitions { +class AssignAndReadValueTransformationParam { +public: + ngraph::builder::subgraph::FakeQuantizeOnData fakeQuantize; +}; + +typedef std::tuple < + ngraph::element::Type, // input precision + ngraph::PartialShape, // input shape + size_t, // opset version + std::string, // device + ngraph::pass::low_precision::LayerTransformation::Params, // transformation params + AssignAndReadValueTransformationParam // test params +> AssignAndReadValueTransformationParams; + +class AssignAndReadValueTransformation : + public testing::WithParamInterface, + public LayerTestsUtils::LayerTransformation { +public: + static std::string getTestCaseName(const testing::TestParamInfo& obj); +protected: + void SetUp() override; +}; + +} // namespace LayerTestsDefinitions diff --git a/src/tests/functional/plugin/shared/src/low_precision_transformations/assign_and_read_value_transformation.cpp b/src/tests/functional/plugin/shared/src/low_precision_transformations/assign_and_read_value_transformation.cpp new file mode 100644 index 00000000000..43123a36cd1 --- /dev/null +++ 
b/src/tests/functional/plugin/shared/src/low_precision_transformations/assign_and_read_value_transformation.cpp @@ -0,0 +1,49 @@ +// Copyright (C) 2018-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "low_precision_transformations/assign_and_read_value_transformation.hpp" +#include +#include +#include +#include + +#include "lpt_ngraph_functions/assign_and_read_value_function.hpp" + +namespace LayerTestsDefinitions { + +std::string AssignAndReadValueTransformation::getTestCaseName(const testing::TestParamInfo& obj) { + ngraph::element::Type netPrecision; + ngraph::PartialShape inputShape; + size_t opset; + std::string targetDevice; + ngraph::pass::low_precision::LayerTransformation::Params params; + AssignAndReadValueTransformationParam param;; + std::tie(netPrecision, inputShape, opset, targetDevice, params, param) = obj.param; + + std::ostringstream result; + result << getTestCaseNameByParams(netPrecision, inputShape, targetDevice, params) << "_" << + param.fakeQuantize << "_" << opset; + return result.str(); +} + +void AssignAndReadValueTransformation::SetUp() { + ngraph::element::Type netPrecision; + ngraph::PartialShape inputShape; + size_t opset; + ngraph::pass::low_precision::LayerTransformation::Params params; + AssignAndReadValueTransformationParam param; + std::tie(netPrecision, inputShape, opset, targetDevice, params, param) = this->GetParam(); + + function = ngraph::builder::subgraph::AssignAndReadValueFunction::getOriginal( + netPrecision, + inputShape, + param.fakeQuantize, + opset); +} + +TEST_P(AssignAndReadValueTransformation, CompareWithRefImpl) { + Run(); +}; + +} // namespace LayerTestsDefinitions diff --git a/src/tests/ngraph_helpers/lpt_ngraph_functions/include/lpt_ngraph_functions/assign_and_read_value_function.hpp b/src/tests/ngraph_helpers/lpt_ngraph_functions/include/lpt_ngraph_functions/assign_and_read_value_function.hpp new file mode 100644 index 00000000000..d5113da7b1b --- /dev/null +++ 
b/src/tests/ngraph_helpers/lpt_ngraph_functions/include/lpt_ngraph_functions/assign_and_read_value_function.hpp @@ -0,0 +1,46 @@ +// Copyright (C) 2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include +#include +#include "lpt_ngraph_functions/common/fake_quantize_on_data.hpp" +#include "lpt_ngraph_functions/common/dequantization_operations.hpp" + +namespace ngraph { +namespace builder { +namespace subgraph { + +class AssignAndReadValueFunction { +public: + static std::shared_ptr getOriginal( + const ngraph::PartialShape& inputShape, + const element::Type& inputPrecision, + const ngraph::element::Type precisionBeforeDequantization, + const size_t opsetVersion, + const bool FQAfterReadValue, + const std::vector& constantValue, + const ngraph::builder::subgraph::DequantizationOperations& dequantization); + + static std::shared_ptr getOriginal( + const ngraph::element::Type originalFunctionPrecision, + const ngraph::PartialShape& inputShape, + const ngraph::builder::subgraph::FakeQuantizeOnData fakeQuantize, + const size_t opsetVersion); + + static std::shared_ptr getReference( + const ngraph::PartialShape& inputShape, + const element::Type& inputPrecision, + const ngraph::element::Type precisionBeforeDequantization, + const size_t opsetVersion, + const bool FQAfterReadValue, + const std::vector& constantValue, + const ngraph::builder::subgraph::DequantizationOperations& dequantizationBefore, + const ngraph::builder::subgraph::DequantizationOperations& dequantizationAfter); +}; +} // namespace subgraph +} // namespace builder +} // namespace ngraph diff --git a/src/tests/ngraph_helpers/lpt_ngraph_functions/src/assign_and_read_value_function.cpp b/src/tests/ngraph_helpers/lpt_ngraph_functions/src/assign_and_read_value_function.cpp new file mode 100644 index 00000000000..67e50de1852 --- /dev/null +++ b/src/tests/ngraph_helpers/lpt_ngraph_functions/src/assign_and_read_value_function.cpp @@ -0,0 +1,185 @@ +// Copyright (C) 
2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include +#include +#include + + +#include +#include "ngraph_functions/subgraph_builders.hpp" +#include "openvino/op/util/variable.hpp" +#include + +#include "lpt_ngraph_functions/common/builders.hpp" +#include "lpt_ngraph_functions/assign_and_read_value_function.hpp" +#include "low_precision/network_helper.hpp" + +namespace ngraph { +namespace builder { +namespace subgraph { + +std::shared_ptr AssignAndReadValueFunction::getOriginal( + const ngraph::PartialShape& inputShape, + const element::Type& inputPrecision, + const ngraph::element::Type precisionBeforeDequantization, + const size_t opsetVersion, + const bool FQAfterReadValue, + const std::vector& constantValue, + const ngraph::builder::subgraph::DequantizationOperations& dequantization) { + const auto input = std::make_shared(inputPrecision, inputShape); + const auto defaultConstant = std::make_shared(inputPrecision, inputShape.get_shape(), constantValue); + const auto variable = std::make_shared(VariableInfo{inputShape.get_shape(), inputPrecision, "id"}); + std::shared_ptr readValue; + if (opsetVersion == 6) { + readValue = std::make_shared(defaultConstant, variable); + } else if (opsetVersion == 3) { + readValue = std::make_shared(defaultConstant, "id"); + } else { + throw std::runtime_error("Unknown opset version"); + } + std::shared_ptr lastNode = readValue; + if (FQAfterReadValue) { + lastNode = builder::subgraph::makeFakeQuantize( + lastNode, + element::f32, + FakeQuantizeOnData{256ul, Shape{}, {0}, {2.55f}, {0}, {2.55f}}); + } + const auto add = std::make_shared(lastNode, input); + const auto FQAfterAdd = builder::subgraph::makeFakeQuantizeTypeRelaxed( + add, + element::f32, + FakeQuantizeOnData{256ul, Shape{}, {0}, {2.55f}, {0}, {2.55f}, precisionBeforeDequantization}); + auto deqStructure = dequantization; + deqStructure.multiply.outPrecision = inputPrecision; + const auto dequantizationOp = makeDequantization(FQAfterAdd, 
deqStructure); + std::shared_ptr assign; + if (opsetVersion == 6) { + assign = std::make_shared(dequantizationOp, variable); + } else { + assign = std::make_shared(dequantizationOp, "id"); + } + assign->add_control_dependency(readValue); + add->set_friendly_name("output"); + + ngraph::ResultVector results{ std::make_shared(add) }; + ngraph::SinkVector sinks{ as_type_ptr(assign) }; + return std::make_shared(results, sinks, ngraph::ParameterVector{ input }, "AssignAndReadValueFunction"); +} + +std::shared_ptr AssignAndReadValueFunction::getOriginal( + const ngraph::element::Type precision, + const ngraph::PartialShape& inputShape, + const ngraph::builder::subgraph::FakeQuantizeOnData fakeQuantize, + const size_t opsetVersion) { + const auto input = std::make_shared(precision, inputShape); + const auto defaultConstant = std::make_shared(precision, inputShape.get_shape(), std::vector{0}); + const auto variable = std::make_shared(VariableInfo{inputShape.get_shape(), precision, "id"}); + std::shared_ptr readValue; + if (opsetVersion == 6) { + readValue = std::make_shared(defaultConstant, variable); + } else if (opsetVersion == 3) { + readValue = std::make_shared(defaultConstant, "id"); + } else { + throw std::runtime_error("Unknown opset version"); + } + std::shared_ptr lastNode = readValue; + lastNode = builder::subgraph::makeFakeQuantize( + lastNode, + element::f32, + FakeQuantizeOnData{256ul, Shape{}, {0}, {2.55f}, {0}, {2.55f}}); + const auto add = std::make_shared(lastNode, input); + const auto FQAfterAdd = fakeQuantize.empty() ? 
nullptr : + ngraph::builder::makeFakeQuantize( + add, + precision, + fakeQuantize.quantizationLevel, + fakeQuantize.constantShape, + fakeQuantize.inputLowValues, + fakeQuantize.inputHighValues, + fakeQuantize.outputLowValues, + fakeQuantize.outputHighValues); + std::shared_ptr assign; + if (opsetVersion == 6) { + assign = std::make_shared(FQAfterAdd, variable); + } else { + assign = std::make_shared(FQAfterAdd, "id"); + } + assign->add_control_dependency(readValue); + add->set_friendly_name("output"); + + ngraph::ResultVector results{ std::make_shared(add) }; + ngraph::SinkVector sinks{ as_type_ptr(assign) }; + return std::make_shared(results, sinks, ngraph::ParameterVector{ input }, "AssignAndReadValueFunction"); +} + +std::shared_ptr AssignAndReadValueFunction::getReference( + const ngraph::PartialShape& inputShape, + const element::Type& inputPrecision, + const ngraph::element::Type precisionBeforeDequantization, + const size_t opsetVersion, + const bool FQAfterReadValue, + const std::vector& constantValue, + const ngraph::builder::subgraph::DequantizationOperations& dequantizationBefore, + const ngraph::builder::subgraph::DequantizationOperations& dequantizationAfter) { + const auto input = std::make_shared(inputPrecision, inputShape); + auto constantPrecision = precisionBeforeDequantization; + if (constantValue != std::vector{0}) { + constantPrecision = inputPrecision; + } + const auto defaultConstant = std::make_shared(constantPrecision, inputShape.get_shape(), constantValue); + const auto variable = std::make_shared(VariableInfo{inputShape.get_shape(), constantPrecision, "id"}); + std::shared_ptr readValue; + if (opsetVersion == 6) { + readValue = std::make_shared(defaultConstant, variable); + } else if (opsetVersion == 3) { + readValue = std::make_shared(defaultConstant, "id"); + } else { + throw std::runtime_error("Unknown opset version"); + } + std::shared_ptr lastNode = readValue; + + auto deqStructureAfter = dequantizationAfter; + if (FQAfterReadValue) 
{ + DequantizationOperations tempDequantization; + tempDequantization.convert = dequantizationAfter.convert; + tempDequantization.subtract = dequantizationAfter.subtract; + lastNode = makeDequantization(lastNode, tempDequantization); + } else { + deqStructureAfter.multiply.outPrecision = inputPrecision; + lastNode = makeDequantization(lastNode, deqStructureAfter); + } + + if (FQAfterReadValue) { + lastNode = builder::subgraph::makeFakeQuantizeTypeRelaxed( + lastNode, + element::f32, + FakeQuantizeOnData{256ul, Shape{}, {0}, {2.55f / dequantizationAfter.multiply.values[0]}, {0}, {2.55f}, inputPrecision}); + } + const auto add = std::make_shared(lastNode, input); + const auto FQAfterAdd = builder::subgraph::makeFakeQuantizeTypeRelaxed( + add, + element::f32, + FakeQuantizeOnData{256ul, Shape{}, {0}, {2.55f}, {0}, {2.55f}, precisionBeforeDequantization}); + + auto deqStructureBefore = dequantizationBefore; + deqStructureBefore.multiply.outPrecision = inputPrecision; + const auto dequantizationBeforeStructure = makeDequantization(FQAfterAdd, deqStructureBefore); + std::shared_ptr assign; + if (opsetVersion == 6) { + assign = std::make_shared(dequantizationBeforeStructure, variable); + } else { + assign = std::make_shared(dequantizationBeforeStructure, "id"); + } + assign->add_control_dependency(readValue); + add->set_friendly_name("output"); + + ngraph::ResultVector results{ std::make_shared(add) }; + ngraph::SinkVector sinks{ as_type_ptr(assign) }; + return std::make_shared(results, sinks, ngraph::ParameterVector{ input }, "AssignAndReadValueFunction"); +} + +} // namespace subgraph +} // namespace builder +} // namespace ngraph From c87ac722b18b5fa0f475a88b3c08b89a1c52e51d Mon Sep 17 00:00:00 2001 From: Aleksandr Korolev Date: Tue, 28 Dec 2021 11:21:00 +0300 Subject: [PATCH 02/78] [VPU] Enable new InferRequest behavior tests with OV 2.0 (#9301) --- .../configuration/plugin_configuration.hpp | 2 +- .../myriad_executable_network.cpp | 9 ++ 
.../myriad_plugin/myriad_executable_network.h | 1 + .../behavior/ov_infer_request/callback.cpp | 37 +++++++++ .../ov_infer_request/cancellation.cpp | 19 +++++ .../infer_request_dynamic.cpp | 2 +- .../behavior/ov_infer_request/io_tensor.cpp | 82 +++++++++++++++++++ .../ov_infer_request/multithreading.cpp | 39 +++++++++ .../ov_infer_request/perf_counters.cpp | 65 +++++++++++++++ .../behavior/ov_infer_request/wait.cpp | 43 ++++++++++ .../skip_tests_config.cpp | 2 + 11 files changed, 299 insertions(+), 2 deletions(-) create mode 100644 src/tests/functional/plugin/myriad/shared_tests_instances/behavior/ov_infer_request/callback.cpp create mode 100644 src/tests/functional/plugin/myriad/shared_tests_instances/behavior/ov_infer_request/cancellation.cpp rename src/tests/functional/plugin/myriad/shared_tests_instances/behavior/{ => ov_infer_request}/infer_request_dynamic.cpp (98%) create mode 100644 src/tests/functional/plugin/myriad/shared_tests_instances/behavior/ov_infer_request/io_tensor.cpp create mode 100644 src/tests/functional/plugin/myriad/shared_tests_instances/behavior/ov_infer_request/multithreading.cpp create mode 100644 src/tests/functional/plugin/myriad/shared_tests_instances/behavior/ov_infer_request/perf_counters.cpp create mode 100644 src/tests/functional/plugin/myriad/shared_tests_instances/behavior/ov_infer_request/wait.cpp diff --git a/inference-engine/src/vpu/common/include/vpu/configuration/plugin_configuration.hpp b/inference-engine/src/vpu/common/include/vpu/configuration/plugin_configuration.hpp index 5d4786f321c..8383c82932d 100644 --- a/inference-engine/src/vpu/common/include/vpu/configuration/plugin_configuration.hpp +++ b/inference-engine/src/vpu/common/include/vpu/configuration/plugin_configuration.hpp @@ -118,7 +118,7 @@ public: const std::string& operator[](const std::string& key) const; InferenceEngine::Parameter asParameter(const std::string& key) const; - + std::unordered_map getValues() const {return values;} virtual void validate() 
const; private: diff --git a/inference-engine/src/vpu/myriad_plugin/myriad_executable_network.cpp b/inference-engine/src/vpu/myriad_plugin/myriad_executable_network.cpp index 83e48a22595..45e4add8603 100644 --- a/inference-engine/src/vpu/myriad_plugin/myriad_executable_network.cpp +++ b/inference-engine/src/vpu/myriad_plugin/myriad_executable_network.cpp @@ -236,6 +236,15 @@ InferenceEngine::Parameter ExecutableNetwork::GetMetric(const std::string &name) } } +InferenceEngine::Parameter ExecutableNetwork::GetConfig(const std::string &name) const { + auto confValues = _config.getValues(); + auto it = confValues.find(name); + if (it != confValues.end()) { + return it->second; + } + VPU_THROW_EXCEPTION << "Unsupported ExecutableNetwork config key: " << name; +} + std::shared_ptr ExecutableNetwork::GetExecGraphInfo() { auto perfInfo = _executor->getPerfTimeInfo(_graphDesc._graphHandle); if (_graphDesc._name == importedNetworkName) diff --git a/inference-engine/src/vpu/myriad_plugin/myriad_executable_network.h b/inference-engine/src/vpu/myriad_plugin/myriad_executable_network.h index ceb54714f28..69af0fca273 100644 --- a/inference-engine/src/vpu/myriad_plugin/myriad_executable_network.h +++ b/inference-engine/src/vpu/myriad_plugin/myriad_executable_network.h @@ -101,6 +101,7 @@ public: } ie::Parameter GetMetric(const std::string &name) const override; + InferenceEngine::Parameter GetConfig(const std::string &name) const override; std::shared_ptr GetExecGraphInfo() override; diff --git a/src/tests/functional/plugin/myriad/shared_tests_instances/behavior/ov_infer_request/callback.cpp b/src/tests/functional/plugin/myriad/shared_tests_instances/behavior/ov_infer_request/callback.cpp new file mode 100644 index 00000000000..c9aaa7b5fb6 --- /dev/null +++ b/src/tests/functional/plugin/myriad/shared_tests_instances/behavior/ov_infer_request/callback.cpp @@ -0,0 +1,37 @@ +// Copyright (C) 2018-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include + 
+#include "behavior/ov_infer_request/callback.hpp" + +using namespace ov::test::behavior; + +namespace { +const std::vector> configs = { + {}, +}; + +const std::vector> multiConfigs = { + {{ MULTI_CONFIG_KEY(DEVICE_PRIORITIES) , CommonTestUtils::DEVICE_MYRIAD}} +}; + +INSTANTIATE_TEST_SUITE_P(smoke_BehaviorTests, OVInferRequestCallbackTests, + ::testing::Combine( + ::testing::Values(CommonTestUtils::DEVICE_MYRIAD), + ::testing::ValuesIn(configs)), + OVInferRequestCallbackTests::getTestCaseName); + +INSTANTIATE_TEST_SUITE_P(smoke_Multi_BehaviorTests, OVInferRequestCallbackTests, + ::testing::Combine( + ::testing::Values(CommonTestUtils::DEVICE_MULTI), + ::testing::ValuesIn(multiConfigs)), + OVInferRequestCallbackTests::getTestCaseName); + +INSTANTIATE_TEST_SUITE_P(smoke_Auto_BehaviorTests, OVInferRequestCallbackTests, + ::testing::Combine( + ::testing::Values(CommonTestUtils::DEVICE_AUTO), + ::testing::ValuesIn(multiConfigs)), + OVInferRequestCallbackTests::getTestCaseName); +} // namespace diff --git a/src/tests/functional/plugin/myriad/shared_tests_instances/behavior/ov_infer_request/cancellation.cpp b/src/tests/functional/plugin/myriad/shared_tests_instances/behavior/ov_infer_request/cancellation.cpp new file mode 100644 index 00000000000..e00764a32ca --- /dev/null +++ b/src/tests/functional/plugin/myriad/shared_tests_instances/behavior/ov_infer_request/cancellation.cpp @@ -0,0 +1,19 @@ +// Copyright (C) 2018-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "behavior/ov_infer_request/cancellation.hpp" + +using namespace ov::test::behavior; + +namespace { +const std::vector> configs = { + {}, +}; + +INSTANTIATE_TEST_SUITE_P(smoke_BehaviorTests, OVInferRequestCancellationTests, + ::testing::Combine( + ::testing::Values(CommonTestUtils::DEVICE_MYRIAD), + ::testing::ValuesIn(configs)), + OVInferRequestCancellationTests::getTestCaseName); +} // namespace diff --git 
a/src/tests/functional/plugin/myriad/shared_tests_instances/behavior/infer_request_dynamic.cpp b/src/tests/functional/plugin/myriad/shared_tests_instances/behavior/ov_infer_request/infer_request_dynamic.cpp similarity index 98% rename from src/tests/functional/plugin/myriad/shared_tests_instances/behavior/infer_request_dynamic.cpp rename to src/tests/functional/plugin/myriad/shared_tests_instances/behavior/ov_infer_request/infer_request_dynamic.cpp index a7558eebe09..7d2989d3044 100644 --- a/src/tests/functional/plugin/myriad/shared_tests_instances/behavior/infer_request_dynamic.cpp +++ b/src/tests/functional/plugin/myriad/shared_tests_instances/behavior/ov_infer_request/infer_request_dynamic.cpp @@ -23,7 +23,7 @@ const std::vector> configs = { }; const std::vector> HeteroConfigs = { - {{"TARGET_FALLBACK", CommonTestUtils::DEVICE_CPU}} + {{"TARGET_FALLBACK", CommonTestUtils::DEVICE_MYRIAD}} }; std::shared_ptr getFunction1() { diff --git a/src/tests/functional/plugin/myriad/shared_tests_instances/behavior/ov_infer_request/io_tensor.cpp b/src/tests/functional/plugin/myriad/shared_tests_instances/behavior/ov_infer_request/io_tensor.cpp new file mode 100644 index 00000000000..79808ac0acf --- /dev/null +++ b/src/tests/functional/plugin/myriad/shared_tests_instances/behavior/ov_infer_request/io_tensor.cpp @@ -0,0 +1,82 @@ +// Copyright (C) 2018-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include + +#include "behavior/ov_infer_request/io_tensor.hpp" + +using namespace ov::test::behavior; + +namespace { +const std::vector> configs = { + {}, +}; + +const std::vector> Multiconfigs = { + {{ MULTI_CONFIG_KEY(DEVICE_PRIORITIES) , CommonTestUtils::DEVICE_MYRIAD}} +}; + +const std::vector> Autoconfigs = { + {{ MULTI_CONFIG_KEY(DEVICE_PRIORITIES) , CommonTestUtils::DEVICE_MYRIAD}} +}; + +INSTANTIATE_TEST_SUITE_P(smoke_BehaviorTests, OVInferRequestIOTensorTest, + ::testing::Combine( + ::testing::Values(CommonTestUtils::DEVICE_MYRIAD), + 
::testing::ValuesIn(configs)), + OVInferRequestIOTensorTest::getTestCaseName); + +INSTANTIATE_TEST_SUITE_P(smoke_Multi_BehaviorTests, OVInferRequestIOTensorTest, + ::testing::Combine( + ::testing::Values(CommonTestUtils::DEVICE_MULTI), + ::testing::ValuesIn(Multiconfigs)), + OVInferRequestIOTensorTest::getTestCaseName); + +INSTANTIATE_TEST_SUITE_P(smoke_Auto_BehaviorTests, OVInferRequestIOTensorTest, + ::testing::Combine( + ::testing::Values(CommonTestUtils::DEVICE_AUTO), + ::testing::ValuesIn(Autoconfigs)), + OVInferRequestIOTensorTest::getTestCaseName); + +std::vector prcs = { + ov::element::boolean, + ov::element::bf16, + ov::element::f16, + ov::element::f32, + ov::element::f64, + ov::element::i4, + ov::element::i8, + ov::element::i16, + ov::element::i32, + ov::element::i64, + ov::element::u1, + ov::element::u4, + ov::element::u8, + ov::element::u16, + ov::element::u32, + ov::element::u64, +}; + +INSTANTIATE_TEST_SUITE_P(smoke_BehaviorTests, OVInferRequestIOTensorSetPrecisionTest, + ::testing::Combine( + ::testing::ValuesIn(prcs), + ::testing::Values(CommonTestUtils::DEVICE_MYRIAD), + ::testing::ValuesIn(configs)), + OVInferRequestIOTensorSetPrecisionTest::getTestCaseName); + +INSTANTIATE_TEST_SUITE_P(smoke_Multi_BehaviorTests, OVInferRequestIOTensorSetPrecisionTest, + ::testing::Combine( + ::testing::ValuesIn(prcs), + ::testing::Values(CommonTestUtils::DEVICE_MULTI), + ::testing::ValuesIn(Multiconfigs)), + OVInferRequestIOTensorSetPrecisionTest::getTestCaseName); + +INSTANTIATE_TEST_SUITE_P(smoke_Auto_BehaviorTests, OVInferRequestIOTensorSetPrecisionTest, + ::testing::Combine( + ::testing::ValuesIn(prcs), + ::testing::Values(CommonTestUtils::DEVICE_AUTO), + ::testing::ValuesIn(Autoconfigs)), + OVInferRequestIOTensorSetPrecisionTest::getTestCaseName); + +} // namespace diff --git a/src/tests/functional/plugin/myriad/shared_tests_instances/behavior/ov_infer_request/multithreading.cpp 
b/src/tests/functional/plugin/myriad/shared_tests_instances/behavior/ov_infer_request/multithreading.cpp new file mode 100644 index 00000000000..f65b67b3e94 --- /dev/null +++ b/src/tests/functional/plugin/myriad/shared_tests_instances/behavior/ov_infer_request/multithreading.cpp @@ -0,0 +1,39 @@ +// Copyright (C) 2018-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include + +#include "behavior/ov_infer_request/multithreading.hpp" + +using namespace ov::test::behavior; + +namespace { + +const std::vector> configs = { + {}, +}; + +const std::vector> Multiconfigs = { + {{ MULTI_CONFIG_KEY(DEVICE_PRIORITIES) , CommonTestUtils::DEVICE_MYRIAD}} +}; + +INSTANTIATE_TEST_SUITE_P(smoke_BehaviorTests, OVInferRequestMultithreadingTests, + ::testing::Combine( + ::testing::Values(CommonTestUtils::DEVICE_MYRIAD), + ::testing::ValuesIn(configs)), + OVInferRequestMultithreadingTests::getTestCaseName); + +INSTANTIATE_TEST_SUITE_P(smoke_Multi_BehaviorTests, OVInferRequestMultithreadingTests, + ::testing::Combine( + ::testing::Values(CommonTestUtils::DEVICE_MULTI), + ::testing::ValuesIn(Multiconfigs)), + OVInferRequestMultithreadingTests::getTestCaseName); + +INSTANTIATE_TEST_SUITE_P(smoke_Auto_BehaviorTests, OVInferRequestMultithreadingTests, + ::testing::Combine( + ::testing::Values(CommonTestUtils::DEVICE_AUTO), + ::testing::ValuesIn(Multiconfigs)), + OVInferRequestMultithreadingTests::getTestCaseName); + +} // namespace diff --git a/src/tests/functional/plugin/myriad/shared_tests_instances/behavior/ov_infer_request/perf_counters.cpp b/src/tests/functional/plugin/myriad/shared_tests_instances/behavior/ov_infer_request/perf_counters.cpp new file mode 100644 index 00000000000..add84985315 --- /dev/null +++ b/src/tests/functional/plugin/myriad/shared_tests_instances/behavior/ov_infer_request/perf_counters.cpp @@ -0,0 +1,65 @@ +// Copyright (C) 2018-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include 
"behavior/ov_infer_request/perf_counters.hpp" +#include +#include + +using namespace ov::test::behavior; + +namespace { +TEST_P(OVInferRequestPerfCountersTest, CheckOperationInProfilingInfo) { + req = execNet.create_infer_request(); + ASSERT_NO_THROW(req.infer()); + std::vector profiling_info; + ASSERT_NO_THROW(profiling_info = req.get_profiling_info()); + for (const auto& op : function->get_ops()) { + if (ov::is_type(op) || ov::is_type(op)) + continue; + auto op_is_in_profiling_info = std::any_of(std::begin(profiling_info), std::end(profiling_info), + [&] (const ov::runtime::ProfilingInfo& info) { + if (info.node_name.find(op->get_friendly_name() + "_") != std::string::npos || info.node_name == op->get_friendly_name()) { + return true; + } else { + return false; + } + }); + if (!op_is_in_profiling_info) { + std::cout << "Node w/o prof info: " << op->get_friendly_name() + << ", type : " << op->get_type_name() << std::endl; + } + ASSERT_TRUE(op_is_in_profiling_info) << "For op: " << op; + } +} + +const std::vector> configs = { + {} +}; + +const std::vector> Multiconfigs = { + {{ MULTI_CONFIG_KEY(DEVICE_PRIORITIES) , CommonTestUtils::DEVICE_MYRIAD}} +}; + +const std::vector> Autoconfigs = { + {{ MULTI_CONFIG_KEY(DEVICE_PRIORITIES) , CommonTestUtils::DEVICE_MYRIAD}} +}; + +INSTANTIATE_TEST_SUITE_P(smoke_BehaviorTests, OVInferRequestPerfCountersTest, + ::testing::Combine( + ::testing::Values(CommonTestUtils::DEVICE_MYRIAD), + ::testing::ValuesIn(configs)), + OVInferRequestPerfCountersTest::getTestCaseName); + +INSTANTIATE_TEST_SUITE_P(smoke_Multi_BehaviorTests, OVInferRequestPerfCountersTest, + ::testing::Combine( + ::testing::Values(CommonTestUtils::DEVICE_MULTI), + ::testing::ValuesIn(Multiconfigs)), + OVInferRequestPerfCountersTest::getTestCaseName); + +INSTANTIATE_TEST_SUITE_P(smoke_Auto_BehaviorTests, OVInferRequestPerfCountersTest, + ::testing::Combine( + ::testing::Values(CommonTestUtils::DEVICE_AUTO), + ::testing::ValuesIn(Autoconfigs)), + 
OVInferRequestPerfCountersTest::getTestCaseName); +} // namespace diff --git a/src/tests/functional/plugin/myriad/shared_tests_instances/behavior/ov_infer_request/wait.cpp b/src/tests/functional/plugin/myriad/shared_tests_instances/behavior/ov_infer_request/wait.cpp new file mode 100644 index 00000000000..987dacb8806 --- /dev/null +++ b/src/tests/functional/plugin/myriad/shared_tests_instances/behavior/ov_infer_request/wait.cpp @@ -0,0 +1,43 @@ +// Copyright (C) 2018-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include + +#include "behavior/ov_infer_request/wait.hpp" + +using namespace ov::test::behavior; + +namespace { + +const std::vector> configs = { + {}, +}; + +const std::vector> Multiconfigs = { + {{ MULTI_CONFIG_KEY(DEVICE_PRIORITIES) , CommonTestUtils::DEVICE_MYRIAD}} +}; + +const std::vector> Autoconfigs = { + {{ MULTI_CONFIG_KEY(DEVICE_PRIORITIES) , CommonTestUtils::DEVICE_MYRIAD}} +}; + +INSTANTIATE_TEST_SUITE_P(smoke_BehaviorTests, OVInferRequestWaitTests, + ::testing::Combine( + ::testing::Values(CommonTestUtils::DEVICE_MYRIAD), + ::testing::ValuesIn(configs)), + OVInferRequestWaitTests::getTestCaseName); + +INSTANTIATE_TEST_SUITE_P(smoke_Multi_BehaviorTests, OVInferRequestWaitTests, + ::testing::Combine( + ::testing::Values(CommonTestUtils::DEVICE_MULTI), + ::testing::ValuesIn(Multiconfigs)), + OVInferRequestWaitTests::getTestCaseName); + +INSTANTIATE_TEST_SUITE_P(smoke_Auto_BehaviorTests, OVInferRequestWaitTests, + ::testing::Combine( + ::testing::Values(CommonTestUtils::DEVICE_AUTO), + ::testing::ValuesIn(Autoconfigs)), + OVInferRequestWaitTests::getTestCaseName); + +} // namespace diff --git a/src/tests/functional/plugin/myriad/shared_tests_instances/skip_tests_config.cpp b/src/tests/functional/plugin/myriad/shared_tests_instances/skip_tests_config.cpp index aaacb7b8df1..2d11cc32c60 100644 --- a/src/tests/functional/plugin/myriad/shared_tests_instances/skip_tests_config.cpp +++ 
b/src/tests/functional/plugin/myriad/shared_tests_instances/skip_tests_config.cpp @@ -24,6 +24,8 @@ std::vector disabledTestPatterns() { ".*InferDynamicNetworkWithSetTensor2times.*", ".*InferRequestDynamicTests.GetSameTensor2times.*", ".*InferRequestDynamicTests.InferDynamicNetworkWithSetTensor.*", + // TODO: Issue: 67972 + R"(.*Hetero.*InferRequestDynamicTests.*)", // TODO: Issue: 26268 ".*ConcatLayerTest.*axis=0.*", // TODO: Issue 31197 From bd82e8d000ec3a512468bb676e46e760af3f2f14 Mon Sep 17 00:00:00 2001 From: azhogov Date: Tue, 28 Dec 2021 12:07:32 +0300 Subject: [PATCH 03/78] Fix stress test install: remove requirements-caffe2.in --- tests/stress_tests/scripts/get_testdata.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/stress_tests/scripts/get_testdata.py b/tests/stress_tests/scripts/get_testdata.py index be716c71d16..6e90924a156 100755 --- a/tests/stress_tests/scripts/get_testdata.py +++ b/tests/stress_tests/scripts/get_testdata.py @@ -148,7 +148,6 @@ def main(): requirements = [ args.mo_tool.parents[3] / "requirements.txt", omz_path / "tools" / "model_tools" / "requirements.in", - omz_path / "tools" / "model_tools" / "requirements-caffe2.in", omz_path / "tools" / "model_tools" / "requirements-pytorch.in" ] Venv.create_n_install_requirements(*requirements) From 04bb8bb9bbe62b01b0ef67bbf7a17a2d288d2aa1 Mon Sep 17 00:00:00 2001 From: Vladimir Dudnik Date: Tue, 28 Dec 2021 15:58:09 +0300 Subject: [PATCH 04/78] [IE Samples] fix hello classification cpp (#9450) * fix image file read error message when sample built w/o opencv * code style and use model inputs/outputs instead of parameters and results --- samples/cpp/hello_classification/main.cpp | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/samples/cpp/hello_classification/main.cpp b/samples/cpp/hello_classification/main.cpp index f5e67989909..4175bf57e0b 100644 --- a/samples/cpp/hello_classification/main.cpp +++ b/samples/cpp/hello_classification/main.cpp @@ -4,6 +4,7 @@ 
#include #include +#include #include #include @@ -44,8 +45,8 @@ int tmain(int argc, tchar* argv[]) { std::shared_ptr model = core.read_model(model_path); printInputAndOutputsInfo(*model); - OPENVINO_ASSERT(model->get_parameters().size() == 1, "Sample supports models with 1 input only"); - OPENVINO_ASSERT(model->get_results().size() == 1, "Sample supports models with 1 output only"); + OPENVINO_ASSERT(model->inputs().size() == 1, "Sample supports models with 1 input only"); + OPENVINO_ASSERT(model->outputs().size() == 1, "Sample supports models with 1 output only"); // -------- Step 3. Set up input @@ -53,8 +54,9 @@ int tmain(int argc, tchar* argv[]) { // without resize and layout conversions FormatReader::ReaderPtr reader(image_path.c_str()); if (reader.get() == nullptr) { - slog::warn << "Image " + image_path + " cannot be read!" << slog::endl; - throw std::logic_error(""); + std::stringstream ss; + ss << "Image " + image_path + " cannot be read!"; + throw std::logic_error(ss.str()); } ov::element::Type input_type = ov::element::u8; From ecbeff460a700c6f2d3bb9b47533effac0511c36 Mon Sep 17 00:00:00 2001 From: Roman Lyamin Date: Tue, 28 Dec 2021 16:12:15 +0300 Subject: [PATCH 05/78] [GPU] Fix check binary size in SetBlob (#9418) --- .../intel_gpu/src/plugin/infer_request.cpp | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/src/plugins/intel_gpu/src/plugin/infer_request.cpp b/src/plugins/intel_gpu/src/plugin/infer_request.cpp index 97a56184387..a746ce9b50e 100644 --- a/src/plugins/intel_gpu/src/plugin/infer_request.cpp +++ b/src/plugins/intel_gpu/src/plugin/infer_request.cpp @@ -242,8 +242,14 @@ void InferRequest::SetBlob(const std::string& name, const Blob::Ptr& data) { size_t netReqBinSize = std::accumulate(desc.getDims().begin(), desc.getDims().end(), desc.getPrecision().size(), std::multiplies()); + bool preProcResize = false; + if (is_input) { + preProcResize = foundInput->getPreProcess().getResizeAlgorithm() != 
ResizeAlgorithm::NO_RESIZE; + const auto inputColorFormat = foundInput->getPreProcess().getColorFormat(); + preProcResize |= (inputColorFormat != ColorFormat::RAW) && (inputColorFormat != ColorFormat::BGR); + } - if (dataBinSize != netReqBinSize && !compoundBlobPassed) { + if (dataBinSize != netReqBinSize && !compoundBlobPassed && !preProcResize) { IE_THROW() << "Incorrect binary data size for " << (is_input ? "input" : "output") << " blob with name: \'" << name << "\' " << "Current: " << dataBinSize << " Required: " << netReqBinSize; @@ -417,7 +423,13 @@ void InferRequest::SetBlobs(const std::string& name, const std::vector()); - if (dataBinSize != netReqBinSize) { + bool preProcResize = false; + if (is_input) { + preProcResize = foundInput->getPreProcess().getResizeAlgorithm() != ResizeAlgorithm::NO_RESIZE; + const auto inputColorFormat = foundInput->getPreProcess().getColorFormat(); + preProcResize |= (inputColorFormat != ColorFormat::RAW) && (inputColorFormat != ColorFormat::BGR); + } + if (dataBinSize != netReqBinSize && !preProcResize) { IE_THROW() << "Incorrect binary data size for input blobs with name: \'" << name << "\' " << "Current: " << dataBinSize << " Required: " << netReqBinSize; } From 0803684f9e636f6f7afa5cdd44f107227a2c84ca Mon Sep 17 00:00:00 2001 From: Ivan Novoselov Date: Tue, 28 Dec 2021 16:22:45 +0300 Subject: [PATCH 06/78] [Snippets] Support decreasing output shapes (#9446) --- src/common/snippets/src/op/subgraph.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/common/snippets/src/op/subgraph.cpp b/src/common/snippets/src/op/subgraph.cpp index d9a2fc34301..5082c1cb950 100644 --- a/src/common/snippets/src/op/subgraph.cpp +++ b/src/common/snippets/src/op/subgraph.cpp @@ -264,8 +264,8 @@ snippets::Schedule snippets::op::Subgraph::generate(const BlockedShapeVector& ou for (size_t i = 0; i < work_size.size(); i++) { if (work_size[i] != shape[i]) { - if (work_size[i] == 1) { - work_size[i] = shape[i]; + if 
(work_size[i] == 1 || shape[i] == 1) { + work_size[i] = max(work_size[i], shape[i]); } else { throw ngraph_error("incompatible shapes for output graphs"); } From c6bc4d004528fec516382a92beba8f98d64c9a4c Mon Sep 17 00:00:00 2001 From: hyunback kim Date: Tue, 28 Dec 2021 23:00:26 +0900 Subject: [PATCH 07/78] [GPU] Fix debug_config build failed issue. (#9441) --- src/plugins/intel_gpu/src/plugin/plugin.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/plugins/intel_gpu/src/plugin/plugin.cpp b/src/plugins/intel_gpu/src/plugin/plugin.cpp index b2892e3caeb..f402d362c27 100644 --- a/src/plugins/intel_gpu/src/plugin/plugin.cpp +++ b/src/plugins/intel_gpu/src/plugin/plugin.cpp @@ -664,6 +664,7 @@ static float GetGOPS(cldnn::device_info info, cldnn::data_types dt) { Parameter Plugin::GetMetric(const std::string& name, const std::map& options) const { OV_ITT_SCOPED_TASK(itt::domains::intel_gpu_plugin, "Plugin::GetMetric"); + GPU_DEBUG_GET_INSTANCE(debug_config); std::string device_id = GetConfig(CONFIG_KEY(DEVICE_ID), options); auto iter = device_map.find(device_id); @@ -831,7 +832,6 @@ Parameter Plugin::GetMetric(const std::string& name, const std::mapm_configs.GetConfig(device_id); uint32_t n_streams = static_cast(config.throughput_streams); uint64_t occupied_device_mem = 0; From da1261a1d8d210843edab04cbbef4cb02ac32a6b Mon Sep 17 00:00:00 2001 From: Jade Cho Date: Tue, 28 Dec 2021 23:48:29 +0900 Subject: [PATCH 08/78] [GPU] Fix a bug in post-operation optimization (#9443) --- .../impls/onednn/primitive_onednn_base.h | 5 ++- .../src/graph/include/program_node.h | 5 ++- .../src/graph/include/to_string_utils.h | 5 ++- .../intel_gpu/src/graph/program_node.cpp | 45 ++++++++++++++++--- 4 files changed, 51 insertions(+), 9 deletions(-) diff --git a/src/plugins/intel_gpu/src/graph/impls/onednn/primitive_onednn_base.h b/src/plugins/intel_gpu/src/graph/impls/onednn/primitive_onednn_base.h index a7b266b00b4..fcb6d356194 100644 --- 
a/src/plugins/intel_gpu/src/graph/impls/onednn/primitive_onednn_base.h +++ b/src/plugins/intel_gpu/src/graph/impls/onednn/primitive_onednn_base.h @@ -128,7 +128,10 @@ protected: case onednn_post_op_type::sum: case onednn_post_op_type::optimized_sum: - case onednn_post_op_type::optimized_eltwise: + case onednn_post_op_type::optimized_eltwise_linear: + case onednn_post_op_type::optimized_eltwise_act: + case onednn_post_op_type::optimized_eltwise_round: + case onednn_post_op_type::optimized_eltwise_clip: { break; } diff --git a/src/plugins/intel_gpu/src/graph/include/program_node.h b/src/plugins/intel_gpu/src/graph/include/program_node.h index f7e9ba5e712..258c26ce713 100644 --- a/src/plugins/intel_gpu/src/graph/include/program_node.h +++ b/src/plugins/intel_gpu/src/graph/include/program_node.h @@ -47,7 +47,10 @@ enum class onednn_post_op_type : uint32_t { scale, sum, optimized, - optimized_eltwise, + optimized_eltwise_act, + optimized_eltwise_clip, + optimized_eltwise_linear, + optimized_eltwise_round, optimized_sum }; diff --git a/src/plugins/intel_gpu/src/graph/include/to_string_utils.h b/src/plugins/intel_gpu/src/graph/include/to_string_utils.h index 8cd459d2eb8..1c1bd6099ac 100644 --- a/src/plugins/intel_gpu/src/graph/include/to_string_utils.h +++ b/src/plugins/intel_gpu/src/graph/include/to_string_utils.h @@ -289,7 +289,10 @@ inline std::string onednn_post_op_type_to_str(onednn_post_op_type type) { case onednn_post_op_type::scale: return "scale"; case onednn_post_op_type::sum: return "sum"; case onednn_post_op_type::optimized: return "optimized"; - case onednn_post_op_type::optimized_eltwise: return "optimized_eltwise"; + case onednn_post_op_type::optimized_eltwise_act: return "optimized_eltwise_act"; + case onednn_post_op_type::optimized_eltwise_linear: return "optimized_eltwise_linear"; + case onednn_post_op_type::optimized_eltwise_clip: return "optimized_eltwise_clip"; + case onednn_post_op_type::optimized_eltwise_round: return "optimized_eltwise_round"; case 
onednn_post_op_type::optimized_sum: return "optimized_sum"; default: return "unknown"; } diff --git a/src/plugins/intel_gpu/src/graph/program_node.cpp b/src/plugins/intel_gpu/src/graph/program_node.cpp index 089f9f2619d..767d1e80218 100644 --- a/src/plugins/intel_gpu/src/graph/program_node.cpp +++ b/src/plugins/intel_gpu/src/graph/program_node.cpp @@ -378,7 +378,10 @@ dnnl::post_ops program_node::try_optimize_post_ops(dnnl::post_ops& p_ops, const case onednn_post_op_type::optimized: case onednn_post_op_type::optimized_sum: - case onednn_post_op_type::optimized_eltwise: + case onednn_post_op_type::optimized_eltwise_act: + case onednn_post_op_type::optimized_eltwise_linear: + case onednn_post_op_type::optimized_eltwise_clip: + case onednn_post_op_type::optimized_eltwise_round: { // Current operation already has been optimized => don't need extra actions break; @@ -392,7 +395,10 @@ dnnl::post_ops program_node::try_optimize_post_ops(dnnl::post_ops& p_ops, const // Check that post-op type is any optimized auto type_is_any_optimized = [](onednn_post_op_type type) -> bool { return type == onednn_post_op_type::optimized || type == onednn_post_op_type::optimized_sum || - type == onednn_post_op_type::optimized_eltwise; + type == onednn_post_op_type::optimized_eltwise_act || + type == onednn_post_op_type::optimized_eltwise_linear || + type == onednn_post_op_type::optimized_eltwise_clip || + type == onednn_post_op_type::optimized_eltwise_round; }; // Check that post-op type is eltwise @@ -409,13 +415,28 @@ dnnl::post_ops program_node::try_optimize_post_ops(dnnl::post_ops& p_ops, const // Simple post-op type checks auto type_is_optimized = [](onednn_post_op_type type) -> bool { return type == onednn_post_op_type::optimized; }; auto type_is_eltwise_linear = [](onednn_post_op_type type) -> bool { return type == onednn_post_op_type::eltwise_linear; }; - auto type_is_optimized_eltwise = [](onednn_post_op_type type) -> bool { return type == onednn_post_op_type::optimized_eltwise; }; 
+ auto type_is_optimized_eltwise = [](onednn_post_op_type type) -> bool { + return type == onednn_post_op_type::optimized_eltwise_act || type == onednn_post_op_type::optimized_eltwise_linear || + type == onednn_post_op_type::optimized_eltwise_round || type == onednn_post_op_type::optimized_eltwise_clip; + }; auto type_is_binary_add = [](onednn_post_op_type type) -> bool { return type == onednn_post_op_type::binary_add; }; auto type_is_binary_mul = [](onednn_post_op_type type) -> bool { return type == onednn_post_op_type::binary_mul; }; auto type_is_sum = [](onednn_post_op_type type) -> bool { return type == onednn_post_op_type::sum; }; auto type_is_optimized_sum = [](onednn_post_op_type type) -> bool { return type == onednn_post_op_type::optimized_sum; }; auto type_is_scale = [](onednn_post_op_type type) -> bool { return type == onednn_post_op_type::scale; }; + auto get_eltwise_type = [](onednn_post_op_type type) { + switch (type) { + case onednn_post_op_type::optimized_eltwise_act: return onednn_post_op_type::eltwise_act; + case onednn_post_op_type::optimized_eltwise_clip: return onednn_post_op_type::eltwise_clip; + case onednn_post_op_type::optimized_eltwise_linear: return onednn_post_op_type::eltwise_linear; + case onednn_post_op_type::optimized_eltwise_round: return onednn_post_op_type::eltwise_round; + default: + throw std::runtime_error("Unsupported optimized eltwise post-operation type"); + break; + } + }; + auto& cur_post_ops = get_fused_primitives_onednn(); size_t cur_post_op_idx = 1; @@ -427,7 +448,7 @@ dnnl::post_ops program_node::try_optimize_post_ops(dnnl::post_ops& p_ops, const if (type_is_optimized_sum(cur_post_ops[post_op_idx].op_type)) cur_post_ops[post_op_idx].op_type = onednn_post_op_type::sum; else if (type_is_optimized_eltwise(cur_post_ops[post_op_idx].op_type)) - cur_post_ops[post_op_idx].op_type = onednn_post_op_type::eltwise_linear; + cur_post_ops[post_op_idx].op_type = get_eltwise_type(cur_post_ops[post_op_idx].op_type); else if 
(type_is_optimized(cur_post_ops[post_op_idx].op_type)) cur_post_ops.erase(cur_post_ops.begin() + post_op_idx); } @@ -435,6 +456,18 @@ dnnl::post_ops program_node::try_optimize_post_ops(dnnl::post_ops& p_ops, const // Get post-ops size for current node auto post_ops_size = cur_post_ops.size(); + auto get_optimized_eltwise_type = [](onednn_post_op_type type) { + switch (type) { + case onednn_post_op_type::eltwise_linear: return onednn_post_op_type::optimized_eltwise_linear; + case onednn_post_op_type::eltwise_act: return onednn_post_op_type::optimized_eltwise_act; + case onednn_post_op_type::eltwise_round: return onednn_post_op_type::optimized_eltwise_round; + case onednn_post_op_type::eltwise_clip: return onednn_post_op_type::optimized_eltwise_clip; + default: + throw std::runtime_error("Unsupported optimized eltwise post-operation type"); + break; + } + }; + // Try to combine pairs of arithmetic post-ops (adds and muls) into one operation inside this cycle while (!optimization_done) { auto cur_type = cur_post_ops[cur_post_op_idx].op_type; @@ -516,7 +549,7 @@ dnnl::post_ops program_node::try_optimize_post_ops(dnnl::post_ops& p_ops, const if (eltw_linear_and_eltw_linear || eltw_linear_and_eltw_non_linear) { // Marked current and previous eltwise operations as 'optimized' (they will be ignored on the next iteration of cycle) cur_post_ops[cur_post_op_idx].op_type = onednn_post_op_type::optimized; - cur_post_ops[prev_post_op_idx].op_type = onednn_post_op_type::optimized_eltwise; + cur_post_ops[prev_post_op_idx].op_type = get_optimized_eltwise_type(prev_type); // Set the flag if extra optimizations checking is needed if (cur_post_op_idx < post_ops_size - 1) { @@ -649,7 +682,7 @@ dnnl::post_ops program_node::try_optimize_post_ops(dnnl::post_ops& p_ops, const add_post_op(cur_type, sum_p_op, optimized_p_ops, 0); // Marked current, previous and next operations as 'optimized' (they will be ignored on the next iteration of cycle) - cur_post_ops[prev_post_op_idx].op_type = 
onednn_post_op_type::optimized_eltwise; + cur_post_ops[prev_post_op_idx].op_type = get_optimized_eltwise_type(prev_type); cur_post_ops[cur_post_op_idx].op_type = onednn_post_op_type::optimized_sum; cur_post_ops[next_post_op_idx].op_type = onednn_post_op_type::optimized; From b2aff0cd56463bfa9ed6fa8d4a881e33669496fc Mon Sep 17 00:00:00 2001 From: Roman Kazantsev Date: Tue, 28 Dec 2021 17:49:48 +0300 Subject: [PATCH 09/78] [MO SDL] Test MO and IR Reader on attacking inputs (#8947) * Test MO and IR Reader on attacking inputs Signed-off-by: Roman Kazantsev * Add test to check IR Reader against untrusted well-formed IR Signed-off-by: Roman Kazantsev * Refactor IR Reader tests with corrupted IR Signed-off-by: Roman Kazantsev * Test for regular expression denial of service Signed-off-by: Roman Kazantsev * Remove undesired word like bomb Signed-off-by: Roman Kazantsev * Move tests to new location Signed-off-by: Roman Kazantsev * Use correct import Signed-off-by: Roman Kazantsev * Revert blank line Signed-off-by: Roman Kazantsev --- .../mo/unit_tests/mo/utils/cli_parser_test.py | 6 + .../mo/utils/ir_reader/restore_graph_test.py | 141 +++++++++++++++--- 2 files changed, 123 insertions(+), 24 deletions(-) diff --git a/tools/mo/unit_tests/mo/utils/cli_parser_test.py b/tools/mo/unit_tests/mo/utils/cli_parser_test.py index d9e5b3e1d69..f5f34d8c184 100644 --- a/tools/mo/unit_tests/mo/utils/cli_parser_test.py +++ b/tools/mo/unit_tests/mo/utils/cli_parser_test.py @@ -773,6 +773,12 @@ class TestShapesParsing(unittest.TestCase): input_shapes = "(-12,4,1),(4,6,8)" self.assertRaises(Error, get_placeholder_shapes, argv_input, input_shapes) + def test_get_shapes_long_dimension_with_invalid_character(self): + # test for regular expression denial of service + argv_input = "inp1,inp2" + input_shapes = "(222222222222222222222222222222222222222222!,4,1),(4,6,8)" + self.assertRaises(Error, get_placeholder_shapes, argv_input, input_shapes) + def test_get_shapes_one_input_any_neg_shape(self): 
argv_input = "inp1, inp2" input_shapes = "(12,4,1),(4,-6,8)" diff --git a/tools/mo/unit_tests/mo/utils/ir_reader/restore_graph_test.py b/tools/mo/unit_tests/mo/utils/ir_reader/restore_graph_test.py index dfb9fc3693b..5a71897a0d8 100644 --- a/tools/mo/unit_tests/mo/utils/ir_reader/restore_graph_test.py +++ b/tools/mo/unit_tests/mo/utils/ir_reader/restore_graph_test.py @@ -4,33 +4,126 @@ import os import tempfile import unittest - from defusedxml.common import EntitiesForbidden - from openvino.tools.mo.utils.ir_reader.restore_graph import restore_graph_from_ir class TestIRReader(unittest.TestCase): - def setUp(self): - self.xml_bomb = b'\n' \ - b'\n' \ - b' \n' \ - b' \n' \ - b' \n' \ - b' \n' \ - b' \n' \ - b' \n' \ - b' \n' \ - b' \n' \ - b' \n' \ - b' \n' \ - b']>\n' \ - b'&lol9;' + def test_read_xml_incorrect(self): + incorrect_xml = b'\n' \ + b'\n' \ + b' \n' \ + b' \n' \ + b' \n' \ + b' \n' \ + b' \n' \ + b' \n' \ + b' \n' \ + b' \n' \ + b' \n' \ + b' \n' \ + b']>\n' \ + b'&lol9;' - def test_read_xml_bomb(self): - bomb_file = tempfile.NamedTemporaryFile(delete=False) - bomb_file.write(self.xml_bomb) - bomb_file.close() - self.assertRaises(EntitiesForbidden, restore_graph_from_ir, bomb_file.name) - os.remove(bomb_file.name) + incorrect_xml_file = tempfile.NamedTemporaryFile(delete=False) + incorrect_xml_file.write(incorrect_xml) + incorrect_xml_file.close() + self.assertRaises(EntitiesForbidden, restore_graph_from_ir, incorrect_xml_file.name) + os.remove(incorrect_xml_file.name) + + def test_read_untrusted_IR(self): + untrusted_xml = b'\n' \ + b'\n' \ + b'\n' \ + b']>\n' \ + b'&xxe;\n' + + untrusted_xml_file = tempfile.NamedTemporaryFile(delete=False) + untrusted_xml_file.write(untrusted_xml) + untrusted_xml_file.close() + self.assertRaises(EntitiesForbidden, restore_graph_from_ir, untrusted_xml_file.name) + os.remove(untrusted_xml_file.name) + + def test_read_malformed_IR(self): + ir_front = b'' \ + b'' \ + b' ' \ + b' ' \ + b' ' \ + b' ' \ + b' ' \ + b' 1' \ + 
b' 3' \ + b' 22' \ + b' 22' \ + b' ' \ + b' ' \ + b' ' \ + + ir_front_malformed = b'' \ + b'' \ + b' ' \ + b' ' \ + b' ' \ + b' ' \ + b' ' \ + b' 1' \ + b' 3' \ + b' 22' \ + b' 22' \ + b' ' \ + b' ' \ + b' ' \ + + ir_end = b' ' \ + b' ' \ + b' ' \ + b' 1' \ + b' 3' \ + b' 22' \ + b' 22' \ + b' ' \ + b' ' \ + b' ' \ + b' ' \ + b' 1' \ + b' 3' \ + b' 22' \ + b' 22' \ + b' ' \ + b' ' \ + b' ' \ + b' ' \ + b' ' \ + b' ' \ + b' 1' \ + b' 3' \ + b' 22' \ + b' 22' \ + b' ' \ + b' ' \ + b' ' \ + b' ' \ + b' ' \ + b' ' \ + b' ' \ + b' ' \ + b'' \ + + normal_ir_ir = ir_front + ir_end + normal_ir_file = tempfile.NamedTemporaryFile(delete=False) + normal_ir_file.write(normal_ir_ir) + normal_ir_file.close() + # we must expect no exceptions + restore_graph_from_ir(normal_ir_file.name) + os.remove(normal_ir_file.name) + + # expect that IR Reader complains on IR with malformed port id + malformed_ir = ir_front_malformed + ir_end + malformed_ir_file = tempfile.NamedTemporaryFile(delete=False) + malformed_ir_file.write(malformed_ir) + malformed_ir_file.close() + self.assertRaises(ValueError, restore_graph_from_ir, malformed_ir_file.name) + os.remove(malformed_ir_file.name) From 3cef5134950012aea6358bdd14f46ff9dce55d09 Mon Sep 17 00:00:00 2001 From: Milana Shhanukova Date: Tue, 28 Dec 2021 19:08:36 +0400 Subject: [PATCH 10/78] [POT] Check type for layers' renaming (#9276) * change in installation * develop mode * change in install guide * style change * change declare * add type checking * revert changes * rename directly in nx_model * Update README_dev.md --- tools/pot/openvino/tools/pot/graph/nx_model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/pot/openvino/tools/pot/graph/nx_model.py b/tools/pot/openvino/tools/pot/graph/nx_model.py index 24b862ec923..c1631d41859 100644 --- a/tools/pot/openvino/tools/pot/graph/nx_model.py +++ b/tools/pot/openvino/tools/pot/graph/nx_model.py @@ -186,7 +186,7 @@ class NXModel: for model_dict in self._models: model_name, 
model = model_dict['name'], model_dict['model'] for node in ge.get_all_operation_nodes(model, recursively=False): - rename_node(node, f'{model_name}_{node.name}') + node.name = f'{model_name}_{node.name}' def _remove_models_prefix(self): """Removes model name prefix from node names""" From 3e6951c1da683252b4d31badeb797199116fd246 Mon Sep 17 00:00:00 2001 From: Anton Chetverikov Date: Wed, 29 Dec 2021 00:59:48 +0300 Subject: [PATCH 11/78] [MO] Support for common rt_info attribute in MO IR Reader (#9272) * Support for common rt_info attribute in MO IR Reader * Add missed change * Moved back wrong change * Change attr name * Add support for rt_info for out ports * Add emitting for rt_info * Fix restoration error * Add support for rt_info for input ports * Add more comments * Set correct layout attr to restored graph --- .../tools/mo/back/SpecialNodesFinalization.py | 4 +- .../tools/mo/back/ie_ir_ver_2/emitter.py | 32 +++++++-- .../tools/mo/middle/passes/eliminate.py | 2 + .../tools/mo/utils/ir_engine/ir_engine.py | 67 ++++++++++++++++--- .../mo/utils/ir_reader/layer_to_class.py | 7 +- 5 files changed, 96 insertions(+), 16 deletions(-) diff --git a/tools/mo/openvino/tools/mo/back/SpecialNodesFinalization.py b/tools/mo/openvino/tools/mo/back/SpecialNodesFinalization.py index 8013eb506f5..f5cac6bfed5 100644 --- a/tools/mo/openvino/tools/mo/back/SpecialNodesFinalization.py +++ b/tools/mo/openvino/tools/mo/back/SpecialNodesFinalization.py @@ -8,11 +8,12 @@ from collections import defaultdict import numpy as np from openvino.tools.mo.back.pass_separator import BackFinish -from openvino.tools.mo.ops.tensor_iterator import TensorIterator from openvino.tools.mo.back.replacement import BackReplacementPattern from openvino.tools.mo.graph.graph import Graph from openvino.tools.mo.ops.const import Const +from openvino.tools.mo.ops.tensor_iterator import TensorIterator from openvino.tools.mo.utils.error import Error +from openvino.tools.mo.utils.runtime_info import RTInfo from 
openvino.tools.mo.utils.utils import refer_to_faq_msg @@ -86,6 +87,7 @@ class CreateConstNodesReplacement(BackReplacementPattern): 'override_output_shape': node.has_valid('force_shape'), 'force_type': node.soft_get('force_type', None), 'correct_data_type': node.soft_get('correct_data_type', None), + 'rt_info': node.soft_get('rt_info', RTInfo()), }).create_node() const_node.add_input_port(0) graph.add_edges_from([(const_node_name, node.id, {'out': 0})]) diff --git a/tools/mo/openvino/tools/mo/back/ie_ir_ver_2/emitter.py b/tools/mo/openvino/tools/mo/back/ie_ir_ver_2/emitter.py index 5061badfe55..085dc73d991 100644 --- a/tools/mo/openvino/tools/mo/back/ie_ir_ver_2/emitter.py +++ b/tools/mo/openvino/tools/mo/back/ie_ir_ver_2/emitter.py @@ -140,6 +140,21 @@ def xml_ports(node: Node, element: Element, edges: Element): assert node.graph.node[u]['shape'] is not None, 'Input shape is not calculated properly for node {}'.format( node.id) xml_shape(node.graph.node[u]['shape'], port) + + # support saving rt_info passed from IR Reader + port_id = d['in'] + if node.has('restored_input_ports') and port_id in node.restored_input_ports: + port_rt_info_value = node.restored_input_ports[port_id][2] + if port_rt_info_value != {}: + port_rt_info = SubElement(port, 'rt_info') + for (name, version), info_elem in port_rt_info_value.items(): + attribute = SubElement(port_rt_info, 'attribute') + attribute.set('name', name) + attribute.set('version', str(version)) + params = info_elem.serialize(node) if not isinstance(info_elem, dict) else info_elem + for key, value in params.items(): + attribute.set(key, value) + # u is a data node that has a single producer, let's find it assert (node.graph.node[u]['kind'] == 'data') in_nodes = list(node.graph.in_edges(u, data=True)) @@ -176,6 +191,18 @@ def xml_ports(node: Node, element: Element, edges: Element): port.set('names', ','.join(tensor_names)) xml_shape(node.graph.node[v]['shape'], port) + # support saving rt_info passed from IR Reader + if 
node.has('ports') and port_id in node.ports: + port_rt_info_value = node.ports[port_id][2] + if port_rt_info_value != []: + port_rt_info = SubElement(port, 'rt_info') + for (name, version), info_elem in port_rt_info_value.items(): + attribute = SubElement(port_rt_info, 'attribute') + attribute.set('name', name) + attribute.set('version', str(version)) + params = info_elem.serialize(node) if not isinstance(info_elem, dict) else info_elem + for key, value in params.items(): + attribute.set(key, value) def xml_consts(graph: Graph, node: Node, element: Element): blobs = None # sub-element that will be created on-demand @@ -258,10 +285,7 @@ def serialize_runtime_info(node, parent_element: Element): attribute = SubElement(rt_info, 'attribute') attribute.set('name', name) attribute.set('version', str(version)) - params = info_elem.serialize(node) - if len(params) == 0: - rt_info.remove(attribute) - continue + params = info_elem.serialize(node) if not isinstance(info_elem, dict) else info_elem for key, value in params.items(): attribute.set(key, value) if len(rt_info.attrib) == 0 and len(list(rt_info)) == 0: diff --git a/tools/mo/openvino/tools/mo/middle/passes/eliminate.py b/tools/mo/openvino/tools/mo/middle/passes/eliminate.py index 60b6dc69534..edf4e45fb1c 100644 --- a/tools/mo/openvino/tools/mo/middle/passes/eliminate.py +++ b/tools/mo/openvino/tools/mo/middle/passes/eliminate.py @@ -144,6 +144,7 @@ def add_constant_operations(graph): if len(node.in_nodes()) == 0 and len(node.out_nodes()) != 0: # It's necessary to import here due to cycle dependencies from openvino.tools.mo.ops.const import Const + from openvino.tools.mo.utils.runtime_info import RTInfo name = node.soft_get('name', node.id) new_name = re.sub(r'\/Output_\d+\/Data_(.?)+', '', name) const_node = Const(graph, dict(value=node.value, name=new_name, @@ -151,6 +152,7 @@ def add_constant_operations(graph): override_output_shape=node.has_valid('force_shape'), force_type=node.soft_get('force_type', None), 
correct_data_type=node.soft_get('correct_data_type', False), + rt_info=node.soft_get('rt_info', RTInfo()), )).create_node() graph.add_edges_from([(const_node.id, node.id, {'out': 0})]) diff --git a/tools/mo/openvino/tools/mo/utils/ir_engine/ir_engine.py b/tools/mo/openvino/tools/mo/utils/ir_engine/ir_engine.py index 40b2dbd4549..0049ed97e66 100644 --- a/tools/mo/openvino/tools/mo/utils/ir_engine/ir_engine.py +++ b/tools/mo/openvino/tools/mo/utils/ir_engine/ir_engine.py @@ -5,14 +5,13 @@ import hashlib import logging as log import os import sys - -from defusedxml import defuse_stdlib -import defusedxml.ElementTree as ET from argparse import Namespace -from collections import namedtuple, defaultdict +from collections import namedtuple, defaultdict, OrderedDict from pathlib import Path +import defusedxml.ElementTree as ET import numpy as np +from defusedxml import defuse_stdlib from openvino.tools.mo.front.common.partial_infer.utils import dynamic_dimension_value, shape_array from openvino.tools.mo.graph.graph import Node, Graph @@ -59,7 +58,9 @@ class IREngine(object): self.graph.graph['hashes'] = {} self.graph.graph['ir_version'] = int(xml_root.attrib['version']) if xml_root.attrib.get('version') is not None else None - self.graph.graph['layout'] = 'NCHW' + self.graph.graph['layout'] = 'NCHW' # We set layout to NCHW as default value and + # changing it in __rt_info_check_layout if it will be necessary + self.graph.name = xml_root.attrib['name'] if xml_root.attrib.get('name') is not None else None # Parse XML @@ -204,7 +205,7 @@ class IREngine(object): layer_id = layer.attrib['id'] layer_attrs = layer.attrib - layer_attrs.update({'ports': {}, 'kind': 'op'}) + layer_attrs.update({'ports': {}, 'restored_input_ports': {}, 'kind': 'op'}) inputs_counter = 0 @@ -224,22 +225,50 @@ class IREngine(object): layer_attrs.update(new_attrs) elif attr.tag == 'input': inputs_counter = len(attr) + + input = attr + for port in input: + port_id = int(port.attrib['id']) + input_shape = 
[] + port_rt_info = {} + for dim in port: + if dim.tag == "dim": + input_shape.append(int(dim.text)) + if dim.tag == 'rt_info': + for attr in dim: + port_rt_info.update(self.__read_rt_info_common(attr)) + self.__rt_info_check_layout(attr) + + input_shape = shape_array([d if d != -1 else dynamic_dimension_value for d in input_shape]) + + in_tensor_names = None + if 'names' in port.attrib: + in_tensor_names = port.attrib['names'] + + # special attribute to pass information about operation input ports + layer_attrs['restored_input_ports'].update({port_id: (input_shape, in_tensor_names, port_rt_info)}) elif attr.tag == 'output': output = attr for port in output: port_id = int(port.attrib['id']) output_shape = [] + port_rt_info = {} for dim in port: if dim.tag == "dim": output_shape.append(int(dim.text)) + if dim.tag == 'rt_info': + for attr in dim: + port_rt_info.update(self.__read_rt_info_common(attr)) + self.__rt_info_check_layout(attr) output_shape = shape_array([d if d != -1 else dynamic_dimension_value for d in output_shape]) out_tensor_names = None if 'names' in port.attrib: out_tensor_names = port.attrib['names'] - - layer_attrs['ports'].update({port_id: (output_shape, out_tensor_names)}) + # special attribute to pass information about operation input ports + # NOTE: renaming or structure changing of this attribute may have big impact on tests + layer_attrs['ports'].update({port_id: (output_shape, out_tensor_names, port_rt_info)}) elif attr.tag == 'blobs': in_port = inputs_counter for blob_attr in attr: @@ -460,8 +489,10 @@ class IREngine(object): attr_name = attr.attrib['name'] if attr_name == 'old_api_map_order': rt_info.info.update(self.__read_old_api_map_order(attr, layer.attrib['type'])) - if attr_name == 'old_api_map_element_type': + elif attr_name == 'old_api_map_element_type': rt_info.info.update(self.__read_old_api_map_element_type(attr, layer.attrib['type'])) + else: + rt_info.info.update((self.__read_rt_info_common(attr))) 
layer_attrs.update({'rt_info': rt_info}) return layer_attrs @@ -487,3 +518,21 @@ class IREngine(object): old_api_map = OldAPIMapElementType(version=version) old_api_map.set_legacy_type(element_type) return {('old_api_map_element_type', version): old_api_map} + + @staticmethod + def __read_rt_info_common(attr): + attr_name = attr.attrib['name'] + version = int(attr.attrib['version']) + rt_info = OrderedDict() + for key in attr.attrib: + if key not in ('name', 'version'): + rt_info[key] = attr.attrib[key] + return {(attr_name, version): rt_info} + + def __rt_info_check_layout(self, attr): + graph_layout = None + for key in attr.attrib: + if key == 'layout': + graph_layout = attr.attrib[key].replace(',', '').strip('[] ')# .strip(']').strip(',').strip(' ') + if graph_layout is not None: + self.graph.graph['layout'] = graph_layout diff --git a/tools/mo/openvino/tools/mo/utils/ir_reader/layer_to_class.py b/tools/mo/openvino/tools/mo/utils/ir_reader/layer_to_class.py index e207478e1ae..90a31257134 100644 --- a/tools/mo/openvino/tools/mo/utils/ir_reader/layer_to_class.py +++ b/tools/mo/openvino/tools/mo/utils/ir_reader/layer_to_class.py @@ -278,8 +278,8 @@ def restore_tensor_names(op: Node): for out_port in op.ports: # op.ports is our internal attribute, dictionary, where keys are numbers of output ports # and values are tuples with shape and tensor name: - # {out_port_idx_1: (out_port_idx_1_shape, out_port_idx_1_tensor_name), - # out_port_idx_2: (out_port_idx_2_shape, out_port_idx_2_tensor_name)} + # {out_port_idx_1: (out_port_idx_1_shape, out_port_idx_1_tensor_name, out_port_idx_1_rt_info), + # out_port_idx_2: (out_port_idx_2_shape, out_port_idx_2_tensor_name, out_port_idx_2_rt_info)} out_tensor_names = op.ports[out_port][1] # handle Constant operations with old style output port numbering @@ -405,6 +405,9 @@ def copy_graph_with_ops(graph: Graph) -> Graph: 'Const node {} not properly corrected to appropriate data node'.format(op.soft_get('name')) 
op.out_node(0)['correct_data_type'] = True + if op.has_and_set('rt_info'): + op.out_node(0)['rt_info'] = op.rt_info + restore_tensor_names(op) # operations postprocessing with some special types From a51a735d9fcc679c36b7e21bdb1de5835c4fac6a Mon Sep 17 00:00:00 2001 From: Pavel Esir Date: Wed, 29 Dec 2021 01:33:49 +0300 Subject: [PATCH 12/78] [MO] cli_parser.py fix to accept scalar value for freezing (#9395) * cli_parser.py fix to accept scalar value for freezing * update cli help * fixed unit-tests, clarified help for specifying data type * typos correction --- .../convert_model/Converting_Model.md | 30 ++- .../mo/front/freeze_placeholder_value.py | 5 +- .../mo/openvino/tools/mo/utils/cli_parser.py | 25 +- .../mo/unit_tests/mo/utils/cli_parser_test.py | 223 ++++++++++-------- 4 files changed, 167 insertions(+), 116 deletions(-) diff --git a/docs/MO_DG/prepare_model/convert_model/Converting_Model.md b/docs/MO_DG/prepare_model/convert_model/Converting_Model.md index 6b384ea81a4..a8a22189f18 100644 --- a/docs/MO_DG/prepare_model/convert_model/Converting_Model.md +++ b/docs/MO_DG/prepare_model/convert_model/Converting_Model.md @@ -101,18 +101,24 @@ Framework-agnostic parameters: Parameter -> ReverseInputChannels -> Mean/Scale apply -> the original body of the model. --log_level {CRITICAL,ERROR,WARN,WARNING,INFO,DEBUG,NOTSET} Logger level - --input INPUT Quoted list of comma-separated input nodes names with - shapes, data types, and values for freezing. The shape - and value are specified as space-separated lists. The - data type of input node is specified in braces and can - have one of the values: f64 (float64), f32 (float32), - f16 (float16), i64 (int64), i32 (int32), u8 (uint8), - boolean. 
For example, use the following format to set - input port 0 of the node `node_name1` with the shape - [3 4] as an input node and freeze output port 1 of the - node `node_name2` with the value [20 15] of the int32 - type and shape [2]: "0:node_name1[3 - 4],node_name2:1[2]{i32}->[20 15]". + --input INPUT Quoted list of comma-separated input nodes names with shapes, + data types, and values for freezing. The shape and value are + specified as space-separated lists. The data type of input + node is specified in braces and can have one of the values: + f64 (float64), f32 (float32), f16 (float16), i64 (int64), + i32 (int32), u8 (uint8), boolean (bool). Data type is optional. + If it's not specified explicitly then there are two options: + if input node is a parameter, data type is taken from the + original node dtype, if input node is not a parameter, data type + is set to f32. Example, to set `input_1` with shape [1 100], + and Parameter node `sequence_len` with scalar input with value `150`, + and boolean input `is_training` with `False` value use the + following format: "input_1[1 10],sequence_len->150,is_training->False". + Another example, use the following format to set input port 0 + of the node `node_name1` with the shape [3 4] as an input node + and freeze output port 1 of the node `node_name2` with the + value [20 15] of the int32 type and shape [2]: + "0:node_name1[3 4],node_name2:1[2]{i32}->[20 15]". --output OUTPUT The name of the output operation of the model. For TensorFlow*, do not add :0 to this name. 
--mean_values MEAN_VALUES, -ms MEAN_VALUES diff --git a/tools/mo/openvino/tools/mo/front/freeze_placeholder_value.py b/tools/mo/openvino/tools/mo/front/freeze_placeholder_value.py index 9f9beb545d8..a307333a560 100644 --- a/tools/mo/openvino/tools/mo/front/freeze_placeholder_value.py +++ b/tools/mo/openvino/tools/mo/front/freeze_placeholder_value.py @@ -48,6 +48,7 @@ class FreezePlaceholderValue(FrontReplacementSubgraph): try: if data_type != np.bool: value = mo_array(string_value, dtype=data_type) + # TODO: investigate why boolean type is allowed only for TensorFlow elif data_type == np.bool and graph.graph['fw'] == 'tf': from openvino.tools.mo.front.tf.common import tf_data_type_cast if isinstance(string_value, list): @@ -58,9 +59,9 @@ class FreezePlaceholderValue(FrontReplacementSubgraph): else: value = tf_data_type_cast[ph.data_type](string_value) else: - raise Error("Can not cast value {} to {} data_type".format(string_value, data_type)) + raise Error("Cannot cast value {} to {} data_type".format(string_value, data_type)) except: - raise Error("Can not cast value {} to {} data_type".format(string_value, data_type)) + raise Error("Cannot cast value {} to {} data_type".format(string_value, data_type)) try: value = np.reshape(a=value, newshape=ph.shape) except: diff --git a/tools/mo/openvino/tools/mo/utils/cli_parser.py b/tools/mo/openvino/tools/mo/utils/cli_parser.py index 604e2fc3f12..a9ff24193ee 100644 --- a/tools/mo/openvino/tools/mo/utils/cli_parser.py +++ b/tools/mo/openvino/tools/mo/utils/cli_parser.py @@ -289,10 +289,18 @@ def get_common_cli_parser(parser: argparse.ArgumentParser = None): common_group.add_argument('--input', help='Quoted list of comma-separated input nodes names with shapes, data types, ' 'and values for freezing. The shape and value are specified as space-separated ' - 'lists. 
The data type of input node is specified in braces and can have one of the ' - 'values: f64 (float64), f32 (float32), f16 (float16), i64 (int64), i32 (int32), u8 ' - '(uint8), boolean. For example, use the following format to set input port 0 of the ' - 'node `node_name1` with the shape [3 4] as an input node and freeze output port 1 ' + 'lists. The data type of input node is specified in braces and ' + 'can have one of the values: f64 (float64), f32 (float32), f16 (float16), ' + 'i64 (int64), i32 (int32), u8 (uint8), boolean (bool). Data type is optional. ' + 'If it\'s not specified explicitly then there are two options: ' + 'if input node is a parameter, data type is taken from the original node dtype, ' + 'if input node is not a parameter, data type is set to f32. ' + 'Example, to set `input_1` with shape [1 100], and Parameter node `sequence_len` ' + 'with scalar input with value `150`, and boolean input `is_training` with ' + '`False` value use the following format: ' + '"input_1[1 10],sequence_len->150,is_training->False". ' + 'Another example, use the following format to set input port 0 of the node ' + '`node_name1` with the shape [3 4] as an input node and freeze output port 1 ' 'of the node `node_name2` with the value [20 15] of the int32 type and shape [2]: ' '"0:node_name1[3 4],node_name2:1[2]{i32}->[20 15]".') common_group.add_argument('--output', @@ -796,9 +804,11 @@ def get_shape_from_input_value(input_value: str): input_value = input_value.split('->')[0] # parse shape - shape = re.findall(r'[(\[]([0-9\.\? -]+)[)\]]', input_value) + shape = re.findall(r'[(\[]([0-9\.\? 
-]*)[)\]]', input_value) if len(shape) == 0: shape = None + elif len(shape) == 1 and shape[0] in ['', ' ']: + shape = () elif len(shape) == 1: shape = tuple(map(parse_dimension, shape[0].split(' '))) else: @@ -856,7 +866,7 @@ def parse_input_value(input_value: str): data_type = get_data_type_from_input_value(input_value) node_name = get_node_name_with_port_from_input_value(input_value) value = get_value_from_input_value(input_value) - shape = get_shape_from_input_value(input_value.split('->')[0]) + shape = get_shape_from_input_value(input_value) value_size = np.prod(len(value)) if isinstance(value, list) else 1 if value is not None and shape is not None: @@ -1170,6 +1180,9 @@ def get_placeholder_shapes(argv_input: str, argv_input_shape: str, argv_batch=No if '->' not in inp: continue shape = placeholder_shapes[inp.split('->')[0]] + + if shape is None: + continue for dim in shape: if isinstance(dim, tuple) or dim == -1: raise Error("Cannot freeze input with dynamic shape: {}".format(shape)) diff --git a/tools/mo/unit_tests/mo/utils/cli_parser_test.py b/tools/mo/unit_tests/mo/utils/cli_parser_test.py index f5f34d8c184..5f79ad9008a 100644 --- a/tools/mo/unit_tests/mo/utils/cli_parser_test.py +++ b/tools/mo/unit_tests/mo/utils/cli_parser_test.py @@ -10,12 +10,11 @@ import unittest from unittest.mock import patch import numpy as np -import numpy.testing as npt from openvino.tools.mo.utils.cli_parser import get_placeholder_shapes, get_tuple_values, get_mean_scale_dictionary, \ get_model_name, \ parse_tuple_pairs, check_positive, writable_dir, readable_dirs, \ - readable_file, get_freeze_placeholder_values, parse_transform, check_available_transforms, get_layout_values + readable_file, get_freeze_placeholder_values, parse_transform, check_available_transforms, get_layout_values, get_data_type_from_input_value from openvino.tools.mo.utils.error import Error @@ -28,7 +27,7 @@ class TestingMeanScaleGetter(unittest.TestCase): 'info': np.array([2.2, 33.33, 444.444]) } for el 
in exp_res.keys(): - npt.assert_array_equal(result[el], exp_res[el]) + assert np.array_equal(result[el], exp_res[el]) def test_tuple_parser_name_digits_only(self): tuple_values = "0448(1.1,22.22,333.333),0449[2.2,33.33,444.444]" @@ -38,7 +37,7 @@ class TestingMeanScaleGetter(unittest.TestCase): '0449': np.array([2.2, 33.33, 444.444]) } for el in exp_res.keys(): - npt.assert_array_equal(result[el], exp_res[el]) + assert np.array_equal(result[el], exp_res[el]) def test_tuple_parser_same_values(self): tuple_values = "data(1.1,22.22,333.333),info[1.1,22.22,333.333]" @@ -48,7 +47,7 @@ class TestingMeanScaleGetter(unittest.TestCase): 'info': np.array([1.1, 22.22, 333.333]) } for el in exp_res.keys(): - npt.assert_array_equal(result[el], exp_res[el]) + assert np.array_equal(result[el], exp_res[el]) def test_tuple_parser_no_inputs(self): tuple_values = "(1.1,22.22,333.333),[2.2,33.33,444.444]" @@ -56,7 +55,7 @@ class TestingMeanScaleGetter(unittest.TestCase): exp_res = [np.array([1.1, 22.22, 333.333]), np.array([2.2, 33.33, 444.444])] for i in range(0, len(exp_res)): - npt.assert_array_equal(result[i], exp_res[i]) + assert np.array_equal(result[i], exp_res[i]) def test_tuple_parser_error_mixed_with_and_without_name(self): tuple_values = "(1.1,22.22,333.333),data[2.2,33.33,444.444]" @@ -91,7 +90,7 @@ class TestingMeanScaleGetter(unittest.TestCase): for input in exp_res.keys(): for key in exp_res[input].keys(): if type(exp_res[input][key]) is np.ndarray: - npt.assert_array_equal(exp_res[input][key], result[input][key]) + assert np.array_equal(exp_res[input][key], result[input][key]) else: self.assertEqual(exp_res[input][key], result[input][key]) @@ -112,7 +111,7 @@ class TestingMeanScaleGetter(unittest.TestCase): for input in exp_res.keys(): for key in exp_res[input].keys(): if type(exp_res[input][key]) is np.ndarray: - npt.assert_array_equal(exp_res[input][key], result[input][key]) + assert np.array_equal(exp_res[input][key], result[input][key]) else: 
self.assertEqual(exp_res[input][key], result[input][key]) @@ -128,7 +127,7 @@ class TestingMeanScaleGetter(unittest.TestCase): for input in exp_res.keys(): for key in exp_res[input].keys(): if type(exp_res[input][key]) is np.ndarray: - npt.assert_array_equal(exp_res[input][key], result[input][key]) + assert np.array_equal(exp_res[input][key], result[input][key]) else: self.assertEqual(exp_res[input][key], result[input][key]) @@ -144,7 +143,7 @@ class TestingMeanScaleGetter(unittest.TestCase): for input in exp_res.keys(): for key in exp_res[input].keys(): if type(exp_res[input][key]) is np.ndarray: - npt.assert_array_equal(exp_res[input][key], result[input][key]) + assert np.array_equal(exp_res[input][key], result[input][key]) else: self.assertEqual(exp_res[input][key], result[input][key]) @@ -163,7 +162,7 @@ class TestingMeanScaleGetter(unittest.TestCase): for i in range(len(exp_res)): for j in range(len(exp_res[i])): if type(exp_res[i][j]) is np.ndarray: - npt.assert_array_equal(exp_res[i][j], result[i][j]) + assert np.array_equal(exp_res[i][j], result[i][j]) else: self.assertEqual(exp_res[i][j], result[i][j]) @@ -182,7 +181,7 @@ class TestingMeanScaleGetter(unittest.TestCase): for input in exp_res.keys(): for key in exp_res[input].keys(): if type(exp_res[input][key]) is np.ndarray: - npt.assert_array_equal(exp_res[input][key], result[input][key]) + assert np.array_equal(exp_res[input][key], result[input][key]) else: self.assertEqual(exp_res[input][key], result[input][key]) @@ -205,7 +204,7 @@ class TestingMeanScaleGetter(unittest.TestCase): for input in exp_res.keys(): for key in exp_res[input].keys(): if type(exp_res[input][key]) is np.ndarray: - npt.assert_array_equal(exp_res[input][key], result[input][key]) + assert np.array_equal(exp_res[input][key], result[input][key]) else: self.assertEqual(exp_res[input][key], result[input][key]) @@ -228,7 +227,7 @@ class TestingMeanScaleGetter(unittest.TestCase): for input in exp_res.keys(): for key in 
exp_res[input].keys(): if type(exp_res[input][key]) is np.ndarray: - npt.assert_array_equal(exp_res[input][key], result[input][key]) + assert np.array_equal(exp_res[input][key], result[input][key]) else: self.assertEqual(exp_res[input][key], result[input][key]) @@ -247,7 +246,7 @@ class TestingMeanScaleGetter(unittest.TestCase): for input in exp_res.keys(): for key in exp_res[input].keys(): if type(exp_res[input][key]) is np.ndarray: - npt.assert_array_equal(exp_res[input][key], result[input][key]) + assert np.array_equal(exp_res[input][key], result[input][key]) else: self.assertEqual(exp_res[input][key], result[input][key]) @@ -270,7 +269,7 @@ class TestingMeanScaleGetter(unittest.TestCase): for i in range(len(exp_res)): for j in range(len(exp_res[i])): if type(exp_res[i][j]) is np.ndarray: - npt.assert_array_equal(exp_res[i][j], result[i][j]) + assert np.array_equal(exp_res[i][j], result[i][j]) else: self.assertEqual(exp_res[i][j], result[i][j]) @@ -291,7 +290,7 @@ class TestingMeanScaleGetter(unittest.TestCase): for input in exp_res.keys(): for key in exp_res[input].keys(): if type(exp_res[input][key]) is np.ndarray: - npt.assert_array_equal(exp_res[input][key], result[input][key]) + assert np.array_equal(exp_res[input][key], result[input][key]) else: self.assertEqual(exp_res[input][key], result[input][key]) @@ -313,7 +312,7 @@ class TestingMeanScaleGetter(unittest.TestCase): for input in exp_res.keys(): for key in exp_res[input].keys(): if type(exp_res[input][key]) is np.ndarray: - npt.assert_array_equal(exp_res[input][key], result[input][key]) + assert np.array_equal(exp_res[input][key], result[input][key]) else: self.assertEqual(exp_res[input][key], result[input][key]) @@ -335,7 +334,7 @@ class TestingMeanScaleGetter(unittest.TestCase): for input in exp_res.keys(): for key in exp_res[input].keys(): if type(exp_res[input][key]) is np.ndarray: - npt.assert_array_equal(exp_res[input][key], result[input][key]) + assert np.array_equal(exp_res[input][key], 
result[input][key]) else: self.assertEqual(exp_res[input][key], result[input][key]) @@ -355,7 +354,7 @@ class TestingMeanScaleGetter(unittest.TestCase): ] for i in range(0, len(exp_res)): for j in range(0, len(exp_res[i])): - npt.assert_array_equal(exp_res[i][j], result[i][j]) + assert np.array_equal(exp_res[i][j], result[i][j]) def test_scale_do_not_match_input(self): scale_values = parse_tuple_pairs("input_not_present(255),input2(255)") @@ -376,7 +375,7 @@ class TestingMeanScaleGetter(unittest.TestCase): self.assertEqual(len(exp_res), len(res_values)) for i, j in zip(exp_res, res_values): self.assertEqual(i, j) - npt.assert_array_equal(exp_res[i], res_values[j]) + assert np.array_equal(exp_res[i], res_values[j]) def test_input_without_values(self): self.assertRaises(Error, parse_tuple_pairs, "input1,input2") @@ -452,7 +451,7 @@ class TestShapesParsing(unittest.TestCase): exp_res = {'inp1': np.array([1, 22, 333, 123]), 'inp2': np.array([-1, 45, 7, 1])} self.assertEqual(list(exp_res.keys()), list(result.keys())) for i in exp_res.keys(): - npt.assert_array_equal(result[i], exp_res[i]) + assert np.array_equal(result[i], exp_res[i]) def test_get_shapes_several_inputs_several_shapes2(self): # shapes specified using --input command line parameter and no values @@ -461,13 +460,45 @@ class TestShapesParsing(unittest.TestCase): exp_res = {'inp1': np.array([1, 22, 333, 123]), 'inp2': np.array([-1, 45, 7, 1])} self.assertEqual(list(exp_res.keys()), list(result.keys())) for i in exp_res.keys(): - npt.assert_array_equal(result[i], exp_res[i]) + assert np.array_equal(result[i], exp_res[i]) placeholder_values_res, input_node_names_res = get_freeze_placeholder_values(argv_input, None) placeholder_values_ref = {} input_node_names_ref = "inp1,inp2" self.assertEqual(list(placeholder_values_res.keys()), list(placeholder_values_ref.keys())) for i in placeholder_values_ref.keys(): - npt.assert_array_equal(placeholder_values_res[i], placeholder_values_ref[i]) + assert 
np.array_equal(placeholder_values_res[i], placeholder_values_ref[i]) + + def test_get_shapes_and_freezing_with_scalar_and_without_shapes_in_input(self): + # shapes and value for freezing specified using --input command line parameter + argv_input = "inp1,inp2->157" + result_shapes, _ = get_placeholder_shapes(argv_input, None) + ref_shapes = {'inp1': None, 'inp2': None} + self.assertEqual(list(ref_shapes.keys()), list(result_shapes.keys())) + for i in ref_shapes.keys(): + assert np.array_equal(result_shapes[i], ref_shapes[i]) + + placeholder_values_res, input_node_names_res = get_freeze_placeholder_values(argv_input, None) + placeholder_values_ref = {'inp2': 157} + + self.assertEqual(list(placeholder_values_res.keys()), list(placeholder_values_ref.keys())) + for i in placeholder_values_ref.keys(): + self.assertEqual(placeholder_values_res[i], placeholder_values_ref[i]) + + def test_get_shapes_and_freezing_with_scalar(self): + # shapes and value for freezing specified using --input command line parameter + argv_input = "inp1,inp2[]->157" + result_shapes, _ = get_placeholder_shapes(argv_input, None) + ref_shapes = {'inp1': None, 'inp2': ()} + self.assertEqual(list(ref_shapes.keys()), list(result_shapes.keys())) + for i in ref_shapes.keys(): + assert np.array_equal(result_shapes[i], ref_shapes[i]) + + placeholder_values_res, input_node_names_res = get_freeze_placeholder_values(argv_input, None) + placeholder_values_ref = {'inp2': 157} + + self.assertEqual(list(placeholder_values_res.keys()), list(placeholder_values_ref.keys())) + for i in placeholder_values_ref.keys(): + self.assertEqual(placeholder_values_res[i], placeholder_values_ref[i]) def test_get_shapes_several_inputs_several_shapes3(self): # shapes and value for freezing specified using --input command line parameter @@ -476,14 +507,14 @@ class TestShapesParsing(unittest.TestCase): exp_res = {'inp1': np.array([3, 1]), 'inp2': np.array([3, 2, 3]), 'inp3': np.array([5])} self.assertEqual(list(exp_res.keys()), 
list(result.keys())) for i in exp_res.keys(): - npt.assert_array_equal(result[i], exp_res[i]) + assert np.array_equal(result[i], exp_res[i]) placeholder_values_res, input_node_names_res = get_freeze_placeholder_values(argv_input, None) placeholder_values_ref = {'inp1': np.array(['1.0', '2.0', '3.0']), 'inp3': np.array(['1.0', '1.0', '2.0', '3.0', '5.0'])} input_node_names_ref = "inp1,inp2,inp3" self.assertEqual(list(placeholder_values_res.keys()), list(placeholder_values_ref.keys())) for i in placeholder_values_ref.keys(): - npt.assert_array_equal(placeholder_values_res[i], placeholder_values_ref[i]) + assert np.array_equal(placeholder_values_res[i], placeholder_values_ref[i]) def test_get_shapes_several_inputs_several_shapes4(self): # shapes specified using --input_shape and values for freezing using --input command line parameter @@ -493,14 +524,14 @@ class TestShapesParsing(unittest.TestCase): exp_res = {'inp1': np.array([3, 1]), 'inp2': np.array([3, 2, 3]), 'inp3': np.array([5])} self.assertEqual(list(exp_res.keys()), list(result.keys())) for i in exp_res.keys(): - npt.assert_array_equal(result[i], exp_res[i]) + assert np.array_equal(result[i], exp_res[i]) placeholder_values_res, input_node_names_res = get_freeze_placeholder_values(argv_input, None) placeholder_values_ref = {'inp1': np.array(['1.0', '2.0', '3.0']), 'inp3': np.array(['1.0', '1.0', '2.0', '3.0', '5.0'])} input_node_names_ref = "inp1,inp2,inp3" self.assertEqual(list(placeholder_values_res.keys()), list(placeholder_values_ref.keys())) for i in placeholder_values_ref.keys(): - npt.assert_array_equal(placeholder_values_res[i], placeholder_values_ref[i]) + assert np.array_equal(placeholder_values_res[i], placeholder_values_ref[i]) self.assertEqual(input_node_names_ref, input_node_names_res) def test_get_shapes_several_inputs_several_shapes5(self): @@ -513,7 +544,7 @@ class TestShapesParsing(unittest.TestCase): exp_res = {'inp1': np.array([3, 1]), 'inp2': np.array([3, 2, 3]), 'inp3': np.array([5])} 
self.assertEqual(list(exp_res.keys()), list(result.keys())) for i in exp_res.keys(): - npt.assert_array_equal(result[i], exp_res[i]) + assert np.array_equal(result[i], exp_res[i]) placeholder_values_res, input_node_names_res = get_freeze_placeholder_values(argv_input, argv_freeze_placeholder_with_value) placeholder_values_ref = {'inp1': np.array(['1.0', '2.0', '3.0']), @@ -522,22 +553,22 @@ class TestShapesParsing(unittest.TestCase): input_node_names_ref = "inp1,inp2,inp3" self.assertEqual(sorted(list(placeholder_values_res.keys())), sorted(list(placeholder_values_ref.keys()))) for i in placeholder_values_ref.keys(): - npt.assert_array_equal(placeholder_values_res[i], placeholder_values_ref[i]) + assert np.array_equal(placeholder_values_res[i], placeholder_values_ref[i]) self.assertEqual(input_node_names_ref, input_node_names_res) def test_get_shapes_several_inputs_several_shapes6(self): # 0D value for freezing specified using --input command line parameter without shape argv_input = "inp1[3 1]->[1.0 2.0 3.0],inp2[3 2 3],inp3->False" result, _ = get_placeholder_shapes(argv_input, None) - exp_res = {'inp1': np.array([3, 1]), 'inp2': np.array([3, 2, 3]), 'inp3': np.array(False).shape} + exp_res = {'inp1': np.array([3, 1]), 'inp2': np.array([3, 2, 3]), 'inp3': None} self.assertEqual(list(exp_res.keys()), list(result.keys())) for i in exp_res.keys(): - npt.assert_array_equal(result[i], exp_res[i]) + assert np.array_equal(result[i], exp_res[i]) placeholder_values_res, input_node_names_res = get_freeze_placeholder_values(argv_input, None) placeholder_values_ref = {'inp1': np.array(['1.0', '2.0', '3.0']), 'inp3': False} self.assertEqual(list(placeholder_values_res.keys()), list(placeholder_values_ref.keys())) for i in placeholder_values_ref.keys(): - npt.assert_array_equal(placeholder_values_res[i], placeholder_values_ref[i]) + assert np.array_equal(placeholder_values_res[i], placeholder_values_ref[i]) def test_get_shapes_several_inputs_several_shapes7(self): # 0D shape 
and value for freezing specified using --input command line parameter @@ -546,12 +577,12 @@ class TestShapesParsing(unittest.TestCase): exp_res = {'inp1': np.array([3, 1]), 'inp2': np.array([3, 2, 3]), 'inp3': np.array(False).shape} self.assertEqual(list(exp_res.keys()), list(result.keys())) for i in exp_res.keys(): - npt.assert_array_equal(result[i], exp_res[i]) + assert np.array_equal(result[i], exp_res[i]) placeholder_values_res, input_node_names_res = get_freeze_placeholder_values(argv_input, None) placeholder_values_ref = {'inp1': np.array(['1.0', '2.0', '3.0']), 'inp3': True} self.assertEqual(list(placeholder_values_res.keys()), list(placeholder_values_ref.keys())) for i in placeholder_values_ref.keys(): - npt.assert_array_equal(placeholder_values_res[i], placeholder_values_ref[i]) + assert np.array_equal(placeholder_values_res[i], placeholder_values_ref[i]) def test_get_shapes_and_data_types1(self): argv_input = "inp1[3 1]->[1.0 2.0 3.0],inp2[3 2 3]{i32},inp3[5]{f32}->[1.0 1.0 2.0 3.0 5.0]" @@ -560,7 +591,7 @@ class TestShapesParsing(unittest.TestCase): ref_result_data_types = {'inp2': np.int32, 'inp3': np.float32} self.assertEqual(list(ref_result_shapes.keys()), list(result_shapes.keys())) for i in ref_result_shapes.keys(): - npt.assert_array_equal(result_shapes[i], ref_result_shapes[i]) + assert np.array_equal(result_shapes[i], ref_result_shapes[i]) self.assertEqual(list(ref_result_data_types.keys()), list(result_data_types.keys())) for i in ref_result_data_types.keys(): np.testing.assert_equal(result_data_types[i], ref_result_data_types[i]) @@ -572,7 +603,7 @@ class TestShapesParsing(unittest.TestCase): ref_result_data_types = {'inp2': np.int32, '0:inp3': np.float32} self.assertEqual(list(ref_result_shapes.keys()), list(result_shapes.keys())) for i in ref_result_shapes.keys(): - npt.assert_array_equal(result_shapes[i], ref_result_shapes[i]) + assert np.array_equal(result_shapes[i], ref_result_shapes[i]) self.assertEqual(list(ref_result_data_types.keys()), 
list(result_data_types.keys())) for i in ref_result_data_types.keys(): np.testing.assert_equal(result_data_types[i], ref_result_data_types[i]) @@ -584,7 +615,7 @@ class TestShapesParsing(unittest.TestCase): ref_result_data_types = {'inp2': np.int32, 'inp3:4': np.float32} self.assertEqual(list(ref_result_shapes.keys()), list(result_shapes.keys())) for i in ref_result_shapes.keys(): - npt.assert_array_equal(result_shapes[i], ref_result_shapes[i]) + assert np.array_equal(result_shapes[i], ref_result_shapes[i]) self.assertEqual(list(ref_result_data_types.keys()), list(result_data_types.keys())) for i in ref_result_data_types.keys(): np.testing.assert_equal(result_data_types[i], ref_result_data_types[i]) @@ -597,7 +628,7 @@ class TestShapesParsing(unittest.TestCase): ref_result_data_types = {} self.assertEqual(list(ref_result_shapes.keys()), list(result_shapes.keys())) for i in ref_result_shapes.keys(): - npt.assert_array_equal(result_shapes[i], ref_result_shapes[i]) + assert np.array_equal(result_shapes[i], ref_result_shapes[i]) self.assertEqual(list(ref_result_data_types.keys()), list(result_data_types.keys())) for i in ref_result_data_types.keys(): np.testing.assert_equal(result_data_types[i], ref_result_data_types[i]) @@ -610,7 +641,7 @@ class TestShapesParsing(unittest.TestCase): ref_result_data_types = {} self.assertEqual(list(ref_result_shapes.keys()), list(result_shapes.keys())) for i in ref_result_shapes.keys(): - npt.assert_array_equal(result_shapes[i], ref_result_shapes[i]) + assert np.array_equal(result_shapes[i], ref_result_shapes[i]) self.assertEqual(list(ref_result_data_types.keys()), list(result_data_types.keys())) for i in ref_result_data_types.keys(): np.testing.assert_equal(result_data_types[i], ref_result_data_types[i]) @@ -623,7 +654,7 @@ class TestShapesParsing(unittest.TestCase): ref_result_data_types = {'placeholder1': np.int32, 'placeholder3': np.int32} self.assertEqual(list(ref_result_shapes.keys()), list(result_shapes.keys())) for i in 
ref_result_shapes.keys(): - npt.assert_array_equal(result_shapes[i], ref_result_shapes[i]) + assert np.array_equal(result_shapes[i], ref_result_shapes[i]) self.assertEqual(list(ref_result_data_types.keys()), list(result_data_types.keys())) for i in ref_result_data_types.keys(): np.testing.assert_equal(result_data_types[i], ref_result_data_types[i]) @@ -672,28 +703,28 @@ class TestShapesParsing(unittest.TestCase): exp_res = {'inp1': np.array([1, 22, 333, 123])} self.assertEqual(list(exp_res.keys()), list(result.keys())) for i in exp_res.keys(): - npt.assert_array_equal(result[i], exp_res[i]) + assert np.array_equal(result[i], exp_res[i]) def test_get_shapes_no_input_no_shape(self): argv_input = "" input_shapes = "" result, _ = get_placeholder_shapes(argv_input, input_shapes) - exp_res = np.array([None]) - npt.assert_array_equal(result, exp_res) + exp_res = None + assert np.array_equal(result, exp_res) def test_get_shapes_no_input_one_shape(self): argv_input = "" input_shapes = "(12,4,1)" result, _ = get_placeholder_shapes(argv_input, input_shapes) exp_res = np.array([12, 4, 1]) - npt.assert_array_equal(result, exp_res) + assert np.array_equal(result, exp_res) def test_get_shapes_no_input_one_shape2(self): argv_input = "" input_shapes = "[12,4,1]" result, _ = get_placeholder_shapes(argv_input, input_shapes) exp_res = np.array([12, 4, 1]) - npt.assert_array_equal(result, exp_res) + assert np.array_equal(result, exp_res) def test_get_shapes_no_input_two_shapes(self): argv_input = "" @@ -704,10 +735,10 @@ class TestShapesParsing(unittest.TestCase): argv_input = "inp1" input_shapes = "" result, _ = get_placeholder_shapes(argv_input, input_shapes) - exp_res = {'inp1': np.array([None])} + exp_res = {'inp1': None} self.assertEqual(list(exp_res.keys()), list(result.keys())) for i in exp_res.keys(): - npt.assert_array_equal(result[i], exp_res[i]) + assert np.array_equal(result[i], exp_res[i]) def test_get_shapes_one_input_wrong_shape8(self): argv_input = "inp1" @@ -766,7 
+797,7 @@ class TestShapesParsing(unittest.TestCase): exp_res = {'inp1': np.array([-1, 4, 1]), 'inp2': np.array([4, 6, 8])} self.assertEqual(list(exp_res.keys()), list(result.keys())) for i in exp_res.keys(): - npt.assert_array_equal(result[i], exp_res[i]) + assert np.array_equal(result[i], exp_res[i]) def test_get_shapes_one_input_first_neg_shape_not_one(self): argv_input = "inp1" @@ -791,7 +822,7 @@ class TestShapesParsing(unittest.TestCase): exp_res = {'inp1': (1, (0, 22), (1, 100), -1), 'inp2': (-1, (45, np.iinfo(np.int64).max), 7, 1)} self.assertEqual(list(exp_res.keys()), list(result.keys())) for i in exp_res.keys(): - npt.assert_array_equal(result[i], exp_res[i]) + assert np.array_equal(result[i], exp_res[i]) def test_get_shapes_several_inputs_several_partial_shapes2(self): # shapes specified using --input command line parameter and no values @@ -800,44 +831,44 @@ class TestShapesParsing(unittest.TestCase): exp_res = {'inp1': (1, -1, (50, 100), 123), 'inp2': (-1, (45,np.iinfo(np.int64).max), (0, 7), 1)} self.assertEqual(list(exp_res.keys()), list(result.keys())) for i in exp_res.keys(): - npt.assert_array_equal(result[i], exp_res[i]) + assert np.array_equal(result[i], exp_res[i]) placeholder_values_res, input_node_names_res = get_freeze_placeholder_values(argv_input, None) placeholder_values_ref = {} input_node_names_ref = "inp1,inp2" self.assertEqual(list(placeholder_values_res.keys()), list(placeholder_values_ref.keys())) for i in placeholder_values_ref.keys(): - npt.assert_array_equal(placeholder_values_res[i], placeholder_values_ref[i]) + assert np.array_equal(placeholder_values_res[i], placeholder_values_ref[i]) def test_get_shapes_several_inputs_several_partial_shapes3(self): # shapes and value for freezing specified using --input command line parameter argv_input = "inp1[3 1]->[1.0 2.0 3.0],inp2[3.. ..2 5..10 ? 
-1],inp3[5]->[1.0 1.0 2.0 3.0 5.0]" result, _ = get_placeholder_shapes(argv_input, None) - exp_res = {'inp1': (3, 1), 'inp2': ((3, np.iinfo(np.int64).max), (0, 2), (5, 10), -1, -1), 'inp3': (5)} + exp_res = {'inp1': (3, 1), 'inp2': ((3, np.iinfo(np.int64).max), (0, 2), (5, 10), -1, -1), 'inp3': (5,)} self.assertEqual(list(exp_res.keys()), list(result.keys())) for i in exp_res.keys(): - npt.assert_array_equal(result[i], exp_res[i]) + assert np.array_equal(result[i], exp_res[i]) placeholder_values_res, input_node_names_res = get_freeze_placeholder_values(argv_input, None) placeholder_values_ref = {'inp1': np.array(['1.0', '2.0', '3.0']), 'inp3': np.array(['1.0', '1.0', '2.0', '3.0', '5.0'])} input_node_names_ref = "inp1,inp2,inp3" self.assertEqual(list(placeholder_values_res.keys()), list(placeholder_values_ref.keys())) for i in placeholder_values_ref.keys(): - npt.assert_array_equal(placeholder_values_res[i], placeholder_values_ref[i]) + assert np.array_equal(placeholder_values_res[i], placeholder_values_ref[i]) def test_get_shapes_several_inputs_several_partial_shapes4(self): # shapes specified using --input_shape and values for freezing using --input command line parameter argv_input = "inp1->[1.0 2.0 3.0],inp2,inp3->[1.0 1.0 2.0 3.0 5.0]" input_shapes = "(3,1), (3..,..2,5..10,?,-1), (5)" result, _ = get_placeholder_shapes(argv_input, input_shapes) - exp_res = {'inp1': (3, 1), 'inp2': ((3, np.iinfo(np.int64).max), (0, 2), (5, 10), -1, -1), 'inp3': (5)} + exp_res = {'inp1': (3, 1), 'inp2': ((3, np.iinfo(np.int64).max), (0, 2), (5, 10), -1, -1), 'inp3': (5,)} self.assertEqual(list(exp_res.keys()), list(result.keys())) for i in exp_res.keys(): - npt.assert_array_equal(result[i], exp_res[i]) + assert np.array_equal(result[i], exp_res[i]) placeholder_values_res, input_node_names_res = get_freeze_placeholder_values(argv_input, None) placeholder_values_ref = {'inp1': np.array(['1.0', '2.0', '3.0']), 'inp3': np.array(['1.0', '1.0', '2.0', '3.0', '5.0'])} 
input_node_names_ref = "inp1,inp2,inp3" self.assertEqual(list(placeholder_values_res.keys()), list(placeholder_values_ref.keys())) for i in placeholder_values_ref.keys(): - npt.assert_array_equal(placeholder_values_res[i], placeholder_values_ref[i]) + assert np.array_equal(placeholder_values_res[i], placeholder_values_ref[i]) self.assertEqual(input_node_names_ref, input_node_names_res) def test_get_shapes_several_inputs_several_partial_shapes5(self): @@ -847,32 +878,32 @@ class TestShapesParsing(unittest.TestCase): argv_freeze_placeholder_with_value = "inp2->[5.0 7.0 3.0],inp4->[100.0 200.0]" result, _ = get_placeholder_shapes(argv_input, input_shapes) - exp_res = {'inp1': (3, 1), 'inp2': ((3, np.iinfo(np.int64).max), (0, 2), (5, 10), -1, -1), 'inp3': (5)} + exp_res = {'inp1': (3, 1), 'inp2': ((3, np.iinfo(np.int64).max), (0, 2), (5, 10), -1, -1), 'inp3': (5,)} self.assertEqual(list(exp_res.keys()), list(result.keys())) for i in exp_res.keys(): - npt.assert_array_equal(result[i], exp_res[i]) + assert np.array_equal(result[i], exp_res[i]) placeholder_values_res, input_node_names_res = get_freeze_placeholder_values(argv_input, argv_freeze_placeholder_with_value) placeholder_values_ref = {'inp1': np.array(['1.0', '2.0', '3.0']), 'inp3': np.array(['1.0', '1.0', '2.0', '3.0', '5.0'],), 'inp2': np.array(['5.0', '7.0', '3.0']), 'inp4': np.array(['100.0', '200.0'])} input_node_names_ref = "inp1,inp2,inp3" self.assertEqual(sorted(list(placeholder_values_res.keys())), sorted(list(placeholder_values_ref.keys()))) for i in placeholder_values_ref.keys(): - npt.assert_array_equal(placeholder_values_res[i], placeholder_values_ref[i]) + assert np.array_equal(placeholder_values_res[i], placeholder_values_ref[i]) self.assertEqual(input_node_names_ref, input_node_names_res) def test_get_shapes_several_inputs_several_partial_shapes6(self): # 0D value for freezing specified using --input command line parameter without shape argv_input = "inp1[3 1]->[1.0 2.0 3.0],inp2[3.. ..2 5..10 ? 
-1],inp3->False" result, _ = get_placeholder_shapes(argv_input, None) - exp_res = {'inp1': (3, 1), 'inp2': ((3, np.iinfo(np.int64).max), (0, 2), (5, 10), -1, -1), 'inp3': np.array(False).shape} + exp_res = {'inp1': (3, 1), 'inp2': ((3, np.iinfo(np.int64).max), (0, 2), (5, 10), -1, -1), 'inp3': None} self.assertEqual(list(exp_res.keys()), list(result.keys())) for i in exp_res.keys(): - npt.assert_array_equal(result[i], exp_res[i]) + assert np.array_equal(result[i], exp_res[i]) placeholder_values_res, input_node_names_res = get_freeze_placeholder_values(argv_input, None) placeholder_values_ref = {'inp1': np.array(['1.0', '2.0', '3.0']), 'inp3': False} self.assertEqual(list(placeholder_values_res.keys()), list(placeholder_values_ref.keys())) for i in placeholder_values_ref.keys(): - npt.assert_array_equal(placeholder_values_res[i], placeholder_values_ref[i]) + assert np.array_equal(placeholder_values_res[i], placeholder_values_ref[i]) def test_get_shapes_several_inputs_several_partial_shapes7(self): # 0D shape and value for freezing specified using --input command line parameter @@ -881,12 +912,12 @@ class TestShapesParsing(unittest.TestCase): exp_res = {'inp1': (3, 1), 'inp2': ((3, np.iinfo(np.int64).max), (0, 2), (5, 10), -1, -1), 'inp3': np.array(False).shape} self.assertEqual(list(exp_res.keys()), list(result.keys())) for i in exp_res.keys(): - npt.assert_array_equal(result[i], exp_res[i]) + assert np.array_equal(result[i], exp_res[i]) placeholder_values_res, input_node_names_res = get_freeze_placeholder_values(argv_input, None) placeholder_values_ref = {'inp1': np.array(['1.0', '2.0', '3.0']), 'inp3': True} self.assertEqual(list(placeholder_values_res.keys()), list(placeholder_values_ref.keys())) for i in placeholder_values_ref.keys(): - npt.assert_array_equal(placeholder_values_res[i], placeholder_values_ref[i]) + assert np.array_equal(placeholder_values_res[i], placeholder_values_ref[i]) def test_get_shapes_and_data_types_partial_shape_with_input_port(self): 
argv_input = "inp1:1[3 1]->[1.0 2.0 3.0],0:inp2[3.. ..2 5..10 ? -1]{i32},inp3:4[5]{f32}->[1.0 1.0 2.0 3.0 5.0]" @@ -895,7 +926,7 @@ class TestShapesParsing(unittest.TestCase): ref_result_data_types = {'0:inp2': np.int32, 'inp3:4': np.float32} self.assertEqual(list(ref_result_shapes.keys()), list(result_shapes.keys())) for i in ref_result_shapes.keys(): - npt.assert_array_equal(result_shapes[i], ref_result_shapes[i]) + assert np.array_equal(result_shapes[i], ref_result_shapes[i]) self.assertEqual(list(ref_result_data_types.keys()), list(result_data_types.keys())) for i in ref_result_data_types.keys(): np.testing.assert_equal(result_data_types[i], ref_result_data_types[i]) @@ -907,7 +938,7 @@ class TestShapesParsing(unittest.TestCase): ref_result_data_types = {'inp2:3': np.int32, 'inp3:4': np.float32} self.assertEqual(list(ref_result_shapes.keys()), list(result_shapes.keys())) for i in ref_result_shapes.keys(): - npt.assert_array_equal(result_shapes[i], ref_result_shapes[i]) + assert np.array_equal(result_shapes[i], ref_result_shapes[i]) self.assertEqual(list(ref_result_data_types.keys()), list(result_data_types.keys())) for i in ref_result_data_types.keys(): np.testing.assert_equal(result_data_types[i], ref_result_data_types[i]) @@ -1137,7 +1168,7 @@ class TestLayoutParsing(unittest.TestCase): 'name2': {'source_layout': '[n,h,w,c]', 'target_layout': '[n,c,h,w]'}} self.assertEqual(list(exp_res.keys()), list(result.keys())) for i in exp_res.keys(): - npt.assert_array_equal(result[i], exp_res[i]) + assert np.array_equal(result[i], exp_res[i]) def test_get_layout_2(self): argv_layout = "name1(nhwc),name2(nhwc->nchw)" @@ -1146,7 +1177,7 @@ class TestLayoutParsing(unittest.TestCase): 'name2': {'source_layout': 'nhwc', 'target_layout': 'nchw'}} self.assertEqual(list(exp_res.keys()), list(result.keys())) for i in exp_res.keys(): - npt.assert_array_equal(result[i], exp_res[i]) + assert np.array_equal(result[i], exp_res[i]) def test_get_layout_3(self): argv_layout = 
"name1(n...c),name2(n...c->nc...)" @@ -1155,7 +1186,7 @@ class TestLayoutParsing(unittest.TestCase): 'name2': {'source_layout': 'n...c', 'target_layout': 'nc...'}} self.assertEqual(list(exp_res.keys()), list(result.keys())) for i in exp_res.keys(): - npt.assert_array_equal(result[i], exp_res[i]) + assert np.array_equal(result[i], exp_res[i]) def test_get_layout_4(self): argv_layout = "nhwc" @@ -1163,7 +1194,7 @@ class TestLayoutParsing(unittest.TestCase): exp_res = {'': {'source_layout': 'nhwc', 'target_layout': None}} self.assertEqual(list(exp_res.keys()), list(result.keys())) for i in exp_res.keys(): - npt.assert_array_equal(result[i], exp_res[i]) + assert np.array_equal(result[i], exp_res[i]) def test_get_layout_5(self): argv_layout = "[n,h,w,c]" @@ -1171,7 +1202,7 @@ class TestLayoutParsing(unittest.TestCase): exp_res = {'': {'source_layout': '[n,h,w,c]', 'target_layout': None}} self.assertEqual(list(exp_res.keys()), list(result.keys())) for i in exp_res.keys(): - npt.assert_array_equal(result[i], exp_res[i]) + assert np.array_equal(result[i], exp_res[i]) def test_get_layout_6(self): argv_layout = "nhwc->nchw" @@ -1179,7 +1210,7 @@ class TestLayoutParsing(unittest.TestCase): exp_res = {'': {'source_layout': 'nhwc', 'target_layout': 'nchw'}} self.assertEqual(list(exp_res.keys()), list(result.keys())) for i in exp_res.keys(): - npt.assert_array_equal(result[i], exp_res[i]) + assert np.array_equal(result[i], exp_res[i]) def test_get_layout_7(self): argv_layout = "[n,h,w,c]->[n,c,h,w]" @@ -1187,7 +1218,7 @@ class TestLayoutParsing(unittest.TestCase): exp_res = {'': {'source_layout': '[n,h,w,c]', 'target_layout': '[n,c,h,w]'}} self.assertEqual(list(exp_res.keys()), list(result.keys())) for i in exp_res.keys(): - npt.assert_array_equal(result[i], exp_res[i]) + assert np.array_equal(result[i], exp_res[i]) def test_get_layout_scalar(self): argv_layout = "name1(nhwc),name2([])" @@ -1196,7 +1227,7 @@ class TestLayoutParsing(unittest.TestCase): 'name2': {'source_layout': 
'[]', 'target_layout': None}} self.assertEqual(list(exp_res.keys()), list(result.keys())) for i in exp_res.keys(): - npt.assert_array_equal(result[i], exp_res[i]) + assert np.array_equal(result[i], exp_res[i]) def test_get_layout_source_layout_1(self): argv_source_layout = "[n,h,w,c]" @@ -1204,7 +1235,7 @@ class TestLayoutParsing(unittest.TestCase): exp_res = {'': {'source_layout': '[n,h,w,c]', 'target_layout': None}} self.assertEqual(list(exp_res.keys()), list(result.keys())) for i in exp_res.keys(): - npt.assert_array_equal(result[i], exp_res[i]) + assert np.array_equal(result[i], exp_res[i]) def test_get_layout_source_layout_2(self): argv_source_layout = "nhwc" @@ -1212,7 +1243,7 @@ class TestLayoutParsing(unittest.TestCase): exp_res = {'': {'source_layout': 'nhwc', 'target_layout': None}} self.assertEqual(list(exp_res.keys()), list(result.keys())) for i in exp_res.keys(): - npt.assert_array_equal(result[i], exp_res[i]) + assert np.array_equal(result[i], exp_res[i]) def test_get_layout_source_layout_3(self): argv_source_layout = "name1(nhwc),name2(nchw)" @@ -1221,7 +1252,7 @@ class TestLayoutParsing(unittest.TestCase): 'name2': {'source_layout': 'nchw', 'target_layout': None}} self.assertEqual(list(exp_res.keys()), list(result.keys())) for i in exp_res.keys(): - npt.assert_array_equal(result[i], exp_res[i]) + assert np.array_equal(result[i], exp_res[i]) def test_get_layout_source_layout_4(self): argv_source_layout = "name1([n,h,w,c]),name2([n,c,h,w])" @@ -1230,7 +1261,7 @@ class TestLayoutParsing(unittest.TestCase): 'name2': {'source_layout': '[n,c,h,w]', 'target_layout': None}} self.assertEqual(list(exp_res.keys()), list(result.keys())) for i in exp_res.keys(): - npt.assert_array_equal(result[i], exp_res[i]) + assert np.array_equal(result[i], exp_res[i]) def test_get_layout_source_layout_5(self): argv_source_layout = "name1(nhwc),name2([n,c,h,w])" @@ -1239,7 +1270,7 @@ class TestLayoutParsing(unittest.TestCase): 'name2': {'source_layout': '[n,c,h,w]', 
'target_layout': None}} self.assertEqual(list(exp_res.keys()), list(result.keys())) for i in exp_res.keys(): - npt.assert_array_equal(result[i], exp_res[i]) + assert np.array_equal(result[i], exp_res[i]) def test_get_layout_source_layout_6(self): argv_source_layout = "name1(nhwc),name2[n,c,h,w]" @@ -1248,7 +1279,7 @@ class TestLayoutParsing(unittest.TestCase): 'name2': {'source_layout': '[n,c,h,w]', 'target_layout': None}} self.assertEqual(list(exp_res.keys()), list(result.keys())) for i in exp_res.keys(): - npt.assert_array_equal(result[i], exp_res[i]) + assert np.array_equal(result[i], exp_res[i]) def test_get_layout_source_layout_scalar(self): argv_source_layout = "name1(nhwc),name2([])" @@ -1257,7 +1288,7 @@ class TestLayoutParsing(unittest.TestCase): 'name2': {'source_layout': '[]', 'target_layout': None}} self.assertEqual(list(exp_res.keys()), list(result.keys())) for i in exp_res.keys(): - npt.assert_array_equal(result[i], exp_res[i]) + assert np.array_equal(result[i], exp_res[i]) def test_get_layout_target_layout_1(self): argv_target_layout = "[n,h,w,c]" @@ -1265,7 +1296,7 @@ class TestLayoutParsing(unittest.TestCase): exp_res = {'': {'source_layout': None, 'target_layout': '[n,h,w,c]'}} self.assertEqual(list(exp_res.keys()), list(result.keys())) for i in exp_res.keys(): - npt.assert_array_equal(result[i], exp_res[i]) + assert np.array_equal(result[i], exp_res[i]) def test_get_layout_target_layout_2(self): argv_target_layout = "nhwc" @@ -1273,7 +1304,7 @@ class TestLayoutParsing(unittest.TestCase): exp_res = {'': {'source_layout': None, 'target_layout': 'nhwc'}} self.assertEqual(list(exp_res.keys()), list(result.keys())) for i in exp_res.keys(): - npt.assert_array_equal(result[i], exp_res[i]) + assert np.array_equal(result[i], exp_res[i]) def test_get_layout_target_layout_3(self): argv_target_layout = "name1(nhwc),name2(nchw)" @@ -1282,7 +1313,7 @@ class TestLayoutParsing(unittest.TestCase): 'name2': {'source_layout': None, 'target_layout': 'nchw'}} 
self.assertEqual(list(exp_res.keys()), list(result.keys())) for i in exp_res.keys(): - npt.assert_array_equal(result[i], exp_res[i]) + assert np.array_equal(result[i], exp_res[i]) def test_get_layout_target_layout_4(self): argv_target_layout = "name1([n,h,w,c]),name2([n,c,h,w])" @@ -1291,7 +1322,7 @@ class TestLayoutParsing(unittest.TestCase): 'name2': {'source_layout': None, 'target_layout': '[n,c,h,w]'}} self.assertEqual(list(exp_res.keys()), list(result.keys())) for i in exp_res.keys(): - npt.assert_array_equal(result[i], exp_res[i]) + assert np.array_equal(result[i], exp_res[i]) def test_get_layout_target_layout_5(self): argv_target_layout = "name1(nhwc),name2([n,c,h,w])" @@ -1300,7 +1331,7 @@ class TestLayoutParsing(unittest.TestCase): 'name2': {'source_layout': None, 'target_layout': '[n,c,h,w]'}} self.assertEqual(list(exp_res.keys()), list(result.keys())) for i in exp_res.keys(): - npt.assert_array_equal(result[i], exp_res[i]) + assert np.array_equal(result[i], exp_res[i]) def test_get_layout_target_layout_6(self): argv_target_layout = "name1(nhwc),name2[n,c,h,w]" @@ -1309,7 +1340,7 @@ class TestLayoutParsing(unittest.TestCase): 'name2': {'source_layout': None, 'target_layout': '[n,c,h,w]'}} self.assertEqual(list(exp_res.keys()), list(result.keys())) for i in exp_res.keys(): - npt.assert_array_equal(result[i], exp_res[i]) + assert np.array_equal(result[i], exp_res[i]) def test_get_layout_target_layout_scalar(self): argv_target_layout = "name1(nhwc),name2[]" @@ -1318,7 +1349,7 @@ class TestLayoutParsing(unittest.TestCase): 'name2': {'source_layout': None, 'target_layout': '[]'}} self.assertEqual(list(exp_res.keys()), list(result.keys())) for i in exp_res.keys(): - npt.assert_array_equal(result[i], exp_res[i]) + assert np.array_equal(result[i], exp_res[i]) def test_get_layout_source_target_layout_1(self): argv_source_layout = "[n,h,w,c]" @@ -1327,7 +1358,7 @@ class TestLayoutParsing(unittest.TestCase): exp_res = {'': {'source_layout': '[n,h,w,c]', 
'target_layout': '[n,c,h,w]'}} self.assertEqual(list(exp_res.keys()), list(result.keys())) for i in exp_res.keys(): - npt.assert_array_equal(result[i], exp_res[i]) + assert np.array_equal(result[i], exp_res[i]) def test_get_layout_source_target_layout_2(self): argv_source_layout = "nhwc" @@ -1336,7 +1367,7 @@ class TestLayoutParsing(unittest.TestCase): exp_res = {'': {'source_layout': 'nhwc', 'target_layout': 'nchw'}} self.assertEqual(list(exp_res.keys()), list(result.keys())) for i in exp_res.keys(): - npt.assert_array_equal(result[i], exp_res[i]) + assert np.array_equal(result[i], exp_res[i]) def test_get_layout_source_target_layout_3(self): argv_source_layout = "name1(nhwc),name2(nhwc)" @@ -1346,7 +1377,7 @@ class TestLayoutParsing(unittest.TestCase): 'name2': {'source_layout': 'nhwc', 'target_layout': 'nchw'}} self.assertEqual(list(exp_res.keys()), list(result.keys())) for i in exp_res.keys(): - npt.assert_array_equal(result[i], exp_res[i]) + assert np.array_equal(result[i], exp_res[i]) def test_get_layout_source_target_layout_4(self): argv_source_layout = "name1([n,h,w,c]),name2([n,h,w,c])" @@ -1356,7 +1387,7 @@ class TestLayoutParsing(unittest.TestCase): 'name2': {'source_layout': '[n,h,w,c]', 'target_layout': '[n,c,h,w]'}} self.assertEqual(list(exp_res.keys()), list(result.keys())) for i in exp_res.keys(): - npt.assert_array_equal(result[i], exp_res[i]) + assert np.array_equal(result[i], exp_res[i]) def test_get_layout_source_target_layout_5(self): argv_source_layout = "name1(nhwc),name2[n,h,w,c]" @@ -1366,7 +1397,7 @@ class TestLayoutParsing(unittest.TestCase): 'name2': {'source_layout': '[n,h,w,c]', 'target_layout': '[n,c,h,w]'}} self.assertEqual(list(exp_res.keys()), list(result.keys())) for i in exp_res.keys(): - npt.assert_array_equal(result[i], exp_res[i]) + assert np.array_equal(result[i], exp_res[i]) def test_get_layout_source_target_layout_scalar(self): argv_source_layout = "name1(nhwc),name2[]" @@ -1376,7 +1407,7 @@ class 
TestLayoutParsing(unittest.TestCase): 'name2': {'source_layout': '[]', 'target_layout': '[]'}} self.assertEqual(list(exp_res.keys()), list(result.keys())) for i in exp_res.keys(): - npt.assert_array_equal(result[i], exp_res[i]) + assert np.array_equal(result[i], exp_res[i]) def test_get_layout_raises_if_layout_and_source_layout_provided(self): argv_layout = "nhwc" From ce753f41dc5aa091c85726961559e99e44da87b6 Mon Sep 17 00:00:00 2001 From: Luwei Zhou Date: Wed, 29 Dec 2021 10:39:50 +0800 Subject: [PATCH 13/78] [shape_infer]shape inference implement of Select Detectionoutput and Shufflechannels OPs (#8348) * Implement detection_output shape infer * revise and update the code flow * update based on review. * Update based on review * Implement the shuffle_channels Op shape inference. * Fix CI coding style issue. * Implement the select OP shape inference. * Update based on the review comments * Update based on the review comments. * Add pragma once for the shape inference head. * Add new shape_infer test file for detection_output OP. * Ensure the header would only be included once. * Add shuffle_channels OP shape infer test. * Add shape_infer() invocations into shape_inference() API shape_inference() API support Select, ShuffleChannels, DetectionOutput OPs Fix extra pragma, unnecessary friend function declaration. * Update based on the review comments. * Move the shape infer API helpers into new folder. * Applied review comments. * Applied 2nd review comments * Applied review comments * Fix coding style. * Update * Applied review comments. * Fix compiling issue of unused variable. * Fix the CI issue. * Update the coding style * Move test cases into new folder * Applied review comments.
--- .../include/openvino/op/shuffle_channels.hpp | 5 +- .../op/util/detection_output_base.hpp | 3 +- .../detection_output_shape_inference.hpp | 343 ++++++++++++++++++ .../include/select_shape_inference.hpp | 48 +++ .../shuffle_channels_shape_inference.hpp | 46 +++ src/core/src/op/detection_output.cpp | 28 +- src/core/src/op/select.cpp | 37 +- src/core/src/op/shuffle_channels.cpp | 24 +- .../src/op/util/detection_output_base.cpp | 291 ++------------- src/core/src/partial_shape.cpp | 3 + src/core/tests/type_prop/detection_output.cpp | 26 +- src/core/tests/type_prop/select.cpp | 9 +- src/core/tests/type_prop/shuffle_channels.cpp | 26 ++ .../utils/shape_inference/shape_inference.cpp | 11 + .../utils/shape_inference/static_shape.cpp | 4 + .../detection_output_shape_inference.cpp | 177 +++++++++ .../select_shape_inference.cpp | 63 ++++ .../shuffle_channels_shape_inference.cpp | 26 ++ 18 files changed, 841 insertions(+), 329 deletions(-) create mode 100644 src/core/shape_inference/include/detection_output_shape_inference.hpp create mode 100644 src/core/shape_inference/include/select_shape_inference.hpp create mode 100644 src/core/shape_inference/include/shuffle_channels_shape_inference.hpp create mode 100644 src/tests/unit/cpu/shape_inference_test/detection_output_shape_inference.cpp create mode 100644 src/tests/unit/cpu/shape_inference_test/select_shape_inference.cpp create mode 100644 src/tests/unit/cpu/shape_inference_test/shuffle_channels_shape_inference.cpp diff --git a/src/core/include/openvino/op/shuffle_channels.hpp b/src/core/include/openvino/op/shuffle_channels.hpp index 3a231a9c266..f6c9c8e4f07 100644 --- a/src/core/include/openvino/op/shuffle_channels.hpp +++ b/src/core/include/openvino/op/shuffle_channels.hpp @@ -35,10 +35,10 @@ public: std::shared_ptr clone_with_new_inputs(const OutputVector& new_args) const override; - int64_t get_axis() const { + const int64_t& get_axis() const { return m_axis; } - int64_t get_group() const { + const int64_t& get_group() 
const { return m_group; } OPENVINO_SUPPRESS_DEPRECATED_START @@ -48,7 +48,6 @@ public: private: bool evaluate_shuffle_channels(const HostTensorVector& outputs, const HostTensorVector& inputs) const; - int64_t m_axis{1}; int64_t m_group{1}; }; diff --git a/src/core/include/openvino/op/util/detection_output_base.hpp b/src/core/include/openvino/op/util/detection_output_base.hpp index 4a8030ae891..28ac6c7b5a2 100644 --- a/src/core/include/openvino/op/util/detection_output_base.hpp +++ b/src/core/include/openvino/op/util/detection_output_base.hpp @@ -34,10 +34,9 @@ public: DetectionOutputBase() = default; DetectionOutputBase(const OutputVector& args); - void validate_and_infer_types_base(const AttributesBase& attrs, Dimension num_classes); + void validate_base(const AttributesBase& attrs); bool visit_attributes_base(AttributeVisitor& visitor, AttributesBase& attrs); - Dimension compute_num_classes(const AttributesBase& attrs); }; } // namespace util diff --git a/src/core/shape_inference/include/detection_output_shape_inference.hpp b/src/core/shape_inference/include/detection_output_shape_inference.hpp new file mode 100644 index 00000000000..df7e1e4a62e --- /dev/null +++ b/src/core/shape_inference/include/detection_output_shape_inference.hpp @@ -0,0 +1,343 @@ +// Copyright (C) 2018-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include + +namespace ov { +namespace op { +namespace util { + +template ::value_type::value_type> +void compute_num_classes(const DetectionOutputBase* op, + const DetectionOutputBase::AttributesBase& attrs, + const std::vector& input_shapes, + V& num_classes, + V& num_prior_boxes) { + const T& box_logits_pshape = input_shapes[0]; + const T& class_preds_pshape = input_shapes[1]; + const T& proposals_pshape = input_shapes[2]; + T ad_class_preds_shape{}; + T ad_box_preds_shape{}; + bool have_five_inputs = false; + + NODE_VALIDATION_CHECK(op, + box_logits_pshape.rank().compatible(2), + "Box logits rank must be 
2. Got ", + box_logits_pshape.rank().get_length()); + + NODE_VALIDATION_CHECK(op, + class_preds_pshape.rank().compatible(2), + "Class predictions rank must be 2. Got ", + class_preds_pshape.rank().get_length()); + NODE_VALIDATION_CHECK(op, + proposals_pshape.rank().compatible(3), + "Proposals rank must be 3. Got ", + proposals_pshape.rank().get_length()); + if (input_shapes.size() == 5) { + ad_class_preds_shape = input_shapes[3]; + NODE_VALIDATION_CHECK(op, + ad_class_preds_shape.rank().compatible(2), + "Additional class predictions rank must be 2. Got ", + ad_class_preds_shape.rank().get_length()); + ad_box_preds_shape = input_shapes[4]; + NODE_VALIDATION_CHECK(op, + ad_box_preds_shape.rank().compatible(2), + "Additional box predictions rank must be 2. Got ", + ad_box_preds_shape.rank().get_length()); + have_five_inputs = true; + } + + int64_t prior_box_size = attrs.normalized ? 4 : 5; + + // try to deduce a number of prior boxes + if (num_prior_boxes == 0 && proposals_pshape.rank().is_static() && proposals_pshape[2].is_static()) { + NODE_VALIDATION_CHECK(op, + (proposals_pshape[2].get_length()) % prior_box_size == 0, + "Proposals' third dimension must be a multiply of prior_box_size (", + prior_box_size, + "). Current value is: ", + proposals_pshape[2].get_length(), + "."); + num_prior_boxes = (proposals_pshape[2].get_length()) / prior_box_size; + NODE_VALIDATION_CHECK(op, + num_prior_boxes > 0, + "A number of prior boxes must be greater zero. Got: ", + num_prior_boxes); + } + if (num_prior_boxes == 0 && have_five_inputs && ad_class_preds_shape.rank().is_static() && + ad_class_preds_shape[1].is_static()) { + NODE_VALIDATION_CHECK( + op, + (ad_class_preds_shape[1].get_length()) % 2 == 0, + "Additional class predictions second dimension must be a multiply of 2. 
Current value is: ", + ad_class_preds_shape[1].get_length(), + "."); + num_prior_boxes = (ad_class_preds_shape[1].get_length()) / 2; + NODE_VALIDATION_CHECK(op, + num_prior_boxes > 0, + "A number of prior boxes must be greater zero. Got: ", + num_prior_boxes); + } + + // try to deduce a number of classes + if (num_classes == 0 && num_prior_boxes > 0 && class_preds_pshape.rank().is_static() && + class_preds_pshape[1].is_static()) { + NODE_VALIDATION_CHECK(op, + (class_preds_pshape[1].get_length()) % num_prior_boxes == 0, + "Class predictions second dimension must be a multiply of num_prior_boxes (", + num_prior_boxes, + "). Current value is: ", + class_preds_pshape[1].get_length(), + "."); + num_classes = (class_preds_pshape[1].get_length()) / num_prior_boxes; + } + if (num_classes == 0 && num_prior_boxes > 0 && box_logits_pshape.rank().is_static() && + box_logits_pshape[1].is_static() && !attrs.share_location) { + NODE_VALIDATION_CHECK(op, + (box_logits_pshape[1].get_length()) % (num_prior_boxes * 4) == 0, + "Box logits second dimension must be a multiply of num_prior_boxes * 4 (", + num_prior_boxes * 4, + "). Current value is: ", + box_logits_pshape[1].get_length(), + "."); + num_classes = (box_logits_pshape[1].get_length()) / (num_prior_boxes * 4); + } + if (num_classes == 0 && num_prior_boxes > 0 && have_five_inputs && ad_box_preds_shape.rank().is_static() && + ad_box_preds_shape[1].is_static() && !attrs.share_location) { + NODE_VALIDATION_CHECK(op, + (ad_box_preds_shape[1].get_length()) % (num_prior_boxes * 4) == 0, + "Additional box predictions second dimension must be a multiply of num_prior_boxes * 4 (", + num_prior_boxes * 4, + "). 
Current value is: ", + ad_box_preds_shape[1].get_length(), + "."); + num_classes = ad_box_preds_shape[1].get_length() / (num_prior_boxes * 4); + } +} + +template +void shape_infer_base(const DetectionOutputBase* op, + const DetectionOutputBase::AttributesBase& attrs, + const std::vector& input_shapes, + std::vector& output_shapes, + int64_t attribute_num_classes) { + using dim_t = typename std::iterator_traits::value_type; + using val_type = typename dim_t::value_type; + + NODE_VALIDATION_CHECK(op, (input_shapes.size() == 3 || input_shapes.size() == 5) && output_shapes.size() == 1); + + auto& ret_output_shape = output_shapes[0]; + ret_output_shape.resize(4); + + const auto& box_logits_pshape = input_shapes[0]; + const auto& class_preds_pshape = input_shapes[1]; + const auto& proposals_pshape = input_shapes[2]; + + val_type num_classes = 0; + val_type num_prior_boxes = 0; + dim_t dim_num_images{}; + bool dim_num_images_updated = false; + + if (attribute_num_classes == -1) { + ov::op::util::compute_num_classes(op, attrs, input_shapes, num_classes, num_prior_boxes); + } else { + num_classes = static_cast(attribute_num_classes); + } + + const val_type num_loc_classes = attrs.share_location ? 1 : num_classes; + const val_type prior_box_size = attrs.normalized ? 4 : 5; + + if (box_logits_pshape.rank().is_static()) { + NODE_VALIDATION_CHECK(op, + box_logits_pshape.size() == 2, + "Box logits rank must be 2. Got ", + box_logits_pshape.size()); + dim_num_images = box_logits_pshape[0]; + dim_num_images_updated = true; + if (!num_prior_boxes && box_logits_pshape[1].is_static()) { + auto box_logits_pshape_2nd_dim = box_logits_pshape[1].get_length(); + NODE_VALIDATION_CHECK(op, + (box_logits_pshape_2nd_dim % (num_loc_classes * 4)) == 0, + "Box logits' second dimension must be a multiply of num_loc_classes * 4 (", + num_loc_classes * 4, + "). 
Current value is: ", + box_logits_pshape_2nd_dim, + "."); + num_prior_boxes = box_logits_pshape_2nd_dim / (num_loc_classes * 4); + } + } + if (class_preds_pshape.rank().is_static()) { + NODE_VALIDATION_CHECK(op, + class_preds_pshape.size() == 2, + "Class predictions rank must be 2. Got ", + class_preds_pshape.size()); + if ((!dim_num_images_updated || dim_num_images.is_dynamic()) && class_preds_pshape[0].is_static()) { + dim_num_images = class_preds_pshape[0]; + dim_num_images_updated = true; + } else { + NODE_VALIDATION_CHECK( + op, + class_preds_pshape[0].compatible(dim_num_images), + "Class predictions' first dimension is not compatible with batch size. Current value is: ", + class_preds_pshape[0], + ", expected: ", + dim_num_images, + "."); + } + if (class_preds_pshape[1].is_static() && num_classes) { + auto class_preds_pshape_2nd_dim = class_preds_pshape[1].get_length(); + if (!num_prior_boxes) { + NODE_VALIDATION_CHECK(op, + class_preds_pshape_2nd_dim % num_classes == 0, + "Class predictions' second dimension must be a multiply of num_classes (", + num_classes, + "). Current value is: ", + class_preds_pshape_2nd_dim, + "."); + num_prior_boxes = class_preds_pshape_2nd_dim / num_classes; + } else { + NODE_VALIDATION_CHECK(op, + class_preds_pshape_2nd_dim == num_prior_boxes * num_classes, + "Class predictions' second dimension must be equal to num_prior_boxes * " + "num_classes (", + num_prior_boxes * num_classes, + "). Current value is: ", + class_preds_pshape_2nd_dim, + "."); + } + } + } + if (proposals_pshape.rank().is_static()) { + NODE_VALIDATION_CHECK(op, + proposals_pshape.size() == 3, + "Proposals rank must be 3. Got ", + proposals_pshape.size()); + NODE_VALIDATION_CHECK(op, + proposals_pshape[0].compatible(1) || proposals_pshape[0].compatible(dim_num_images), + "Proposals' first dimension is must be equal to either batch size (", + dim_num_images, + ") or 1. 
Got: ", + proposals_pshape[0], + "."); + + size_t proposals_expected_2nd_dim = attrs.variance_encoded_in_target ? 1 : 2; + NODE_VALIDATION_CHECK(op, + proposals_pshape[1].compatible(proposals_expected_2nd_dim), + "Proposals' second dimension is mismatched. Current value is: ", + proposals_pshape[1], + ", expected: ", + proposals_expected_2nd_dim, + "."); + + if (proposals_pshape[2].is_static()) { + auto proposals_pshape_3rd_dim = proposals_pshape[2].get_length(); + if (!num_prior_boxes) { + NODE_VALIDATION_CHECK(op, + proposals_pshape_3rd_dim % prior_box_size == 0, + "Proposals' third dimension must be a multiply of prior_box_size (", + prior_box_size, + "). Current value is: ", + proposals_pshape_3rd_dim, + "."); + num_prior_boxes = proposals_pshape_3rd_dim / prior_box_size; + } else { + NODE_VALIDATION_CHECK(op, + proposals_pshape_3rd_dim == num_prior_boxes * prior_box_size, + "Proposals' third dimension must be equal to num_prior_boxes " + "* prior_box_size (", + num_prior_boxes * prior_box_size, + "). Current value is: ", + proposals_pshape_3rd_dim, + "."); + } + } + } + + if (input_shapes.size() == 5) { + const auto& aux_class_preds_pshape = input_shapes[3]; + const auto& aux_box_preds_pshape = input_shapes[4]; + if (aux_class_preds_pshape.rank().is_static()) { + NODE_VALIDATION_CHECK(op, + aux_class_preds_pshape.size() == 2, + "additional class predictions rank must be 2. Got ", + aux_class_preds_pshape.size()); + NODE_VALIDATION_CHECK(op, + aux_class_preds_pshape[0].compatible(dim_num_images), + "Additional class predictions' first dimension must be " + "compatible with batch size. Current value is: ", + aux_class_preds_pshape[0], + ", expected: ", + dim_num_images, + "."); + if (num_prior_boxes) { + NODE_VALIDATION_CHECK(op, + aux_class_preds_pshape[1].compatible(num_prior_boxes * 2), + "Additional class predictions' second dimension must be compatible with " + "num_prior_boxes * 2. 
Current value is: ", + aux_class_preds_pshape[1], + ", expected: ", + num_prior_boxes * 2, + "."); + } + + if (aux_class_preds_pshape[1].is_static()) + num_prior_boxes = aux_class_preds_pshape[1].get_length() / 2; + } + NODE_VALIDATION_CHECK( + op, + aux_box_preds_pshape.compatible(box_logits_pshape), + "Additional box predictions' shape must be compatible with box logits shape. Current value is: ", + aux_box_preds_pshape, + ", expected: ", + box_logits_pshape, + "."); + } + + ret_output_shape[0] = 1; + ret_output_shape[1] = 1; + ret_output_shape[3] = 7; + + const dim_t dim_num_prior_boxes = num_prior_boxes ? dim_t{num_prior_boxes} : Dimension(); + const dim_t dim_num_classes = num_classes ? dim_t{num_classes} : Dimension(); + + if (attrs.keep_top_k[0] > 0) { + ret_output_shape[2] = dim_num_images * attrs.keep_top_k[0]; + } else if (attrs.keep_top_k[0] == -1 && attrs.top_k > 0) { + ret_output_shape[2] = dim_num_images * attrs.top_k * dim_num_classes; + } else { + ret_output_shape[2] = dim_num_images * dim_num_prior_boxes * dim_num_classes; + } +} + +} // namespace util +} // namespace op +} // namespace ov + +namespace ov { +namespace op { +namespace v0 { +template +void shape_infer(const DetectionOutput* op, const std::vector& input_shapes, std::vector& output_shapes) { + const auto& attrs = op->get_attrs(); + ov::op::util::shape_infer_base(op, attrs, input_shapes, output_shapes, attrs.num_classes); +} + +} // namespace v0 +} // namespace op +} // namespace ov + +namespace ov { +namespace op { +namespace v8 { + +template +void shape_infer(const DetectionOutput* op, const std::vector& input_shapes, std::vector& output_shapes) { + ov::op::util::shape_infer_base(op, op->get_attrs(), input_shapes, output_shapes, -1); +} + +} // namespace v8 +} // namespace op +} // namespace ov \ No newline at end of file diff --git a/src/core/shape_inference/include/select_shape_inference.hpp b/src/core/shape_inference/include/select_shape_inference.hpp new file mode 100644 index 
00000000000..afc8ee37dce --- /dev/null +++ b/src/core/shape_inference/include/select_shape_inference.hpp @@ -0,0 +1,48 @@ +// Copyright (C) 2018-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include + +namespace ov { +namespace op { +namespace v1 { + +template +void shape_infer(const Select* op, const std::vector& input_shapes, std::vector& output_shapes) { + NODE_VALIDATION_CHECK(op, input_shapes.size() == 3 && output_shapes.size() == 1); + + const auto& broadcast_spec = op->get_auto_broadcast(); + auto& result_shape = output_shapes[0]; + if (broadcast_spec.m_type == op::AutoBroadcastType::PDPD) { + result_shape = input_shapes[1]; // 'then' tensor + // in PDPD type, Broacast-merging 'else' into 'then' one way not each other. + NODE_VALIDATION_CHECK(op, + T::broadcast_merge_into(result_shape, input_shapes[2], broadcast_spec), + "'Else' tensor shape is not broadcastable."); + NODE_VALIDATION_CHECK(op, + T::broadcast_merge_into(result_shape, input_shapes[0], broadcast_spec), + "'Cond' tensor shape is not broadcastable."); + } else { + result_shape = input_shapes[2]; + for (int input_port = 1; input_port >= 0; input_port--) { + if (broadcast_spec.m_type == op::AutoBroadcastType::NONE) { + NODE_VALIDATION_CHECK(op, + T::merge_into(result_shape, input_shapes[input_port]), + "Argument shapes are inconsistent."); + } else if (broadcast_spec.m_type == op::AutoBroadcastType::NUMPY) { + NODE_VALIDATION_CHECK(op, + T::broadcast_merge_into(result_shape, input_shapes[input_port], broadcast_spec), + "Argument shapes are inconsistent."); + } else { + NODE_VALIDATION_CHECK(op, false, "Unsupported auto broadcast specification"); + } + } + } +} + +} // namespace v1 +} // namespace op +} // namespace ov \ No newline at end of file diff --git a/src/core/shape_inference/include/shuffle_channels_shape_inference.hpp b/src/core/shape_inference/include/shuffle_channels_shape_inference.hpp new file mode 100644 index 00000000000..587f8e8d7eb --- 
/dev/null +++ b/src/core/shape_inference/include/shuffle_channels_shape_inference.hpp @@ -0,0 +1,46 @@ +// Copyright (C) 2018-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include + +namespace ov { +namespace op { +namespace v0 { + +template +void shape_infer(const ShuffleChannels* op, const std::vector& input_shapes, std::vector& output_shapes) { + NODE_VALIDATION_CHECK(op, input_shapes.size() == 1 && output_shapes.size() == 1); + + const auto& group = op->get_group(); + NODE_VALIDATION_CHECK(op, group >= 1, "The 'group' parameter must be greater or equal to 1."); + + const auto& input_shape = input_shapes[0]; + const auto input_shape_rank = input_shape.rank(); + + if (input_shape_rank.is_static()) { + const int64_t input_rank_value = static_cast(input_shape.size()); + NODE_VALIDATION_CHECK(op, input_rank_value >= 1, "The input tensor's shape is expected to be at least 1D."); + + const auto& axis = op->get_axis(); + NODE_VALIDATION_CHECK(op, + axis < input_rank_value && axis >= (0 - input_rank_value), + "The 'axis' parameter for ShuffleChannels has to point to one of the " + "input tensor's shape dimensions."); + size_t axis_zb = static_cast(axis >= 0 ? 
axis : (axis + input_rank_value)); + + if (input_shape[axis_zb].is_static()) { + const auto channel_dim_size = input_shape[axis_zb].get_length(); + NODE_VALIDATION_CHECK(op, + channel_dim_size % group == 0, + "The channel dimension size has to be a multiple of the groups parameter value."); + } + } + output_shapes[0] = input_shape; +} + +} // namespace v0 +} // namespace op +} // namespace ov \ No newline at end of file diff --git a/src/core/src/op/detection_output.cpp b/src/core/src/op/detection_output.cpp index cac74673f34..92071e01ca6 100644 --- a/src/core/src/op/detection_output.cpp +++ b/src/core/src/op/detection_output.cpp @@ -4,6 +4,8 @@ #include "ngraph/op/detection_output.hpp" +#include + #include "itt.hpp" using namespace std; @@ -33,7 +35,16 @@ ov::op::v0::DetectionOutput::DetectionOutput(const Output& box_logits, void ov::op::v0::DetectionOutput::validate_and_infer_types() { NGRAPH_OP_SCOPE(v0_DetectionOutput_validate_and_infer_types); NODE_VALIDATION_CHECK(this, m_attrs.num_classes > 0, "Number of classes must be greater than zero"); - validate_and_infer_types_base(m_attrs, m_attrs.num_classes); + validate_base(m_attrs); + std::vector input_shapes; + for (auto input_idx = 0; input_idx < get_input_size(); input_idx++) + input_shapes.push_back(get_input_partial_shape(input_idx)); + std::vector output_shapes = {ov::PartialShape{}}; + + shape_infer(this, input_shapes, output_shapes); + + set_output_size(1); + set_output_type(0, get_input_element_type(0), output_shapes[0]); } shared_ptr ov::op::v0::DetectionOutput::clone_with_new_inputs(const OutputVector& new_args) const { @@ -86,8 +97,17 @@ ov::op::v8::DetectionOutput::DetectionOutput(const Output& box_logits, } void ov::op::v8::DetectionOutput::validate_and_infer_types() { - NGRAPH_OP_SCOPE(v0_DetectionOutput_validate_and_infer_types); - validate_and_infer_types_base(m_attrs, Dimension::dynamic()); + NGRAPH_OP_SCOPE(v8_DetectionOutput_validate_and_infer_types); + validate_base(m_attrs); + std::vector 
input_shapes; + for (auto input_idx = 0; input_idx < get_input_size(); input_idx++) + input_shapes.push_back(get_input_partial_shape(input_idx)); + std::vector output_shapes = {ov::PartialShape{}}; + + shape_infer(this, input_shapes, output_shapes); + + set_output_size(1); + set_output_type(0, get_input_element_type(0), output_shapes[0]); } shared_ptr ov::op::v8::DetectionOutput::clone_with_new_inputs(const OutputVector& new_args) const { @@ -114,4 +134,4 @@ bool ov::op::v8::DetectionOutput::visit_attributes(AttributeVisitor& visitor) { NGRAPH_OP_SCOPE(v0_DetectionOutput_visit_attributes); visit_attributes_base(visitor, m_attrs); return true; -} +} \ No newline at end of file diff --git a/src/core/src/op/select.cpp b/src/core/src/op/select.cpp index 15f6af798e8..ea098d43a66 100644 --- a/src/core/src/op/select.cpp +++ b/src/core/src/op/select.cpp @@ -6,6 +6,7 @@ #include #include +#include #include "itt.hpp" #include "ngraph/attribute_visitor.hpp" @@ -42,36 +43,12 @@ void op::v1::Select::validate_and_infer_types() { element::Type::merge(result_et, get_input_element_type(1), get_input_element_type(2)), "Argument 1 and 2 element types must match."); - ov::PartialShape result_shape; - if (get_auto_broadcast().m_type == op::AutoBroadcastType::PDPD) { - result_shape = get_input_partial_shape(1); // 'then' tensor - NODE_VALIDATION_CHECK( - this, - ov::PartialShape::broadcast_merge_into(result_shape, get_input_partial_shape(2), get_auto_broadcast()), - "'Else' tensor shape is not broadcastable."); - NODE_VALIDATION_CHECK( - this, - ov::PartialShape::broadcast_merge_into(result_shape, get_input_partial_shape(0), get_auto_broadcast()), - "'Cond' tensor shape is not broadcastable."); - } else { - result_shape = get_input_partial_shape(2); - for (int i = 1; i >= 0; i--) { - if (get_auto_broadcast().m_type == op::AutoBroadcastType::NONE) { - NODE_VALIDATION_CHECK(this, - ov::PartialShape::merge_into(result_shape, get_input_partial_shape(i)), - "Argument shapes are 
inconsistent."); - } else if (get_auto_broadcast().m_type == op::AutoBroadcastType::NUMPY) { - NODE_VALIDATION_CHECK(this, - ov::PartialShape::broadcast_merge_into(result_shape, - get_input_partial_shape(i), - get_auto_broadcast()), - "Argument shapes are inconsistent."); - } else { - NODE_VALIDATION_CHECK(this, false, "Unsupported auto broadcast specification"); - } - } - } - set_output_type(0, result_et, result_shape); + std::vector output_shapes = {ov::PartialShape{}}; + const std::vector input_shapes = {get_input_partial_shape(0), + get_input_partial_shape(1), + get_input_partial_shape(2)}; + shape_infer(this, input_shapes, output_shapes); + set_output_type(0, result_et, output_shapes[0]); } shared_ptr op::v1::Select::clone_with_new_inputs(const OutputVector& new_args) const { diff --git a/src/core/src/op/shuffle_channels.cpp b/src/core/src/op/shuffle_channels.cpp index 6a44f846c0a..6866eba13e8 100644 --- a/src/core/src/op/shuffle_channels.cpp +++ b/src/core/src/op/shuffle_channels.cpp @@ -5,6 +5,7 @@ #include "ngraph/op/shuffle_channels.hpp" #include +#include #include "itt.hpp" #include "ngraph/attribute_visitor.hpp" @@ -48,25 +49,12 @@ size_t op::ShuffleChannels::get_zero_based_axis() const { void op::ShuffleChannels::validate_and_infer_types() { NGRAPH_OP_SCOPE(v0_ShuffleChannels_validate_and_infer_types); + const auto& data_type = get_input_element_type(0); - if (get_input_partial_shape(0).is_static()) { - const auto shape = get_input_shape(0); - NODE_VALIDATION_CHECK(this, shape.size() >= 1, "The input tensor's shape is expected to be at least 1D."); - - size_t axis_zb = get_zero_based_axis(); - NODE_VALIDATION_CHECK(this, - axis_zb < shape.size(), - "The 'axis' parameter for ShuffleChannels has to point to one of the " - "input tensor's shape dimensions."); - - NODE_VALIDATION_CHECK(this, m_group >= 1, "The 'group' parameter must be greater or equal to 1."); - - const auto channel_dim_size = shape.at(axis_zb); - NODE_VALIDATION_CHECK(this, - 
channel_dim_size % m_group == 0, - "The channel dimension size has to be a multiple of the groups parameter value."); - } - set_output_type(0, data_type, get_input_partial_shape(0)); + std::vector output_shapes = {ov::PartialShape{}}; + const std::vector input_shapes = {get_input_partial_shape(0)}; + shape_infer(this, input_shapes, output_shapes); + set_output_type(0, data_type, output_shapes[0]); } shared_ptr op::ShuffleChannels::clone_with_new_inputs(const OutputVector& new_args) const { diff --git a/src/core/src/op/util/detection_output_base.cpp b/src/core/src/op/util/detection_output_base.cpp index c8e2462c012..b16de435600 100644 --- a/src/core/src/op/util/detection_output_base.cpp +++ b/src/core/src/op/util/detection_output_base.cpp @@ -4,6 +4,7 @@ #include "ngraph/op/util/detection_output_base.hpp" +#include #include #include "ngraph/op/concat.hpp" @@ -17,128 +18,7 @@ using namespace ov::op::util; DetectionOutputBase::DetectionOutputBase(const ov::OutputVector& args) : Op(args) {} -ov::Dimension DetectionOutputBase::compute_num_classes(const AttributesBase& attrs) { - Dimension num_classes = Dimension::dynamic(); - - NODE_VALIDATION_CHECK(this, - 3 <= get_input_size() && get_input_size() <= 5, - "A number of arguments must be greater than or equal to 3 and less than or equal to 5. Got " + - std::to_string(get_input_size())); - - const ov::PartialShape& box_logits_pshape = get_input_partial_shape(0); - const ov::PartialShape& class_preds_pshape = get_input_partial_shape(1); - const ov::PartialShape& proposals_pshape = get_input_partial_shape(2); - ov::PartialShape ad_class_preds_shape = ov::PartialShape::dynamic(); - ov::PartialShape ad_box_preds_shape = ov::PartialShape::dynamic(); - - if (box_logits_pshape.rank().is_static()) { - NODE_VALIDATION_CHECK( - this, - box_logits_pshape.rank().get_length() == 2, - "Box logits rank must be 2. 
Got " + std::to_string(box_logits_pshape.rank().get_length())); - } - if (class_preds_pshape.rank().is_static()) { - NODE_VALIDATION_CHECK( - this, - class_preds_pshape.rank().get_length() == 2, - "Class predictions rank must be 2. Got " + std::to_string(class_preds_pshape.rank().get_length())); - } - if (proposals_pshape.rank().is_static()) { - NODE_VALIDATION_CHECK(this, - proposals_pshape.rank().get_length() == 3, - "Proposals rank must be 3. Got " + std::to_string(proposals_pshape.rank().get_length())); - } - if (get_input_size() >= 4) { - ad_class_preds_shape = get_input_partial_shape(3); - if (ad_class_preds_shape.rank().is_static()) { - NODE_VALIDATION_CHECK(this, - ad_class_preds_shape.rank().get_length() == 2, - "Additional class predictions rank must be 2. Got " + - std::to_string(ad_class_preds_shape.rank().get_length())); - } - } - if (get_input_size() == 5) { - ad_box_preds_shape = get_input_partial_shape(4); - if (ad_box_preds_shape.rank().is_static()) { - NODE_VALIDATION_CHECK(this, - ad_box_preds_shape.rank().get_length() == 2, - "Additional box predictions rank must be 2. Got " + - std::to_string(ad_box_preds_shape.rank().get_length())); - } - } - - int prior_box_size = attrs.normalized ? 4 : 5; - Dimension num_prior_boxes = Dimension::dynamic(); - - // try to deduce a number of prior boxes - if (num_prior_boxes.is_dynamic() && proposals_pshape.rank().is_static() && proposals_pshape[2].is_static()) { - NODE_VALIDATION_CHECK(this, - proposals_pshape[2].get_length() % prior_box_size == 0, - "Proposals' third dimension must be a multiply of prior_box_size (" + - std::to_string(prior_box_size) + "). Current value is: ", - proposals_pshape[2].get_length(), - "."); - num_prior_boxes = proposals_pshape[2].get_length() / prior_box_size; - NODE_VALIDATION_CHECK( - this, - num_prior_boxes.get_length() > 0, - "A number of prior boxes must be greater zero. 
Got: " + std::to_string(num_prior_boxes.get_length())); - } - if (num_prior_boxes.is_dynamic() && ad_class_preds_shape.rank().is_static() && - ad_class_preds_shape[1].is_static()) { - NODE_VALIDATION_CHECK( - this, - ad_class_preds_shape[1].get_length() % 2 == 0, - "Additional class predictions second dimension must be a multiply of 2. Current value is: ", - ad_class_preds_shape[1].get_length(), - "."); - num_prior_boxes = ad_class_preds_shape[1].get_length() / 2; - NODE_VALIDATION_CHECK( - this, - num_prior_boxes.get_length() > 0, - "A number of prior boxes must be greater zero. Got: " + std::to_string(num_prior_boxes.get_length())); - } - - // try to deduce a number of classes - if (num_classes.is_dynamic() && num_prior_boxes.is_static() && class_preds_pshape.rank().is_static() && - class_preds_pshape[1].is_static()) { - NODE_VALIDATION_CHECK(this, - class_preds_pshape[1].get_length() % num_prior_boxes.get_length() == 0, - "Class predictions second dimension must be a multiply of num_prior_boxes (" + - std::to_string(num_prior_boxes.get_length()) + "). Current value is: ", - class_preds_pshape[1].get_length(), - "."); - num_classes = class_preds_pshape[1].get_length() / num_prior_boxes.get_length(); - } - if (num_classes.is_dynamic() && num_prior_boxes.is_static() && box_logits_pshape.rank().is_static() && - box_logits_pshape[1].is_static() && !attrs.share_location) { - NODE_VALIDATION_CHECK(this, - box_logits_pshape[1].get_length() % (num_prior_boxes.get_length() * 4) == 0, - "Box logits second dimension must be a multiply of num_prior_boxes * 4 (" + - std::to_string(num_prior_boxes.get_length() * 4) + "). 
Current value is: ", - box_logits_pshape[1].get_length(), - "."); - num_classes = box_logits_pshape[1].get_length() / (num_prior_boxes.get_length() * 4); - } - if (num_classes.is_dynamic() && num_prior_boxes.is_static() && ad_box_preds_shape.is_static() && - ad_box_preds_shape[1].is_static() && !attrs.share_location) { - NODE_VALIDATION_CHECK( - this, - ad_box_preds_shape[1].get_length() % (num_prior_boxes.get_length() * 4) == 0, - "Additional box predictions second dimension must be a multiply of num_prior_boxes * 4 (" + - std::to_string(num_prior_boxes.get_length() * 4) + "). Current value is: ", - ad_box_preds_shape[1].get_length(), - "."); - num_classes = ad_box_preds_shape[1].get_length() / (num_prior_boxes.get_length() * 4); - } - - return num_classes; -} - -void DetectionOutputBase::validate_and_infer_types_base(const DetectionOutputBase::AttributesBase& attrs, - ov::Dimension num_classes) { - NODE_VALIDATION_CHECK(this, attrs.keep_top_k.size() > 0, "keep_top_k attribute must be provided"); - +void DetectionOutputBase::validate_base(const DetectionOutputBase::AttributesBase& attrs) { NODE_VALIDATION_CHECK( this, attrs.code_type == "caffe.PriorBoxParameter.CORNER" || attrs.code_type == "caffe.PriorBoxParameter.CENTER_SIZE", @@ -159,117 +39,7 @@ void DetectionOutputBase::validate_and_infer_types_base(const DetectionOutputBas proposals_et.is_real(), "Proposals' data type must be floating point. Got " + proposals_et.get_type_name()); - const ov::PartialShape& box_logits_pshape = get_input_partial_shape(0); - const ov::PartialShape& class_preds_pshape = get_input_partial_shape(1); - const ov::PartialShape& proposals_pshape = get_input_partial_shape(2); - - // deduce a number of classes for DetectionOutput-8 - if (num_classes.is_dynamic()) { - num_classes = compute_num_classes(attrs); - } - - Dimension num_loc_classes = attrs.share_location ? 1 : num_classes; - int prior_box_size = attrs.normalized ? 
4 : 5; - - Dimension num_images = Dimension::dynamic(); - Dimension num_prior_boxes = Dimension::dynamic(); - if (box_logits_pshape.rank().is_static()) { - NODE_VALIDATION_CHECK( - this, - box_logits_pshape.rank().get_length() == 2, - "Box logits rank must be 2. Got " + std::to_string(box_logits_pshape.rank().get_length())); - num_images = box_logits_pshape[0]; - if (box_logits_pshape[1].is_static() && num_loc_classes.is_static()) { - NODE_VALIDATION_CHECK(this, - (box_logits_pshape[1].get_length() % (num_loc_classes.get_length() * 4)) == 0, - "Box logits' second dimension must be a multiply of num_loc_classes * 4 (" + - std::to_string(num_loc_classes.get_length() * 4) + "). Current value is: ", - box_logits_pshape[1].get_length(), - "."); - num_prior_boxes = box_logits_pshape[1].get_length() / (num_loc_classes.get_length() * 4); - } - } - if (class_preds_pshape.rank().is_static()) { - NODE_VALIDATION_CHECK( - this, - class_preds_pshape.rank().get_length() == 2, - "Class predictions rank must be 2. Got " + std::to_string(class_preds_pshape.rank().get_length())); - if (num_images.is_dynamic() && class_preds_pshape[0].is_static()) { - num_images = class_preds_pshape[0]; - } else { - NODE_VALIDATION_CHECK(this, - class_preds_pshape[0].compatible(num_images), - "Class predictions' first dimension is not compatible with batch size."); - } - if (class_preds_pshape[1].is_static()) { - if (num_prior_boxes.is_dynamic() && num_classes.is_static()) { - NODE_VALIDATION_CHECK(this, - class_preds_pshape[1].get_length() % num_classes.get_length() == 0, - "Class predictions' second dimension must be a multiply of num_classes (" + - std::to_string(num_classes.get_length()) + "). 
Current value is: ", - class_preds_pshape[1].get_length(), - "."); - num_prior_boxes = class_preds_pshape[1].get_length() / num_classes.get_length(); - } else if (num_classes.is_static()) { - int num_prior_boxes_val = num_prior_boxes.get_length(); - NODE_VALIDATION_CHECK( - this, - class_preds_pshape[1].get_length() == num_prior_boxes_val * num_classes.get_length(), - "Class predictions' second dimension must be equal to num_prior_boxes * " - "num_classes (" + - std::to_string(num_prior_boxes_val * num_classes.get_length()) + "). Current value is: ", - class_preds_pshape[1].get_length(), - "."); - } - } - } - if (proposals_pshape.rank().is_static()) { - NODE_VALIDATION_CHECK(this, - proposals_pshape.rank().get_length() == 3, - "Proposals rank must be 3. Got " + std::to_string(proposals_pshape.rank().get_length())); - if (num_images.is_static() && proposals_pshape[0].is_static()) { - int64_t proposals_1st_dim = proposals_pshape[0].get_length(); - int64_t num_images_val = num_images.get_length(); - NODE_VALIDATION_CHECK(this, - proposals_1st_dim == 1 || proposals_1st_dim == num_images_val, - "Proposals' first dimension is must be equal to either batch size (" + - std::to_string(num_images_val) + - ") or 1. Got: " + std::to_string(proposals_1st_dim) + "."); - } - if (proposals_pshape[1].is_static()) { - size_t proposals_expected_2nd_dim = attrs.variance_encoded_in_target ? 1 : 2; - NODE_VALIDATION_CHECK(this, - proposals_pshape[1].compatible(proposals_expected_2nd_dim), - "Proposals' second dimension is mismatched. Current value is: ", - proposals_pshape[1].get_length(), - ", expected: ", - proposals_expected_2nd_dim, - "."); - } - if (proposals_pshape[2].is_static()) { - if (num_prior_boxes.is_dynamic()) { - NODE_VALIDATION_CHECK(this, - proposals_pshape[2].get_length() % prior_box_size == 0, - "Proposals' third dimension must be a multiply of prior_box_size (" + - std::to_string(prior_box_size) + "). 
Current value is: ", - proposals_pshape[2].get_length(), - "."); - num_prior_boxes = proposals_pshape[2].get_length() / prior_box_size; - } else { - int num_prior_boxes_val = num_prior_boxes.get_length(); - NODE_VALIDATION_CHECK(this, - proposals_pshape[2].get_length() == num_prior_boxes_val * prior_box_size, - "Proposals' third dimension must be equal to num_prior_boxes " - "* prior_box_size (" + - std::to_string(num_prior_boxes_val * prior_box_size) + - "). Current value is: ", - proposals_pshape[2].get_length(), - "."); - } - } - } - - if (get_input_size() > 3) { + if (get_input_size() == 5) { auto aux_class_preds_et = get_input_element_type(3); NODE_VALIDATION_CHECK(this, aux_class_preds_et == class_preds_et, @@ -281,42 +51,7 @@ void DetectionOutputBase::validate_and_infer_types_base(const DetectionOutputBas aux_box_preds_et == box_logits_et, "Additional box predictions' data type must be the same as box logits data type (" + box_logits_et.get_type_name() + "). Got " + aux_box_preds_et.get_type_name()); - - const ov::PartialShape& aux_class_preds_pshape = get_input_partial_shape(3); - const ov::PartialShape& aux_box_preds_pshape = get_input_partial_shape(4); - if (aux_class_preds_pshape.rank().is_static()) { - NODE_VALIDATION_CHECK(this, - aux_class_preds_pshape[0].compatible(num_images), - "Additional class predictions' first dimension must be " - "compatible with batch size."); - if (num_prior_boxes.is_static()) { - int num_prior_boxes_val = num_prior_boxes.get_length(); - NODE_VALIDATION_CHECK(this, - aux_class_preds_pshape[1].get_length() == num_prior_boxes_val * 2, - "Additional class predictions' second dimension must be equal to " - "num_prior_boxes * 2 (" + - std::to_string(num_prior_boxes_val * 2) + "). 
Got " + - std::to_string(aux_class_preds_pshape[1].get_length()) + "."); - } - } - NODE_VALIDATION_CHECK(this, - aux_box_preds_pshape.compatible(box_logits_pshape), - "Additional box predictions' shape must be compatible with box logits shape."); } - - std::vector output_shape{1, 1}; - if (attrs.keep_top_k[0] > 0) { - output_shape.push_back(num_images * attrs.keep_top_k[0]); - } else if (attrs.top_k > 0 && num_classes.is_static()) { - output_shape.push_back(num_images * attrs.top_k * num_classes); - } else if (num_classes.is_static()) { - output_shape.push_back(num_images * num_prior_boxes * num_classes); - } else { - output_shape.push_back(Dimension::dynamic()); - } - output_shape.emplace_back(7); - - set_output_type(0, box_logits_et, output_shape); } bool ov::op::util::DetectionOutputBase::visit_attributes_base(AttributeVisitor& visitor, @@ -338,3 +73,23 @@ bool ov::op::util::DetectionOutputBase::visit_attributes_base(AttributeVisitor& visitor.on_attribute("objectness_score", attrs.objectness_score); return true; } + +ov::Dimension DetectionOutputBase::compute_num_classes(const AttributesBase& attrs) { + NODE_VALIDATION_CHECK(this, + 3 == get_input_size() || get_input_size() == 5, + "A number of arguments must be equal to 3 or equal to 5. 
Got ", + get_input_size()); + + std::vector input_shapes; + for (auto input_idx = 0; input_idx < get_input_size(); input_idx++) + input_shapes.push_back(get_input_partial_shape(input_idx)); + std::vector output_shapes = {ov::PartialShape{}}; + + int64_t num_classes = 0; + int64_t num_prior_boxes_calculated = 0; + ov::op::util::compute_num_classes(this, attrs, input_shapes, num_classes, num_prior_boxes_calculated); + if (num_classes > 0) + return ov::Dimension{num_classes}; + else + return ov::Dimension::dynamic(); +} \ No newline at end of file diff --git a/src/core/src/partial_shape.cpp b/src/core/src/partial_shape.cpp index 78f266bc122..00b3d71960d 100644 --- a/src/core/src/partial_shape.cpp +++ b/src/core/src/partial_shape.cpp @@ -298,6 +298,9 @@ bool ov::PartialShape::broadcast_merge_into(PartialShape& dst, // Ranks are both static. auto dst_rank = dst.rank().get_length(); auto src_rank = src.rank().get_length(); + // source rank can't be bigger than destination rank according to PDPD broadcast rule. 
+ if (src_rank > dst_rank) + return false; if (dst_rank == src_rank && dst.compatible(src)) return true; diff --git a/src/core/tests/type_prop/detection_output.cpp b/src/core/tests/type_prop/detection_output.cpp index ed7b83ae859..eb8e2db1a1f 100644 --- a/src/core/tests/type_prop/detection_output.cpp +++ b/src/core/tests/type_prop/detection_output.cpp @@ -135,6 +135,25 @@ TEST(type_prop_layers, detection_output_no_share_location) { ASSERT_EQ(op->get_element_type(), element::f32); } +TEST(type_prop_layers, detection_output_calculated_num_prior_boxes) { + op::DetectionOutputAttrs attrs; + attrs.keep_top_k = {-1}; + attrs.top_k = -1; + attrs.normalized = true; + attrs.num_classes = 2; + attrs.share_location = false; + auto op = create_detection_output(PartialShape{4, -1}, + PartialShape::dynamic(), + PartialShape::dynamic(), + PartialShape{-1, 20}, + PartialShape::dynamic(), + attrs, + element::f32, + element::f32); + ASSERT_EQ(op->get_shape(), (Shape{1, 1, 80, 7})); + ASSERT_EQ(op->get_element_type(), element::f32); +} + TEST(type_prop_layers, detection_output_top_k) { op::DetectionOutputAttrs attrs; attrs.keep_top_k = {-1}; @@ -594,9 +613,10 @@ TEST(type_prop_layers, detection_output_invalid_aux_class_preds) { element::f32); FAIL() << "Exception expected"; } catch (const NodeValidationFailure& error) { - EXPECT_HAS_SUBSTRING(error.what(), - std::string("Additional class predictions' second dimension must " - "be equal to num_prior_boxes * 2 (6). Got 7.")); + EXPECT_HAS_SUBSTRING( + error.what(), + std::string("Additional class predictions' second dimension must " + "be compatible with num_prior_boxes * 2. Current value is: 7, expected: 6.")); } catch (...) 
{ FAIL() << "Unknown exception was thrown"; } diff --git a/src/core/tests/type_prop/select.cpp b/src/core/tests/type_prop/select.cpp index ac8aa7cd4e4..041926e026a 100644 --- a/src/core/tests/type_prop/select.cpp +++ b/src/core/tests/type_prop/select.cpp @@ -243,9 +243,16 @@ INSTANTIATE_TEST_SUITE_P( // TODO: Whats the right behavior here? // SelectParams({{2}, {2, 4}, {2}, {2, 4}}, {element::boolean, element::f32, // element::dynamic, element::f32}, {op::AutoBroadcastType::PDPD, 0}), + SelectParams({{4}, {2, 4}, {2, 4}, {2, 4}}, + {element::boolean, element::f32, element::f32, element::f32}, + {op::AutoBroadcastType::PDPD, 1}), SelectParams({{4}, {2, 4}, {4}, {2, 4}}, {element::boolean, element::f32, element::dynamic, element::f32}, - {op::AutoBroadcastType::PDPD, 1})), + {op::AutoBroadcastType::PDPD, 1}), + SelectParams({{4}, {4, 2, 3, 8}, {4, 2, 3, 1}, {4, 2, 3, 8}}, + {element::boolean, element::f32, element::f32, element::f32}, + {op::AutoBroadcastType::PDPD, 0})), + PrintToDummyParamName()); TEST(type_prop, select_v1_partial_shape) { diff --git a/src/core/tests/type_prop/shuffle_channels.cpp b/src/core/tests/type_prop/shuffle_channels.cpp index ea08f51d445..9f556144b23 100644 --- a/src/core/tests/type_prop/shuffle_channels.cpp +++ b/src/core/tests/type_prop/shuffle_channels.cpp @@ -128,6 +128,32 @@ TEST(type_prop, shuffle_channels_negative_axis_calculation) { EXPECT_EQ(shuffle_channels->get_zero_based_axis(), 1); } +TEST(type_prop, shuffle_channels_infer_shape_with_negative_axis_calculation) { + // Only when the length of `axis` dimension is even, the shuffle_channels OP can work correctly. 
+ const auto group = 2; + { + const auto data_input_shape = Shape{1, 3, 5, 8}; + const auto data = make_shared(element::f64, data_input_shape); + + const auto shuffle_channels = make_shared(data, -1, group); + EXPECT_EQ(shuffle_channels->get_output_partial_shape(0), data_input_shape); + } + { + const auto data_input_shape = Shape{1, 3, 8, 5}; + const auto data = make_shared(element::f64, data_input_shape); + + const auto shuffle_channels = make_shared(data, -2, group); + EXPECT_EQ(shuffle_channels->get_output_partial_shape(0), data_input_shape); + } + { + const auto data_input_shape = Shape{8, 3, 5, 7}; + const auto data = make_shared(element::f64, data_input_shape); + + const auto shuffle_channels = make_shared(data, -4, group); + EXPECT_EQ(shuffle_channels->get_output_partial_shape(0), data_input_shape); + } +} + TEST(type_prop, shuffle_channels_invalid_input_shape) { try { const auto data = make_shared(element::f64, Shape{}); diff --git a/src/plugins/intel_cpu/src/utils/shape_inference/shape_inference.cpp b/src/plugins/intel_cpu/src/utils/shape_inference/shape_inference.cpp index 3e1a3b083d3..273b8265e3e 100644 --- a/src/plugins/intel_cpu/src/utils/shape_inference/shape_inference.cpp +++ b/src/plugins/intel_cpu/src/utils/shape_inference/shape_inference.cpp @@ -56,6 +56,9 @@ #include "roi_align_shape_inference.hpp" #include "roll_shape_inference.hpp" #include "proposal_shape_inference.hpp" +#include "detection_output_shape_inference.hpp" +#include "select_shape_inference.hpp" +#include "shuffle_channels_shape_inference.hpp" #include "static_shape.hpp" #include "tile_shape_inference.hpp" #include "utils.hpp" @@ -218,6 +221,14 @@ void shape_inference(ov::Node* op, shape_infer(node, input_shapes, output_shapes); } else if (auto node = ov::as_type(op)) { shape_infer(node, input_shapes, output_shapes); + } else if (auto node = ov::as_type(op)) { + shape_infer(node, input_shapes, output_shapes); + } else if (auto node = ov::as_type(op)) { + shape_infer(node, 
input_shapes, output_shapes); + } else if (auto node = ov::as_type(op)) { + shape_infer(node, input_shapes, output_shapes); + } else if (auto node = ov::as_type(op)) { + shape_infer(node, input_shapes, output_shapes); } else { ngraph::OutputVector new_inputs; for (size_t i = 0; i < op->get_input_size(); ++i) { diff --git a/src/plugins/intel_cpu/src/utils/shape_inference/static_shape.cpp b/src/plugins/intel_cpu/src/utils/shape_inference/static_shape.cpp index 1a684d663ab..197c2196987 100644 --- a/src/plugins/intel_cpu/src/utils/shape_inference/static_shape.cpp +++ b/src/plugins/intel_cpu/src/utils/shape_inference/static_shape.cpp @@ -124,6 +124,10 @@ bool ov::StaticShape::broadcast_merge_into(StaticShape& dst, // Ranks are both static. auto dst_rank = dst.rank().get_length(); auto src_rank = src.rank().get_length(); + // source rank can't be bigger than destination rank according to + // PDPD broadcast rule. + if (src_rank > dst_rank) + return false; if (dst_rank == src_rank && dst.compatible(src)) return true; diff --git a/src/tests/unit/cpu/shape_inference_test/detection_output_shape_inference.cpp b/src/tests/unit/cpu/shape_inference_test/detection_output_shape_inference.cpp new file mode 100644 index 00000000000..adaa19b58e1 --- /dev/null +++ b/src/tests/unit/cpu/shape_inference_test/detection_output_shape_inference.cpp @@ -0,0 +1,177 @@ +// Copyright (C) 2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include + +#include +#include +#include + +#include "utils/shape_inference/static_shape.hpp" + +using namespace ov; + +template +std::shared_ptr create_detection_output(const PartialShape& box_logits_shape, + const PartialShape& class_preds_shape, + const PartialShape& proposals_shape, + const PartialShape& aux_class_preds_shape, + const PartialShape& aux_box_preds_shape, + T2& attrs, + element::Type input_type, + element::Type proposals_type) { + auto box_logits = std::make_shared(input_type, box_logits_shape); + auto class_preds = 
std::make_shared(input_type, class_preds_shape); + auto proposals = std::make_shared(proposals_type, proposals_shape); + auto aux_class_preds = std::make_shared(input_type, aux_class_preds_shape); + auto aux_box_preds = std::make_shared(input_type, aux_box_preds_shape); + return std::make_shared(box_logits, class_preds, proposals, aux_class_preds, aux_box_preds, attrs); +} + +TEST(StaticShapeInferenceTest, detection_output_top_k) { + op::v0::DetectionOutput::Attributes attrs; + attrs.keep_top_k = {-1}; + attrs.top_k = 7; + attrs.normalized = true; + attrs.num_classes = 2; + auto op = create_detection_output(PartialShape{4, 20}, + PartialShape{4, 10}, + PartialShape{4, 2, 20}, + PartialShape{4, 10}, + PartialShape{4, 20}, + attrs, + element::f32, + element::f32); + + const std::vector input_shapes = {StaticShape{4, 20}, + StaticShape{4, 10}, + StaticShape{4, 2, 20}, + StaticShape{4, 10}, + StaticShape{4, 20}}; + std::vector output_shapes = {StaticShape{}}; + shape_inference(op.get(), input_shapes, output_shapes); + ASSERT_EQ(output_shapes[0], StaticShape({1, 1, 56, 7})); +} + +TEST(StaticShapeInferenceTest, detection_output_no_share_location) { + op::v0::DetectionOutput::Attributes attrs; + attrs.keep_top_k = {-1}; + attrs.top_k = -1; + attrs.normalized = true; + attrs.num_classes = 2; + attrs.share_location = false; + auto op = create_detection_output(PartialShape{4, 40}, + PartialShape{4, 10}, + PartialShape{4, 2, 20}, + PartialShape{4, 10}, + PartialShape{4, 40}, + attrs, + element::f32, + element::f32); + + const std::vector input_shapes = {StaticShape{4, 40}, + StaticShape{4, 10}, + StaticShape{4, 2, 20}, + StaticShape{4, 10}, + StaticShape{4, 40}}; + std::vector output_shapes = {StaticShape{}}; + shape_inference(op.get(), input_shapes, output_shapes); + ASSERT_EQ(output_shapes[0], StaticShape({1, 1, 40, 7})); +} + +TEST(StaticShapeInferenceTest, detection_output) { + op::v0::DetectionOutput::Attributes attrs; + attrs.keep_top_k = {200}; + attrs.num_classes = 
2; + attrs.normalized = true; + auto op = create_detection_output(PartialShape{4, 20}, + PartialShape{4, 10}, + PartialShape{4, 2, 20}, + PartialShape{4, 10}, + PartialShape{4, 20}, + attrs, + element::f32, + element::f32); + + const std::vector input_shapes = {StaticShape{4, 20}, + StaticShape{4, 10}, + StaticShape{4, 2, 20}, + StaticShape{4, 10}, + StaticShape{4, 20}}; + std::vector output_shapes = {StaticShape{}}; + shape_inference(op.get(), input_shapes, output_shapes); + ASSERT_EQ(output_shapes[0], (StaticShape{1, 1, 800, 7})); +} + +TEST(StaticShapeInferenceTest, detection_outputv8_top_k) { + op::v8::DetectionOutput::Attributes attrs; + attrs.keep_top_k = {-1}; + attrs.top_k = 7; + attrs.normalized = true; + auto op = create_detection_output(PartialShape{4, 20}, + PartialShape{4, 10}, + PartialShape{4, 2, 20}, + PartialShape{4, 10}, + PartialShape{4, 20}, + attrs, + element::f32, + element::f32); + + const std::vector input_shapes = {StaticShape{4, 20}, + StaticShape{4, 10}, + StaticShape{4, 2, 20}, + StaticShape{4, 10}, + StaticShape{4, 20}}; + std::vector output_shapes = {StaticShape{}}; + shape_inference(op.get(), input_shapes, output_shapes); + ASSERT_EQ(output_shapes[0], StaticShape({1, 1, 56, 7})); +} + +TEST(StaticShapeInferenceTest, detection_outputv8_no_share_location) { + op::v8::DetectionOutput::Attributes attrs; + attrs.keep_top_k = {-1}; + attrs.top_k = -1; + attrs.normalized = true; + attrs.share_location = false; + auto op = create_detection_output(PartialShape{4, 40}, + PartialShape{4, 10}, + PartialShape{4, 2, 20}, + PartialShape{4, 10}, + PartialShape{4, 40}, + attrs, + element::f32, + element::f32); + + const std::vector input_shapes = {StaticShape{4, 40}, + StaticShape{4, 10}, + StaticShape{4, 2, 20}, + StaticShape{4, 10}, + StaticShape{4, 40}}; + std::vector output_shapes = {StaticShape{}}; + shape_inference(op.get(), input_shapes, output_shapes); + ASSERT_EQ(output_shapes[0], StaticShape({1, 1, 40, 7})); +} + 
+TEST(StaticShapeInferenceTest, detection_output_v8) { + op::v8::DetectionOutput::Attributes attrs; + attrs.keep_top_k = {200}; + attrs.normalized = true; + auto op = create_detection_output(PartialShape{4, 20}, + PartialShape{4, 10}, + PartialShape{4, 2, 20}, + PartialShape{4, 10}, + PartialShape{4, 20}, + attrs, + element::f32, + element::f32); + + const std::vector input_shapes = {StaticShape{4, 20}, + StaticShape{4, 10}, + StaticShape{4, 2, 20}, + StaticShape{4, 10}, + StaticShape{4, 20}}; + std::vector output_shapes = {StaticShape{}}; + shape_inference(op.get(), input_shapes, output_shapes); + ASSERT_EQ(output_shapes[0], (StaticShape{1, 1, 800, 7})); +} \ No newline at end of file diff --git a/src/tests/unit/cpu/shape_inference_test/select_shape_inference.cpp b/src/tests/unit/cpu/shape_inference_test/select_shape_inference.cpp new file mode 100644 index 00000000000..8821127ea69 --- /dev/null +++ b/src/tests/unit/cpu/shape_inference_test/select_shape_inference.cpp @@ -0,0 +1,63 @@ +// Copyright (C) 2018-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include + +#include +#include +#include + +#include "utils/shape_inference/static_shape.hpp" + +using namespace ov; + +TEST(StaticShapeInferenceTest, SelectTestBCastModeNUMPY) { + auto cond = std::make_shared(element::boolean, PartialShape{}); + auto ptrue = std::make_shared(element::f32, PartialShape{}); + auto pfalse = std::make_shared(element::f32, PartialShape{}); + auto select = std::make_shared(cond, ptrue, pfalse, op::AutoBroadcastType::NUMPY); + { + std::vector static_input_shapes = {StaticShape{}, StaticShape{4}, StaticShape{2, 4}}, + static_output_shapes = {StaticShape{}}; + shape_inference(select.get(), static_input_shapes, static_output_shapes); + ASSERT_EQ(static_output_shapes[0], StaticShape({2, 4})); + } + + { + std::vector static_input_shapes = {StaticShape{}, StaticShape{2, 4}, StaticShape{2, 4}}, + static_output_shapes = {StaticShape{}}; + shape_inference(select.get(), 
static_input_shapes, static_output_shapes); + ASSERT_EQ(static_output_shapes[0], StaticShape({2, 4})); + } + + { + std::vector static_input_shapes = {StaticShape{4}, StaticShape{2, 4}, StaticShape{4}}, + static_output_shapes = {StaticShape{}}; + shape_inference(select.get(), static_input_shapes, static_output_shapes); + ASSERT_EQ(static_output_shapes[0], StaticShape({2, 4})); + } +} +TEST(StaticShapeInferenceTest, SelectTestBCastModePDPD) { + auto cond = std::make_shared(element::boolean, PartialShape{}); + auto ptrue = std::make_shared(element::f32, PartialShape{}); + auto pfalse = std::make_shared(element::f32, PartialShape{}); + auto select = + std::make_shared(cond, ptrue, pfalse, op::AutoBroadcastSpec{op::AutoBroadcastType::PDPD, 1}); + std::vector static_input_shapes = {StaticShape{4}, StaticShape{2, 4}, StaticShape{4}}, + static_output_shapes = {StaticShape{}}; + shape_inference(select.get(), static_input_shapes, static_output_shapes); + ASSERT_EQ(static_output_shapes[0], StaticShape({2, 4})); +} + +TEST(StaticShapeInferenceTest, SelectTestBCastModeNone) { + auto cond = std::make_shared(element::boolean, PartialShape{}); + auto ptrue = std::make_shared(element::f32, PartialShape{}); + auto pfalse = std::make_shared(element::f32, PartialShape{}); + auto select = std::make_shared(cond, ptrue, pfalse, op::AutoBroadcastType::NONE); + + std::vector static_input_shapes = {StaticShape{6, 4}, StaticShape{6, 4}, StaticShape{6, 4}}, + static_output_shapes = {StaticShape{}}; + shape_inference(select.get(), static_input_shapes, static_output_shapes); + ASSERT_EQ(static_output_shapes[0], StaticShape({6, 4})); +} diff --git a/src/tests/unit/cpu/shape_inference_test/shuffle_channels_shape_inference.cpp b/src/tests/unit/cpu/shape_inference_test/shuffle_channels_shape_inference.cpp new file mode 100644 index 00000000000..42089ddebe8 --- /dev/null +++ b/src/tests/unit/cpu/shape_inference_test/shuffle_channels_shape_inference.cpp @@ -0,0 +1,26 @@ +// Copyright (C) 2018-2021 
Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include + +#include +#include +#include + +#include "utils/shape_inference/static_shape.hpp" + +using namespace ov; + +TEST(StaticShapeInferenceTest, ShuffleChannelsTest) { + const auto data = std::make_shared(element::f32, PartialShape{-1, -1, -1}); + const auto axis = -1; + const auto group = 3; + const auto shuffle_channels = std::make_shared(data, axis, group); + + std::vector static_input_shapes = {StaticShape{5, 4, 9}}; + std::vector static_output_shapes = {StaticShape{}}; + shape_inference(shuffle_channels.get(), static_input_shapes, static_output_shapes); + + ASSERT_EQ(static_output_shapes[0], static_input_shapes[0]); +} \ No newline at end of file From cb9fe0910dfa4a71200117553ecdc2fee7cb1d78 Mon Sep 17 00:00:00 2001 From: Vladislav Volkov Date: Wed, 29 Dec 2021 09:09:56 +0300 Subject: [PATCH 14/78] [CPU] Broken support for Layout::ANY in CPU plugin (#9434) --- src/plugins/intel_cpu/src/mkldnn_graph.cpp | 6 +- .../cpu/subgraph_tests/src/any_layout.cpp | 95 +++++++++++++++++++ 2 files changed, 100 insertions(+), 1 deletion(-) create mode 100644 src/tests/functional/plugin/cpu/subgraph_tests/src/any_layout.cpp diff --git a/src/plugins/intel_cpu/src/mkldnn_graph.cpp b/src/plugins/intel_cpu/src/mkldnn_graph.cpp index 37d8633bd5c..15d6f1b79a5 100644 --- a/src/plugins/intel_cpu/src/mkldnn_graph.cpp +++ b/src/plugins/intel_cpu/src/mkldnn_graph.cpp @@ -810,7 +810,11 @@ void MKLDNNGraph::PullOutputData(BlobMap &out) { if (ext_blob_ptr == intr_blob_ptr) continue; if (actualDesc.getBlockingDesc() != expectedDesc.getBlockingDesc() && !isScalarOutput) { - auto outBlobDesc = MemoryDescUtils::convertToDnnlBlockedMemoryDesc(expectedDesc); + // User can initialize output via SetOutput API using tensorDesc with ANY layout. + // For these cases we create planar memory descriptor. + auto outBlobDesc = expectedDesc.getLayout() == InferenceEngine::Layout::ANY + ? 
DnnlBlockedMemoryDesc(expectedDesc.getPrecision(), Shape(expectedDesc.getDims())) + : MemoryDescUtils::convertToDnnlBlockedMemoryDesc(expectedDesc); auto outBloMem = MKLDNNMemory(eng); outBloMem.Create(outBlobDesc, ext_blob_ptr, false); diff --git a/src/tests/functional/plugin/cpu/subgraph_tests/src/any_layout.cpp b/src/tests/functional/plugin/cpu/subgraph_tests/src/any_layout.cpp new file mode 100644 index 00000000000..072db80c8a5 --- /dev/null +++ b/src/tests/functional/plugin/cpu/subgraph_tests/src/any_layout.cpp @@ -0,0 +1,95 @@ +// Copyright (C) 2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "test_utils/cpu_test_utils.hpp" + +using namespace InferenceEngine; + +namespace SubgraphTestsDefinitions { + +class AnyLayoutOnInputsAndOutputs : public ::testing::TestWithParam { +public: + static std::string getTestCaseName(::testing::TestParamInfo obj) { + std::ostringstream result; + result << "shape=" << obj.param; + return result.str(); + } + +protected: + std::shared_ptr + create_test_function(const ov::Shape & shape) { + auto param = std::make_shared(ov::element::f32, shape); + + float shift = 1.0f; + auto shift_node = std::make_shared(ov::element::f32, ov::Shape{1}, &shift); + + auto add = std::make_shared(param, shift_node); + + auto result = std::make_shared(add); + + return std::make_shared(ngraph::ResultVector{result}, ngraph::ParameterVector{param}); + } + + void Run() { + const ov::Shape & shape = GetParam(); + auto shape_size = ov::shape_size(shape); + + std::vector input_data(shape_size, 2); + std::vector output_data(shape_size); + std::vector expected_output(shape_size, 3); + + // Create CNNNetwork + auto ngraph_function = create_test_function(shape); + auto cnn = InferenceEngine::CNNNetwork(ngraph_function); + + // Fill inputs and outputs + std::vector input_names; + std::vector out_names; + for (const auto& it : cnn.getInputsInfo()) { + input_names.push_back(it.first); + } + for (const auto& it : cnn.getOutputsInfo()) { 
+ out_names.push_back(it.first); + } + + BlobMap inputBlobs; + BlobMap outputBlobs; + + TensorDesc tensorDescInp1(Precision::FP32, shape, Layout::ANY); + TensorDesc tensorDescOut(Precision::FP32, shape, Layout::ANY); + + inputBlobs[input_names[0]] = make_shared_blob(tensorDescInp1, input_data.data()); + outputBlobs[out_names[0]] = make_shared_blob(tensorDescOut, output_data.data()); + + // Load network + Core ie; + ExecutableNetwork executable_network = ie.LoadNetwork(cnn, "CPU"); + + // Infer + InferRequest infer_request = executable_network.CreateInferRequest(); + infer_request.SetInput(inputBlobs); + infer_request.SetOutput(outputBlobs); + infer_request.Infer(); + + ASSERT_EQ(output_data, expected_output); + } +}; + +TEST_P(AnyLayoutOnInputsAndOutputs, CheckExpectedResult) { + SKIP_IF_CURRENT_TEST_IS_DISABLED() + Run(); +} + +static AnyLayoutOnInputsAndOutputs::ParamType AnyLayoutOnInputsAndOutputsParams[] = { + ov::Shape{ 1, 2, 3, 4 }, + ov::Shape{ 1, 2, 3, 4, 5 }, + ov::Shape{ 1, 2, 3, 4, 5, 6 }, +}; + +INSTANTIATE_TEST_SUITE_P(AnyLayoutOnInputsAndOutputs, + AnyLayoutOnInputsAndOutputs, + ::testing::ValuesIn(AnyLayoutOnInputsAndOutputsParams), + AnyLayoutOnInputsAndOutputs::getTestCaseName); + +} // namespace SubgraphTestsDefinitions From 2870dc7d3f155f5774885a414c9196d5298e5177 Mon Sep 17 00:00:00 2001 From: Maksim Kutakov Date: Wed, 29 Dec 2021 09:19:45 +0300 Subject: [PATCH 15/78] [CPU] Cache for runtime data (#9192) Caching added for Eltwise and MatMul nodes --- .../interface/ie_internal_plugin_config.hpp | 7 + src/plugins/intel_cpu/src/cache/cache_entry.h | 69 + src/plugins/intel_cpu/src/cache/lru_cache.h | 106 ++ .../intel_cpu/src/cache/multi_cache.cpp | 9 + src/plugins/intel_cpu/src/cache/multi_cache.h | 84 ++ src/plugins/intel_cpu/src/config.cpp | 11 + src/plugins/intel_cpu/src/config.h | 1 + .../src/emitters/jit_bf16_emitters.hpp | 4 +- .../src/emitters/jit_eltwise_emitters.cpp | 110 +- .../src/emitters/jit_eltwise_emitters.hpp | 52 +- 
.../intel_cpu/src/emitters/jit_emitter.hpp | 2 +- .../src/emitters/jit_load_store_emitters.cpp | 10 +- .../src/emitters/jit_load_store_emitters.hpp | 4 +- .../src/emitters/jit_mkldnn_emitters.cpp | 19 +- .../src/emitters/jit_mkldnn_emitters.hpp | 6 +- src/plugins/intel_cpu/src/mkldnn_graph.cpp | 7 +- src/plugins/intel_cpu/src/mkldnn_graph.h | 3 + src/plugins/intel_cpu/src/mkldnn_node.h | 30 +- .../intel_cpu/src/nodes/common/softmax.cpp | 2 +- .../src/nodes/mkldnn_eltwise_node.cpp | 1043 ++++++++------ .../intel_cpu/src/nodes/mkldnn_eltwise_node.h | 68 +- .../src/nodes/mkldnn_interpolate_node.cpp | 2 +- .../src/nodes/mkldnn_matmul_node.cpp | 119 +- .../intel_cpu/src/nodes/mkldnn_matmul_node.h | 1 - .../intel_cpu/src/nodes/mkldnn_mvn_node.cpp | 6 +- .../nodes/mkldnn_non_max_suppression_node.cpp | 4 +- .../src/nodes/mkldnn_normalize_node.cpp | 20 +- .../src/nodes/mkldnn_reduce_node.cpp | 4 +- .../src/nodes/mkldnn_region_yolo_node.cpp | 2 +- .../src/nodes/mkldnn_roi_pooling_node.cpp | 4 +- src/plugins/intel_cpu/thirdparty/mkl-dnn | 2 +- .../plugin/cpu/single_layer_tests/eltwise.cpp | 1196 ++++++++++------- .../plugin/cpu/single_layer_tests/mat_mul.cpp | 8 +- .../cpu/subgraph_tests/src/eltwise_chain.cpp | 771 ++++++----- .../plugin/cpu/test_utils/cpu_test_utils.cpp | 8 +- .../cpu/test_utils/fusing_test_utils.hpp | 23 + src/tests/unit/cpu/rt_cache.cpp | 381 ++++++ 37 files changed, 2776 insertions(+), 1422 deletions(-) create mode 100644 src/plugins/intel_cpu/src/cache/cache_entry.h create mode 100644 src/plugins/intel_cpu/src/cache/lru_cache.h create mode 100644 src/plugins/intel_cpu/src/cache/multi_cache.cpp create mode 100644 src/plugins/intel_cpu/src/cache/multi_cache.h create mode 100644 src/tests/unit/cpu/rt_cache.cpp diff --git a/src/inference/dev_api/cpp_interfaces/interface/ie_internal_plugin_config.hpp b/src/inference/dev_api/cpp_interfaces/interface/ie_internal_plugin_config.hpp index e532274c506..1d87c5e5053 100644 --- 
a/src/inference/dev_api/cpp_interfaces/interface/ie_internal_plugin_config.hpp +++ b/src/inference/dev_api/cpp_interfaces/interface/ie_internal_plugin_config.hpp @@ -45,6 +45,13 @@ DECLARE_CONFIG_KEY(LP_TRANSFORMS_MODE); */ DECLARE_CONFIG_KEY(CPU_THREADS_PER_STREAM); +/** + * @brief Defines how many records can be stored in the CPU runtime parameters cache per CPU runtime parameter type per + * stream + * @ingroup ie_dev_api_plugin_api + */ +DECLARE_CONFIG_KEY(CPU_RUNTIME_CACHE_CAPACITY); + /** * @brief This key should be used to force disable export while loading network even if global cache dir is defined * Used by HETERO plugin to disable automatic caching of subnetworks (set value to YES) diff --git a/src/plugins/intel_cpu/src/cache/cache_entry.h b/src/plugins/intel_cpu/src/cache/cache_entry.h new file mode 100644 index 00000000000..f8e6c1fb1ee --- /dev/null +++ b/src/plugins/intel_cpu/src/cache/cache_entry.h @@ -0,0 +1,69 @@ +// Copyright (C) 2018-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include +#include "lru_cache.h" + +namespace MKLDNNPlugin { + +class CacheEntryBase { +public: + enum class LookUpStatus : int8_t { + Hit, + Miss + }; +public: + virtual ~CacheEntryBase() = default; +}; + +/** + * @brief Class represents a templated record in multi cache + * @tparam KeyType is a key type that must define hash() const method with return type convertible to size_t and define comparison operator. + * @tparam ValType is a type that must meet all the requirements to the std::unordered_map mapped type + * @tparam ImplType is a type for the internal storage. It must provide put(KeyType, ValueType) and ValueType get(const KeyType&) + * interface and must have constructor of type ImplType(size_t). + * + * @note In this implementation default constructed value objects are treated as empty objects. 
+ */ + +template> +class CacheEntry : public CacheEntryBase { +public: + using ResultType = std::pair; + +public: + explicit CacheEntry(size_t capacity) : _impl(capacity) {} + + /** + * @brief Searches the key in the underlying storage and returns value if it exists, or creates a value using the builder functor and adds it to + * the underlying storage. + * @param key is the search key + * @param builder is a callable object that creates the ValType object from the KeyType lval reference + * @return result of the operation which is a pair of the requested object of ValType and the status of whether the cache hit or miss occurred + */ + + ResultType getOrCreate(const KeyType& key, std::function builder) { + if (0 == _impl.getCapacity()) { + // fast track + return {builder(key), CacheEntryBase::LookUpStatus::Miss}; + } + auto retStatus = LookUpStatus::Hit; + ValType retVal = _impl.get(key); + if (retVal == ValType()) { + retStatus = LookUpStatus::Miss; + retVal = builder(key); + _impl.put(key, retVal); + } + return {retVal, retStatus}; + } + +public: + ImplType _impl; +}; +}// namespace MKLDNNPlugin diff --git a/src/plugins/intel_cpu/src/cache/lru_cache.h b/src/plugins/intel_cpu/src/cache/lru_cache.h new file mode 100644 index 00000000000..ffe0de66500 --- /dev/null +++ b/src/plugins/intel_cpu/src/cache/lru_cache.h @@ -0,0 +1,106 @@ +// Copyright (C) 2018-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include + +/** + * @brief This is yet another implementation of a preemptive cache with LRU eviction policy. + * @tparam Key is a key type that must define hash() const method with return type convertible to size_t and define comparison operator. + * @tparam Value is a type that must meet all the requirements to the std::unordered_map mapped type + * + * @attention This cache implementation IS NOT THREAD SAFE! 
+ */ + +namespace MKLDNNPlugin { + +template +class LruCache { +public: + using value_type = std::pair; + +public: + explicit LruCache(size_t capacity) : _capacity(capacity) {} + + /** + * @brief Puts the value associated with the key into the cache. + * @param key + * @param value + */ + + void put(Key key, Value val) { + if (0 == _capacity) { + return; + } + auto mapItr = _cacheMapper.find(key); + if (mapItr != _cacheMapper.end()) { + touch(mapItr->second); + mapItr->second->second = std::move(val); + } else { + if (_cacheMapper.size() == _capacity) { + evict(1); + } + auto itr = _lruList.insert(_lruList.begin(), {key, std::move(val)}); + _cacheMapper.insert({std::move(key), itr}); + } + } + + /** + * @brief Searches a value associated with the key. + * @param key + * @return Value associated with the key or default constructed instance of the Value type. + */ + + Value get(const Key &key) { + auto itr = _cacheMapper.find(key); + if (itr == _cacheMapper.end()) { + return Value(); + } + + touch(itr->second); + return _lruList.front().second; + } + + /** + * @brief Evicts n least recently used cache records + * @param n number of records to be evicted, can be greater than capacity + */ + + void evict(size_t n) { + for (size_t i = 0; i < n && !_lruList.empty(); ++i) { + _cacheMapper.erase(_lruList.back().first); + _lruList.pop_back(); + } + } + + /** + * @brief Returns the current capacity value + * @return the current capacity value + */ + size_t getCapacity() const noexcept { + return _capacity; + } + +private: + struct key_hasher { + std::size_t operator()(const Key &k) const { + return k.hash(); + } + }; + + using lru_list_type = std::list; + using cache_map_value_type = typename lru_list_type::iterator; + + void touch(typename lru_list_type::iterator itr) { + _lruList.splice(_lruList.begin(), _lruList, itr); + } + + lru_list_type _lruList; + std::unordered_map _cacheMapper; + size_t _capacity; +}; + +} // namespace MKLDNNPlugin \ No newline at end of file diff 
--git a/src/plugins/intel_cpu/src/cache/multi_cache.cpp b/src/plugins/intel_cpu/src/cache/multi_cache.cpp new file mode 100644 index 00000000000..521a64c533e --- /dev/null +++ b/src/plugins/intel_cpu/src/cache/multi_cache.cpp @@ -0,0 +1,9 @@ +// Copyright (C) 2018-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "multi_cache.h" + +using namespace MKLDNNPlugin; + +std::atomic_size_t MultiCache::_typeIdCounter{0}; \ No newline at end of file diff --git a/src/plugins/intel_cpu/src/cache/multi_cache.h b/src/plugins/intel_cpu/src/cache/multi_cache.h new file mode 100644 index 00000000000..9184f59ba41 --- /dev/null +++ b/src/plugins/intel_cpu/src/cache/multi_cache.h @@ -0,0 +1,84 @@ +// Copyright (C) 2018-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include +#include +#include "cache_entry.h" + +namespace MKLDNNPlugin { + +/** + * @brief Class that represent a preemptive cache for different key/value pair types. + * + * @attention This implementation IS NOT THREAD SAFE! + */ + +class MultiCache { +public: + template + using EntryTypeT = CacheEntry; + using EntryBasePtr = std::shared_ptr; + template + using EntryPtr = std::shared_ptr>; + +public: + /** + * @param capacity here means maximum records limit FOR EACH entry specified by a pair of Key/Value types. + * @note zero capacity means empty cache so no records are stored and no entries are created + */ + explicit MultiCache(size_t capacity) : _capacity(capacity) {} + + /** + * @brief Searches a value of ValueType in the cache using the provided key or creates a new ValueType instance (if nothing was found) + * using the key and the builder functor and adds the new record to the cache + * @param key is the search key + * @param builder is a callable object that creates the ValType object from the KeyType lval reference. 
+ * Also the builder type is used for the ValueType deduction + * @return result of the operation which is a pair of the requested object of ValType and the status of whether the cache hit or miss occurred + */ + + template::type> + typename CacheEntry::ResultType + getOrCreate(const KeyType& key, BuilderType builder) { + auto entry = getEntry(); + return entry->getOrCreate(key, std::move(builder)); + } + +private: + template + size_t getTypeId(); + template + EntryPtr getEntry(); + +private: + static std::atomic_size_t _typeIdCounter; + size_t _capacity; + std::unordered_map _storage; +}; + +template +size_t MultiCache::getTypeId() { + static size_t id = _typeIdCounter.fetch_add(1); + return id; +} + +template +MultiCache::EntryPtr MultiCache::getEntry() { + using EntryType = EntryTypeT; + size_t id = getTypeId(); + auto itr = _storage.find(id); + if (itr == _storage.end()) { + auto result = _storage.insert({id, std::make_shared(_capacity)}); + itr = result.first; + } + return std::static_pointer_cast(itr->second); +} + +using MultiCachePtr = std::shared_ptr; +using MultiCacheCPtr = std::shared_ptr; + +} // namespace MKLDNNPlugin \ No newline at end of file diff --git a/src/plugins/intel_cpu/src/config.cpp b/src/plugins/intel_cpu/src/config.cpp index 02d12b349e2..f820a2a97da 100644 --- a/src/plugins/intel_cpu/src/config.cpp +++ b/src/plugins/intel_cpu/src/config.cpp @@ -117,6 +117,17 @@ void Config::readProperties(const std::map &prop) { } } else if (key == PluginConfigParams::KEY_CACHE_DIR) { cache_dir = val; + } else if (PluginConfigInternalParams::KEY_CPU_RUNTIME_CACHE_CAPACITY == key) { + int val_i = -1; + try { + val_i = std::stoi(val); + } catch (const std::exception&) { + IE_THROW() << "Wrong value for property key " << PluginConfigInternalParams::KEY_CPU_RUNTIME_CACHE_CAPACITY + << ". 
Expected only integer numbers"; + } + // any negative value will be treated + // as zero that means disabling the cache + rtCacheCapacity = std::max(val_i, 0); } else { IE_THROW(NotFound) << "Unsupported property " << key << " by CPU plugin"; } diff --git a/src/plugins/intel_cpu/src/config.h b/src/plugins/intel_cpu/src/config.h index 6a1e785866c..8ee5cfa0cc0 100644 --- a/src/plugins/intel_cpu/src/config.h +++ b/src/plugins/intel_cpu/src/config.h @@ -26,6 +26,7 @@ struct Config { bool enableDynamicBatch = false; std::string dumpToDot = ""; int batchLimit = 0; + size_t rtCacheCapacity = 100ul; InferenceEngine::IStreamsExecutor::Config streamExecutorConfig; InferenceEngine::PerfHintsConfig perfHintsConfig; #if defined(__arm__) || defined(__aarch64__) diff --git a/src/plugins/intel_cpu/src/emitters/jit_bf16_emitters.hpp b/src/plugins/intel_cpu/src/emitters/jit_bf16_emitters.hpp index 454a6c335af..bf6edac0085 100644 --- a/src/plugins/intel_cpu/src/emitters/jit_bf16_emitters.hpp +++ b/src/plugins/intel_cpu/src/emitters/jit_bf16_emitters.hpp @@ -10,8 +10,8 @@ namespace MKLDNNPlugin { class jit_emu_vcvtneps2bf16 : public jit_emitter { public: - jit_emu_vcvtneps2bf16(mkldnn::impl::cpu::x64::jit_generator* host, mkldnn::impl::cpu::x64::cpu_isa_t host_isa, const MKLDNNNode* node, - InferenceEngine::Precision exec_prc = InferenceEngine::Precision::BF16) : jit_emitter(host, host_isa, node, exec_prc) { + jit_emu_vcvtneps2bf16(mkldnn::impl::cpu::x64::jit_generator* host, mkldnn::impl::cpu::x64::cpu_isa_t host_isa, + InferenceEngine::Precision exec_prc = InferenceEngine::Precision::BF16) : jit_emitter(host, host_isa, exec_prc) { prepare_table(); } diff --git a/src/plugins/intel_cpu/src/emitters/jit_eltwise_emitters.cpp b/src/plugins/intel_cpu/src/emitters/jit_eltwise_emitters.cpp index c801082c01f..5605c69f218 100644 --- a/src/plugins/intel_cpu/src/emitters/jit_eltwise_emitters.cpp +++ b/src/plugins/intel_cpu/src/emitters/jit_eltwise_emitters.cpp @@ -18,8 +18,8 @@ namespace 
MKLDNNPlugin { /// ADD /// jit_add_emitter::jit_add_emitter(jit_generator *host, cpu_isa_t host_isa, const std::shared_ptr& node, Precision exec_prc) : jit_emitter(host, host_isa, node, exec_prc) {} -jit_add_emitter::jit_add_emitter(jit_generator *host, cpu_isa_t host_isa, const MKLDNNNode* node, Precision exec_prc) -: jit_emitter(host, host_isa, node, exec_prc) {} +jit_add_emitter::jit_add_emitter(jit_generator *host, cpu_isa_t host_isa, Precision exec_prc) +: jit_emitter(host, host_isa, exec_prc) {} size_t jit_add_emitter::get_inputs_num() const { return 2; } @@ -55,8 +55,8 @@ void jit_add_emitter::emit_isa(const std::vector &in_vec_idxs, const std /// MUL_ADD /// jit_mul_add_emitter::jit_mul_add_emitter(jit_generator *host, cpu_isa_t host_isa, const std::shared_ptr& node, Precision exec_prc) : jit_emitter(host, host_isa, node, exec_prc) {} -jit_mul_add_emitter::jit_mul_add_emitter(jit_generator *host, cpu_isa_t host_isa, const MKLDNNNode* node, Precision exec_prc) -: jit_emitter(host, host_isa, node, exec_prc) {} +jit_mul_add_emitter::jit_mul_add_emitter(jit_generator *host, cpu_isa_t host_isa, Precision exec_prc) +: jit_emitter(host, host_isa, exec_prc) {} size_t jit_mul_add_emitter::get_inputs_num() const { return 3; } @@ -117,8 +117,8 @@ size_t jit_mul_add_emitter::aux_vecs_count() const { /// SUB /// jit_subtract_emitter::jit_subtract_emitter(jit_generator *host, cpu_isa_t host_isa, const std::shared_ptr& node, Precision exec_prc) : jit_emitter(host, host_isa, node, exec_prc) {} -jit_subtract_emitter::jit_subtract_emitter(jit_generator *host, cpu_isa_t host_isa, const MKLDNNNode* node, Precision exec_prc) -: jit_emitter(host, host_isa, node, exec_prc) {} +jit_subtract_emitter::jit_subtract_emitter(jit_generator *host, cpu_isa_t host_isa, Precision exec_prc) +: jit_emitter(host, host_isa, exec_prc) {} size_t jit_subtract_emitter::get_inputs_num() const { return 2; } @@ -155,8 +155,8 @@ void jit_subtract_emitter::emit_isa(const std::vector &in_vec_idxs, cons 
/// MULTIPLY /// jit_multiply_emitter::jit_multiply_emitter(jit_generator *host, cpu_isa_t host_isa, const std::shared_ptr& node, Precision exec_prc) : jit_emitter(host, host_isa, node, exec_prc) {} -jit_multiply_emitter::jit_multiply_emitter(jit_generator *host, cpu_isa_t host_isa, const MKLDNNNode* node, Precision exec_prc) -: jit_emitter(host, host_isa, node, exec_prc) {} +jit_multiply_emitter::jit_multiply_emitter(jit_generator *host, cpu_isa_t host_isa, Precision exec_prc) +: jit_emitter(host, host_isa, exec_prc) {} size_t jit_multiply_emitter::get_inputs_num() const { return 2; } @@ -193,8 +193,8 @@ void jit_multiply_emitter::emit_isa(const std::vector &in_vec_idxs, cons /// DIVIDE /// jit_divide_emitter::jit_divide_emitter(jit_generator *host, cpu_isa_t host_isa, const std::shared_ptr& node, Precision exec_prc) : jit_emitter(host, host_isa, node, exec_prc) {} -jit_divide_emitter::jit_divide_emitter(jit_generator *host, cpu_isa_t host_isa, const MKLDNNNode* node, Precision exec_prc) -: jit_emitter(host, host_isa, node, exec_prc) {} +jit_divide_emitter::jit_divide_emitter(jit_generator *host, cpu_isa_t host_isa, Precision exec_prc) +: jit_emitter(host, host_isa, exec_prc) {} size_t jit_divide_emitter::get_inputs_num() const { return 2; } @@ -260,8 +260,8 @@ size_t jit_divide_emitter::aux_vecs_count() const { /// FLOOR_MOD /// jit_floor_mod_emitter::jit_floor_mod_emitter(jit_generator *host, cpu_isa_t host_isa, const std::shared_ptr& node, Precision exec_prc) : jit_emitter(host, host_isa, node, exec_prc) {} -jit_floor_mod_emitter::jit_floor_mod_emitter(jit_generator *host, cpu_isa_t host_isa, const MKLDNNNode* node, Precision exec_prc) -: jit_emitter(host, host_isa, node, exec_prc) {} +jit_floor_mod_emitter::jit_floor_mod_emitter(jit_generator *host, cpu_isa_t host_isa, Precision exec_prc) +: jit_emitter(host, host_isa, exec_prc) {} size_t jit_floor_mod_emitter::get_inputs_num() const { return 2; } @@ -312,8 +312,8 @@ size_t 
jit_floor_mod_emitter::aux_vecs_count() const { /// MOD /// jit_mod_emitter::jit_mod_emitter(jit_generator *host, cpu_isa_t host_isa, const std::shared_ptr& node, Precision exec_prc) : jit_emitter(host, host_isa, node, exec_prc) {} -jit_mod_emitter::jit_mod_emitter(jit_generator *host, cpu_isa_t host_isa, const MKLDNNNode* node, Precision exec_prc) -: jit_emitter(host, host_isa, node, exec_prc) {} +jit_mod_emitter::jit_mod_emitter(jit_generator *host, cpu_isa_t host_isa, Precision exec_prc) +: jit_emitter(host, host_isa, exec_prc) {} size_t jit_mod_emitter::get_inputs_num() const { return 2; } @@ -364,8 +364,8 @@ size_t jit_mod_emitter::aux_vecs_count() const { /// MAXIMUM /// jit_maximum_emitter::jit_maximum_emitter(jit_generator *host, cpu_isa_t host_isa, const std::shared_ptr& node, Precision exec_prc) : jit_emitter(host, host_isa, node, exec_prc) {} -jit_maximum_emitter::jit_maximum_emitter(jit_generator *host, cpu_isa_t host_isa, const MKLDNNNode* node, Precision exec_prc) -: jit_emitter(host, host_isa, node, exec_prc) {} +jit_maximum_emitter::jit_maximum_emitter(jit_generator *host, cpu_isa_t host_isa, Precision exec_prc) +: jit_emitter(host, host_isa, exec_prc) {} size_t jit_maximum_emitter::get_inputs_num() const { return 2; } @@ -414,8 +414,8 @@ std::set jit_maximum_emitter::get_supported_precisio /// MINIMUM /// jit_minimum_emitter::jit_minimum_emitter(jit_generator *host, cpu_isa_t host_isa, const std::shared_ptr& node, Precision exec_prc) : jit_emitter(host, host_isa, node, exec_prc) {} -jit_minimum_emitter::jit_minimum_emitter(jit_generator *host, cpu_isa_t host_isa, const MKLDNNNode* node, Precision exec_prc) -: jit_emitter(host, host_isa, node, exec_prc) {} +jit_minimum_emitter::jit_minimum_emitter(jit_generator *host, cpu_isa_t host_isa, Precision exec_prc) +: jit_emitter(host, host_isa, exec_prc) {} size_t jit_minimum_emitter::get_inputs_num() const { return 2; } @@ -465,8 +465,8 @@ std::set jit_minimum_emitter::get_supported_precisio 
jit_squared_difference_emitter::jit_squared_difference_emitter( jit_generator *host, cpu_isa_t host_isa, const std::shared_ptr& node, Precision exec_prc) : jit_emitter(host, host_isa, node, exec_prc) {} -jit_squared_difference_emitter::jit_squared_difference_emitter(jit_generator *host, cpu_isa_t host_isa, const MKLDNNNode* node, Precision exec_prc) -: jit_emitter(host, host_isa, node, exec_prc) {} +jit_squared_difference_emitter::jit_squared_difference_emitter(jit_generator *host, cpu_isa_t host_isa, Precision exec_prc) +: jit_emitter(host, host_isa, exec_prc) {} size_t jit_squared_difference_emitter::get_inputs_num() const { return 2; } @@ -506,8 +506,8 @@ void jit_squared_difference_emitter::emit_isa(const std::vector &in_vec_ /// POWER_DYNAMIC /// jit_power_dynamic_emitter::jit_power_dynamic_emitter(jit_generator *host, cpu_isa_t host_isa, const std::shared_ptr& node, Precision exec_prc) : jit_emitter(host, host_isa, node, exec_prc) {} -jit_power_dynamic_emitter::jit_power_dynamic_emitter(jit_generator *host, cpu_isa_t host_isa, const MKLDNNNode* node, Precision exec_prc) -: jit_emitter(host, host_isa, node, exec_prc) {} +jit_power_dynamic_emitter::jit_power_dynamic_emitter(jit_generator *host, cpu_isa_t host_isa, Precision exec_prc) +: jit_emitter(host, host_isa, exec_prc) {} size_t jit_power_dynamic_emitter::get_inputs_num() const { return 2; } @@ -617,8 +617,8 @@ jit_equal_emitter::jit_equal_emitter(jit_generator *host, cpu_isa_t host_isa, co : jit_emitter(host, host_isa, node, exec_prc) { prepare_table(); } -jit_equal_emitter::jit_equal_emitter(jit_generator *host, cpu_isa_t host_isa, const MKLDNNNode* node, Precision exec_prc) -: jit_emitter(host, host_isa, node, exec_prc) { +jit_equal_emitter::jit_equal_emitter(jit_generator *host, cpu_isa_t host_isa, Precision exec_prc) +: jit_emitter(host, host_isa, exec_prc) { prepare_table(); } @@ -678,8 +678,8 @@ jit_not_equal_emitter::jit_not_equal_emitter(jit_generator *host, cpu_isa_t host : jit_emitter(host, 
host_isa, node, exec_prc) { prepare_table(); } -jit_not_equal_emitter::jit_not_equal_emitter(jit_generator *host, cpu_isa_t host_isa, const MKLDNNNode* node, Precision exec_prc) -: jit_emitter(host, host_isa, node, exec_prc) { +jit_not_equal_emitter::jit_not_equal_emitter(jit_generator *host, cpu_isa_t host_isa, Precision exec_prc) +: jit_emitter(host, host_isa, exec_prc) { prepare_table(); } @@ -739,8 +739,8 @@ jit_greater_emitter::jit_greater_emitter(jit_generator *host, cpu_isa_t host_isa : jit_emitter(host, host_isa, node, exec_prc) { prepare_table(); } -jit_greater_emitter::jit_greater_emitter(jit_generator *host, cpu_isa_t host_isa, const MKLDNNNode* node, Precision exec_prc) -: jit_emitter(host, host_isa, node, exec_prc) { +jit_greater_emitter::jit_greater_emitter(jit_generator *host, cpu_isa_t host_isa, Precision exec_prc) +: jit_emitter(host, host_isa, exec_prc) { prepare_table(); } @@ -800,8 +800,8 @@ jit_greater_equal_emitter::jit_greater_equal_emitter(jit_generator *host, cpu_is : jit_emitter(host, host_isa, node, exec_prc) { prepare_table(); } -jit_greater_equal_emitter::jit_greater_equal_emitter(jit_generator *host, cpu_isa_t host_isa, const MKLDNNNode* node, Precision exec_prc) -: jit_emitter(host, host_isa, node, exec_prc) { +jit_greater_equal_emitter::jit_greater_equal_emitter(jit_generator *host, cpu_isa_t host_isa, Precision exec_prc) +: jit_emitter(host, host_isa, exec_prc) { prepare_table(); } @@ -861,8 +861,8 @@ jit_less_emitter::jit_less_emitter(jit_generator *host, cpu_isa_t host_isa, cons : jit_emitter(host, host_isa, node, exec_prc) { prepare_table(); } -jit_less_emitter::jit_less_emitter(jit_generator *host, cpu_isa_t host_isa, const MKLDNNNode* node, Precision exec_prc) -: jit_emitter(host, host_isa, node, exec_prc) { +jit_less_emitter::jit_less_emitter(jit_generator *host, cpu_isa_t host_isa, Precision exec_prc) +: jit_emitter(host, host_isa, exec_prc) { prepare_table(); } @@ -922,8 +922,8 @@ 
jit_less_equal_emitter::jit_less_equal_emitter(jit_generator *host, cpu_isa_t ho : jit_emitter(host, host_isa, node, exec_prc) { prepare_table(); } -jit_less_equal_emitter::jit_less_equal_emitter(jit_generator *host, cpu_isa_t host_isa, const MKLDNNNode* node, Precision exec_prc) -: jit_emitter(host, host_isa, node, exec_prc) { +jit_less_equal_emitter::jit_less_equal_emitter(jit_generator *host, cpu_isa_t host_isa, Precision exec_prc) +: jit_emitter(host, host_isa, exec_prc) { prepare_table(); } @@ -984,8 +984,8 @@ jit_logical_and_emitter::jit_logical_and_emitter(jit_generator *host, cpu_isa_t : jit_emitter(host, host_isa, node, exec_prc) { prepare_table(); } -jit_logical_and_emitter::jit_logical_and_emitter(jit_generator *host, cpu_isa_t host_isa, const MKLDNNNode* node, Precision exec_prc) -: jit_emitter(host, host_isa, node, exec_prc) { +jit_logical_and_emitter::jit_logical_and_emitter(jit_generator *host, cpu_isa_t host_isa, Precision exec_prc) +: jit_emitter(host, host_isa, exec_prc) { prepare_table(); } @@ -1066,8 +1066,8 @@ jit_logical_or_emitter::jit_logical_or_emitter(jit_generator *host, cpu_isa_t ho : jit_emitter(host, host_isa, node, exec_prc) { prepare_table(); } -jit_logical_or_emitter::jit_logical_or_emitter(jit_generator *host, cpu_isa_t host_isa, const MKLDNNNode* node, Precision exec_prc) -: jit_emitter(host, host_isa, node, exec_prc) { +jit_logical_or_emitter::jit_logical_or_emitter(jit_generator *host, cpu_isa_t host_isa, Precision exec_prc) +: jit_emitter(host, host_isa, exec_prc) { prepare_table(); } @@ -1147,8 +1147,8 @@ jit_logical_xor_emitter::jit_logical_xor_emitter(jit_generator *host, cpu_isa_t : jit_emitter(host, host_isa, node, exec_prc) { prepare_table(); } -jit_logical_xor_emitter::jit_logical_xor_emitter(jit_generator *host, cpu_isa_t host_isa, const MKLDNNNode* node, Precision exec_prc) -: jit_emitter(host, host_isa, node, exec_prc) { +jit_logical_xor_emitter::jit_logical_xor_emitter(jit_generator *host, cpu_isa_t host_isa, 
Precision exec_prc) +: jit_emitter(host, host_isa, exec_prc) { prepare_table(); } @@ -1228,8 +1228,8 @@ jit_logical_not_emitter::jit_logical_not_emitter(jit_generator *host, cpu_isa_t : jit_emitter(host, host_isa, node, exec_prc) { prepare_table(); } -jit_logical_not_emitter::jit_logical_not_emitter(jit_generator *host, cpu_isa_t host_isa, const MKLDNNNode* node, Precision exec_prc) -: jit_emitter(host, host_isa, node, exec_prc) { +jit_logical_not_emitter::jit_logical_not_emitter(jit_generator *host, cpu_isa_t host_isa, Precision exec_prc) +: jit_emitter(host, host_isa, exec_prc) { prepare_table(); } @@ -1298,16 +1298,10 @@ jit_power_static_emitter::jit_power_static_emitter(jit_generator *host, cpu_isa_ prepare_table(); } -jit_power_static_emitter::jit_power_static_emitter(jit_generator *host, cpu_isa_t host_isa, const MKLDNNNode* node, Precision exec_prc) -: jit_emitter(host, host_isa, node, exec_prc) { - const MKLDNNEltwiseNode *powerNode = dynamic_cast(node); - if (powerNode == nullptr) { - IE_THROW() << "Can't cast to MKLDNNEltwiseNode"; - } - power = powerNode->getAlpha(); - scale = powerNode->getBeta(); - shift = powerNode->getGamma(); - +jit_power_static_emitter::jit_power_static_emitter(jit_generator *host, cpu_isa_t host_isa, + float inpPower, float inpScale, float inpShift, + Precision exec_prc) +: jit_emitter(host, host_isa, exec_prc), power(inpPower), scale(inpScale), shift(inpShift) { prepare_table(); } @@ -1483,8 +1477,8 @@ jit_prelu_emitter::jit_prelu_emitter(jit_generator *host, cpu_isa_t host_isa, co : jit_emitter(host, host_isa, node, exec_prc) { prepare_table(); } -jit_prelu_emitter::jit_prelu_emitter(jit_generator *host, cpu_isa_t host_isa, const MKLDNNNode* node, Precision exec_prc) -: jit_emitter(host, host_isa, node, exec_prc) { +jit_prelu_emitter::jit_prelu_emitter(jit_generator *host, cpu_isa_t host_isa, Precision exec_prc) +: jit_emitter(host, host_isa, exec_prc) { prepare_table(); } size_t jit_prelu_emitter::get_inputs_num() const { 
return 2; } @@ -1541,8 +1535,8 @@ size_t jit_prelu_emitter::aux_vecs_count() const { /// SQRT /// jit_sqrt_emitter::jit_sqrt_emitter(jit_generator *host, cpu_isa_t host_isa, const std::shared_ptr& node, Precision exec_prc) : jit_emitter(host, host_isa, node, exec_prc) {} -jit_sqrt_emitter::jit_sqrt_emitter(jit_generator *host, cpu_isa_t host_isa, const MKLDNNNode* node, Precision exec_prc) -: jit_emitter(host, host_isa, node, exec_prc) {} +jit_sqrt_emitter::jit_sqrt_emitter(jit_generator *host, cpu_isa_t host_isa, Precision exec_prc) +: jit_emitter(host, host_isa, exec_prc) {} size_t jit_sqrt_emitter::get_inputs_num() const { return 1; } @@ -1599,8 +1593,8 @@ void jit_negative_emitter::emit_isa(const std::vector &in_vec_idxs, cons } /// ERF /// -jit_erf_emitter::jit_erf_emitter(jit_generator *host, cpu_isa_t host_isa, const MKLDNNNode* node, Precision exec_prc) -: jit_emitter(host, host_isa, node, exec_prc) { +jit_erf_emitter::jit_erf_emitter(jit_generator *host, cpu_isa_t host_isa, Precision exec_prc) +: jit_emitter(host, host_isa, exec_prc) { prepare_table(); } diff --git a/src/plugins/intel_cpu/src/emitters/jit_eltwise_emitters.hpp b/src/plugins/intel_cpu/src/emitters/jit_eltwise_emitters.hpp index e03d32de9c1..3e930c57915 100644 --- a/src/plugins/intel_cpu/src/emitters/jit_eltwise_emitters.hpp +++ b/src/plugins/intel_cpu/src/emitters/jit_eltwise_emitters.hpp @@ -12,7 +12,7 @@ namespace MKLDNNPlugin { class jit_add_emitter : public jit_emitter { public: - jit_add_emitter(mkldnn::impl::cpu::x64::jit_generator *host, mkldnn::impl::cpu::x64::cpu_isa_t host_isa, const MKLDNNNode* node, + jit_add_emitter(mkldnn::impl::cpu::x64::jit_generator *host, mkldnn::impl::cpu::x64::cpu_isa_t host_isa, InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32); jit_add_emitter(mkldnn::impl::cpu::x64::jit_generator *host, mkldnn::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n, InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32); @@ 
-30,7 +30,7 @@ private: class jit_mul_add_emitter : public jit_emitter { public: - jit_mul_add_emitter(mkldnn::impl::cpu::x64::jit_generator *host, mkldnn::impl::cpu::x64::cpu_isa_t host_isa, const MKLDNNNode* node, + jit_mul_add_emitter(mkldnn::impl::cpu::x64::jit_generator *host, mkldnn::impl::cpu::x64::cpu_isa_t host_isa, InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32); jit_mul_add_emitter(mkldnn::impl::cpu::x64::jit_generator *host, mkldnn::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n, InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32); @@ -51,7 +51,7 @@ private: class jit_subtract_emitter : public jit_emitter { public: - jit_subtract_emitter(mkldnn::impl::cpu::x64::jit_generator *host, mkldnn::impl::cpu::x64::cpu_isa_t host_isa, const MKLDNNNode* node, + jit_subtract_emitter(mkldnn::impl::cpu::x64::jit_generator *host, mkldnn::impl::cpu::x64::cpu_isa_t host_isa, InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32); jit_subtract_emitter(mkldnn::impl::cpu::x64::jit_generator *host, mkldnn::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n, InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32); @@ -70,7 +70,7 @@ private: class jit_multiply_emitter : public jit_emitter { public: - jit_multiply_emitter(mkldnn::impl::cpu::x64::jit_generator *host, mkldnn::impl::cpu::x64::cpu_isa_t host_isa, const MKLDNNNode* node, + jit_multiply_emitter(mkldnn::impl::cpu::x64::jit_generator *host, mkldnn::impl::cpu::x64::cpu_isa_t host_isa, InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32); jit_multiply_emitter(mkldnn::impl::cpu::x64::jit_generator *host, mkldnn::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n, InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32); @@ -89,7 +89,7 @@ private: class jit_divide_emitter : public jit_emitter { public: - jit_divide_emitter(mkldnn::impl::cpu::x64::jit_generator *host, 
mkldnn::impl::cpu::x64::cpu_isa_t host_isa, const MKLDNNNode* node, + jit_divide_emitter(mkldnn::impl::cpu::x64::jit_generator *host, mkldnn::impl::cpu::x64::cpu_isa_t host_isa, InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32); jit_divide_emitter(mkldnn::impl::cpu::x64::jit_generator *host, mkldnn::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n, InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32); @@ -110,7 +110,7 @@ private: class jit_floor_mod_emitter : public jit_emitter { public: - jit_floor_mod_emitter(mkldnn::impl::cpu::x64::jit_generator *host, mkldnn::impl::cpu::x64::cpu_isa_t host_isa, const MKLDNNNode* node, + jit_floor_mod_emitter(mkldnn::impl::cpu::x64::jit_generator *host, mkldnn::impl::cpu::x64::cpu_isa_t host_isa, InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32); jit_floor_mod_emitter(mkldnn::impl::cpu::x64::jit_generator *host, mkldnn::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n, InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32); @@ -130,7 +130,7 @@ private: class jit_mod_emitter : public jit_emitter { public: - jit_mod_emitter(mkldnn::impl::cpu::x64::jit_generator *host, mkldnn::impl::cpu::x64::cpu_isa_t host_isa, const MKLDNNNode* node, + jit_mod_emitter(mkldnn::impl::cpu::x64::jit_generator *host, mkldnn::impl::cpu::x64::cpu_isa_t host_isa, InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32); jit_mod_emitter(mkldnn::impl::cpu::x64::jit_generator *host, mkldnn::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n, InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32); @@ -150,7 +150,7 @@ private: class jit_maximum_emitter : public jit_emitter { public: - jit_maximum_emitter(mkldnn::impl::cpu::x64::jit_generator *host, mkldnn::impl::cpu::x64::cpu_isa_t host_isa, const MKLDNNNode* node, + jit_maximum_emitter(mkldnn::impl::cpu::x64::jit_generator *host, mkldnn::impl::cpu::x64::cpu_isa_t 
host_isa, InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32); jit_maximum_emitter(mkldnn::impl::cpu::x64::jit_generator *host, mkldnn::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n, InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32); @@ -170,7 +170,7 @@ private: class jit_minimum_emitter : public jit_emitter { public: - jit_minimum_emitter(mkldnn::impl::cpu::x64::jit_generator *host, mkldnn::impl::cpu::x64::cpu_isa_t host_isa, const MKLDNNNode* node, + jit_minimum_emitter(mkldnn::impl::cpu::x64::jit_generator *host, mkldnn::impl::cpu::x64::cpu_isa_t host_isa, InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32); jit_minimum_emitter(mkldnn::impl::cpu::x64::jit_generator *host, mkldnn::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n, InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32); @@ -191,7 +191,6 @@ private: class jit_squared_difference_emitter : public jit_emitter { public: jit_squared_difference_emitter(mkldnn::impl::cpu::x64::jit_generator *host, mkldnn::impl::cpu::x64::cpu_isa_t host_isa, - const MKLDNNNode* node, InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32); jit_squared_difference_emitter(mkldnn::impl::cpu::x64::jit_generator *host, mkldnn::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n, @@ -211,7 +210,7 @@ private: class jit_power_dynamic_emitter : public jit_emitter { public: - jit_power_dynamic_emitter(mkldnn::impl::cpu::x64::jit_generator *host, mkldnn::impl::cpu::x64::cpu_isa_t host_isa, const MKLDNNNode* node, + jit_power_dynamic_emitter(mkldnn::impl::cpu::x64::jit_generator *host, mkldnn::impl::cpu::x64::cpu_isa_t host_isa, InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32); jit_power_dynamic_emitter(mkldnn::impl::cpu::x64::jit_generator *host, mkldnn::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n, InferenceEngine::Precision exec_prc = 
InferenceEngine::Precision::FP32); @@ -230,7 +229,7 @@ private: class jit_equal_emitter : public jit_emitter { public: - jit_equal_emitter(mkldnn::impl::cpu::x64::jit_generator *host, mkldnn::impl::cpu::x64::cpu_isa_t host_isa, const MKLDNNNode* node, + jit_equal_emitter(mkldnn::impl::cpu::x64::jit_generator *host, mkldnn::impl::cpu::x64::cpu_isa_t host_isa, InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32); jit_equal_emitter(mkldnn::impl::cpu::x64::jit_generator *host, mkldnn::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n, InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32); @@ -252,7 +251,7 @@ private: class jit_not_equal_emitter : public jit_emitter { public: - jit_not_equal_emitter(mkldnn::impl::cpu::x64::jit_generator *host, mkldnn::impl::cpu::x64::cpu_isa_t host_isa, const MKLDNNNode* node, + jit_not_equal_emitter(mkldnn::impl::cpu::x64::jit_generator *host, mkldnn::impl::cpu::x64::cpu_isa_t host_isa, InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32); jit_not_equal_emitter(mkldnn::impl::cpu::x64::jit_generator *host, mkldnn::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n, InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32); @@ -274,7 +273,7 @@ private: class jit_greater_emitter : public jit_emitter { public: - jit_greater_emitter(mkldnn::impl::cpu::x64::jit_generator *host, mkldnn::impl::cpu::x64::cpu_isa_t host_isa, const MKLDNNNode* node, + jit_greater_emitter(mkldnn::impl::cpu::x64::jit_generator *host, mkldnn::impl::cpu::x64::cpu_isa_t host_isa, InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32); jit_greater_emitter(mkldnn::impl::cpu::x64::jit_generator *host, mkldnn::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n, InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32); @@ -296,7 +295,7 @@ private: class jit_greater_equal_emitter : public jit_emitter { public: - 
jit_greater_equal_emitter(mkldnn::impl::cpu::x64::jit_generator *host, mkldnn::impl::cpu::x64::cpu_isa_t host_isa, const MKLDNNNode* node, + jit_greater_equal_emitter(mkldnn::impl::cpu::x64::jit_generator *host, mkldnn::impl::cpu::x64::cpu_isa_t host_isa, InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32); jit_greater_equal_emitter(mkldnn::impl::cpu::x64::jit_generator *host, mkldnn::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n, InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32); @@ -318,7 +317,7 @@ private: class jit_less_emitter : public jit_emitter { public: - jit_less_emitter(mkldnn::impl::cpu::x64::jit_generator *host, mkldnn::impl::cpu::x64::cpu_isa_t host_isa, const MKLDNNNode* node, + jit_less_emitter(mkldnn::impl::cpu::x64::jit_generator *host, mkldnn::impl::cpu::x64::cpu_isa_t host_isa, InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32); jit_less_emitter(mkldnn::impl::cpu::x64::jit_generator *host, mkldnn::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n, InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32); @@ -340,7 +339,7 @@ private: class jit_less_equal_emitter : public jit_emitter { public: - jit_less_equal_emitter(mkldnn::impl::cpu::x64::jit_generator *host, mkldnn::impl::cpu::x64::cpu_isa_t host_isa, const MKLDNNNode* node, + jit_less_equal_emitter(mkldnn::impl::cpu::x64::jit_generator *host, mkldnn::impl::cpu::x64::cpu_isa_t host_isa, InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32); jit_less_equal_emitter(mkldnn::impl::cpu::x64::jit_generator *host, mkldnn::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n, @@ -363,7 +362,7 @@ private: class jit_logical_and_emitter : public jit_emitter { public: - jit_logical_and_emitter(mkldnn::impl::cpu::x64::jit_generator *host, mkldnn::impl::cpu::x64::cpu_isa_t host_isa, const MKLDNNNode* node, + jit_logical_and_emitter(mkldnn::impl::cpu::x64::jit_generator *host, 
mkldnn::impl::cpu::x64::cpu_isa_t host_isa, InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32); jit_logical_and_emitter(mkldnn::impl::cpu::x64::jit_generator *host, mkldnn::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n, InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32); @@ -385,7 +384,7 @@ private: class jit_logical_or_emitter : public jit_emitter { public: - jit_logical_or_emitter(mkldnn::impl::cpu::x64::jit_generator *host, mkldnn::impl::cpu::x64::cpu_isa_t host_isa, const MKLDNNNode* node, + jit_logical_or_emitter(mkldnn::impl::cpu::x64::jit_generator *host, mkldnn::impl::cpu::x64::cpu_isa_t host_isa, InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32); jit_logical_or_emitter(mkldnn::impl::cpu::x64::jit_generator *host, mkldnn::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n, InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32); @@ -407,7 +406,7 @@ private: class jit_logical_xor_emitter : public jit_emitter { public: - jit_logical_xor_emitter(mkldnn::impl::cpu::x64::jit_generator *host, mkldnn::impl::cpu::x64::cpu_isa_t host_isa, const MKLDNNNode* node, + jit_logical_xor_emitter(mkldnn::impl::cpu::x64::jit_generator *host, mkldnn::impl::cpu::x64::cpu_isa_t host_isa, InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32); jit_logical_xor_emitter(mkldnn::impl::cpu::x64::jit_generator *host, mkldnn::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n, InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32); @@ -428,7 +427,7 @@ private: class jit_logical_not_emitter : public jit_emitter { public: - jit_logical_not_emitter(mkldnn::impl::cpu::x64::jit_generator *host, mkldnn::impl::cpu::x64::cpu_isa_t host_isa, const MKLDNNNode* node, + jit_logical_not_emitter(mkldnn::impl::cpu::x64::jit_generator *host, mkldnn::impl::cpu::x64::cpu_isa_t host_isa, InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32); 
jit_logical_not_emitter(mkldnn::impl::cpu::x64::jit_generator *host, mkldnn::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n, InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32); @@ -449,8 +448,9 @@ private: class jit_power_static_emitter : public jit_emitter { public: - jit_power_static_emitter(mkldnn::impl::cpu::x64::jit_generator *host, mkldnn::impl::cpu::x64::cpu_isa_t host_isa, const MKLDNNNode* node, - InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32); + jit_power_static_emitter(mkldnn::impl::cpu::x64::jit_generator *host, mkldnn::impl::cpu::x64::cpu_isa_t host_isa, + float inpPower, float inpScale, float inpShift, + InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32); jit_power_static_emitter(mkldnn::impl::cpu::x64::jit_generator *host, mkldnn::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n, InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32); @@ -474,7 +474,7 @@ private: class jit_prelu_emitter : public jit_emitter { public: - jit_prelu_emitter(mkldnn::impl::cpu::x64::jit_generator *host, mkldnn::impl::cpu::x64::cpu_isa_t host_isa, const MKLDNNNode* node, + jit_prelu_emitter(mkldnn::impl::cpu::x64::jit_generator *host, mkldnn::impl::cpu::x64::cpu_isa_t host_isa, InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32); jit_prelu_emitter(mkldnn::impl::cpu::x64::jit_generator *host, mkldnn::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n, InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32); @@ -494,7 +494,7 @@ private: class jit_sqrt_emitter : public jit_emitter { public: - jit_sqrt_emitter(mkldnn::impl::cpu::x64::jit_generator *host, mkldnn::impl::cpu::x64::cpu_isa_t host_isa, const MKLDNNNode* node, + jit_sqrt_emitter(mkldnn::impl::cpu::x64::jit_generator *host, mkldnn::impl::cpu::x64::cpu_isa_t host_isa, InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32); 
jit_sqrt_emitter(mkldnn::impl::cpu::x64::jit_generator *host, mkldnn::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n, InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32); @@ -528,7 +528,7 @@ private: class jit_erf_emitter : public jit_emitter { public: - jit_erf_emitter(mkldnn::impl::cpu::x64::jit_generator *host, mkldnn::impl::cpu::x64::cpu_isa_t host_isa, const MKLDNNNode* node, + jit_erf_emitter(mkldnn::impl::cpu::x64::jit_generator *host, mkldnn::impl::cpu::x64::cpu_isa_t host_isa, InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32); jit_erf_emitter(mkldnn::impl::cpu::x64::jit_generator *host, mkldnn::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n, diff --git a/src/plugins/intel_cpu/src/emitters/jit_emitter.hpp b/src/plugins/intel_cpu/src/emitters/jit_emitter.hpp index 12f422b7d7f..dc3c2c4022d 100644 --- a/src/plugins/intel_cpu/src/emitters/jit_emitter.hpp +++ b/src/plugins/intel_cpu/src/emitters/jit_emitter.hpp @@ -28,7 +28,7 @@ struct emitter_context { class jit_emitter : public ngraph::snippets::Emitter { public: - jit_emitter(dnnl::impl::cpu::x64::jit_generator* host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, const MKLDNNNode* node, + jit_emitter(dnnl::impl::cpu::x64::jit_generator* host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32, emitter_in_out_map in_out_type = emitter_in_out_map::vec_to_vec) : Emitter(nullptr), h(host), host_isa_(host_isa), exec_prc_(exec_prc), in_out_type_(in_out_type), l_table (new Xbyak::Label()) { k_mask = Xbyak::Opmask(1); // FIXME: in general case we need preserve k_mask state as well diff --git a/src/plugins/intel_cpu/src/emitters/jit_load_store_emitters.cpp b/src/plugins/intel_cpu/src/emitters/jit_load_store_emitters.cpp index f6ad72a3e36..16b55daf9d3 100644 --- a/src/plugins/intel_cpu/src/emitters/jit_load_store_emitters.cpp +++ b/src/plugins/intel_cpu/src/emitters/jit_load_store_emitters.cpp 
@@ -18,9 +18,9 @@ using namespace Xbyak::util; namespace MKLDNNPlugin { /// LOAD /// -jit_load_emitter::jit_load_emitter(jit_generator *host, cpu_isa_t host_isa, const MKLDNNNode* node, +jit_load_emitter::jit_load_emitter(jit_generator *host, cpu_isa_t host_isa, Precision exec_prc, emitter_in_out_map in_out_type) -: jit_emitter(host, host_isa, node, exec_prc, in_out_type), name(node ? node->getName() : "unknown") { +: jit_emitter(host, host_isa, exec_prc, in_out_type), name("unknown") { prepare_table(); v_len_elt = get_vec_length() / exec_prc.size(); } @@ -486,12 +486,12 @@ void jit_load_emitter::register_table_entries() { } /// STORE /// -jit_store_emitter::jit_store_emitter(jit_generator *host, cpu_isa_t host_isa, const MKLDNNNode* node, +jit_store_emitter::jit_store_emitter(jit_generator *host, cpu_isa_t host_isa, Precision exec_prc, emitter_in_out_map in_out_type) -: jit_emitter(host, host_isa, node, exec_prc, in_out_type), name(node ? node->getName() : "unknown") { +: jit_emitter(host, host_isa, exec_prc, in_out_type), name("unknown") { v_len_elt = get_vec_length() / exec_prc.size(); if (!mayiuse(cpu::x64::avx512_core_bf16) && mayiuse(cpu::x64::avx512_core)) { - emu_vcvtneps2bf16.reset(new jit_emu_vcvtneps2bf16(host, host_isa, nullptr)); + emu_vcvtneps2bf16.reset(new jit_emu_vcvtneps2bf16(host, host_isa)); } } diff --git a/src/plugins/intel_cpu/src/emitters/jit_load_store_emitters.hpp b/src/plugins/intel_cpu/src/emitters/jit_load_store_emitters.hpp index ec863d0c69e..8e792cf1299 100644 --- a/src/plugins/intel_cpu/src/emitters/jit_load_store_emitters.hpp +++ b/src/plugins/intel_cpu/src/emitters/jit_load_store_emitters.hpp @@ -44,7 +44,7 @@ struct store_emitter_context : public emitter_context { class jit_load_emitter : public jit_emitter { public: - jit_load_emitter(mkldnn::impl::cpu::x64::jit_generator *host, mkldnn::impl::cpu::x64::cpu_isa_t host_isa, const MKLDNNNode* node, + jit_load_emitter(mkldnn::impl::cpu::x64::jit_generator *host, 
mkldnn::impl::cpu::x64::cpu_isa_t host_isa, InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32, emitter_in_out_map in_out_type = emitter_in_out_map::gpr_to_vec); /** * load_num values with src_prc precision are loaded from ptr[Reg64(in_idxs[0]) + offset_byte] address to Vmm[out_idxs[0]] as dst_prc. @@ -100,7 +100,7 @@ private: class jit_store_emitter : public jit_emitter { public: - jit_store_emitter(mkldnn::impl::cpu::x64::jit_generator *host, mkldnn::impl::cpu::x64::cpu_isa_t host_isa, const MKLDNNNode* node, + jit_store_emitter(mkldnn::impl::cpu::x64::jit_generator *host, mkldnn::impl::cpu::x64::cpu_isa_t host_isa, InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32, emitter_in_out_map in_out_type = emitter_in_out_map::vec_to_gpr); /** diff --git a/src/plugins/intel_cpu/src/emitters/jit_mkldnn_emitters.cpp b/src/plugins/intel_cpu/src/emitters/jit_mkldnn_emitters.cpp index 70f481a7199..69e83e0e354 100644 --- a/src/plugins/intel_cpu/src/emitters/jit_mkldnn_emitters.cpp +++ b/src/plugins/intel_cpu/src/emitters/jit_mkldnn_emitters.cpp @@ -22,15 +22,10 @@ jit_mkldnn_emitter::jit_mkldnn_emitter(jit_generator *host, cpu_isa_t host_isa, set_injector(); } -jit_mkldnn_emitter::jit_mkldnn_emitter(jit_generator *host, cpu_isa_t host_isa, const MKLDNNNode* node, InferenceEngine::Precision exec_prc) - : jit_emitter(host, host_isa, node, exec_prc) { - auto eltwiseNode = dynamic_cast(node); - if (!eltwiseNode) { - IE_THROW() << "Cannot cast " << node->getName() << " to MKLDNNEltwiseNode"; - } - kind = static_cast(eltwiseNode->getMKLDNNAlgorithm()); - alpha = eltwiseNode->getAlpha(); - beta = eltwiseNode->getBeta(); +jit_mkldnn_emitter::jit_mkldnn_emitter(jit_generator *host, cpu_isa_t host_isa, + mkldnn_alg_kind_t algKind, float alpha, float beta, + InferenceEngine::Precision exec_prc) + : jit_emitter(host, host_isa, exec_prc), kind(algKind), alpha(alpha), beta(beta) { set_injector(); } @@ -83,8 +78,10 @@ void 
jit_mkldnn_emitter::emit_data() const { } } -jit_mkldnn_aux_emitter::jit_mkldnn_aux_emitter(jit_generator *host, cpu_isa_t host_isa, const MKLDNNNode* node, InferenceEngine::Precision exec_prc) - : jit_mkldnn_emitter(host, host_isa, node, exec_prc) { +jit_mkldnn_aux_emitter::jit_mkldnn_aux_emitter(jit_generator *host, cpu_isa_t host_isa, + mkldnn_alg_kind_t algKind, float inpAlpha, float inpBeta, + InferenceEngine::Precision exec_prc) + : jit_mkldnn_emitter(host, host_isa, algKind, inpAlpha, inpBeta, exec_prc) { } } // namespace MKLDNNPlugin diff --git a/src/plugins/intel_cpu/src/emitters/jit_mkldnn_emitters.hpp b/src/plugins/intel_cpu/src/emitters/jit_mkldnn_emitters.hpp index c79a6444a28..5260b496807 100644 --- a/src/plugins/intel_cpu/src/emitters/jit_mkldnn_emitters.hpp +++ b/src/plugins/intel_cpu/src/emitters/jit_mkldnn_emitters.hpp @@ -25,7 +25,8 @@ public: const emitter_context *emit_context = nullptr) const override {}; protected: - jit_mkldnn_emitter(mkldnn::impl::cpu::x64::jit_generator *host, mkldnn::impl::cpu::x64::cpu_isa_t host_isa, const MKLDNNNode* node, + jit_mkldnn_emitter(mkldnn::impl::cpu::x64::jit_generator *host, mkldnn::impl::cpu::x64::cpu_isa_t host_isa, + mkldnn_alg_kind_t algKind, float inpAlpha, float inpBeta, InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32); jit_mkldnn_emitter(mkldnn::impl::cpu::x64::jit_generator *host, mkldnn::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr& n, InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32); @@ -45,7 +46,8 @@ private: class jit_mkldnn_aux_emitter : public jit_mkldnn_emitter { public: - jit_mkldnn_aux_emitter(mkldnn::impl::cpu::x64::jit_generator *host, mkldnn::impl::cpu::x64::cpu_isa_t host_isa, const MKLDNNNode* node, + jit_mkldnn_aux_emitter(mkldnn::impl::cpu::x64::jit_generator *host, mkldnn::impl::cpu::x64::cpu_isa_t host_isa, + mkldnn_alg_kind_t algKind, float inpAlpha, float inpBeta, InferenceEngine::Precision exec_prc = 
InferenceEngine::Precision::FP32); private: diff --git a/src/plugins/intel_cpu/src/mkldnn_graph.cpp b/src/plugins/intel_cpu/src/mkldnn_graph.cpp index 15d6f1b79a5..f9374c48610 100644 --- a/src/plugins/intel_cpu/src/mkldnn_graph.cpp +++ b/src/plugins/intel_cpu/src/mkldnn_graph.cpp @@ -67,9 +67,11 @@ void MKLDNNGraph::CreateGraph(NET &net, const MKLDNNExtensionManager::Ptr& extMg if (IsReady()) ForgetGraphData(); - // disable caching if graph was created only once + // disable weights caching if graph was created only once weightsCache = config.streamExecutorConfig._streams != 1 ? w_cache : nullptr; + rtParamsCache = std::make_shared(config.rtCacheCapacity); + Replicate(net, extMgr); InitGraph(); @@ -113,6 +115,7 @@ void MKLDNNGraph::Replicate(const std::shared_ptr &subgr if (isQuantized()) { node->setQuantizedGraphFlag(true); } + node->setRuntimeCache(rtParamsCache); graphNodes.push_back(node); @@ -209,6 +212,7 @@ void MKLDNNGraph::Replicate(const CNNNetwork &network, const MKLDNNExtensionMana if (isQuantized()) { node->setQuantizedGraphFlag(true); } + node->setRuntimeCache(rtParamsCache); graphNodes.push_back(node); if (op->get_type_info() == ngraph::op::v0::Parameter::get_type_info_static()) { @@ -1191,6 +1195,7 @@ bool MKLDNNGraph::InsertNode(MKLDNNNodePtr parent, MKLDNNNodePtr child, MKLDNNNo if (isQuantized()) { node->setQuantizedGraphFlag(true); } + node->setRuntimeCache(rtParamsCache); if (initNode) { node->getSupportedDescriptors(); diff --git a/src/plugins/intel_cpu/src/mkldnn_graph.h b/src/plugins/intel_cpu/src/mkldnn_graph.h index 9d6703df665..93f1b9b1bbc 100644 --- a/src/plugins/intel_cpu/src/mkldnn_graph.h +++ b/src/plugins/intel_cpu/src/mkldnn_graph.h @@ -10,6 +10,7 @@ #include "normalize_preprocess.h" #include "mkldnn_node.h" #include "mkldnn_edge.h" +#include "cache/multi_cache.h" #include #include #include @@ -247,6 +248,8 @@ private: std::vector constantGraphNodes; std::vector executableGraphNodes; + MultiCachePtr rtParamsCache; + void 
EnforceBF16(); }; diff --git a/src/plugins/intel_cpu/src/mkldnn_node.h b/src/plugins/intel_cpu/src/mkldnn_node.h index 48f497f9bb0..8f9498fbe82 100644 --- a/src/plugins/intel_cpu/src/mkldnn_node.h +++ b/src/plugins/intel_cpu/src/mkldnn_node.h @@ -29,6 +29,7 @@ #include "cpu_types.h" #include "cpu_shape.h" #include "memory_desc/cpu_memory_desc.h" +#include "cache/multi_cache.h" namespace MKLDNNPlugin { @@ -582,6 +583,19 @@ public: */ std::pair, std::vector> getScalesAndShifts(const MKLDNNNode *parentNode) const; + /** + * @brief Appends new item into ops list with the information on how the node should be executed as post operation. + * Seed node should call this routine and pass its post operations list as parameter. + * @param ops List of fused post operations + */ + virtual void appendPostOps(mkldnn::post_ops& ops, const VectorDims& postOpDims); + + virtual void appendBinPostOps(mkldnn::post_ops& ops, const VectorDims& postOpDims, std::vector& binaryPostOpsMem); + + void setRuntimeCache(MultiCachePtr cache) { + rtParamsCache = cache; + } + protected: bool canFuseSimpleOperation(const MKLDNNNodePtr& node) const; @@ -597,15 +611,7 @@ protected: virtual MemoryDescPtr getSrcMemDesc(mkldnn::primitive_desc_iterator &primitive_desc_it, size_t idx); virtual MemoryDescPtr getDstMemDesc(mkldnn::primitive_desc_iterator &primitive_desc_it, size_t idx); - /** - * @brief Appends new item into ops list with the information on how the node should be executed as post operation. - * Seed node should call this routine and pass its post operations list as parameter. 
- * @param ops List of fused post operations - */ - virtual void appendPostOps(mkldnn::post_ops& ops, const VectorDims& postOpDims); - virtual void appendBinPostOps(mkldnn::post_ops& ops, const VectorDims& postOpDims, std::vector& binaryPostOpsMem); - - virtual std::shared_ptr initPrimitiveAttr() { return nullptr; } + virtual AttrPtr initPrimitiveAttr() { return nullptr; } typedef std::function GetPrimitiveMemoryFormatFunc; @@ -755,6 +761,10 @@ protected: IE_THROW(NotImplemented) << "[DS] prapareParams not implemented for node with type " << NameFromType(getType()); } + MultiCachePtr getRuntimeCache() const { + return rtParamsCache; + } + std::vector lastInputDims = {}; std::shared_ptr opToShapeInfer; @@ -780,6 +790,8 @@ private: PerfCount perfCounter; PerfCounters profiling; + MultiCachePtr rtParamsCache; + bool isEdgesEmpty(const std::vector& edges) const; void createShapeInferSubgraph(const std::shared_ptr& op); diff --git a/src/plugins/intel_cpu/src/nodes/common/softmax.cpp b/src/plugins/intel_cpu/src/nodes/common/softmax.cpp index 6a8d54fc11a..1f3d476b371 100644 --- a/src/plugins/intel_cpu/src/nodes/common/softmax.cpp +++ b/src/plugins/intel_cpu/src/nodes/common/softmax.cpp @@ -64,7 +64,7 @@ struct jit_uni_softmax_kernel_f32 : public jit_uni_softmax_kernel, public jit_ge exp_injector.reset(new jit_uni_eltwise_injector_f32(this, mkldnn::impl::alg_kind::eltwise_exp, 0.f, 0.f, 1.0f)); if (!mayiuse(avx512_core_bf16) && mayiuse(avx512_core)) - emu_vcvtneps2bf16.reset(new jit_emu_vcvtneps2bf16(this, isa, nullptr)); + emu_vcvtneps2bf16.reset(new jit_emu_vcvtneps2bf16(this, isa)); this->preamble(); diff --git a/src/plugins/intel_cpu/src/nodes/mkldnn_eltwise_node.cpp b/src/plugins/intel_cpu/src/nodes/mkldnn_eltwise_node.cpp index f02b8dd3b26..d44909a7185 100644 --- a/src/plugins/intel_cpu/src/nodes/mkldnn_eltwise_node.cpp +++ b/src/plugins/intel_cpu/src/nodes/mkldnn_eltwise_node.cpp @@ -25,6 +25,7 @@ #include #include "utils/general_utils.h" #include 
"utils/cpu_utils.hpp" +#include #include "ngraph/ngraph.hpp" #include @@ -64,14 +65,31 @@ struct EltwiseEmitterContext { std::shared_ptr emitter; jit_generator *host; cpu_isa_t host_isa; - const MKLDNNNode *node; + const MKLDNNEltwiseNode::EltwiseData& opData; InferenceEngine::Precision exec_prc; }; template struct EltwiseEmitter { void operator()(EltwiseEmitterContext & ctx) { - ctx.emitter = std::make_shared(ctx.host, ctx.host_isa, ctx.node, ctx.exec_prc); + ctx.emitter = std::make_shared(ctx.host, ctx.host_isa, ctx.exec_prc); + } +}; + +template<> +struct EltwiseEmitter { + void operator()(EltwiseEmitterContext & ctx) { + auto algKind = static_cast(ctx.opData.mkldnnAlgorithm); + ctx.emitter = std::make_shared(ctx.host, ctx.host_isa, algKind, + ctx.opData.alpha, ctx.opData.beta, ctx.exec_prc); + } +}; + +template<> +struct EltwiseEmitter { + void operator()(EltwiseEmitterContext & ctx) { + ctx.emitter = std::make_shared(ctx.host, ctx.host_isa, ctx.opData.alpha, + ctx.opData.beta, ctx.opData.gamma, ctx.exec_prc); } }; @@ -81,8 +99,11 @@ template struct jit_uni_eltwise_generic : public MKLDNNPlugin::jit_uni_eltwise_kernel, public jit_generator { DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_uni_eltwise_generic) - explicit jit_uni_eltwise_generic(const jit_eltwise_params& jep, MKLDNNEltwiseNode& eltwiseNode) : - jit_uni_eltwise_kernel(jep, eltwiseNode), jit_generator() {} + explicit jit_uni_eltwise_generic(const jit_eltwise_params& jep, + const std::vector& eltwise_data, + const std::vector& ops_list, + const mkldnn::post_ops& post_ops) + : jit_uni_eltwise_kernel(jep), jit_generator(), eltwise_data_(eltwise_data), ops_list_(ops_list), post_ops_(post_ops) {} void create_ker() override { jit_generator::create_kernel(); @@ -92,19 +113,27 @@ struct jit_uni_eltwise_generic : public MKLDNNPlugin::jit_uni_eltwise_kernel, pu void generate() override { Precision exec_prc = Precision::UNSPECIFIED; - std::set supported_precision_intersection = get_supported_precisions(eltwiseNode); - for 
(int i = 0; i < eltwiseNode.getFusedWith().size(); i++) { - if (eltwiseNode.getFusedWith()[i].get()->getType() == Eltwise) { - std::set prcs = get_supported_precisions(*eltwiseNode.getFusedWith()[i].get()); - std::set prcs_intersect = {}; + std::set supported_precision_intersection = get_supported_precisions(eltwise_data_.front().algo); + for (size_t i = 1; i < eltwise_data_.size(); ++i) { + std::set prcs = get_supported_precisions(eltwise_data_[i].algo); + std::set prcs_intersect = {}; - std::set_intersection(supported_precision_intersection.begin(), supported_precision_intersection.end(), - prcs.begin(), prcs.end(), std::inserter(prcs_intersect, prcs_intersect.begin())); + std::set_intersection(supported_precision_intersection.begin(), supported_precision_intersection.end(), + prcs.begin(), prcs.end(), std::inserter(prcs_intersect, prcs_intersect.begin())); - supported_precision_intersection = prcs_intersect; - } + supported_precision_intersection = prcs_intersect; } + static const Precision exec_precisions_priority[] = { + Precision::U8, + Precision::I8, + Precision::U16, + Precision::I16, + Precision::BF16, + Precision::I32, + Precision::FP32 + }; + for (auto prc : exec_precisions_priority) { if (std::find(supported_precision_intersection.begin(), supported_precision_intersection.end(), prc) != supported_precision_intersection.end()) { exec_prc = prc; @@ -120,29 +149,25 @@ struct jit_uni_eltwise_generic : public MKLDNNPlugin::jit_uni_eltwise_kernel, pu } if (exec_prc == Precision::UNSPECIFIED) { - IE_THROW() << "Eltwise jitter failed to specify execution precision for Eltwise node with name `" << eltwiseNode.getName() << "`"; + IE_THROW() << "Eltwise jitter failed to specify execution precision for Eltwise node"; } - eltwise_emitter = create_eltwise_emitter(eltwiseNode, exec_prc); + eltwise_emitter = create_eltwise_emitter(eltwise_data_.front(), exec_prc); + for (size_t i = 1; i < eltwise_data_.size(); ++i) { + 
post_op_emitters.push_back(create_eltwise_emitter(eltwise_data_[i], exec_prc)); + } - mkldnn::post_ops post_ops; - for (int i = 0; i < eltwiseNode.getFusedWith().size(); i++) { - if (eltwiseNode.getFusedWith()[i].get()->getType() == Eltwise) { - post_op_emitters.push_back(create_eltwise_emitter(*eltwiseNode.getFusedWith()[i].get(), exec_prc)); - } else if (eltwiseNode.getFusedWith()[i].get()->getType() == FakeQuantize) { - auto fakeQuantizeNode = dynamic_cast(eltwiseNode.getFusedWith()[i].get()); - if (!fakeQuantizeNode) { - IE_THROW() << "Cannot cast " << eltwiseNode.getFusedWith()[i]->getName() << " to MKLDNNFakeQuantizeNode"; - } - fakeQuantizeNode->appendPostOps(post_ops); - - quantization_injectors.push_back(std::make_shared>( - this, post_ops.get()->entry_[post_ops.len() - 1], vmm_d_weights, vmm_d_bias, reg_d_weights, reg_d_bias)); + const auto& p = post_ops_.get(); + for (int i = 0; i < post_ops_.len(); ++i) { + if (!p->entry_[i].is_quantization()) { + IE_THROW() << "Eltwise jitter error. 
Unsupported post op detected"; } + quantization_injectors.push_back(std::make_shared>( + this, p->entry_[i], vmm_d_weights, vmm_d_bias, reg_d_weights, reg_d_bias)); } if (!mayiuse(avx512_core_bf16) && mayiuse(avx512_core)) - emu_vcvtneps2bf16.reset(new jit_emu_vcvtneps2bf16(this, isa, nullptr)); + emu_vcvtneps2bf16.reset(new jit_emu_vcvtneps2bf16(this, isa)); const auto &jep = jep_; @@ -169,6 +194,8 @@ struct jit_uni_eltwise_generic : public MKLDNNPlugin::jit_uni_eltwise_kernel, pu mov(reg_dst, ptr[reg_const_params + GET_OFF(dst_ptr)]); init_ptrs_with_offsets(reg_dst, jep.dst_offsets); + mov(reg_post_op_ptrs, ptr[reg_const_params + GET_OFF(post_op_data)]); + xor_(reg_oc_off, reg_oc_off); init_ptrs_with_offsets(reg_oc_off, jep.oc_offsets); @@ -211,7 +238,7 @@ struct jit_uni_eltwise_generic : public MKLDNNPlugin::jit_uni_eltwise_kernel, pu is_valid_configuration = false; if (!is_valid_configuration) - IE_THROW() << "Eltwise jitter has invalid configuration for Eltwise node with name `" << eltwiseNode.getName() << "`"; + IE_THROW() << "Eltwise jitter has invalid configuration for Eltwise node"; L(unroll_loop_label); { @@ -359,6 +386,7 @@ private: return Xmm(get_vmm_reg(idx).getIdx()); } + Reg64 reg_post_op_ptrs = rax; Reg64 reg_dst = rbx; Reg64 reg_work_amount = rdx; @@ -387,20 +415,14 @@ private: std::vector>> quantization_injectors = {}; - std::vector exec_precisions_priority = { - Precision::U8, - Precision::I8, - Precision::U16, - Precision::I16, - Precision::BF16, - Precision::I32, - Precision::FP32 - }; + const std::vector& eltwise_data_; + const std::vector& ops_list_; + const mkldnn::post_ops& post_ops_; - std::set get_supported_precisions(MKLDNNNode& node) { + std::set get_supported_precisions(Algorithm algo) { std::set precisions; - OV_SWITCH(MKLDNNPlugin, SupportedPrecisions, precisions, node.getAlgorithm(), + OV_SWITCH(MKLDNNPlugin, SupportedPrecisions, precisions, algo, OV_CASE(EltwiseRelu, jit_mkldnn_aux_emitter), OV_CASE(EltwiseGelu, 
jit_mkldnn_aux_emitter), OV_CASE(EltwiseElu, jit_mkldnn_aux_emitter), @@ -448,18 +470,16 @@ private: return precisions; } - std::shared_ptr create_eltwise_emitter(MKLDNNNode& node, Precision exec_prec) { - const auto& eltwiseNode = dynamic_cast(node); - + std::shared_ptr create_eltwise_emitter(const MKLDNNEltwiseNode::EltwiseData& data, Precision exec_prec) { EltwiseEmitterContext ctx = { nullptr, this, isa, - &node, + data, exec_prec }; - OV_SWITCH(MKLDNNPlugin, EltwiseEmitter, ctx, eltwiseNode.getAlgorithm(), + OV_SWITCH(MKLDNNPlugin, EltwiseEmitter, ctx, data.algo, OV_CASE(EltwiseRelu, jit_mkldnn_aux_emitter), OV_CASE(EltwiseGelu, jit_mkldnn_aux_emitter), OV_CASE(EltwiseElu, jit_mkldnn_aux_emitter), @@ -525,8 +545,8 @@ private: int input_idx = eltwise_emitter->get_inputs_num(); int eltwise_post_op_idx = 0; int quantization_post_op_idx = 0; - for (int i = 0; i < eltwiseNode.getFusedWith().size(); i++) { - if (eltwiseNode.getFusedWith()[i].get()->getType() == Eltwise) { + for (int i = 1; i < ops_list_.size(); i++) { + if (ops_list_[i] == Eltwise) { std::vector in_idxs; std::vector aux_idxs; in_idxs.push_back(vmm_dst.getIdx()); @@ -541,22 +561,27 @@ private: post_op_emitters[eltwise_post_op_idx]->emit_code(in_idxs, out_idxs, aux_idxs); eltwise_post_op_idx++; - } else { - bool do_dequantization = eltwiseNode.getFusedWith()[i]->getAlgorithm() == FQCommon; - bool do_rounding = do_dequantization || jep_.dst_prc == Precision::FP32 || i != eltwiseNode.getFusedWith().size() - 1; + } else if (ops_list_[i] == FakeQuantize) { + auto& p = post_ops_.get()->entry_[quantization_post_op_idx]; + bool do_dequantization = p.quantization.alg == dnnl::impl::alg_kind::quantization_quantize_dequantize; + bool do_rounding = do_dequantization || jep_.dst_prc == Precision::FP32 || i != ops_list_.size() - 1; int s_idx = vmm_dst.getIdx(); - quantization_injectors[quantization_post_op_idx]->init_crop_ptrs(reg_oc_off); + size_t ptrs_table_off = quantization_post_op_idx * 
quantization_injectors[quantization_post_op_idx]->memoryStep(); + + quantization_injectors[quantization_post_op_idx]->init_crop_ptrs(reg_post_op_ptrs + ptrs_table_off, reg_oc_off); quantization_injectors[quantization_post_op_idx]->compute_crop(s_idx, s_idx + 1, offset, is_scalar, jep_.oc_size == 1); - quantization_injectors[quantization_post_op_idx]->init_input_scale_shift_ptrs(reg_oc_off); + quantization_injectors[quantization_post_op_idx]->init_input_scale_shift_ptrs(reg_post_op_ptrs + ptrs_table_off, reg_oc_off); quantization_injectors[quantization_post_op_idx]->compute_input_scale_shift(s_idx, s_idx + 1, offset, do_rounding, is_scalar, jep_.oc_size == 1); - quantization_injectors[quantization_post_op_idx]->init_output_scale_shift_ptrs(reg_oc_off); + quantization_injectors[quantization_post_op_idx]->init_output_scale_shift_ptrs(reg_post_op_ptrs + ptrs_table_off, reg_oc_off); quantization_injectors[quantization_post_op_idx]->compute_output_scale_shift(s_idx, s_idx + 1, offset, is_scalar, jep_.oc_size == 1); quantization_post_op_idx++; + } else { + IE_THROW(Unexpected) << "Eltwise jit kernel: unexpected operation type"; } } } @@ -991,6 +1016,528 @@ const std::map M }}, }; + +namespace { +struct EltwiseKey { + std::vector eltwise_data; + std::vector ops_list; + VectorDims outBlkDims; + VectorDims outOrder; + std::vector inpDims; + std::vector inpPrc; + InferenceEngine::Precision outPrc; + mkldnn::post_ops postOps; + bool useDynBatch; + bool useJit; + + size_t hash() const { + using namespace dnnl::impl; + using namespace dnnl::impl::primitive_hashing; + size_t seed = 0; + auto hash_combine_eltwiseData = [](size_t seed, const MKLDNNEltwiseNode::EltwiseData& eltwiseData) { + seed = hash_combine(seed, eltwiseData.algo); + seed = hash_combine(seed, eltwiseData.mkldnnAlgorithm); + seed = hash_combine(seed, eltwiseData.alpha); + seed = hash_combine(seed, eltwiseData.beta); + seed = hash_combine(seed, eltwiseData.gamma); + return seed; + }; + 
std::for_each(eltwise_data.begin(), eltwise_data.end(), [&](const MKLDNNEltwiseNode::EltwiseData& item) { + seed = hash_combine_eltwiseData(seed, item); + }); + seed = get_vector_hash(seed, ops_list); + seed = get_vector_hash(seed, outBlkDims); + seed = get_vector_hash(seed, outOrder); + for (auto&& item : inpDims) { + seed = get_vector_hash(seed, item); + } + std::for_each(inpPrc.begin(), inpPrc.end(), [&](const Precision& item) { + seed = hash_combine(seed, item.getPrecVal()); + }); + seed = hash_combine(seed, outPrc.getPrecVal()); + seed = get_post_op_hash(seed, *postOps.get()); + seed = hash_combine(seed, useDynBatch); + seed = hash_combine(seed, useJit); + return seed; + } + + bool operator==(const EltwiseKey& rhs) const { + if (inpDims.size() != rhs.inpDims.size()) { + return false; + } + + bool result = eltwise_data == rhs.eltwise_data && + ops_list == rhs.ops_list && + outBlkDims == rhs.outBlkDims && + outOrder == rhs.outOrder && + inpPrc == rhs.inpPrc && + outPrc == rhs.outPrc && + *postOps.get() == *rhs.postOps.get() && + useDynBatch == rhs.useDynBatch && + useJit == rhs.useJit; + + for (size_t i = 0; i < inpDims.size() && result; ++i) { + result = result && (inpDims[i] == rhs.inpDims[i]); + } + return result; + } +}; + +class EltwiseJitExecutor : public MKLDNNEltwiseNode::IEltwiseExecutor { +public: + static void offset_out_calc(VectorDims& offset, const VectorDims& dims) { + int k = 1; + for (int i = offset.size() - 1; i >= 0; i--) { + offset[i] = k; + k *= dims[i]; + } + } + + static void offset_in_calc(VectorDims& offset, const VectorDims& dims_in, const VectorDims& dims_out) { + int k = 1; + for (int i = offset.size() - 1; i >= 0; i--) { + offset[i] = (dims_in[i] == dims_out[i]) ? 
k : 0; + k *= dims_in[i]; + } + } + + EltwiseJitExecutor(const std::vector& eltwise_data, + const std::vector& ops_list, + const VectorDims& outBlkDims, + const VectorDims& outOrder, + std::vector inpDims, + const std::vector& inpPrc, + const InferenceEngine::Precision& outPrc, + const mkldnn::post_ops& post_ops, + bool useDynBatch) { + auto collapseLastDims = [](std::vector& dims, int dimsToCollapse) { + for (int i = dims.size() - 2; i > dims.size() - dimsToCollapse - 2; i--) { + dims[dims.size() - 1] *= dims[i]; + } + + for (int i = dims.size() - 2; i >= dimsToCollapse; i--) { + dims[i] = dims[i - dimsToCollapse]; + } + + for (int i = dimsToCollapse - 1; i >= 0; i--) { + dims[i] = 1; + } + }; + + auto collapseLastOffsets = [](std::vector& dims, int dimsToCollapse) { + for (int i = dims.size() - 2; i > dims.size() - dimsToCollapse - 2; i--) { + if (dims[dims.size() - 1] > 0 || dims[i] > 0) + dims[dims.size() - 1] = std::max(dims[dims.size() - 1], static_cast(1)) * std::max(dims[i], static_cast(1)); + else + dims[dims.size() - 1] *= dims[i]; + } + + for (int i = dims.size() - 2; i >= dimsToCollapse; i--) { + dims[i] = dims[i - dimsToCollapse]; + } + + for (int i = dimsToCollapse - 1; i >= 0; i--) { + dims[i] = 0; + } + }; + + auto isFusedWith = [&](Type type_) { + auto start_itr = ops_list.begin(); + std::advance(start_itr, 1); // apply offset since the first op in the list is the op itself + return any_of(start_itr, ops_list.end(), [=](Type type) { return type == type_; }); + }; + + if (inpDims.empty()) { + IE_THROW() << "Can not make Eltwise executor from empty input dims array"; + } else if (inpDims.front().empty()) { + IE_THROW() << "Can not make Eltwise executor from empty input dims members"; + } + + jit_eltwise_params jep = {}; + size_t inputsNumber = inpDims.size(); + + jep.input_size = inpDims.front().size(); + + jep.dims.resize(jep.input_size, 1); + + if (outBlkDims.empty()) { + IE_THROW() << "Can not make Eltwise executor from empty block dims vector"; + 
} + + size_t outRank = outBlkDims.size(); + for (int i = 0; i < outRank; i++) { + jep.dims[jep.dims.size() - 1 - i] = outBlkDims[outRank - 1 - i]; + } + + for (int i = 0; i < inpDims.size(); i++) { + for (int j = 0; j < inpDims[i].size(); j++) { + if (inpDims[i][j] != jep.dims[j] && inpDims[i][j] != 1) + IE_THROW() << "Eltwise executor got invalid input/output dims configuration."; + } + } + + if (outBlkDims.size() != outOrder.size()) { + IE_THROW() << "Can not make Elwtise executor due to out blocked dims and out order vectors size mismatch."; + } + + int lastUnchangedAxis = 0; + size_t oc_size = 0; + jep.oc_offsets.resize(jep.input_size, 0); + std::fill(jep.oc_offsets.begin(), jep.oc_offsets.end(), 0); + if (isFusedWith(FakeQuantize)) { + size_t offset_oc = 1; + for (int i = outOrder.size() - 1; i >= 0; i--) { + if (outOrder[i] == 1) { + int oc_dim_idx = i + (jep.input_size - outOrder.size()); + jep.oc_offsets[oc_dim_idx] = offset_oc; + offset_oc *= jep.dims[oc_dim_idx]; + if (oc_dim_idx + 1 != jep.input_size) { // since in nspc case we can safely collapse the last axis + lastUnchangedAxis = oc_dim_idx; + } + } + } + oc_size = jep.oc_offsets[jep.dims.size() - 1] != 0 ? 
jep.dims[jep.dims.size() - 1] : 1; + } + + int maxCollapsedDims = static_cast(jep.dims.size()) - lastUnchangedAxis - 2; + + size_t fullWorkAmount = 1; + for (int i = 0; i < jep.dims.size(); i++) { + fullWorkAmount *= jep.dims[i]; + } + + size_t minimalConcurrency = parallel_get_max_threads(); + size_t minimalJitWorkAmount = 256; + size_t currentJitWorkAmount = jep.dims[jep.dims.size() - 1]; + int collapsedDims = 0; + + bool hasDifferentDims = false; + while (currentJitWorkAmount < minimalJitWorkAmount && currentJitWorkAmount < fullWorkAmount && + // we shouldn't collapse batch dimension in case dynamic batch is enabled + (!useDynBatch || (outBlkDims.size() - collapsedDims > 2))) { + if (collapsedDims >= maxCollapsedDims) + break; + + for (int j = 1; j < inpDims.size(); j++) { + if (inpDims[j].back() != inpDims[0].back()) { + hasDifferentDims = true; + } + } + + if (oc_size > 1 && oc_size != inpDims[0][inpDims[0].size() - 1]) { + hasDifferentDims = true; + } + + bool canCollapse = true; + for (int i = 0; i < inpDims.size(); i++) { + if (inpDims[i][inpDims[i].size() - 2] != 1) { + if (hasDifferentDims) { + canCollapse = false; + break; + } + } + } + + if (!canCollapse) { + break; + } + + size_t nextJitWorkAmount = currentJitWorkAmount * jep.dims[jep.dims.size() - 2]; + if (fullWorkAmount / nextJitWorkAmount >= minimalConcurrency) { + currentJitWorkAmount = nextJitWorkAmount; + collapsedDims++; + + for (int i = 0; i < inpDims.size(); i++) { + collapseLastDims(inpDims[i], 1); + } + collapseLastDims(jep.dims, 1); + + if (isFusedWith(FakeQuantize)) { + collapseLastOffsets(jep.oc_offsets, 1); + } + } else { + break; + } + } + + _batchDimIdx = jep.input_size - outBlkDims.size() + collapsedDims; + _schedulerWorkAmount = fullWorkAmount / jep.dims[jep.dims.size() - 1]; + + if (inpPrc.size() != inputsNumber) { + IE_THROW() << "Can not make Elwtise executor. 
Wrong input precisions vector size."; + } + + // init offset + jep.dst_offsets.resize(jep.input_size, 1); + offset_out_calc(jep.dst_offsets, jep.dims); + for (int j = 0; j < jep.input_size; j++) { + jep.dst_offsets[j] *= outPrc.size(); + } + + for (int i = 0; i < inputsNumber; i++) { + jep.src_offsets[i].resize(jep.input_size, 1); + offset_in_calc(jep.src_offsets[i], inpDims[i], jep.dims); + for (int j = 0; j < jep.input_size; j++) { + jep.src_offsets[i][j] *= inpPrc[i].size(); + } + } + + jep.inputs_number = inputsNumber; + + for (int i = 0; i < inputsNumber; i++) { + jep.src_prc[i] = inpPrc[i]; + jep.src_size[i] = inpDims[i][inpDims[i].size() - 1]; + } + jep.dst_prc = outPrc; + jep.work_amount = jep.dst_size = jep.dims.back(); + jep.oc_size = oc_size; + + std::transform(jep.oc_offsets.begin(), jep.oc_offsets.end(), jep.oc_offsets.begin(), + [](size_t& offset) { return offset * sizeof(float);}); + + if (mayiuse(x64::avx512_common)) { + _pKernel.reset(new jit_uni_eltwise_generic(jep, eltwise_data, ops_list, post_ops)); + } else if (mayiuse(x64::avx2)) { + _pKernel.reset(new jit_uni_eltwise_generic(jep, eltwise_data, ops_list, post_ops)); + } else if (mayiuse(x64::sse41)) { + _pKernel.reset(new jit_uni_eltwise_generic(jep, eltwise_data, ops_list, post_ops)); + } else { + IE_THROW() << "Can't create jit eltwise kernel"; + } + + if (_pKernel) + _pKernel->create_ker(); + } + + void exec(const jit_eltwise_call_args_ptrs &args_ptrs, const VectorDims &dims_out) override { + if (!_pKernel) + IE_THROW() << "Can't execute, kernel for eltwise node is not compiled"; + + if (_pKernel->jep_.input_size == optimalTensorRank) { + // execute Optimized 6D + parallel_for5d(dims_out[0], dims_out[1], dims_out[2], dims_out[3], dims_out[4], + [&](size_t i0, size_t i1, size_t i2, size_t i3, size_t i4) { + auto args = jit_eltwise_call_args_indexes(); + args.indexes[0] = i0; + args.indexes[1] = i1; + args.indexes[2] = i2; + args.indexes[3] = i3; + args.indexes[4] = i4; + + 
(*_pKernel)(&args_ptrs, &args); + }); + } else { + // execute Optimized Generic + parallel_nt(0, [&](const int ithr, const int nthr) { + size_t start = 0, end = 0; + splitter(_schedulerWorkAmount, nthr, ithr, start, end); + + std::vector counters(dims_out.size() - 1, 0); + auto args = jit_eltwise_call_args_indexes(); + for (size_t iwork = start; iwork < end; ++iwork) { + size_t tmp = iwork; + for (ptrdiff_t j = dims_out.size() - 2; j >= 0; j--) { + counters[j] = tmp % dims_out[j]; + tmp /= dims_out[j]; + } + + for (size_t j = 0; j < counters.size(); j++) + args.indexes[j] = counters[j]; + + (*_pKernel)(&args_ptrs, &args); + } + }); + } + } + const VectorDims& getOutDims() const override { + if (!_pKernel) + IE_THROW() << "Can't get jit eltwise params, kernel for Eltwise executor is not compiled"; + return _pKernel->jep_.dims; + } + size_t getBatchDimIdx() const override { + return _batchDimIdx; + } + +private: + std::unique_ptr _pKernel; + size_t _schedulerWorkAmount = 0; + size_t _batchDimIdx = 0; + +public: + static const int optimalTensorRank = 6; +}; + +class EltwiseRefExecutor : public MKLDNNEltwiseNode::IEltwiseExecutor { +public: + EltwiseRefExecutor(MKLDNNEltwiseNode::EltwiseData opData, + const VectorDims& outBlkDims, + std::vector inpDims) + : _opData(std::move(opData)) { + if (inpDims.empty()) { + IE_THROW() << "Can not make Eltwise executor from empty input dims array"; + } else if (inpDims.front().empty()) { + IE_THROW() << "Can not make Eltwise executor from empty input dims array members"; + } + + if (outBlkDims.empty()) { + IE_THROW() << "Can not make Elwtise executor from empty output blocked dims vector"; + } + + _inputNum = inpDims.size(); + size_t input_size = inpDims.front().size(); + _batchDimIdx = input_size - outBlkDims.size(); + + _dims.resize(input_size, 1); + for (int i = 0; i < outBlkDims.size(); i++) { + _dims[_dims.size() - 1 - i] = outBlkDims[outBlkDims.size() - 1 - i]; + } + + _fullWorkAmount = 1; + for (int i = 0; i < _dims.size(); 
i++) { + _fullWorkAmount *= _dims[i]; + } + + // init offset + _dst_offsets.resize(input_size, 1); + EltwiseJitExecutor::offset_out_calc(_dst_offsets, _dims); + for (int j = 0; j < input_size; j++) { + _dst_offsets[j] *= sizeof(float); // only FP32 out prc is supported + } + + for (int i = 0; i < _inputNum; i++) { + _src_offsets[i].resize(input_size, 1); + EltwiseJitExecutor::offset_in_calc(_src_offsets[i], inpDims[i], _dims); + for (int j = 0; j < input_size; j++) { + _src_offsets[i][j] *= sizeof(float); // only FP32 inp prcs are supported + } + } + } + + void exec(const jit_eltwise_call_args_ptrs &args_ptrs, const VectorDims &dims_out) override { + std::shared_ptr ref_eltwise_injector = nullptr; + if (_opData.mkldnnAlgorithm != mkldnn::algorithm::undef) { + ref_eltwise_injector = std::make_shared( + static_cast(_opData.mkldnnAlgorithm), _opData.alpha, _opData.beta, 1.f); + } + + parallel_nt(0, [&](const int ithr, const int nthr) { + size_t start = 0, end = 0; + splitter(_fullWorkAmount, nthr, ithr, start, end); + + std::vector counters(dims_out.size(), 0); + + for (size_t iwork = start; iwork < end; ++iwork) { + size_t tmp = iwork; + for (ptrdiff_t j = dims_out.size() - 1; j >= 0; j--) { + counters[j] = tmp % dims_out[j]; + tmp /= dims_out[j]; + } + + size_t index_in[MAX_ELTWISE_INPUTS] = {0}; + for (int i = 0; i < _inputNum; i++) { + index_in[i] = 0; + for (int j = 0; j < counters.size(); j++) { + index_in[i] += counters[j] * _src_offsets[i][j]; + } + index_in[i] /= sizeof(float); + } + + size_t index_out = 0; + for (int j = 0; j < counters.size(); j++) { + index_out += counters[j] * _dst_offsets[j]; + } + index_out /= sizeof(float); + + std::vector src_f(_inputNum); + for (int i = 0; i < _inputNum; i++) { + src_f[i] = (reinterpret_cast(args_ptrs.src_ptr[i]) + index_in[i])[0]; + } + float* dst_ptr_f = reinterpret_cast(args_ptrs.dst_ptr) + index_out; + + switch (_opData.algo) { + case EltwiseRelu: case EltwiseGelu: case EltwiseElu: case EltwiseTanh: case 
EltwiseSigmoid: case EltwiseAbs: + case EltwiseSqrt: case EltwiseSoftRelu: case EltwiseExp: case EltwiseClamp: + case EltwiseSwish: case EltwiseHswish: case EltwiseMish: case EltwiseHsigmoid: + case EltwiseRoundHalfToEven: case EltwiseRoundHalfAwayFromZero: + *dst_ptr_f = ref_eltwise_injector->compute_scalar(src_f[0]); break; + case EltwiseAdd: *dst_ptr_f = src_f[0] + src_f[1]; break; + case EltwiseMulAdd: *dst_ptr_f = src_f[0] * src_f[1] + src_f[2]; break; + case EltwiseSubtract: *dst_ptr_f = src_f[0] - src_f[1]; break; + case EltwiseMultiply: *dst_ptr_f = src_f[0] * src_f[1]; break; + case EltwiseDivide: *dst_ptr_f = src_f[0] / src_f[1]; break; + case EltwiseFloorMod: *dst_ptr_f = src_f[0] - floorf(src_f[0] / src_f[1]) * src_f[1]; break; + case EltwiseMod: *dst_ptr_f = src_f[0] - truncf(src_f[0] / src_f[1]) * src_f[1]; break; + case EltwiseMaximum: *dst_ptr_f = std::max(src_f[0], src_f[1]); break; + case EltwiseMinimum: *dst_ptr_f = std::min(src_f[0], src_f[1]); break; + case EltwiseSquaredDifference: *dst_ptr_f = powf((src_f[0] - src_f[1]), 2.f); break; + case EltwisePowerDynamic: *dst_ptr_f = powf(src_f[0], src_f[1]); break; + case EltwiseEqual: *dst_ptr_f = src_f[0] == src_f[1]; break; + case EltwiseNotEqual: *dst_ptr_f = src_f[0] != src_f[1]; break; + case EltwiseGreater: *dst_ptr_f = src_f[0] > src_f[1]; break; + case EltwiseGreaterEqual: *dst_ptr_f = src_f[0] >= src_f[1]; break; + case EltwiseLess: *dst_ptr_f = src_f[0] < src_f[1]; break; + case EltwiseLessEqual: *dst_ptr_f = src_f[0] <= src_f[1]; break; + case EltwiseLogicalAnd: *dst_ptr_f = src_f[0] && src_f[1]; break; + case EltwiseLogicalOr: *dst_ptr_f = src_f[0] || src_f[1]; break; + case EltwiseLogicalXor: *dst_ptr_f = (src_f[0] || src_f[1]) - (src_f[0] && src_f[1]); break; + case EltwiseLogicalNot: *dst_ptr_f = !src_f[0]; break; + case EltwisePowerStatic: *dst_ptr_f = powf(_opData.beta * src_f[0] + _opData.gamma, _opData.alpha); break; + case EltwisePrelu: *dst_ptr_f = src_f[0] > 0 ? 
src_f[0] : src_f[0] * src_f[1]; break; + case EltwiseErf: *dst_ptr_f = std::erf(src_f[0]); break; + default: IE_THROW() << "Unsupported operation type for Eltwise executor"; + } + } + }); + } + + const VectorDims& getOutDims() const override { + return _dims; + } + + size_t getBatchDimIdx() const override { + return _batchDimIdx; + } + +private: + const MKLDNNEltwiseNode::EltwiseData _opData; + VectorDims _dims; + VectorDims _src_offsets[MAX_ELTWISE_INPUTS]; + VectorDims _dst_offsets; + size_t _fullWorkAmount = 0; + size_t _inputNum = 0; + size_t _batchDimIdx = 0; +}; + +} // namespace + +bool MKLDNNEltwiseNode::EltwiseData::operator==(const EltwiseData &rhs) const noexcept { + return algo == rhs.algo && + mkldnnAlgorithm == rhs.mkldnnAlgorithm && + alpha == rhs.alpha && + beta == rhs.beta && + gamma == rhs.gamma; +} + +static MKLDNNEltwiseNode::executorPtr buildExecutor(const EltwiseKey& key) { + MKLDNNEltwiseNode::executorPtr execPtr; + if (key.useJit) { + execPtr = std::make_shared(key.eltwise_data, + key.ops_list, + key.outBlkDims, + key.outOrder, + key.inpDims, + key.inpPrc, + key.outPrc, + key.postOps, + key.useDynBatch); + } else { + execPtr = std::make_shared(key.eltwise_data.front(), + key.outBlkDims, + key.inpDims); + } + return execPtr; +} + bool MKLDNNEltwiseNode::isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept { try { if (initializers.find(op->get_type_info()) == initializers.end()) { @@ -1073,6 +1620,10 @@ void MKLDNNEltwiseNode::initSupportedPrimitiveDescriptors() { // if dim rank is greater than the maximum possible, we should use the reference execution canUseOptimizedImpl = mayiuse(x64::sse41) && getInputShapeAtPort(0).getRank() <= MAX_ELTWISE_DIM_RANK; + if (!canUseOptimizedImpl && !fusedWith.empty()) { + IE_THROW(Unexpected) << "Eltwise node with name '" << getName() << "' uses reference impl, but unexpectedly fused with other ops"; + } + size_t expectedInputsNum = getOpInputsNum(); for (auto& postOp : 
fusedWith) { auto* eltwiseNode = dynamic_cast(postOp.get()); @@ -1310,58 +1861,21 @@ void MKLDNNEltwiseNode::prepareParams() { memPtrs.push_back(getChildEdgeAt(0)->getMemoryPtr()); } - auto collapseLastDims = [](std::vector& dims, int dimsToCollapse) { - for (int i = dims.size() - 2; i > dims.size() - dimsToCollapse - 2; i--) { - dims[dims.size() - 1] *= dims[i]; - } - - for (int i = dims.size() - 2; i >= dimsToCollapse; i--) { - dims[i] = dims[i - dimsToCollapse]; - } - - for (int i = dimsToCollapse - 1; i >= 0; i--) { - dims[i] = 1; - } - }; - - auto collapseLastOffsets = [](std::vector& dims, int dimsToCollapse) { - for (int i = dims.size() - 2; i > dims.size() - dimsToCollapse - 2; i--) { - if (dims[dims.size() - 1] > 0 || dims[i] > 0) - dims[dims.size() - 1] = std::max(dims[dims.size() - 1], static_cast(1)) * std::max(dims[i], static_cast(1)); - else - dims[dims.size() - 1] *= dims[i]; - } - - for (int i = dims.size() - 2; i >= dimsToCollapse; i--) { - dims[i] = dims[i - dimsToCollapse]; - } - - for (int i = dimsToCollapse - 1; i >= 0; i--) { - dims[i] = 0; - } - }; - - jit_eltwise_params jep = {}; - std::vector dims_in; - auto outBlockingDesc = getChildEdgeAt(0)->getMemory().GetDescWithType(); const auto &outOrder = outBlockingDesc->getOrder(); const auto ¤tOutBlkDims = outBlockingDesc->getBlockDims(); + isDynBatchEnabled = getSelectedPrimitiveDescriptor()->getConfig().dynBatchSupport; - jep.input_size = std::max(static_cast(optimalTensorRank), currentOutBlkDims.size()); + size_t input_size = std::max(static_cast(EltwiseJitExecutor::optimalTensorRank), currentOutBlkDims.size()); + std::vector dims_in; // init dims dims_in.resize(inputNum); for (int i = 0; i < inputNum; i++) { - dims_in[i].resize(jep.input_size, 1); + dims_in[i].resize(input_size, 1); } - jep.dims.resize(jep.input_size, 1); - size_t outRank = currentOutBlkDims.size(); - for (int i = 0; i < outRank; i++) { - jep.dims[jep.dims.size() - 1 - i] = currentOutBlkDims[outRank - 1 - i]; - } for (int i 
= 0; i < inputNum; i++) { auto inBlockingDesc = getParentEdgeAt(i)->getMemory().GetDescWithType(); @@ -1383,108 +1897,6 @@ void MKLDNNEltwiseNode::prepareParams() { } } - for (int i = 0; i < dims_in.size(); i++) { - for (int j = 0; j < dims_in[i].size(); j++) { - if (dims_in[i][j] != jep.dims[j] && dims_in[i][j] != 1) - IE_THROW() << "Eltwise node with name `" << getName() << "` has invalid input/output dims configuration."; - } - } - - size_t oc_size = 0; - jep.oc_offsets.resize(jep.input_size, 0); - std::fill(jep.oc_offsets.begin(), jep.oc_offsets.end(), 0); - if (isFusedWith(FakeQuantize)) { - size_t offset_oc = 1; - for (int i = outOrder.size() - 1; i >= 0; i--) { - if (outOrder[i] == 1) { - int oc_dim_idx = i + (jep.input_size - outOrder.size()); - jep.oc_offsets[oc_dim_idx] = offset_oc; - offset_oc *= jep.dims[oc_dim_idx]; - } - } - oc_size = jep.oc_offsets[jep.dims.size() - 1] != 0 ? jep.dims[jep.dims.size() - 1] : 1; - } - - size_t fullWorkAmount = 1; - for (int i = 0; i < jep.dims.size(); i++) { - fullWorkAmount *= jep.dims[i]; - } - - isDynBatchEnabled = getSelectedPrimitiveDescriptor()->getConfig().dynBatchSupport; - - size_t minimalConcurrency = parallel_get_max_threads(); - size_t minimalJitWorkAmount = 256; - size_t currentJitWorkAmount = jep.dims[jep.dims.size() - 1]; - int collapsedDims = 0; - if (canUseOptimizedImpl) { - bool hasDifferentDims = false; - while (currentJitWorkAmount < minimalJitWorkAmount && currentJitWorkAmount < fullWorkAmount && - // we shouldn't collapse batch dimension in case dynamic batch is enabled - (!isDynBatchEnabled || (currentOutBlkDims.size() - collapsedDims > 2))) { - if (static_cast(jep.dims.size()) - collapsedDims - 2 < 0) - break; - - for (int j = 1; j < dims_in.size(); j++) { - if (dims_in[j].back() != dims_in[0].back()) { - hasDifferentDims = true; - } - } - - if (oc_size > 1 && oc_size != dims_in[0][dims_in[0].size() - 1]) { - hasDifferentDims = true; - } - - bool canCollapse = true; - for (int i = 0; i < 
dims_in.size(); i++) { - if (dims_in[i][dims_in[i].size() - 2] != 1) { - if (hasDifferentDims) { - canCollapse = false; - break; - } - } - } - - if (!canCollapse) { - break; - } - - size_t nextJitWorkAmount = currentJitWorkAmount * jep.dims[jep.dims.size() - 2]; - if (fullWorkAmount / nextJitWorkAmount >= minimalConcurrency) { - currentJitWorkAmount = nextJitWorkAmount; - collapsedDims++; - - for (int i = 0; i < dims_in.size(); i++) { - collapseLastDims(dims_in[i], 1); - } - collapseLastDims(jep.dims, 1); - - if (isFusedWith(FakeQuantize)) { - collapseLastOffsets(jep.oc_offsets, 1); - } - } else { - break; - } - } - } - - size_t batchDimIdx = jep.input_size - currentOutBlkDims.size() + collapsedDims; - size_t schedulerWorkAmount = fullWorkAmount / jep.dims[jep.dims.size() - 1]; - - // init offset - jep.dst_offsets.resize(jep.input_size, 1); - offset_out_calc(jep.dst_offsets, jep.dims); - for (int j = 0; j < jep.input_size; j++) { - jep.dst_offsets[j] *= getChildEdgeAt(0)->getMemory().getDesc().getPrecision().size(); - } - - for (int i = 0; i < inputNum; i++) { - jep.src_offsets[i].resize(jep.input_size, 1); - offset_in_calc(jep.src_offsets[i], dims_in[i], jep.dims); - for (int j = 0; j < jep.input_size; j++) { - jep.src_offsets[i][j] *= getParentEdgeAt(i)->getMemory().getDesc().getPrecision().size(); - } - } - start_offset_in.resize(inputNum); for (size_t i = 0; i < inputNum; i++) { const auto desc = getParentEdgeAt(i)->getMemory().GetDescWithType(); @@ -1493,24 +1905,45 @@ void MKLDNNEltwiseNode::prepareParams() { const auto desc = getChildEdgeAt(0)->getMemory().GetDescWithType(); start_offset_out = desc->getOffsetPadding() * desc->getPrecision().size(); - jep.inputs_number = inputNum; - - for (int i = 0; i < inputNum; i++) { - jep.src_prc[i] = getParentEdgesAtPort(i).front()->getMemory().getDesc().getPrecision(); - jep.src_size[i] = dims_in[i][dims_in[i].size() - 1]; + std::vector inpPrc; + for (size_t i = 0; i < inputNum; ++i) { + 
inpPrc.push_back(getParentEdgeAt(i)->getMemory().getDesc().getPrecision()); } - jep.dst_prc = getChildEdgesAtPort(0).front()->getMemory().getDesc().getPrecision(); - jep.work_amount = jep.dst_size = jep.dims.back(); - jep.oc_size = oc_size; - std::transform(jep.oc_offsets.begin(), jep.oc_offsets.end(), jep.oc_offsets.begin(), - [](size_t& offset) { return offset * sizeof(float);}); + auto outPrc = getChildEdgeAt(0)->getMemory().getDesc().getPrecision(); - if (canUseOptimizedImpl) { - execPtr = std::make_shared(jep, *this, schedulerWorkAmount, batchDimIdx); - } else { - execPtr = std::make_shared(jep, fullWorkAmount, batchDimIdx); + EltwiseData thisOp{getAlgorithm(), getMKLDNNAlgorithm(), getAlpha(), getBeta(), getGamma()}; + + EltwiseKey key = {{thisOp}, {getType()}, currentOutBlkDims, outOrder, dims_in, inpPrc, outPrc, mkldnn::post_ops(), isDynBatchEnabled, canUseOptimizedImpl}; + + for (const auto &node : fusedWith) { + key.ops_list.push_back(node->getType()); + if (node->getType() == Eltwise) { + if (auto eltwise = std::dynamic_pointer_cast(node)) { + key.eltwise_data.push_back({eltwise->getAlgorithm(), eltwise->getMKLDNNAlgorithm(), eltwise->getAlpha(), + eltwise->getBeta(), eltwise->getGamma()}); + } + } else if (node->getType() == FakeQuantize) { + node->appendPostOps(key.postOps, {}); + } else { + IE_THROW(Unexpected) << "Eltwise node with name '" << getName() << "' has unexpected fused op of type '" << node->getTypeStr() << "'"; + } } + + // TODO: we need to rewrite quantization_t to remove the pointers from its content and update all the jit kernels at once + // together with the corresponding appendPostOps method to pass the scales and shifts pointers at runtime. 
+ // Until then we have to read them from the quantization_t directly, store them somewhere + // and nullify them to get read of the address dependency in the key structure + for (int i = 0; i < key.postOps.len(); ++i) { + auto &data = key.postOps.get()->entry_[i].quantization.data; + fqDataPtrs.insert(fqDataPtrs.end(), std::begin(data), std::end(data)); + memset(data, 0, sizeof(data)); + } + // end of TODO + + auto cache = getRuntimeCache(); + auto result = cache->getOrCreate(key, buildExecutor); + execPtr = result.first; } bool MKLDNNEltwiseNode::needPrepareParams() const { @@ -1545,146 +1978,11 @@ void MKLDNNEltwiseNode::initOptimalPrimitiveDescriptor() { } } -void MKLDNNEltwiseNode::offset_out_calc(VectorDims& offset, VectorDims& dims) { - int k = 1; - for (int i = offset.size() - 1; i >= 0; i--) { - offset[i] = k; - k *= dims[i]; - } -} - -void MKLDNNEltwiseNode::offset_in_calc(VectorDims& offset, VectorDims& dims_in, VectorDims& dims_out) { - int k = 1; - for (int i = offset.size() - 1; i >= 0; i--) { - offset[i] = (dims_in[i] == dims_out[i]) ? 
k : 0; - k *= dims_in[i]; - } -} - -void MKLDNNEltwiseNode::executeOptimized6D(const std::unique_ptr &pKernel, const jit_eltwise_call_args_ptrs &args_ptrs, - const VectorDims &dims_out) const { - parallel_for5d(dims_out[0], dims_out[1], dims_out[2], dims_out[3], dims_out[4], - [&](size_t i0, size_t i1, size_t i2, size_t i3, size_t i4) { - auto args = jit_eltwise_call_args_indexes(); - args.indexes[0] = i0; - args.indexes[1] = i1; - args.indexes[2] = i2; - args.indexes[3] = i3; - args.indexes[4] = i4; - - (*pKernel)(&args_ptrs, &args); - }); -} - -void MKLDNNEltwiseNode::executeOptimizedGeneric(const std::unique_ptr &pKernel, const jit_eltwise_call_args_ptrs &args_ptrs, - const VectorDims &dims_out, const size_t schedulerWorkAmount) const { - parallel_nt(0, [&](const int ithr, const int nthr) { - size_t start = 0, end = 0; - splitter(schedulerWorkAmount, nthr, ithr, start, end); - - std::vector counters(dims_out.size() - 1, 0); - auto args = jit_eltwise_call_args_indexes(); - for (size_t iwork = start; iwork < end; ++iwork) { - size_t tmp = iwork; - for (ptrdiff_t j = dims_out.size() - 2; j >= 0; j--) { - counters[j] = tmp % dims_out[j]; - tmp /= dims_out[j]; - } - - for (size_t j = 0; j < counters.size(); j++) - args.indexes[j] = counters[j]; - - (*pKernel)(&args_ptrs, &args); - } - }); -} - -void MKLDNNEltwiseNode::executeReference(const jit_eltwise_params &jep, const jit_eltwise_call_args_ptrs &args_ptrs, const VectorDims &dims_out, - const size_t fullWorkAmount) const { - std::shared_ptr ref_eltwise_injector = nullptr; - if (getMKLDNNAlgorithm() != mkldnn::algorithm::undef) { - ref_eltwise_injector = std::make_shared(static_cast(getMKLDNNAlgorithm()), alpha, beta, 1.f); - } - - parallel_nt(0, [&](const int ithr, const int nthr) { - size_t start = 0, end = 0; - splitter(fullWorkAmount, nthr, ithr, start, end); - - std::vector counters(dims_out.size(), 0); - - for (size_t iwork = start; iwork < end; ++iwork) { - size_t tmp = iwork; - for (ptrdiff_t j = 
dims_out.size() - 1; j >= 0; j--) { - counters[j] = tmp % dims_out[j]; - tmp /= dims_out[j]; - } - - size_t index_in[MAX_ELTWISE_INPUTS] = {0}; - for (int i = 0; i < inputNum; i++) { - index_in[i] = 0; - for (int j = 0; j < counters.size(); j++) { - index_in[i] += counters[j] * jep.src_offsets[i][j]; - } - index_in[i] /= sizeof(float); - } - - size_t index_out = 0; - for (int j = 0; j < counters.size(); j++) { - index_out += counters[j] * jep.dst_offsets[j]; - } - index_out /= sizeof(float); - - std::vector src_f(inputNum); - for (int i = 0; i < inputNum; i++) { - src_f[i] = (reinterpret_cast(args_ptrs.src_ptr[i]) + index_in[i])[0]; - } - float* dst_ptr_f = reinterpret_cast(args_ptrs.dst_ptr) + index_out; - - switch (getAlgorithm()) { - case EltwiseRelu: case EltwiseGelu: case EltwiseElu: case EltwiseTanh: case EltwiseSigmoid: case EltwiseAbs: - case EltwiseSqrt: case EltwiseSoftRelu: case EltwiseExp: case EltwiseClamp: - case EltwiseSwish: case EltwiseHswish: case EltwiseMish: case EltwiseHsigmoid: case EltwiseRoundHalfToEven: case EltwiseRoundHalfAwayFromZero: - *dst_ptr_f = ref_eltwise_injector->compute_scalar(src_f[0]); break; - case EltwiseAdd: *dst_ptr_f = src_f[0] + src_f[1]; break; - case EltwiseMulAdd: *dst_ptr_f = src_f[0] * src_f[1] + src_f[2]; break; - case EltwiseSubtract: *dst_ptr_f = src_f[0] - src_f[1]; break; - case EltwiseMultiply: *dst_ptr_f = src_f[0] * src_f[1]; break; - case EltwiseDivide: *dst_ptr_f = src_f[0] / src_f[1]; break; - case EltwiseFloorMod: *dst_ptr_f = src_f[0] - floorf(src_f[0] / src_f[1]) * src_f[1]; break; - case EltwiseMod: *dst_ptr_f = src_f[0] - truncf(src_f[0] / src_f[1]) * src_f[1]; break; - case EltwiseMaximum: *dst_ptr_f = std::max(src_f[0], src_f[1]); break; - case EltwiseMinimum: *dst_ptr_f = std::min(src_f[0], src_f[1]); break; - case EltwiseSquaredDifference: *dst_ptr_f = powf((src_f[0] - src_f[1]), 2.f); break; - case EltwisePowerDynamic: *dst_ptr_f = powf(src_f[0], src_f[1]); break; - case EltwiseEqual: *dst_ptr_f 
= src_f[0] == src_f[1]; break; - case EltwiseNotEqual: *dst_ptr_f = src_f[0] != src_f[1]; break; - case EltwiseGreater: *dst_ptr_f = src_f[0] > src_f[1]; break; - case EltwiseGreaterEqual: *dst_ptr_f = src_f[0] >= src_f[1]; break; - case EltwiseLess: *dst_ptr_f = src_f[0] < src_f[1]; break; - case EltwiseLessEqual: *dst_ptr_f = src_f[0] <= src_f[1]; break; - case EltwiseLogicalAnd: *dst_ptr_f = src_f[0] && src_f[1]; break; - case EltwiseLogicalOr: *dst_ptr_f = src_f[0] || src_f[1]; break; - case EltwiseLogicalXor: *dst_ptr_f = (src_f[0] || src_f[1]) - (src_f[0] && src_f[1]); break; - case EltwiseLogicalNot: *dst_ptr_f = !src_f[0]; break; - case EltwisePowerStatic: *dst_ptr_f = powf(beta * src_f[0] + gamma, alpha); break; - case EltwisePrelu: *dst_ptr_f = src_f[0] > 0 ? src_f[0] : src_f[0] * src_f[1]; break; - case EltwiseErf: *dst_ptr_f = std::erf(src_f[0]); break; - default: IE_THROW() << "Unsupported operation type for Eltwise node with name `" << getName() << "`"; - } - } - }); -} - -void MKLDNNEltwiseNode::executeDynamicImpl(mkldnn::stream strm) { - execute(strm); -} - void MKLDNNEltwiseNode::execute(mkldnn::stream strm) { if (execPtr) { jit_eltwise_call_args_ptrs args_ptrs = {}; - const auto &jep = execPtr->getJep(); - const auto &batchDimIdx = execPtr->batchDimIdx; - VectorDims dims_out = jep.dims; + auto batchDimIdx = execPtr->getBatchDimIdx(); + VectorDims dims_out = execPtr->getOutDims(); for (int i = 0; i < memPtrs.size() - 1; i++) args_ptrs.src_ptr[i] = reinterpret_cast(memPtrs[i]->GetData()) + start_offset_in[i]; args_ptrs.dst_ptr = reinterpret_cast(memPtrs.back()->GetData()) + start_offset_out; @@ -1696,12 +1994,20 @@ void MKLDNNEltwiseNode::execute(mkldnn::stream strm) { dims_out[batchDimIdx] = static_cast(batchToProcess()); } - execPtr->exec(*this, args_ptrs, dims_out); + std::vector vecPostOpData(fqDataPtrs.size()); + + args_ptrs.post_op_data = fqDataPtrs.data(); + + execPtr->exec(args_ptrs, dims_out); } else { - IE_THROW() << "Can't execute eltwise 
node. Primitive didn't created"; + IE_THROW() << "Can't execute eltwise node. Primitive has not been created"; } } +void MKLDNNEltwiseNode::executeDynamicImpl(mkldnn::stream strm) { + execute(strm); +} + bool MKLDNNEltwiseNode::created() const { return getType() == Eltwise; } @@ -1927,41 +2233,4 @@ InferenceEngine::Precision MKLDNNEltwiseNode::getRuntimePrecision() const { return getMaxPrecision(inputPrecisions); } -MKLDNNEltwiseNode::EltwiseJitExecutor::EltwiseJitExecutor(const jit_eltwise_params &_jep, MKLDNNEltwiseNode& node, const size_t schedWA, const size_t batch) - : schedulerWorkAmount(schedWA), EltwiseExecutor(batch) { - if (mayiuse(x64::avx512_common)) { - pKernel.reset(new jit_uni_eltwise_generic(_jep, node)); - } else if (mayiuse(x64::avx2)) { - pKernel.reset(new jit_uni_eltwise_generic(_jep, node)); - } else if (mayiuse(x64::sse41)) { - pKernel.reset(new jit_uni_eltwise_generic(_jep, node)); - } else { - IE_THROW() << "Can't create jit eltwise kernel"; - } - - if (pKernel) - pKernel->create_ker(); -} - -void MKLDNNEltwiseNode::EltwiseJitExecutor::exec(const MKLDNNEltwiseNode& node, const jit_eltwise_call_args_ptrs &args_ptrs, const VectorDims &dims_out) { - if (!pKernel) - IE_THROW() << "Can't execute, kernel for eltwise node is not compiled"; - - if (pKernel->jep_.input_size == MKLDNNEltwiseNode::optimalTensorRank) { - node.executeOptimized6D(pKernel, args_ptrs, dims_out); - } else { - node.executeOptimizedGeneric(pKernel, args_ptrs, dims_out, schedulerWorkAmount); - } -} - -void MKLDNNEltwiseNode::EltwiseRefExecutor::exec(const MKLDNNEltwiseNode& node, const jit_eltwise_call_args_ptrs &args_ptrs, const VectorDims &dims_out) { - node.executeReference(jep, args_ptrs, dims_out, fullWorkAmount); -} - -const jit_eltwise_params& MKLDNNEltwiseNode::EltwiseJitExecutor::getJep() const { - if (!pKernel) - IE_THROW() << "Can't get jit eltwise params, kernel for eltwise node is not compiled"; - return pKernel->jep_; -} - REG_MKLDNN_PRIM_FOR(MKLDNNEltwiseNode, 
Eltwise); diff --git a/src/plugins/intel_cpu/src/nodes/mkldnn_eltwise_node.h b/src/plugins/intel_cpu/src/nodes/mkldnn_eltwise_node.h index b10d8b0a3c5..18b7c947972 100644 --- a/src/plugins/intel_cpu/src/nodes/mkldnn_eltwise_node.h +++ b/src/plugins/intel_cpu/src/nodes/mkldnn_eltwise_node.h @@ -38,6 +38,8 @@ struct jit_eltwise_params { struct jit_eltwise_call_args_ptrs { const void *src_ptr[MAX_ELTWISE_INPUTS]; void *dst_ptr; + //ptr to array of post op inputs pointers (flat list) + const void** post_op_data; }; struct jit_eltwise_call_args_indexes { @@ -54,16 +56,37 @@ struct jit_uni_eltwise_kernel { ker_(const_args, indexes); } - explicit jit_uni_eltwise_kernel(const jit_eltwise_params& jep, MKLDNNEltwiseNode& node) : ker_(nullptr), jep_(jep), eltwiseNode(node) {} + explicit jit_uni_eltwise_kernel(const jit_eltwise_params& jep) : ker_(nullptr), jep_(jep) {} virtual ~jit_uni_eltwise_kernel() {} virtual void create_ker() = 0; jit_eltwise_params jep_; - MKLDNNEltwiseNode& eltwiseNode; }; class MKLDNNEltwiseNode : public MKLDNNNode { +public: + struct EltwiseData { + Algorithm algo; + mkldnn::algorithm mkldnnAlgorithm; + float alpha; + float beta; + float gamma; + + bool operator==(const EltwiseData& rhs) const noexcept; + }; + + class IEltwiseExecutor { + public: + IEltwiseExecutor() = default; + virtual void exec(const jit_eltwise_call_args_ptrs &args_ptrs, const VectorDims &dims_out) = 0; + virtual size_t getBatchDimIdx() const = 0; + virtual const VectorDims& getOutDims() const = 0; + virtual ~IEltwiseExecutor() = default; + }; + + using executorPtr = std::shared_ptr; + public: MKLDNNEltwiseNode(const std::shared_ptr& op, const mkldnn::engine& eng, MKLDNNWeightsSharing::Ptr &cache); @@ -108,41 +131,11 @@ public: private: - struct EltwiseExecutor { - EltwiseExecutor(size_t batch) : batchDimIdx(batch) {} - virtual void exec(const MKLDNNEltwiseNode& node, const jit_eltwise_call_args_ptrs &args_ptrs, const VectorDims &dims_out) = 0; - virtual const jit_eltwise_params& 
getJep() const = 0; - virtual ~EltwiseExecutor() = default; - - size_t batchDimIdx = 0; - }; - using executorPtr = std::shared_ptr; executorPtr execPtr = nullptr; - - struct EltwiseJitExecutor : public EltwiseExecutor { - EltwiseJitExecutor(const jit_eltwise_params &_jep, MKLDNNEltwiseNode& node, const size_t schedWA, const size_t batch); - void exec(const MKLDNNEltwiseNode& node, const jit_eltwise_call_args_ptrs &args_ptrs, const VectorDims &dims_out) override; - const jit_eltwise_params& getJep() const override; - - std::unique_ptr pKernel; - size_t schedulerWorkAmount = 0; - }; - - struct EltwiseRefExecutor : public EltwiseExecutor { - EltwiseRefExecutor(const jit_eltwise_params &_jep, const size_t fullWA, const size_t batch) : jep(_jep), fullWorkAmount(fullWA), - EltwiseExecutor(batch) {} - void exec(const MKLDNNEltwiseNode& node, const jit_eltwise_call_args_ptrs &args_ptrs, const VectorDims &dims_out) override; - const jit_eltwise_params& getJep() const override { return jep; } - - jit_eltwise_params jep; - size_t fullWorkAmount = 0; - }; - BroadcastingPolicy broadcastingPolicy; mkldnn::algorithm mkldnnAlgorithm = mkldnn::algorithm::undef; - static const int optimalTensorRank = 6; bool canUseOptimizedImpl = false; bool isDynBatchEnabled = false; bool specialConvolutionAddFusing = false; @@ -163,22 +156,13 @@ private: std::vector shiftsBuffer = {}; std::vector memPtrs = {}; + std::vector fqDataPtrs; using Initializer = std::function&, MKLDNNEltwiseNode& node)>; static const std::map initializers; static BroadcastingPolicy determineBroadcastingPolicy(const std::shared_ptr& op); - void executeOptimized6D(const std::unique_ptr &pKernel, const jit_eltwise_call_args_ptrs &args_ptrs, - const VectorDims &dims_out) const; - void executeOptimizedGeneric(const std::unique_ptr &pKernel, const jit_eltwise_call_args_ptrs &args_ptrs, - const VectorDims &dims_out, const size_t schedulerWorkAmount) const; - void executeReference(const jit_eltwise_params &jep, const 
jit_eltwise_call_args_ptrs &args_ptrs, const VectorDims &dims_out, - const size_t fullWorkAmount) const; - - void offset_out_calc(VectorDims& offset, VectorDims& dims); - void offset_in_calc(VectorDims& offset, VectorDims& dims_in, VectorDims& dims_out); - size_t getOpInputsNum() const; }; diff --git a/src/plugins/intel_cpu/src/nodes/mkldnn_interpolate_node.cpp b/src/plugins/intel_cpu/src/nodes/mkldnn_interpolate_node.cpp index 9ccd5e86e98..704ece9719d 100644 --- a/src/plugins/intel_cpu/src/nodes/mkldnn_interpolate_node.cpp +++ b/src/plugins/intel_cpu/src/nodes/mkldnn_interpolate_node.cpp @@ -76,7 +76,7 @@ struct jit_uni_interpolate_kernel_f32 : public jit_uni_interpolate_kernel, publi } if (!mayiuse(avx512_core_bf16) && mayiuse(avx512_core)) - emu_vcvtneps2bf16.reset(new jit_emu_vcvtneps2bf16(this, isa, nullptr)); + emu_vcvtneps2bf16.reset(new jit_emu_vcvtneps2bf16(this, isa)); this->preamble(); diff --git a/src/plugins/intel_cpu/src/nodes/mkldnn_matmul_node.cpp b/src/plugins/intel_cpu/src/nodes/mkldnn_matmul_node.cpp index 5513a27f0d2..98889274ff7 100644 --- a/src/plugins/intel_cpu/src/nodes/mkldnn_matmul_node.cpp +++ b/src/plugins/intel_cpu/src/nodes/mkldnn_matmul_node.cpp @@ -13,7 +13,6 @@ #include #include #include -#include #include "common/cpu_memcpy.h" #include #include "memory_desc/dnnl_blocked_memory_desc.h" @@ -21,12 +20,63 @@ #include "utils/general_utils.h" #include "memory_desc/cpu_memory_desc_utils.h" #include "mkldnn_extension_utils.h" -#include "utils/cpu_utils.hpp" +#include using namespace mkldnn; using namespace MKLDNNPlugin; using namespace InferenceEngine; +namespace { +struct MatMulKey { + DnnlMemoryDescCPtr inp0; + DnnlMemoryDescCPtr inp1; + DnnlMemoryDescCPtr bias; + DnnlMemoryDescCPtr out; + mkldnn::primitive_attr attr; + impl_desc_type implType; + + size_t hash() const; + bool operator==(const MatMulKey& rhs) const; +}; + +size_t MatMulKey::hash() const { + using namespace dnnl::impl; + using namespace dnnl::impl::primitive_hashing; + + 
size_t seed = 0; + + for (const auto& ptr : {inp0, inp1, bias, out}) { + if (ptr) { + seed = hash_combine(seed, get_md_hash(ptr->getDnnlDesc().data)); + } + } + + seed = hash_combine(seed, get_attr_hash(*attr.get())); + seed = hash_combine(seed, implType); + return seed; +} + +bool MatMulKey::operator==(const MatMulKey &rhs) const { + bool retVal = true; + if (inp0 != rhs.inp0) { + retVal = retVal && inp0 && rhs.inp0 && inp0->getDnnlDesc() == rhs.inp0->getDnnlDesc(); + } + if (inp1 != rhs.inp1) { + retVal = retVal && inp1 && rhs.inp1 && inp1->getDnnlDesc() == rhs.inp1->getDnnlDesc(); + } + if (bias != rhs.bias) { + retVal = retVal && bias && rhs.bias && bias->getDnnlDesc() == rhs.bias->getDnnlDesc(); + } + if (out != rhs.out) { + retVal = retVal && out && rhs.out && out->getDnnlDesc() == rhs.out->getDnnlDesc(); + } + retVal = retVal && *attr.get() == *rhs.attr.get() && + implType == rhs.implType; + return retVal; +} + +} // namespace + bool MKLDNNMatMulNode::isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept { try { const auto matMul = std::dynamic_pointer_cast(op); @@ -377,35 +427,58 @@ void MKLDNNMatMulNode::prepareParams() { auto dstDnnlDesc = dstMemPtr->GetDescWithType(); - std::shared_ptr matmul_desc; - + DnnlMemoryDescPtr dnnlBiasMemDesc = nullptr; if (withBiases) { - matmul_desc.reset(new mkldnn::matmul::desc{src0TransposedDesc->getDnnlDesc(), - src1TransposedDesc->getDnnlDesc(), - getBiasDescFrom(dstDnnlDesc), - dstDnnlDesc->getDnnlDesc()}); - } else { - matmul_desc.reset(new mkldnn::matmul::desc(src0TransposedDesc->getDnnlDesc(), - src1TransposedDesc->getDnnlDesc(), - dstDnnlDesc->getDnnlDesc())); + auto& biasMemory = getParentEdgeAt(2)->getMemoryPtr(); + if (!biasMemory || !biasMemory->GetPrimitivePtr()) + IE_THROW() << errorPrefix << " did not allocate bias memory"; + dnnlBiasMemDesc = biasMemory->GetDescWithType(); } - MKLDNNDescriptor desc(matmul_desc); - primitive_desc_iterator itpd = 
desc.createPrimitiveDescriptorIterator(getEngine(), *attr); - matmul::primitive_desc prim_desc; + MatMulKey key = {src0TransposedDesc, src1TransposedDesc, dnnlBiasMemDesc, + dstDnnlDesc, *attr, selected_pd->getImplementationType()}; - while (static_cast(itpd)) { - impl_desc_type impl_type = parse_impl_name(itpd.impl_info_str()); + auto engine = getEngine(); - if (impl_type == selected_pd->getImplementationType()) { - prim_desc = itpd.get(); - break; + auto builder = [&engine](const MatMulKey& key) -> std::shared_ptr { + std::shared_ptr matmul_desc; + + if (key.bias) { + matmul_desc.reset(new mkldnn::matmul::desc{key.inp0->getDnnlDesc(), + key.inp1->getDnnlDesc(), + key.bias->getDnnlDesc(), + key.out->getDnnlDesc()}); + } else { + matmul_desc.reset(new mkldnn::matmul::desc(key.inp0->getDnnlDesc(), + key.inp1->getDnnlDesc(), + key.out->getDnnlDesc())); } - if (!itpd.next_impl()) - IE_THROW() << "Primitive descriptor was not found for node " << getName() << "."; + + MKLDNNDescriptor desc(matmul_desc); + primitive_desc_iterator itpd = desc.createPrimitiveDescriptorIterator(engine, key.attr); + matmul::primitive_desc prim_desc; + + while (static_cast(itpd)) { + impl_desc_type impl_type = parse_impl_name(itpd.impl_info_str()); + + if (impl_type == key.implType) { + prim_desc = itpd.get(); + break; + } + if (!itpd.next_impl()) + return nullptr; + } + return std::make_shared(prim_desc); + }; + + auto cache = getRuntimeCache(); + auto result = cache->getOrCreate(key, builder); + + if (!result.first) { + IE_THROW() << "Primitive descriptor was not found for node " << getName() << "."; } - prim.reset(new matmul(prim_desc)); + prim = result.first; primArgs[DNNL_ARG_SRC_0] = src0MemPtr->GetPrimitive(); primArgs[DNNL_ARG_WEIGHTS_0] = src1MemPtr->GetPrimitive(); diff --git a/src/plugins/intel_cpu/src/nodes/mkldnn_matmul_node.h b/src/plugins/intel_cpu/src/nodes/mkldnn_matmul_node.h index 2d7e8c00e60..9bc864de05f 100644 --- a/src/plugins/intel_cpu/src/nodes/mkldnn_matmul_node.h +++ 
b/src/plugins/intel_cpu/src/nodes/mkldnn_matmul_node.h @@ -59,7 +59,6 @@ private: std::array inDataDesc; DnnlBlockedMemoryDescPtr outDataDesc; - AttrPtr pAttr; }; } // namespace MKLDNNPlugin diff --git a/src/plugins/intel_cpu/src/nodes/mkldnn_mvn_node.cpp b/src/plugins/intel_cpu/src/nodes/mkldnn_mvn_node.cpp index d7cc67f8a51..706775d1f59 100644 --- a/src/plugins/intel_cpu/src/nodes/mkldnn_mvn_node.cpp +++ b/src/plugins/intel_cpu/src/nodes/mkldnn_mvn_node.cpp @@ -59,7 +59,7 @@ struct jit_uni_mvn_mean_variance_kernel_f32 : public jit_uni_mvn_mean_variance_k } void generate() override { - load_emitter.reset(new jit_load_emitter(this, isa, nullptr)); + load_emitter.reset(new jit_load_emitter(this, isa)); this->preamble(); mov(reg_src, ptr[reg_params + GET_OFF(src)]); @@ -384,8 +384,8 @@ struct jit_uni_mvn_kernel_f32 : public jit_uni_mvn_kernel, public jit_generator } } - load_emitter.reset(new jit_load_emitter(this, isa, nullptr)); - store_emitter.reset(new jit_store_emitter(this, isa, nullptr)); + load_emitter.reset(new jit_load_emitter(this, isa)); + store_emitter.reset(new jit_store_emitter(this, isa)); this->preamble(); diff --git a/src/plugins/intel_cpu/src/nodes/mkldnn_non_max_suppression_node.cpp b/src/plugins/intel_cpu/src/nodes/mkldnn_non_max_suppression_node.cpp index 8fe284f7d82..3cc419f03dd 100644 --- a/src/plugins/intel_cpu/src/nodes/mkldnn_non_max_suppression_node.cpp +++ b/src/plugins/intel_cpu/src/nodes/mkldnn_non_max_suppression_node.cpp @@ -41,8 +41,8 @@ struct jit_uni_nms_kernel_f32 : public jit_uni_nms_kernel, public jit_generator } void generate() override { - load_emitter.reset(new jit_load_emitter(this, isa, nullptr)); - store_emitter.reset(new jit_store_emitter(this, isa, nullptr)); + load_emitter.reset(new jit_load_emitter(this, isa)); + store_emitter.reset(new jit_store_emitter(this, isa)); exp_injector.reset(new jit_uni_eltwise_injector_f32(this, mkldnn::impl::alg_kind::eltwise_exp, 0.f, 0.f, 1.0f)); this->preamble(); diff --git 
a/src/plugins/intel_cpu/src/nodes/mkldnn_normalize_node.cpp b/src/plugins/intel_cpu/src/nodes/mkldnn_normalize_node.cpp index 3de521da034..219726dd41a 100644 --- a/src/plugins/intel_cpu/src/nodes/mkldnn_normalize_node.cpp +++ b/src/plugins/intel_cpu/src/nodes/mkldnn_normalize_node.cpp @@ -189,7 +189,7 @@ struct jit_uni_normalize_kernel_f32 : public jit_uni_normalize_kernel, public ji } if (!mayiuse(avx512_core_bf16) && mayiuse(avx512_core)) - emu_vcvtneps2bf16.reset(new jit_emu_vcvtneps2bf16(this, isa, nullptr)); + emu_vcvtneps2bf16.reset(new jit_emu_vcvtneps2bf16(this, isa)); this->preamble(); @@ -1356,10 +1356,16 @@ private: auto quant = post_op.quantization; - float crop_low = quant.crop_low_data->shifts_[quant.crop_low_data->count_ == 1 ? 0 : index_c]; - float crop_high = quant.crop_high_data->shifts_[quant.crop_high_data->count_ == 1 ? 0 : index_c]; - float input_scale = quant.input_scale_data->scales_[quant.input_scale_data->count_ == 1 ? 0 : index_c]; - float input_shift = quant.input_shift_data->shifts_[quant.input_shift_data->count_ == 1 ? 0 : index_c]; + using quantization_fields = post_ops_t::entry_t::quantization_t::quantization_fields; + auto dataVal = [&](const quantization_fields& field) { + const int channelIdx = quant.per_channel[field] ? index_c : 0; + return quant.data[field][channelIdx]; + }; + + float crop_low = dataVal(quant.crop_low); + float crop_high = dataVal(quant.crop_high); + float input_scale = dataVal(quant.inp_scale); + float input_shift = dataVal(quant.inp_shift); dst_value = nstl::min(crop_high, nstl::max(crop_low, dst_value)); dst_value = dst_value * input_scale + input_shift; @@ -1369,8 +1375,8 @@ private: } if (do_dequantization) { - float output_scale = quant.output_scale_data->scales_[quant.output_scale_data->count_ == 1 ? 0 : index_c]; - float output_shift = quant.output_shift_data->shifts_[quant.output_shift_data->count_ == 1 ? 
0 : index_c]; + float output_scale = dataVal(quant.output_scale); + float output_shift = dataVal(quant.output_shift); dst_value = dst_value * output_scale + output_shift; } } diff --git a/src/plugins/intel_cpu/src/nodes/mkldnn_reduce_node.cpp b/src/plugins/intel_cpu/src/nodes/mkldnn_reduce_node.cpp index d236a9b45b5..65f186d93f1 100644 --- a/src/plugins/intel_cpu/src/nodes/mkldnn_reduce_node.cpp +++ b/src/plugins/intel_cpu/src/nodes/mkldnn_reduce_node.cpp @@ -93,7 +93,7 @@ struct jit_uni_reduce_kernel_f32 : public jit_uni_reduce_kernel, public jit_gene } if (!mayiuse(avx512_core_bf16) && mayiuse(avx512_core)) - emu_vcvtneps2bf16 = std::make_shared(this, isa, nullptr); + emu_vcvtneps2bf16 = std::make_shared(this, isa); this->preamble(); @@ -1075,7 +1075,7 @@ struct jit_uni_reduce_post_kernel_f32 : public jit_uni_reduce_post_kernel, publi } if (!mayiuse(avx512_core_bf16) && mayiuse(avx512_core)) - emu_vcvtneps2bf16 = std::make_shared(this, isa, nullptr); + emu_vcvtneps2bf16 = std::make_shared(this, isa); this->preamble(); diff --git a/src/plugins/intel_cpu/src/nodes/mkldnn_region_yolo_node.cpp b/src/plugins/intel_cpu/src/nodes/mkldnn_region_yolo_node.cpp index 84666f3ce2e..60ce04587ee 100644 --- a/src/plugins/intel_cpu/src/nodes/mkldnn_region_yolo_node.cpp +++ b/src/plugins/intel_cpu/src/nodes/mkldnn_region_yolo_node.cpp @@ -39,7 +39,7 @@ struct jit_uni_logistic_kernel_f32 : public jit_uni_logistic_kernel, public jit_ exp_injector.reset(new jit_uni_eltwise_injector_f32(this, mkldnn::impl::alg_kind::eltwise_exp, 0.f, 0.f, 1.f)); if (!mayiuse(avx512_core_bf16) && mayiuse(avx512_core)) - emu_vcvtneps2bf16.reset(new jit_emu_vcvtneps2bf16(this, isa, nullptr)); + emu_vcvtneps2bf16.reset(new jit_emu_vcvtneps2bf16(this, isa)); this->preamble(); diff --git a/src/plugins/intel_cpu/src/nodes/mkldnn_roi_pooling_node.cpp b/src/plugins/intel_cpu/src/nodes/mkldnn_roi_pooling_node.cpp index ee126838ca6..572247387e3 100644 --- 
a/src/plugins/intel_cpu/src/nodes/mkldnn_roi_pooling_node.cpp +++ b/src/plugins/intel_cpu/src/nodes/mkldnn_roi_pooling_node.cpp @@ -44,8 +44,8 @@ struct jit_uni_roi_pooling_kernel_f32 : public jit_uni_roi_pooling_kernel, publi }; void generate() override { - load_emitter.reset(new jit_load_emitter(this, isa, nullptr)); - store_emitter.reset(new jit_store_emitter(this, isa, nullptr)); + load_emitter.reset(new jit_load_emitter(this, isa)); + store_emitter.reset(new jit_store_emitter(this, isa)); this->preamble(); diff --git a/src/plugins/intel_cpu/thirdparty/mkl-dnn b/src/plugins/intel_cpu/thirdparty/mkl-dnn index 7cd4218a3d4..f06708e9cf6 160000 --- a/src/plugins/intel_cpu/thirdparty/mkl-dnn +++ b/src/plugins/intel_cpu/thirdparty/mkl-dnn @@ -1 +1 @@ -Subproject commit 7cd4218a3d45c2c7e7b321dc825c3635a723b60e +Subproject commit f06708e9cf6c3973efee9d2a1a4df086050e1fcd diff --git a/src/tests/functional/plugin/cpu/single_layer_tests/eltwise.cpp b/src/tests/functional/plugin/cpu/single_layer_tests/eltwise.cpp index ed20233d782..898b16566c8 100644 --- a/src/tests/functional/plugin/cpu/single_layer_tests/eltwise.cpp +++ b/src/tests/functional/plugin/cpu/single_layer_tests/eltwise.cpp @@ -1,524 +1,676 @@ -//// Copyright (C) 2018-2021 Intel Corporation -//// SPDX-License-Identifier: Apache-2.0 -//// -// -//#include -//#include -//#include "test_utils/cpu_test_utils.hpp" -// -//using namespace InferenceEngine; -//using namespace CPUTestUtils; -// -//namespace CPULayerTestsDefinitions { -// -//typedef std::tuple< -// LayerTestsDefinitions::EltwiseTestParams, -// CPUSpecificParams> EltwiseLayerCPUTestParamsSet; -// -//class EltwiseLayerCPUTest : public testing::WithParamInterface, -// virtual public LayerTestsUtils::LayerTestsCommon, public CPUTestsBase { -//public: -// static std::string getTestCaseName(testing::TestParamInfo obj) { -// LayerTestsDefinitions::EltwiseTestParams basicParamsSet; -// CPUSpecificParams cpuParams; -// std::tie(basicParamsSet, cpuParams) = obj.param; 
-// -// std::ostringstream result; -// result << LayerTestsDefinitions::EltwiseLayerTest::getTestCaseName(testing::TestParamInfo( -// basicParamsSet, 0)); -// result << CPUTestsBase::getTestCaseName(cpuParams); -// -// return result.str(); -// } -// -//protected: -// void SetUp() override { -// LayerTestsDefinitions::EltwiseTestParams basicParamsSet; -// CPUSpecificParams cpuParams; -// std::tie(basicParamsSet, cpuParams) = this->GetParam(); -// -// std::pair, std::vector>> shapes; -// InferenceEngine::Precision netPrecision; -// ngraph::helpers::InputLayerType secondaryInputType; -// CommonTestUtils::OpType opType; -// ngraph::helpers::EltwiseTypes eltwiseType; -// std::map additional_config; -// std::tie(shapes, eltwiseType, secondaryInputType, opType, netPrecision, inPrc, outPrc, inLayout, targetDevice, additional_config) = basicParamsSet; -// targetStaticShapes = shapes.second; -// inputDynamicShapes = shapes.first; -// std::tie(inFmts, outFmts, priority, selectedType) = cpuParams; -// auto ngPrc = FuncTestUtils::PrecisionUtils::convertIE2nGraphPrc(netPrecision); -// -// selectedType = getPrimitiveType() + "_" + netPrecision.name(); -// -// ngraph::Shape inputShape1 = targetStaticShapes.front().front(), inputShape2 = targetStaticShapes.front().back(); -// if (targetStaticShapes.front().size() == 1) { -// inputShape1 = inputShape2 = targetStaticShapes.front().front(); -// } else if (targetStaticShapes.front().size() == 2) { -// inputShape1 = targetStaticShapes.front().front(); -// inputShape2 = targetStaticShapes.front().back(); -// } else { -// IE_THROW() << "Incorrect number of input shapes"; -// } -// -// configuration.insert(additional_config.begin(), additional_config.end()); -// auto input = ngraph::builder::makeParams(ngPrc, {inputShape1}); -// -// std::vector shape_input_secondary; -// switch (opType) { -// case CommonTestUtils::OpType::SCALAR: { -// shape_input_secondary = std::vector({1}); -// break; -// } -// case CommonTestUtils::OpType::VECTOR: -// 
shape_input_secondary = inputShape2; -// break; -// default: -// FAIL() << "Unsupported Secondary operation type"; -// } -// -// std::shared_ptr secondaryInput; -// if (eltwiseType == ngraph::helpers::EltwiseTypes::DIVIDE || -// eltwiseType == ngraph::helpers::EltwiseTypes::MOD) { -// std::vector data(ngraph::shape_size(shape_input_secondary)); -// data = NGraphFunctions::Utils::generateVector(ngraph::shape_size(shape_input_secondary), 10, 2); -// secondaryInput = ngraph::builder::makeConstant(ngPrc, shape_input_secondary, data); -// } else if (eltwiseType == ngraph::helpers::EltwiseTypes::FLOOR_MOD) { -// int negative_data_size = ngraph::shape_size(shape_input_secondary) / 2; -// int positive_data_size = ngraph::shape_size(shape_input_secondary) - negative_data_size; -// std::vector negative_data(negative_data_size); -// std::vector data(positive_data_size); -// negative_data = NGraphFunctions::Utils::generateVector(negative_data_size, -10, -2); -// data = NGraphFunctions::Utils::generateVector(positive_data_size, 10, 2); -// data.insert(data.end(), negative_data.begin(), negative_data.end()); -// secondaryInput = ngraph::builder::makeConstant(ngPrc, shape_input_secondary, data); -// } else { -// secondaryInput = ngraph::builder::makeInputLayer(ngPrc, secondaryInputType, shape_input_secondary); -// if (secondaryInputType == ngraph::helpers::InputLayerType::PARAMETER) { -// input.push_back(std::dynamic_pointer_cast(secondaryInput)); -// } -// } -// -// auto eltwise = ngraph::builder::makeEltwise(input[0], secondaryInput, eltwiseType); -// -// function = makeNgraphFunction(ngPrc, input, eltwise, "Eltwise"); -// } -//}; -// -//TEST_P(EltwiseLayerCPUTest, CompareWithRefs) { -// SKIP_IF_CURRENT_TEST_IS_DISABLED() -// -// Run(); -// CheckPluginRelatedResults(executableNetwork, "Eltwise"); -//} -// -//namespace { -// -//std::vector secondaryInputTypes = { -// ngraph::helpers::InputLayerType::CONSTANT, -// ngraph::helpers::InputLayerType::PARAMETER, -//}; -// 
-//std::vector opTypes = { -// CommonTestUtils::OpType::VECTOR, -//}; -// -//std::vector eltwiseOpTypesBinInp = { -// ngraph::helpers::EltwiseTypes::ADD, -// ngraph::helpers::EltwiseTypes::MULTIPLY, -// ngraph::helpers::EltwiseTypes::SUBTRACT, -// ngraph::helpers::EltwiseTypes::DIVIDE, -// ngraph::helpers::EltwiseTypes::FLOOR_MOD, -// ngraph::helpers::EltwiseTypes::SQUARED_DIFF, -//}; -// -//std::vector eltwiseOpTypesDiffInp = { // Different number of input nodes depending on optimizations -// ngraph::helpers::EltwiseTypes::POWER, -// // ngraph::helpers::EltwiseTypes::MOD // Does not execute because of transformations -//}; -// -//std::map additional_config; -// -//std::vector netPrc = {Precision::BF16, Precision::FP32}; -// -//std::vector cpuParams_4D = { -// CPUSpecificParams({nChw16c, nChw16c}, {nChw16c}, {}, {}), -// CPUSpecificParams({nhwc, nhwc}, {nhwc}, {}, {}), -// CPUSpecificParams({nchw, nchw}, {nchw}, {}, {}) -//}; -// -//std::vector cpuParams_5D = { -// CPUSpecificParams({nCdhw16c, nCdhw16c}, {nCdhw16c}, {}, {}), -// CPUSpecificParams({ndhwc, ndhwc}, {ndhwc}, {}, {}), -// CPUSpecificParams({ncdhw, ncdhw}, {ncdhw}, {}, {}) -//}; -// -//std::vector, std::vector>>> inShapes_4D = { -// {{}, {{{2, 4, 4, 1}}}}, -// {{}, {{{2, 17, 5, 4}}}}, -// {{}, {{{2, 17, 5, 4}, {1, 17, 1, 1}}}}, -// {{}, {{{2, 17, 5, 1}, {1, 17, 1, 4}}}}, -//}; -// -//const auto params_4D = ::testing::Combine( -// ::testing::Combine( -// ::testing::ValuesIn(inShapes_4D), -// ::testing::ValuesIn(eltwiseOpTypesBinInp), -// ::testing::ValuesIn(secondaryInputTypes), -// ::testing::ValuesIn(opTypes), -// ::testing::ValuesIn(netPrc), -// ::testing::Values(InferenceEngine::Precision::FP32), -// ::testing::Values(InferenceEngine::Precision::FP32), -// ::testing::Values(InferenceEngine::Layout::ANY), -// ::testing::Values(CommonTestUtils::DEVICE_CPU), -// ::testing::Values(additional_config)), -// ::testing::ValuesIn(filterCPUSpecificParams(cpuParams_4D))); -// 
-//INSTANTIATE_TEST_SUITE_P(smoke_CompareWithRefs_4D_MemOrder, EltwiseLayerCPUTest, params_4D, EltwiseLayerCPUTest::getTestCaseName); -// -//const auto params_4D_emptyCPUSpec = ::testing::Combine( -// ::testing::Combine( -// ::testing::ValuesIn(inShapes_4D), -// ::testing::ValuesIn(eltwiseOpTypesDiffInp), -// ::testing::ValuesIn(secondaryInputTypes), -// ::testing::ValuesIn(opTypes), -// ::testing::ValuesIn(netPrc), -// ::testing::Values(InferenceEngine::Precision::FP32), -// ::testing::Values(InferenceEngine::Precision::FP32), -// ::testing::Values(InferenceEngine::Layout::ANY), -// ::testing::Values(CommonTestUtils::DEVICE_CPU), -// ::testing::Values(additional_config)), -// ::testing::Values(emptyCPUSpec)); -// -//INSTANTIATE_TEST_SUITE_P(smoke_CompareWithRefs_4D_emptyCPUSpec, EltwiseLayerCPUTest, params_4D_emptyCPUSpec, EltwiseLayerCPUTest::getTestCaseName); -// -//std::vector, std::vector>>> inShapes_5D = { -// {{}, {{{2, 4, 3, 4, 1}}}}, -// {{}, {{{2, 17, 7, 5, 4}}}}, -// {{}, {{{2, 17, 6, 5, 4}, {1, 17, 6, 1, 1}}}}, -// {{}, {{{2, 17, 6, 5, 1}, {1, 17, 1, 1, 4}}}}, -//}; -// -//const auto params_5D = ::testing::Combine( -// ::testing::Combine( -// ::testing::ValuesIn(inShapes_5D), -// ::testing::ValuesIn(eltwiseOpTypesBinInp), -// ::testing::ValuesIn(secondaryInputTypes), -// ::testing::ValuesIn(opTypes), -// ::testing::ValuesIn(netPrc), -// ::testing::Values(InferenceEngine::Precision::FP32), -// ::testing::Values(InferenceEngine::Precision::FP32), -// ::testing::Values(InferenceEngine::Layout::ANY), -// ::testing::Values(CommonTestUtils::DEVICE_CPU), -// ::testing::Values(additional_config)), -// ::testing::ValuesIn(filterCPUSpecificParams(cpuParams_5D))); -// -//INSTANTIATE_TEST_SUITE_P(smoke_CompareWithRefs_5D_MemOrder, EltwiseLayerCPUTest, params_5D, EltwiseLayerCPUTest::getTestCaseName); -// -//const auto params_5D_emptyCPUSpec = ::testing::Combine( -// ::testing::Combine( -// ::testing::ValuesIn(inShapes_5D), -// 
::testing::ValuesIn(eltwiseOpTypesDiffInp), -// ::testing::ValuesIn(secondaryInputTypes), -// ::testing::ValuesIn(opTypes), -// ::testing::ValuesIn(netPrc), -// ::testing::Values(InferenceEngine::Precision::FP32), -// ::testing::Values(InferenceEngine::Precision::FP32), -// ::testing::Values(InferenceEngine::Layout::ANY), -// ::testing::Values(CommonTestUtils::DEVICE_CPU), -// ::testing::Values(additional_config)), -// ::testing::Values(emptyCPUSpec)); -// -//INSTANTIATE_TEST_SUITE_P(smoke_CompareWithRefs_5D, EltwiseLayerCPUTest, params_5D_emptyCPUSpec, EltwiseLayerCPUTest::getTestCaseName); -// -//std::vector, std::vector>>> inShapes_4D_Blocked_Planar = { -// {{}, {{{2, 17, 31, 3}, {2, 1, 31, 3}}}}, -// {{}, {{{2, 17, 5, 1}, {2, 1, 1, 4}}}}, -//}; -// -//std::vector cpuParams_4D_Blocked_Planar = { -// CPUSpecificParams({nChw16c, nchw}, {nChw16c}, {}, {}), -//}; -// -//const auto params_4D_Blocked_Planar = ::testing::Combine( -// ::testing::Combine( -// ::testing::ValuesIn(inShapes_4D_Blocked_Planar), -// ::testing::ValuesIn(eltwiseOpTypesBinInp), -// ::testing::Values(ngraph::helpers::InputLayerType::CONSTANT), -// ::testing::ValuesIn(opTypes), -// ::testing::ValuesIn(netPrc), -// ::testing::Values(InferenceEngine::Precision::FP32), -// ::testing::Values(InferenceEngine::Precision::FP32), -// ::testing::Values(InferenceEngine::Layout::ANY), -// ::testing::Values(CommonTestUtils::DEVICE_CPU), -// ::testing::Values(additional_config)), -// ::testing::ValuesIn(filterCPUSpecificParams(cpuParams_4D_Blocked_Planar))); -// -//INSTANTIATE_TEST_SUITE_P(smoke_CompareWithRefs_4D_Blocked_Planar, EltwiseLayerCPUTest, params_4D_Blocked_Planar, EltwiseLayerCPUTest::getTestCaseName); -// -// -//std::vector, std::vector>>> inShapes_4D_Planar_Blocked = { -// {{}, {{{2, 1, 31, 3}, {2, 17, 31, 3}}}}, -// {{}, {{{2, 1, 1, 4}, {2, 17, 5, 1}}}}, -//}; -// -//std::vector cpuParams_4D_Planar_Blocked = { -// CPUSpecificParams({nchw, nChw16c}, {nChw16c}, {}, {}), -//}; -// -//const auto 
params_4D_Planar_Blocked = ::testing::Combine( -// ::testing::Combine( -// ::testing::ValuesIn(inShapes_4D_Planar_Blocked), -// ::testing::ValuesIn(eltwiseOpTypesBinInp), -// ::testing::Values(ngraph::helpers::InputLayerType::CONSTANT), -// ::testing::ValuesIn(opTypes), -// ::testing::ValuesIn(netPrc), -// ::testing::Values(InferenceEngine::Precision::FP32), -// ::testing::Values(InferenceEngine::Precision::FP32), -// ::testing::Values(InferenceEngine::Layout::ANY), -// ::testing::Values(CommonTestUtils::DEVICE_CPU), -// ::testing::Values(additional_config)), -// ::testing::ValuesIn(filterCPUSpecificParams(cpuParams_4D_Planar_Blocked))); -// -//INSTANTIATE_TEST_SUITE_P(smoke_CompareWithRefs_4D_Planar_Blocked, EltwiseLayerCPUTest, params_4D_Planar_Blocked, EltwiseLayerCPUTest::getTestCaseName); -// -// -//std::vector, std::vector>>> inShapes_5D_Blocked_Planar = { -// {{}, {{{2, 17, 31, 4, 3}, {2, 1, 31, 1, 3}}}}, -// {{}, {{{2, 17, 5, 3, 1}, {2, 1, 1, 3, 4}}}}, -//}; -// -//std::vector cpuParams_5D_Blocked_Planar = { -// CPUSpecificParams({nCdhw16c, ncdhw}, {nCdhw16c}, {}, {}), -//}; -// -//const auto params_5D_Blocked_Planar = ::testing::Combine( -// ::testing::Combine( -// ::testing::ValuesIn(inShapes_5D_Blocked_Planar), -// ::testing::ValuesIn(eltwiseOpTypesBinInp), -// ::testing::Values(ngraph::helpers::InputLayerType::CONSTANT), -// ::testing::ValuesIn(opTypes), -// ::testing::ValuesIn(netPrc), -// ::testing::Values(InferenceEngine::Precision::FP32), -// ::testing::Values(InferenceEngine::Precision::FP32), -// ::testing::Values(InferenceEngine::Layout::ANY), -// ::testing::Values(CommonTestUtils::DEVICE_CPU), -// ::testing::Values(additional_config)), -// ::testing::ValuesIn(filterCPUSpecificParams(cpuParams_5D_Blocked_Planar))); -// -//INSTANTIATE_TEST_SUITE_P(smoke_CompareWithRefs_5D_Blocked_Planar, EltwiseLayerCPUTest, params_5D_Blocked_Planar, EltwiseLayerCPUTest::getTestCaseName); -// -// -//std::vector, std::vector>>> inShapes_5D_Planar_Blocked = { -// 
{{}, {{{2, 1, 31, 1, 3}, {2, 17, 31, 4, 3}}}}, -// {{}, {{{2, 1, 1, 3, 4}, {2, 17, 5, 3, 1}}}}, -//}; -// -//std::vector cpuParams_5D_Planar_Blocked = { -// CPUSpecificParams({ncdhw, nCdhw16c}, {nCdhw16c}, {}, {}), -//}; -// -//const auto params_5D_Planar_Blocked = ::testing::Combine( -// ::testing::Combine( -// ::testing::ValuesIn(inShapes_5D_Planar_Blocked), -// ::testing::ValuesIn(eltwiseOpTypesBinInp), -// ::testing::Values(ngraph::helpers::InputLayerType::CONSTANT), -// ::testing::ValuesIn(opTypes), -// ::testing::ValuesIn(netPrc), -// ::testing::Values(InferenceEngine::Precision::FP32), -// ::testing::Values(InferenceEngine::Precision::FP32), -// ::testing::Values(InferenceEngine::Layout::ANY), -// ::testing::Values(CommonTestUtils::DEVICE_CPU), -// ::testing::Values(additional_config)), -// ::testing::ValuesIn(filterCPUSpecificParams(cpuParams_5D_Planar_Blocked))); -// -//INSTANTIATE_TEST_SUITE_P(smoke_CompareWithRefs_5D_Planar_Blocked, EltwiseLayerCPUTest, params_5D_Planar_Blocked, EltwiseLayerCPUTest::getTestCaseName); -// -// -//std::vector, std::vector>>> inShapes_4D_1D = { -// {{}, {{{2, 17, 5, 4}, {4}}}}, -// {{}, {{{1, 3, 3, 3}, {3}}}}, -//}; -// -//std::vector cpuParams_4D_1D = { -// CPUSpecificParams({nChw16c, x}, {nChw16c}, {}, {}), -// CPUSpecificParams({nhwc, x}, {nhwc}, {}, {}), -// CPUSpecificParams({nchw, x}, {nchw}, {}, {}) -//}; -// -//const auto params_4D_1D = ::testing::Combine( -// ::testing::Combine( -// ::testing::ValuesIn(inShapes_4D_1D), -// ::testing::Values(ngraph::helpers::EltwiseTypes::ADD, ngraph::helpers::EltwiseTypes::MULTIPLY), -// ::testing::ValuesIn(secondaryInputTypes), -// ::testing::ValuesIn(opTypes), -// ::testing::ValuesIn(netPrc), -// ::testing::Values(InferenceEngine::Precision::FP32), -// ::testing::Values(InferenceEngine::Precision::FP32), -// ::testing::Values(InferenceEngine::Layout::ANY), -// ::testing::Values(CommonTestUtils::DEVICE_CPU), -// ::testing::Values(additional_config)), -// 
::testing::ValuesIn(filterCPUSpecificParams(cpuParams_4D_1D))); -// -//INSTANTIATE_TEST_SUITE_P(smoke_CompareWithRefs_4D_1D, EltwiseLayerCPUTest, params_4D_1D, EltwiseLayerCPUTest::getTestCaseName); -// -//std::vector, std::vector>>> inShapes_5D_1D = { -// {{}, {{{2, 17, 5, 4, 10}, {10}}}}, -// {{}, {{{1, 3, 3, 3, 3}, {3}}}}, -//}; -// -//std::vector cpuParams_5D_1D = { -// CPUSpecificParams({nCdhw16c, x}, {nCdhw16c}, {}, {}), -// CPUSpecificParams({ndhwc, x}, {ndhwc}, {}, {}), -// CPUSpecificParams({ncdhw, x}, {ncdhw}, {}, {}) -//}; -// -//const auto params_5D_1D = ::testing::Combine( -// ::testing::Combine( -// ::testing::ValuesIn(inShapes_5D_1D), -// ::testing::Values(ngraph::helpers::EltwiseTypes::ADD, ngraph::helpers::EltwiseTypes::MULTIPLY), -// ::testing::ValuesIn(secondaryInputTypes), -// ::testing::ValuesIn(opTypes), -// ::testing::ValuesIn(netPrc), -// ::testing::Values(InferenceEngine::Precision::FP32), -// ::testing::Values(InferenceEngine::Precision::FP32), -// ::testing::Values(InferenceEngine::Layout::ANY), -// ::testing::Values(CommonTestUtils::DEVICE_CPU), -// ::testing::Values(additional_config)), -// ::testing::ValuesIn(filterCPUSpecificParams(cpuParams_5D_1D))); -// -//INSTANTIATE_TEST_SUITE_P(smoke_CompareWithRefs_5D_1D, EltwiseLayerCPUTest, params_5D_1D, EltwiseLayerCPUTest::getTestCaseName); -// -// -//std::vector eltwiseOpTypesBinDyn = { -// ngraph::helpers::EltwiseTypes::ADD, -// ngraph::helpers::EltwiseTypes::MULTIPLY, -// ngraph::helpers::EltwiseTypes::SUBTRACT, -// ngraph::helpers::EltwiseTypes::SQUARED_DIFF, -//}; -// +// Copyright (C) 2018-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include +#include +#include "functional_test_utils/ov_tensor_utils.hpp" +#include "test_utils/fusing_test_utils.hpp" + +using namespace InferenceEngine; +using namespace CPUTestUtils; +using namespace ov::test; + +namespace CPULayerTestsDefinitions { + +typedef std::tuple< + subgraph::EltwiseTestParams, + CPUSpecificParams, + 
fusingSpecificParams> EltwiseLayerCPUTestParamsSet; + +class EltwiseLayerCPUTest : public testing::WithParamInterface, + virtual public SubgraphBaseTest, public CpuTestWithFusing { +public: + static std::string getTestCaseName(testing::TestParamInfo obj) { + subgraph::EltwiseTestParams basicParamsSet; + CPUSpecificParams cpuParams; + fusingSpecificParams fusingParams; + std::tie(basicParamsSet, cpuParams, fusingParams) = obj.param; + + std::ostringstream result; + result << subgraph::EltwiseLayerTest::getTestCaseName(testing::TestParamInfo( + basicParamsSet, 0)); + result << CPUTestsBase::getTestCaseName(cpuParams); + result << CpuTestWithFusing::getTestCaseName(fusingParams); + + return result.str(); + } + +protected: + void generate_inputs(const std::vector& targetInputStaticShapes) override { + inputs.clear(); + const auto& funcInputs = function->inputs(); + for (int i = 0; i < funcInputs.size(); ++i) { + const auto& funcInput = funcInputs[i]; + ov::runtime::Tensor tensor; + bool isReal = funcInput.get_element_type().is_real(); + switch (eltwiseType) { + case ngraph::helpers::EltwiseTypes::POWER: + case ngraph::helpers::EltwiseTypes::MOD: + case ngraph::helpers::EltwiseTypes::FLOOR_MOD: + tensor = isReal ? + ov::test::utils::create_and_fill_tensor(funcInput.get_element_type(), targetInputStaticShapes[i], 2, 2, 8) : + ov::test::utils::create_and_fill_tensor(funcInput.get_element_type(), targetInputStaticShapes[i], 4, 2); + break; + case ngraph::helpers::EltwiseTypes::DIVIDE: + tensor = isReal ? 
+ ov::test::utils::create_and_fill_tensor(funcInput.get_element_type(), targetInputStaticShapes[i], 2, 2, 8) : + ov::test::utils::create_and_fill_tensor(funcInput.get_element_type(), targetInputStaticShapes[i], 100, 101); + break; + case ngraph::helpers::EltwiseTypes::ERF: + tensor = ov::test::utils::create_and_fill_tensor(funcInput.get_element_type(), targetInputStaticShapes[i], 6, -3); + break; + default: + if (funcInput.get_element_type().is_real()) { + tensor = ov::test::utils::create_and_fill_tensor(funcInput.get_element_type(), targetInputStaticShapes[i], 80, 0, 8); + } else { + tensor = ov::test::utils::create_and_fill_tensor(funcInput.get_element_type(), targetInputStaticShapes[i]); + } + break; + } + inputs.insert({funcInput.get_node_shared_ptr(), tensor}); + } + } + + + void SetUp() override { + subgraph::EltwiseTestParams basicParamsSet; + CPUSpecificParams cpuParams; + fusingSpecificParams fusingParams; + std::tie(basicParamsSet, cpuParams, fusingParams) = this->GetParam(); + + std::vector shapes; + ElementType netType; + ngraph::helpers::InputLayerType secondaryInputType; + CommonTestUtils::OpType opType; + Config additional_config; + std::tie(shapes, eltwiseType, secondaryInputType, opType, netType, inType, outType, targetDevice, configuration) = basicParamsSet; + + if (ElementType::bf16 == netType) { + rel_threshold = 2e-2f; + } + + std::tie(inFmts, outFmts, priority, selectedType) = cpuParams; + std::tie(postOpMgrPtr, fusedOps) = fusingParams; + + selectedType = makeSelectedTypeStr(getPrimitiveType(), netType); + + shapes.resize(2); + switch (opType) { + case CommonTestUtils::OpType::SCALAR: { + std::vector identityShapes(shapes[0].second.size(), {1}); + shapes[1] = {{}, identityShapes}; + break; + } + case CommonTestUtils::OpType::VECTOR: + if (shapes[1].second.empty()) { + shapes[1] = shapes[0]; + } + break; + default: + FAIL() << "Unsupported Secondary operation type"; + } + + init_input_shapes(shapes); + + 
configuration.insert(additional_config.begin(), additional_config.end()); + auto parameters = ngraph::builder::makeDynamicParams(netType, {inputDynamicShapes.front()}); + + std::shared_ptr secondaryInput; + if (secondaryInputType == ngraph::helpers::InputLayerType::PARAMETER) { + secondaryInput = ngraph::builder::makeDynamicParams(netType, {inputDynamicShapes.back()}).front(); + parameters.push_back(std::dynamic_pointer_cast(secondaryInput)); + } else { + auto pShape = inputDynamicShapes.back(); + ngraph::Shape shape; + if (pShape.is_static()) { + shape = pShape.get_shape(); + } else { + ASSERT_TRUE(pShape.rank().is_static()); + shape = std::vector(pShape.rank().get_length(), 1); + for (size_t i = 0; i < pShape.size(); ++i) { + if (pShape[i].is_static()) { + shape[i] = pShape[i].get_length(); + } + } + } + if (eltwiseType == ngraph::helpers::EltwiseTypes::DIVIDE || + eltwiseType == ngraph::helpers::EltwiseTypes::MOD) { + std::vector data(ngraph::shape_size(shape)); + data = NGraphFunctions::Utils::generateVector(ngraph::shape_size(shape), 10, 2); + secondaryInput = ngraph::builder::makeConstant(netType, shape, data); + } else if (eltwiseType == ngraph::helpers::EltwiseTypes::FLOOR_MOD) { + auto negative_data_size = ngraph::shape_size(shape) / 2; + auto positive_data_size = ngraph::shape_size(shape) - negative_data_size; + std::vector negative_data(negative_data_size); + std::vector data(positive_data_size); + negative_data = NGraphFunctions::Utils::generateVector(negative_data_size, -10, -2); + data = NGraphFunctions::Utils::generateVector(positive_data_size, 10, 2); + data.insert(data.end(), negative_data.begin(), negative_data.end()); + secondaryInput = ngraph::builder::makeConstant(netType, shape, data); + } else if (eltwiseType == ngraph::helpers::EltwiseTypes::POWER) { + secondaryInput = ngraph::builder::makeConstant(netType, shape, {}, true, 3); + } else { + secondaryInput = ngraph::builder::makeConstant(netType, shape, {}, true); + } + } + + auto eltwise = 
ngraph::builder::makeEltwise(parameters[0], secondaryInput, eltwiseType); + + function = makeNgraphFunction(netType, parameters, eltwise, "Eltwise"); + } + +private: + ngraph::helpers::EltwiseTypes eltwiseType; +}; + +TEST_P(EltwiseLayerCPUTest, CompareWithRefs) { + SKIP_IF_CURRENT_TEST_IS_DISABLED() + + run(); + CheckPluginRelatedResults(executableNetwork, "Eltwise"); +} + +namespace { + +std::vector secondaryInputTypes = { + ngraph::helpers::InputLayerType::CONSTANT, + ngraph::helpers::InputLayerType::PARAMETER, +}; + +std::vector opTypes = { + CommonTestUtils::OpType::VECTOR, +}; + +std::vector eltwiseOpTypesBinInp = { + ngraph::helpers::EltwiseTypes::ADD, + ngraph::helpers::EltwiseTypes::MULTIPLY, + ngraph::helpers::EltwiseTypes::SUBTRACT, + ngraph::helpers::EltwiseTypes::DIVIDE, + ngraph::helpers::EltwiseTypes::FLOOR_MOD, + ngraph::helpers::EltwiseTypes::SQUARED_DIFF, +}; + +std::vector eltwiseOpTypesDiffInp = { // Different number of input nodes depending on optimizations + ngraph::helpers::EltwiseTypes::POWER, + // ngraph::helpers::EltwiseTypes::MOD // Does not execute because of transformations +}; + +std::map additional_config; + +std::vector netType = {ElementType::bf16, ElementType::f32}; + +std::vector cpuParams_4D = { + CPUSpecificParams({nChw16c, nChw16c}, {nChw16c}, {}, {}), + CPUSpecificParams({nhwc, nhwc}, {nhwc}, {}, {}), + CPUSpecificParams({nchw, nchw}, {nchw}, {}, {}) +}; + +std::vector cpuParams_5D = { + CPUSpecificParams({nCdhw16c, nCdhw16c}, {nCdhw16c}, {}, {}), + CPUSpecificParams({ndhwc, ndhwc}, {ndhwc}, {}, {}), + CPUSpecificParams({ncdhw, ncdhw}, {ncdhw}, {}, {}) +}; + +const std::vector fusingParamsSet{ + emptyFusingSpec, + // eltwise + fusingSigmoid, + fusingPRelu1D, + // depthwise + fusingReluScaleShift, + // fake quantize + fusingFakeQuantizePerTensorRelu, + fusingFakeQuantizePerChannelRelu, + fusingFQPerChannelSigmoidFQPerChannel +}; + +std::vector> inShapes_4D = { + {{2, 4, 4, 1}}, + {{2, 17, 5, 4}}, + {{2, 17, 5, 4}, {1, 17, 1, 
1}}, + {{2, 17, 5, 1}, {1, 17, 1, 4}}, +}; + +const auto params_4D = ::testing::Combine( + ::testing::Combine( + ::testing::ValuesIn(static_shapes_to_test_representation(inShapes_4D)), + ::testing::ValuesIn(eltwiseOpTypesBinInp), + ::testing::ValuesIn(secondaryInputTypes), + ::testing::ValuesIn(opTypes), + ::testing::ValuesIn(netType), + ::testing::Values(ElementType::f32), + ::testing::Values(ElementType::f32), + ::testing::Values(CommonTestUtils::DEVICE_CPU), + ::testing::Values(additional_config)), + ::testing::ValuesIn(filterCPUSpecificParams(cpuParams_4D)), + ::testing::Values(emptyFusingSpec)); + +INSTANTIATE_TEST_SUITE_P(smoke_CompareWithRefs_4D_MemOrder, EltwiseLayerCPUTest, params_4D, EltwiseLayerCPUTest::getTestCaseName); + +std::vector> inShapes_4D_fusing = { + {{2, 4, 4, 1}}, + {{2, 17, 5, 4}}, + {{2, 17, 5, 1}, {1, 17, 1, 4}}, +}; + +const auto params_4D_fusing = ::testing::Combine( + ::testing::Combine( + ::testing::ValuesIn(static_shapes_to_test_representation(inShapes_4D_fusing)), + ::testing::ValuesIn(eltwiseOpTypesBinInp), + ::testing::Values(ngraph::helpers::InputLayerType::PARAMETER), + ::testing::ValuesIn(opTypes), + ::testing::Values(ElementType::f32), + ::testing::Values(ElementType::f32), + ::testing::Values(ElementType::f32), + ::testing::Values(CommonTestUtils::DEVICE_CPU), + ::testing::Values(additional_config)), + ::testing::ValuesIn(cpuParams_4D), + ::testing::ValuesIn(fusingParamsSet)); + +INSTANTIATE_TEST_SUITE_P(smoke_CompareWithRefs_4D_Fusing, EltwiseLayerCPUTest, params_4D_fusing, EltwiseLayerCPUTest::getTestCaseName); + +const auto params_4D_emptyCPUSpec = ::testing::Combine( + ::testing::Combine( + ::testing::ValuesIn(static_shapes_to_test_representation(inShapes_4D)), + ::testing::ValuesIn(eltwiseOpTypesDiffInp), + ::testing::ValuesIn(secondaryInputTypes), + ::testing::ValuesIn(opTypes), + ::testing::ValuesIn(netType), + ::testing::Values(ElementType::f32), + ::testing::Values(ElementType::f32), + 
::testing::Values(CommonTestUtils::DEVICE_CPU), + ::testing::Values(additional_config)), + ::testing::Values(emptyCPUSpec), + ::testing::Values(emptyFusingSpec)); + +INSTANTIATE_TEST_SUITE_P(smoke_CompareWithRefs_4D_emptyCPUSpec, EltwiseLayerCPUTest, params_4D_emptyCPUSpec, EltwiseLayerCPUTest::getTestCaseName); + +std::vector> inShapes_5D = { + {{2, 4, 3, 4, 1}}, + {{2, 17, 7, 5, 4}}, + {{2, 17, 6, 5, 4}, {1, 17, 6, 1, 1}}, + {{2, 17, 6, 5, 1}, {1, 17, 1, 1, 4}}, +}; + +const auto params_5D = ::testing::Combine( + ::testing::Combine( + ::testing::ValuesIn(static_shapes_to_test_representation(inShapes_5D)), + ::testing::ValuesIn(eltwiseOpTypesBinInp), + ::testing::ValuesIn(secondaryInputTypes), + ::testing::ValuesIn(opTypes), + ::testing::ValuesIn(netType), + ::testing::Values(ElementType::f32), + ::testing::Values(ElementType::f32), + ::testing::Values(CommonTestUtils::DEVICE_CPU), + ::testing::Values(additional_config)), + ::testing::ValuesIn(filterCPUSpecificParams(cpuParams_5D)), + ::testing::Values(emptyFusingSpec)); + +INSTANTIATE_TEST_SUITE_P(smoke_CompareWithRefs_5D_MemOrder, EltwiseLayerCPUTest, params_5D, EltwiseLayerCPUTest::getTestCaseName); + +const auto params_5D_emptyCPUSpec = ::testing::Combine( + ::testing::Combine( + ::testing::ValuesIn(static_shapes_to_test_representation(inShapes_5D)), + ::testing::ValuesIn(eltwiseOpTypesDiffInp), + ::testing::ValuesIn(secondaryInputTypes), + ::testing::ValuesIn(opTypes), + ::testing::ValuesIn(netType), + ::testing::Values(ElementType::f32), + ::testing::Values(ElementType::f32), + ::testing::Values(CommonTestUtils::DEVICE_CPU), + ::testing::Values(additional_config)), + ::testing::Values(emptyCPUSpec), + ::testing::Values(emptyFusingSpec)); + +INSTANTIATE_TEST_SUITE_P(smoke_CompareWithRefs_5D, EltwiseLayerCPUTest, params_5D_emptyCPUSpec, EltwiseLayerCPUTest::getTestCaseName); + +std::vector> inShapes_4D_Blocked_Planar = { + {{2, 17, 31, 3}, {2, 1, 31, 3}}, + {{2, 17, 5, 1}, {2, 1, 1, 4}}, +}; + +std::vector 
cpuParams_4D_Blocked_Planar = { + CPUSpecificParams({nChw16c, nchw}, {nChw16c}, {}, {}), +}; + +const auto params_4D_Blocked_Planar = ::testing::Combine( + ::testing::Combine( + ::testing::ValuesIn(static_shapes_to_test_representation(inShapes_4D_Blocked_Planar)), + ::testing::ValuesIn(eltwiseOpTypesBinInp), + ::testing::Values(ngraph::helpers::InputLayerType::CONSTANT), + ::testing::ValuesIn(opTypes), + ::testing::ValuesIn(netType), + ::testing::Values(ElementType::f32), + ::testing::Values(ElementType::f32), + ::testing::Values(CommonTestUtils::DEVICE_CPU), + ::testing::Values(additional_config)), + ::testing::ValuesIn(filterCPUSpecificParams(cpuParams_4D_Blocked_Planar)), + ::testing::Values(emptyFusingSpec)); + +INSTANTIATE_TEST_SUITE_P(smoke_CompareWithRefs_4D_Blocked_Planar, EltwiseLayerCPUTest, params_4D_Blocked_Planar, EltwiseLayerCPUTest::getTestCaseName); + + +std::vector> inShapes_4D_Planar_Blocked = { + {{2, 1, 31, 3}, {2, 17, 31, 3}}, + {{2, 1, 1, 4}, {2, 17, 5, 1}}, +}; + +std::vector cpuParams_4D_Planar_Blocked = { + CPUSpecificParams({nchw, nChw16c}, {nChw16c}, {}, {}), +}; + +const auto params_4D_Planar_Blocked = ::testing::Combine( + ::testing::Combine( + ::testing::ValuesIn(static_shapes_to_test_representation(inShapes_4D_Planar_Blocked)), + ::testing::ValuesIn(eltwiseOpTypesBinInp), + ::testing::Values(ngraph::helpers::InputLayerType::CONSTANT), + ::testing::ValuesIn(opTypes), + ::testing::ValuesIn(netType), + ::testing::Values(ElementType::f32), + ::testing::Values(ElementType::f32), + ::testing::Values(CommonTestUtils::DEVICE_CPU), + ::testing::Values(additional_config)), + ::testing::ValuesIn(filterCPUSpecificParams(cpuParams_4D_Planar_Blocked)), + ::testing::Values(emptyFusingSpec)); + +INSTANTIATE_TEST_SUITE_P(smoke_CompareWithRefs_4D_Planar_Blocked, EltwiseLayerCPUTest, params_4D_Planar_Blocked, EltwiseLayerCPUTest::getTestCaseName); + + +std::vector> inShapes_5D_Blocked_Planar = { + {{2, 17, 31, 4, 3}, {2, 1, 31, 1, 3}}, + {{2, 17, 5, 3, 
1}, {2, 1, 1, 3, 4}}, +}; + +std::vector cpuParams_5D_Blocked_Planar = { + CPUSpecificParams({nCdhw16c, ncdhw}, {nCdhw16c}, {}, {}), +}; + +const auto params_5D_Blocked_Planar = ::testing::Combine( + ::testing::Combine( + ::testing::ValuesIn(static_shapes_to_test_representation(inShapes_5D_Blocked_Planar)), + ::testing::ValuesIn(eltwiseOpTypesBinInp), + ::testing::Values(ngraph::helpers::InputLayerType::CONSTANT), + ::testing::ValuesIn(opTypes), + ::testing::ValuesIn(netType), + ::testing::Values(ElementType::f32), + ::testing::Values(ElementType::f32), + ::testing::Values(CommonTestUtils::DEVICE_CPU), + ::testing::Values(additional_config)), + ::testing::ValuesIn(filterCPUSpecificParams(cpuParams_5D_Blocked_Planar)), + ::testing::Values(emptyFusingSpec)); + +INSTANTIATE_TEST_SUITE_P(smoke_CompareWithRefs_5D_Blocked_Planar, EltwiseLayerCPUTest, params_5D_Blocked_Planar, EltwiseLayerCPUTest::getTestCaseName); + + +std::vector> inShapes_5D_Planar_Blocked = { + {{2, 1, 31, 1, 3}, {2, 17, 31, 4, 3}}, + {{2, 1, 1, 3, 4}, {2, 17, 5, 3, 1}}, +}; + +std::vector cpuParams_5D_Planar_Blocked = { + CPUSpecificParams({ncdhw, nCdhw16c}, {nCdhw16c}, {}, {}), +}; + +const auto params_5D_Planar_Blocked = ::testing::Combine( + ::testing::Combine( + ::testing::ValuesIn(static_shapes_to_test_representation(inShapes_5D_Planar_Blocked)), + ::testing::ValuesIn(eltwiseOpTypesBinInp), + ::testing::Values(ngraph::helpers::InputLayerType::CONSTANT), + ::testing::ValuesIn(opTypes), + ::testing::ValuesIn(netType), + ::testing::Values(ElementType::f32), + ::testing::Values(ElementType::f32), + ::testing::Values(CommonTestUtils::DEVICE_CPU), + ::testing::Values(additional_config)), + ::testing::ValuesIn(filterCPUSpecificParams(cpuParams_5D_Planar_Blocked)), + ::testing::Values(emptyFusingSpec)); + +INSTANTIATE_TEST_SUITE_P(smoke_CompareWithRefs_5D_Planar_Blocked, EltwiseLayerCPUTest, params_5D_Planar_Blocked, EltwiseLayerCPUTest::getTestCaseName); + + +std::vector> inShapes_4D_1D = { + {{2, 17, 
5, 4}, {4}}, + {{1, 3, 3, 3}, {3}}, +}; + +std::vector cpuParams_4D_1D = { + CPUSpecificParams({nChw16c, x}, {nChw16c}, {}, {}), + CPUSpecificParams({nhwc, x}, {nhwc}, {}, {}), + CPUSpecificParams({nchw, x}, {nchw}, {}, {}) +}; + +const auto params_4D_1D = ::testing::Combine( + ::testing::Combine( + ::testing::ValuesIn(static_shapes_to_test_representation(inShapes_4D_1D)), + ::testing::Values(ngraph::helpers::EltwiseTypes::ADD, ngraph::helpers::EltwiseTypes::MULTIPLY), + ::testing::ValuesIn(secondaryInputTypes), + ::testing::ValuesIn(opTypes), + ::testing::ValuesIn(netType), + ::testing::Values(ElementType::f32), + ::testing::Values(ElementType::f32), + ::testing::Values(CommonTestUtils::DEVICE_CPU), + ::testing::Values(additional_config)), + ::testing::ValuesIn(filterCPUSpecificParams(cpuParams_4D_1D)), + ::testing::Values(emptyFusingSpec)); + +INSTANTIATE_TEST_SUITE_P(smoke_CompareWithRefs_4D_1D, EltwiseLayerCPUTest, params_4D_1D, EltwiseLayerCPUTest::getTestCaseName); + +std::vector> inShapes_5D_1D = { + {{2, 17, 5, 4, 10}, {10}}, + {{1, 3, 3, 3, 3}, {3}}, +}; + +std::vector cpuParams_5D_1D = { + CPUSpecificParams({nCdhw16c, x}, {nCdhw16c}, {}, {}), + CPUSpecificParams({ndhwc, x}, {ndhwc}, {}, {}), + CPUSpecificParams({ncdhw, x}, {ncdhw}, {}, {}) +}; + +const auto params_5D_1D = ::testing::Combine( + ::testing::Combine( + ::testing::ValuesIn(static_shapes_to_test_representation(inShapes_5D_1D)), + ::testing::Values(ngraph::helpers::EltwiseTypes::ADD, ngraph::helpers::EltwiseTypes::MULTIPLY), + ::testing::ValuesIn(secondaryInputTypes), + ::testing::ValuesIn(opTypes), + ::testing::ValuesIn(netType), + ::testing::Values(ElementType::f32), + ::testing::Values(ElementType::f32), + ::testing::Values(CommonTestUtils::DEVICE_CPU), + ::testing::Values(additional_config)), + ::testing::ValuesIn(filterCPUSpecificParams(cpuParams_5D_1D)), + ::testing::Values(emptyFusingSpec)); + +INSTANTIATE_TEST_SUITE_P(smoke_CompareWithRefs_5D_1D, EltwiseLayerCPUTest, params_5D_1D, 
EltwiseLayerCPUTest::getTestCaseName); + + +std::vector eltwiseOpTypesBinDyn = { + ngraph::helpers::EltwiseTypes::ADD, + ngraph::helpers::EltwiseTypes::MULTIPLY, + ngraph::helpers::EltwiseTypes::SUBTRACT, + ngraph::helpers::EltwiseTypes::SQUARED_DIFF, +}; + //// ============================================ 4D ============================================ -//std::vector, std::vector>>> inShapes_4D_dyn_const = { -// { -// // dynamic -// {{-1, {2, -1}, -1, -1}}, -// // target -// { -// {{3, 2, 1, 1}}, -// {{3, 2, 5, 1}}, -// {{3, 2, 1, 6}}, -// {{3, 2, 4, 11}}, -// } -// }, -//}; -// -//const auto params_4D_dyn_const = ::testing::Combine( -// ::testing::Combine( -// ::testing::ValuesIn(inShapes_4D_dyn_const), -// ::testing::ValuesIn(eltwiseOpTypesBinInp), -// ::testing::Values(ngraph::helpers::InputLayerType::CONSTANT), -// ::testing::ValuesIn(opTypes), -// ::testing::ValuesIn(netPrc), -// ::testing::Values(InferenceEngine::Precision::FP32), -// ::testing::Values(InferenceEngine::Precision::FP32), -// ::testing::Values(InferenceEngine::Layout::ANY), -// ::testing::Values(CommonTestUtils::DEVICE_CPU), -// ::testing::Values(additional_config)), -// ::testing::ValuesIn(filterCPUSpecificParams(cpuParams_4D))); -// -//INSTANTIATE_TEST_SUITE_P(smoke_CompareWithRefs_4D_MemOrder_dyn_const, EltwiseLayerCPUTest, params_4D_dyn_const, EltwiseLayerCPUTest::getTestCaseName); -// -//std::vector, std::vector>>> inShapes_4D_dyn_param = { -// { -// // dynamic -// {{-1, {2, -1}, -1, -1}, -// {-1, {2, -1}, -1, -1}}, -// // target -// { -// {{3, 2, 1, 1}, {1, 2, 5, 1}}, -// {{1, 7, 5, 1}, {3, 7, 1, 10}}, -// {{3, 3, 4, 11}, {3, 3, 4, 11}}, -// } -// }, -//}; -// -//const auto params_4D_dyn_param = ::testing::Combine( -// ::testing::Combine( -// ::testing::ValuesIn(inShapes_4D_dyn_param), -// ::testing::ValuesIn(eltwiseOpTypesBinDyn), -// ::testing::Values(ngraph::helpers::InputLayerType::PARAMETER), -// ::testing::ValuesIn(opTypes), -// ::testing::ValuesIn(netPrc), -// 
::testing::Values(InferenceEngine::Precision::FP32), -// ::testing::Values(InferenceEngine::Precision::FP32), -// ::testing::Values(InferenceEngine::Layout::ANY), -// ::testing::Values(CommonTestUtils::DEVICE_CPU), -// ::testing::Values(additional_config)), -// ::testing::ValuesIn(filterCPUSpecificParams(cpuParams_4D))); -// -//INSTANTIATE_TEST_SUITE_P(smoke_CompareWithRefs_4D_MemOrder_dyn_param, EltwiseLayerCPUTest, params_4D_dyn_param, EltwiseLayerCPUTest::getTestCaseName); -// -// +std::vector inShapes_4D_dyn_const = { + { + // dynamic + {3, 2, -1, -1}, + // target + { + {3, 2, 1, 1}, + {3, 2, 5, 1}, + {3, 2, 1, 6}, + {3, 2, 4, 11}, + } + }, +}; + +const auto params_4D_dyn_const = ::testing::Combine( + ::testing::Combine( + ::testing::Values(inShapes_4D_dyn_const), + ::testing::ValuesIn(eltwiseOpTypesBinInp), + ::testing::Values(ngraph::helpers::InputLayerType::CONSTANT), + ::testing::ValuesIn(opTypes), + ::testing::ValuesIn(netType), + ::testing::Values(ElementType::f32), + ::testing::Values(ElementType::f32), + ::testing::Values(CommonTestUtils::DEVICE_CPU), + ::testing::Values(additional_config)), + ::testing::ValuesIn(filterCPUSpecificParams(cpuParams_4D)), + ::testing::Values(emptyFusingSpec)); + +INSTANTIATE_TEST_SUITE_P(smoke_CompareWithRefs_4D_MemOrder_dyn_const, EltwiseLayerCPUTest, params_4D_dyn_const, EltwiseLayerCPUTest::getTestCaseName); + +std::vector inShapes_4D_dyn_param = { + { + // dynamic + {-1, {2, 15}, -1, -1}, + // target + { + {3, 2, 1, 1}, + {1, 7, 5, 1}, + {3, 3, 4, 11}, + } + }, + { + // dynamic + {-1, {2, 25}, -1, -1}, + // target + { + {1, 2, 5, 1}, + {3, 7, 1, 10}, + {3, 3, 4, 11} + } + } +}; + +const auto params_4D_dyn_param = ::testing::Combine( + ::testing::Combine( + ::testing::Values(inShapes_4D_dyn_param), + ::testing::ValuesIn(eltwiseOpTypesBinDyn), + ::testing::Values(ngraph::helpers::InputLayerType::PARAMETER), + ::testing::ValuesIn(opTypes), + ::testing::ValuesIn(netType), + ::testing::Values(ElementType::f32), + 
::testing::Values(ElementType::f32), + ::testing::Values(CommonTestUtils::DEVICE_CPU), + ::testing::Values(additional_config)), + ::testing::ValuesIn(filterCPUSpecificParams(cpuParams_4D)), + ::testing::Values(emptyFusingSpec)); + +INSTANTIATE_TEST_SUITE_P(smoke_CompareWithRefs_4D_MemOrder_dyn_param, EltwiseLayerCPUTest, params_4D_dyn_param, EltwiseLayerCPUTest::getTestCaseName); + +std::vector inShapes_4D_dyn_param_fusing = { + { + // dynamic + {-1, 7, -1, -1}, + // target + { + {3, 7, 1, 1}, + {1, 7, 5, 1}, + {3, 7, 1, 1}, + {3, 7, 4, 11}, + } + }, + { + // dynamic + {-1, 7, -1, -1}, + // target + { + {1, 7, 5, 1}, + {3, 7, 1, 10}, + {1, 7, 5, 1}, + {3, 7, 4, 11} + } + } +}; + +const auto params_4D_dyn_param_fusing = ::testing::Combine( + ::testing::Combine( + ::testing::Values(inShapes_4D_dyn_param_fusing), + ::testing::ValuesIn(eltwiseOpTypesBinDyn), + ::testing::Values(ngraph::helpers::InputLayerType::PARAMETER), + ::testing::ValuesIn(opTypes), + ::testing::Values(ElementType::f32), + ::testing::Values(ElementType::f32), + ::testing::Values(ElementType::f32), + ::testing::Values(CommonTestUtils::DEVICE_CPU), + ::testing::Values(additional_config)), + ::testing::ValuesIn(cpuParams_4D), + ::testing::ValuesIn(fusingParamsSet)); + +INSTANTIATE_TEST_SUITE_P(smoke_CompareWithRefs_4D_dyn_param_fusing, EltwiseLayerCPUTest, params_4D_dyn_param_fusing, EltwiseLayerCPUTest::getTestCaseName); + //// ============================================ 5D ============================================ -//std::vector, std::vector>>> inShapes_5D_dyn_const = { -// { -// // dynamic -// {{-1, {2, -1}, -1, -1, -1}}, -// // target -// { -// {{3, 2, 1, 1, 1}}, -// {{3, 2, 5, 1, 7}}, -// {{3, 2, 1, 6, 1}}, -// {{3, 2, 4, 11, 2}}, -// } -// }, -//}; -// -//const auto params_5D_dyn_const = ::testing::Combine( -// ::testing::Combine( -// ::testing::ValuesIn(inShapes_5D_dyn_const), -// ::testing::ValuesIn(eltwiseOpTypesBinInp), -// ::testing::Values(ngraph::helpers::InputLayerType::CONSTANT), 
-// ::testing::ValuesIn(opTypes), -// ::testing::ValuesIn(netPrc), -// ::testing::Values(InferenceEngine::Precision::FP32), -// ::testing::Values(InferenceEngine::Precision::FP32), -// ::testing::Values(InferenceEngine::Layout::ANY), -// ::testing::Values(CommonTestUtils::DEVICE_CPU), -// ::testing::Values(additional_config)), -// ::testing::ValuesIn(filterCPUSpecificParams(cpuParams_5D))); -// -//INSTANTIATE_TEST_SUITE_P(smoke_CompareWithRefs_5D_MemOrder_dyn_const, EltwiseLayerCPUTest, params_5D_dyn_const, EltwiseLayerCPUTest::getTestCaseName); -// -//std::vector, std::vector>>> inShapes_5D_dyn_param = { -// { -// // dynamic -// {{-1, {2, -1}, -1, -1, -1}, -// {-1, {2, -1}, -1, -1, -1}}, -// // target -// { -// {{3, 2, 1, 1, 1}, {1, 2, 5, 1, 5}}, -// {{1, 7, 5, 1, 12}, {3, 7, 1, 10, 1}}, -// {{3, 3, 4, 11, 6}, {3, 3, 4, 11, 6}}, -// } -// }, -//}; -// -//const auto params_5D_dyn_param = ::testing::Combine( -// ::testing::Combine( -// ::testing::ValuesIn(inShapes_5D_dyn_param), -// ::testing::ValuesIn(eltwiseOpTypesBinDyn), -// ::testing::Values(ngraph::helpers::InputLayerType::PARAMETER), -// ::testing::ValuesIn(opTypes), -// ::testing::ValuesIn(netPrc), -// ::testing::Values(InferenceEngine::Precision::FP32), -// ::testing::Values(InferenceEngine::Precision::FP32), -// ::testing::Values(InferenceEngine::Layout::ANY), -// ::testing::Values(CommonTestUtils::DEVICE_CPU), -// ::testing::Values(additional_config)), -// ::testing::ValuesIn(filterCPUSpecificParams(cpuParams_5D))); -// -//INSTANTIATE_TEST_SUITE_P(smoke_CompareWithRefs_5D_MemOrder_dyn_param, EltwiseLayerCPUTest, params_5D_dyn_param, EltwiseLayerCPUTest::getTestCaseName); -// -//} // namespace -//} // namespace CPULayerTestsDefinitions +std::vector inShapes_5D_dyn_const = { + { + // dynamic + {3, 2, -1, -1, -1}, + // target + { + {3, 2, 1, 1, 1}, + {3, 2, 5, 1, 7}, + {3, 2, 1, 6, 1}, + {3, 2, 4, 11, 2}, + } + }, +}; + +const auto params_5D_dyn_const = ::testing::Combine( + ::testing::Combine( + 
::testing::Values(inShapes_5D_dyn_const), + ::testing::ValuesIn(eltwiseOpTypesBinInp), + ::testing::Values(ngraph::helpers::InputLayerType::CONSTANT), + ::testing::ValuesIn(opTypes), + ::testing::ValuesIn(netType), + ::testing::Values(ElementType::f32), + ::testing::Values(ElementType::f32), + ::testing::Values(CommonTestUtils::DEVICE_CPU), + ::testing::Values(additional_config)), + ::testing::ValuesIn(filterCPUSpecificParams(cpuParams_5D)), + ::testing::Values(emptyFusingSpec)); + +INSTANTIATE_TEST_SUITE_P(smoke_CompareWithRefs_5D_MemOrder_dyn_const, EltwiseLayerCPUTest, params_5D_dyn_const, EltwiseLayerCPUTest::getTestCaseName); + +std::vector inShapes_5D_dyn_param = { + { + // dynamic + {-1, {2, 15}, -1, -1, -1}, + // target + { + {3, 2, 1, 1, 1}, + {1, 7, 5, 1, 12}, + {3, 3, 4, 11, 6}, + } + }, + { + // dynamic + {-1, {2, 25}, -1, -1, -1}, + // target + { + {1, 2, 5, 1, 5}, + {3, 7, 1, 10, 1}, + {3, 3, 4, 11, 6} + } + } +}; + +const auto params_5D_dyn_param = ::testing::Combine( + ::testing::Combine( + ::testing::Values(inShapes_5D_dyn_param), + ::testing::ValuesIn(eltwiseOpTypesBinDyn), + ::testing::Values(ngraph::helpers::InputLayerType::PARAMETER), + ::testing::ValuesIn(opTypes), + ::testing::ValuesIn(netType), + ::testing::Values(ElementType::f32), + ::testing::Values(ElementType::f32), + ::testing::Values(CommonTestUtils::DEVICE_CPU), + ::testing::Values(additional_config)), + ::testing::ValuesIn(filterCPUSpecificParams(cpuParams_5D)), + ::testing::Values(emptyFusingSpec)); + +INSTANTIATE_TEST_SUITE_P(smoke_CompareWithRefs_5D_MemOrder_dyn_param, EltwiseLayerCPUTest, params_5D_dyn_param, EltwiseLayerCPUTest::getTestCaseName); + +} // namespace +} // namespace CPULayerTestsDefinitions \ No newline at end of file diff --git a/src/tests/functional/plugin/cpu/single_layer_tests/mat_mul.cpp b/src/tests/functional/plugin/cpu/single_layer_tests/mat_mul.cpp index 063e13e5d7d..cad52b8b445 100644 --- a/src/tests/functional/plugin/cpu/single_layer_tests/mat_mul.cpp 
+++ b/src/tests/functional/plugin/cpu/single_layer_tests/mat_mul.cpp @@ -604,8 +604,8 @@ INSTANTIATE_TEST_SUITE_P(smoke_MM_Dynamic, MatMulLayerCPUTest, testParamsDynamic const std::vector IS_Dynamic_Fusing = { { { //dynamic case description each pair per each input has {{dynamic shape}, {{static shape case1}, {static shape case2}, ...} - {{-1, -1}, {{16, 12}, {33, 7}}}, // input 0 - {{-1, 33}, {{12, 33}, {7, 33}}} // input 1 + {{-1, -1}, {{16, 12}, {33, 7}, {16, 12}}}, // input 0 + {{-1, 33}, {{12, 33}, {7, 33}, {12, 33}}} // input 1 }, {false, false} }, @@ -625,8 +625,8 @@ const std::vector IS_Dynamic_Fusing = { }, { { //dynamic case description each pair per each input has {{dynamic shape}, {{static shape case1}, {static shape case2}, ...} - {{-1, -1, -1}, {{10, 10, 10}, {5, 5, 5}}}, // input 0 - {{-1, -1, 5}, {{10, 10, 5}, {5, 5, 5}}} // input 1 + {{-1, -1, -1}, {{10, 10, 10}, {5, 5, 5}, {10, 10, 10}}}, // input 0 + {{-1, -1, 5}, {{10, 10, 5}, {5, 5, 5}, {10, 10, 5}}} // input 1 }, {false, false} }, diff --git a/src/tests/functional/plugin/cpu/subgraph_tests/src/eltwise_chain.cpp b/src/tests/functional/plugin/cpu/subgraph_tests/src/eltwise_chain.cpp index 188ad9f76eb..cf6f2526f01 100644 --- a/src/tests/functional/plugin/cpu/subgraph_tests/src/eltwise_chain.cpp +++ b/src/tests/functional/plugin/cpu/subgraph_tests/src/eltwise_chain.cpp @@ -1,305 +1,468 @@ -//// Copyright (C) 2018-2021 Intel Corporation -//// SPDX-License-Identifier: Apache-2.0 -//// +// Copyright (C) 2018-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 // -//#include -//#include -//#include -//#include -//#include -//#include -//#include -//#include -//#include "common_test_utils/common_utils.hpp" -//#include "functional_test_utils/precision_utils.hpp" -//#include "functional_test_utils/skip_tests_config.hpp" -//#include "test_utils/cpu_test_utils.hpp" -//#include "ie_system_conf.h" -// -//using namespace CPUTestUtils; -//using InferenceEngine::Precision; -//using 
ngraph::helpers::EltwiseTypes; -//using FuncTestUtils::PrecisionUtils::convertIE2nGraphPrc; -// -//namespace CPUSubgraphTestsDefinitions { -// -//typedef std::tuple< -// std::pair, std::vector>>, // Input shapes -// ngraph::helpers::InputLayerType, // Secondary input type -// std::vector, // Input precisions -// std::vector, // Eltwise operations -// bool, // With quantization -// std::string // Device name -//> EltwiseChainTuple; -// -//class EltwiseChainTest : public testing::WithParamInterface, -// virtual public LayerTestsUtils::LayerTestsCommon { -//public: -// static std::string getTestCaseName(const testing::TestParamInfo &obj) { -// std::pair, std::vector>> inputShapes; -// ngraph::helpers::InputLayerType secondaryInputType; -// std::vector inputPrecisions; -// std::vector eltwiseOpTypes; -// bool withQuantization; -// std::string targetName; -// std::tie(inputShapes, secondaryInputType, inputPrecisions, eltwiseOpTypes, withQuantization, targetName) = obj.param; -// std::ostringstream results; -// -// results << "IS=" << CommonTestUtils::partialShape2str(inputShapes.first) << "_"; -// results << "TS="; -// for (const auto& shape : inputShapes.second) { -// results << "("; -// for (const auto& item : shape) { -// results << CommonTestUtils::vec2str(item) << "_"; -// } -// results << ")_"; -// } -// for (int i = 0; i < inputPrecisions.size(); i++) { -// results << "InPRC" << std::to_string(i) << "=" << inputPrecisions[i].name() << "_"; -// } -// for (int i = 0; i < eltwiseOpTypes.size(); i++) { -// results << "Op" << std::to_string(i) << "=" << eltwiseOpTypes[i] << "_"; -// } -// results << "secondaryInputType=" << secondaryInputType << "_"; -// results << "WithQuant=" << withQuantization << "_"; -// results << "targetDevice=" << targetName; -// -// return results.str(); -// } -// -// InferenceEngine::Blob::Ptr GenerateInput(const InferenceEngine::InputInfo &info) const override { -// return FuncTestUtils::createAndFillBlob(info.getTensorDesc(), 10, 1); -// } 
-// -//protected: -// void SetUp() override { -// threshold = 0.1f; -// -// std::pair, std::vector>> inputShapes; -// ngraph::helpers::InputLayerType secondaryInputType; -// std::vector inputPrecisions; -// std::vector eltwiseOpTypes; -// bool withQuantization; -// std::tie(inputShapes, secondaryInputType, inputPrecisions, eltwiseOpTypes, withQuantization, targetDevice) = this->GetParam(); -// -// targetStaticShapes = inputShapes.second; -// inputDynamicShapes = inputShapes.first; -// -// ngraph::ParameterVector ngraphParam; -// std::vector> ngraphInputs; -// if (secondaryInputType == ngraph::helpers::InputLayerType::PARAMETER) { -// for (size_t i = 0; i < targetStaticShapes[0].size(); i++) { -// ngraphParam.push_back(std::make_shared(convertIE2nGraphPrc(inputPrecisions[i]), targetStaticShapes[0][i])); -// ngraphInputs.push_back(ngraphParam.back()); -// } -// } else { -// ngraphParam = ngraph::builder::makeParams(convertIE2nGraphPrc(inputPrecisions[0]), {targetStaticShapes[0][0]}); -// for (int i = 1; i < inputPrecisions.size(); i++) { -// std::vector ngraphInput1Data(ngraph::shape_size(targetStaticShapes[0][i])); -// ngraphInputs.push_back(ngraph::builder::makeConstant(convertIE2nGraphPrc(inputPrecisions[i]), targetStaticShapes[0][i], -// ngraphInput1Data, true)); -// } -// } -// -// if (withQuantization) { -// std::vector> eltwiseOps; -// eltwiseOps.push_back(ngraph::builder::makeEltwise(ngraphParam[0], ngraphInputs[0], eltwiseOpTypes[0])); -// for (int i = 1; i < eltwiseOpTypes.size() - 1; i++) { -// eltwiseOps.push_back(ngraph::builder::makeEltwise(eltwiseOps[eltwiseOps.size() - 1], ngraphInputs[i], eltwiseOpTypes[i])); -// } -// -// std::vector constShape(targetStaticShapes[0][0].size(), 1); -// constShape[1] = targetStaticShapes[0][0][1]; -// auto fq = ngraph::builder::makeFakeQuantize(eltwiseOps[eltwiseOps.size() - 1], -// ::ngraph::element::Type(::ngraph::element::Type_t::f32), -// 256, constShape); -// -// 
eltwiseOps.push_back(ngraph::builder::makeEltwise(fq, ngraphInputs[eltwiseOpTypes.size() - 1], eltwiseOpTypes[eltwiseOpTypes.size() - 1])); -// -// ngraph::ResultVector results{std::make_shared(eltwiseOps[eltwiseOps.size() - 1])}; -// function = std::make_shared(results, ngraphParam, "eltwise_chain_fq"); -// } else { -// std::vector> eltwiseOps; -// eltwiseOps.push_back(ngraph::builder::makeEltwise(ngraphParam[0], ngraphInputs[0], eltwiseOpTypes[0])); -// for (int i = 1; i < eltwiseOpTypes.size(); i++) { -// eltwiseOps.push_back(ngraph::builder::makeEltwise(eltwiseOps[eltwiseOps.size() - 1], ngraphInputs[i], eltwiseOpTypes[i])); -// } -// -// ngraph::ResultVector results{std::make_shared(eltwiseOps[eltwiseOps.size() - 1])}; -// function = std::make_shared(results, ngraphParam, "eltwise_chain"); -// } -// } -//}; -// -//TEST_P(EltwiseChainTest, CompareWithRefs) { -// SKIP_IF_CURRENT_TEST_IS_DISABLED() -// -// Run(); -//} -// -//namespace { -// -//std::vector, std::vector>>> inputShapes = { -// { {}, {{{1, 1, 2, 3}, {1, 1, 2, 3}, {1, 1, 2, 3}, {1, 1, 2, 3}}}}, -// { {}, {{{1, 48, 5, 6}, {1, 48, 1, 1}, {1, 48, 5, 6}, {1, 1, 5, 6}}}}, -// { {}, {{{1, 72, 28, 28}, {1, 72, 1, 1}, {1, 72, 1, 1}, {1, 72, 1, 1}}}}, -// { {}, {{{2, 33, 5, 5}, {2, 33, 5, 5}, {2, 33, 1, 5}, {2, 33, 5, 5}}}}, -// { {}, {{{1, 2, 3}, {3}, {3}, {3}}}}, -// { {}, {{{1, 12, 5, 5}, {5, 5}, {12, 5, 5}, {1}}}}, -// { {}, {{{3, 12, 5, 5}, {1, 12, 5, 1}, {3, 1, 1, 1}, {3, 12, 5, 5}}}}, -// { {}, {{{1, 1, 1, 1}, {1, 12, 5, 1}, {3, 12, 1, 5}, {3, 12, 5, 1}}}}, -// { {}, {{{1, 1, 1, 1, 6}, {1, 12, 5, 1, 6}, {3, 12, 1, 5, 1}, {3, 12, 5, 1, 1}}}} -//}; -// -//std::vector> inputPrecisions = { -// { Precision::FP32, Precision::FP32, Precision::FP32, Precision::FP32 }, -// { Precision::I32, Precision::I32, Precision::I32, Precision::I32 } -//}; -// -//std::vector> eltwiseOps = { -// { EltwiseTypes::ADD, EltwiseTypes::MULTIPLY, EltwiseTypes::SUBTRACT }, -// { EltwiseTypes::DIVIDE, EltwiseTypes::SQUARED_DIFF, 
EltwiseTypes::ADD }, -//}; -// -//INSTANTIATE_TEST_SUITE_P(smoke_EltwiseChain, EltwiseChainTest, -// ::testing::Combine( -// ::testing::ValuesIn(inputShapes), -// ::testing::Values(ngraph::helpers::InputLayerType::CONSTANT), -// ::testing::ValuesIn(inputPrecisions), -// ::testing::ValuesIn(eltwiseOps), -// ::testing::Values(false), -// ::testing::Values(CommonTestUtils::DEVICE_CPU)), -// EltwiseChainTest::getTestCaseName); -// -//std::vector, std::vector>>> inputShapesFQ = { -// { {}, {{{1, 2, 2, 3}, {1, 2, 2, 3}, {1, 2, 2, 3}, {1, 2, 2, 3}}}}, -// { {}, {{{2, 33, 5, 5}, {2, 33, 5, 5}, {2, 33, 1, 5}, {2, 33, 5, 5}}}}, -// { {}, {{{2, 33, 5, 17}, {2, 33, 5, 17}, {2, 33, 5, 17}, {2, 33, 5, 17}}}}, -// { {}, {{{2, 33, 5, 256}, {2, 33, 5, 256}, {2, 33, 5, 256}, {2, 33, 5, 256}}}}, -// { {}, {{{2, 5, 7, 5}, {2, 5, 1, 5}, {2, 5, 7, 5}, {2, 5, 7, 5}}}}, -// { {}, {{{2, 17, 7, 5}, {2, 17, 7, 5}, {2, 17, 7, 5}, {2, 17, 7, 5}}}}, -// { {}, {{{2, 256, 7, 5}, {2, 256, 7, 5}, {2, 256, 1, 5}, {2, 256, 7, 5}}}}, -// { {}, {{{1, 36, 34, 34}, {1, 36, 34, 34}, {1, 36, 34, 34}, {1, 36, 34, 34}}}}, -// { {}, {{{1, 12, 1, 1, 6}, {1, 12, 5, 1, 6}, {3, 12, 1, 5, 1}, {3, 12, 5, 1, 1}}}}, -// { {}, {{{1, 12, 1, 1, 6}, {1, 12, 5, 5, 6}, {3, 12, 1, 5, 1}, {3, 12, 5, 5, 1}}}}, -// { {}, {{{1, 12, 1, 1, 1}, {1, 12, 5, 1, 7}, {3, 12, 1, 5, 7}, {3, 12, 5, 1, 7}}}}, -// { {}, {{{1, 7, 1, 1, 12}, {1, 7, 5, 1, 12}, {3, 7, 1, 5, 12}, {3, 7, 5, 1, 12}}}}, -// { {}, {{{1, 7, 1, 1, 12, 3, 7}, {1, 7, 5, 1, 12, 3, 7}, {3, 7, 1, 5, 12, 3, 7}, {3, 7, 5, 1, 12, 3, 7}}}}, -// { {}, {{{1, 7, 1, 1, 12, 3, 1}, {1, 7, 5, 1, 12, 3, 7}, {3, 7, 1, 5, 12, 1, 7}, {3, 7, 5, 1, 12, 3, 1}}}} -//}; -// -//std::vector> inputPrecisionsFQ { -// { Precision::FP32, Precision::FP32, Precision::FP32, Precision::FP32 } -//}; -// -//INSTANTIATE_TEST_SUITE_P(smoke_EltwiseChainWithFQ, EltwiseChainTest, -// ::testing::Combine( -// ::testing::ValuesIn(inputShapesFQ), -// ::testing::Values(ngraph::helpers::InputLayerType::CONSTANT), 
-// ::testing::ValuesIn(inputPrecisionsFQ), -// ::testing::ValuesIn(eltwiseOps), -// ::testing::Values(true), -// ::testing::Values(CommonTestUtils::DEVICE_CPU)), -// EltwiseChainTest::getTestCaseName); -// -//// =============================================== dynamic ============================================== -//std::vector, std::vector>>> inputShapes_dyn = { -// { -// // dynamic -// { -// {-1, -1, -1}, -// {-1}, -// {-1}, -// {-1} -// }, -// // target -// { -// {{1, 2, 3}, {3}, {3}, {3}}, -// {{5, 2, 7}, {7}, {1}, {1}}, -// {{3, 1, 10}, {1}, {1}, {1}}, -// } -// }, -// { -// // dynamic -// { -// {-1, -1, -1, -1}, -// {-1, -1}, -// {-1, -1, -1}, -// {-1} -// }, -// // target -// { -// {{1, 12, 5, 5}, {5, 5}, {12, 5, 5}, {1}}, -// {{5, 16, 1, 5}, {1, 5}, {1, 5, 1}, {1}}, -// {{2, 1, 1, 5}, {5, 1}, {16, 5, 5}, {5}}, -// } -// }, -// { -// // dynamic -// { -// {-1, -1, -1, -1}, -// {-1, -1, -1, -1}, -// {-1, -1, -1, -1}, -// {-1, -1, -1, -1} -// }, -// // target -// { -// {{1, 2, 2, 3}, {1, 2, 2, 3}, {1, 2, 2, 3}, {1, 2, 2, 3}}, -// {{2, 33, 5, 5}, {2, 33, 5, 5}, {2, 33, 1, 5}, {2, 33, 5, 5}}, -// {{2, 33, 5, 17}, {2, 33, 5, 17}, {2, 33, 5, 17}, {2, 33, 5, 17}}, -// {{2, 33, 5, 256}, {2, 33, 5, 256}, {2, 33, 5, 256}, {2, 33, 5, 256}}, -// {{2, 5, 7, 5}, {2, 5, 1, 5}, {2, 5, 7, 5}, {2, 5, 7, 5}}, -// {{2, 17, 7, 5}, {2, 17, 7, 5}, {2, 17, 7, 5}, {2, 17, 7, 5}}, -// {{2, 256, 7, 5}, {2, 256, 7, 5}, {2, 256, 1, 5}, {2, 256, 7, 5}}, -// {{1, 36, 34, 34}, {1, 36, 34, 34}, {1, 36, 34, 34}, {1, 36, 34, 34}} -// } -// }, -// { -// // dynamic -// { -// {-1, -1, -1, -1, -1}, -// {-1, -1, -1, -1, -1}, -// {-1, -1, -1, -1, -1}, -// {-1, -1, -1, -1, -1} -// }, -// // target -// { -// {{1, 12, 1, 1, 6}, {1, 12, 5, 1, 6}, {3, 12, 1, 5, 1}, {3, 12, 5, 1, 1}}, -// {{1, 12, 1, 1, 6}, {1, 12, 5, 5, 6}, {3, 12, 1, 5, 1}, {3, 12, 5, 5, 1}}, -// {{1, 12, 1, 1, 1}, {1, 12, 5, 1, 7}, {3, 12, 1, 5, 7}, {3, 12, 5, 1, 7}}, -// {{1, 7, 1, 1, 12}, {1, 7, 5, 1, 12}, {3, 7, 1, 5, 12}, {3, 7, 
5, 1, 12}} -// } -// }, -// { -// // dynamic -// { -// {-1, -1, -1, -1, -1, -// -1, -1}, -// {-1, -1, -1, -1, -1, -// -1, -1}, -// {-1, -1, -1, -1, -1, -// -1, -1}, -// {-1, -1, -1, -1, -1, -// -1, -1} -// }, -// // target -// { -// {{1, 7, 1, 1, 12, 3, 7}, {1, 7, 5, 1, 12, 3, 7}, {3, 7, 1, 5, 12, 3, 7}, {3, 7, 5, 1, 12, 3, 7}}, -// {{1, 7, 1, 1, 12, 3, 1}, {1, 7, 5, 1, 12, 3, 7}, {3, 7, 1, 5, 12, 1, 7}, {3, 7, 5, 1, 12, 3, 1}}, -// {{5, 7, 1, 2, 12, 1, 8}, {1, 7, 5, 1, 12, 3, 8}, {5, 1, 1, 2, 12, 1, 8}, {1, 7, 5, 1, 12, 3, 1}} -// } -// } -//}; -// -//INSTANTIATE_TEST_SUITE_P(smoke_EltwiseChain_dyn, EltwiseChainTest, -// ::testing::Combine( -// ::testing::ValuesIn(inputShapes_dyn), -// ::testing::Values(ngraph::helpers::InputLayerType::PARAMETER), -// ::testing::ValuesIn(inputPrecisions), -// ::testing::ValuesIn(eltwiseOps), -// ::testing::Values(false), -// ::testing::Values(CommonTestUtils::DEVICE_CPU)), -// EltwiseChainTest::getTestCaseName); -// -//} // namespace -//} // namespace CPUSubgraphTestsDefinitions + +#include +#include +#include +#include +#include +#include +#include +#include "common_test_utils/common_utils.hpp" +#include "functional_test_utils/ov_tensor_utils.hpp" +#include "functional_test_utils/skip_tests_config.hpp" +#include "test_utils/cpu_test_utils.hpp" + +using namespace CPUTestUtils; +using ngraph::helpers::EltwiseTypes; +using namespace ov::test; + +namespace CPUSubgraphTestsDefinitions { + +typedef std::tuple< + std::vector, // Input shapes + ngraph::helpers::InputLayerType, // Secondary input type + std::vector, // Input precisions + std::vector, // Eltwise operations + bool, // With quantization + std::string // Device name +> EltwiseChainTuple; + +class EltwiseChainTest : public testing::WithParamInterface, + virtual public SubgraphBaseTest { +public: + static std::string getTestCaseName(const testing::TestParamInfo &obj) { + std::vector inputShapes; + ngraph::helpers::InputLayerType secondaryInputType; + std::vector 
inputPrecisions; + std::vector eltwiseOpTypes; + bool withQuantization; + std::string targetName; + std::tie(inputShapes, secondaryInputType, inputPrecisions, eltwiseOpTypes, withQuantization, targetName) = obj.param; + std::ostringstream results; + + results << "IS=("; + for (const auto& shape : inputShapes) { + results << CommonTestUtils::partialShape2str({shape.first}) << "_"; + } + results << ")_TS=("; + for (const auto& shape : inputShapes) { + for (const auto& item : shape.second) { + results << CommonTestUtils::vec2str(item) << "_"; + } + } + for (int i = 0; i < inputPrecisions.size(); i++) { + results << "InPRC" << std::to_string(i) << "=" << inputPrecisions[i] << "_"; + } + for (int i = 0; i < eltwiseOpTypes.size(); i++) { + results << "Op" << std::to_string(i) << "=" << eltwiseOpTypes[i] << "_"; + } + results << "secondaryInputType=" << secondaryInputType << "_"; + results << "WithQuant=" << withQuantization << "_"; + results << "targetDevice=" << targetName; + + return results.str(); + } + + void generate_inputs(const std::vector& targetInputStaticShapes) override { + inputs.clear(); + const auto& funcInputs = function->inputs(); + for (int i = 0; i < funcInputs.size(); ++i) { + const auto& funcInput = funcInputs[i]; + ov::runtime::Tensor tensor; + tensor = ov::test::utils::create_and_fill_tensor(funcInput.get_element_type(), targetInputStaticShapes[i], 10, 1, 1); + inputs.insert({funcInput.get_node_shared_ptr(), tensor}); + } + } + +protected: + void SetUp() override { + abs_threshold = 0.1f; + + std::vector inputShapes; + ngraph::helpers::InputLayerType secondaryInputType; + std::vector inputPrecisions; + std::vector eltwiseOpTypes; + bool withQuantization; + std::tie(inputShapes, secondaryInputType, inputPrecisions, eltwiseOpTypes, withQuantization, targetDevice) = this->GetParam(); + + init_input_shapes(inputShapes); + + ngraph::ParameterVector ngraphParam; + std::vector> ngraphInputs; + if (secondaryInputType == 
ngraph::helpers::InputLayerType::PARAMETER) { + for (size_t i = 0; i < inputDynamicShapes.size(); i++) { + ngraphParam.push_back(std::make_shared(inputPrecisions[i], inputDynamicShapes[i])); + ngraphInputs.push_back(ngraphParam.back()); + } + } else { + ngraphParam = ngraph::builder::makeDynamicParams(inputPrecisions[0], {inputDynamicShapes.front()}); + for (int i = 1; i < inputPrecisions.size(); i++) { + std::vector ngraphInput1Data(ngraph::shape_size(targetStaticShapes[0][i])); + ngraphInputs.push_back(ngraph::builder::makeConstant(inputPrecisions[i], targetStaticShapes[0][i], + ngraphInput1Data, true)); + } + } + + if (withQuantization) { + std::vector> eltwiseOps; + eltwiseOps.push_back(ngraph::builder::makeEltwise(ngraphParam[0], ngraphInputs[0], eltwiseOpTypes[0])); + for (int i = 1; i < eltwiseOpTypes.size() - 1; i++) { + eltwiseOps.push_back(ngraph::builder::makeEltwise(eltwiseOps[eltwiseOps.size() - 1], ngraphInputs[i], eltwiseOpTypes[i])); + } + + std::vector constShape(targetStaticShapes[0][0].size(), 1); + constShape[1] = targetStaticShapes[0][0][1]; + auto fq = ngraph::builder::makeFakeQuantize(eltwiseOps[eltwiseOps.size() - 1], + ::ngraph::element::Type(::ngraph::element::Type_t::f32), + 256, constShape); + + eltwiseOps.push_back(ngraph::builder::makeEltwise(fq, ngraphInputs[eltwiseOpTypes.size() - 1], eltwiseOpTypes[eltwiseOpTypes.size() - 1])); + + ngraph::ResultVector results{std::make_shared(eltwiseOps[eltwiseOps.size() - 1])}; + function = std::make_shared(results, ngraphParam, "eltwise_chain_fq"); + } else { + std::vector> eltwiseOps; + eltwiseOps.push_back(ngraph::builder::makeEltwise(ngraphParam[0], ngraphInputs[0], eltwiseOpTypes[0])); + for (int i = 1; i < eltwiseOpTypes.size(); i++) { + eltwiseOps.push_back(ngraph::builder::makeEltwise(eltwiseOps[eltwiseOps.size() - 1], ngraphInputs[i], eltwiseOpTypes[i])); + } + + ngraph::ResultVector results{std::make_shared(eltwiseOps[eltwiseOps.size() - 1])}; + function = std::make_shared(results, 
ngraphParam, "eltwise_chain"); + } + } +}; + +TEST_P(EltwiseChainTest, CompareWithRefs) { + SKIP_IF_CURRENT_TEST_IS_DISABLED() + + run(); +} + +namespace { + +std::vector> inputShapes = { + {{1, 1, 2, 3}, {1, 1, 2, 3}, {1, 1, 2, 3}, {1, 1, 2, 3}}, + {{1, 48, 5, 6}, {1, 48, 1, 1}, {1, 48, 5, 6}, {1, 1, 5, 6}}, + {{1, 72, 28, 28}, {1, 72, 1, 1}, {1, 72, 1, 1}, {1, 72, 1, 1}}, + {{2, 33, 5, 5}, {2, 33, 5, 5}, {2, 33, 1, 5}, {2, 33, 5, 5}}, + {{1, 2, 3}, {3}, {3}, {3}}, + {{1, 12, 5, 5}, {5, 5}, {12, 5, 5}, {1}}, + {{3, 12, 5, 5}, {1, 12, 5, 1}, {3, 1, 1, 1}, {3, 12, 5, 5}}, + {{1, 1, 1, 1}, {1, 12, 5, 1}, {3, 12, 1, 5}, {3, 12, 5, 1}}, + {{1, 1, 1, 1, 6}, {1, 12, 5, 1, 6}, {3, 12, 1, 5, 1}, {3, 12, 5, 1, 1}} +}; + +std::vector> inputPrecisions = { + { ElementType::f32, ElementType::f32, ElementType::f32, ElementType::f32 }, + { ElementType::i32, ElementType::i32, ElementType::i32, ElementType::i32 } +}; + +std::vector> eltwiseOps = { + { EltwiseTypes::ADD, EltwiseTypes::MULTIPLY, EltwiseTypes::SUBTRACT }, + { EltwiseTypes::DIVIDE, EltwiseTypes::SQUARED_DIFF, EltwiseTypes::ADD }, +}; + +INSTANTIATE_TEST_SUITE_P(smoke_EltwiseChain, EltwiseChainTest, + ::testing::Combine( + ::testing::ValuesIn(static_shapes_to_test_representation(inputShapes)), + ::testing::Values(ngraph::helpers::InputLayerType::CONSTANT), + ::testing::ValuesIn(inputPrecisions), + ::testing::ValuesIn(eltwiseOps), + ::testing::Values(false), + ::testing::Values(CommonTestUtils::DEVICE_CPU)), + EltwiseChainTest::getTestCaseName); + +std::vector> inputShapesFQ = { + {{1, 2, 2, 3}, {1, 2, 2, 3}, {1, 2, 2, 3}, {1, 2, 2, 3}}, + {{2, 33, 5, 5}, {2, 33, 5, 5}, {2, 33, 1, 5}, {2, 33, 5, 5}}, + {{2, 33, 5, 17}, {2, 33, 5, 17}, {2, 33, 5, 17}, {2, 33, 5, 17}}, + {{2, 33, 5, 256}, {2, 33, 5, 256}, {2, 33, 5, 256}, {2, 33, 5, 256}}, + {{2, 5, 7, 5}, {2, 5, 1, 5}, {2, 5, 7, 5}, {2, 5, 7, 5}}, + {{2, 17, 7, 5}, {2, 17, 7, 5}, {2, 17, 7, 5}, {2, 17, 7, 5}}, + {{2, 256, 7, 5}, {2, 256, 7, 5}, {2, 256, 1, 5}, {2, 256, 7, 
5}}, + {{1, 36, 34, 34}, {1, 36, 34, 34}, {1, 36, 34, 34}, {1, 36, 34, 34}}, + {{1, 12, 1, 1, 6}, {1, 12, 5, 1, 6}, {3, 12, 1, 5, 1}, {3, 12, 5, 1, 1}}, + {{1, 12, 1, 1, 6}, {1, 12, 5, 5, 6}, {3, 12, 1, 5, 1}, {3, 12, 5, 5, 1}}, + {{1, 12, 1, 1, 1}, {1, 12, 5, 1, 7}, {3, 12, 1, 5, 7}, {3, 12, 5, 1, 7}}, + {{1, 7, 1, 1, 12}, {1, 7, 5, 1, 12}, {3, 7, 1, 5, 12}, {3, 7, 5, 1, 12}}, + {{1, 7, 1, 1, 12, 3, 7}, {1, 7, 5, 1, 12, 3, 7}, {3, 7, 1, 5, 12, 3, 7}, {3, 7, 5, 1, 12, 3, 7}}, + {{1, 7, 1, 1, 12, 3, 1}, {1, 7, 5, 1, 12, 3, 7}, {3, 7, 1, 5, 12, 1, 7}, {3, 7, 5, 1, 12, 3, 1}} +}; + +std::vector> inputPrecisionsFQ { + { ElementType::f32, ElementType::f32, ElementType::f32, ElementType::f32 } +}; + +INSTANTIATE_TEST_SUITE_P(smoke_EltwiseChainWithFQ, EltwiseChainTest, + ::testing::Combine( + ::testing::ValuesIn(static_shapes_to_test_representation(inputShapesFQ)), + ::testing::Values(ngraph::helpers::InputLayerType::CONSTANT), + ::testing::ValuesIn(inputPrecisionsFQ), + ::testing::ValuesIn(eltwiseOps), + ::testing::Values(true), + ::testing::Values(CommonTestUtils::DEVICE_CPU)), + EltwiseChainTest::getTestCaseName); + +// =============================================== dynamic ============================================== +std::vector> inputShapes_dyn = { + { + // inp1 + { + // dynamic + {-1, -1, -1}, + // target + { + {1, 2, 3}, + {5, 2, 7}, + {3, 1, 10}, + } + }, + // inp2 + { + // dynamic + {-1}, + // target + { + {3}, {7}, {1}, + } + }, + // inp3 + { + // dynamic + {-1}, + // target + { + {3}, {1}, {1} + } + }, + // inp4 + { + // dynamic + {-1}, + // target + { + {3}, {1}, {1} + } + } + }, + { + // inp1 + { + // dynamic + {-1, -1, -1, -1}, + // target + { + {1, 12, 5, 5}, + {5, 16, 1, 5}, + {2, 1, 1, 5}, + } + }, + // inp2 + { + // dynamic + {-1, -1}, + // target + { + {5, 5}, {1, 5}, {5, 1}, + } + }, + // inp3 + { + // dynamic + {-1, -1, -1}, + // target + { + {12, 5, 5}, + {1, 5, 1}, + {16, 5, 5}, + } + }, + // inp4 + { + // dynamic + {-1}, + // target + { + {1}, 
{1}, {5} + } + } + }, + { + // inp1 + { + // dynamic + {-1, -1, -1, -1}, + // target + { + {1, 2, 2, 3}, + {2, 33, 5, 5}, + {2, 33, 5, 17}, + {2, 33, 5, 256}, + {2, 5, 7, 5}, + {2, 17, 7, 5}, + {2, 256, 7, 5}, + {1, 36, 34, 34}, + } + }, + // inp2 + { + // dynamic + {-1, -1, -1, -1}, + // target + { + {1, 2, 2, 3}, + {2, 33, 5, 5}, + {2, 33, 5, 17}, + {2, 33, 5, 256}, + {2, 5, 1, 5}, + {2, 17, 7, 5}, + {2, 256, 7, 5}, + {1, 36, 34, 34}, + } + }, + // inp3 + { + // dynamic + {-1, -1, -1, -1}, + // target + { + {1, 2, 2, 3}, + {2, 33, 1, 5}, + {2, 33, 5, 17}, + {2, 33, 5, 256}, + {2, 5, 7, 5}, + {2, 17, 7, 5}, + {2, 256, 1, 5}, + {1, 36, 34, 34} + } + }, + // inp4 + { + // dynamic + {-1, -1, -1, -1}, + // target + { + {1, 2, 2, 3}, + {2, 33, 5, 5}, + {2, 33, 5, 17}, + {2, 33, 5, 256}, + {2, 5, 7, 5}, + {2, 17, 7, 5}, + {2, 256, 7, 5}, + {1, 36, 34, 34} + } + } + }, + { + // inp1 + { + // dynamic + {-1, -1, -1, -1, -1}, + // target + { + {1, 12, 1, 1, 6}, + {1, 12, 1, 1, 6}, + {1, 12, 1, 1, 1}, + {1, 7, 1, 1, 12}, + } + }, + // inp2 + { + // dynamic + {-1, -1, -1, -1, -1}, + // target + { + {1, 12, 5, 1, 6}, + {1, 12, 5, 5, 6}, + {1, 12, 5, 1, 7}, + {1, 7, 5, 1, 12}, + } + }, + // inp3 + { + // dynamic + {-1, -1, -1, -1, -1}, + // target + { + {3, 12, 1, 5, 1}, + {3, 12, 1, 5, 1}, + {3, 12, 1, 5, 7}, + {3, 7, 1, 5, 12} + } + }, + // inp4 + { + // dynamic + {-1, -1, -1, -1, -1}, + // target + { + {3, 12, 5, 1, 1}, + {3, 12, 5, 5, 1}, + {3, 12, 5, 1, 7}, + {3, 7, 5, 1, 12} + } + } + }, + { + // inp1 + { + // dynamic + {-1, -1, -1, -1, -1, -1, -1}, + // target + { + {1, 7, 1, 1, 12, 3, 7}, + {1, 7, 1, 1, 12, 3, 1}, + {5, 7, 1, 2, 12, 1, 8}, + } + }, + // inp2 + { + // dynamic + {-1, -1, -1, -1, -1, -1, -1}, + // target + { + {1, 7, 5, 1, 12, 3, 7}, + {1, 7, 5, 1, 12, 3, 7}, + {1, 7, 5, 1, 12, 3, 8}, + } + }, + // inp3 + { + // dynamic + {-1, -1, -1, -1, -1, -1, -1}, + // target + { + {3, 7, 1, 5, 12, 3, 7}, + {3, 7, 1, 5, 12, 1, 7}, + {5, 1, 1, 2, 12, 1, 8}, + } + }, + 
// inp4 + { + // dynamic + {-1, -1, -1, -1, -1, -1, -1}, + // target + { + {3, 7, 5, 1, 12, 3, 7}, + {3, 7, 5, 1, 12, 3, 1}, + {1, 7, 5, 1, 12, 3, 1} + } + } + } +}; + +INSTANTIATE_TEST_SUITE_P(smoke_EltwiseChain_dyn, EltwiseChainTest, + ::testing::Combine( + ::testing::ValuesIn(inputShapes_dyn), + ::testing::Values(ngraph::helpers::InputLayerType::PARAMETER), + ::testing::ValuesIn(inputPrecisions), + ::testing::ValuesIn(eltwiseOps), + ::testing::Values(false), + ::testing::Values(CommonTestUtils::DEVICE_CPU)), + EltwiseChainTest::getTestCaseName); + +} // namespace +} // namespace CPUSubgraphTestsDefinitions diff --git a/src/tests/functional/plugin/cpu/test_utils/cpu_test_utils.cpp b/src/tests/functional/plugin/cpu/test_utils/cpu_test_utils.cpp index 60ea76acf1a..40b7e9e6930 100644 --- a/src/tests/functional/plugin/cpu/test_utils/cpu_test_utils.cpp +++ b/src/tests/functional/plugin/cpu/test_utils/cpu_test_utils.cpp @@ -227,10 +227,14 @@ std::string CPUTestsBase::getTestCaseName(CPUSpecificParams params) { std::string selectedType; std::tie(inFmts, outFmts, priority, selectedType) = params; if (!inFmts.empty()) { - result << "_inFmts=" << fmts2str(inFmts, ""); + auto str = fmts2str(inFmts, ""); + std::replace(str.begin(), str.end(), ',', '.'); + result << "_inFmts=" << str; } if (!outFmts.empty()) { - result << "_outFmts=" << fmts2str(outFmts, ""); + auto str = fmts2str(outFmts, ""); + std::replace(str.begin(), str.end(), ',', '.'); + result << "_outFmts=" << str; } if (!selectedType.empty()) { result << "_primitive=" << selectedType; diff --git a/src/tests/functional/plugin/cpu/test_utils/fusing_test_utils.hpp b/src/tests/functional/plugin/cpu/test_utils/fusing_test_utils.hpp index 709269dc4ab..934953fd514 100644 --- a/src/tests/functional/plugin/cpu/test_utils/fusing_test_utils.hpp +++ b/src/tests/functional/plugin/cpu/test_utils/fusing_test_utils.hpp @@ -237,6 +237,29 @@ const auto fusingFakeQuantizePerChannelRelu = fusingSpecificParams{std::make_sha return 
ngraph::builder::makeActivation(inpNode, ngPrc, ngraph::helpers::Relu); }, "Relu"}}), {"FakeQuantize", "Relu"}}; +const auto fusingFQPerChannelSigmoidFQPerChannel = fusingSpecificParams{std::make_shared(std::vector{ + {[](std::shared_ptr inpNode, const ngraph::element::Type& ngPrc, ngraph::ParameterVector& params){ + auto localPrc = inpNode->get_element_type(); + auto shape = inpNode->get_output_partial_shape(0); + if (shape.size() == 1) + IE_THROW() << "If shape.size() == 1 then Granularity can be PerTensor only"; + ngraph::Shape newShape(shape.size(), 1); + newShape[1] = shape[1].get_length(); + return ngraph::builder::makeFakeQuantize(inpNode, localPrc, 256, newShape); + }, "FakeQuantize(PerChannel)"}, + {[](std::shared_ptr inpNode, const ngraph::element::Type& ngPrc, ngraph::ParameterVector& params){ + return ngraph::builder::makeActivation(inpNode, ngPrc, ngraph::helpers::Sigmoid); + }, "Sigmoid"}, + {[](std::shared_ptr inpNode, const ngraph::element::Type& ngPrc, ngraph::ParameterVector& params){ + auto localPrc = inpNode->get_element_type(); + auto shape = inpNode->get_output_partial_shape(0); + if (shape.size() == 1) + IE_THROW() << "If shape.size() == 1 then Granularity can be PerTensor only"; + ngraph::Shape newShape(shape.size(), 1); + newShape[1] = shape[1].get_length(); + return ngraph::builder::makeFakeQuantize(inpNode, localPrc, 256, newShape); + }, "FakeQuantize(PerChannel)"}}), {"FakeQuantize", "Sigmoid", "FakeQuantize"}}; + const auto fusingFakeQuantizePerTensorRelu = fusingSpecificParams{std::make_shared(std::vector{ {[](std::shared_ptr inpNode, const ngraph::element::Type& ngPrc, ngraph::ParameterVector& params) { auto localPrc = inpNode->get_element_type(); diff --git a/src/tests/unit/cpu/rt_cache.cpp b/src/tests/unit/cpu/rt_cache.cpp new file mode 100644 index 00000000000..213a068aa83 --- /dev/null +++ b/src/tests/unit/cpu/rt_cache.cpp @@ -0,0 +1,381 @@ +// Copyright (C) 2018-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + 
+#include + +#include +#include + +#include "cache/lru_cache.h" +#include "cache/multi_cache.h" + +using namespace MKLDNNPlugin; + +namespace { +struct IntKey { + size_t hash() const { + return std::hash().operator()(data); + } + bool operator==(const IntKey& rhs) const noexcept { + return this->data == rhs.data; + } + + int data; +}; +} // namespace + +TEST(LruCacheTests, Evict) { + constexpr size_t capacity = 10; + LruCache cache(capacity); + for (size_t i = 0; i < 2 * capacity; ++i) { + ASSERT_NO_THROW(cache.put({10}, 10)); + } + ASSERT_NO_THROW(cache.evict(5)); + ASSERT_NO_THROW(cache.evict(10)); + int result = cache.get({10}); + ASSERT_EQ(result, int()); + ASSERT_NO_THROW(cache.evict(0)); +} + +TEST(LruCacheTests, Put) { + constexpr size_t capacity = 10; + LruCache cache(capacity); + for (size_t i = 0; i < 2 * capacity; ++i) { + ASSERT_NO_THROW(cache.put({10}, 10)); + } + + ASSERT_EQ(cache.get({10}), 10); +} + +TEST(LruCacheTests, Get) { + constexpr size_t capacity = 10; + LruCache cache(capacity); + for (int i = 1; i < 2 * capacity; ++i) { + ASSERT_NO_THROW(cache.put({i}, i)); + } + + for (int i = 1; i < capacity; ++i) { + ASSERT_EQ(cache.get({i}), int()); + } + + for (int i = capacity; i < 2 * capacity; ++i) { + ASSERT_EQ(cache.get({i}), i); + } +} + +TEST(LruCacheTests, LruPolicy) { + constexpr size_t capacity = 10; + LruCache cache(capacity); + for (int i = 1; i < capacity; ++i) { + ASSERT_NO_THROW(cache.put({i}, i)); + } + + for (int i = 4; i < capacity; ++i) { + ASSERT_EQ(cache.get({i}), i); + } + + for (int i = 21; i < 25; ++i) { + ASSERT_NO_THROW(cache.put({i}, i)); + } + + for (int i = 1; i < 4; ++i) { + ASSERT_EQ(cache.get({i}), int()); + } +} + +TEST(LruCacheTests, Empty) { + constexpr size_t capacity = 0; + constexpr size_t attempts = 10; + LruCache cache(capacity); + for (int i = 1; i < attempts; ++i) { + ASSERT_NO_THROW(cache.put({i}, i)); + } + + for (int i = 1; i < attempts; ++i) { + ASSERT_EQ(cache.get({i}), int()); + } +} +namespace { 
+template +class mockBuilder { +public: + MOCK_METHOD(T, build, (const K&)); +}; +}// namespace + +TEST(CacheEntryTests, GetOrCreate) { + using testing::_; + using ValueType = std::shared_ptr; + + constexpr size_t capacity = 10; + + mockBuilder builderMock; + EXPECT_CALL(builderMock, build(_)) + .Times(3 * capacity) + .WillRepeatedly([](const IntKey& key){return key.data;}); + + auto builder = [&](const IntKey& key) { return std::make_shared(builderMock.build(key)); }; + + CacheEntry entry(capacity); + + //creating so we miss everytime + for (int i = 0; i < capacity; ++i) { + auto result = entry.getOrCreate({i}, builder); + ASSERT_NE(result.first, ValueType()); + ASSERT_EQ(*result.first, i); + ASSERT_EQ(result.second, CacheEntryBase::LookUpStatus::Miss); + } + + //always hit + for (int i = 0; i < capacity; ++i) { + auto result = entry.getOrCreate({i}, builder); + ASSERT_NE(result.first, ValueType()); + ASSERT_EQ(*result.first, i); + ASSERT_EQ(result.second, CacheEntryBase::LookUpStatus::Hit); + } + + //new values displace old ones + for (int i = capacity; i < 2 * capacity; ++i) { + auto result = entry.getOrCreate({i}, builder); + ASSERT_NE(result.first, ValueType()); + ASSERT_EQ(*result.first, i); + ASSERT_EQ(result.second, CacheEntryBase::LookUpStatus::Miss); + } + + //can not hit the old ones + for (int i = 0; i < capacity; ++i) { + auto result = entry.getOrCreate({i}, builder); + ASSERT_NE(result.first, ValueType()); + ASSERT_EQ(*result.first, i); + ASSERT_EQ(result.second, CacheEntryBase::LookUpStatus::Miss); + } +} + +TEST(CacheEntryTests, Empty) { + using testing::_; + using ValueType = std::shared_ptr; + + constexpr size_t capacity = 0; + constexpr size_t attempts = 10; + + mockBuilder builderMock; + EXPECT_CALL(builderMock, build(_)) + .Times(2 * attempts) + .WillRepeatedly([](const IntKey& key){return key.data;}); + + auto builder = [&](const IntKey& key) { return std::make_shared(builderMock.build(key)); }; + + CacheEntry entry(capacity); + + //creating 
so we miss everytime + for (int i = 0; i < attempts; ++i) { + auto result = entry.getOrCreate({i}, builder); + ASSERT_NE(result.first, ValueType()); + ASSERT_EQ(*result.first, i); + ASSERT_EQ(result.second, CacheEntryBase::LookUpStatus::Miss); + } + + //since the capacity is 0 we will always miss + for (int i = 0; i < attempts; ++i) { + auto result = entry.getOrCreate({i}, builder); + ASSERT_NE(result.first, ValueType()); + ASSERT_EQ(*result.first, i); + ASSERT_EQ(result.second, CacheEntryBase::LookUpStatus::Miss); + } +} + +namespace { +struct StringKey { + size_t hash() const { + return std::hash().operator()(data); + } + bool operator==(const StringKey& rhs) const noexcept { + return this->data == rhs.data; + } + + std::string data; +}; +} // namespace + +TEST(MultiCacheTests, GetOrCreate) { + using testing::_; + using IntValueType = std::shared_ptr; + using StrValueType = std::shared_ptr; + + constexpr size_t capacity = 10; + + mockBuilder intBuilderMock; + EXPECT_CALL(intBuilderMock, build(_)) + .Times(3 * capacity) + .WillRepeatedly([](const IntKey& key){return key.data;}); + + mockBuilder strBuilderMock; + EXPECT_CALL(strBuilderMock, build(_)) + .Times(3 * capacity) + .WillRepeatedly([](const StringKey& key){return key.data;}); + + auto intBuilder = [&](const IntKey& key) { return std::make_shared(intBuilderMock.build(key)); }; + auto strBuilder = [&](const StringKey& key) { return std::make_shared(strBuilderMock.build(key)); }; + + MultiCache cache(capacity); + + //creating so we miss everytime + for (int i = 0; i < capacity; ++i) { + auto intResult = cache.getOrCreate(IntKey{i}, intBuilder); + ASSERT_NE(intResult.first, IntValueType()); + ASSERT_EQ(*intResult.first, i); + ASSERT_EQ(intResult.second, CacheEntryBase::LookUpStatus::Miss); + auto strResult = cache.getOrCreate(StringKey{std::to_string(i)}, strBuilder); + ASSERT_NE(strResult.first, StrValueType()); + ASSERT_EQ(*strResult.first, std::to_string(i)); + ASSERT_EQ(strResult.second, 
CacheEntryBase::LookUpStatus::Miss); + } + + //always hit + for (int i = 0; i < capacity; ++i) { + auto intResult = cache.getOrCreate(IntKey{i}, intBuilder); + ASSERT_NE(intResult.first, IntValueType()); + ASSERT_EQ(*intResult.first, i); + ASSERT_EQ(intResult.second, CacheEntryBase::LookUpStatus::Hit); + auto strResult = cache.getOrCreate(StringKey{std::to_string(i)}, strBuilder); + ASSERT_NE(strResult.first, StrValueType()); + ASSERT_EQ(*strResult.first, std::to_string(i)); + ASSERT_EQ(strResult.second, CacheEntryBase::LookUpStatus::Hit); + } + + //new values displace old ones + for (int i = capacity; i < 2 * capacity; ++i) { + auto intResult = cache.getOrCreate(IntKey{i}, intBuilder); + ASSERT_NE(intResult.first, IntValueType()); + ASSERT_EQ(*intResult.first, i); + ASSERT_EQ(intResult.second, CacheEntryBase::LookUpStatus::Miss); + auto strResult = cache.getOrCreate(StringKey{std::to_string(i)}, strBuilder); + ASSERT_NE(strResult.first, StrValueType()); + ASSERT_EQ(*strResult.first, std::to_string(i)); + ASSERT_EQ(strResult.second, CacheEntryBase::LookUpStatus::Miss); + } + + //can not hit the old ones + for (int i = 0; i < capacity; ++i) { + auto intResult = cache.getOrCreate(IntKey{i}, intBuilder); + ASSERT_NE(intResult.first, IntValueType()); + ASSERT_EQ(*intResult.first, i); + ASSERT_EQ(intResult.second, CacheEntryBase::LookUpStatus::Miss); + auto strResult = cache.getOrCreate(StringKey{std::to_string(i)}, strBuilder); + ASSERT_NE(strResult.first, StrValueType()); + ASSERT_EQ(*strResult.first, std::to_string(i)); + ASSERT_EQ(strResult.second, CacheEntryBase::LookUpStatus::Miss); + } +} + +TEST(MultiCacheTests, Empty) { + using testing::_; + using IntValueType = std::shared_ptr; + using StrValueType = std::shared_ptr; + + constexpr size_t capacity = 0; + constexpr size_t attempts = 10; + + mockBuilder intBuilderMock; + EXPECT_CALL(intBuilderMock, build(_)) + .Times(2 * attempts) + .WillRepeatedly([](const IntKey& key){return key.data;}); + + mockBuilder 
strBuilderMock; + EXPECT_CALL(strBuilderMock, build(_)) + .Times(2 * attempts) + .WillRepeatedly([](const StringKey& key){return key.data;}); + + auto intBuilder = [&](const IntKey& key) { return std::make_shared(intBuilderMock.build(key)); }; + auto strBuilder = [&](const StringKey& key) { return std::make_shared(strBuilderMock.build(key)); }; + + MultiCache cache(capacity); + + //creating so we miss everytime + for (int i = 0; i < attempts; ++i) { + auto intResult = cache.getOrCreate(IntKey{i}, intBuilder); + ASSERT_NE(intResult.first, IntValueType()); + ASSERT_EQ(*intResult.first, i); + ASSERT_EQ(intResult.second, CacheEntryBase::LookUpStatus::Miss); + auto strResult = cache.getOrCreate(StringKey{std::to_string(i)}, strBuilder); + ASSERT_NE(strResult.first, StrValueType()); + ASSERT_EQ(*strResult.first, std::to_string(i)); + ASSERT_EQ(strResult.second, CacheEntryBase::LookUpStatus::Miss); + } + + //since the capacity is 0 we will always miss + for (int i = 0; i < attempts; ++i) { + auto intResult = cache.getOrCreate(IntKey{i}, intBuilder); + ASSERT_NE(intResult.first, IntValueType()); + ASSERT_EQ(*intResult.first, i); + ASSERT_EQ(intResult.second, CacheEntryBase::LookUpStatus::Miss); + auto strResult = cache.getOrCreate(StringKey{std::to_string(i)}, strBuilder); + ASSERT_NE(strResult.first, StrValueType()); + ASSERT_EQ(*strResult.first, std::to_string(i)); + ASSERT_EQ(strResult.second, CacheEntryBase::LookUpStatus::Miss); + } +} + +namespace { +class ScopedThread { +public: + explicit ScopedThread(std::thread t) : _t(std::move(t)) { + if (!_t.joinable()) { + std::logic_error("Thread is not joinable!"); + } + } + ~ScopedThread() { + _t.join(); + } + ScopedThread(ScopedThread&& rhs) noexcept = default; +private: + std::thread _t; +}; +}// namespace + + +TEST(MultiCacheTests, SmokeTypeIdSync) { + using IntValueType = std::shared_ptr; + using StrValueType = std::shared_ptr; + + constexpr size_t capacity = 10; + constexpr size_t numThreads = 30; + + auto intBuilder = 
[&](const IntKey& key) { return std::make_shared(key.data); }; + auto strBuilder = [&](const StringKey& key) { return std::make_shared(key.data); }; + + std::vector vecCache(numThreads, MultiCache(capacity)); + + auto testRoutine = [&](MultiCache& cache) { + //creating so we miss everytime + for (int i = 0; i < capacity; ++i) { + auto intResult = cache.getOrCreate(IntKey{i}, intBuilder); + ASSERT_NE(intResult.first, IntValueType()); + ASSERT_EQ(*intResult.first, i); + ASSERT_EQ(intResult.second, CacheEntryBase::LookUpStatus::Miss); + auto strResult = cache.getOrCreate(StringKey{std::to_string(i)}, strBuilder); + ASSERT_NE(strResult.first, StrValueType()); + ASSERT_EQ(*strResult.first, std::to_string(i)); + ASSERT_EQ(strResult.second, CacheEntryBase::LookUpStatus::Miss); + } + + //always hit + for (int i = 0; i < capacity; ++i) { + auto intResult = cache.getOrCreate(IntKey{i}, intBuilder); + ASSERT_NE(intResult.first, IntValueType()); + ASSERT_EQ(*intResult.first, i); + ASSERT_EQ(intResult.second, CacheEntryBase::LookUpStatus::Hit); + auto strResult = cache.getOrCreate(StringKey{std::to_string(i)}, strBuilder); + ASSERT_NE(strResult.first, StrValueType()); + ASSERT_EQ(*strResult.first, std::to_string(i)); + ASSERT_EQ(strResult.second, CacheEntryBase::LookUpStatus::Hit); + } + }; + + std::vector vecThreads; + vecThreads.reserve(numThreads); + for (size_t i = 0; i < numThreads; ++i) { + vecThreads.emplace_back(std::thread(testRoutine, std::ref(vecCache[i]))); + } +} From 8dc7669156ded1aaecab995b1fb16f88b8a864a8 Mon Sep 17 00:00:00 2001 From: Anton Chetverikov Date: Wed, 29 Dec 2021 09:43:52 +0300 Subject: [PATCH 16/78] [MO] Deprecate TensorFlow 1.x environment support by MO (#8970) * Remove tf1 requirements file and add deprecation message then tf1 used while compression * Added tf requirements file * Update docs and warning message * Update install scripts * return tf configuration * Add compat.v1 to code snippets in docs * fix typo * Moved back tf2 file * Apply 
review comments * Fix missed eol * Fix missed eol * Apply review comments in docs * Remove tf version and fix error in script tag * Remove tf version * Remove unused import * Add note about python and TF version --- .../prepare_model/Config_Model_Optimizer.md | 20 +++-------- .../Convert_Model_From_TensorFlow.md | 4 +-- .../Convert_BERT_From_Tensorflow.md | 11 +++--- .../Convert_CRNN_From_Tensorflow.md | 3 +- .../Convert_NCF_From_Tensorflow.md | 6 ++-- .../Convert_Slim_Library_Models.md | 1 + .../Convert_XLNet_From_Tensorflow.md | 34 ++++++++++--------- .../installing-openvino-linux.md | 4 +-- .../installing-openvino-macos.md | 4 +-- .../installing-openvino-windows.md | 4 +-- docs/install_guides/pypi-openvino-dev.md | 1 + .../tools/mo/utils/versions_checker.py | 11 +++--- tools/mo/requirements_tf.txt | 5 ++- 13 files changed, 50 insertions(+), 58 deletions(-) diff --git a/docs/MO_DG/prepare_model/Config_Model_Optimizer.md b/docs/MO_DG/prepare_model/Config_Model_Optimizer.md index f2be1fee3cc..6c6b26177b7 100644 --- a/docs/MO_DG/prepare_model/Config_Model_Optimizer.md +++ b/docs/MO_DG/prepare_model/Config_Model_Optimizer.md @@ -375,22 +375,14 @@ install_prerequisites_caffe.sh ``` install_prerequisites_caffe.bat ``` -* For TensorFlow\* 1.x on Linux: +* For TensorFlow\* on Linux: ``` install_prerequisites_tf.sh ``` -* For TensorFlow 1.x on Windows: +* For TensorFlow on Windows: ``` install_prerequisites_tf.bat ``` -* For TensorFlow\* 2.x on Linux: -``` -install_prerequisites_tf2.sh -``` -* For TensorFlow 2.x on Windows: -``` -install_prerequisites_tf2.bat -``` * For MXNet\* on Linux: ``` install_prerequisites_mxnet.sh @@ -453,19 +445,15 @@ virtualenv -p /usr/bin/python3.6 .env3/bin/activate ```shell pip3 install -r requirements.txt ``` -> **NOTE**: TensorFlow 1.x and 2.x are incompatible. Use separate virtual environments if you want to install multiple TensorFlow versions. +> **NOTE**: Support of MO in TensorFlow 1.x environment is deprecated. 
Use TensorFlow 2.x environment to convert both TensorFlow 1.x and 2.x models. Use separate virtual environments if you want to install multiple TensorFlow versions. * To install dependencies only for Caffe: ```shell pip3 install -r requirements_caffe.txt ``` - * To install dependencies only for TensorFlow 1.x: + * To install dependencies only for TensorFlow: ```shell pip3 install -r requirements_tf.txt -``` - * To install dependencies only for TensorFlow 2.x: -```shell -pip3 install -r requirements_tf2.txt ``` * To install dependencies only for MXNet: ```shell diff --git a/docs/MO_DG/prepare_model/convert_model/Convert_Model_From_TensorFlow.md b/docs/MO_DG/prepare_model/convert_model/Convert_Model_From_TensorFlow.md index 703a106768a..6273f2bfa37 100644 --- a/docs/MO_DG/prepare_model/convert_model/Convert_Model_From_TensorFlow.md +++ b/docs/MO_DG/prepare_model/convert_model/Convert_Model_From_TensorFlow.md @@ -256,7 +256,7 @@ The graph is frozen and dumped to a file with the following code: ```python import tensorflow as tf from tensorflow.python.framework import graph_io -frozen = tf.graph_util.convert_variables_to_constants(sess, sess.graph_def, ["name_of_the_output_node"]) +frozen = tf.compat.v1.graph_util.convert_variables_to_constants(sess, sess.graph_def, ["name_of_the_output_node"]) graph_io.write_graph(frozen, './', 'inference_graph.pb', as_text=False) ``` @@ -360,7 +360,7 @@ TensorFlow*-specific parameters: ## Convert TensorFlow* 2 Models -In order to convert TensorFlow* 2 models, installation of dependencies from `requirements_tf2.txt` is required. +In order to convert TensorFlow* 2 models, installation of dependencies from `requirements_tf.txt` is required. TensorFlow* 2.X officially supports two model formats: SavedModel and Keras H5 (or HDF5). Below are the instructions on how to convert each of them. 
diff --git a/docs/MO_DG/prepare_model/convert_model/tf_specific/Convert_BERT_From_Tensorflow.md b/docs/MO_DG/prepare_model/convert_model/tf_specific/Convert_BERT_From_Tensorflow.md index 4382e90b78c..ebef0028dc5 100644 --- a/docs/MO_DG/prepare_model/convert_model/tf_specific/Convert_BERT_From_Tensorflow.md +++ b/docs/MO_DG/prepare_model/convert_model/tf_specific/Convert_BERT_From_Tensorflow.md @@ -75,13 +75,14 @@ python3 download_glue_data.py --tasks MRPC 8. Open the file `run_classifier.py` and insert the following code after the line 645: ```python import os, sys + import tensorflow as tf from tensorflow.python.framework import graph_io - with tf.Session(graph=tf.get_default_graph()) as sess: + with tf.compat.v1.Session(graph=tf.compat.v1.get_default_graph()) as sess: (assignment_map, initialized_variable_names) = \ - modeling.get_assignment_map_from_checkpoint(tf.trainable_variables(), init_checkpoint) - tf.train.init_from_checkpoint(init_checkpoint, assignment_map) - sess.run(tf.global_variables_initializer()) - frozen = tf.graph_util.convert_variables_to_constants(sess, sess.graph_def, ["bert/pooler/dense/Tanh"]) + modeling.get_assignment_map_from_checkpoint(tf.compat.v1.trainable_variables(), init_checkpoint) + tf.compat.v1.train.init_from_checkpoint(init_checkpoint, assignment_map) + sess.run(tf.compat.v1.global_variables_initializer()) + frozen = tf.compat.v1.graph_util.convert_variables_to_constants(sess, sess.graph_def, ["bert/pooler/dense/Tanh"]) graph_io.write_graph(frozen, './', 'inference_graph.pb', as_text=False) print('BERT frozen model path {}'.format(os.path.join(os.path.dirname(__file__), 'inference_graph.pb'))) sys.exit(0) diff --git a/docs/MO_DG/prepare_model/convert_model/tf_specific/Convert_CRNN_From_Tensorflow.md b/docs/MO_DG/prepare_model/convert_model/tf_specific/Convert_CRNN_From_Tensorflow.md index cd0e6c8863f..afd098933d9 100644 --- a/docs/MO_DG/prepare_model/convert_model/tf_specific/Convert_CRNN_From_Tensorflow.md +++ 
b/docs/MO_DG/prepare_model/convert_model/tf_specific/Convert_CRNN_From_Tensorflow.md @@ -33,8 +33,9 @@ export PYTHONPATH="${PYTHONPATH}:/path/to/CRNN_Tensorflow/" * For Windows\* OS add `/path/to/CRNN_Tensorflow/` to the `PYTHONPATH` environment variable in settings. 3. Open the `tools/test_shadownet.py` script. After `saver.restore(sess=sess, save_path=weights_path)` line, add the following code: ```python +import tensorflow as tf from tensorflow.python.framework import graph_io -frozen = tf.graph_util.convert_variables_to_constants(sess, sess.graph_def, ['shadow/LSTMLayers/transpose_time_major']) +frozen = tf.compat.v1.graph_util.convert_variables_to_constants(sess, sess.graph_def, ['shadow/LSTMLayers/transpose_time_major']) graph_io.write_graph(frozen, '.', 'frozen_graph.pb', as_text=False) ``` 4. Run the demo with the following command: diff --git a/docs/MO_DG/prepare_model/convert_model/tf_specific/Convert_NCF_From_Tensorflow.md b/docs/MO_DG/prepare_model/convert_model/tf_specific/Convert_NCF_From_Tensorflow.md index e5b126135ab..7fd53b5effa 100644 --- a/docs/MO_DG/prepare_model/convert_model/tf_specific/Convert_NCF_From_Tensorflow.md +++ b/docs/MO_DG/prepare_model/convert_model/tf_specific/Convert_NCF_From_Tensorflow.md @@ -12,11 +12,11 @@ Run the following commands: import tensorflow as tf from tensorflow.python.framework import graph_io -sess = tf.Session() -saver = tf.train.import_meta_graph("/path/to/model/model.meta") +sess = tf.compat.v1.Session() +saver = tf.compat.v1.train.import_meta_graph("/path/to/model/model.meta") saver.restore(sess, tf.train.latest_checkpoint('/path/to/model/')) -frozen = tf.graph_util.convert_variables_to_constants(sess, sess.graph_def, \ +frozen = tf.compat.v1.graph_util.convert_variables_to_constants(sess, sess.graph_def, \ ["rating/BiasAdd"]) graph_io.write_graph(frozen, './', 'inference_graph.pb', as_text=False) ``` diff --git a/docs/MO_DG/prepare_model/convert_model/tf_specific/Convert_Slim_Library_Models.md 
b/docs/MO_DG/prepare_model/convert_model/tf_specific/Convert_Slim_Library_Models.md index 12acf9c00da..518fe816893 100644 --- a/docs/MO_DG/prepare_model/convert_model/tf_specific/Convert_Slim_Library_Models.md +++ b/docs/MO_DG/prepare_model/convert_model/tf_specific/Convert_Slim_Library_Models.md @@ -72,6 +72,7 @@ The [inception_preprocessing.py](https://github.com/tensorflow/models/blob/maste ```python3 ... + import tensorflow as tf if image.dtype != tf.float32: image = tf.image.convert_image_dtype(image, dtype=tf.float32) ... diff --git a/docs/MO_DG/prepare_model/convert_model/tf_specific/Convert_XLNet_From_Tensorflow.md b/docs/MO_DG/prepare_model/convert_model/tf_specific/Convert_XLNet_From_Tensorflow.md index cb05866d6a6..f87e8385dae 100644 --- a/docs/MO_DG/prepare_model/convert_model/tf_specific/Convert_XLNet_From_Tensorflow.md +++ b/docs/MO_DG/prepare_model/convert_model/tf_specific/Convert_XLNet_From_Tensorflow.md @@ -37,6 +37,8 @@ To get pb-file from the archive contents, you need to do the following. 2. Save and run the following Python script in `~/XLNet-Base/xlnet`: +**Note** The original model repository has been tested with TensorFlow 1.13.1 under Python2. 
+ ```python from collections import namedtuple @@ -59,28 +61,28 @@ xlnet_config = xlnet.XLNetConfig(json_path=XLNET_CONFIG_PATH) run_config = xlnet.RunConfig(is_training=False, use_tpu=False, use_bfloat16=False, dropout=0.1, dropatt=0.1,) -sentence_features_input_idx = tf.placeholder(tf.int32, shape=[LENGTHS, BATCH], name='input_ids') -sentence_features_segment_ids = tf.placeholder(tf.int32, shape=[LENGTHS, BATCH], name='seg_ids') -sentence_features_input_mask = tf.placeholder(tf.float32, shape=[LENGTHS, BATCH], name='input_mask') +sentence_features_input_idx = tf.compat.v1.placeholder(tf.int32, shape=[LENGTHS, BATCH], name='input_ids') +sentence_features_segment_ids = tf.compat.v1.placeholder(tf.int32, shape=[LENGTHS, BATCH], name='seg_ids') +sentence_features_input_mask = tf.compat.v1.placeholder(tf.float32, shape=[LENGTHS, BATCH], name='input_mask') -with tf.Session() as sess: +with tf.compat.v1.Session() as sess: xlnet_model = xlnet.XLNetModel(xlnet_config=xlnet_config, run_config=run_config, input_ids=sentence_features_input_idx, seg_ids=sentence_features_segment_ids, input_mask=sentence_features_input_mask) - sess.run(tf.global_variables_initializer()) + sess.run(tf.compat.v1.global_variables_initializer()) model_utils.init_from_checkpoint(FLAGS, True) # Save the variables to disk. 
- saver = tf.train.Saver() + saver = tf.compat.v1.train.Saver() # Saving checkpoint save_path = saver.save(sess, OUTPUT_DIR + "model.ckpt") # Freezing model outputs = ['model/transformer/dropout_2/Identity'] - graph_def_freezed = tf.graph_util.convert_variables_to_constants(sess, sess.graph.as_graph_def(), outputs) + graph_def_freezed = tf.compat.v1.graph_util.convert_variables_to_constants(sess, sess.graph.as_graph_def(), outputs) # Saving non-frozen and frozen model to pb graph_io.write_graph(sess.graph.as_graph_def(), OUTPUT_DIR, 'model.pb', as_text=False) @@ -88,7 +90,7 @@ with tf.Session() as sess: as_text=False) # Write to tensorboard - with tf.summary.FileWriter(logdir=OUTPUT_DIR, graph_def=graph_def_freezed) as writer: + with tf.compat.v1.summary.FileWriter(logdir=OUTPUT_DIR, graph_def=graph_def_freezed) as writer: writer.flush() ``` @@ -143,28 +145,28 @@ xlnet_config = xlnet.XLNetConfig(json_path=XLNET_CONFIG_PATH) run_config = xlnet.RunConfig(is_training=False, use_tpu=False, use_bfloat16=False, dropout=0.1, dropatt=0.1,) -sentence_features_input_idx = tf.placeholder(tf.int32, shape=[LENGTHS, BATCH], name='input_ids') -sentence_features_segment_ids = tf.placeholder(tf.int32, shape=[LENGTHS, BATCH], name='seg_ids') -sentence_features_input_mask = tf.placeholder(tf.float32, shape=[LENGTHS, BATCH], name='input_mask') +sentence_features_input_idx = tf.compat.v1.placeholder(tf.int32, shape=[LENGTHS, BATCH], name='input_ids') +sentence_features_segment_ids = tf.compat.v1.placeholder(tf.int32, shape=[LENGTHS, BATCH], name='seg_ids') +sentence_features_input_mask = tf.compat.v1.placeholder(tf.float32, shape=[LENGTHS, BATCH], name='input_mask') -with tf.Session() as sess: +with tf.compat.v1.Session() as sess: xlnet_model = xlnet.XLNetModel(xlnet_config=xlnet_config, run_config=run_config, input_ids=sentence_features_input_idx, seg_ids=sentence_features_segment_ids, input_mask=sentence_features_input_mask) - sess.run(tf.global_variables_initializer()) + 
sess.run(tf.compat.v1.global_variables_initializer()) model_utils.init_from_checkpoint(FLAGS, True) # Save the variables to disk. - saver = tf.train.Saver() + saver = tf.compat.v1.train.Saver() # Saving checkpoint save_path = saver.save(sess, OUTPUT_DIR + "model.ckpt") # Freezing model outputs = ['model/transformer/dropout_2/Identity'] - graph_def_freezed = tf.graph_util.convert_variables_to_constants(sess, sess.graph.as_graph_def(), outputs) + graph_def_freezed = tf.compat.v1.graph_util.convert_variables_to_constants(sess, sess.graph.as_graph_def(), outputs) # Saving non-frozen and frozen model to pb graph_io.write_graph(sess.graph.as_graph_def(), OUTPUT_DIR, 'model.pb', as_text=False) @@ -172,7 +174,7 @@ with tf.Session() as sess: as_text=False) # Write to tensorboard - with tf.summary.FileWriter(logdir=OUTPUT_DIR, graph_def=graph_def_freezed) as writer: + with tf.compat.v1.summary.FileWriter(logdir=OUTPUT_DIR, graph_def=graph_def_freezed) as writer: writer.flush() ``` diff --git a/docs/install_guides/installing-openvino-linux.md b/docs/install_guides/installing-openvino-linux.md index f7541989984..496027cd73b 100644 --- a/docs/install_guides/installing-openvino-linux.md +++ b/docs/install_guides/installing-openvino-linux.md @@ -202,12 +202,12 @@ For more information about the Model Optimizer, refer to the [Model Optimizer De cd /opt/intel/openvino_2022/tools/model_optimizer/install_prerequisites ``` -2. Run the script to configure the Model Optimizer for Caffe, TensorFlow 2.x, MXNet, Kaldi, and ONNX: +2. Run the script to configure the Model Optimizer for Caffe, TensorFlow, MXNet, Kaldi, and ONNX: ```sh sudo ./install_prerequisites.sh ``` -3. **Optional:** You can choose to install Model Optimizer support for only certain frameworks. In the same directory are individual scripts for Caffe, TensorFlow 1.x, TensorFlow 2.x, MXNet, Kaldi, and ONNX (`install_prerequisites_caffe.sh`, etc.). If you see error messages, make sure you installed all dependencies. +3. 
**Optional:** You can choose to install Model Optimizer support for only certain frameworks. In the same directory are individual scripts for Caffe, TensorFlow, MXNet, Kaldi, and ONNX (`install_prerequisites_caffe.sh`, etc.). If you see error messages, make sure you installed all dependencies. The Model Optimizer is configured for one or more frameworks. diff --git a/docs/install_guides/installing-openvino-macos.md b/docs/install_guides/installing-openvino-macos.md index 269f2307ba9..59cf7a89061 100644 --- a/docs/install_guides/installing-openvino-macos.md +++ b/docs/install_guides/installing-openvino-macos.md @@ -154,12 +154,12 @@ If you see error messages, verify that you installed all dependencies listed und cd /opt/intel/openvino_2022/tools/model_optimizer/install_prerequisites ``` -2. Run the script to configure the Model Optimizer for Caffe, TensorFlow 2.x, MXNet, Kaldi\*, and ONNX: +2. Run the script to configure the Model Optimizer for Caffe, TensorFlow, MXNet, Kaldi\*, and ONNX: ```sh sudo ./install_prerequisites.sh ``` -3. **Optional:** You can choose to install Model Optimizer support for only certain frameworks. In the same directory are individual scripts for Caffe, TensorFlow 1.x, TensorFlow 2.x, MXNet, Kaldi, and ONNX (`install_prerequisites_caffe.sh`, etc.). If you see error messages, make sure you installed all dependencies. +3. **Optional:** You can choose to install Model Optimizer support for only certain frameworks. In the same directory are individual scripts for Caffe, TensorFlow, MXNet, Kaldi, and ONNX (`install_prerequisites_caffe.sh`, etc.). If you see error messages, make sure you installed all dependencies. The Model Optimizer is configured for one or more frameworks. 
diff --git a/docs/install_guides/installing-openvino-windows.md b/docs/install_guides/installing-openvino-windows.md index 6e1d69c7e1f..44f28b62cd6 100644 --- a/docs/install_guides/installing-openvino-windows.md +++ b/docs/install_guides/installing-openvino-windows.md @@ -139,12 +139,12 @@ Type commands in the opened window: cd C:\Program Files (x86)\Intel\openvino_2022\tools\model_optimizer\install_prerequisites ``` -3. Run this batch file to configure the Model Optimizer for Caffe, TensorFlow 2.x, MXNet, Kaldi\*, and ONNX:
+3. Run this batch file to configure the Model Optimizer for Caffe, TensorFlow, MXNet, Kaldi\*, and ONNX:
```sh install_prerequisites.bat ``` -3. **Optional:** You can choose to install Model Optimizer support for only certain frameworks. In the same directory are individual batch files for Caffe, TensorFlow 1.x, TensorFlow 2.x, MXNet, Kaldi, and ONNX (`install_prerequisites_caffe.bat`, etc.). If you see error messages, make sure you installed all dependencies. +3. **Optional:** You can choose to install Model Optimizer support for only certain frameworks. In the same directory are individual batch files for Caffe, TensorFlow, MXNet, Kaldi, and ONNX (`install_prerequisites_caffe.bat`, etc.). If you see error messages, make sure you installed all dependencies. The Model Optimizer is configured for one or more frameworks. diff --git a/docs/install_guides/pypi-openvino-dev.md b/docs/install_guides/pypi-openvino-dev.md index cc594364998..69e5ddb035b 100644 --- a/docs/install_guides/pypi-openvino-dev.md +++ b/docs/install_guides/pypi-openvino-dev.md @@ -97,6 +97,7 @@ For example, to install and configure the components for working with TensorFlow ```sh pip install openvino-dev[tensorflow2,mxnet,caffe] ``` +**NOTE**: Support of MO in TensorFlow 1.x environment is deprecated. Use TensorFlow 2.x environment to convert both TensorFlow 1.x and 2.x models ### Step 5. 
Verify that the Package Is Installed diff --git a/tools/mo/openvino/tools/mo/utils/versions_checker.py b/tools/mo/openvino/tools/mo/utils/versions_checker.py index c36ba01253c..3fae34cba43 100644 --- a/tools/mo/openvino/tools/mo/utils/versions_checker.py +++ b/tools/mo/openvino/tools/mo/utils/versions_checker.py @@ -251,16 +251,15 @@ def check_requirements(framework=None): :param framework: framework name :return: exit code (0 - execution successful, 1 - error) """ + framework_suffix = "_{}".format(framework) env_setup = get_environment_setup(framework) if framework is None: framework_suffix = "" elif framework == "tf": - if "tensorflow" in env_setup and env_setup["tensorflow"] >= LooseVersion("2.0.0"): - framework_suffix = "_tf2" - else: - framework_suffix = "_tf" - else: - framework_suffix = "_{}".format(framework) + if "tensorflow" in env_setup and env_setup["tensorflow"] < LooseVersion("2.0.0"): + log.error('\t\nSupport of the Model Optimizer tool in TensorFlow 1.x environment is deprecated.' 
+ 'It is highly recommended to use TensorFlow 2.x.\n', + extra={'is_warning': True}) file_name = "requirements{}.txt".format(framework_suffix) diff --git a/tools/mo/requirements_tf.txt b/tools/mo/requirements_tf.txt index 11eee1b8af1..d7514869e39 100644 --- a/tools/mo/requirements_tf.txt +++ b/tools/mo/requirements_tf.txt @@ -1,7 +1,6 @@ -# TensorFlow 1.x and 2.x are incompatible, use separate virtual environments for each version -tensorflow~=1.15.5 +tensorflow~=2.5.0 networkx~=2.5 -numpy>=1.16.6,<1.19 +numpy>=1.16.6,<1.20 defusedxml>=0.7.1 requests>=2.25.1 fastjsonschema~=2.15.1 From 8b6a065948d19808b152afa512903bac86db09cf Mon Sep 17 00:00:00 2001 From: Yegor Kruglov Date: Wed, 29 Dec 2021 09:44:10 +0300 Subject: [PATCH 17/78] [MO][IR READER] Updated MO and IR Reader to keep the second MaxPool output in restored IR (#9180) * updated MO to save second MaxPool output in restored IR * blank line at the end of file --- tools/mo/openvino/tools/mo/back/MaxPool.py | 6 +++++- tools/mo/openvino/tools/mo/ops/pooling.py | 18 +++++++++++------- .../ir_reader/extenders/pooling_extender.py | 2 +- .../tools/mo/utils/ir_reader/layer_to_class.py | 2 ++ 4 files changed, 19 insertions(+), 9 deletions(-) diff --git a/tools/mo/openvino/tools/mo/back/MaxPool.py b/tools/mo/openvino/tools/mo/back/MaxPool.py index 67ef9589ef8..c5c666040e2 100644 --- a/tools/mo/openvino/tools/mo/back/MaxPool.py +++ b/tools/mo/openvino/tools/mo/back/MaxPool.py @@ -2,7 +2,7 @@ # SPDX-License-Identifier: Apache-2.0 from openvino.tools.mo.back.FakeOutputResolver import FakeOutputResolver from openvino.tools.mo.back.replacement import BackReplacementPattern -from openvino.tools.mo.graph.graph import Graph +from openvino.tools.mo.graph.graph import Graph, Node from openvino.tools.mo.ops.result import Result @@ -31,6 +31,10 @@ class MaxPool(BackReplacementPattern): del node['exclude_pad'] # adding missed outputs for MaxPool node + MaxPool.normalize_outputs(node) + + @staticmethod + def normalize_outputs(node: 
Node): if node.out_port(0).disconnected(): output = Result(node.graph, {'name': node.name + '/Result_port_0/', 'keep_output_port': node.has_and_set('remove_values_output')}).create_node() diff --git a/tools/mo/openvino/tools/mo/ops/pooling.py b/tools/mo/openvino/tools/mo/ops/pooling.py index 0e7ccd50e61..f1d8b130337 100644 --- a/tools/mo/openvino/tools/mo/ops/pooling.py +++ b/tools/mo/openvino/tools/mo/ops/pooling.py @@ -82,7 +82,7 @@ class Pooling(Op): }, attrs) def backend_attrs(self): - return [ + backend_attrs_list = [ ('strides', lambda node: ','.join(map(str, node['stride'][node.spatial_dims]))), ('kernel', lambda node: ','.join(map(str, node['window'][node.spatial_dims]))), @@ -92,14 +92,18 @@ class Pooling(Op): ('exclude-pad', lambda node: bool_to_str(node, 'exclude_pad')), 'rounding_type', - ('auto_pad', lambda node: node.auto_pad if node.has_valid('auto_pad') else 'explicit'), - - ('dilations', lambda node: ','.join(map(str, node['dilation'][node.spatial_dims]))), - 'axis', - - ('index_element_type', lambda node: np_data_type_to_destination_type(node.index_element_type)) + ('auto_pad', lambda node: node.auto_pad if node.has_valid('auto_pad') else 'explicit') ] + if self.attrs.get('pool_method') == 'avg': + return backend_attrs_list + else: + return backend_attrs_list + [ + ('dilations', lambda node: ','.join(map(str, node['dilation'][node.spatial_dims]))), + 'axis', + ('index_element_type', lambda node: np_data_type_to_destination_type(node.index_element_type)) + ] + @staticmethod def infer(node: Node): assert (len(node.in_nodes()) == 1), 'MaxPool node {} from must have only one input but instead got ' \ diff --git a/tools/mo/openvino/tools/mo/utils/ir_reader/extenders/pooling_extender.py b/tools/mo/openvino/tools/mo/utils/ir_reader/extenders/pooling_extender.py index 991b9bffa1b..0e583632d97 100644 --- a/tools/mo/openvino/tools/mo/utils/ir_reader/extenders/pooling_extender.py +++ b/tools/mo/openvino/tools/mo/utils/ir_reader/extenders/pooling_extender.py 
@@ -43,7 +43,7 @@ def common_pool_extender(op: Node): op['batch_dims'] = int64_array([0]), op['channel_dims'] = int64_array([1]), - op['pool_method'] = 'max' if op.type is 'MaxPool' else 'avg' + op['pool_method'] = 'max' if op.type == 'MaxPool' else 'avg' dim = len(op.pads_begin) diff --git a/tools/mo/openvino/tools/mo/utils/ir_reader/layer_to_class.py b/tools/mo/openvino/tools/mo/utils/ir_reader/layer_to_class.py index 90a31257134..be9dfbe6a2c 100644 --- a/tools/mo/openvino/tools/mo/utils/ir_reader/layer_to_class.py +++ b/tools/mo/openvino/tools/mo/utils/ir_reader/layer_to_class.py @@ -6,6 +6,7 @@ import os import numpy as np +from openvino.tools.mo.back.MaxPool import MaxPool from openvino.tools.mo.back.TopKNormalizer import TopKNormalizer from openvino.tools.mo.ops.Cast import Cast from openvino.tools.mo.ops.ReduceOps import ReduceOp @@ -271,6 +272,7 @@ preprocessing_op_nodes = { postprocessing_op_nodes = { 'TensorIterator': ti_add_edge_attrs, 'TopK': TopKNormalizer.normalize_outputs, + 'MaxPool': MaxPool.normalize_outputs, } From 9f69daf0f32961de219f41ca9777eac6e08baaf0 Mon Sep 17 00:00:00 2001 From: Roman Lyamin Date: Wed, 29 Dec 2021 09:58:48 +0300 Subject: [PATCH 18/78] [GPU] Remove batched key from config (#9451) --- src/inference/include/ie/gpu/gpu_config.hpp | 3 +-- .../plugin/gpu/remote_blob_tests/gpu_remote_tensor_tests.cpp | 2 +- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/src/inference/include/ie/gpu/gpu_config.hpp b/src/inference/include/ie/gpu/gpu_config.hpp index 0534f211941..190cd93c6e0 100644 --- a/src/inference/include/ie/gpu/gpu_config.hpp +++ b/src/inference/include/ie/gpu/gpu_config.hpp @@ -156,11 +156,10 @@ DECLARE_GPU_CONFIG_KEY(MAX_NUM_THREADS); DECLARE_GPU_CONFIG_KEY(ENABLE_LOOP_UNROLLING); /** - * @brief This keys instructs the GPU plugin to use surface/buffer and batched memory type. + * @brief These keys instruct the GPU plugin to use surface/buffer memory type. 
*/ DECLARE_GPU_CONFIG_KEY(SURFACE); DECLARE_GPU_CONFIG_KEY(BUFFER); -DECLARE_GPU_CONFIG_KEY(BATCHED); } // namespace GPUConfigParams diff --git a/src/tests/functional/plugin/gpu/remote_blob_tests/gpu_remote_tensor_tests.cpp b/src/tests/functional/plugin/gpu/remote_blob_tests/gpu_remote_tensor_tests.cpp index bcb7ab46dda..cf3fc5772e8 100644 --- a/src/tests/functional/plugin/gpu/remote_blob_tests/gpu_remote_tensor_tests.cpp +++ b/src/tests/functional/plugin/gpu/remote_blob_tests/gpu_remote_tensor_tests.cpp @@ -782,7 +782,7 @@ TEST_P(OVRemoteTensorBatched_Test, NV12toBGR_image) { auto p = PrePostProcessor(fn_ptr_remote); p.input().tensor().set_element_type(ov::element::u8) .set_color_format(ov::preprocess::ColorFormat::NV12_TWO_PLANES, {"y", "uv"}) - .set_memory_type(std::string(GPU_CONFIG_KEY(SURFACE)) + GPU_CONFIG_KEY(BATCHED)); + .set_memory_type(GPU_CONFIG_KEY(SURFACE)); p.input().preprocess().convert_color(ov::preprocess::ColorFormat::BGR); p.input().model().set_layout("NCHW"); auto function = p.build(); From 3ee00e018a66ffb452a24a60f2c291cd2da43a5e Mon Sep 17 00:00:00 2001 From: Vladimir Paramuzov Date: Wed, 29 Dec 2021 11:06:14 +0300 Subject: [PATCH 19/78] [GPU] Moved onednn_gpu to plugin folder (#9458) --- .gitmodules | 2 +- CODEOWNERS | 3 +- scripts/CMakeLists.txt | 1 + src/plugins/intel_gpu/CMakeLists.txt | 2 +- .../intel_gpu/thirdparty/CMakeLists.txt | 60 +++++++++++++++++++ .../plugins/intel_gpu/thirdparty}/onednn_gpu | 0 thirdparty/CMakeLists.txt | 60 ------------------- 7 files changed, 64 insertions(+), 64 deletions(-) rename {thirdparty => src/plugins/intel_gpu/thirdparty}/onednn_gpu (100%) diff --git a/.gitmodules b/.gitmodules index 4c0f0a395b7..b279deeb065 100644 --- a/.gitmodules +++ b/.gitmodules @@ -54,7 +54,7 @@ path = cmake/developer_package/ncc_naming_style/ncc url = https://github.com/nithinn/ncc.git [submodule "thirdparty/onednn_gpu"] - path = thirdparty/onednn_gpu + path = src/plugins/intel_gpu/thirdparty/onednn_gpu url = 
https://github.com/oneapi-src/oneDNN.git [submodule "tools/pot/thirdparty/open_model_zoo"] path = thirdparty/open_model_zoo diff --git a/CODEOWNERS b/CODEOWNERS index da0b7e9b25f..1453cb29df3 100644 --- a/CODEOWNERS +++ b/CODEOWNERS @@ -45,8 +45,7 @@ Jenkinsfile @openvinotoolkit/openvino-admins # IE GPU: /src/inference/include/ie/gpu/ @openvinotoolkit/openvino-ie-gpu-maintainers @openvinotoolkit/openvino-ie-gpu-developers /src/inference/include/ie/cldnn/ @openvinotoolkit/openvino-ie-gpu-maintainers @openvinotoolkit/openvino-ie-gpu-developers -/src/inference/include/openvino/runtime/gpu/ @openvinotoolkit/openvino-ie-gpu-maintainers @openvinotoolkit/openvino-ie-gpu-developers -/inference-engine/thirdparty/clDNN/ @openvinotoolkit/openvino-ie-gpu-maintainers @openvinotoolkit/openvino-ie-gpu-developers +/src/inference/include/openvino/runtime/intel_gpu/ @openvinotoolkit/openvino-ie-gpu-maintainers @openvinotoolkit/openvino-ie-gpu-developers /src/plugins/intel_gpu/ @openvinotoolkit/openvino-ie-gpu-maintainers @openvinotoolkit/openvino-ie-gpu-developers # IE VPU: diff --git a/scripts/CMakeLists.txt b/scripts/CMakeLists.txt index 28a0ac4dbde..f42d82bd9bd 100644 --- a/scripts/CMakeLists.txt +++ b/scripts/CMakeLists.txt @@ -11,6 +11,7 @@ ie_shellcheck_process(DIRECTORY "${OpenVINO_SOURCE_DIR}" "${OpenVINO_SOURCE_DIR}/build" "${OpenVINO_SOURCE_DIR}/thirdparty" "${OpenVINO_SOURCE_DIR}/src/plugins/intel_cpu/thirdparty" + "${OpenVINO_SOURCE_DIR}/src/plugins/intel_gpu/thirdparty" "${OpenVINO_SOURCE_DIR}/src/bindings/python/thirdparty/pybind11" "${IE_MAIN_SOURCE_DIR}/thirdparty" "${OpenVINO_SOURCE_DIR}/tools/pot/thirdparty" diff --git a/src/plugins/intel_gpu/CMakeLists.txt b/src/plugins/intel_gpu/CMakeLists.txt index 97d913c9573..dec82574254 100644 --- a/src/plugins/intel_gpu/CMakeLists.txt +++ b/src/plugins/intel_gpu/CMakeLists.txt @@ -19,10 +19,10 @@ endif() set(MAIN_DIR "${CMAKE_CURRENT_SOURCE_DIR}") set(INCLUDE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/include") 
+add_subdirectory(thirdparty) add_subdirectory(src/runtime) add_subdirectory(src/kernel_selector) add_subdirectory(src/graph) -add_subdirectory(thirdparty) file(GLOB_RECURSE PLUGIN_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/src/plugin/*.cpp ${CMAKE_CURRENT_SOURCE_DIR}/include/intel_gpu/plugin/*.hpp) diff --git a/src/plugins/intel_gpu/thirdparty/CMakeLists.txt b/src/plugins/intel_gpu/thirdparty/CMakeLists.txt index fc973d83056..864177b417d 100644 --- a/src/plugins/intel_gpu/thirdparty/CMakeLists.txt +++ b/src/plugins/intel_gpu/thirdparty/CMakeLists.txt @@ -9,3 +9,63 @@ set_target_properties(rapidjson PROPERTIES ) ov_install_static_lib(rapidjson gpu) + +# +# oneDNN for GPU plugin +# + +if(ENABLE_ONEDNN_FOR_GPU) + function(build_onednn_gpu) + include(ExternalProject) + set(ONEDNN_BUILD_DIR "${CMAKE_CURRENT_BINARY_DIR}/onednn_gpu_build/") + set(ONEDNN_INSTALL_DIR "${CMAKE_CURRENT_BINARY_DIR}/onednn_gpu_install/") + set(ONEDNN_PREFIX_DIR "${CMAKE_CURRENT_BINARY_DIR}/onednn_gpu_root") + if(CMAKE_COMPILER_IS_GNUCXX) + ie_add_compiler_flags(-Wno-undef -Wno-suggest-override) + endif() + ExternalProject_Add(onednn_gpu_build + SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/onednn_gpu" + BINARY_DIR "${ONEDNN_BUILD_DIR}" + INSTALL_DIR "${ONEDNN_INSTALL_DIR}" + PREFIX "${ONEDNN_PREFIX_DIR}" + EXCLUDE_FROM_ALL ON + CMAKE_ARGS + "-DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}" + "-DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}" + "-DCMAKE_CXX_COMPILER_LAUNCHER=${CMAKE_CXX_COMPILER_LAUNCHER}" + "-DCMAKE_C_COMPILER_LAUNCHER=${CMAKE_C_COMPILER_LAUNCHER}" + "-DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}" + "-DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}" + "-DCMAKE_INTERPROCEDURAL_OPTIMIZATION_RELEASE=${ENABLE_LTO}" + "-DCMAKE_POLICY_DEFAULT_CMP0069=NEW" + "-DCMAKE_MSVC_RUNTIME_LIBRARY=${CMAKE_MSVC_RUNTIME_LIBRARY}" + "-DDNNL_CPU_RUNTIME=NONE" + "-DDNNL_GPU_RUNTIME=OCL" + "-DDNNL_LIBRARY_NAME=onednn_gpu" + "-DCMAKE_INSTALL_PREFIX=${ONEDNN_INSTALL_DIR}" + "-DCMAKE_INSTALL_LIBDIR=lib/$" + "-DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}" + 
"-DDNNL_ENABLE_CONCURRENT_EXEC=ON" + "-DDNNL_ENABLE_PRIMITIVE_CACHE=OFF" + "-DDNNL_ENABLE_JIT_PROFILING=${BUILD_SHARED_LIBS}" + "-DDNNL_ENABLE_ITT_TASKS=${BUILD_SHARED_LIBS}" + "-DDNNL_BUILD_TESTS=OFF" + "-DDNNL_BUILD_EXAMPLES=OFF" + "-DDNNL_BLAS_VENDOR=NONE" + "-DDNNL_LIBRARY_TYPE=STATIC" + "-DOpenCL_LIBRARY=${OpenCL_LIBRARY}" + "-DOpenCL_INCLUDE_DIR=${OpenCL_INCLUDE_DIR}" + "-DOPENCL_VERSION_2_2=${OPENCL_VERSION_2_2}" + ) + add_library(onednn_gpu_tgt INTERFACE) + set_target_properties(onednn_gpu_tgt PROPERTIES + INTERFACE_LINK_DIRECTORIES "${ONEDNN_INSTALL_DIR}/lib/$" + INTERFACE_LINK_LIBRARIES "onednn_gpu" + INTERFACE_INCLUDE_DIRECTORIES "${ONEDNN_INSTALL_DIR}/include" + INTERFACE_COMPILE_DEFINITIONS ENABLE_ONEDNN_FOR_GPU + ) + add_dependencies(onednn_gpu_tgt onednn_gpu_build) + # TODO: install onednn_gpu in static builds + endfunction() + build_onednn_gpu() +endif() diff --git a/thirdparty/onednn_gpu b/src/plugins/intel_gpu/thirdparty/onednn_gpu similarity index 100% rename from thirdparty/onednn_gpu rename to src/plugins/intel_gpu/thirdparty/onednn_gpu diff --git a/thirdparty/CMakeLists.txt b/thirdparty/CMakeLists.txt index 84c10f8b044..bc4d8caa93e 100644 --- a/thirdparty/CMakeLists.txt +++ b/thirdparty/CMakeLists.txt @@ -128,66 +128,6 @@ if(ENABLE_OV_ONNX_FRONTEND) add_subdirectory(onnx) endif() -# -# oneDNN for GPU plugin -# - -if(ENABLE_ONEDNN_FOR_GPU) - function(build_onednn_gpu) - include(ExternalProject) - set(ONEDNN_BUILD_DIR "${CMAKE_CURRENT_BINARY_DIR}/onednn_gpu_build/") - set(ONEDNN_INSTALL_DIR "${CMAKE_CURRENT_BINARY_DIR}/onednn_gpu_install/") - set(ONEDNN_PREFIX_DIR "${CMAKE_CURRENT_BINARY_DIR}/onednn_gpu_root") - if(CMAKE_COMPILER_IS_GNUCXX) - ie_add_compiler_flags(-Wno-undef) - endif() - ExternalProject_Add(onednn_gpu_build - SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/onednn_gpu" - BINARY_DIR "${ONEDNN_BUILD_DIR}" - INSTALL_DIR "${ONEDNN_INSTALL_DIR}" - PREFIX "${ONEDNN_PREFIX_DIR}" - EXCLUDE_FROM_ALL ON - CMAKE_ARGS - 
"-DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}" - "-DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}" - "-DCMAKE_CXX_COMPILER_LAUNCHER=${CMAKE_CXX_COMPILER_LAUNCHER}" - "-DCMAKE_C_COMPILER_LAUNCHER=${CMAKE_C_COMPILER_LAUNCHER}" - "-DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}" - "-DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}" - "-DCMAKE_INTERPROCEDURAL_OPTIMIZATION_RELEASE=${ENABLE_LTO}" - "-DCMAKE_POLICY_DEFAULT_CMP0069=NEW" - "-DCMAKE_MSVC_RUNTIME_LIBRARY=${CMAKE_MSVC_RUNTIME_LIBRARY}" - "-DDNNL_CPU_RUNTIME=NONE" - "-DDNNL_GPU_RUNTIME=OCL" - "-DDNNL_LIBRARY_NAME=onednn_gpu" - "-DCMAKE_INSTALL_PREFIX=${ONEDNN_INSTALL_DIR}" - "-DCMAKE_INSTALL_LIBDIR=lib/$" - "-DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}" - "-DDNNL_ENABLE_CONCURRENT_EXEC=ON" - "-DDNNL_ENABLE_PRIMITIVE_CACHE=OFF" - "-DDNNL_ENABLE_JIT_PROFILING=${BUILD_SHARED_LIBS}" - "-DDNNL_ENABLE_ITT_TASKS=${BUILD_SHARED_LIBS}" - "-DDNNL_BUILD_TESTS=OFF" - "-DDNNL_BUILD_EXAMPLES=OFF" - "-DDNNL_BLAS_VENDOR=NONE" - "-DDNNL_LIBRARY_TYPE=STATIC" - "-DOpenCL_LIBRARY=${OpenCL_LIBRARY}" - "-DOpenCL_INCLUDE_DIR=${OpenCL_INCLUDE_DIR}" - "-DOPENCL_VERSION_2_2=${OPENCL_VERSION_2_2}" - ) - add_library(onednn_gpu_tgt INTERFACE) - set_target_properties(onednn_gpu_tgt PROPERTIES - INTERFACE_LINK_DIRECTORIES "${ONEDNN_INSTALL_DIR}/lib/$" - INTERFACE_LINK_LIBRARIES "onednn_gpu" - INTERFACE_INCLUDE_DIRECTORIES "${ONEDNN_INSTALL_DIR}/include" - INTERFACE_COMPILE_DEFINITIONS ENABLE_ONEDNN_FOR_GPU - ) - add_dependencies(onednn_gpu_tgt onednn_gpu_build) - # TODO: install onednn_gpu in static builds - endfunction() - build_onednn_gpu() -endif() - # # nlohmann json # From 9234f4177d74d89364b627cfecddda72c4046aca Mon Sep 17 00:00:00 2001 From: Irina Efode Date: Wed, 29 Dec 2021 12:28:58 +0300 Subject: [PATCH 20/78] [IE TESTS] Test movement (#9429) * [IE TESTS] Movement of some tests in shared lib. 
Movement of 'QueryNetwork' validation to ApiConformance suite * Update read_ir.cpp --- .../src/read_ir/read_ir_query_network.cpp} | 2 +- .../src/op_impl_check/op_impl_check.cpp | 4 +-- .../src/read_ir/read_ir_compare_with_refs.cpp | 22 ++++++++++++ .../read_ir/read_ir_compare_with_refs.hpp | 13 +++++++ ...{read_ir.hpp => read_ir_query_network.hpp} | 4 --- .../op_impl_check/op_impl_check.hpp | 0 .../op_impl_check/single_op_graph.hpp | 0 .../op_impl_check/om_impl_check.cpp | 2 +- .../op_impl_check/single_op_graph.cpp | 4 +-- .../shared_test_classes/read_ir/read_ir.hpp | 1 + .../src/read_ir/read_ir.cpp | 35 +++++++++++++++++++ 11 files changed, 77 insertions(+), 10 deletions(-) rename src/tests/functional/plugin/conformance/test_runner/{op_conformance_runner/src/read_ir/read_ir.cpp => api_conformance_runner/src/read_ir/read_ir_query_network.cpp} (94%) create mode 100644 src/tests/functional/plugin/conformance/test_runner/op_conformance_runner/src/read_ir/read_ir_compare_with_refs.cpp create mode 100644 src/tests/functional/plugin/shared/include/read_ir/read_ir_compare_with_refs.hpp rename src/tests/functional/plugin/shared/include/read_ir/{read_ir.hpp => read_ir_query_network.hpp} (74%) rename src/tests/functional/plugin/shared/include/{ => single_layer_tests}/op_impl_check/op_impl_check.hpp (100%) rename src/tests/functional/plugin/shared/include/{ => single_layer_tests}/op_impl_check/single_op_graph.hpp (100%) rename src/tests/functional/plugin/shared/src/{ => single_layer_tests}/op_impl_check/om_impl_check.cpp (97%) rename src/tests/functional/plugin/shared/src/{ => single_layer_tests}/op_impl_check/single_op_graph.cpp (96%) diff --git a/src/tests/functional/plugin/conformance/test_runner/op_conformance_runner/src/read_ir/read_ir.cpp b/src/tests/functional/plugin/conformance/test_runner/api_conformance_runner/src/read_ir/read_ir_query_network.cpp similarity index 94% rename from 
src/tests/functional/plugin/conformance/test_runner/op_conformance_runner/src/read_ir/read_ir.cpp rename to src/tests/functional/plugin/conformance/test_runner/api_conformance_runner/src/read_ir/read_ir_query_network.cpp index 28a5258ec0e..e4d37e34b3d 100644 --- a/src/tests/functional/plugin/conformance/test_runner/op_conformance_runner/src/read_ir/read_ir.cpp +++ b/src/tests/functional/plugin/conformance/test_runner/api_conformance_runner/src/read_ir/read_ir_query_network.cpp @@ -4,7 +4,7 @@ #include "common_test_utils/file_utils.hpp" -#include "read_ir/read_ir.hpp" +#include "read_ir/read_ir_query_network.hpp" #include "conformance.hpp" namespace ConformanceTests { diff --git a/src/tests/functional/plugin/conformance/test_runner/op_conformance_runner/src/op_impl_check/op_impl_check.cpp b/src/tests/functional/plugin/conformance/test_runner/op_conformance_runner/src/op_impl_check/op_impl_check.cpp index 8412d98c288..ad27db35f2b 100644 --- a/src/tests/functional/plugin/conformance/test_runner/op_conformance_runner/src/op_impl_check/op_impl_check.cpp +++ b/src/tests/functional/plugin/conformance/test_runner/op_conformance_runner/src/op_impl_check/op_impl_check.cpp @@ -2,8 +2,8 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "op_impl_check/op_impl_check.hpp" -#include "op_impl_check/single_op_graph.hpp" +#include "single_layer_tests/op_impl_check/op_impl_check.hpp" +#include "single_layer_tests/op_impl_check/single_op_graph.hpp" #include "conformance.hpp" namespace ConformanceTests { diff --git a/src/tests/functional/plugin/conformance/test_runner/op_conformance_runner/src/read_ir/read_ir_compare_with_refs.cpp b/src/tests/functional/plugin/conformance/test_runner/op_conformance_runner/src/read_ir/read_ir_compare_with_refs.cpp new file mode 100644 index 00000000000..5d0e6df8d68 --- /dev/null +++ b/src/tests/functional/plugin/conformance/test_runner/op_conformance_runner/src/read_ir/read_ir_compare_with_refs.cpp @@ -0,0 +1,22 @@ +// Copyright (C) 2018-2021 Intel 
Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "common_test_utils/file_utils.hpp" + +#include "read_ir/read_ir_compare_with_refs.hpp" +#include "conformance.hpp" + +namespace ConformanceTests { +using namespace LayerTestsDefinitions; + +namespace { +INSTANTIATE_TEST_SUITE_P(conformance, + ReadIRTest, + ::testing::Combine( + ::testing::ValuesIn(CommonTestUtils::getFileListByPatternRecursive(IRFolderPaths, std::regex(R"(.*\.xml)"))), + ::testing::Values(targetDevice), + ::testing::Values(pluginConfig)), + ReadIRTest::getTestCaseName); +} // namespace +} // namespace ConformanceTests diff --git a/src/tests/functional/plugin/shared/include/read_ir/read_ir_compare_with_refs.hpp b/src/tests/functional/plugin/shared/include/read_ir/read_ir_compare_with_refs.hpp new file mode 100644 index 00000000000..1d9f7c6aa53 --- /dev/null +++ b/src/tests/functional/plugin/shared/include/read_ir/read_ir_compare_with_refs.hpp @@ -0,0 +1,13 @@ +// Copyright (C) 2018-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "shared_test_classes/read_ir/read_ir.hpp" + +namespace LayerTestsDefinitions { + +TEST_P(ReadIRTest, ReadIR) { + Run(); +} + +} // namespace LayerTestsDefinitions \ No newline at end of file diff --git a/src/tests/functional/plugin/shared/include/read_ir/read_ir.hpp b/src/tests/functional/plugin/shared/include/read_ir/read_ir_query_network.hpp similarity index 74% rename from src/tests/functional/plugin/shared/include/read_ir/read_ir.hpp rename to src/tests/functional/plugin/shared/include/read_ir/read_ir_query_network.hpp index 06b20f6a633..a602c7e51d3 100644 --- a/src/tests/functional/plugin/shared/include/read_ir/read_ir.hpp +++ b/src/tests/functional/plugin/shared/include/read_ir/read_ir_query_network.hpp @@ -6,10 +6,6 @@ namespace LayerTestsDefinitions { -TEST_P(ReadIRTest, ReadIR) { - Run(); -} - TEST_P(ReadIRTest, QueryNetwork) { QueryNetwork(); } diff --git 
a/src/tests/functional/plugin/shared/include/op_impl_check/op_impl_check.hpp b/src/tests/functional/plugin/shared/include/single_layer_tests/op_impl_check/op_impl_check.hpp similarity index 100% rename from src/tests/functional/plugin/shared/include/op_impl_check/op_impl_check.hpp rename to src/tests/functional/plugin/shared/include/single_layer_tests/op_impl_check/op_impl_check.hpp diff --git a/src/tests/functional/plugin/shared/include/op_impl_check/single_op_graph.hpp b/src/tests/functional/plugin/shared/include/single_layer_tests/op_impl_check/single_op_graph.hpp similarity index 100% rename from src/tests/functional/plugin/shared/include/op_impl_check/single_op_graph.hpp rename to src/tests/functional/plugin/shared/include/single_layer_tests/op_impl_check/single_op_graph.hpp diff --git a/src/tests/functional/plugin/shared/src/op_impl_check/om_impl_check.cpp b/src/tests/functional/plugin/shared/src/single_layer_tests/op_impl_check/om_impl_check.cpp similarity index 97% rename from src/tests/functional/plugin/shared/src/op_impl_check/om_impl_check.cpp rename to src/tests/functional/plugin/shared/src/single_layer_tests/op_impl_check/om_impl_check.cpp index 28df2923c50..565e8228088 100644 --- a/src/tests/functional/plugin/shared/src/op_impl_check/om_impl_check.cpp +++ b/src/tests/functional/plugin/shared/src/single_layer_tests/op_impl_check/om_impl_check.cpp @@ -6,7 +6,7 @@ #include #endif -#include "op_impl_check/op_impl_check.hpp" +#include "single_layer_tests/op_impl_check/op_impl_check.hpp" namespace ov { namespace test { diff --git a/src/tests/functional/plugin/shared/src/op_impl_check/single_op_graph.cpp b/src/tests/functional/plugin/shared/src/single_layer_tests/op_impl_check/single_op_graph.cpp similarity index 96% rename from src/tests/functional/plugin/shared/src/op_impl_check/single_op_graph.cpp rename to src/tests/functional/plugin/shared/src/single_layer_tests/op_impl_check/single_op_graph.cpp index e05206c08e2..b2fc1ffdb46 100644 --- 
a/src/tests/functional/plugin/shared/src/op_impl_check/single_op_graph.cpp +++ b/src/tests/functional/plugin/shared/src/single_layer_tests/op_impl_check/single_op_graph.cpp @@ -2,8 +2,8 @@ // SPDX-License-Identifier: Apache-2.0 // -#include -#include +#include +#include namespace ov { namespace test { diff --git a/src/tests/functional/shared_test_classes/include/shared_test_classes/read_ir/read_ir.hpp b/src/tests/functional/shared_test_classes/include/shared_test_classes/read_ir/read_ir.hpp index a764fba8bdb..4549f35dc36 100644 --- a/src/tests/functional/shared_test_classes/include/shared_test_classes/read_ir/read_ir.hpp +++ b/src/tests/functional/shared_test_classes/include/shared_test_classes/read_ir/read_ir.hpp @@ -19,6 +19,7 @@ class ReadIRTest : public testing::WithParamInterface, virtual public LayerTestsUtils::LayerTestsCommon { public: static std::string getTestCaseName(const testing::TestParamInfo &obj); + void QueryNetwork() override; protected: void SetUp() override; diff --git a/src/tests/functional/shared_test_classes/src/read_ir/read_ir.cpp b/src/tests/functional/shared_test_classes/src/read_ir/read_ir.cpp index 2f0889e8ec2..93ecfbfa122 100644 --- a/src/tests/functional/shared_test_classes/src/read_ir/read_ir.cpp +++ b/src/tests/functional/shared_test_classes/src/read_ir/read_ir.cpp @@ -2,6 +2,11 @@ // SPDX-License-Identifier: Apache-2.0 // +#include +#ifdef _WIN32 +#include +#endif + #include #include "common_test_utils/file_utils.hpp" #include "functional_test_utils/core_config.hpp" @@ -29,6 +34,36 @@ std::string ReadIRTest::getTestCaseName(const testing::TestParamInfoset_friendly_name("refFunction"); + } + auto crashHandler = [](int errCode) { + auto &s = LayerTestsUtils::Summary::getInstance(); + s.saveReport(); + std::cout << "Unexpected application crash!" 
<< std::endl; + std::abort(); + }; + signal(SIGSEGV, crashHandler); + + auto &s = LayerTestsUtils::Summary::getInstance(); + s.setDeviceName(targetDevice); + + if (FuncTestUtils::SkipTestsConfig::currentTestIsDisabled()) { + s.updateOPsStats(functionRefs, LayerTestsUtils::PassRate::Statuses::SKIPPED); + GTEST_SKIP() << "Disabled test due to configuration" << std::endl; + } else { + s.updateOPsStats(functionRefs, LayerTestsUtils::PassRate::Statuses::CRASHED); + } + try { + LayerTestsCommon::QueryNetwork(); + s.updateOPsStats(functionRefs, LayerTestsUtils::PassRate::Statuses::PASSED); + } catch (...) { + s.updateOPsStats(functionRefs, LayerTestsUtils::PassRate::Statuses::FAILED); + } +} + void ReadIRTest::SetUp() { std::tie(pathToModel, targetDevice, configuration) = this->GetParam(); auto net = getCore()->ReadNetwork(pathToModel); From 2e433620b7a842a8047eb44a4a386c05883ae5ee Mon Sep 17 00:00:00 2001 From: Maxim Andronov Date: Wed, 29 Dec 2021 13:43:35 +0300 Subject: [PATCH 21/78] [CPU] General fixes for dynamic shapes. 
Part 3 (#9338) --- .../memory_desc/cpu_blocked_memory_desc.cpp | 7 +++--- .../memory_desc/dnnl_blocked_memory_desc.cpp | 4 +--- .../intel_cpu/src/mkldnn_graph_optimizer.cpp | 6 +++++ src/plugins/intel_cpu/src/mkldnn_node.cpp | 23 ++++++++++++++----- .../src/nodes/mkldnn_eltwise_node.cpp | 2 +- .../intel_cpu/src/nodes/mkldnn_non_zero.h | 2 ++ .../src/nodes/mkldnn_reference_node.cpp | 16 +++++++++++-- .../intel_cpu/src/nodes/mkldnn_topk_node.cpp | 16 +++++++++++-- .../skip_tests_config.cpp | 2 -- .../plugin/cpu/single_layer_tests/nonzero.cpp | 8 ------- .../src/ov_tensor_utils.cpp | 6 ++++- src/tests/unit/cpu/mkldnn_zero_dims_test.cpp | 6 ++--- 12 files changed, 66 insertions(+), 32 deletions(-) diff --git a/src/plugins/intel_cpu/src/memory_desc/cpu_blocked_memory_desc.cpp b/src/plugins/intel_cpu/src/memory_desc/cpu_blocked_memory_desc.cpp index dd8e3597598..a06b867db1f 100644 --- a/src/plugins/intel_cpu/src/memory_desc/cpu_blocked_memory_desc.cpp +++ b/src/plugins/intel_cpu/src/memory_desc/cpu_blocked_memory_desc.cpp @@ -112,8 +112,9 @@ bool CpuBlockedMemoryDesc::canComputeMemSizeZeroDims() const { } size_t CpuBlockedMemoryDesc::getCurrentMemSizeImp() const { - int64_t e_size = getOffsetPadding() + 1; // size in bytes (from begin of data to last element) + int64_t e_size = getOffsetPadding(); // size in bytes (from begin of data to last element) if (!getShape().hasZeroDims()) { + e_size += 1; for (int j = 0; j < getBlockDims().size(); j++) e_size += (getBlockDims()[j] - 1) * getStrides()[j]; } @@ -129,9 +130,7 @@ size_t CpuBlockedMemoryDesc::getMaxMemSize() const { } const auto& maxDims = shape.getMaxDims(); - if (std::any_of(maxDims.begin(), maxDims.end(), [](size_t x){ return Shape::UNDEFINED_DIM == x || - // WA: for some nodes ngraph compute upper bound depending on precision max value - x >= std::numeric_limits::max(); })) { + if (std::any_of(maxDims.begin(), maxDims.end(), [](size_t x){ return Shape::UNDEFINED_DIM == x; })) { return UNDEFINED_SIZE; } diff --git 
a/src/plugins/intel_cpu/src/memory_desc/dnnl_blocked_memory_desc.cpp b/src/plugins/intel_cpu/src/memory_desc/dnnl_blocked_memory_desc.cpp index 4b1efda623a..bba275f0be5 100644 --- a/src/plugins/intel_cpu/src/memory_desc/dnnl_blocked_memory_desc.cpp +++ b/src/plugins/intel_cpu/src/memory_desc/dnnl_blocked_memory_desc.cpp @@ -506,9 +506,7 @@ size_t DnnlBlockedMemoryDesc::getMaxMemSize() const { } const auto& maxDims = shape.getMaxDims(); - if (std::any_of(maxDims.begin(), maxDims.end(), [](size_t x){ return Shape::UNDEFINED_DIM == x || - // WA: for some nodes ngraph compute upper bound depending on precision max value - x >= std::numeric_limits::max(); })) { + if (std::any_of(maxDims.begin(), maxDims.end(), [](size_t x){ return Shape::UNDEFINED_DIM == x; })) { return UNDEFINED_SIZE; } diff --git a/src/plugins/intel_cpu/src/mkldnn_graph_optimizer.cpp b/src/plugins/intel_cpu/src/mkldnn_graph_optimizer.cpp index 7dfc6e448a5..937a143f82b 100644 --- a/src/plugins/intel_cpu/src/mkldnn_graph_optimizer.cpp +++ b/src/plugins/intel_cpu/src/mkldnn_graph_optimizer.cpp @@ -1474,6 +1474,12 @@ void MKLDNNGraphOptimizer::FuseEltwiseAndSimple(MKLDNNGraph &graph) { } auto childNode = parentNode->getChildEdgeAt(0)->getChild(); + + if ((parentNode->isDynamicNode() && !childNode->isDynamicNode()) || (!parentNode->isDynamicNode() && childNode->isDynamicNode())) { + parent++; + continue; + } + if (!isSuitableChildNode(parentNode, childNode)) { parent++; continue; diff --git a/src/plugins/intel_cpu/src/mkldnn_node.cpp b/src/plugins/intel_cpu/src/mkldnn_node.cpp index f9c2f5a6867..00250a8c635 100644 --- a/src/plugins/intel_cpu/src/mkldnn_node.cpp +++ b/src/plugins/intel_cpu/src/mkldnn_node.cpp @@ -543,8 +543,6 @@ void MKLDNNNode::redefineOutputMemory(const std::vector &newOutputSh newOutputShape.push_back(1); } - const auto memDesc = getBaseMemDescAtOutputPort(i)->cloneWithNewDims(newOutputShape); - const auto &currDesc = edges[0]->getMemory().getDesc(); if (currDesc.getShape().isStatic() && 
currDesc.getShape().getStaticDims() == newOutputShape) continue; @@ -559,6 +557,8 @@ void MKLDNNNode::redefineOutputMemory(const std::vector &newOutputSh break; } } + + const auto memDesc = getBaseMemDescAtOutputPort(i)->cloneWithNewDims(newOutputShape); edges[sharedEdgeNum]->getMemoryPtr()->redefineDesc(*memDesc); void *data = edges[sharedEdgeNum]->getMemoryPtr()->GetData(); for (size_t j = 0; j < edges.size(); j++) { @@ -1419,8 +1419,9 @@ bool MKLDNNNode::needShapeInfer() const { std::vector MKLDNNNode::shapeInfer() const { std::vector shapes; - for (size_t i = 0; i < inputShapes.size(); i++) { - shapes.push_back(getParentEdgesAtPort(i)[0]->getMemory().getDesc().getShape()); + for (size_t i = 0; i < opToShapeInfer->get_input_size(); i++) { + shapes.push_back(opToShapeInfer->get_input_partial_shape(i).rank().get_length() == 0 ? Shape{} : + getParentEdgesAtPort(i)[0]->getMemory().getDesc().getShape()); } auto newOutputShapes = shapeInferGeneric(shapes); @@ -1448,8 +1449,18 @@ std::vector MKLDNNNode::shapeInferGeneric(const std::vector& for (size_t i = 0; i < newOutputShapes.size(); i++) { const auto &partShape = opToShapeInfer->get_output_partial_shape(i); if (partShape.is_dynamic()) { - IE_THROW(NotImplemented) << "CPU plug-in doesn't support default shape infer for node " << getTypeStr() - << " with internal dynamism. Operation name: " << getName(); + std::ostringstream errorMessage; + errorMessage << "Can't compute static output shape on " << i << " port for node with name: " << getName(); + errorMessage << ". Input shapes = ( "; + for (size_t in = 0; in < opToShapeInfer->get_input_size(); in++) { + errorMessage << in << " port = " << opToShapeInfer->get_input_partial_shape(in) << ", "; + } + errorMessage << "). 
Output shapes = ( "; + for (size_t out = 0; out < opToShapeInfer->get_output_size(); out++) { + errorMessage << out << " port = " << opToShapeInfer->get_output_partial_shape(out) << ", "; + } + errorMessage << ")"; + IE_THROW(NotImplemented) << errorMessage.str(); } newOutputShapes[i] = partShape.get_shape(); diff --git a/src/plugins/intel_cpu/src/nodes/mkldnn_eltwise_node.cpp b/src/plugins/intel_cpu/src/nodes/mkldnn_eltwise_node.cpp index d44909a7185..b546a32f741 100644 --- a/src/plugins/intel_cpu/src/nodes/mkldnn_eltwise_node.cpp +++ b/src/plugins/intel_cpu/src/nodes/mkldnn_eltwise_node.cpp @@ -2000,7 +2000,7 @@ void MKLDNNEltwiseNode::execute(mkldnn::stream strm) { execPtr->exec(args_ptrs, dims_out); } else { - IE_THROW() << "Can't execute eltwise node. Primitive has not been created"; + IE_THROW() << "Can't execute eltwise node with name: " << getName() << ". Primitive isn't created"; } } diff --git a/src/plugins/intel_cpu/src/nodes/mkldnn_non_zero.h b/src/plugins/intel_cpu/src/nodes/mkldnn_non_zero.h index a454a2da268..fcf7f10b28b 100644 --- a/src/plugins/intel_cpu/src/nodes/mkldnn_non_zero.h +++ b/src/plugins/intel_cpu/src/nodes/mkldnn_non_zero.h @@ -25,6 +25,8 @@ public: void executeDynamicImpl(mkldnn::stream strm) override; static bool isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept; + bool isExecutable() const override { return true; } + private: std::string errorPrefix; template diff --git a/src/plugins/intel_cpu/src/nodes/mkldnn_reference_node.cpp b/src/plugins/intel_cpu/src/nodes/mkldnn_reference_node.cpp index 3d34abdef96..ec6b891a678 100644 --- a/src/plugins/intel_cpu/src/nodes/mkldnn_reference_node.cpp +++ b/src/plugins/intel_cpu/src/nodes/mkldnn_reference_node.cpp @@ -91,8 +91,20 @@ std::vector MKLDNNReferenceNode::shapeInfer() const { std::vector newOutputShapes(outputShapes.size()); for (size_t i = 0; i < newOutputShapes.size(); i++) { const auto &partShape = localShapeInferOp->get_output_partial_shape(i); - if 
(partShape.is_dynamic()) - IE_THROW(NotImplemented) << "CPU plug-in doesn't support default shape infer for nodes with internal dynamism"; + if (partShape.is_dynamic()) { + std::ostringstream errorMessage; + errorMessage << "Can't compute static output shape on " << i << " port for node with name: " << getName(); + errorMessage << ". Input shapes = ( "; + for (size_t in = 0; in < opToShapeInfer->get_input_size(); in++) { + errorMessage << in << " port = " << opToShapeInfer->get_input_partial_shape(in) << ", "; + } + errorMessage << "). Output shapes = ( "; + for (size_t out = 0; out < opToShapeInfer->get_output_size(); out++) { + errorMessage << out << " port = " << opToShapeInfer->get_output_partial_shape(out) << ", "; + } + errorMessage << ")"; + IE_THROW(NotImplemented) << errorMessage.str(); + } newOutputShapes[i] = partShape.get_shape(); } return newOutputShapes; diff --git a/src/plugins/intel_cpu/src/nodes/mkldnn_topk_node.cpp b/src/plugins/intel_cpu/src/nodes/mkldnn_topk_node.cpp index 14746d819a4..956292f0b23 100644 --- a/src/plugins/intel_cpu/src/nodes/mkldnn_topk_node.cpp +++ b/src/plugins/intel_cpu/src/nodes/mkldnn_topk_node.cpp @@ -183,8 +183,20 @@ std::vector MKLDNNTopKNode::shapeInfer() const { std::vector newOutputShapes(outputShapes.size()); for (size_t i = 0; i < newOutputShapes.size(); ++i) { const auto& pShape = localShapeInferOp->get_output_partial_shape(i); - if (pShape.is_dynamic()) - IE_THROW(NotImplemented) << "CPU plug-in doesn't support default shape infer for nodes with internal dynamism"; + if (pShape.is_dynamic()) { + std::ostringstream errorMessage; + errorMessage << "Can't compute static output shape on " << i << " port for node with name: " << getName(); + errorMessage << ". Input shapes = ( "; + for (size_t in = 0; in < opToShapeInfer->get_input_size(); in++) { + errorMessage << in << " port = " << opToShapeInfer->get_input_partial_shape(in) << ", "; + } + errorMessage << "). 
Output shapes = ( "; + for (size_t out = 0; out < opToShapeInfer->get_output_size(); out++) { + errorMessage << out << " port = " << opToShapeInfer->get_output_partial_shape(out) << ", "; + } + errorMessage << ")"; + IE_THROW(NotImplemented) << errorMessage.str(); + } newOutputShapes[i] = pShape.get_shape(); } return newOutputShapes; diff --git a/src/tests/functional/plugin/cpu/shared_tests_instances/skip_tests_config.cpp b/src/tests/functional/plugin/cpu/shared_tests_instances/skip_tests_config.cpp index 0c87ea285af..1ff8c21a217 100644 --- a/src/tests/functional/plugin/cpu/shared_tests_instances/skip_tests_config.cpp +++ b/src/tests/functional/plugin/cpu/shared_tests_instances/skip_tests_config.cpp @@ -168,8 +168,6 @@ std::vector disabledTestPatterns() { R"(smoke_ConversionLayerTest/ConversionLayerTest.CompareWithRefs.*UNSPECIFIED.*)", // Issue: R"(.*smoke_VariadicSplit4D_CPU_zero_dims.*)", - // Waiting for common fix of zero dims - R"(smoke_If.*TS=\(.*\.0.*\).*)", }; #define FIX_62820 0 diff --git a/src/tests/functional/plugin/cpu/single_layer_tests/nonzero.cpp b/src/tests/functional/plugin/cpu/single_layer_tests/nonzero.cpp index eb4853a54d8..a0107b39187 100644 --- a/src/tests/functional/plugin/cpu/single_layer_tests/nonzero.cpp +++ b/src/tests/functional/plugin/cpu/single_layer_tests/nonzero.cpp @@ -64,14 +64,6 @@ public: } } - void compare(const std::vector &expected, const std::vector &actual) override { - const auto dims = targetStaticShapes[inferNum].front(); - if (!((startFrom == 0 && range == 1) || std::any_of(dims.begin(), dims.end(), [](size_t dim) { return dim == 0; } ))) { - SubgraphBaseTest::compare(expected, actual); - } - inferNum++; - } - protected: size_t startFrom = 0, range = 10; size_t inferNum = 0; diff --git a/src/tests/ie_test_utils/functional_test_utils/src/ov_tensor_utils.cpp b/src/tests/ie_test_utils/functional_test_utils/src/ov_tensor_utils.cpp index 887ab3e1563..288a2b6b20c 100644 --- 
a/src/tests/ie_test_utils/functional_test_utils/src/ov_tensor_utils.cpp +++ b/src/tests/ie_test_utils/functional_test_utils/src/ov_tensor_utils.cpp @@ -108,7 +108,11 @@ void compare(const ov::runtime::Tensor& expected, auto expected_shape = expected.get_shape(); auto actual_shape = actual.get_shape(); ASSERT_EQ(expected_shape, actual_shape); - ASSERT_NE(shape_size(actual_shape), 0); + + if (shape_size(actual_shape) == 0) { + return; + } + auto expected_data = expected.data(); auto actual_data = actual.data(); double abs_threshold = abs_threshold_; diff --git a/src/tests/unit/cpu/mkldnn_zero_dims_test.cpp b/src/tests/unit/cpu/mkldnn_zero_dims_test.cpp index cb2944dc6b6..a58e41676da 100644 --- a/src/tests/unit/cpu/mkldnn_zero_dims_test.cpp +++ b/src/tests/unit/cpu/mkldnn_zero_dims_test.cpp @@ -73,7 +73,7 @@ protected: VectorDims zeroStrides(descDnnl.getBlockDims().size(), 0); validate(descDnnl, zeroStrides, offset, offsetPadding, 0); - validate(descCpu, zeroStrides, offset, offsetPadding, precision.size()); + validate(descCpu, zeroStrides, offset, offsetPadding, 0); ASSERT_TRUE(descDnnl.isCompatible(descCpu)); ASSERT_TRUE(descCpu.isCompatible(descDnnl)); @@ -94,7 +94,7 @@ protected: validate(*definedDnnl->as(), zeroStrides, offset, offsetPadding, 0); const auto definedCpu = descCpu.cloneWithDefaultStridesAndOffset(); - validate(*definedCpu->as(), zeroStrides, offset, offsetPadding, precision.size()); + validate(*definedCpu->as(), zeroStrides, offset, offsetPadding, 0); } }; @@ -243,7 +243,7 @@ TEST_P(MemDescWithZeroDimsCloneNewDimsTest, CloneWithNewDims) { const auto& dims = shape.getDims(); bool skipOrderCheck = std::all_of(dims.begin() + 1, dims.end(), [](const size_t& dim) { return dim == 0; }); validate(*clonedDescDnnl->as(), zeroStrides, offset, offsetPadding, 0, skipOrderCheck); - validate(*clonedDescCpu->as(), zeroStrides, offset, offsetPadding, precision.size()); + validate(*clonedDescCpu->as(), zeroStrides, offset, offsetPadding, 0); } const std::vector 
srcDynShapes = { From 1ee8007764eeeb8dae4157173a309828917c50c8 Mon Sep 17 00:00:00 2001 From: Vladislav Volkov Date: Wed, 29 Dec 2021 13:46:02 +0300 Subject: [PATCH 22/78] [CPU] NV12toRGB and NV12toBGR operations for CPU plugin (#8628) --- src/plugins/intel_cpu/src/cpu_types.cpp | 6 + src/plugins/intel_cpu/src/cpu_types.h | 7 +- .../src/emitters/jit_load_store_emitters.cpp | 16 +- src/plugins/intel_cpu/src/mkldnn_node.h | 5 +- .../intel_cpu/src/mkldnn_nodes_factory.cpp | 2 + .../src/nodes/mkldnn_color_convert_node.cpp | 659 +++++++++++++++++ .../src/nodes/mkldnn_color_convert_node.h | 75 ++ src/plugins/intel_cpu/src/utils/blob_dump.cpp | 4 +- .../intel_cpu/src/utils/jit_kernel.cpp | 334 +++++++++ .../intel_cpu/src/utils/jit_kernel.hpp | 697 ++++++++++++++++++ .../intel_cpu/src/utils/multidim_map.hpp | 64 ++ 11 files changed, 1860 insertions(+), 9 deletions(-) create mode 100644 src/plugins/intel_cpu/src/nodes/mkldnn_color_convert_node.cpp create mode 100644 src/plugins/intel_cpu/src/nodes/mkldnn_color_convert_node.h create mode 100644 src/plugins/intel_cpu/src/utils/jit_kernel.cpp create mode 100644 src/plugins/intel_cpu/src/utils/jit_kernel.hpp create mode 100644 src/plugins/intel_cpu/src/utils/multidim_map.hpp diff --git a/src/plugins/intel_cpu/src/cpu_types.cpp b/src/plugins/intel_cpu/src/cpu_types.cpp index 1c87b16d7b0..5be7d6f8786 100644 --- a/src/plugins/intel_cpu/src/cpu_types.cpp +++ b/src/plugins/intel_cpu/src/cpu_types.cpp @@ -103,6 +103,8 @@ const InferenceEngine::details::caseless_unordered_map type_t { "ReadValue", MemoryInput}, // for construction from name ctor, arbitrary name is used { "Assign", MemoryOutput }, // for construction from layer ctor { "Convert", Convert }, + { "NV12toRGB", ColorConvert }, + { "NV12toBGR", ColorConvert }, { "MVN", MVN}, { "NormalizeL2", NormalizeL2}, { "ScatterUpdate", ScatterUpdate}, @@ -275,6 +277,8 @@ std::string NameFromType(const Type type) { return "TensorIterator"; case Convert: return "Convert"; + case 
ColorConvert: + return "ColorConvert"; case NormalizeL2: return "NormalizeL2"; case ScatterUpdate: @@ -470,6 +474,8 @@ std::string algToString(const Algorithm alg) { CASE(MathTan); CASE(TensorIteratorCommon); CASE(TensorIteratorLoop); + CASE(ColorConvertNV12toRGB); + CASE(ColorConvertNV12toBGR); #undef CASE return "Undefined"; } diff --git a/src/plugins/intel_cpu/src/cpu_types.h b/src/plugins/intel_cpu/src/cpu_types.h index 7a7144c348a..c9df96d6ba6 100644 --- a/src/plugins/intel_cpu/src/cpu_types.h +++ b/src/plugins/intel_cpu/src/cpu_types.h @@ -55,6 +55,7 @@ enum Type { DeformableConvolution, TensorIterator, Convert, + ColorConvert, MVN, NormalizeL2, ScatterUpdate, @@ -220,10 +221,12 @@ enum Algorithm { MathSoftPlus, MathSoftsign, MathTan, - // TensorIterator TensorIteratorCommon, - TensorIteratorLoop + TensorIteratorLoop, + // Color conversions + ColorConvertNV12toRGB, + ColorConvertNV12toBGR, }; extern const InferenceEngine::details::caseless_unordered_map type_to_name_tbl; diff --git a/src/plugins/intel_cpu/src/emitters/jit_load_store_emitters.cpp b/src/plugins/intel_cpu/src/emitters/jit_load_store_emitters.cpp index 16b55daf9d3..20c2b67e270 100644 --- a/src/plugins/intel_cpu/src/emitters/jit_load_store_emitters.cpp +++ b/src/plugins/intel_cpu/src/emitters/jit_load_store_emitters.cpp @@ -763,7 +763,9 @@ template if (is_signed) { h->vpmovsdb(addr(0), vmm); } else { - h->vpmaxsd(vmm, vmm, Vmm(aux_vec_idxs[0])); + Vmm zero(aux_vec_idxs[0]); + h->uni_vpxor(zero, zero, zero); + h->vpmaxsd(vmm, vmm, zero); h->vpmovusdb(addr(0), vmm); } } else { @@ -774,7 +776,9 @@ template if (is_signed) { h->vpmovsdb(addr(0), vmm | k_mask); } else { - h->vpmaxsd(vmm, vmm, Vmm(aux_vec_idxs[0])); + Vmm zero(aux_vec_idxs[0]); + h->uni_vpxor(zero, zero, zero); + h->vpmaxsd(vmm, vmm, zero); h->vpmovusdb(addr(0), vmm | k_mask); } } @@ -845,7 +849,9 @@ template if (is_signed) { h->vpmovsdw(ptr[reg + offset], vmm); // singed int32 saturate to signed int16. 
} else { - h->vmaxsd(vmm, Vmm(aux_vec_idxs[0]), vmm); // if singed bit is 1, set value as 0. + Vmm zero(aux_vec_idxs[0]); + h->uni_vpxor(zero, zero, zero); + h->vmaxsd(vmm, zero, vmm); // if singed bit is 1, set value as 0. h->vpmovusdw(ptr[reg + offset], vmm); // unsinged int32 saturate to unsigned int16. } } else { @@ -856,7 +862,9 @@ template if (is_signed) { h->vpmovsdw(ptr[reg + offset], vmm | k_mask); } else { - h->vmaxsd(vmm, Vmm(aux_vec_idxs[0]), vmm); + Vmm zero(aux_vec_idxs[0]); + h->uni_vpxor(zero, zero, zero); + h->vmaxsd(vmm, zero, vmm); h->vpmovusdw(ptr[reg + offset], vmm | k_mask); } } diff --git a/src/plugins/intel_cpu/src/mkldnn_node.h b/src/plugins/intel_cpu/src/mkldnn_node.h index 8f9498fbe82..7dbadaa3796 100644 --- a/src/plugins/intel_cpu/src/mkldnn_node.h +++ b/src/plugins/intel_cpu/src/mkldnn_node.h @@ -127,6 +127,9 @@ private: class MKLDNNNode { public: + MKLDNNNode(const MKLDNNNode &) = delete; + MKLDNNNode & operator = (const MKLDNNNode &) = delete; + using AttrPtr = std::shared_ptr; public: @@ -444,7 +447,7 @@ public: return execIndex; } - std::string getTypeStr() const { + const std::string & getTypeStr() const { return typeStr; } diff --git a/src/plugins/intel_cpu/src/mkldnn_nodes_factory.cpp b/src/plugins/intel_cpu/src/mkldnn_nodes_factory.cpp index 85df2cbbad1..75b01e84a20 100644 --- a/src/plugins/intel_cpu/src/mkldnn_nodes_factory.cpp +++ b/src/plugins/intel_cpu/src/mkldnn_nodes_factory.cpp @@ -81,6 +81,7 @@ #include "nodes/mkldnn_if_node.h" #include "nodes/mkldnn_ctc_greedy_decoder_node.h" #include "nodes/mkldnn_non_zero.h" +#include "nodes/mkldnn_color_convert_node.h" #include "nodes/subgraph.h" #define MKLDNN_NODE(__prim, __type) \ @@ -172,4 +173,5 @@ MKLDNNPlugin::MKLDNNNode::NodesFactory::NodesFactory() MKLDNN_NODE(MKLDNNGRNNode, GRN); MKLDNN_NODE(MKLDNNNonZeroNode, NonZero); MKLDNN_NODE(MKLDNNSnippetNode, Subgraph); + MKLDNN_NODE(MKLDNNColorConvertNode, ColorConvert); } diff --git 
a/src/plugins/intel_cpu/src/nodes/mkldnn_color_convert_node.cpp b/src/plugins/intel_cpu/src/nodes/mkldnn_color_convert_node.cpp new file mode 100644 index 00000000000..64d5c420576 --- /dev/null +++ b/src/plugins/intel_cpu/src/nodes/mkldnn_color_convert_node.cpp @@ -0,0 +1,659 @@ +// Copyright (C) 2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "mkldnn_color_convert_node.h" +#include +#include +#include +#include +#include +#include + +using namespace InferenceEngine; +using namespace mkldnn::impl::utils; +using namespace mkldnn::impl::cpu::x64; +using namespace Xbyak; + +namespace MKLDNNPlugin { + +namespace { + +std::tuple getAlgorithmFor(const std::shared_ptr& op) { + if (ov::is_type(op)) + return std::make_tuple(Algorithm::ColorConvertNV12toRGB, std::string()); + if (ov::is_type(op)) + return std::make_tuple(Algorithm::ColorConvertNV12toBGR, std::string()); + return std::make_tuple(Algorithm::Default, "Only v8::NV12toRGB or v8::NV12toBGR operation is supported"); +} + +namespace nv12 { + +MKLDNNColorConvertNode::Converter::PrimitiveDescs supportedPrimitiveDescs(MKLDNNNode *node) { + const LayoutType layout = LayoutType::ncsp; // 0,1,2,3 + + const Precision precision = node->getOriginalInputPrecisionAtPort(0) == Precision::U8 + ? Precision::U8 + : Precision::FP32; + + MKLDNNColorConvertNode::Converter::PrimitiveDescs descs; + + descs.emplace_back(std::vector { node->getOriginalInputsNumber(), { layout, precision } }, + std::vector { { layout, precision } }, + mayiuse(cpu_isa_t::sse41) + ? 
impl_desc_type::jit_uni + : impl_desc_type::ref, + true); + + return std::move(descs); +} + +class Converter : public MKLDNNColorConvertNode::Converter { + using Base = MKLDNNColorConvertNode::Converter; + +public: + Converter(MKLDNNNode *node); + +protected: + Shapes shapeInfer() const override; + bool singlePlane() const; + + template + void convert(const T* y, + const T* uv, + T* dst, + size_t batch_size, + size_t height, + size_t width, + size_t stride_y, + size_t stride_uv); +}; + +Converter::Converter(MKLDNNNode *node) + : Base(node, node->getAlgorithm() == Algorithm::ColorConvertNV12toRGB + ? ColorFormat { { 0, 1, 2 } } + : ColorFormat { { 2, 1, 0 } }) { + if (node->getOriginalInputsNumber() != (singlePlane() ? 1: 2)) + IE_THROW() <<"NV12Converter node has incorrect number of inputs"; + if (!node->getOriginalOutputsNumber()) + IE_THROW() <<"NV12Converter node has incorrect number of outputs"; +} + +MKLDNNColorConvertNode::Converter::Shapes +Converter::shapeInfer() const { + const auto & dims = inputDims(0); + if (dims.size() != 4) + IE_THROW() <<"NV12Converter node has incorrect input dimensions"; + return singlePlane() + ? 
Shapes { { dims[N_DIM], dims[H_DIM] * 2 / 3, dims[W_DIM], 3 } } + : Shapes { { dims[N_DIM], dims[H_DIM], dims[W_DIM], 3 } }; +} + +bool Converter::singlePlane() const { + return _node->getOriginalInputsNumber() == 1; +} + +template +void Converter::convert(const T* y, + const T* uv, + T* dst, + size_t batch_size, + size_t height, + size_t width, + size_t stride_y, + size_t stride_uv) { + InferenceEngine::parallel_for2d(batch_size, height, [&](int batch, int h) { + T* out = dst + batch * width * height * 3; + auto y_ptr = y + batch * stride_y; + auto uv_ptr = uv + batch * stride_uv; + + for (int w = 0; w < width; w++) { + auto y_index = h * width + w; + auto y_val = static_cast(y_ptr[y_index]); + auto uv_index = (h / 2) * width + (w / 2) * 2; + auto u_val = static_cast(uv_ptr[uv_index]); + auto v_val = static_cast(uv_ptr[uv_index + 1]); + auto c = y_val - 16.f; + auto d = u_val - 128.f; + auto e = v_val - 128.f; + auto clip = [](float a) -> T { + if (std::is_integral()) { + return static_cast(std::min(std::max(std::round(a), 0.f), 255.f)); + } else { + return static_cast(std::min(std::max(a, 0.f), 255.f)); + } + }; + + auto r = clip(1.164f * c + 1.596f * e); + auto g = clip(1.164f * c - 0.391f * d - 0.813f * e); + auto b = clip(1.164f * c + 2.018f * d); + + out[y_index * 3 + _colorFormat[0]] = r; + out[y_index * 3 + _colorFormat[1]] = g; + out[y_index * 3 + _colorFormat[2]] = b; + } + }); +} + +template +class SinglePlaneConvert; +template +class TwoPlaneConvert; + +template +class SinglePlaneConvert : public Converter { +public: + using Converter::Converter; + + void execute(mkldnn::stream strm) override { + const auto & dims = inputDims(0); + + const size_t batch_size = dims[N_DIM]; + const size_t height = dims[H_DIM] * 2 / 3; + const size_t width = dims[W_DIM]; + + const T* y = static_cast(input(0)); + const T* uv = y + width * height; + T* dst = static_cast(output(0)); + + convert(y, uv, dst, + batch_size, + height, + width, + height * width * 3 / 2, + height * 
width * 3 / 2); + } +}; + +template +class TwoPlaneConvert : public Converter { +public: + using Converter::Converter; + + void execute(mkldnn::stream strm) override { + const auto & dims = inputDims(0); + + const T* y = static_cast(input(0)); + const T* uv = static_cast(input(1)); + T* dst = static_cast(output(0)); + + const size_t batch_size = dims[N_DIM]; + const size_t height = dims[H_DIM]; + const size_t width = dims[W_DIM]; + + convert(y, uv, dst, + batch_size, + height, + width, + height * width, + height * width / 2); + } +}; + +struct jit_uni_converter : public jit_kernel { + DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_uni_converter) + + struct Params { + const void * y; + const void * uv; + void * dst; + size_t width; + uint8_t colorFormat; // RGB: 0, BGR: !=0 + }; + + typedef void (*function_t)(const Params *); + + void operator()(const Params & args) const { + _fn(&args); + } + + template + static const jit_uni_converter & get(); + +protected: + jit_uni_converter() = default; + + function_t _fn; +}; + +template +class jit_uni_converter_impl : public jit_uni_converter { + void generate() override; +}; + +template +void jit_uni_converter_impl::generate() { + using reg_type = typename isa_traits::reg::type; + using var_type = variable::reg::length]>; + constexpr auto reg_capacity = isa_traits::reg::length; + + preamble(); + + // Get arguments addresses + auto y = arg(&Params::y); + auto uv = arg(&Params::uv); + auto dst = arg(&Params::dst); + auto width = arg(&Params::width); + auto colorFormat = arg(&Params::colorFormat); + + // Reserve registars + auto consts = reserve(); + auto tmp = var(); + auto y_val = var(); + auto u_val = var(); + auto v_val = var(); + auto r = var(); + auto g = var(); + auto b = var(); + + // Aliases + const auto & c = y_val; + const auto & d = u_val; + const auto & e = v_val; + const auto & uv_val = tmp; + + const uint8_t even_mask = 0xA0; // 0b10100000 + const uint8_t odd_mask = 0xF5; // 0b11110101 + static const float data[8] = { 16.f, 
128.f, 1.164f, 1.596f, 0.391f, 2.018f, 0.813f, 255.f }; + + mov(consts, (size_t)data); + + auto clip = [this](const reg_type & op, const reg_type & a, const reg_type & b) { + if (std::is_integral()) + uni_vroundps(op, op, 0); + uni_vmaxps(op, op, a); + uni_vminps(op, op, b); + }; + + auto blend = [&, this](const reg_type & r, const reg_type & g, const reg_type & b) { + /* + Input: + r0,r1,r2,r3,r4,r5,r6,r7 + g0,g1,g2,g3,g4,g5,g6,g7 + b0,b1,b2,b3,b4,b5,b6,b7 + + Permutation: + r0,r3,r6,r1,r4,r7,r2,r5 + g5,g0,g3,g6,g1,g4,g7,g2 + b2,b5,b0,b3,b6,b1,b4,b7 + + Blend + r0,g0,xx,r1,g1,xx,r2,g2 blend 1+2 by mask 10210210 + r0,g0,b0,r1,g1,b1,r2,g2 blend +3 by mask 00100100 + + xx,r3,g3,xx,r4,g4,xx,r5 blend 1+2 by mask 02102102 + b2,r3,g3,b3,r4,g4,b4,r5 blend +3 by mask 01001001 + + g5,xx,r6,g6,xx,r7,g7,xx blend 1+2 by mask 21021021 + g5,b5,r6,g6,b6,r7,g7,b7 blend +3 by mask 10010010 + + Result + c = r0,g0,b0,r1,g1,b1,r2,g2 + d = b2,r3,g3,b3,r4,g4,b4,r5 + e = g5,b5,r6,g6,b6,r7,g7,b7 + */ + + auto genPermutationMask = [](int offset) -> std::array::reg::length> { + std::array::reg::length> mask {}; + + if (!!(isa & cpu_isa_t::avx)) { + for (int i = 0; i < mask.size(); ++i) + mask[(i * 3 + offset) % mask.size()] = i; + } else { + int & m0 = mask.front(); + for (int i = 0; i < 4; ++i) + m0 |= i << ((i * 3 + offset) % 4) * 2; + } + + return std::move(mask); + }; + + static const auto permutationMask4r = genPermutationMask(0); + static const auto permutationMask4g = genPermutationMask(1); + static const auto permutationMask4b = genPermutationMask(2); + + uni_vpermps(r, permutationMask4r.data(), r); + uni_vpermps(g, permutationMask4g.data(), g); + uni_vpermps(b, permutationMask4b.data(), b); + + auto blendWithMask = [&](int offset, const var_type & result) { + static const uint32_t blendMasks[2] = { + 0x92492492, + 0x24924924 + }; + const uint16_t mask0 = static_cast(blendMasks[0] >> ((offset * reg_capacity) % 3)); + const uint16_t mask1 = static_cast(blendMasks[1] >> ((offset * 
reg_capacity) % 3)); + + result = r; + result.blend(g, mask0); + result.blend(b, mask1); + }; + + blendWithMask(0, c); + blendWithMask(1, d); + blendWithMask(2, e); + }; // blend + + auto colorConvert = [&](const reg_type & y_val, const reg_type & uv_val) { + uni_vshufps(u_val, uv_val, uv_val, even_mask); // u_val = tmp[0,0,2,2,4,4,6,6] + uni_vshufps(v_val, uv_val, uv_val, odd_mask); // v_val = tmp[1,1,3,3,5,5,7,7] + + uni_vbroadcastss(tmp, ptr[consts + 0 * sizeof(float)]); // tmp = [16.0f,16.0f,...] + uni_vsubps(c, y_val, tmp); // c = y_val - tmp + uni_vbroadcastss(tmp, ptr[consts + 1 * sizeof(float)]); // tmp = [128.f,128.f,...] + uni_vsubps(d, u_val, tmp); // d = u_val - tmp + uni_vsubps(e, v_val, tmp); // e = v_val - tmp + + uni_vbroadcastss(tmp, ptr[consts + 2 * sizeof(float)]); // tmp = [1.164f,1.164f,...] + uni_vmulps(c, c, tmp); // c = c * tmp + + uni_vbroadcastss(r, ptr[consts + 3 * sizeof(float)]); // r = [1.596f,1.596f,...] + uni_vmulps(r, r, e); // r = r * e + uni_vaddps(r, r, c); // r = r + c + + uni_vbroadcastss(g, ptr[consts + 4 * sizeof(float)]); // g = [0.391f,0.391f,...] + uni_vmulps(g, g, d); // g = g * d + uni_vsubps(g, c, g); // g = c - g + uni_vbroadcastss(tmp, ptr[consts + 6 * sizeof(float)]); // tmp = [0.813f,0.813f,...] + uni_vmulps(tmp, tmp, e); // tmp = tmp * e + uni_vsubps(g, g, tmp); // g = g - tmp + + uni_vbroadcastss(b, ptr[consts + 5 * sizeof(float)]); // b = [2.018f,2.018f,...] 
+ uni_vmulps(b, b, d); // b = b * d + uni_vaddps(b, b, c); // b = b + c + + // clip + uni_vxorps(c, c, c); + uni_vbroadcastss(d, ptr[consts + 7 * sizeof(float)]); + + clip(r, c, d); + clip(g, c, d); + clip(b, c, d); + + _if(colorFormat == 0) + ._then([&]{ blend(r, g, b); }) + ._else([&]{ blend(b, g, r); }); + }; + + const size_t reg_capacity_log = static_cast(std::logb(reg_capacity)); + const size_t step = reg_capacity * sizeof(T); + + width >>= reg_capacity_log; + + foreach(0, width, [&](const Reg64 & idx) { + load(y_val, y); + load(uv_val, uv); + + colorConvert(y_val, uv_val); + + store(dst, c); dst += step; + store(dst, d); dst += step; + store(dst, e); dst += step; + + y += step; + uv += step; + }); + + mov(width, argPtr(&Params::width)); + width &= reg_capacity - 1; + + _if(width != 0) + ._then([&] { + auto s = stack(3 * step); + s.clear(); + + copy(s.pointer(), y, width); + copy(ptr[s.pointer() + step], uv, width); + + y = s.pointer(); + lea(uv, ptr[s.pointer() + step]); + + load(y_val, y); + load(uv_val, uv); + + colorConvert(y_val, uv_val); + + store(y, c); y += step; + store(y, d); y += step; + store(y, e); + + lea(width, ptr[width + width * 2]); + copy(ptr[dst], s.pointer(), width); + }); + + postamble(); +} + +template +const jit_uni_converter & jit_uni_converter::get() { + auto createKernel = []() { + std::unique_ptr kernel; + + if (mayiuse(cpu_isa_t::avx512_common)) { + kernel.reset(new jit_uni_converter_impl); + } else if (mayiuse(cpu_isa_t::avx2)) { + kernel.reset(new jit_uni_converter_impl); + } else if (mayiuse(cpu_isa_t::sse41)) { + kernel.reset(new jit_uni_converter_impl); + } else { + IE_THROW() << "Can't create jit color converter kernel"; + } + + if (kernel->create_kernel() != status::success) + IE_THROW() << "Can't generate jit color converter kernel"; + kernel->_fn = (function_t)kernel->jit_ker(); + + return std::move(kernel); + }; + + static auto kernel = createKernel(); + + return *kernel; +} + +template +class SinglePlaneConvert : public 
Converter { +public: + using Converter::Converter; + + void execute(mkldnn::stream strm) override { + const auto & kernel = jit_uni_converter::get(); + const auto & dims = inputDims(0); + + const size_t batch_size = dims[N_DIM]; + const size_t height = dims[H_DIM] * 2 / 3; + const size_t width = dims[W_DIM]; + + const T* y = static_cast(input(0)); + const T* uv = y + width * height; + T* dst = static_cast(output(0)); + + const size_t stride_y = height * width * 3 / 2; + const size_t stride_uv = height * width * 3 / 2; + + InferenceEngine::parallel_for2d(batch_size, height, [&](int batch, int h) { + typename jit_uni_converter::Params args; + args.y = y + batch * stride_y + h * width; + args.uv = uv + batch * stride_uv + (h / 2) * width; + args.dst = dst + (batch * width * height + h * width) * 3; + args.width = width; + args.colorFormat = _colorFormat[0]; // The first byte is enough to determine the RGB or BGR format. + kernel(args); + }); + } +}; + +template +class TwoPlaneConvert : public Converter { +public: + using Converter::Converter; + + void execute(mkldnn::stream strm) override { + const auto & kernel = jit_uni_converter::get(); + const auto & dims = inputDims(0); + + const size_t batch_size = dims[N_DIM]; + const size_t height = dims[H_DIM]; + const size_t width = dims[W_DIM]; + + const T* y = static_cast(input(0)); + const T* uv = static_cast(input(1)); + T* dst = static_cast(output(0)); + + const size_t stride_y = height * width; + const size_t stride_uv = height * width / 2; + + InferenceEngine::parallel_for2d(batch_size, height, [&](int batch, int h) { + typename jit_uni_converter::Params args; + args.y = y + batch * stride_y + h * width; + args.uv = uv + batch * stride_uv + (h / 2) * width; + args.dst = dst + (batch * width * height + h * width) * 3; + args.width = width; + args.colorFormat = _colorFormat[0]; // The first byte is enough to determine the RGB or BGR format. 
+ kernel(args); + }); + } +}; + +} // namespace nv12 +} // namespace + +MKLDNNColorConvertNode::Converter::Converter(MKLDNNNode *node, const ColorFormat & colorFormat) + : _node(node) + , _colorFormat(colorFormat) { +} + +InferenceEngine::Precision MKLDNNColorConvertNode::Converter::inputPrecision(size_t idx) const { + return _node->getParentEdgeAt(idx)->getMemory().GetDescWithType()->getPrecision(); +} + +InferenceEngine::Precision MKLDNNColorConvertNode::Converter::outputPrecision(size_t idx) const { + return _node->getChildEdgeAt(idx)->getMemory().GetDescWithType()->getPrecision(); +} + +const void * MKLDNNColorConvertNode::Converter::input(size_t idx) const { + return _node->getParentEdgeAt(idx)->getMemoryPtr()->GetPtr(); +} + +void * MKLDNNColorConvertNode::Converter::output(size_t idx) const { + return _node->getChildEdgeAt(idx)->getMemoryPtr()->GetPtr(); +} + +const VectorDims & MKLDNNColorConvertNode::Converter::inputDims(size_t idx) const { + return _node->getParentEdgesAtPort(idx)[0]->getMemory().getStaticDims(); +} + +bool MKLDNNColorConvertNode::isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept { + Algorithm alg; + std::tie(alg, errorMessage) = getAlgorithmFor(op); + return alg != Algorithm::Default; +} + +MKLDNNColorConvertNode::MKLDNNColorConvertNode(const std::shared_ptr& op, + const mkldnn::engine& eng, + MKLDNNWeightsSharing::Ptr &cache) + : MKLDNNNode(op, eng, cache) { + std::string errorMessage; + std::tie(algorithm, errorMessage) = getAlgorithmFor(op); + if (algorithm == Algorithm::Default) + IE_THROW(NotImplemented) << errorMessage; +} + +void MKLDNNColorConvertNode::getSupportedDescriptors() {} + +void MKLDNNColorConvertNode::initSupportedPrimitiveDescriptors() { + if (supportedPrimitiveDescriptors.empty()) { + switch (algorithm) { + case Algorithm::ColorConvertNV12toRGB: + case Algorithm::ColorConvertNV12toBGR: { + for (const auto &desc : nv12::supportedPrimitiveDescs(this)) { + const auto & inPortConfigs = 
std::get<0>(desc); + const auto & outPortConfigs = std::get<1>(desc); + const auto implType = std::get<2>(desc); + const auto dynBatchSupport = std::get<3>(desc); + addSupportedPrimDesc(inPortConfigs, outPortConfigs, implType, dynBatchSupport); + } + initSupportedNV12Impls(); + break; + default: + break; + } + } + } +} + +void MKLDNNColorConvertNode::initSupportedNV12Impls() { + #define SUPPORTED_IMPL(Impl, type, desc_type) \ + [](MKLDNNNode *node) { \ + return new nv12::Impl(node); \ + }; + + // ref + { + auto &impls = _supportedImpls[impl_desc_type::ref][algorithm]; + impls[Precision::U8][true] = SUPPORTED_IMPL(SinglePlaneConvert, uint8_t, ref); + impls[Precision::U8][false] = SUPPORTED_IMPL(TwoPlaneConvert, uint8_t, ref); + impls[Precision::FP32][true] = SUPPORTED_IMPL(SinglePlaneConvert, float, ref); + impls[Precision::FP32][false] = SUPPORTED_IMPL(TwoPlaneConvert, float, ref); + } + + // jit_uni + { + auto &impls = _supportedImpls[impl_desc_type::jit_uni][algorithm]; + impls[Precision::U8][true] = SUPPORTED_IMPL(SinglePlaneConvert, uint8_t, jit_uni); + impls[Precision::U8][false] = SUPPORTED_IMPL(TwoPlaneConvert, uint8_t, jit_uni); + impls[Precision::FP32][true] = SUPPORTED_IMPL(SinglePlaneConvert, float, jit_uni); + impls[Precision::FP32][false] = SUPPORTED_IMPL(TwoPlaneConvert, float, jit_uni); + } + + #undef SUPPORTED_IMPL +} + +void MKLDNNColorConvertNode::createPrimitive() { + const NodeDesc *desc = getSelectedPrimitiveDescriptor(); + if (!desc) + IE_THROW() << getTypeStr() + " node with name '" + getName() + "' " + << "no optimal primitive descriptor selected"; + + if (!_impl) { + const auto & cfg = desc->getConfig(); + const auto precision = cfg.inConfs[0].desc->getPrecision(); + const bool isSinglePlane = cfg.inConfs.size() == 1; + + _impl = std::unique_ptr(_supportedImpls + .at(desc->getImplementationType()) + .at(algorithm) + .at(precision) + .at(isSinglePlane)(this)); + } +} + +void MKLDNNColorConvertNode::execute(mkldnn::stream strm) { + if 
(!_impl) + IE_THROW() << getTypeStr() + " node with name '" + getName() + "' " + << "has no any implemented converter"; + _impl->execute(strm); +} + +bool MKLDNNColorConvertNode::created() const { + return getType() == ColorConvert; +} + +std::vector MKLDNNColorConvertNode::shapeInfer() const { + if (!_impl) + IE_THROW() << getTypeStr() + " node with name '" + getName() + "' " + << "has no any implemented converter"; + return _impl->shapeInfer(); +} + +bool MKLDNNColorConvertNode::needPrepareParams() const { + return false; +} + +void MKLDNNColorConvertNode::executeDynamicImpl(mkldnn::stream strm) { + execute(strm); +} + +REG_MKLDNN_PRIM_FOR(MKLDNNColorConvertNode, ColorConvert); + +} // namespace MKLDNNPlugin diff --git a/src/plugins/intel_cpu/src/nodes/mkldnn_color_convert_node.h b/src/plugins/intel_cpu/src/nodes/mkldnn_color_convert_node.h new file mode 100644 index 00000000000..7ed1d7b4747 --- /dev/null +++ b/src/plugins/intel_cpu/src/nodes/mkldnn_color_convert_node.h @@ -0,0 +1,75 @@ +// Copyright (C) 2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include +#include +#include +#include + +namespace MKLDNNPlugin { + +class MKLDNNColorConvertNode : public MKLDNNNode { +public: + MKLDNNColorConvertNode(const std::shared_ptr& op, + const mkldnn::engine& eng, + MKLDNNWeightsSharing::Ptr &cache); + class Converter; + +public: + void getSupportedDescriptors() override; + void initSupportedPrimitiveDescriptors() override; + void createPrimitive() override; + void execute(mkldnn::stream strm) override; + bool created() const override; + std::vector shapeInfer() const override; + bool needPrepareParams() const override; + void executeDynamicImpl(mkldnn::stream strm) override; + + static bool isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept; + +private: + void initSupportedNV12Impls(); + +private: + using ConverterBuilder = std::function; + using SupportedImpls = multidim_map; + + 
std::unique_ptr _impl; + SupportedImpls _supportedImpls; +}; + +class MKLDNNColorConvertNode::Converter { +public: + using PrimitiveDescs = std::vector, + std::vector, + impl_desc_type, + bool>>; + using Shapes = std::vector; + + static constexpr size_t N_DIM = 0; + static constexpr size_t H_DIM = 1; + static constexpr size_t W_DIM = 2; + static constexpr size_t C_DIM = 3; + + using ColorFormat = std::array; + + Converter(MKLDNNNode *node, const ColorFormat & colorFormat); + virtual ~Converter() = default; + InferenceEngine::Precision inputPrecision(size_t idx) const; + InferenceEngine::Precision outputPrecision(size_t idx) const; + const void * input(size_t idx) const; + void * output(size_t idx) const; + const VectorDims & inputDims(size_t idx) const; + virtual Shapes shapeInfer() const = 0; + virtual void execute(mkldnn::stream strm) = 0; + +protected: + MKLDNNNode *_node; + ColorFormat _colorFormat; // RGB: {0,1,2}, BGR: {2,1,0} +}; + +} // namespace MKLDNNPlugin diff --git a/src/plugins/intel_cpu/src/utils/blob_dump.cpp b/src/plugins/intel_cpu/src/utils/blob_dump.cpp index fb5d285de81..b5a00d7ce44 100644 --- a/src/plugins/intel_cpu/src/utils/blob_dump.cpp +++ b/src/plugins/intel_cpu/src/utils/blob_dump.cpp @@ -18,8 +18,8 @@ using namespace InferenceEngine; namespace MKLDNNPlugin { // IEB file format routine -static unsigned char IEB_MAGIC[4] = {'I', 'E', 'B', '0'}; -static unsigned char NO_SCALES = 0xFF; +static const unsigned char IEB_MAGIC[4] = {'I', 'E', 'B', '0'}; +static const unsigned char NO_SCALES = 0xFF; struct IEB_HEADER { unsigned char magic[4]; diff --git a/src/plugins/intel_cpu/src/utils/jit_kernel.cpp b/src/plugins/intel_cpu/src/utils/jit_kernel.cpp new file mode 100644 index 00000000000..c165d1da5f9 --- /dev/null +++ b/src/plugins/intel_cpu/src/utils/jit_kernel.cpp @@ -0,0 +1,334 @@ +// Copyright (C) 2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "jit_kernel.hpp" +#include + +using namespace dnnl::impl::cpu::x64; 
+using namespace Xbyak; + +namespace MKLDNNPlugin { + +namespace { + +template +using registers = std::array, 16>; + +template +const RegType & reserveReg(jit_kernel::reg_indices & freeRegs, const registers & regs) { + if (freeRegs.empty()) + throw std::runtime_error("No free registers"); + const auto idx = freeRegs.back(); + freeRegs.pop_back(); + return regs[idx]; +} + +template +void freeReg(jit_kernel::reg_indices & freeRegs, const registers & regs, const RegType & reg) { + const auto idx = reg.getIdx(); + // Debug: + // auto it = std::find(freeRegs.begin(), freeRegs.end(), idx); + // if (it != freeRegs.end()) + // throw std::runtime_error("Some register was freed twice"); + freeRegs.emplace_back(idx); + if (freeRegs.size() > regs.size()) + throw std::runtime_error("Some register was freed twice"); +} + +const registers & x64regs() { + using namespace Xbyak::util; + static const registers _x64regs {{ + rax, rcx, rdx, rbx, + rsp, rbp, rsi, rdi, + r8, r9, r10, r11, + r12, r13, r14, r15, + }}; + return _x64regs; +} + +const registers & x32regs() { + using namespace Xbyak::util; + static const registers _x32regs {{ + eax, ecx, edx, ebx, + esp, ebp, esi, edi, + r8d, r9d, r10d, r11d, + r12d, r13d, r14d, r15d, + }}; + return _x32regs; +} + +const registers & x16regs() { + using namespace Xbyak::util; + static const registers _x16regs {{ + ax, cx, dx, bx, + sp, bp, si, di, + r8w, r9w, r10w, r11w, + r12w, r13w, r14w, r15w, + }}; + return _x16regs; +} + +const registers & x8regs() { + using namespace Xbyak::util; + static const registers _x8regs {{ + al, cl, dl, bl, + spl, bpl, sil, dil, + r8b, r9b, r10b, r11b, + r12b, r13b, r14b, r15b, + }}; + return _x8regs; +} + +const registers & xmmregs() { + static const registers _xmmregs {{ + util::xmm0, util::xmm1, util::xmm2, util::xmm3, + util::xmm4, util::xmm5, util::xmm6, util::xmm7, + util::xmm8, util::xmm9, util::xmm10, util::xmm11, + util::xmm12, util::xmm13, util::xmm14, util::xmm15, + }}; + return _xmmregs; +} + +const 
registers & ymmregs() { + static const registers _ymmregs {{ + util::ymm0, util::ymm1, util::ymm2, util::ymm3, + util::ymm4, util::ymm5, util::ymm6, util::ymm7, + util::ymm8, util::ymm9, util::ymm10, util::ymm11, + util::ymm12, util::ymm13, util::ymm14, util::ymm15, + }}; + return _ymmregs; +} + +const registers & zmmregs() { + static const registers _zmmregs {{ + util::zmm0, util::zmm1, util::zmm2, util::zmm3, + util::zmm4, util::zmm5, util::zmm6, util::zmm7, + util::zmm8, util::zmm9, util::zmm10, util::zmm11, + util::zmm12, util::zmm13, util::zmm14, util::zmm15, + }}; + return _zmmregs; +} + +} // namespace + +namespace internal { + +template<> +InferenceEngine::Precision type2precision() { + return InferenceEngine::Precision::FP32; +} + +template<> +InferenceEngine::Precision type2precision() { + return InferenceEngine::Precision::U8; +} + +cpu_isa_t get_current_isa() { + if (mayiuse(cpu_isa_t::avx512_common)) + return cpu_isa_t::avx512_common; + if (mayiuse(cpu_isa_t::avx2)) + return cpu_isa_t::avx2; + return cpu_isa_t::sse41; +} + +stack_frame::stack_frame(MKLDNNPlugin::jit_kernel & kernel, size_t size) + : _kernel(kernel) + , _size(size) { + if (_size) + _kernel.sub(_kernel.rsp, _size); +} + +stack_frame::stack_frame(stack_frame && rhs) + : _kernel(rhs._kernel) + , _size(rhs._size) { + rhs._size = 0; +} + +stack_frame::~stack_frame() { + if (_size) + _kernel.add(_kernel.rsp, _size); +} + +const Xbyak::Reg64 & stack_frame::pointer() const { + return _kernel.rsp; +} + +void stack_frame::clear() const { + const size_t end = _size & ~(size_t)7u; + + _kernel.foreach(0, end, [&](const Reg64 & idx) { + _kernel.mov(_kernel.qword[pointer() + idx], 0); + }, sizeof(size_t)); + + if (end < _size) { + _kernel.foreach(end, _size, [&](const Reg64 & idx) { + _kernel.mov(_kernel.byte[pointer() + idx], 0); + }); + } +} + +} // namespace internal + +jit_kernel::jit_kernel() + : _load_emitter(this, internal::get_current_isa()) + , _store_emitter(this, 
internal::get_current_isa()) { + _free_rmmregs.reserve(16); + _free_rmmregs.reserve(16); + + auto isRegReserved = [this](int idx) { + return idx == param1.getIdx() // function argument + || idx == Operand::Code::RSP // stack pointer + || idx == Operand::Code::RBP; // frame pointer + }; + + for (int reg = Operand::Code::RAX; reg <= Operand::Code::R15; ++reg) { + if (!isRegReserved(reg)) + _free_x64regs.emplace_back(reg); + _free_rmmregs.emplace_back(reg); + } +} + +template<> +const Reg64 & jit_kernel::reserve() { + return reserveReg(_free_x64regs, x64regs()); +} + +template<> +const Reg32 & jit_kernel::reserve() { + return reserveReg(_free_x64regs, x32regs()); +} + +template<> +const Reg16 & jit_kernel::reserve() { + return reserveReg(_free_x64regs, x16regs()); +} + +template<> +const Reg8 & jit_kernel::reserve() { + return reserveReg(_free_x64regs, x8regs()); +} + +template<> +void jit_kernel::free(const Reg64 & reg) { + freeReg(_free_x64regs, x64regs(), reg); +} + +template<> +void jit_kernel::free(const Reg32 & reg) { + freeReg(_free_x64regs, x32regs(), reg); +} + +template<> +void jit_kernel::free(const Reg16 & reg) { + freeReg(_free_x64regs, x16regs(), reg); +} + +template<> +void jit_kernel::free(const Reg8 & reg) { + freeReg(_free_x64regs, x8regs(), reg); +} + +template<> +const Xmm & jit_kernel::reserve() { + return reserveReg(_free_rmmregs, xmmregs()); +} + +template<> +void jit_kernel::free(const Xmm & reg) { + freeReg(_free_rmmregs, xmmregs(), reg); +} + +template<> +const Ymm & jit_kernel::reserve() { + return reserveReg(_free_rmmregs, ymmregs()); +} + +template<> +void jit_kernel::free(const Ymm & reg) { + freeReg(_free_rmmregs, ymmregs(), reg); +} + +template<> +const Zmm & jit_kernel::reserve() { + return reserveReg(_free_rmmregs, zmmregs()); +} + +template<> +void jit_kernel::free(const Zmm & reg) { + freeReg(_free_rmmregs, zmmregs(), reg); +} + +void jit_kernel::postamble() { + jit_generator::postamble(); + if (_is_load_emitter_used) + 
_load_emitter.emit_data(); + if (_is_store_emitter_used) + _store_emitter.emit_data(); +} + +const AddressFrame & jit_kernel::address_frame(size_t size) const { + switch (size) { + case 1: return byte; + case 2: return word; + case 4: return dword; + case 8: return qword; + case 16: return xword; + case 32: return yword; + case 64: return zword; + default: + break; + } + return ptr; +} + +jit_kernel::stack_frame jit_kernel::stack(size_t size) { + return stack_frame(*this, size); +} + +void jit_kernel::uni_vpermps(const Xmm& x1, const int *mask, const Operand& op) { + uint8_t imm8 = static_cast(*mask); + mov(x1, op); + shufps(x1, op, imm8); +} + +void jit_kernel::uni_vpermps(const Ymm& y1, const int *mask, const Operand& op) { + auto mreg = reserve(); + auto mptr = reserve(); + + mov(mptr, (size_t)mask); + uni_vmovdqu(mreg, ptr[mptr]); + vpermps(y1, mreg, op); + + free(mreg); + free(mptr); +} + +void jit_kernel::uni_vpermps(const Zmm& z1, const int *mask, const Operand& op) { + auto mreg = reserve(); + auto mptr = reserve(); + + mov(mptr, (size_t)mask); + uni_vmovdqu(mreg, ptr[mptr]); + vpermps(z1, mreg, op); + + free(mreg); + free(mptr); +} + +void jit_kernel::uni_vblendps(const Xbyak::Xmm& x1, const Xbyak::Xmm& x2, uint16_t mask) { + blendps(x1, x2, mask); +} + +void jit_kernel::uni_vblendps(const Xbyak::Ymm& y1, const Xbyak::Ymm& y2, uint16_t mask) { + vblendps(y1, y1, y2, static_cast(mask)); +} + +void jit_kernel::uni_vblendps(const Xbyak::Zmm& z1, const Xbyak::Zmm& z2, uint16_t mask) { + auto reg = var(); + mov(reg, mask); + kmovw(k1, reg); + vblendmps(z1 | k1, z1, z2); +} + +} // namespace MKLDNNPlugin diff --git a/src/plugins/intel_cpu/src/utils/jit_kernel.hpp b/src/plugins/intel_cpu/src/utils/jit_kernel.hpp new file mode 100644 index 00000000000..283c6823a5f --- /dev/null +++ b/src/plugins/intel_cpu/src/utils/jit_kernel.hpp @@ -0,0 +1,697 @@ +// Copyright (C) 2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once +#include 
+#include +#include +#include +#include +#include +#include +#include +#include + +namespace MKLDNNPlugin { + +struct jit_kernel; + +namespace internal { + +template +struct reg_traits_by_size; +template +struct reg_traits; +template +struct reg_traits; +template +struct isa_traits; + +template<> +struct reg_traits_by_size<1> { + using type = Xbyak::Reg8; + constexpr static size_t size = 1; // in bytes + constexpr static dnnl::impl::cpu::x64::cpu_isa_t isa + = dnnl::impl::cpu::x64::cpu_isa_t::isa_any; +}; + +template<> +struct reg_traits_by_size<2> { + using type = Xbyak::Reg16; + constexpr static size_t size = 2; // in bytes + constexpr static dnnl::impl::cpu::x64::cpu_isa_t isa + = dnnl::impl::cpu::x64::cpu_isa_t::isa_any; +}; + +template<> +struct reg_traits_by_size<4> { + using type = Xbyak::Reg32; + constexpr static size_t size = 4; // in bytes + constexpr static dnnl::impl::cpu::x64::cpu_isa_t isa + = dnnl::impl::cpu::x64::cpu_isa_t::isa_any; +}; + +template<> +struct reg_traits_by_size<8> { + using type = Xbyak::Reg64; + constexpr static size_t size = 8; // in bytes + constexpr static dnnl::impl::cpu::x64::cpu_isa_t isa + = dnnl::impl::cpu::x64::cpu_isa_t::isa_any; +}; + +template +struct reg_traits : public reg_traits_by_size {}; + +template<> +struct reg_traits { + using type = Xbyak::Fpu; + constexpr static size_t size = 10; // in bytes + constexpr static dnnl::impl::cpu::x64::cpu_isa_t isa + = dnnl::impl::cpu::x64::cpu_isa_t::isa_any; +}; +template<> +struct reg_traits : public reg_traits {}; + +template +struct reg_traits : public reg_traits {}; + +template +struct reg_traits { + using type = Xbyak::Xmm; + constexpr static size_t size = 4 * 4; // in bytes + constexpr static dnnl::impl::cpu::x64::cpu_isa_t isa + = dnnl::impl::cpu::x64::cpu_isa_t::sse41; +}; + +template +struct reg_traits { + using type = Xbyak::Ymm; + constexpr static size_t size = 8 * 4; // in bytes + constexpr static dnnl::impl::cpu::x64::cpu_isa_t isa + = 
dnnl::impl::cpu::x64::cpu_isa_t::avx2; +}; + +template +struct reg_traits { + using type = Xbyak::Zmm; + constexpr static size_t size = 16 * 4; // in bytes + constexpr static dnnl::impl::cpu::x64::cpu_isa_t isa + = dnnl::impl::cpu::x64::cpu_isa_t::avx512_common; +}; + +template<> +struct isa_traits { + struct reg { + using type = Xbyak::Xmm; + constexpr static size_t size = 4 * 4; // in bytes + constexpr static size_t length = 4; // in dwords + }; +}; + +template<> +struct isa_traits { + struct reg { + using type = Xbyak::Ymm; + constexpr static size_t size = 8 * 4; // in bytes + constexpr static size_t length = 8; // in dwords + }; +}; + +template<> +struct isa_traits { + struct reg { + using type = Xbyak::Zmm; + constexpr static size_t size = 16 * 4; // in bytes + constexpr static size_t length = 16; // in dwords + }; +}; + +template +class variable; +template +class if_expression; +template +class then_expression; + +template +class boolean_expression { +private: + using reg_type = typename reg_traits::type; + + enum class type { + eq, neq + }; + + boolean_expression(jit_kernel & kernel, type t, const reg_type & lhs, const reg_type & rhs); + boolean_expression(jit_kernel & kernel, type t, const reg_type & lhs, T rhs); + void cmp(const Xbyak::Label & exit) const; + + jit_kernel & _kernel; + type _type; + const reg_type & _lhs; + + bool _is_ref; + + union datum { + datum(const reg_type & r) + : reg(&r) {} + datum(T v) + : value(v) {} + const reg_type * reg; + T value; + } _rhs; + + friend class variable; + friend class if_expression; + friend class then_expression; +}; + +template +struct then_expression { + then_expression(if_expression & expr); + + template + void _else(F && fn); + +private: + if_expression & _if_expr; +}; + +template +struct if_expression { + if_expression(const boolean_expression & expr) + : _expr(expr) {} + + ~if_expression() { + try { + if (!_is_exit_valid) + _expr._kernel.assignL(_exit, _else); + } catch(...) 
{} + } + + template + then_expression _then(F && fn) { + using namespace Xbyak; + + _expr.cmp(_else); + fn(); + _expr._kernel.jmp(_exit, Xbyak::CodeGenerator::T_NEAR); + _expr._kernel.L(_else); + + return then_expression(*this); + } + +private: + const boolean_expression & _expr; + Xbyak::Label _exit; + Xbyak::Label _else; + bool _is_exit_valid = false; + + friend class then_expression; +}; + +template +class variable_base { +public: + using reg_type = typename reg_traits::type; + + variable_base(const variable_base &) = delete; + variable_base & operator = (const variable_base &) = delete; + variable_base(variable_base &&); + + operator const reg_type &() const { + return _reg; + } + + operator Xbyak::RegExp () const { + return _reg; + } + + jit_kernel & kernel; + +protected: + variable_base(jit_kernel & krnl, const reg_type & reg); + ~variable_base(); + + bool _manage_lifetime = true; + const reg_type & _reg; +}; + +template +class variable : public variable_base::value, T>::type> { +public: + using type = T; + using base = variable_base; + using reg_type = typename base::reg_type; + + variable(variable &&) = default; + variable(jit_kernel & krnl); + variable(jit_kernel & krnl, const reg_type & reg); + + const variable & operator = (const reg_type & rhs) const { + base::kernel.mov(base::_reg, rhs); + return *this; + } + const variable & operator = (T rhs) const { + base::kernel.mov(base::_reg, rhs); + return *this; + } + const variable & operator += (const reg_type & rhs) const { + base::kernel.add(base::_reg, rhs); + return *this; + } + const variable & operator += (typename std::conditional::value, size_t, T>::type rhs) const { + base::kernel.add(base::_reg, rhs); + return *this; + } + const variable & operator -= (const reg_type & rhs) const { + base::kernel.sub(base::_reg, rhs); + return *this; + } + const variable & operator -= (typename std::conditional::value, size_t, T>::type rhs) const { + base::kernel.sub(base::_reg, rhs); + return *this; + } + const 
variable & operator &= (const reg_type & rhs) const { + base::kernel.and_(base::_reg, rhs); + return *this; + } + const variable & operator &= (T rhs) const { + base::kernel.and_(base::_reg, rhs); + return *this; + } + const variable & operator |= (const reg_type & rhs) const { + base::kernel.or_(base::_reg, rhs); + return *this; + } + const variable & operator |= (T rhs) const { + base::kernel.or_(base::_reg, rhs); + return *this; + } + const variable & operator >>= (size_t rhs) const { + base::kernel.shr(base::_reg, rhs); + return *this; + } + const variable & operator <<= (size_t rhs) const { + base::kernel.shl(base::_reg, rhs); + return *this; + } + + boolean_expression operator == (const reg_type & rhs) const { + return boolean_expression(base::kernel, boolean_expression::type::eq, base::_reg, rhs); + } + + boolean_expression operator == (T rhs) const { + return boolean_expression(base::kernel, boolean_expression::type::eq, base::_reg, rhs); + } + + boolean_expression operator != (const reg_type & rhs) const { + return boolean_expression(base::kernel, boolean_expression::type::neq, base::_reg, rhs); + } + + boolean_expression operator != (T rhs) const { + return boolean_expression(base::kernel, boolean_expression::type::neq, base::_reg, rhs); + } + + // TODO: add necessary operations +}; + +template +class variable : public variable_base { +public: + using type = T[N]; + using base = variable_base; + using reg_type = typename base::reg_type; + constexpr static size_t length = N; + + variable(variable &&) = default; + variable(jit_kernel & krnl); + variable(jit_kernel & krnl, const reg_type & reg); + + const variable & operator = (const reg_type & rhs) const { + base::kernel.uni_vmovups(base::_reg, rhs); + return *this; + } + + const variable & blend(const reg_type & rhs, uint16_t mask) const { + base::kernel.uni_vblendps(base::_reg, rhs, mask); + return *this; + } + + // TODO: implement vector arithmetic +}; + +class stack_frame { + stack_frame(const 
stack_frame &) = delete; + stack_frame & operator = (const stack_frame &) = delete; + +public: + stack_frame(jit_kernel & kernel, size_t size); + stack_frame(stack_frame && rhs); + ~stack_frame(); + const Xbyak::Reg64 & pointer() const; + void clear() const; + +private: + jit_kernel & _kernel; + size_t _size; +}; + +template +InferenceEngine::Precision type2precision(); + +dnnl::impl::cpu::x64::cpu_isa_t get_current_isa(); + +} // namespace internal + +struct jit_kernel : public dnnl::impl::cpu::x64::jit_generator { + using reg_indices = std::vector; + template + using reg_traits = internal::reg_traits; + template + using reg_traits_by_size = internal::reg_traits_by_size; + template + using isa_traits = internal::isa_traits; + using stack_frame = internal::stack_frame; + template + using variable = internal::variable; + template + using if_expression = internal::if_expression; + template + using boolean_expression = internal::boolean_expression; + + template + Xbyak::Address argPtr(U T::*member) const { + auto memPtr = &(reinterpret_cast(0)->*member); + const size_t offs = reinterpret_cast(memPtr) - reinterpret_cast(0); + return address_frame(sizeof(U))[param1 + offs]; + } + + template + variable arg(U T::*member) { + using traits = internal::reg_traits; + using reg_type = typename traits::type; + const auto & res = reserve(); + if (sizeof(T) < traits::size) + movzx(res, argPtr(member)); + else + mov(res, argPtr(member)); + return { *this, res }; + } + + template + variable arg(U T::*member) { + using traits = internal::reg_traits; + using reg_type = typename traits::type; + const auto & res = reserve(); + if (sizeof(T) < traits::size) + movzx(res, argPtr(member)); + else + mov(res, argPtr(member)); + return { *this, res }; + } + + jit_kernel(); + + template + const RegType & reserve(); + + template + void free(const RegType & reg); + + template + void copy(const Xbyak::Reg64& dst, + const Xbyak::Reg64& src, + const Xbyak::Reg64& size); + template + void copy(const 
Xbyak::Address& dst, + const Xbyak::Reg64& src, + const Xbyak::Reg64& size); + + template + void load(const variable & dst, const variable & src); + template + void store(const variable & dst, const variable & src); + + template + void foreach(const B & begin, + const E & end, + std::function && fn, + const S & step = 1); + + template + variable var(); + + stack_frame stack(size_t size); + + template + if_expression _if(const boolean_expression & expr); + + void uni_vpermps(const Xbyak::Xmm& x1, const int *mask, const Xbyak::Operand& op); + void uni_vpermps(const Xbyak::Ymm& y1, const int *mask, const Xbyak::Operand& op); + void uni_vpermps(const Xbyak::Zmm& z1, const int *mask, const Xbyak::Operand& op); + void uni_vblendps(const Xbyak::Xmm& x1, const Xbyak::Xmm& x2, uint16_t mask); + void uni_vblendps(const Xbyak::Ymm& y1, const Xbyak::Ymm& y2, uint16_t mask); + void uni_vblendps(const Xbyak::Zmm& z1, const Xbyak::Zmm& z2, uint16_t mask); + + void postamble(); + +private: + const Xbyak::AddressFrame & address_frame(size_t size) const; + + reg_indices _free_x64regs; + reg_indices _free_rmmregs; + bool _is_load_emitter_used = false; + bool _is_store_emitter_used = false; + jit_load_emitter _load_emitter; + jit_store_emitter _store_emitter; +}; + +template +void jit_kernel::copy(const Xbyak::Reg64& dst, + const Xbyak::Reg64& src, + const Xbyak::Reg64& size) { + const auto & addr_frame = address_frame(sizeof(T)); + auto p = reserve::type>(); + foreach(0, size, [&](const Xbyak::Reg64& idx) { + mov(p, addr_frame[src + idx * sizeof(T)]); + mov(addr_frame[dst + idx * sizeof(T)], p); + }); + free(p); +} + +template +void jit_kernel::copy(const Xbyak::Address& dst, + const Xbyak::Reg64& src, + const Xbyak::Reg64& size) { + const auto & addr_frame = address_frame(sizeof(T)); + auto p = reserve::type>(); + auto d = reserve(); + lea(d, dst); + foreach(0, size, [&](const Xbyak::Reg64& idx) { + mov(p, addr_frame[src + idx * sizeof(T)]); + mov(addr_frame[d + idx * sizeof(T)], 
p); + }); + free(d); + free(p); +} + +template +void jit_kernel::load(const variable & dst, const variable & src) { + static_assert(std::is_same::reg_type, Xbyak::Reg64>::value, + "Source register must be Reg64"); + + using src_type = typename std::remove_cv< + typename std::remove_pointer< + typename std::decay::type>::type>::type; + using dst_type = typename std::remove_cv< + typename std::remove_pointer< + typename std::decay::type>::type>::type; + constexpr size_t length = variable::length; + + const std::vector pool_vec_idxs(_free_rmmregs.begin(), _free_rmmregs.end()); + const std::vector pool_gpr_idxs(_free_x64regs.begin(), _free_x64regs.end()); + + _load_emitter.emit_code( + { static_cast(static_cast(src).getIdx()) }, + { static_cast(static_cast(dst).getIdx()) }, + std::make_shared( + internal::type2precision(), + internal::type2precision(), + static_cast(length)), + pool_vec_idxs, + pool_gpr_idxs); + + _is_load_emitter_used = true; +} + +template +void jit_kernel::store(const variable & dst, const variable & src) { + static_assert(std::is_same::reg_type, Xbyak::Reg64>::value, + "Destibnation register must be Reg64"); + + using src_type = typename std::remove_cv< + typename std::remove_pointer< + typename std::decay::type>::type>::type; + using dst_type = typename std::remove_cv< + typename std::remove_pointer< + typename std::decay::type>::type>::type; + constexpr size_t length = variable::length; + + const std::vector pool_vec_idxs(_free_rmmregs.begin(), _free_rmmregs.end()); + const std::vector pool_gpr_idxs(_free_x64regs.begin(), _free_x64regs.end()); + + _store_emitter.emit_code( + { static_cast(static_cast(src).getIdx()) }, + { static_cast(static_cast(dst).getIdx()) }, + std::make_shared( + internal::type2precision(), + internal::type2precision(), + static_cast(length)), + pool_vec_idxs, + pool_gpr_idxs); + + _is_store_emitter_used = true; +} + +template +void jit_kernel::foreach(const B & begin, + const E & end, + std::function && fn, + const S & 
step) { + using namespace Xbyak; + + Label loop, exit; + + auto idx = reserve(); + + mov(idx, begin); + + L(loop); + cmp(idx, end); + jge(exit, T_NEAR); + + fn(idx); + + add(idx, step); + jmp(loop, T_NEAR); + L(exit); + + free(idx); +} + +template +jit_kernel::variable jit_kernel::var() { + using reg_type = typename reg_traits::type; + const auto & reg = reserve(); + return variable(*this, reg); +} + +template +jit_kernel::if_expression jit_kernel::_if(const boolean_expression & expr) { + return if_expression(expr); +} + +namespace internal { + +// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +// boolean_expression + +template +boolean_expression::boolean_expression(jit_kernel & kernel, type t, const reg_type & lhs, const reg_type & rhs) + : _kernel(kernel) + , _type(t) + , _lhs(lhs) + , _is_ref(true) + , _rhs(rhs) { +} + +template +boolean_expression::boolean_expression(jit_kernel & kernel, type t, const reg_type & lhs, T rhs) + : _kernel(kernel) + , _type(t) + , _lhs(lhs) + , _is_ref(false) + , _rhs(rhs) { +} + +template +void boolean_expression::cmp(const Xbyak::Label & exit) const { + if (_is_ref) + _kernel.cmp(_lhs, *_rhs.reg); + else + _kernel.cmp(_lhs, _rhs.value); + + switch (_type) { + case type::eq: { + _kernel.jne(exit, Xbyak::CodeGenerator::T_NEAR); + break; + } + case type::neq: { + _kernel.je(exit, Xbyak::CodeGenerator::T_NEAR); + break; + } + } +} + +// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +// then_expression + +template +then_expression::then_expression(if_expression & expr) + : _if_expr(expr) {} + +template +template +void then_expression::_else(F && fn) { + fn(); + _if_expr._expr._kernel.L(_if_expr._exit); + _if_expr._is_exit_valid = true; +} + +// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +// variable + +template +variable_base::variable_base(jit_kernel & krnl, const reg_type & reg) + : kernel(krnl) + , _reg(reg) { +} + +template +variable_base::variable_base(variable_base && rhs) + : kernel(rhs.kernel) + , 
_reg(rhs._reg) { + rhs._manage_lifetime = false; +} + +template +variable_base::~variable_base() { + if (_manage_lifetime) + kernel.free(_reg); +} + +template +variable::variable(jit_kernel & krnl) + : base(krnl, krnl.reserve::type>()) { +} + +template +variable::variable(jit_kernel & krnl, const reg_type & reg) + : base(krnl, reg) { +} + +template +variable::variable(jit_kernel & krnl) + : base(krnl, krnl.reserve::type>()) { +} + +template +variable::variable(jit_kernel & krnl, const reg_type & reg) + : base(krnl, reg) { +} + +} // namespace internal + +} // namespace MKLDNNPlugin diff --git a/src/plugins/intel_cpu/src/utils/multidim_map.hpp b/src/plugins/intel_cpu/src/utils/multidim_map.hpp new file mode 100644 index 00000000000..7bee94fbb74 --- /dev/null +++ b/src/plugins/intel_cpu/src/utils/multidim_map.hpp @@ -0,0 +1,64 @@ +// Copyright (C) 2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once +#include +#include +#include + +namespace MKLDNNPlugin { + +namespace internal { + +template +struct enum_hash { + std::size_t operator()(K t) const { + return static_cast(t); + } +}; + +template +using hash_t = typename std::conditional::value, enum_hash, std::hash>::type; + +} // namespace internal + +template +struct multidim_map { + using key_type = K; + using mapped_type = multidim_map; + using hash_type = internal::hash_t; + +public: + mapped_type & operator[](const key_type & key) { + return _map[key]; + } + + const mapped_type & at(const key_type & key) const { + return _map.at(key); + } + +private: + std::unordered_map _map; +}; + +template +struct multidim_map { + using key_type = K; + using mapped_type = T; + using hash_type = internal::hash_t; + +public: + mapped_type & operator[](const key_type & key) { + return _map[key]; + } + + const mapped_type & at(const key_type & key) const { + return _map.at(key); + } + +private: + std::unordered_map _map; +}; + +} // namespace MKLDNNPlugin From 010877d06c8f28e99f047f72b8fdd1305da77413 
Mon Sep 17 00:00:00 2001 From: Nikita Malinin Date: Wed, 29 Dec 2021 15:11:56 +0300 Subject: [PATCH 23/78] [POT] Update dtype for BC Parameters (#9468) --- .../pot/algorithms/quantization/bias_correction/algorithm.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tools/pot/openvino/tools/pot/algorithms/quantization/bias_correction/algorithm.py b/tools/pot/openvino/tools/pot/algorithms/quantization/bias_correction/algorithm.py index 26a7c0532b7..4d49e2a753f 100644 --- a/tools/pot/openvino/tools/pot/algorithms/quantization/bias_correction/algorithm.py +++ b/tools/pot/openvino/tools/pot/algorithms/quantization/bias_correction/algorithm.py @@ -261,11 +261,12 @@ class BiasCorrection(Algorithm): inputs_data = [] for input_node in input_nodes: input_node_name = nu.create_node_name(input_node) + input_node_data_type = nu.get_node_data_type(input_node) c_input_shape = outputs_shapes[input_node_name] c_input_shape[0] = 1 parameter_name = input_node_name + '/parameter' param_node = ge.create_node(input_node.graph, parameter_name, 'Parameter', - {'shape': c_input_shape}) + {'shape': c_input_shape, 'data_type': input_node_data_type}) for _, port in input_node.out_ports().items(): for in_port in port.get_destinations(): in_port.disconnect() From 39a1b98799a2ec08a0a4272bd173e084e1462455 Mon Sep 17 00:00:00 2001 From: Maxim Gordeev Date: Wed, 29 Dec 2021 15:36:33 +0300 Subject: [PATCH 24/78] changed C++ samples due to OpenVINO style (#9463) I'll merge this. We need to talk with someone who know openvino build system better than we to "use cmake function ov_ncc_style_check to perform such check automatically". 
As far as I can see, currently it used only in one place, so it is not common approach for openvino components --- .../classification_sample_async.h | 2 +- .../cpp/classification_sample_async/main.cpp | 10 +- .../hello_nv12_input_classification/main.cpp | 4 +- samples/cpp/hello_query_device/main.cpp | 9 +- samples/cpp/speech_sample/fileutils.cpp | 80 ++++++------- samples/cpp/speech_sample/fileutils.hpp | 102 ++++++++-------- samples/cpp/speech_sample/main.cpp | 113 +++++++++--------- samples/cpp/speech_sample/utils.hpp | 60 +++++----- 8 files changed, 189 insertions(+), 191 deletions(-) diff --git a/samples/cpp/classification_sample_async/classification_sample_async.h b/samples/cpp/classification_sample_async/classification_sample_async.h index 24546c10e21..2df4ba21849 100644 --- a/samples/cpp/classification_sample_async/classification_sample_async.h +++ b/samples/cpp/classification_sample_async/classification_sample_async.h @@ -45,7 +45,7 @@ DEFINE_string(d, "CPU", target_device_message); /** * @brief This function show a help message */ -static void showUsage() { +static void show_usage() { std::cout << std::endl; std::cout << "classification_sample_async [OPTION]" << std::endl; std::cout << "Options:" << std::endl; diff --git a/samples/cpp/classification_sample_async/main.cpp b/samples/cpp/classification_sample_async/main.cpp index bc3ea275f1e..39ee8ed8df2 100644 --- a/samples/cpp/classification_sample_async/main.cpp +++ b/samples/cpp/classification_sample_async/main.cpp @@ -40,22 +40,22 @@ using namespace ov::preprocess; * @param argv list of input arguments * @return bool status true(Success) or false(Fail) */ -bool ParseAndCheckCommandLine(int argc, char* argv[]) { +bool parse_and_check_command_line(int argc, char* argv[]) { gflags::ParseCommandLineNonHelpFlags(&argc, &argv, true); if (FLAGS_h) { - showUsage(); + show_usage(); showAvailableDevices(); return false; } slog::info << "Parsing input parameters" << slog::endl; if (FLAGS_m.empty()) { - showUsage(); 
+ show_usage(); throw std::logic_error("Model is required but not set. Please set -m option."); } if (FLAGS_i.empty()) { - showUsage(); + show_usage(); throw std::logic_error("Input is required but not set. Please set -i option."); } @@ -68,7 +68,7 @@ int main(int argc, char* argv[]) { slog::info << ov::get_openvino_version() << slog::endl; // -------- Parsing and validation of input arguments -------- - if (!ParseAndCheckCommandLine(argc, argv)) { + if (!parse_and_check_command_line(argc, argv)) { return EXIT_SUCCESS; } diff --git a/samples/cpp/hello_nv12_input_classification/main.cpp b/samples/cpp/hello_nv12_input_classification/main.cpp index 2decf681c52..eba8750b4b0 100644 --- a/samples/cpp/hello_nv12_input_classification/main.cpp +++ b/samples/cpp/hello_nv12_input_classification/main.cpp @@ -37,7 +37,7 @@ using namespace ov::preprocess; * @param string of image size in WIDTHxHEIGHT format * @return parsed width and height */ -std::pair parseImageSize(const std::string& size_string) { +std::pair parse_image_size(const std::string& size_string) { auto delimiter_pos = size_string.find("x"); if (delimiter_pos == std::string::npos || delimiter_pos >= size_string.size() - 1 || delimiter_pos == 0) { std::stringstream err; @@ -81,7 +81,7 @@ int main(int argc, char* argv[]) { const std::string image_path{argv[2]}; size_t input_width = 0; size_t input_height = 0; - std::tie(input_width, input_height) = parseImageSize(argv[3]); + std::tie(input_width, input_height) = parse_image_size(argv[3]); const std::string device_name{argv[4]}; // ----------------------------------------------------------------------------------------------------- diff --git a/samples/cpp/hello_query_device/main.cpp b/samples/cpp/hello_query_device/main.cpp index dc8ad5af65d..91418d769d8 100644 --- a/samples/cpp/hello_query_device/main.cpp +++ b/samples/cpp/hello_query_device/main.cpp @@ -16,13 +16,12 @@ #include "samples/slog.hpp" // clang-format on -namespace { /** * @brief Print IE Parameters * 
@param reference on IE Parameter * @return void */ -void printAnyValue(const ov::Any& value) { +void print_any_value(const ov::Any& value) { if (value.empty()) { slog::info << "EMPTY VALUE" << slog::endl; } else if (value.is()) { @@ -84,8 +83,6 @@ void printAnyValue(const ov::Any& value) { } } -} // namespace - int main(int argc, char* argv[]) { try { // -------- Get OpenVINO runtime version -------- @@ -114,7 +111,7 @@ int main(int argc, char* argv[]) { for (auto&& metricName : supportedMetrics) { if (metricName != METRIC_KEY(SUPPORTED_METRICS) && metricName != METRIC_KEY(SUPPORTED_CONFIG_KEYS)) { slog::info << "\t\t" << metricName << " : " << slog::flush; - printAnyValue(core.get_metric(device, metricName)); + print_any_value(core.get_metric(device, metricName)); } } @@ -126,7 +123,7 @@ int main(int argc, char* argv[]) { core.get_metric(device, METRIC_KEY(SUPPORTED_CONFIG_KEYS)); for (auto&& configKey : supportedConfigKeys) { slog::info << "\t\t" << configKey << " : " << slog::flush; - printAnyValue(core.get_config(device, configKey)); + print_any_value(core.get_config(device, configKey)); } } diff --git a/samples/cpp/speech_sample/fileutils.cpp b/samples/cpp/speech_sample/fileutils.cpp index d661443d541..651bdbe1cea 100644 --- a/samples/cpp/speech_sample/fileutils.cpp +++ b/samples/cpp/speech_sample/fileutils.cpp @@ -4,10 +4,10 @@ #include "fileutils.hpp" -void ArkFile::GetFileInfo(const char* fileName, - uint32_t numArrayToFindSize, - uint32_t* ptrNumArrays, - uint32_t* ptrNumMemoryBytes) { +void ArkFile::get_file_info(const char* fileName, + uint32_t numArrayToFindSize, + uint32_t* ptrNumArrays, + uint32_t* ptrNumMemoryBytes) { uint32_t numArrays = 0; uint32_t numMemoryBytes = 0; @@ -34,7 +34,7 @@ void ArkFile::GetFileInfo(const char* fileName, } in_file.close(); } else { - throw std::runtime_error(std::string("Failed to open %s for reading in GetFileInfo()!\n") + fileName); + throw std::runtime_error(std::string("Failed to open %s for reading in 
get_file_info()!\n") + fileName); } if (ptrNumArrays != NULL) @@ -43,13 +43,13 @@ void ArkFile::GetFileInfo(const char* fileName, *ptrNumMemoryBytes = numMemoryBytes; } -void ArkFile::LoadFile(const char* fileName, - uint32_t arrayIndex, - std::string& ptrName, - std::vector& memory, - uint32_t* ptrNumRows, - uint32_t* ptrNumColumns, - uint32_t* ptrNumBytesPerElement) { +void ArkFile::load_file(const char* fileName, + uint32_t arrayIndex, + std::string& ptrName, + std::vector& memory, + uint32_t* ptrNumRows, + uint32_t* ptrNumColumns, + uint32_t* ptrNumBytesPerElement) { std::ifstream in_file(fileName, std::ios::binary); if (in_file.good()) { uint32_t i = 0; @@ -72,7 +72,7 @@ void ArkFile::LoadFile(const char* fileName, std::getline(in_file, ptrName, '\0'); // read variable length name followed by space and NUL std::getline(in_file, line, '\4'); // read "BFM" followed by space and control-D if (line.compare("BFM ") != 0) { - throw std::runtime_error(std::string("Cannot find array specifier in file %s in LoadFile()!\n") + + throw std::runtime_error(std::string("Cannot find array specifier in file %s in load_file()!\n") + fileName); } in_file.read(reinterpret_cast(ptrNumRows), sizeof(uint32_t)); // read number of rows @@ -83,18 +83,18 @@ void ArkFile::LoadFile(const char* fileName, } in_file.close(); } else { - throw std::runtime_error(std::string("Failed to open %s for reading in LoadFile()!\n") + fileName); + throw std::runtime_error(std::string("Failed to open %s for reading in load_file()!\n") + fileName); } *ptrNumBytesPerElement = sizeof(float); } -void ArkFile::SaveFile(const char* fileName, - bool shouldAppend, - std::string name, - void* ptrMemory, - uint32_t numRows, - uint32_t numColumns) { +void ArkFile::save_file(const char* fileName, + bool shouldAppend, + std::string name, + void* ptrMemory, + uint32_t numRows, + uint32_t numColumns) { std::ios_base::openmode mode = std::ios::binary; if (shouldAppend) { mode |= std::ios::app; @@ -111,14 +111,14 @@ void 
ArkFile::SaveFile(const char* fileName, out_file.write(reinterpret_cast(ptrMemory), numRows * numColumns * sizeof(float)); out_file.close(); } else { - throw std::runtime_error(std::string("Failed to open %s for writing in SaveFile()!\n") + fileName); + throw std::runtime_error(std::string("Failed to open %s for writing in save_file()!\n") + fileName); } } -void NumpyFile::GetFileInfo(const char* fileName, - uint32_t numArrayToFindSize, - uint32_t* ptrNumArrays, - uint32_t* ptrNumMemoryBytes) { +void NumpyFile::get_file_info(const char* fileName, + uint32_t numArrayToFindSize, + uint32_t* ptrNumArrays, + uint32_t* ptrNumMemoryBytes) { uint32_t numArrays = 0; uint32_t numMemoryBytes = 0; @@ -135,17 +135,17 @@ void NumpyFile::GetFileInfo(const char* fileName, if (ptrNumMemoryBytes != NULL) *ptrNumMemoryBytes = numMemoryBytes; } else { - throw std::runtime_error(std::string("Failed to get info %s GetFileInfo()!\n") + fileName); + throw std::runtime_error(std::string("Failed to get info %s get_file_info()!\n") + fileName); } } -void NumpyFile::LoadFile(const char* fileName, - uint32_t arrayIndex, - std::string& ptrName, - std::vector& memory, - uint32_t* ptrNumRows, - uint32_t* ptrNumColumns, - uint32_t* ptrNumBytesPerElement) { +void NumpyFile::load_file(const char* fileName, + uint32_t arrayIndex, + std::string& ptrName, + std::vector& memory, + uint32_t* ptrNumRows, + uint32_t* ptrNumColumns, + uint32_t* ptrNumBytesPerElement) { cnpy::npz_t my_npz1 = cnpy::npz_load(fileName); auto it = my_npz1.begin(); std::advance(it, arrayIndex); @@ -161,16 +161,16 @@ void NumpyFile::LoadFile(const char* fileName, *ptrNumBytesPerElement = sizeof(float); } else { - throw std::runtime_error(std::string("Failed to open %s for reading in LoadFile()!\n") + fileName); + throw std::runtime_error(std::string("Failed to open %s for reading in load_file()!\n") + fileName); } } -void NumpyFile::SaveFile(const char* fileName, - bool shouldAppend, - std::string name, - void* ptrMemory, - 
uint32_t numRows, - uint32_t numColumns) { +void NumpyFile::save_file(const char* fileName, + bool shouldAppend, + std::string name, + void* ptrMemory, + uint32_t numRows, + uint32_t numColumns) { std::string mode; shouldAppend ? mode = "a" : mode = "w"; std::vector shape{numRows, numColumns}; diff --git a/samples/cpp/speech_sample/fileutils.hpp b/samples/cpp/speech_sample/fileutils.hpp index 9928b7d956d..294ae02aac5 100644 --- a/samples/cpp/speech_sample/fileutils.hpp +++ b/samples/cpp/speech_sample/fileutils.hpp @@ -11,25 +11,25 @@ /// @brief Interface to work with files like input and output class BaseFile { public: - virtual void LoadFile(const char* fileName, - uint32_t arrayIndex, - std::string& ptrName, - std::vector& memory, - uint32_t* ptrNumRows, - uint32_t* ptrNumColumns, - uint32_t* ptrNumBytesPerElement) = 0; + virtual void load_file(const char* fileName, + uint32_t arrayIndex, + std::string& ptrName, + std::vector& memory, + uint32_t* ptrNumRows, + uint32_t* ptrNumColumns, + uint32_t* ptrNumBytesPerElement) = 0; - virtual void SaveFile(const char* fileName, - bool shouldAppend, - std::string name, - void* ptrMemory, - uint32_t numRows, - uint32_t numColumns) = 0; + virtual void save_file(const char* fileName, + bool shouldAppend, + std::string name, + void* ptrMemory, + uint32_t numRows, + uint32_t numColumns) = 0; - virtual void GetFileInfo(const char* fileName, - uint32_t numArrayToFindSize, - uint32_t* ptrNumArrays, - uint32_t* ptrNumMemoryBytes) = 0; + virtual void get_file_info(const char* fileName, + uint32_t numArrayToFindSize, + uint32_t* ptrNumArrays, + uint32_t* ptrNumMemoryBytes) = 0; }; /// @brief Responsible to work with .ark files @@ -43,10 +43,10 @@ public: * @param ptrNumMemoryBytes pointer to specific number of memory bytes * @return none. 
*/ - void GetFileInfo(const char* fileName, - uint32_t numArrayToFindSize, - uint32_t* ptrNumArrays, - uint32_t* ptrNumMemoryBytes) override; + void get_file_info(const char* fileName, + uint32_t numArrayToFindSize, + uint32_t* ptrNumArrays, + uint32_t* ptrNumMemoryBytes) override; /** * @brief Load Kaldi ARK speech feature vector file @@ -59,13 +59,13 @@ public: * @param ptrNumBytesPerElement pointer to number bytes per element (size of float by default) * @return none. */ - void LoadFile(const char* fileName, - uint32_t arrayIndex, - std::string& ptrName, - std::vector& memory, - uint32_t* ptrNumRows, - uint32_t* ptrNumColumns, - uint32_t* ptrNumBytesPerElement) override; + void load_file(const char* fileName, + uint32_t arrayIndex, + std::string& ptrName, + std::vector& memory, + uint32_t* ptrNumRows, + uint32_t* ptrNumColumns, + uint32_t* ptrNumBytesPerElement) override; /** * @brief Save Kaldi ARK speech feature vector file @@ -77,12 +77,12 @@ public: * @param numColumns number of columns * @return none. */ - void SaveFile(const char* fileName, - bool shouldAppend, - std::string name, - void* ptrMemory, - uint32_t numRows, - uint32_t numColumns) override; + void save_file(const char* fileName, + bool shouldAppend, + std::string name, + void* ptrMemory, + uint32_t numRows, + uint32_t numColumns) override; }; /// @brief Responsible to work with .npz files @@ -96,10 +96,10 @@ public: * @param ptrNumMemoryBytes pointer to specific number of memory bytes * @return none. */ - void GetFileInfo(const char* fileName, - uint32_t numArrayToFindSize, - uint32_t* ptrNumArrays, - uint32_t* ptrNumMemoryBytes) override; + void get_file_info(const char* fileName, + uint32_t numArrayToFindSize, + uint32_t* ptrNumArrays, + uint32_t* ptrNumMemoryBytes) override; /** * @brief Load Numpy* uncompressed NPZ speech feature vector file @@ -112,13 +112,13 @@ public: * @param ptrNumBytesPerElement pointer to number bytes per element (size of float by default) * @return none. 
*/ - void LoadFile(const char* fileName, - uint32_t arrayIndex, - std::string& ptrName, - std::vector& memory, - uint32_t* ptrNumRows, - uint32_t* ptrNumColumns, - uint32_t* ptrNumBytesPerElement) override; + void load_file(const char* fileName, + uint32_t arrayIndex, + std::string& ptrName, + std::vector& memory, + uint32_t* ptrNumRows, + uint32_t* ptrNumColumns, + uint32_t* ptrNumBytesPerElement) override; /** * @brief Save Numpy* uncompressed NPZ speech feature vector file @@ -130,10 +130,10 @@ public: * @param numColumns number of columns * @return none. */ - void SaveFile(const char* fileName, - bool shouldAppend, - std::string name, - void* ptrMemory, - uint32_t numRows, - uint32_t numColumns) override; + void save_file(const char* fileName, + bool shouldAppend, + std::string name, + void* ptrMemory, + uint32_t numRows, + uint32_t numColumns) override; }; diff --git a/samples/cpp/speech_sample/main.cpp b/samples/cpp/speech_sample/main.cpp index ce4b69861a9..345c8d36df4 100644 --- a/samples/cpp/speech_sample/main.cpp +++ b/samples/cpp/speech_sample/main.cpp @@ -67,7 +67,7 @@ int main(int argc, char* argv[]) { while (getline(stream, outStr, ',')) { std::string filename(fileNameNoExt(outStr) + "." 
+ extInputFile); inputFiles.push_back(filename); - file->GetFileInfo(filename.c_str(), 0, ¤tNumUtterances, ¤tNumBytesThisUtterance); + file->get_file_info(filename.c_str(), 0, ¤tNumUtterances, ¤tNumBytesThisUtterance); if (numUtterances == 0) { numUtterances = currentNumUtterances; } else if (currentNumUtterances != numUtterances) { @@ -84,7 +84,7 @@ int main(int argc, char* argv[]) { ov::runtime::Core core; slog::info << "Loading model files:" << slog::endl << FLAGS_m << slog::endl; std::shared_ptr model = core.read_model(FLAGS_m); - CheckNumberOfInputs(model->inputs().size(), numInputFiles); + check_number_of_inputs(model->inputs().size(), numInputFiles); const ov::Layout tensor_layout{"NC"}; ov::preprocess::PrePostProcessor proc(model); for (int i = 0; i < model->inputs().size(); i++) { @@ -122,7 +122,7 @@ int main(int argc, char* argv[]) { if (!FLAGS_rg.empty()) { slog::warn << "Custom scale factor will be used for imported gna model: " << FLAGS_rg << slog::endl; } - auto scaleFactorInput = ParseScaleFactors(FLAGS_sf); + auto scaleFactorInput = parse_scale_factors(FLAGS_sf); if (numInputFiles != scaleFactorInput.size()) { std::string errMessage( "Incorrect command line for multiple inputs: " + std::to_string(scaleFactorInput.size()) + @@ -144,17 +144,18 @@ int main(int argc, char* argv[]) { std::string name; std::vector ptrFeatures; uint32_t numArrays(0), numBytes(0), numFrames(0), numFrameElements(0), numBytesPerElement(0); - file->GetFileInfo(inputFileName, 0, &numArrays, &numBytes); + file->get_file_info(inputFileName, 0, &numArrays, &numBytes); ptrFeatures.resize(numBytes); - file->LoadFile(inputFileName, - 0, - name, - ptrFeatures, - &numFrames, - &numFrameElements, - &numBytesPerElement); - auto floatScaleFactor = - ScaleFactorForQuantization(ptrFeatures.data(), MAX_VAL_2B_FEAT, numFrames * numFrameElements); + file->load_file(inputFileName, + 0, + name, + ptrFeatures, + &numFrames, + &numFrameElements, + &numBytesPerElement); + auto floatScaleFactor = 
scale_factor_for_quantization(ptrFeatures.data(), + MAX_VAL_2B_FEAT, + numFrames * numFrameElements); slog::info << "Using scale factor of " << floatScaleFactor << " calculated from first utterance." << slog::endl; std::string scaleFactorConfigKey = @@ -188,7 +189,7 @@ int main(int argc, char* argv[]) { auto t0 = Time::now(); std::vector outputs; if (!FLAGS_oname.empty()) { - std::vector output_names = ConvertStrToVector(FLAGS_oname); + std::vector output_names = convert_str_to_vector(FLAGS_oname); std::vector ports; for (const auto& outBlobName : output_names) { int pos_layer = outBlobName.rfind(":"); @@ -246,9 +247,9 @@ int main(int argc, char* argv[]) { // -------------------------------------------------- std::vector ptrInputBlobs; auto cInputInfo = executableNet.inputs(); - CheckNumberOfInputs(cInputInfo.size(), numInputFiles); + check_number_of_inputs(cInputInfo.size(), numInputFiles); if (!FLAGS_iname.empty()) { - std::vector inputNameBlobs = ConvertStrToVector(FLAGS_iname); + std::vector inputNameBlobs = convert_str_to_vector(FLAGS_iname); if (inputNameBlobs.size() != cInputInfo.size()) { std::string errMessage(std::string("Number of network inputs ( ") + std::to_string(cInputInfo.size()) + " ) is not equal to the number of inputs entered in the -iname argument ( " + @@ -272,14 +273,14 @@ int main(int argc, char* argv[]) { std::vector reference_name_files; size_t count_file = 1; if (!FLAGS_o.empty()) { - output_name_files = ConvertStrToVector(FLAGS_o); + output_name_files = convert_str_to_vector(FLAGS_o); if (output_name_files.size() != outputs.size() && !outputs.empty()) { throw std::logic_error("The number of output files is not equal to the number of network outputs."); } count_file = output_name_files.empty() ? 
1 : output_name_files.size(); } if (!FLAGS_r.empty()) { - reference_name_files = ConvertStrToVector(FLAGS_r); + reference_name_files = convert_str_to_vector(FLAGS_r); if (reference_name_files.size() != outputs.size() && !outputs.empty()) { throw std::logic_error("The number of reference files is not equal to the number of network outputs."); } @@ -291,7 +292,7 @@ int main(int argc, char* argv[]) { std::vector> ptrUtterances; std::vector ptrScores; std::vector ptrReferenceScores; - score_error_t frameError, totalError; + ScoreErrorT frameError, totalError; ptrUtterances.resize(inputFiles.size()); // initialize memory state before starting for (auto&& state : inferRequests.begin()->inferRequest.query_state()) { @@ -316,15 +317,15 @@ int main(int argc, char* argv[]) { std::vector ptrUtterance; auto inputFilename = inputFiles[i].c_str(); uint32_t currentNumFrames(0), currentNumFrameElementsInput(0), currentNumBytesPerElementInput(0); - file->GetFileInfo(inputFilename, utteranceIndex, &n, &numBytesThisUtterance[i]); + file->get_file_info(inputFilename, utteranceIndex, &n, &numBytesThisUtterance[i]); ptrUtterance.resize(numBytesThisUtterance[i]); - file->LoadFile(inputFilename, - utteranceIndex, - uttName, - ptrUtterance, - ¤tNumFrames, - ¤tNumFrameElementsInput, - ¤tNumBytesPerElementInput); + file->load_file(inputFilename, + utteranceIndex, + uttName, + ptrUtterance, + ¤tNumFrames, + ¤tNumFrameElementsInput, + ¤tNumBytesPerElementInput); if (numFrames == 0) { numFrames = currentNumFrames; } else if (numFrames != currentNumFrames) { @@ -356,22 +357,22 @@ int main(int argc, char* argv[]) { throw std::logic_error("Invalid Reference Scores file"); } std::string refUtteranceName; - fileReferenceScores->GetFileInfo(reference_name_files[next_output].c_str(), - utteranceIndex, - &n, - &numBytesReferenceScoreThisUtterance); + fileReferenceScores->get_file_info(reference_name_files[next_output].c_str(), + utteranceIndex, + &n, + &numBytesReferenceScoreThisUtterance); 
ptrReferenceScores.resize(numBytesReferenceScoreThisUtterance); - fileReferenceScores->LoadFile(reference_name_files[next_output].c_str(), - utteranceIndex, - refUtteranceName, - ptrReferenceScores, - &numFramesReference, - &numFrameElementsReference, - &numBytesPerElementReference); + fileReferenceScores->load_file(reference_name_files[next_output].c_str(), + utteranceIndex, + refUtteranceName, + ptrReferenceScores, + &numFramesReference, + &numFrameElementsReference, + &numBytesPerElementReference); } double totalTime = 0.0; std::cout << "Utterance " << utteranceIndex << ": " << std::endl; - ClearScoreError(&totalError); + clear_score_error(&totalError); totalError.threshold = frameError.threshold = MAX_SCORE_DIFFERENCE; auto outputFrame = &ptrScores.front(); std::vector inputFrame; @@ -428,20 +429,20 @@ int main(int argc, char* argv[]) { if (!FLAGS_oname.empty()) outputBlob = inferRequest.inferRequest.get_tensor(executableNet.outputs().back()); - CompareScores( + compare_scores( outputBlob.data(), &ptrReferenceScores[inferRequest.frameIndex * numFrameElementsReference * numBytesPerElementReference], &frameError, inferRequest.numFramesThisBatch, numFrameElementsReference); - UpdateScoreError(&frameError, &totalError); + update_score_error(&frameError, &totalError); } if (FLAGS_pc) { // retrieve new counters - getPerformanceCounters(inferRequest.inferRequest, callPerfMap); + get_performance_counters(inferRequest.inferRequest, callPerfMap); // summarize retrieved counters with all previous - sumPerformanceCounters(callPerfMap, utterancePerfMap, totalNumberOfRunsOnHw); + sum_performance_counters(callPerfMap, utterancePerfMap, totalNumberOfRunsOnHw); } } // ----------------------------------------------------------------------------------------------------- @@ -510,12 +511,12 @@ int main(int argc, char* argv[]) { } /* Save output data to file */ bool shouldAppend = (utteranceIndex == 0) ? 
false : true; - fileOutput->SaveFile(output_name_files[next_output].c_str(), - shouldAppend, - uttName, - &ptrScores.front(), - numFramesFile, - numScoresPerFrame); + fileOutput->save_file(output_name_files[next_output].c_str(), + shouldAppend, + uttName, + &ptrScores.front(), + numFramesFile, + numScoresPerFrame); } /** Show performance results **/ std::cout << "Total time in Infer (HW and SW):\t" << totalTime << " ms" << std::endl; @@ -524,16 +525,16 @@ int main(int argc, char* argv[]) { << std::endl; if (FLAGS_pc) { // print performance results - printPerformanceCounters(utterancePerfMap, - frameIndex, - std::cout, - getFullDeviceName(core, FLAGS_d), - totalNumberOfRunsOnHw, - FLAGS_d); + print_performance_counters(utterancePerfMap, + frameIndex, + std::cout, + getFullDeviceName(core, FLAGS_d), + totalNumberOfRunsOnHw, + FLAGS_d); } if (!FLAGS_r.empty()) { // print statistical score error - printReferenceCompareResults(totalError, numFrames, std::cout); + print_reference_compare_results(totalError, numFrames, std::cout); } std::cout << "End of Utterance " << utteranceIndex << std::endl << std::endl; // ----------------------------------------------------------------------------------------------------- diff --git a/samples/cpp/speech_sample/utils.hpp b/samples/cpp/speech_sample/utils.hpp index 3216ce42531..664aec69f90 100644 --- a/samples/cpp/speech_sample/utils.hpp +++ b/samples/cpp/speech_sample/utils.hpp @@ -17,7 +17,7 @@ typedef std::chrono::duration fsec; /** * @brief struct to store score error */ -typedef struct { +struct ScoreErrorT { uint32_t numScores; uint32_t numErrors; float threshold; @@ -29,7 +29,7 @@ typedef struct { float maxRelError; float sumRelError; float sumSquaredRelError; -} score_error_t; +}; /** * @brief struct to store infer request data per frame @@ -46,7 +46,7 @@ struct InferRequestStruct { * @param numInputFiles number of input files * @return none. 
*/ -void CheckNumberOfInputs(size_t numInputs, size_t numInputFiles) { +void check_number_of_inputs(size_t numInputs, size_t numInputFiles) { if (numInputs != numInputFiles) { throw std::logic_error("Number of network inputs (" + std::to_string(numInputs) + ")" @@ -62,7 +62,7 @@ void CheckNumberOfInputs(size_t numInputs, size_t numInputFiles) { * @param numElements number of elements in speech feature vector * @return scale factor */ -float ScaleFactorForQuantization(void* ptrFloatMemory, float targetMax, uint32_t numElements) { +float scale_factor_for_quantization(void* ptrFloatMemory, float targetMax, uint32_t numElements) { float* ptrFloatFeat = reinterpret_cast(ptrFloatMemory); float max = 0.0; float scaleFactor; @@ -87,7 +87,7 @@ float ScaleFactorForQuantization(void* ptrFloatMemory, float targetMax, uint32_t * @param error pointer to score error struct * @return none. */ -void ClearScoreError(score_error_t* error) { +void clear_score_error(ScoreErrorT* error) { error->numScores = 0; error->numErrors = 0; error->maxError = 0.0; @@ -106,7 +106,7 @@ void ClearScoreError(score_error_t* error) { * @param totalError pointer to total score error struct * @return none. */ -void UpdateScoreError(score_error_t* error, score_error_t* totalError) { +void update_score_error(ScoreErrorT* error, ScoreErrorT* totalError) { totalError->numErrors += error->numErrors; totalError->numScores += error->numScores; totalError->sumRmsError += error->rmsError; @@ -131,14 +131,14 @@ void UpdateScoreError(score_error_t* error, score_error_t* totalError) { * @param numColumns - number columns in score error arrays * @return none. 
*/ -void CompareScores(float* ptrScoreArray, - void* ptrRefScoreArray, - score_error_t* scoreError, - uint32_t numRows, - uint32_t numColumns) { +void compare_scores(float* ptrScoreArray, + void* ptrRefScoreArray, + ScoreErrorT* scoreError, + uint32_t numRows, + uint32_t numColumns) { uint32_t numErrors = 0; - ClearScoreError(scoreError); + clear_score_error(scoreError); float* A = ptrScoreArray; float* B = reinterpret_cast(ptrRefScoreArray); @@ -178,7 +178,7 @@ void CompareScores(float* ptrScoreArray, * @param error pointer to score error struct * @return error */ -float StdDevError(score_error_t error) { +float std_dev_error(ScoreErrorT error) { return (sqrt(error.sumSquaredError / error.numScores - (error.sumError / error.numScores) * (error.sumError / error.numScores))); } @@ -211,7 +211,7 @@ inline void native_cpuid(unsigned int* eax, unsigned int* ebx, unsigned int* ecx * @brief Get GNA module frequency * @return GNA module frequency in MHz */ -float getGnaFrequencyMHz() { +float get_gna_frequency_mHz() { uint32_t eax = 1; uint32_t ebx = 0; uint32_t ecx = 0; @@ -264,11 +264,11 @@ float getGnaFrequencyMHz() { * @param stream output stream * @return none. */ -void printReferenceCompareResults(score_error_t const& totalError, size_t framesNum, std::ostream& stream) { +void print_reference_compare_results(ScoreErrorT const& totalError, size_t framesNum, std::ostream& stream) { stream << " max error: " << totalError.maxError << std::endl; stream << " avg error: " << totalError.sumError / totalError.numScores << std::endl; stream << " avg rms error: " << totalError.sumRmsError / framesNum << std::endl; - stream << " stdev error: " << StdDevError(totalError) << std::endl << std::endl; + stream << " stdev error: " << std_dev_error(totalError) << std::endl << std::endl; stream << std::endl; } @@ -282,12 +282,12 @@ void printReferenceCompareResults(score_error_t const& totalError, size_t frames * @param FLAGS_d flag of device * @return none. 
*/ -void printPerformanceCounters(std::map const& utterancePerfMap, - size_t numberOfFrames, - std::ostream& stream, - std::string fullDeviceName, - const uint64_t numberOfFramesOnHw, - std::string FLAGS_d) { +void print_performance_counters(std::map const& utterancePerfMap, + size_t numberOfFrames, + std::ostream& stream, + std::string fullDeviceName, + const uint64_t numberOfFramesOnHw, + std::string FLAGS_d) { #if !defined(__arm__) && !defined(_M_ARM) && !defined(__aarch64__) && !defined(_M_ARM64) stream << std::endl << "Performance counts:" << std::endl; stream << std::setw(10) << std::right << "" @@ -301,7 +301,7 @@ void printPerformanceCounters(std::map stream << std::endl; // if GNA HW counters // get frequency of GNA module - float freq = getGnaFrequencyMHz(); + float freq = get_gna_frequency_mHz(); for (const auto& it : utterancePerfMap) { std::string const& counter_name = it.first; float current_units_us = static_cast(it.second.real_time.count()) / freq; @@ -331,8 +331,8 @@ void printPerformanceCounters(std::map * @param perfCounters reference to a map to save performance counters * @return none. */ -void getPerformanceCounters(ov::runtime::InferRequest& request, - std::map& perfCounters) { +void get_performance_counters(ov::runtime::InferRequest& request, + std::map& perfCounters) { auto retPerfCounters = request.get_profiling_info(); for (const auto& element : retPerfCounters) { @@ -347,9 +347,9 @@ void getPerformanceCounters(ov::runtime::InferRequest& request, * @param totalRunsOnHw reference to a total number of frames computed on GNA HW * @return none. 
*/ -void sumPerformanceCounters(std::map const& perfCounters, - std::map& totalPerfCounters, - uint64_t& totalRunsOnHw) { +void sum_performance_counters(std::map const& perfCounters, + std::map& totalPerfCounters, + uint64_t& totalRunsOnHw) { auto runOnHw = false; for (const auto& pair : perfCounters) { totalPerfCounters[pair.first].real_time += pair.second.real_time; @@ -364,7 +364,7 @@ void sumPerformanceCounters(std::map co * @param str reference to user-specified input scale factor for quantization, can be separated by comma * @return vector scale factors */ -std::vector ParseScaleFactors(const std::string& str) { +std::vector parse_scale_factors(const std::string& str) { std::vector scaleFactorInput; if (!str.empty()) { @@ -391,7 +391,7 @@ std::vector ParseScaleFactors(const std::string& str) { * @param str file names separated by comma * @return vector of file names */ -std::vector ConvertStrToVector(std::string str) { +std::vector convert_str_to_vector(std::string str) { std::vector blobName; if (!str.empty()) { size_t pos_last = 0; From 4505f5d7e2581f65c0e24f96302b593ff011457f Mon Sep 17 00:00:00 2001 From: Kelvin Choi Date: Wed, 29 Dec 2021 22:25:34 +0900 Subject: [PATCH 25/78] [GPU] Remove remainder handling to read in reduce kernel to improve performance (#9359) Signed-off-by: Kelvin Choi --- .../core/cl_kernels/reduce_gpu_b_fs_yx_fsv16.cl | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) diff --git a/src/plugins/intel_gpu/src/kernel_selector/core/cl_kernels/reduce_gpu_b_fs_yx_fsv16.cl b/src/plugins/intel_gpu/src/kernel_selector/core/cl_kernels/reduce_gpu_b_fs_yx_fsv16.cl index 148e379db66..ee747462333 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/core/cl_kernels/reduce_gpu_b_fs_yx_fsv16.cl +++ b/src/plugins/intel_gpu/src/kernel_selector/core/cl_kernels/reduce_gpu_b_fs_yx_fsv16.cl @@ -341,17 +341,7 @@ uint offset = batch_out * input_batch_pitch + ((feature_out + FSV - 1) / FSV) * for (uint fi = feature_out; fi < feature_max_val; 
fi += FSV) { for (uint yi = y_out; yi < y_max_val; ++yi) { for (uint xi = x_out; xi < x_max_val; ++xi) { - INPUT_VEC input = (INPUT_VEC)(INPUT_INIT_VAL); - #if REDUCE_FEATURE && (INPUT0_FEATURE_NUM % FSV != 0) - if (fi + FSV <= INPUT0_FEATURE_NUM) - input = BLOCK_READ(data, offset); - else - if (fi + get_sub_group_local_id() < INPUT0_FEATURE_NUM) - for (int i = 0; i < READ_OFFSET; ++i) - input[i] = data[offset + get_sub_group_local_id() + i * get_max_sub_group_size()]; - #else - input = BLOCK_READ(data, offset); - #endif + INPUT_VEC input = BLOCK_READ(data, offset); unroll_for (int i = 0; i < READ_OFFSET; ++i) acc[i] = FUNC_CALL(apply_reduce)(acc[i], input[i]); offset += input_x_pitch; From adce58d126adb59e1d3ccea6217a80cbd90e5d57 Mon Sep 17 00:00:00 2001 From: Anton Chetverikov Date: Wed, 29 Dec 2021 16:26:40 +0300 Subject: [PATCH 26/78] [MO] Add check for Python version to NetworkX dependency (#9226) * Add check for python version to networkx dependency * Update condition, task CVS-72806 * Update check to not break old Python versions support * Update POT NetworkX dependency * Allign requirements list format with setuptools documentation --- tools/mo/requirements.txt | 3 ++- tools/mo/requirements_caffe.txt | 3 ++- tools/mo/requirements_kaldi.txt | 3 ++- tools/mo/requirements_mxnet.txt | 3 ++- tools/mo/requirements_onnx.txt | 3 ++- tools/mo/requirements_tf.txt | 3 ++- tools/mo/requirements_tf2.txt | 3 ++- tools/pot/setup.py | 17 +++++++++-------- 8 files changed, 23 insertions(+), 15 deletions(-) diff --git a/tools/mo/requirements.txt b/tools/mo/requirements.txt index 5386262aded..fef835edef2 100644 --- a/tools/mo/requirements.txt +++ b/tools/mo/requirements.txt @@ -1,7 +1,8 @@ tensorflow~=2.5.0 mxnet~=1.2.0; sys_platform == 'win32' mxnet~=1.7.0.post2; sys_platform != 'win32' -networkx~=2.5 +networkx~=2.5; python_version <= "3.6" +networkx~=2.6; python_version > "3.6" numpy>=1.16.6,<1.20 protobuf>=3.15.6 onnx>=1.8.1 diff --git a/tools/mo/requirements_caffe.txt 
b/tools/mo/requirements_caffe.txt index d1eef645f44..77a8eb5bdaf 100644 --- a/tools/mo/requirements_caffe.txt +++ b/tools/mo/requirements_caffe.txt @@ -1,4 +1,5 @@ -networkx~=2.5 +networkx~=2.5; python_version <= "3.6" +networkx~=2.6; python_version > "3.6" numpy>=1.16.6,<1.20 protobuf>=3.15.6 defusedxml>=0.7.1 diff --git a/tools/mo/requirements_kaldi.txt b/tools/mo/requirements_kaldi.txt index 1068d95240c..8685cccb710 100644 --- a/tools/mo/requirements_kaldi.txt +++ b/tools/mo/requirements_kaldi.txt @@ -1,4 +1,5 @@ -networkx~=2.5 +networkx~=2.5; python_version <= "3.6" +networkx~=2.6; python_version > "3.6" numpy>=1.16.6,<1.20 defusedxml>=0.7.1 requests>=2.25.1 diff --git a/tools/mo/requirements_mxnet.txt b/tools/mo/requirements_mxnet.txt index 61897faa500..9d3e2608802 100644 --- a/tools/mo/requirements_mxnet.txt +++ b/tools/mo/requirements_mxnet.txt @@ -1,6 +1,7 @@ mxnet~=1.2.0; sys_platform == 'win32' mxnet~=1.7.0.post2; sys_platform != 'win32' -networkx~=2.5 +networkx~=2.5; python_version <= "3.6" +networkx~=2.6; python_version > "3.6" numpy>=1.16.6,<1.20 defusedxml>=0.7.1 urllib3>=1.26.4 diff --git a/tools/mo/requirements_onnx.txt b/tools/mo/requirements_onnx.txt index a6415939ccd..6040f072c06 100644 --- a/tools/mo/requirements_onnx.txt +++ b/tools/mo/requirements_onnx.txt @@ -1,5 +1,6 @@ onnx>=1.8.1 -networkx~=2.5 +networkx~=2.5; python_version <= "3.6" +networkx~=2.6; python_version > "3.6" numpy>=1.16.6,<1.20 defusedxml>=0.7.1 requests>=2.25.1 diff --git a/tools/mo/requirements_tf.txt b/tools/mo/requirements_tf.txt index d7514869e39..890d6f48e71 100644 --- a/tools/mo/requirements_tf.txt +++ b/tools/mo/requirements_tf.txt @@ -1,5 +1,6 @@ tensorflow~=2.5.0 -networkx~=2.5 +networkx~=2.5; python_version <= "3.6" +networkx~=2.6; python_version > "3.6" numpy>=1.16.6,<1.20 defusedxml>=0.7.1 requests>=2.25.1 diff --git a/tools/mo/requirements_tf2.txt b/tools/mo/requirements_tf2.txt index d7514869e39..890d6f48e71 100644 --- a/tools/mo/requirements_tf2.txt +++ 
b/tools/mo/requirements_tf2.txt @@ -1,5 +1,6 @@ tensorflow~=2.5.0 -networkx~=2.5 +networkx~=2.5; python_version <= "3.6" +networkx~=2.6; python_version > "3.6" numpy>=1.16.6,<1.20 defusedxml>=0.7.1 requests>=2.25.1 diff --git a/tools/pot/setup.py b/tools/pot/setup.py index b993033d8c3..290ec4e8d59 100644 --- a/tools/pot/setup.py +++ b/tools/pot/setup.py @@ -45,14 +45,15 @@ if '--install-dev-extras' in sys.argv: sys.argv.remove('--install-dev-extras') INSTALL_REQUIRES = [ - 'scipy~=1.5.4', - 'jstyleson~=0.0.2', - 'numpy>=1.16.6,<1.20', - 'addict>=2.4.0', - 'networkx~=2.5', - 'tqdm>=4.54.1', - 'texttable~=1.6.3', - 'pandas~=1.1.5', + "scipy~=1.5.4", + "jstyleson~=0.0.2", + "numpy>=1.16.6,<1.20", + "addict>=2.4.0", + "networkx~=2.5;python_version<='3.6'", + "networkx~=2.6;python_version>'3.6'", + "tqdm>=4.54.1", + "texttable~=1.6.3", + "pandas~=1.1.5", ] ALGO_EXTRAS = [ From 9802a4cb5dbe81dbfb3e4d302b5a249a9df20c6d Mon Sep 17 00:00:00 2001 From: Ekaterina Aidova Date: Wed, 29 Dec 2021 16:48:59 +0300 Subject: [PATCH 27/78] [OMZ]: update submodule (#9472) --- thirdparty/open_model_zoo | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/open_model_zoo b/thirdparty/open_model_zoo index e813a1f0ee0..452ba6a62cb 160000 --- a/thirdparty/open_model_zoo +++ b/thirdparty/open_model_zoo @@ -1 +1 @@ -Subproject commit e813a1f0ee071c48b8bb3f56ec85331be9b35c07 +Subproject commit 452ba6a62cb7ea9457cebf9819793b4201a14365 From 8c145a25c9eeb970482ffd184afa436a0ad4f5fb Mon Sep 17 00:00:00 2001 From: Pavel Esir Date: Wed, 29 Dec 2021 17:31:05 +0300 Subject: [PATCH 28/78] [MO] mo_version getting fix (#9135) * mo_version getting fix * hardcoded major minor version in MO * Update check_ie_bindings.py hard error in version comparison fixed --- tools/mo/openvino/tools/mo/utils/version.py | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/tools/mo/openvino/tools/mo/utils/version.py b/tools/mo/openvino/tools/mo/utils/version.py index 
c449a2d9368..382fde0ef12 100644 --- a/tools/mo/openvino/tools/mo/utils/version.py +++ b/tools/mo/openvino/tools/mo/utils/version.py @@ -3,12 +3,16 @@ import os import re -import sys import subprocess +import sys + +from openvino.tools.mo.utils.utils import get_mo_root_dir + +mo_major_and_minor_version = 2022.1 def get_version_file_path(): - return os.path.join(os.path.dirname(os.path.realpath(__file__)), os.pardir, os.pardir, "version.txt") + return os.path.join(os.path.dirname(os.path.realpath(__file__)), os.pardir, "version.txt") def generate_mo_version(): @@ -17,9 +21,10 @@ def generate_mo_version(): custom_{branch_name}_{commit_hash} """ try: - branch_name = subprocess.check_output(["git", "rev-parse", "--abbrev-ref", "HEAD"]).strip().decode() - commit_hash = subprocess.check_output(["git", "rev-parse", "HEAD"]).strip().decode() - return "custom_{}_{}".format(branch_name, commit_hash) + mo_dir = get_mo_root_dir() + branch_name = subprocess.check_output(["git", "rev-parse", "--abbrev-ref", "HEAD"], cwd=mo_dir).strip().decode() + commit_hash = subprocess.check_output(["git", "rev-parse", "HEAD"], cwd=mo_dir).strip().decode() + return "{}.custom_{}_{}".format(mo_major_and_minor_version, branch_name, commit_hash) except Exception as e: return "unknown version" @@ -29,8 +34,7 @@ def get_version(): if not os.path.isfile(version_txt): return generate_mo_version() with open(version_txt) as f: - version = f.readline().replace('\n', '') - return version + return f.readline().replace('\n', '') def extract_release_version(version: str): From f1a25b398c08d91040d33adff9a244e7568e0b12 Mon Sep 17 00:00:00 2001 From: Sergey Lyubimtsev Date: Wed, 29 Dec 2021 17:57:56 +0300 Subject: [PATCH 29/78] Clean up setupvars scripts (#9410) * Clean up setupvars scripts * align opencv * fix opencv location * fix env windows * fix env windows * check upper bound for supported python version * remove ROOT var * revert PYTHONPATH * fix error message * fix pyver option * revert OpenCV * okay -> 
true --- .ci/azure/linux.yml | 2 +- .ci/azure/mac.yml | 2 +- scripts/setupvars/setupvars.bat | 44 ++++++++++++++------------------- scripts/setupvars/setupvars.sh | 39 +++++++++++++++-------------- 4 files changed, 41 insertions(+), 46 deletions(-) diff --git a/.ci/azure/linux.yml b/.ci/azure/linux.yml index 33e19d63cf6..e8831030ed8 100644 --- a/.ci/azure/linux.yml +++ b/.ci/azure/linux.yml @@ -208,7 +208,7 @@ jobs: - script: | set -e - mkdir $(INSTALL_DIR)/opencv/ + mkdir -p $(INSTALL_DIR)/opencv/ cmake -DCMAKE_INSTALL_PREFIX=$(INSTALL_DIR) -DCOMPONENT=tests -P cmake_install.cmake cp -R $(REPO_DIR)/temp/opencv_4.5.2_ubuntu20/opencv/* $(INSTALL_DIR)/opencv/ workingDirectory: $(BUILD_DIR) diff --git a/.ci/azure/mac.yml b/.ci/azure/mac.yml index 804ce171362..366792f283e 100644 --- a/.ci/azure/mac.yml +++ b/.ci/azure/mac.yml @@ -119,7 +119,7 @@ jobs: - script: | set -e - mkdir $(INSTALL_DIR)/opencv/ + mkdir -p $(INSTALL_DIR)/opencv/ cmake -DCMAKE_INSTALL_PREFIX=$(INSTALL_DIR) -DCOMPONENT=tests -P cmake_install.cmake cp -R $(REPO_DIR)/temp/opencv_4.5.2_osx/opencv/* $(INSTALL_DIR)/opencv/ workingDirectory: $(BUILD_DIR) diff --git a/scripts/setupvars/setupvars.bat b/scripts/setupvars/setupvars.bat index c5235b9158a..34a859b39ef 100644 --- a/scripts/setupvars/setupvars.bat +++ b/scripts/setupvars/setupvars.bat @@ -3,11 +3,9 @@ :: Copyright (C) 2018-2021 Intel Corporation :: SPDX-License-Identifier: Apache-2.0 -set ROOT=%~dp0 set SCRIPT_NAME=%~nx0 -set "INTEL_OPENVINO_DIR=%ROOT%" -set "INTEL_CVSDK_DIR=%INTEL_OPENVINO_DIR%" +set "INTEL_OPENVINO_DIR=%~dp0" set "python_version=" @@ -23,21 +21,16 @@ if not "%1"=="" ( ) :: OpenCV -if exist "%INTEL_OPENVINO_DIR%\extras\opencv\setupvars.bat" ( -call "%INTEL_OPENVINO_DIR%\extras\opencv\setupvars.bat" -goto :opencv_done -) if exist "%INTEL_OPENVINO_DIR%\opencv\setupvars.bat" ( call "%INTEL_OPENVINO_DIR%\opencv\setupvars.bat" goto :opencv_done ) -:opencv_done -:: Model Optimizer -if exist %INTEL_OPENVINO_DIR%\tools\mo ( -set 
PYTHONPATH=%INTEL_OPENVINO_DIR%\tools\mo;%PYTHONPATH% -set "PATH=%INTEL_OPENVINO_DIR%\tools\mo;%PATH%" +if exist "%INTEL_OPENVINO_DIR%\extras\opencv\setupvars.bat" ( +call "%INTEL_OPENVINO_DIR%\extras\opencv\setupvars.bat" +goto :opencv_done ) +:opencv_done :: OpenVINO runtime set "InferenceEngine_DIR=%INTEL_OPENVINO_DIR%\runtime\cmake" @@ -61,14 +54,17 @@ set "PATH=%INTEL_OPENVINO_DIR%\tools\compile_tool;%PATH%" set "PATH=%OPENVINO_LIB_PATHS%;%PATH%" :: Check if Python is installed +set PYTHON_VERSION_MAJOR=3 +set MIN_REQUIRED_PYTHON_VERSION_MINOR=6 +set MAX_SUPPORTED_PYTHON_VERSION_MINOR=9 + python --version 2>NUL if errorlevel 1 ( - echo Error^: Python is not installed. Please install one of Python 3.6 - 3.8 ^(64-bit^) from https://www.python.org/downloads/ + echo Error^: Python is not installed. Please install one of Python %PYTHON_VERSION_MAJOR%.%MIN_REQUIRED_PYTHON_VERSION_MINOR% - %PYTHON_VERSION_MAJOR%.%MAX_SUPPORTED_PYTHON_VERSION_MINOR% ^(64-bit^) from https://www.python.org/downloads/ exit /B 1 ) :: Check Python version if user did not pass -pyver - if "%python_version%" == "" ( for /F "tokens=* USEBACKQ" %%F IN (`python -c "import sys; print(str(sys.version_info[0])+'.'+str(sys.version_info[1]))" 2^>^&1`) DO ( set python_version=%%F @@ -80,14 +76,16 @@ for /F "tokens=1,2 delims=. " %%a in ("%python_version%") do ( set pyversion_minor=%%b ) -if "%pyversion_major%" geq "3" ( - if "%pyversion_minor%" geq "6" ( - set check_pyversion=okay - ) +if "%pyversion_major%" equ "%PYTHON_VERSION_MAJOR%" ( + if "%pyversion_minor%" geq "%MIN_REQUIRED_PYTHON_VERSION_MINOR%" ( + if "%pyversion_minor%" leq "%MAX_SUPPORTED_PYTHON_VERSION_MINOR%" ( + set "check_pyversion=true" + ) + ) ) -if not "%check_pyversion%"=="okay" ( - echo Unsupported Python version. Please install one of Python 3.6 - 3.8 ^(64-bit^) from https://www.python.org/downloads/ +if not "%check_pyversion%"=="true" ( + echo Unsupported Python version. 
Please install one of Python %PYTHON_VERSION_MAJOR%.%MIN_REQUIRED_PYTHON_VERSION_MINOR% - %PYTHON_VERSION_MAJOR%.%MAX_SUPPORTED_PYTHON_VERSION_MINOR% ^(64-bit^) from https://www.python.org/downloads/ exit /B 1 ) @@ -103,16 +101,12 @@ for /F "tokens=* USEBACKQ" %%F IN (`python -c "import sys; print(64 if sys.maxsi ) if not "%bitness%"=="64" ( - echo Unsupported Python bitness. Please install one of Python 3.6 - 3.8 ^(64-bit^) from https://www.python.org/downloads/ + echo Unsupported Python bitness. Please install one of Python %PYTHON_VERSION_MAJOR%.%MIN_REQUIRED_PYTHON_VERSION_MINOR% - %PYTHON_VERSION_MAJOR%.%MAX_SUPPORTED_PYTHON_VERSION_MINOR%^(64-bit^) from https://www.python.org/downloads/ exit /B 1 ) set PYTHONPATH=%INTEL_OPENVINO_DIR%\python\python%pyversion_major%.%pyversion_minor%;%INTEL_OPENVINO_DIR%\python\python3;%PYTHONPATH% -if exist %INTEL_OPENVINO_DIR%\tools\post_training_optimization_toolkit ( - set PYTHONPATH=%INTEL_OPENVINO_DIR%\tools\post_training_optimization_toolkit;%PYTHONPATH% -) - echo [setupvars.bat] OpenVINO environment initialized exit /B 0 diff --git a/scripts/setupvars/setupvars.sh b/scripts/setupvars/setupvars.sh index 99b651f545d..99e787c90a5 100755 --- a/scripts/setupvars/setupvars.sh +++ b/scripts/setupvars/setupvars.sh @@ -59,29 +59,25 @@ if [ -e "$INSTALLDIR/tools/compile_tool" ]; then export LD_LIBRARY_PATH=$INSTALLDIR/tools/compile_tool${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH} fi - # OpenCV environment -for _loc in "extras/opencv" "opencv" ; do - _fname="$INSTALLDIR/${_loc}/setupvars.sh" - [ -f "${_fname}" ] && source "${_fname}" && break -done - - -if [ -f "$INTEL_OPENVINO_DIR/extras/dl_streamer/setupvars.sh" ]; then - source "$INTEL_OPENVINO_DIR/extras/dl_streamer/setupvars.sh" +if [ -f "$INSTALLDIR/opencv/setupvars.sh" ]; then + source "$INSTALLDIR/opencv/setupvars.sh" fi -export PATH="$INTEL_OPENVINO_DIR/tools/mo${PATH:+:$PATH}" -export PYTHONPATH="$INTEL_OPENVINO_DIR/tools/mo${PYTHONPATH:+:$PYTHONPATH}" - -if [ -e 
"$INTEL_OPENVINO_DIR/tools/post_training_optimization_toolkit" ]; then - export PYTHONPATH="$INTEL_OPENVINO_DIR/tools/post_training_optimization_toolkit:$PYTHONPATH" +if [ -f "$INSTALLDIR/extras/opencv/setupvars.sh" ]; then + source "$INSTALLDIR/extras/opencv/setupvars.sh" fi if [ -z "$python_version" ]; then python_version=$(python3 -c 'import sys; print(str(sys.version_info[0])+"."+str(sys.version_info[1]))') fi +version_arr=(${python_version//./ }) +if [ "${#version_arr[@]}" -ge "2" ]; then + python_version_major=${version_arr[0]} + python_version_minor=${version_arr[1]} +fi + OS_NAME="" if command -v lsb_release >/dev/null 2>&1; then OS_NAME=$(lsb_release -i -s) @@ -92,14 +88,19 @@ if [ "$python_bitness" != "" ] && [ "$python_bitness" != "64" ] && [ "$OS_NAME" echo "[setupvars.sh] 64 bitness for Python $python_version is required" fi -MINIMUM_REQUIRED_PYTHON_VERSION="3.6" -MAX_SUPPORTED_PYTHON_VERSION=$([[ "$OSTYPE" == "darwin"* ]] && echo '3.7' || echo '3.8') -if [[ -n "$python_version" && "$(printf '%s\n' "$python_version" "$MINIMUM_REQUIRED_PYTHON_VERSION" | sort -V | head -n 1)" != "$MINIMUM_REQUIRED_PYTHON_VERSION" ]]; then - echo "[setupvars.sh] ERROR: Unsupported Python version. Please install one of Python 3.6-${MAX_SUPPORTED_PYTHON_VERSION} (64-bit) from https://www.python.org/downloads/" +PYTHON_VERSION_MAJOR="3" +MIN_REQUIRED_PYTHON_VERSION_MINOR="6" +MAX_SUPPORTED_PYTHON_VERSION_MINOR="9" + +if [ "$PYTHON_VERSION_MAJOR" != "$python_version_major" ] || + [ "$python_version_minor" -lt "$MIN_REQUIRED_PYTHON_VERSION_MINOR" ] || + [ "$python_version_minor" -gt "$MAX_SUPPORTED_PYTHON_VERSION_MINOR" ] ; then + echo "[setupvars.sh] ERROR: Unsupported Python version. 
Please install one of Python" \ + "${PYTHON_VERSION_MAJOR}.${MIN_REQUIRED_PYTHON_VERSION_MINOR} -" \ + "${PYTHON_VERSION_MAJOR}.${MAX_SUPPORTED_PYTHON_VERSION_MINOR} (64-bit) from https://www.python.org/downloads/" return 1 fi - if [ -n "$python_version" ]; then if [[ -d $INTEL_OPENVINO_DIR/python ]]; then # add path to OpenCV API for Python 3.x From b144089ef7da6383ff61835df11ba34c7b04c69b Mon Sep 17 00:00:00 2001 From: Maxim Gordeev Date: Wed, 29 Dec 2021 23:50:19 +0300 Subject: [PATCH 30/78] [IE_Samples] Updating information about methods in README.md according new API 2.0 (#9477) --- .../cpp/classification_sample_async/README.md | 10 +++++----- samples/cpp/hello_classification/README.md | 10 +++++----- .../hello_nv12_input_classification/README.md | 8 ++++---- samples/cpp/hello_query_device/README.md | 2 +- samples/cpp/hello_reshape_ssd/README.md | 6 +++--- .../ngraph_function_creation_sample/README.md | 8 ++++---- samples/cpp/speech_sample/README.md | 16 ++++++++-------- 7 files changed, 30 insertions(+), 30 deletions(-) diff --git a/samples/cpp/classification_sample_async/README.md b/samples/cpp/classification_sample_async/README.md index d6b92f4a36a..cc3625773ea 100644 --- a/samples/cpp/classification_sample_async/README.md +++ b/samples/cpp/classification_sample_async/README.md @@ -8,11 +8,11 @@ Image Classification Async C++ sample application demonstrates how to use the fo | Feature | API | Description | |:--- |:--- |:--- -|Inference Engine Version| `InferenceEngine::GetInferenceEngineVersion` | Get Inference Engine API version -|Available Devices|`InferenceEngine::Core::GetAvailableDevices`| Get version information of the devices for inference -| Asynchronous Infer | `InferenceEngine::InferRequest::StartAsync`, `InferenceEngine::InferRequest::SetCompletionCallback` | Do asynchronous inference with callback -|Custom Extension Kernels|`InferenceEngine::Core::AddExtension`, `InferenceEngine::Core::SetConfig`| Load extension library and config to the device 
-| Network Operations | `InferenceEngine::CNNNetwork::setBatchSize`, `InferenceEngine::CNNNetwork::getBatchSize`, `InferenceEngine::CNNNetwork::getFunction` | Managing of network, operate with its batch size. Setting batch size using input image count. +|OpenVINO Runtime Version| `ov::get_openvino_version` | Get Openvino API version +|Available Devices| `ov::runtime::Core::get_available_devices`| Get version information of the devices for inference +| Asynchronous Infer | `ov::runtime::InferRequest::start_async`, `ov::runtime::InferRequest::set_callback` | Do asynchronous inference with callback +|Custom Extension Kernels| `ov::runtime::Core::add_extension`, `ov::runtime::Core::set_config`| Load extension library and config to the device +| Model Operations | `ov::set_batch`, `ov::get_batch` | Managing of model, operate with its batch size. Setting batch size using input image count. Basic Inference Engine API is covered by [Hello Classification C++ sample](../hello_classification/README.md). 
diff --git a/samples/cpp/hello_classification/README.md b/samples/cpp/hello_classification/README.md index 14fe34e0e40..7024b8f28e9 100644 --- a/samples/cpp/hello_classification/README.md +++ b/samples/cpp/hello_classification/README.md @@ -6,11 +6,11 @@ Hello Classification C++ sample application demonstrates how to use the followin | Feature | API | Description | |:--- |:--- |:--- -| Basic Infer Flow | `InferenceEngine::Core::ReadNetwork`, `InferenceEngine::Core::LoadNetwork`, `InferenceEngine::ExecutableNetwork::CreateInferRequest`, `InferenceEngine::InferRequest::SetBlob`, `InferenceEngine::InferRequest::GetBlob` | Common API to do inference: configure input and output blobs, loading model, create infer request -| Synchronous Infer | `InferenceEngine::InferRequest::Infer` | Do synchronous inference -| Network Operations | `ICNNNetwork::getInputsInfo`, `InferenceEngine::CNNNetwork::getOutputsInfo`, `InferenceEngine::InputInfo::setPrecision` | Managing of network -| Blob Operations| `InferenceEngine::Blob::getTensorDesc`, `InferenceEngine::TensorDesc::getDims`, , `InferenceEngine::TensorDesc::getPrecision`, `InferenceEngine::as`, `InferenceEngine::MemoryBlob::wmap`, `InferenceEngine::MemoryBlob::rmap`, `InferenceEngine::Blob::size` | Work with memory container for storing inputs, outputs of the network, weights and biases of the layers -| Input auto-resize | `InferenceEngine::PreProcessInfo::setResizeAlgorithm`, `InferenceEngine::InputInfo::setLayout` | Set image of the original size as input for a network with other input size. 
Resize and layout conversions will be performed automatically by the corresponding plugin just before inference +| Basic Infer Flow | `ov::runtime::Core::read_model`, `ov::runtime::Core::compile_model`, `ov::runtime::CompiledModel::create_infer_request`, `ov::runtime::InferRequest::get_input_tensor`, `ov::runtime::InferRequest::set_input_tensor`, `ov::runtime::InferRequest::get_output_tensor` | Common API to do inference: configure input and output tensors, reading model, create infer request +| Synchronous Infer | `ov::runtime::InferRequest::infer` | Do synchronous inference +| Model Operations | `ov::Model::inputs`, `ov::Model::outputs` | Managing of model +| Tensor Operations| `ov::runtime::Tensor::get_element_type`, `ov::runtime::Tensor::get_shape`, `ov::runtime::Tensor::data` | Work with storing inputs, outputs of the model, weights and biases of the layers +| Input auto-resize | `ov::preprocess::PreProcessSteps::resize`, `ov::preprocess::InputInfo::model::set_layout` | Set image of the original size as input for a model with other input size. 
Resize and layout conversions will be performed automatically by the corresponding plugin just before inference | Options | Values | |:--- |:--- diff --git a/samples/cpp/hello_nv12_input_classification/README.md b/samples/cpp/hello_nv12_input_classification/README.md index 9f632d0131a..d4b9a07442d 100644 --- a/samples/cpp/hello_nv12_input_classification/README.md +++ b/samples/cpp/hello_nv12_input_classification/README.md @@ -6,10 +6,10 @@ Hello NV12 Input Classification C++ Sample demonstrates how to use the NV12 auto | Feature | API | Description | |:--- |:--- |:--- -|Inference Engine Core Operations| `InferenceEngine::Core::GetMetric` | Gets general runtime metric for dedicated hardware -| Blob Operations | `InferenceEngine::NV12Blob` | Create NV12Blob to hold the NV12 input data -| Input in N12 color format | `InferenceEngine::PreProcessInfo::setColorFormat` | Change the color format of the input data -| Model Input Reshape | `InferenceEngine::CNNNetwork::getInputShapes`, `InferenceEngine::CNNNetwork::reshape`, `InferenceEngine::CNNNetwork::getBatchSize` | Set the batch size equal to the number of input images +|OpenVINO Runtime Core Operations| `ov::runtime::Core::get_metric` | Gets general runtime metric for dedicated hardware +| Tensor Operations | `ov::runtime::Tensor::get_element_type`, `ov::runtime::Tensor::get_shape`, `ov::runtime::Tensor::data` | Work with storing inputs, outputs of the model, weights and biases of the layers +| Input in N12 color format | `ov::preprocess::InputTensorInfo::set_color_format` | Change the color format of the input data +| Model Input Reshape | `ov::Model::get_output_shape`, `ov::Model::reshape`, `ov::get_batch` | Set the batch size equal to the number of input images Basic Inference Engine API is covered by [Hello Classification C++ sample](../hello_classification/README.md). 
diff --git a/samples/cpp/hello_query_device/README.md b/samples/cpp/hello_query_device/README.md index 1bddde6feb6..4f41f5ae2af 100644 --- a/samples/cpp/hello_query_device/README.md +++ b/samples/cpp/hello_query_device/README.md @@ -6,7 +6,7 @@ Hello Query Device C++ sample application demonstrates how to use the following | Feature | API | Description | |:--- |:--- |:--- -|Available Devices|`InferenceEngine::Core::GetAvailableDevices`, `InferenceEngine::Core::GetMetric`, `InferenceEngine::Core::GetConfig`| Get available devices information and configuration for inference +|Available Devices|`ov::runtime::Core::get_available_devices`, `ov::runtime::Core::get_metric`, `ov::runtime::Core::get_config`| Get available devices information and configuration for inference Basic Inference Engine API is covered by [Hello Classification C++ sample](../hello_classification/README.md). diff --git a/samples/cpp/hello_reshape_ssd/README.md b/samples/cpp/hello_reshape_ssd/README.md index d4035999ad2..7a4634c6f29 100644 --- a/samples/cpp/hello_reshape_ssd/README.md +++ b/samples/cpp/hello_reshape_ssd/README.md @@ -6,9 +6,9 @@ Hello Reshape SSD C++ sample application demonstrates how to use the following I | Feature | API | Description | |:--- |:--- |:--- -|Network Operations| `ov::runtime::Core::read_model`, `ov::runtime::Core::compile_model` | Managing of network. 
-|Input Reshape|`ov::Function::reshape`| Resize network to match image sizes and given batch -|nGraph Functions|`ov::Function::get_ops`, `ov::Node::get_type_info`| Go thru network nGraph +| Model Operations | `ov::runtime::Core::read_model`, `ov::runtime::Core::compile_model` | Managing of model +| Model Input Reshape | `ov::Model::reshape`| Resize model to match image sizes and given batch +| Tensor Operations | `ov::runtime::Tensor::get_element_type`, `ov::runtime::Tensor::get_shape`, `ov::runtime::Tensor::data` | Work with storing inputs, outputs of the model, weights and biases of the layers Basic Inference Engine API is covered by [Hello Classification C++ sample](../hello_classification/README.md). diff --git a/samples/cpp/ngraph_function_creation_sample/README.md b/samples/cpp/ngraph_function_creation_sample/README.md index c5e8ef64851..8d94bf29060 100644 --- a/samples/cpp/ngraph_function_creation_sample/README.md +++ b/samples/cpp/ngraph_function_creation_sample/README.md @@ -10,10 +10,10 @@ nGraph Function Creation C++ Sample demonstrates the following Inference Engine | Feature | API | Description | |:--- |:--- |:--- -|Inference Engine Version| `InferenceEngine::GetInferenceEngineVersion` | Get Inference Engine API version -|Available Devices|`InferenceEngine::Core::GetAvailableDevices`| Get version information of the devices for inference -| Network Operations | `InferenceEngine::CNNNetwork::setBatchSize`, `InferenceEngine::CNNNetwork::getBatchSize` | Managing of network, operate with its batch size. Setting batch size using input image count. 
-|nGraph Functions| `ngraph::Function`, `ngraph::op`, `ngraph::Node`, `ngraph::Shape::Shape`, `ngraph::Strides::Strides`, `ngraph::CoordinateDiff::CoordinateDiff`, `ngraph::Node::set_friendly_name`, `ngraph::shape_size`, `ngraph::ParameterVector::vector` | Illustrates how to construct an nGraph function +|OpenVINO Runtime Version| `ov::get_openvino_versio` | Get Openvino API version +|Available Devices|`ov::runtime::Core::get_available_devices`| Get version information of the devices for inference +| Model Operations | `ov::set_batch`, `ov::get_batch` | Managing of model, operate with its batch size. Setting batch size using input image count. +|nGraph Functions| `ov::op`, `ov::Node`, `ov::Shape::Shape`, `ov::Strides::Strides`, `ov::CoordinateDiff::CoordinateDiff`, `ov::Node::set_friendly_name`, `ov::shape_size`, `ov::ParameterVector::vector` | Illustrates how to construct an nGraph function Basic Inference Engine API is covered by [Hello Classification C++ sample](../hello_classification/README.md). diff --git a/samples/cpp/speech_sample/README.md b/samples/cpp/speech_sample/README.md index 560fe8b1d92..4ab9b638335 100644 --- a/samples/cpp/speech_sample/README.md +++ b/samples/cpp/speech_sample/README.md @@ -8,14 +8,14 @@ Automatic Speech Recognition C++ sample application demonstrates how to use the | Feature | API | Description | |:--- |:--- |:--- -|Inference Engine Version| `InferenceEngine::GetInferenceEngineVersion` | Get Inference Engine API version -|Available Devices|`InferenceEngine::Core::GetAvailableDevices`| Get version information of the devices for inference -| Network Operations | `InferenceEngine::CNNNetwork::setBatchSize`, `InferenceEngine::CNNNetwork::getBatchSize` | Managing of network, operate with its batch size. 
-|Network Operations|`InferenceEngine::CNNNetwork::addOutput`| Change names of output layers in the network -|Import Network|`InferenceEngine::ExecutableNetwork::Export`,`InferenceEngine::Core::ImportNetwork`| Creates an executable network from a previously exported network -|Asynchronous Infer| `InferenceEngine::InferRequest::StartAsync`, `InferenceEngine::InferRequest::Wait`| Do asynchronous inference and waits until inference result becomes available -|InferRequest Operations|`InferenceEngine::InferRequest::QueryState`, `InferenceEngine::VariableState::Reset`| Gets and resets state control interface for given executable network -|InferRequest Operations|`InferenceEngine::InferRequest::GetPerformanceCounts`| Get performance counters for infer request +|OpenVINO Runtime Version| `ov::get_openvino_versio` | Get Openvino API version +|Available Devices|`ov::runtime::Core::get_available_devices`| Get version information of the devices for inference +| Model Operations | `ov::set_batch`, `ov::get_batch` | Managing of model, operate with its batch size. Setting batch size using input image count. +|Model Operations|`ov::Model::add_output`| Change names of output layers in the model +|Import Model|`ov::runtime::CompiledModel::export_model`,`ov::runtime::Core::import_model`| Creates a CompiledModel from a previously exported model +|Asynchronous Infer| `ov::runtime::InferRequest::start_async`, `ov::runtime::InferRequest::wait`| Do asynchronous inference and waits until inference result becomes available +|InferRequest Operations|`ov::runtime::InferRequest::query_state`, `ov::runtime::VariableState::reset`| Gets and resets state control interface for given CompiledModel +|InferRequest Operations|`ov::runtime::InferRequest::get_profiling_info`| Get profiling info for infer request Basic Inference Engine API is covered by [Hello Classification C++ sample](../hello_classification/README.md). 
From c23025dfd2e3da57981e7fc7328074e3895759ea Mon Sep 17 00:00:00 2001 From: Mateusz Tabaka Date: Thu, 30 Dec 2021 01:34:19 +0100 Subject: [PATCH 31/78] [GPU] Allow networks with gather to use b_fs_yx_fsv16 format (#9249) Gather is an alternative approach used by model optimizer to implement --reverse_input_channels option. Currently, if it's in use, convolutions in GPU plugin may use other formats that leads to choosing less performant kernels. --- src/plugins/intel_gpu/src/graph/program.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/plugins/intel_gpu/src/graph/program.cpp b/src/plugins/intel_gpu/src/graph/program.cpp index 364a3b36c5c..3eca5027a15 100644 --- a/src/plugins/intel_gpu/src/graph/program.cpp +++ b/src/plugins/intel_gpu/src/graph/program.cpp @@ -1347,8 +1347,10 @@ void program::set_layout_optimizer_attributes(layout_optimizer& lo) { prim.type() != cldnn::strided_slice::type_id() && prim.type() != cldnn::region_yolo::type_id() && prim.type() != cldnn::normalize::type_id() && - prim.type() != cldnn::mvn::type_id()) + prim.type() != cldnn::mvn::type_id() && + prim.type() != cldnn::gather::type_id()) { can_use_fsv16 = false; + } if (prim.type() == cldnn::quantize::type_id() && (prim.get_output_layout().data_type == data_types::i8 || prim.get_output_layout().data_type == data_types::u8)) { From 5beb5dca3d7776c413a0946d73dbb52503662806 Mon Sep 17 00:00:00 2001 From: Mingyu Kim Date: Thu, 30 Dec 2021 16:20:35 +0900 Subject: [PATCH 32/78] [GPU] Update onednn to v2.6-pc (#9479) --- src/plugins/intel_gpu/thirdparty/onednn_gpu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/plugins/intel_gpu/thirdparty/onednn_gpu b/src/plugins/intel_gpu/thirdparty/onednn_gpu index b2cd3a8e50a..10c3cb8efc5 160000 --- a/src/plugins/intel_gpu/thirdparty/onednn_gpu +++ b/src/plugins/intel_gpu/thirdparty/onednn_gpu @@ -1 +1 @@ -Subproject commit b2cd3a8e50a715f9326a35f4c503bd11e60235a5 +Subproject commit 
10c3cb8efc5213aca2fb2cc6f960921fe6a5636f From bea10d6e3c62e99fe71986128a95db2b51b48fc5 Mon Sep 17 00:00:00 2001 From: Chen Xu Date: Thu, 30 Dec 2021 21:47:48 +0800 Subject: [PATCH 33/78] [CPU] Optimize Broadcast node for case with scalar input (#9358) --- .../src/nodes/common/tile_broadcast_utils.cpp | 68 ++++++++++--- .../src/nodes/common/tile_broadcast_utils.h | 1 + .../cpu/single_layer_tests/broadcast.cpp | 98 +++++++++++++++++++ 3 files changed, 156 insertions(+), 11 deletions(-) diff --git a/src/plugins/intel_cpu/src/nodes/common/tile_broadcast_utils.cpp b/src/plugins/intel_cpu/src/nodes/common/tile_broadcast_utils.cpp index d93a25196c6..aae69ea4afd 100644 --- a/src/plugins/intel_cpu/src/nodes/common/tile_broadcast_utils.cpp +++ b/src/plugins/intel_cpu/src/nodes/common/tile_broadcast_utils.cpp @@ -211,23 +211,69 @@ bool TileBroadcastCommon::prepareOptimizedParams(const MKLDNNNode *node, VectorD return true; } +// Broadcast 1 element to N continuous elements based on cpu_memcpy +// Step 1: Get the binary format of the number N +// Step 2: Use cpu_memcpy to form fragments containing pow(2, k) (ie. 2, 4, 8, ...) 
elements, based on the given 1 element +// Step 3: Form N continuous elements, who's a combination of those fragments, demonstrated by its binary format +void TileBroadcastCommon::broadcastScalar(const char *srcData, char *dstData, size_t elt_cnt, size_t data_size) { + std::vector binary_digits; + + binary_digits.clear(); + for (size_t tmp_cnt = elt_cnt; tmp_cnt > 0; tmp_cnt >>= 1) { + binary_digits.emplace_back(tmp_cnt & 0x1); + } + + size_t min_cnt = 1; + size_t max_cnt = 1; + auto curDstData = dstData; + for (auto b : binary_digits) { + if (b) { + if (curDstData == dstData) { + cpu_memcpy(curDstData, srcData, min_cnt * data_size); + } else { + cpu_memcpy(curDstData, dstData, min_cnt * data_size); + } + curDstData += min_cnt * data_size; + for (size_t cur_cnt = min_cnt; cur_cnt < max_cnt; cur_cnt <<= 1) { + cpu_memcpy(curDstData, dstData, cur_cnt * data_size); + curDstData += cur_cnt * data_size; + } + min_cnt = max_cnt; + } + max_cnt <<= 1; + } +} + void TileBroadcastCommon::optimizedExecute(const MKLDNNMemoryPtr& srcMemory, const MKLDNNMemoryPtr& dstMemory) { auto srcData = reinterpret_cast(srcMemory->GetPtr()); auto dstData = reinterpret_cast(dstMemory->GetPtr()); if (optimizedParams.srcStrides[5] == 0) { - parallel_for5d(optimizedParams.dims[0], optimizedParams.dims[1], optimizedParams.dims[2], optimizedParams.dims[3], optimizedParams.dims[4], - [&](int i0, int i1, int i2, int i3, int i4) { - auto srcData2 = srcData + (i0 * optimizedParams.srcStrides[0] + i1 * optimizedParams.srcStrides[1] + - i2 * optimizedParams.srcStrides[2] + i3 * optimizedParams.srcStrides[3] + - i4 * optimizedParams.srcStrides[4]); - auto dstData2 = dstData + (i0 * optimizedParams.dstStrides[0] + i1 * optimizedParams.dstStrides[1] + - i2 * optimizedParams.dstStrides[2] + i3 * optimizedParams.dstStrides[3] + - i4 * optimizedParams.dstStrides[4]); - for (int i = 0; i < optimizedParams.dims[5]; i++) { - cpu_memcpy(dstData2 + i * optimizedParams.dstStrides[5], srcData2, 
optimizedParams.dstStrides[5]); + if (optimizedParams.dstStrides[0] == optimizedParams.dims[5] * optimizedParams.dstStrides[5]) { + size_t data_size = optimizedParams.dstStrides[5]; + size_t elt_cnt = optimizedParams.dims[5]; + auto srcData_i32 = reinterpret_cast(srcMemory->GetPtr()); + if (data_size == 1) { + memset(dstData, srcData[0], elt_cnt); + } else if (data_size == 4 && srcData_i32[0] == 0) { + memset(dstData, 0, elt_cnt * data_size); + } else { + broadcastScalar(srcData, dstData, elt_cnt, data_size); } - }); + } else { + parallel_for5d(optimizedParams.dims[0], optimizedParams.dims[1], optimizedParams.dims[2], optimizedParams.dims[3], optimizedParams.dims[4], + [&](int i0, int i1, int i2, int i3, int i4) { + auto srcData2 = srcData + (i0 * optimizedParams.srcStrides[0] + i1 * optimizedParams.srcStrides[1] + + i2 * optimizedParams.srcStrides[2] + i3 * optimizedParams.srcStrides[3] + + i4 * optimizedParams.srcStrides[4]); + auto dstData2 = dstData + (i0 * optimizedParams.dstStrides[0] + i1 * optimizedParams.dstStrides[1] + + i2 * optimizedParams.dstStrides[2] + i3 * optimizedParams.dstStrides[3] + + i4 * optimizedParams.dstStrides[4]); + for (int i = 0; i < optimizedParams.dims[5]; i++) { + cpu_memcpy(dstData2 + i * optimizedParams.dstStrides[5], srcData2, optimizedParams.dstStrides[5]); + } + }); + } } else { parallel_for5d(optimizedParams.dims[0], optimizedParams.dims[1], optimizedParams.dims[2], optimizedParams.dims[3], optimizedParams.dims[4], [&](int i0, int i1, int i2, int i3, int i4) { diff --git a/src/plugins/intel_cpu/src/nodes/common/tile_broadcast_utils.h b/src/plugins/intel_cpu/src/nodes/common/tile_broadcast_utils.h index e5a60c81a08..a7ca767fec9 100644 --- a/src/plugins/intel_cpu/src/nodes/common/tile_broadcast_utils.h +++ b/src/plugins/intel_cpu/src/nodes/common/tile_broadcast_utils.h @@ -28,6 +28,7 @@ protected: private: static void fillOptimizedDimsAndSrcStrides(const VectorDims &srcBlockedDims, const VectorDims &blockedRepeats, VectorDims 
&optimizedDims, VectorDims &optimizedSrcStrides); + static void broadcastScalar(const char *srcData, char *dstData, size_t elt_cnt, size_t data_size); static bool canBeExecutedInBlockedLayout(VectorDims srcDims, VectorDims repeats, const size_t elemsInBlock); static bool canBeExecutedInNSPCLayout(VectorDims srcDims, VectorDims repeats); diff --git a/src/tests/functional/plugin/cpu/single_layer_tests/broadcast.cpp b/src/tests/functional/plugin/cpu/single_layer_tests/broadcast.cpp index 5833d20f95f..0d0b4e9fcff 100644 --- a/src/tests/functional/plugin/cpu/single_layer_tests/broadcast.cpp +++ b/src/tests/functional/plugin/cpu/single_layer_tests/broadcast.cpp @@ -266,6 +266,29 @@ INSTANTIATE_TEST_CASE_P(smoke_StaticShape4DE, BroadcastLayerCPUTest, ::testing::Values(CPUSpecificParams{{}, {}, {}, "ref"})), BroadcastLayerCPUTest::getTestCaseName); +const std::vector> staticInputShapesScalar = { + { + {{}, + { // Static shapes + {1} + } + } + } +}; + +INSTANTIATE_TEST_CASE_P(smoke_StaticShape4DScalar, BroadcastLayerCPUTest, + ::testing::Combine( + ::testing::Combine( + ::testing::ValuesIn(staticInputShapesScalar), + ::testing::Values(std::vector{1, 16, 3, 3}), + ::testing::Values(std::vector{}), + ::testing::Values(ov::op::BroadcastType::NUMPY), + ::testing::ValuesIn(inputPrecisions), + ::testing::Values(std::vector{true, true}), + ::testing::Values(CommonTestUtils::DEVICE_CPU)), + ::testing::Values(CPUSpecificParams{{}, {}, {}, "ref"})), + BroadcastLayerCPUTest::getTestCaseName); + const std::vector> dynamicInputShapes4D = { { { // Origin dynamic shapes @@ -300,6 +323,30 @@ INSTANTIATE_TEST_CASE_P(smoke_DynamicShape4D, BroadcastLayerCPUTest, ::testing::Values(CPUSpecificParams{{}, {}, {}, "ref"})), BroadcastLayerCPUTest::getTestCaseName); +const std::vector> dynamicInputShapesScalar = { + { + { // Origin dynamic shapes + {-1}, + { // Dynamic shapes instances + {1}, + {7} + } + } + } +}; + +INSTANTIATE_TEST_CASE_P(smoke_DynamicShape4DScalar, BroadcastLayerCPUTest, + 
::testing::Combine(::testing::Combine( + ::testing::ValuesIn(dynamicInputShapesScalar), + ::testing::Values(std::vector{8, 16, 1, 7}), + ::testing::Values(std::vector{}), + ::testing::Values(ov::op::BroadcastType::NUMPY), + ::testing::ValuesIn(inputPrecisions), + ::testing::ValuesIn(std::vector>{{true, true}, {false, true}}), + ::testing::Values(CommonTestUtils::DEVICE_CPU)), + ::testing::Values(CPUSpecificParams{{}, {}, {}, "ref"})), + BroadcastLayerCPUTest::getTestCaseName); + // 5D const std::vector> staticInputShapes5D = { { @@ -355,6 +402,19 @@ INSTANTIATE_TEST_CASE_P(smoke_StaticShape5D, BroadcastLayerCPUTest, ::testing::ValuesIn(CPUParams5D)), BroadcastLayerCPUTest::getTestCaseName); +INSTANTIATE_TEST_CASE_P(smoke_StaticShape5DScalar, BroadcastLayerCPUTest, + ::testing::Combine( + ::testing::Combine( + ::testing::ValuesIn(staticInputShapesScalar), + ::testing::Values(std::vector{1, 16, 3, 1, 3}), + ::testing::Values(std::vector{}), + ::testing::Values(ov::op::BroadcastType::NUMPY), + ::testing::ValuesIn(inputPrecisions), + ::testing::Values(std::vector{true, true}), + ::testing::Values(CommonTestUtils::DEVICE_CPU)), + ::testing::Values(CPUSpecificParams{{}, {}, {}, "ref"})), + BroadcastLayerCPUTest::getTestCaseName); + INSTANTIATE_TEST_CASE_P(smoke_DynamicShape5D, BroadcastLayerCPUTest, ::testing::Combine( ::testing::Combine( @@ -367,6 +427,44 @@ INSTANTIATE_TEST_CASE_P(smoke_DynamicShape5D, BroadcastLayerCPUTest, ::testing::Values(CommonTestUtils::DEVICE_CPU)), ::testing::Values(CPUSpecificParams{{}, {}, {}, "ref"})), BroadcastLayerCPUTest::getTestCaseName); + +INSTANTIATE_TEST_CASE_P(smoke_DynamicShape5DScalar, BroadcastLayerCPUTest, + ::testing::Combine( + ::testing::Combine( + ::testing::ValuesIn(dynamicInputShapesScalar), + ::testing::Values(std::vector{8, 16, 1, 1, 7}), + ::testing::Values(std::vector{}), + ::testing::Values(ov::op::BroadcastType::NUMPY), + ::testing::ValuesIn(inputPrecisions), + ::testing::ValuesIn(std::vector>{{true, true}, {false, 
true}}), + ::testing::Values(CommonTestUtils::DEVICE_CPU)), + ::testing::Values(CPUSpecificParams{{}, {}, {}, "ref"})), + BroadcastLayerCPUTest::getTestCaseName); + +// 1D +const std::vector> dynamicShapes1D = { + { + { // Origin dynamic shapes + {-1}, + { // Dynamic shapes instances + {}, + {1} + } + } + } +}; + +INSTANTIATE_TEST_CASE_P(smoke_DynamicShapes1D, BroadcastLayerCPUTest, + ::testing::Combine(::testing::Combine( + ::testing::ValuesIn(dynamicShapes1D), + ::testing::Values(std::vector{0}), + ::testing::Values(std::vector{}), + ::testing::Values(ov::op::BroadcastType::NUMPY), + ::testing::ValuesIn(inputPrecisions), + ::testing::ValuesIn(std::vector>{{false, true}}), + ::testing::Values(CommonTestUtils::DEVICE_CPU)), + ::testing::Values(CPUSpecificParams{{}, {}, {}, "ref"})), + BroadcastLayerCPUTest::getTestCaseName); /* ========= */ } // namespace From 8ba94cfb8fec16e4902b7a0c2c8e745eccee38dc Mon Sep 17 00:00:00 2001 From: Maxim Andronov Date: Thu, 30 Dec 2021 17:04:33 +0300 Subject: [PATCH 34/78] [CPU] Fix memory allocation for non default shape infer path (#9475) --- .../intel_cpu/src/nodes/mkldnn_matrix_nms_node.cpp | 11 ++--------- .../intel_cpu/src/nodes/mkldnn_multiclass_nms.cpp | 11 ++--------- .../src/nodes/mkldnn_non_max_suppression_node.cpp | 8 ++------ src/plugins/intel_cpu/src/nodes/mkldnn_non_zero.cpp | 2 +- src/plugins/intel_cpu/src/nodes/mkldnn_range_node.cpp | 4 +++- 5 files changed, 10 insertions(+), 26 deletions(-) diff --git a/src/plugins/intel_cpu/src/nodes/mkldnn_matrix_nms_node.cpp b/src/plugins/intel_cpu/src/nodes/mkldnn_matrix_nms_node.cpp index c9668785548..120f4544ad6 100644 --- a/src/plugins/intel_cpu/src/nodes/mkldnn_matrix_nms_node.cpp +++ b/src/plugins/intel_cpu/src/nodes/mkldnn_matrix_nms_node.cpp @@ -287,12 +287,7 @@ bool MKLDNNMatrixNmsNode::isExecutable() const { void MKLDNNMatrixNmsNode::executeDynamicImpl(mkldnn::stream strm) { if (hasEmptyInputTensors()) { - 
getChildEdgesAtPort(NMS_SELECTED_OUTPUTS)[0]->getMemoryPtr()->redefineDesc( - getBaseMemDescAtOutputPort(NMS_SELECTED_OUTPUTS)->cloneWithNewDims({0, 6})); - getChildEdgesAtPort(NMS_SELECTED_INDICES)[0]->getMemoryPtr()->redefineDesc( - getBaseMemDescAtOutputPort(NMS_SELECTED_INDICES)->cloneWithNewDims({0, 1})); - getChildEdgesAtPort(NMS_VALID_OUTPUTS)[0]->getMemoryPtr()->redefineDesc( - getBaseMemDescAtOutputPort(NMS_VALID_OUTPUTS)->cloneWithNewDims({0})); + redefineOutputMemory({{0, 6}, {0, 1}, {0}}); return; } execute(strm); @@ -375,9 +370,7 @@ void MKLDNNMatrixNmsNode::execute(mkldnn::stream strm) { // TODO [DS NMS]: remove when nodes from models where nms is not last node in model supports DS if (isDynamicNode()) { size_t totalBox = std::accumulate(m_numPerBatch.begin(), m_numPerBatch.end(), 0); - selectedOutputsMemPtr->redefineDesc(getBaseMemDescAtOutputPort(NMS_SELECTED_OUTPUTS)->cloneWithNewDims({totalBox, 6})); - selectedIndicesMemPtr->redefineDesc(getBaseMemDescAtOutputPort(NMS_SELECTED_INDICES)->cloneWithNewDims({totalBox, 1})); - validOutputsMemPtr->redefineDesc(getBaseMemDescAtOutputPort(NMS_VALID_OUTPUTS)->cloneWithNewDims({m_numBatches})); + redefineOutputMemory({{totalBox, 6}, {totalBox, 1}, {m_numBatches}}); } float* selectedOutputs = reinterpret_cast(selectedOutputsMemPtr->GetPtr()); int* selectedIndices = reinterpret_cast(selectedIndicesMemPtr->GetPtr()); diff --git a/src/plugins/intel_cpu/src/nodes/mkldnn_multiclass_nms.cpp b/src/plugins/intel_cpu/src/nodes/mkldnn_multiclass_nms.cpp index 15d810fbb4d..9d21c37338e 100644 --- a/src/plugins/intel_cpu/src/nodes/mkldnn_multiclass_nms.cpp +++ b/src/plugins/intel_cpu/src/nodes/mkldnn_multiclass_nms.cpp @@ -144,12 +144,7 @@ bool MKLDNNMultiClassNmsNode::isExecutable() const { void MKLDNNMultiClassNmsNode::executeDynamicImpl(mkldnn::stream strm) { if (hasEmptyInputTensors()) { - getChildEdgesAtPort(NMS_SELECTEDOUTPUTS)[0]->getMemoryPtr()->redefineDesc( - 
getBaseMemDescAtOutputPort(NMS_SELECTEDOUTPUTS)->cloneWithNewDims({0, 6})); - getChildEdgesAtPort(NMS_SELECTEDINDICES)[0]->getMemoryPtr()->redefineDesc( - getBaseMemDescAtOutputPort(NMS_SELECTEDINDICES)->cloneWithNewDims({0, 1})); - getChildEdgesAtPort(NMS_SELECTEDNUM)[0]->getMemoryPtr()->redefineDesc( - getBaseMemDescAtOutputPort(NMS_SELECTEDNUM)->cloneWithNewDims({0})); + redefineOutputMemory({{0, 6}, {0, 1}, {0}}); return; } execute(strm); @@ -268,9 +263,7 @@ void MKLDNNMultiClassNmsNode::execute(mkldnn::stream strm) { // TODO [DS NMS]: remove when nodes from models where nms is not last node in model supports DS if (isDynamicNode()) { size_t totalBox = std::accumulate(m_selected_num.begin(), m_selected_num.end(), 0); - selectedOutputsMemPtr->redefineDesc(getBaseMemDescAtOutputPort(NMS_SELECTEDOUTPUTS)->cloneWithNewDims({totalBox, 6})); - selectedIndicesMemPtr->redefineDesc(getBaseMemDescAtOutputPort(NMS_SELECTEDINDICES)->cloneWithNewDims({totalBox, 1})); - validOutputsMemPtr->redefineDesc(getBaseMemDescAtOutputPort(NMS_SELECTEDNUM)->cloneWithNewDims({m_numBatches})); + redefineOutputMemory({{totalBox, 6}, {totalBox, 1}, {m_numBatches}}); } int* selected_indices = reinterpret_cast(selectedIndicesMemPtr->GetPtr()); float* selected_outputs = reinterpret_cast(selectedOutputsMemPtr->GetPtr()); diff --git a/src/plugins/intel_cpu/src/nodes/mkldnn_non_max_suppression_node.cpp b/src/plugins/intel_cpu/src/nodes/mkldnn_non_max_suppression_node.cpp index 3cc419f03dd..a921c16b51a 100644 --- a/src/plugins/intel_cpu/src/nodes/mkldnn_non_max_suppression_node.cpp +++ b/src/plugins/intel_cpu/src/nodes/mkldnn_non_max_suppression_node.cpp @@ -713,10 +713,7 @@ void MKLDNNNonMaxSuppressionNode::createJitKernel() { void MKLDNNNonMaxSuppressionNode::executeDynamicImpl(mkldnn::stream strm) { if (hasEmptyInputTensors() || (inputShapes.size() > NMS_MAXOUTPUTBOXESPERCLASS && reinterpret_cast(getParentEdgeAt(NMS_MAXOUTPUTBOXESPERCLASS)->getMemoryPtr()->GetPtr())[0] == 0)) { - 
getChildEdgesAtPort(NMS_SELECTEDINDICES)[0]->getMemoryPtr()->redefineDesc( - getBaseMemDescAtOutputPort(NMS_SELECTEDINDICES)->cloneWithNewDims({0, 3})); - getChildEdgesAtPort(NMS_SELECTEDSCORES)[0]->getMemoryPtr()->redefineDesc( - getBaseMemDescAtOutputPort(NMS_SELECTEDSCORES)->cloneWithNewDims({0, 3})); + redefineOutputMemory({{0, 3}, {0, 3}, {1}}); *reinterpret_cast(getChildEdgesAtPort(NMS_VALIDOUTPUTS)[0]->getMemoryPtr()->GetPtr()) = 0; return; } @@ -794,8 +791,7 @@ void MKLDNNNonMaxSuppressionNode::execute(mkldnn::stream strm) { // TODO [DS NMS]: remove when nodes from models where nms is not last node in model supports DS if (isDynamicNode()) { VectorDims newDims{validOutputs, 3}; - indicesMemPtr->redefineDesc(getBaseMemDescAtOutputPort(NMS_SELECTEDINDICES)->cloneWithNewDims(newDims)); - scoresMemPtr->redefineDesc(getBaseMemDescAtOutputPort(NMS_SELECTEDSCORES)->cloneWithNewDims(newDims)); + redefineOutputMemory({newDims, newDims, {1}}); } int selectedIndicesStride = indicesMemPtr->GetDescWithType()->getStrides()[0]; diff --git a/src/plugins/intel_cpu/src/nodes/mkldnn_non_zero.cpp b/src/plugins/intel_cpu/src/nodes/mkldnn_non_zero.cpp index 5e270ba56cc..c836937e1f5 100644 --- a/src/plugins/intel_cpu/src/nodes/mkldnn_non_zero.cpp +++ b/src/plugins/intel_cpu/src/nodes/mkldnn_non_zero.cpp @@ -112,7 +112,7 @@ void MKLDNNNonZeroNode::executeSpecified() { if (isDynamicNode()) { VectorDims newDims{inRank, nonZeroCount}; - dstMemPtr->redefineDesc(getBaseMemDescAtOutputPort(0)->cloneWithNewDims(newDims)); + redefineOutputMemory({newDims}); } int *dst = reinterpret_cast(dstMemPtr->GetPtr()); size_t inSize = inShape.getElementsCount(); diff --git a/src/plugins/intel_cpu/src/nodes/mkldnn_range_node.cpp b/src/plugins/intel_cpu/src/nodes/mkldnn_range_node.cpp index 10e9d14c6ba..48175474b79 100644 --- a/src/plugins/intel_cpu/src/nodes/mkldnn_range_node.cpp +++ b/src/plugins/intel_cpu/src/nodes/mkldnn_range_node.cpp @@ -104,6 +104,7 @@ void 
MKLDNNRangeNode::execute(mkldnn::stream strm) { IE_THROW() << errorMsg; } } + template size_t MKLDNNRangeNode::getWorkAmount(data_t *startPtr, data_t *stopPtr, data_t *stepPtr) const { data_t start = 0, limit = 0, delta = 0; @@ -126,13 +127,14 @@ size_t MKLDNNRangeNode::getWorkAmount(data_t *startPtr, data_t *stopPtr, data_t return static_cast(std::ceil(std::fabs(span) / std::fabs(step))); } } + template InferenceEngine::StatusCode MKLDNNRangeNode::rangeKernel() { data_t start = 0, delta = 0; size_t work_amount_dst = getWorkAmount(&start, nullptr, &delta); if (isDynamicNode()) { VectorDims newOutputShape {work_amount_dst}; - getChildEdgeAt(0)->getMemoryPtr()->redefineDesc(getBaseMemDescAtOutputPort(0)->cloneWithNewDims(newOutputShape)); + redefineOutputMemory({newOutputShape}); } data_t* dst_data = reinterpret_cast(getChildEdgesAtPort(0)[0]->getMemoryPtr()->GetPtr()); parallel_nt(0, [&](const int ithr, const int nthr) { From ec5198094ac2cfe930fcc5869f8913022717e5b9 Mon Sep 17 00:00:00 2001 From: Edward Shogulin Date: Thu, 30 Dec 2021 17:43:16 +0300 Subject: [PATCH 35/78] [CPU] PriorBox & PriorBoxClustered dynamism enabling (#8597) --- src/core/src/op/prior_box_clustered.cpp | 2 +- src/plugins/intel_cpu/src/cpu_types.cpp | 4 +- src/plugins/intel_cpu/src/cpu_types.h | 4 +- .../intel_cpu/src/mkldnn_nodes_factory.cpp | 4 + .../nodes/mkldnn_priorbox_clustered_node.cpp | 170 +++++++++ .../nodes/mkldnn_priorbox_clustered_node.h | 46 +++ .../src/nodes/mkldnn_priorbox_node.cpp | 324 ++++++++++++++++++ .../src/nodes/mkldnn_priorbox_node.h | 52 +++ .../skip_tests_config.cpp | 3 +- .../cpu/single_layer_tests/prior_box.cpp | 229 +++++++++++++ .../prior_box_clustered.cpp | 228 ++++++++++++ .../src/base/ov_subgraph.cpp | 12 +- .../src/single_layer/prior_box.cpp | 5 +- 13 files changed, 1073 insertions(+), 10 deletions(-) create mode 100644 src/plugins/intel_cpu/src/nodes/mkldnn_priorbox_clustered_node.cpp create mode 100644 
src/plugins/intel_cpu/src/nodes/mkldnn_priorbox_clustered_node.h create mode 100644 src/plugins/intel_cpu/src/nodes/mkldnn_priorbox_node.cpp create mode 100644 src/plugins/intel_cpu/src/nodes/mkldnn_priorbox_node.h create mode 100644 src/tests/functional/plugin/cpu/single_layer_tests/prior_box.cpp create mode 100644 src/tests/functional/plugin/cpu/single_layer_tests/prior_box_clustered.cpp diff --git a/src/core/src/op/prior_box_clustered.cpp b/src/core/src/op/prior_box_clustered.cpp index d9c7fbfdcff..9fb769f86e4 100644 --- a/src/core/src/op/prior_box_clustered.cpp +++ b/src/core/src/op/prior_box_clustered.cpp @@ -68,7 +68,7 @@ void ov::op::v0::PriorBoxClustered::validate_and_infer_types() { const auto num_priors = m_attrs.widths.size(); set_output_type(0, element::f32, ov::Shape{2, 4 * layer_shape[0] * layer_shape[1] * num_priors}); } else { - set_output_type(0, element::f32, ov::PartialShape::dynamic()); + set_output_type(0, element::f32, ov::PartialShape{2, Dimension::dynamic()}); } } diff --git a/src/plugins/intel_cpu/src/cpu_types.cpp b/src/plugins/intel_cpu/src/cpu_types.cpp index 5be7d6f8786..cf73c05563d 100644 --- a/src/plugins/intel_cpu/src/cpu_types.cpp +++ b/src/plugins/intel_cpu/src/cpu_types.cpp @@ -185,7 +185,9 @@ const InferenceEngine::details::caseless_unordered_map type_t { "MatrixNms", MatrixNms}, { "MulticlassNms", MulticlassNms}, { "Reference", Reference}, - { "Subgraph", Subgraph} + { "Subgraph", Subgraph}, + { "PriorBox", PriorBox}, + { "PriorBoxClustered", PriorBoxClustered}, }; Type TypeFromName(const std::string& type) { diff --git a/src/plugins/intel_cpu/src/cpu_types.h b/src/plugins/intel_cpu/src/cpu_types.h index c9df96d6ba6..72dd6643ebf 100644 --- a/src/plugins/intel_cpu/src/cpu_types.h +++ b/src/plugins/intel_cpu/src/cpu_types.h @@ -101,7 +101,9 @@ enum Type { NonMaxSuppression, MatrixNms, MulticlassNms, - Subgraph + Subgraph, + PriorBox, + PriorBoxClustered, }; enum Algorithm { diff --git 
a/src/plugins/intel_cpu/src/mkldnn_nodes_factory.cpp b/src/plugins/intel_cpu/src/mkldnn_nodes_factory.cpp index 75b01e84a20..5ba59066acc 100644 --- a/src/plugins/intel_cpu/src/mkldnn_nodes_factory.cpp +++ b/src/plugins/intel_cpu/src/mkldnn_nodes_factory.cpp @@ -83,6 +83,8 @@ #include "nodes/mkldnn_non_zero.h" #include "nodes/mkldnn_color_convert_node.h" #include "nodes/subgraph.h" +#include "nodes/mkldnn_priorbox_node.h" +#include "nodes/mkldnn_priorbox_clustered_node.h" #define MKLDNN_NODE(__prim, __type) \ registerNodeIfRequired(MKLDNNPlugin, __prim, __type, MKLDNNNodeImpl<__prim>) @@ -174,4 +176,6 @@ MKLDNNPlugin::MKLDNNNode::NodesFactory::NodesFactory() MKLDNN_NODE(MKLDNNNonZeroNode, NonZero); MKLDNN_NODE(MKLDNNSnippetNode, Subgraph); MKLDNN_NODE(MKLDNNColorConvertNode, ColorConvert); + MKLDNN_NODE(MKLDNNPriorBoxNode, PriorBox); + MKLDNN_NODE(MKLDNNPriorBoxClusteredNode, PriorBoxClustered); } diff --git a/src/plugins/intel_cpu/src/nodes/mkldnn_priorbox_clustered_node.cpp b/src/plugins/intel_cpu/src/nodes/mkldnn_priorbox_clustered_node.cpp new file mode 100644 index 00000000000..7efc11afddd --- /dev/null +++ b/src/plugins/intel_cpu/src/nodes/mkldnn_priorbox_clustered_node.cpp @@ -0,0 +1,170 @@ +// Copyright (C) 2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "mkldnn_priorbox_clustered_node.h" + +#include +#include +#include +#include + +#include +#include +#include +#include + +using namespace MKLDNNPlugin; +using namespace InferenceEngine; + +bool MKLDNNPriorBoxClusteredNode::isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept { + try { + const auto priorBox = std::dynamic_pointer_cast(op); + if (!priorBox) { + errorMessage = "Only opset1 PriorBoxClustered operation is supported"; + return false; + } + } catch (...) 
{ + return false; + } + return true; +} + +MKLDNNPriorBoxClusteredNode::MKLDNNPriorBoxClusteredNode( + const std::shared_ptr& op, + const mkldnn::engine& eng, + MKLDNNWeightsSharing::Ptr &cache) : MKLDNNNode(op, eng, cache) { + std::string errorMessage; + if (!isSupportedOperation(op, errorMessage)) { + IE_THROW(NotImplemented) << errorMessage; + } + + const auto priorBox = std::dynamic_pointer_cast(op); + const ngraph::opset1::PriorBoxClustered::Attributes& attrs = priorBox->get_attrs(); + + widths = attrs.widths; + heights = attrs.heights; + clip = attrs.clip; + variances = attrs.variances; + step = attrs.step; + step_heights = attrs.step_heights; + step_widths = attrs.step_widths; + offset = attrs.offset; + + number_of_priors = widths.size(); + + if (variances.empty()) { + variances.push_back(0.1f); + } +} + +bool MKLDNNPriorBoxClusteredNode::needShapeInfer() const { + auto& memory = getChildEdgeAt(0)->getMemoryPtr(); + if (memory->GetShape().isDynamic()) { + return true; + } + + const auto& outputShape = memory->GetShape().getStaticDims(); + const int* in_data = reinterpret_cast(memory->GetPtr()); + const int h = in_data[0]; + const int w = in_data[1]; + const auto output = static_cast(4 * h * w * number_of_priors); + + return outputShape[1] != output; +} + +std::vector MKLDNNPriorBoxClusteredNode::shapeInfer() const { + const int* in_data = reinterpret_cast(getParentEdgeAt(0)->getMemoryPtr()->GetPtr()); + const int H = in_data[0]; + const int W = in_data[1]; + const auto output = static_cast(4 * H * W * number_of_priors); + return {{2, output}}; +} + +bool MKLDNNPriorBoxClusteredNode::needPrepareParams() const { + return false; +} + +void MKLDNNPriorBoxClusteredNode::initSupportedPrimitiveDescriptors() { + if (!supportedPrimitiveDescriptors.empty()) + return; + + addSupportedPrimDesc( + {{LayoutType::ncsp, Precision::I32}, {LayoutType::ncsp, Precision::I32}}, + {{LayoutType::ncsp, Precision::FP32}}, + impl_desc_type::ref_any); +} + +void 
MKLDNNPriorBoxClusteredNode::createPrimitive() { + if (inputShapesDefined()) { + if (needPrepareParams()) + prepareParams(); + updateLastInputDims(); + } +} + +void MKLDNNPriorBoxClusteredNode::execute(mkldnn::stream strm) { + const int* in_data = reinterpret_cast(getParentEdgeAt(0)->getMemoryPtr()->GetPtr()); + const int layer_height = in_data[0]; + const int layer_width = in_data[1]; + + const int* in_image = reinterpret_cast(getParentEdgeAt(1)->getMemoryPtr()->GetPtr()); + int img_height = in_image[0]; + int img_width = in_image[1]; + + float step_w = step_widths == 0 ? step : step_widths; + float step_h = step_heights == 0 ? step : step_heights; + if (step_w == 0 && step_h == 0) { + step_w = static_cast(img_width) / layer_width; + step_h = static_cast(img_height) / layer_height; + } + + float* dst_data = reinterpret_cast(getChildEdgeAt(0)->getMemoryPtr()->GetPtr()); + const auto& out_shape = getChildEdgeAt(0)->getMemory().GetShape().getStaticDims(); + + size_t var_size = variances.size(); + parallel_for2d(layer_height, layer_width, [&](int64_t h, int64_t w) { + float center_x = (w + offset) * step_w; + float center_y = (h + offset) * step_h; + + for (size_t s = 0; s < number_of_priors; ++s) { + float box_width = widths[s]; + float box_height = heights[s]; + + float xmin = (center_x - box_width / 2.0f) / img_width; + float ymin = (center_y - box_height / 2.0f) / img_height; + float xmax = (center_x + box_width / 2.0f) / img_width; + float ymax = (center_y + box_height / 2.0f) / img_height; + + if (clip) { + xmin = (std::min)((std::max)(xmin, 0.0f), 1.0f); + ymin = (std::min)((std::max)(ymin, 0.0f), 1.0f); + xmax = (std::min)((std::max)(xmax, 0.0f), 1.0f); + ymax = (std::min)((std::max)(ymax, 0.0f), 1.0f); + } + + const uint64_t idx = h * layer_width * number_of_priors * 4 + w * number_of_priors * 4 + s * 4; + dst_data[idx + 0] = xmin; + dst_data[idx + 1] = ymin; + dst_data[idx + 2] = xmax; + dst_data[idx + 3] = ymax; + + // At this point we have either: + // 1. 
A single variance value (to be repeated 4 times for each prior) + // 2. 4 variance values + if (var_size == 1) { + for (size_t j = 0; j < 4; j++) + dst_data[idx + j + out_shape[1]] = variances[0]; + } else { + for (size_t j = 0; j < var_size; j++) + dst_data[idx + j + out_shape[1]] = variances[j]; + } + } + }); +} + +bool MKLDNNPriorBoxClusteredNode::created() const { + return getType() == PriorBoxClustered; +} + +REG_MKLDNN_PRIM_FOR(MKLDNNPriorBoxClusteredNode, PriorBoxClustered) diff --git a/src/plugins/intel_cpu/src/nodes/mkldnn_priorbox_clustered_node.h b/src/plugins/intel_cpu/src/nodes/mkldnn_priorbox_clustered_node.h new file mode 100644 index 00000000000..4af4acbc543 --- /dev/null +++ b/src/plugins/intel_cpu/src/nodes/mkldnn_priorbox_clustered_node.h @@ -0,0 +1,46 @@ +// Copyright (C) 2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include + +#include +#include + +namespace MKLDNNPlugin { + +class MKLDNNPriorBoxClusteredNode : public MKLDNNNode { +public: + MKLDNNPriorBoxClusteredNode(const std::shared_ptr& op, const mkldnn::engine& eng, MKLDNNWeightsSharing::Ptr &cache); + + void getSupportedDescriptors() override {}; + void initSupportedPrimitiveDescriptors() override; + void createPrimitive() override; + void execute(mkldnn::stream strm) override; + bool created() const override; + + bool needShapeInfer() const override; + std::vector shapeInfer() const override; + bool needPrepareParams() const override; + + void executeDynamicImpl(mkldnn::stream strm) override { execute(strm); } + + static bool isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept; + +private: + std::vector widths; + std::vector heights; + std::vector variances; + bool clip; + float step; + float step_heights; + float step_widths; + float offset; + + int number_of_priors; +}; + +} // namespace MKLDNNPlugin diff --git a/src/plugins/intel_cpu/src/nodes/mkldnn_priorbox_node.cpp 
b/src/plugins/intel_cpu/src/nodes/mkldnn_priorbox_node.cpp new file mode 100644 index 00000000000..76a4a28cd57 --- /dev/null +++ b/src/plugins/intel_cpu/src/nodes/mkldnn_priorbox_node.cpp @@ -0,0 +1,324 @@ +// Copyright (C) 2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "mkldnn_priorbox_node.h" + +#include +#include +#include +#include + +#include +#include +#include +#include + +using namespace MKLDNNPlugin; +using namespace InferenceEngine; + +#define THROW_ERROR IE_THROW() << "PriorBox layer with name '" << getName() << "': " + +namespace { +float clip_great(float x, float threshold) { + return x < threshold ? x : threshold; +} + +float clip_less(float x, float threshold) { + return x > threshold ? x : threshold; +} +} + +bool MKLDNNPriorBoxNode::isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept { + try { + const auto priorBox = std::dynamic_pointer_cast(op); + if (!priorBox) { + errorMessage = "Only opset1 PriorBox operation is supported"; + return false; + } + } catch (...) 
{ + return false; + } + return true; +} + +MKLDNNPriorBoxNode::MKLDNNPriorBoxNode( + const std::shared_ptr& op, + const mkldnn::engine& eng, + MKLDNNWeightsSharing::Ptr &cache) : MKLDNNNode(op, eng, cache) { + std::string errorMessage; + if (!isSupportedOperation(op, errorMessage)) { + IE_THROW(NotImplemented) << errorMessage; + } + + const auto priorBox = std::dynamic_pointer_cast(op); + const ngraph::opset1::PriorBox::Attributes& attrs = priorBox->get_attrs(); + offset = attrs.offset; + step = attrs.step; + min_size = attrs.min_size; + max_size = attrs.max_size; + flip = attrs.flip; + clip = attrs.clip; + scale_all_sizes = attrs.scale_all_sizes; + fixed_size = attrs.fixed_size; + fixed_ratio = attrs.fixed_ratio; + density = attrs.density; + + bool exist; + aspect_ratio.push_back(1.0f); + for (float aspect_ratio_item : attrs.aspect_ratio) { + exist = false; + + if (std::fabs(aspect_ratio_item) < std::numeric_limits::epsilon()) { + THROW_ERROR << "Aspect_ratio param can't be equal to zero"; + } + + for (float _aspect_ratio : aspect_ratio) { + if (fabs(aspect_ratio_item - _aspect_ratio) < 1e-6) { + exist = true; + break; + } + } + + if (exist) { + continue; + } + + aspect_ratio.push_back(aspect_ratio_item); + if (flip) { + aspect_ratio.push_back(1.0f / aspect_ratio_item); + } + } + + number_of_priors = ngraph::opset1::PriorBox::number_of_priors(attrs); + + if (attrs.variance.size() == 1 || attrs.variance.size() == 4) { + for (float i : attrs.variance) { + if (i < 0) { + THROW_ERROR << "Variance must be > 0."; + } + + variance.push_back(i); + } + } else if (attrs.variance.empty()) { + variance.push_back(0.1f); + } else { + THROW_ERROR << "Wrong number of variance values. 
Not less than 1 and more than 4 variance values."; + } +} + +bool MKLDNNPriorBoxNode::needShapeInfer() const { + auto& memory = getChildEdgeAt(0)->getMemoryPtr(); + if (memory->GetShape().isDynamic()) { + return true; + } + + const auto& outputShape = memory->GetShape().getStaticDims(); + const int* in_data = reinterpret_cast(memory->GetPtr()); + const int h = in_data[0]; + const int w = in_data[1]; + const auto output = static_cast(4 * h * w * number_of_priors); + + return outputShape[1] != output; +} + +std::vector MKLDNNPriorBoxNode::shapeInfer() const { + const int* in_data = reinterpret_cast(getParentEdgeAt(0)->getMemoryPtr()->GetPtr()); + const int H = in_data[0]; + const int W = in_data[1]; + const auto output = static_cast(4 * H * W * number_of_priors); + return {{2, output}}; +} + +bool MKLDNNPriorBoxNode::needPrepareParams() const { + return false; +} + +void MKLDNNPriorBoxNode::initSupportedPrimitiveDescriptors() { + if (!supportedPrimitiveDescriptors.empty()) + return; + + addSupportedPrimDesc( + {{LayoutType::ncsp, Precision::I32}, {LayoutType::ncsp, Precision::I32}}, + {{LayoutType::ncsp, Precision::FP32}}, + impl_desc_type::ref_any); +} + +void MKLDNNPriorBoxNode::createPrimitive() { + if (inputShapesDefined()) { + if (needPrepareParams()) + prepareParams(); + updateLastInputDims(); + } +} + +void MKLDNNPriorBoxNode::execute(mkldnn::stream strm) { + const int* in_data = reinterpret_cast(getParentEdgeAt(0)->getMemoryPtr()->GetPtr()); + const int H = in_data[0]; + const int W = in_data[1]; + + const int* in_image = reinterpret_cast(getParentEdgeAt(1)->getMemoryPtr()->GetPtr()); + const int IH = in_image[0]; + const int IW = in_image[1]; + + const int OH = 4 * H * W * number_of_priors; + const int OW = 1; + + float* dst_data = reinterpret_cast(getChildEdgeAt(0)->getMemoryPtr()->GetPtr()); + + float step_ = step; + auto min_size_ = min_size; + if (!scale_all_sizes) { + // mxnet-like PriorBox + if (step_ == -1) + step_ = 1.f * IH / H; + else + step_ *= 
IH; + for (auto& size : min_size_) + size *= IH; + } + + int64_t idx = 0; + float center_x, center_y, box_width, box_height, step_x, step_y; + float IWI = 1.0f / static_cast(IW); + float IHI = 1.0f / static_cast(IH); + + if (step_ == 0) { + step_x = static_cast(IW) / W; + step_y = static_cast(IH) / H; + } else { + step_x = step_; + step_y = step_; + } + + auto calculate_data = + [&dst_data, &IWI, &IHI, &idx](float center_x, float center_y, float box_width, float box_height, bool clip) { + if (clip) { + // order: xmin, ymin, xmax, ymax + dst_data[idx++] = clip_less((center_x - box_width) * IWI, 0); + dst_data[idx++] = clip_less((center_y - box_height) * IHI, 0); + dst_data[idx++] = clip_great((center_x + box_width) * IWI, 1); + dst_data[idx++] = clip_great((center_y + box_height) * IHI, 1); + } else { + dst_data[idx++] = (center_x - box_width) * IWI; + dst_data[idx++] = (center_y - box_height) * IHI; + dst_data[idx++] = (center_x + box_width) * IWI; + dst_data[idx++] = (center_y + box_height) * IHI; + } + }; + + for (int64_t h = 0; h < H; ++h) { + for (int64_t w = 0; w < W; ++w) { + if (step_ == 0) { + center_x = (w + 0.5f) * step_x; + center_y = (h + 0.5f) * step_y; + } else { + center_x = (offset + w) * step_; + center_y = (offset + h) * step_; + } + + for (size_t s = 0; s < fixed_size.size(); ++s) { + auto fixed_size_ = static_cast(fixed_size[s]); + box_width = box_height = fixed_size_ * 0.5f; + + if (!fixed_ratio.empty()) { + for (float ar : fixed_ratio) { + auto density_ = static_cast(density[s]); + auto shift = static_cast(fixed_size[s] / density_); + ar = std::sqrt(ar); + float box_width_ratio = fixed_size[s] * 0.5f * ar; + float box_height_ratio = fixed_size[s] * 0.5f / ar; + for (int64_t r = 0; r < density_; ++r) { + for (int64_t c = 0; c < density_; ++c) { + float center_x_temp = center_x - fixed_size_ / 2 + shift / 2.f + c * shift; + float center_y_temp = center_y - fixed_size_ / 2 + shift / 2.f + r * shift; + calculate_data(center_x_temp, center_y_temp, 
box_width_ratio, box_height_ratio, true); + } + } + } + } else { + if (!density.empty()) { + auto density_ = static_cast(density[s]); + auto shift = static_cast(fixed_size[s] / density_); + for (int64_t r = 0; r < density_; ++r) { + for (int64_t c = 0; c < density_; ++c) { + float center_x_temp = center_x - fixed_size_ / 2 + shift / 2.f + c * shift; + float center_y_temp = center_y - fixed_size_ / 2 + shift / 2.f + r * shift; + calculate_data(center_x_temp, center_y_temp, box_width, box_height, true); + } + } + } + // Rest of priors + for (float ar : aspect_ratio) { + if (fabs(ar - 1.) < 1e-6) { + continue; + } + + auto density_ = static_cast(density[s]); + auto shift = static_cast(fixed_size[s] / density_); + ar = std::sqrt(ar); + float box_width_ratio = fixed_size[s] * 0.5f * ar; + float box_height_ratio = fixed_size[s] * 0.5f / ar; + for (int64_t r = 0; r < density_; ++r) { + for (int64_t c = 0; c < density_; ++c) { + float center_x_temp = center_x - fixed_size_ / 2 + shift / 2.f + c * shift; + float center_y_temp = center_y - fixed_size_ / 2 + shift / 2.f + r * shift; + calculate_data(center_x_temp, center_y_temp, box_width_ratio, box_height_ratio, true); + } + } + } + } + } + + for (size_t ms_idx = 0; ms_idx < min_size_.size(); ms_idx++) { + box_width = min_size_[ms_idx] * 0.5f; + box_height = min_size_[ms_idx] * 0.5f; + calculate_data(center_x, center_y, box_width, box_height, false); + + if (max_size.size() > ms_idx) { + box_width = box_height = std::sqrt(min_size_[ms_idx] * max_size[ms_idx]) * 0.5f; + calculate_data(center_x, center_y, box_width, box_height, false); + } + + if (scale_all_sizes || (!scale_all_sizes && (ms_idx == min_size_.size() - 1))) { + size_t s_idx = scale_all_sizes ? 
ms_idx : 0; + for (float ar : aspect_ratio) { + if (std::fabs(ar - 1.0f) < 1e-6) { + continue; + } + + ar = std::sqrt(ar); + box_width = min_size_[s_idx] * 0.5f * ar; + box_height = min_size_[s_idx] * 0.5f / ar; + calculate_data(center_x, center_y, box_width, box_height, false); + } + } + } + } + } + + if (clip) { + parallel_for((H * W * number_of_priors * 4), [&](size_t i) { + dst_data[i] = (std::min)((std::max)(dst_data[i], 0.0f), 1.0f); + }); + } + + uint64_t channel_size = OH * OW; + if (variance.size() == 1) { + parallel_for(channel_size, [&](size_t i) { + dst_data[i + channel_size] = variance[0]; + }); + } else { + parallel_for(H * W * number_of_priors, [&](size_t i) { + for (size_t j = 0; j < 4; ++j) { + dst_data[i * 4 + j + channel_size] = variance[j]; + } + }); + } +} + +bool MKLDNNPriorBoxNode::created() const { + return getType() == PriorBox; +} + +REG_MKLDNN_PRIM_FOR(MKLDNNPriorBoxNode, PriorBox) diff --git a/src/plugins/intel_cpu/src/nodes/mkldnn_priorbox_node.h b/src/plugins/intel_cpu/src/nodes/mkldnn_priorbox_node.h new file mode 100644 index 00000000000..bb054b2df15 --- /dev/null +++ b/src/plugins/intel_cpu/src/nodes/mkldnn_priorbox_node.h @@ -0,0 +1,52 @@ +// Copyright (C) 2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include + +#include +#include + +namespace MKLDNNPlugin { + +class MKLDNNPriorBoxNode : public MKLDNNNode { +public: + MKLDNNPriorBoxNode(const std::shared_ptr& op, const mkldnn::engine& eng, MKLDNNWeightsSharing::Ptr &cache); + + void getSupportedDescriptors() override {}; + void initSupportedPrimitiveDescriptors() override; + void createPrimitive() override; + void execute(mkldnn::stream strm) override; + bool created() const override; + + bool needShapeInfer() const override; + std::vector shapeInfer() const override; + bool needPrepareParams() const override; + + void executeDynamicImpl(mkldnn::stream strm) override { execute(strm); } + + static bool isSupportedOperation(const 
std::shared_ptr& op, std::string& errorMessage) noexcept; + +private: + float offset; + float step; + std::vector min_size; + std::vector max_size; + bool flip; + bool clip; + bool scale_all_sizes; + + std::vector fixed_size; + std::vector fixed_ratio; + std::vector density; + + std::vector aspect_ratio; + std::vector variance; + + int number_of_priors; +}; + +} // namespace MKLDNNPlugin diff --git a/src/tests/functional/plugin/cpu/shared_tests_instances/skip_tests_config.cpp b/src/tests/functional/plugin/cpu/shared_tests_instances/skip_tests_config.cpp index 1ff8c21a217..bd8d9b8f37a 100644 --- a/src/tests/functional/plugin/cpu/shared_tests_instances/skip_tests_config.cpp +++ b/src/tests/functional/plugin/cpu/shared_tests_instances/skip_tests_config.cpp @@ -138,7 +138,8 @@ std::vector disabledTestPatterns() { // bad accuracy R"(.*smoke_FakeQuantizeLayerCPUTest_Decompos. *IS=_TS=\(\(4\.5\.6\.7\)\)_RS=\(\(1\.1\.6\.1\)\)_\(\(1\.5\.6\.1\)\)_\(\(1\.1\.1\.1\)\)_\(\(1\.1\.6\.1\)\).*)", - + // Issue: 69222 + R"(.*smoke_PriorBoxClustered.*PriorBoxClusteredLayerCPUTest.*_netPRC=f16_.*)", // Issue: 71121 R"(.*smoke_Proposal*.*TS=\(2.*)", // TODO : CVS-69533 diff --git a/src/tests/functional/plugin/cpu/single_layer_tests/prior_box.cpp b/src/tests/functional/plugin/cpu/single_layer_tests/prior_box.cpp new file mode 100644 index 00000000000..0485dc31505 --- /dev/null +++ b/src/tests/functional/plugin/cpu/single_layer_tests/prior_box.cpp @@ -0,0 +1,229 @@ +// Copyright (C) 2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include +#include +#include + +#include +#include "ngraph_functions/builders.hpp" +#include "shared_test_classes/base/layer_test_utils.hpp" +#include "shared_test_classes/base/ov_subgraph.hpp" +#include "test_utils/cpu_test_utils.hpp" + +using namespace InferenceEngine; +using namespace CPUTestUtils; +using namespace ov::test; + +namespace CPULayerTestsDefinitions { + +using priorBoxSpecificParams = std::tuple< + std::vector, // min_size + 
std::vector, // max_size + std::vector, // aspect_ratio + std::vector, // density + std::vector, // fixed_ratio + std::vector, // fixed_size + bool, // clip + bool, // flip + float, // step + float, // offset + std::vector, // variance + bool>; // scale_all_sizes + +typedef std::tuple< + priorBoxSpecificParams, + ov::test::ElementType, // net precision + ov::test::ElementType, // Input precision + ov::test::ElementType, // Output precision + InferenceEngine::Layout, // Input layout + InferenceEngine::Layout, // Output layout + ov::test::InputShape, // input shape + ov::test::InputShape, // image shape + std::string> priorBoxLayerParams; + +class PriorBoxLayerCPUTest : public testing::WithParamInterface, + virtual public SubgraphBaseTest, public CPUTestsBase { +public: + static std::string getTestCaseName(const testing::TestParamInfo& obj) { + ov::test::ElementType netPrecision; + ov::test::ElementType inPrc, outPrc; + InferenceEngine::Layout inLayout, outLayout; + ov::test::InputShape inputShapes; + ov::test::InputShape imageShapes; + std::string targetDevice; + priorBoxSpecificParams specParams; + std::tie(specParams, + netPrecision, + inPrc, outPrc, inLayout, outLayout, + inputShapes, + imageShapes, + targetDevice) = obj.param; + + ngraph::op::PriorBoxAttrs attributes; + std::tie( + attributes.min_size, + attributes.max_size, + attributes.aspect_ratio, + attributes.density, + attributes.fixed_ratio, + attributes.fixed_size, + attributes.clip, + attributes.flip, + attributes.step, + attributes.offset, + attributes.variance, + attributes.scale_all_sizes) = specParams; + + std::ostringstream result; + const char separator = '_'; + result << "IS=" << inputShapes << separator; + result << "imageS=" << imageShapes << separator; + result << "netPRC=" << netPrecision << separator; + result << "inPRC=" << inPrc << separator; + result << "outPRC=" << outPrc << separator; + result << "inL=" << inLayout << separator; + result << "outL=" << outLayout << separator; + result << 
"min_size=" << CommonTestUtils::vec2str(attributes.min_size) << separator; + result << "max_size=" << CommonTestUtils::vec2str(attributes.max_size)<< separator; + result << "aspect_ratio=" << CommonTestUtils::vec2str(attributes.aspect_ratio)<< separator; + result << "density=" << CommonTestUtils::vec2str(attributes.density)<< separator; + result << "fixed_ratio=" << CommonTestUtils::vec2str(attributes.fixed_ratio)<< separator; + result << "fixed_size=" << CommonTestUtils::vec2str(attributes.fixed_size)<< separator; + result << "variance=" << CommonTestUtils::vec2str(attributes.variance)<< separator; + result << "step=" << attributes.step << separator; + result << "offset=" << attributes.offset << separator; + result << "clip=" << attributes.clip << separator; + result << "flip=" << attributes.flip<< separator; + result << "scale_all_sizes=" << attributes.scale_all_sizes << separator; + result << "trgDev=" << targetDevice; + + return result.str(); + } + +protected: + void SetUp() override { + priorBoxSpecificParams specParams; + + InferenceEngine::Layout inLayout; + InferenceEngine::Layout outLayout; + ov::test::ElementType netPrecision; + ov::test::ElementType inPrc; + ov::test::ElementType outPrc; + ov::test::InputShape inputShapes; + ov::test::InputShape imageShapes; + std::tie(specParams, netPrecision, + inPrc, outPrc, inLayout, outLayout, + inputShapes, imageShapes, targetDevice) = GetParam(); + + selectedType = makeSelectedTypeStr("ref", inPrc); + targetDevice = CommonTestUtils::DEVICE_CPU; + + init_input_shapes({ inputShapes, imageShapes }); + + ngraph::op::PriorBoxAttrs attributes; + std::tie( + attributes.min_size, + attributes.max_size, + attributes.aspect_ratio, + attributes.density, + attributes.fixed_ratio, + attributes.fixed_size, + attributes.clip, + attributes.flip, + attributes.step, + attributes.offset, + attributes.variance, + attributes.scale_all_sizes) = specParams; + + auto params = ngraph::builder::makeDynamicParams(netPrecision, 
inputDynamicShapes); + + auto shape_of_1 = std::make_shared(params[0]); + auto shape_of_2 = std::make_shared(params[1]); + auto priorBox = std::make_shared( + shape_of_1, + shape_of_2, + attributes); + + ngraph::ResultVector results{std::make_shared(priorBox)}; + function = std::make_shared (results, params, "priorBox"); + } +}; + +TEST_P(PriorBoxLayerCPUTest, CompareWithRefs) { + SKIP_IF_CURRENT_TEST_IS_DISABLED() + run(); + CheckPluginRelatedResults(executableNetwork, "PriorBox"); +} + +namespace { +const std::vector netPrecisions = { + ov::test::ElementType::i32, + ov::test::ElementType::u64}; + +const std::vector> min_sizes = {{256.0f}}; + +const std::vector> max_sizes = {{315.0f}}; + +const std::vector> aspect_ratios = {{2.0f}}; + +const std::vector> densities = {{1.0f}}; + +const std::vector> fixed_ratios = {{}}; + +const std::vector> fixed_sizes = {{}}; + +const std::vector clips = {false, true}; + +const std::vector flips = {false, true}; + +const std::vector steps = {1.0f}; + +const std::vector offsets = {0.0f}; + +const std::vector> variances = {{}}; + +const std::vector scale_all_sizes = { false, true}; + +const std::vector inputShape = { + {{300, 300}, {{300, 300}}}, + {{ov::Dimension::dynamic(), ov::Dimension::dynamic()}, {{300, 300}, {150, 150}}}, + {{{150, 300}, {150, 300}}, {{300, 300}, {150, 150}}} +}; + +const std::vector imageShape = { + {{32, 32}, {{32, 32}}}, + {{ov::Dimension::dynamic(), ov::Dimension::dynamic()}, {{32, 32}, {16, 16}}}, + {{{16, 32}, {16, 32}}, {{32, 32}, {16, 16}}} +}; + +const auto layerSpecificParams = ::testing::Combine( + ::testing::ValuesIn(min_sizes), + ::testing::ValuesIn(max_sizes), + ::testing::ValuesIn(aspect_ratios), + ::testing::ValuesIn(densities), + ::testing::ValuesIn(fixed_ratios), + ::testing::ValuesIn(fixed_sizes), + ::testing::ValuesIn(clips), + ::testing::ValuesIn(flips), + ::testing::ValuesIn(steps), + ::testing::ValuesIn(offsets), + ::testing::ValuesIn(variances), + ::testing::ValuesIn(scale_all_sizes)); 
+ +INSTANTIATE_TEST_SUITE_P(smoke_PriorBox, PriorBoxLayerCPUTest, + ::testing::Combine( + layerSpecificParams, + ::testing::ValuesIn(netPrecisions), + ::testing::Values(ov::test::ElementType::undefined), + ::testing::Values(ov::test::ElementType::undefined), + ::testing::Values(InferenceEngine::Layout::ANY), + ::testing::Values(InferenceEngine::Layout::ANY), + ::testing::ValuesIn(inputShape), + ::testing::ValuesIn(imageShape), + ::testing::Values(CommonTestUtils::DEVICE_CPU)), + PriorBoxLayerCPUTest::getTestCaseName); + +} // namespace +} // namespace CPULayerTestsDefinitions \ No newline at end of file diff --git a/src/tests/functional/plugin/cpu/single_layer_tests/prior_box_clustered.cpp b/src/tests/functional/plugin/cpu/single_layer_tests/prior_box_clustered.cpp new file mode 100644 index 00000000000..0e24002250a --- /dev/null +++ b/src/tests/functional/plugin/cpu/single_layer_tests/prior_box_clustered.cpp @@ -0,0 +1,228 @@ +// Copyright (C) 2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include +#include +#include + +#include +#include "ngraph_functions/builders.hpp" +#include "shared_test_classes/base/layer_test_utils.hpp" +#include "shared_test_classes/base/ov_subgraph.hpp" +#include "test_utils/cpu_test_utils.hpp" + +using namespace InferenceEngine; +using namespace CPUTestUtils; +using namespace ov::test; + +namespace CPULayerTestsDefinitions { + +typedef std::tuple< + std::vector, // widths + std::vector, // heights + bool, // clip + float, // step_width + float, // step_height + float, // step + float, // offset + std::vector> priorBoxClusteredSpecificParams; + +typedef std::tuple< + priorBoxClusteredSpecificParams, + ov::test::ElementType, // net precision + ov::test::ElementType, // Input precision + ov::test::ElementType, // Output precision + InferenceEngine::Layout, // Input layout + InferenceEngine::Layout, // Output layout + ov::test::InputShape, // input shape + ov::test::InputShape, // image shape + std::string> 
priorBoxClusteredLayerParams; + +class PriorBoxClusteredLayerCPUTest : public testing::WithParamInterface, + virtual public SubgraphBaseTest, public CPUTestsBase { +public: + static std::string getTestCaseName(const testing::TestParamInfo& obj) { + ov::test::ElementType netPrecision; + ov::test::ElementType inPrc, outPrc; + InferenceEngine::Layout inLayout, outLayout; + ov::test::InputShape inputShapes, imageShapes; + std::string targetDevice; + priorBoxClusteredSpecificParams specParams; + std::tie(specParams, + netPrecision, + inPrc, outPrc, inLayout, outLayout, + inputShapes, + imageShapes, + targetDevice) = obj.param; + + ngraph::op::PriorBoxClusteredAttrs attributes; + std::tie( + attributes.widths, + attributes.heights, + attributes.clip, + attributes.step_widths, + attributes.step_heights, + attributes.step, + attributes.offset, + attributes.variances) = specParams; + + std::ostringstream result; + const char separator = '_'; + + result << "IS=" << inputShapes << separator; + result << "imageS=" << imageShapes << separator; + result << "netPRC=" << netPrecision << separator; + result << "inPRC=" << inPrc << separator; + result << "outPRC=" << outPrc << separator; + result << "inL=" << inLayout << separator; + result << "outL=" << outLayout << separator; + result << "widths=" << CommonTestUtils::vec2str(attributes.widths) << separator; + result << "heights=" << CommonTestUtils::vec2str(attributes.heights) << separator; + result << "variances="; + if (attributes.variances.empty()) + result << "()" << separator; + else + result << CommonTestUtils::vec2str(attributes.variances) << separator; + result << "stepWidth=" << attributes.step_widths << separator; + result << "stepHeight=" << attributes.step_heights << separator; + result << "step=" << attributes.step << separator; + result << "offset=" << attributes.offset << separator; + result << "clip=" << std::boolalpha << attributes.clip << separator; + result << "trgDev=" << targetDevice; + return result.str(); + 
} + +protected: + void SetUp() override { + priorBoxClusteredSpecificParams specParams; + + InferenceEngine::Layout inLayout; + InferenceEngine::Layout outLayout; + ov::test::ElementType netPrecision; + ov::test::ElementType inPrc; + ov::test::ElementType outPrc; + ov::test::InputShape inputShapes; + ov::test::InputShape imageShapes; + std::tie(specParams, netPrecision, + inPrc, outPrc, inLayout, outLayout, + inputShapes, imageShapes, targetDevice) = GetParam(); + + selectedType = makeSelectedTypeStr("ref", inPrc); + targetDevice = CommonTestUtils::DEVICE_CPU; + + init_input_shapes({ inputShapes, imageShapes }); + + ngraph::op::PriorBoxClusteredAttrs attributes; + std::tie( + attributes.widths, + attributes.heights, + attributes.clip, + attributes.step_widths, + attributes.step_heights, + attributes.step, + attributes.offset, + attributes.variances) = specParams; + + auto params = ngraph::builder::makeDynamicParams(netPrecision, { inputShapes.first, imageShapes.first }); + + auto shape_of_1 = std::make_shared(params[0]); + auto shape_of_2 = std::make_shared(params[1]); + auto priorBoxClustered = std::make_shared( + shape_of_1, + shape_of_2, + attributes); + + ngraph::ResultVector results{ std::make_shared(priorBoxClustered) }; + function = std::make_shared(results, params, "priorBoxClustered"); + } +}; + +TEST_P(PriorBoxClusteredLayerCPUTest, CompareWithRefs) { + SKIP_IF_CURRENT_TEST_IS_DISABLED() + run(); + CheckPluginRelatedResults(executableNetwork, "PriorBoxClustered"); +} + +namespace { +// Common params +const std::vector netPrecisions = { + ov::test::ElementType::f32, + ov::test::ElementType::f16 +}; + +const std::vector> widths = { + { 5.12f, 14.6f, 13.5f }, + { 7.0f, 8.2f, 33.39f } +}; + +const std::vector> heights = { + { 15.12f, 15.6f, 23.5f }, + { 10.0f, 16.2f, 36.2f } +}; + +const std::vector step_widths = { + 0.0f, 2.0f +}; + +const std::vector step_heights = { + 0.0f, 1.5f +}; + +const std::vector step = { + 0.0f, 1.0f, 1.5f +}; + +const std::vector 
offsets = { + 0.5f +}; + +const std::vector> variances = { + {0.1f, 0.1f, 0.2f, 0.2f}, + {0.2f}, + {} +}; + +const std::vector clips = { + true, false +}; + +const auto layerSpeficParams = ::testing::Combine( + ::testing::ValuesIn(widths), + ::testing::ValuesIn(heights), + ::testing::ValuesIn(clips), + ::testing::ValuesIn(step_widths), + ::testing::ValuesIn(step_heights), + ::testing::ValuesIn(step), + ::testing::ValuesIn(offsets), + ::testing::ValuesIn(variances) +); + +const std::vector inputShapes = { + {{4, 4}, {{4, 4}}}, + {{ov::Dimension::dynamic(), ov::Dimension::dynamic()}, {{4, 4}, {8, 8}}}, + {{{4, 8}, {4, 8}}, {{4, 4}, {8, 8}}} +}; + +const std::vector imageShapes = { + {{50, 50}, {{50, 50}}}, + {{ov::Dimension::dynamic(), ov::Dimension::dynamic()}, {{50, 50}, {100, 100}}}, + {{{50, 100}, {50, 100}}, {{50, 50}, {100, 100}}} +}; + +INSTANTIATE_TEST_SUITE_P(smoke_PriorBoxClustered, PriorBoxClusteredLayerCPUTest, + ::testing::Combine( + layerSpeficParams, + ::testing::ValuesIn(netPrecisions), + ::testing::Values(ov::test::ElementType::undefined), + ::testing::Values(ov::test::ElementType::undefined), + ::testing::Values(InferenceEngine::Layout::ANY), + ::testing::Values(InferenceEngine::Layout::ANY), + ::testing::ValuesIn(inputShapes), + ::testing::ValuesIn(imageShapes), + ::testing::Values(CommonTestUtils::DEVICE_CPU)), + PriorBoxClusteredLayerCPUTest::getTestCaseName +); + +} // namespace +} // namespace CPULayerTestsDefinitions diff --git a/src/tests/functional/shared_test_classes/src/base/ov_subgraph.cpp b/src/tests/functional/shared_test_classes/src/base/ov_subgraph.cpp index d6cf3cfd997..ce83bf6d650 100644 --- a/src/tests/functional/shared_test_classes/src/base/ov_subgraph.cpp +++ b/src/tests/functional/shared_test_classes/src/base/ov_subgraph.cpp @@ -266,7 +266,13 @@ void SubgraphBaseTest::validate() { void SubgraphBaseTest::init_input_shapes(const std::vector& shapes) { size_t targetStaticShapeSize = shapes.front().second.size(); + for (size_t i = 
1; i < shapes.size(); ++i) { + if (targetStaticShapeSize < shapes[i].second.size()) { + targetStaticShapeSize = shapes[i].second.size(); + } + } targetStaticShapes.resize(targetStaticShapeSize); + for (const auto& shape : shapes) { auto dynShape = shape.first; if (dynShape.rank() == 0) { @@ -274,10 +280,8 @@ void SubgraphBaseTest::init_input_shapes(const std::vector& shapes) dynShape = shape.second.front(); } inputDynamicShapes.push_back(dynShape); - ASSERT_EQ(shape.second.size(), targetStaticShapeSize) - << "Target static count shapes should be the same for all inputs"; - for (size_t i = 0; i < shape.second.size(); ++i) { - targetStaticShapes[i].push_back(shape.second.at(i)); + for (size_t i = 0; i < targetStaticShapeSize; ++i) { + targetStaticShapes[i].push_back(i < shape.second.size() ? shape.second.at(i) : shape.second.back()); } } } diff --git a/src/tests/functional/shared_test_classes/src/single_layer/prior_box.cpp b/src/tests/functional/shared_test_classes/src/single_layer/prior_box.cpp index b901dfd7bfd..16ee9b0d100 100644 --- a/src/tests/functional/shared_test_classes/src/single_layer/prior_box.cpp +++ b/src/tests/functional/shared_test_classes/src/single_layer/prior_box.cpp @@ -59,8 +59,9 @@ void PriorBoxLayerTest::SetUp() { inPrc, outPrc, inLayout, outLayout, inputShapes, imageShapes, targetDevice) = GetParam(); - std::tie(min_size, max_size, aspect_ratio, density, fixed_ratio, fixed_size, - clip, flip, step, offset, variance, scale_all_sizes, + std::tie(min_size, max_size, aspect_ratio, + density, fixed_ratio, fixed_size, clip, + flip, step, offset, variance, scale_all_sizes, min_max_aspect_ratios_order) = specParams; auto ngPrc = FuncTestUtils::PrecisionUtils::convertIE2nGraphPrc(netPrecision); From e52c96389d3f04234e30c7c4397363cb81755413 Mon Sep 17 00:00:00 2001 From: Vladislav Volkov Date: Thu, 30 Dec 2021 18:22:16 +0300 Subject: [PATCH 36/78] [CPU] Bug in jit_convert fixed (#9485) --- .../src/nodes/common/cpu_convert.cpp | 142 ++++++------------ 1 
file changed, 43 insertions(+), 99 deletions(-) diff --git a/src/plugins/intel_cpu/src/nodes/common/cpu_convert.cpp b/src/plugins/intel_cpu/src/nodes/common/cpu_convert.cpp index 31205ad84e3..a374c1e72d8 100644 --- a/src/plugins/intel_cpu/src/nodes/common/cpu_convert.cpp +++ b/src/plugins/intel_cpu/src/nodes/common/cpu_convert.cpp @@ -6,6 +6,7 @@ #include "cpu_memcpy.h" #include #include +#include #include #include #include @@ -17,8 +18,8 @@ using namespace MKLDNNPlugin; using namespace InferenceEngine; -using namespace dnnl::impl::cpu::x64; -using namespace dnnl::impl::utils; +using namespace mkldnn::impl::utils; +using namespace mkldnn::impl::cpu::x64; using namespace Xbyak; namespace { @@ -52,109 +53,51 @@ void convert_vec(jit_generator & gen, gen.movdqu(gen.xword[dst], f16vec); } -class jit_convert_array : public jit_generator { +class jit_convert_array : public jit_kernel { DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_convert_array) void generate() override { - const size_t vlen = 8u; - const size_t vlen_log2 = 3; - - auto reg_src = rax; - auto reg_dst = rbx; - auto reg_sz = rdx; - - Label tail, exit; + constexpr size_t vlen = 8u; + constexpr size_t vlen_log2 = 3; preamble(); - mov(reg_src, ptr[param1 + offsetof(args_t, src)]); - mov(reg_dst, ptr[param1 + offsetof(args_t, out)]); - mov(reg_sz, ptr[param1 + offsetof(args_t, count)]); + // Get arguments addresses + auto src = arg(&args_t::src); + auto dst = arg(&args_t::out); + auto size = arg(&args_t::count); - xor_(rsi, rsi); - mov(r8, reg_sz); - shr(r8, vlen_log2); + size >>= vlen_log2; - foreach(rsi, 1, r8, [&, this](const Xbyak::Reg64& idx) { - _convert_vec(*this, reg_src, reg_dst); - add(reg_src, _src_size * vlen); - add(reg_dst, _dst_size * vlen); + foreach(0, size, [&, this](const Xbyak::Reg64& idx) { + _convert_vec(*this, src, dst); + src += _src_size * vlen; + dst += _dst_size * vlen; }); - L(tail); - - shl(rsi, vlen_log2); - sub(reg_sz, rsi); - test(reg_sz, reg_sz); - jz(exit); - - // allocate array for 8 floats 
on stack - sub(rsp, vlen * sizeof(float)); - mov(r8, rsp); - - vpxor(ymm4, ymm4, ymm4); - vmovups(yword[r8], ymm4); + mov(size, argPtr(&args_t::count)); + size &= vlen - 1; // Tail conversion - copy(r8, reg_src, reg_sz, _src_size); - _convert_vec(*this, r8, r8); - copy(reg_dst, r8, reg_sz, _dst_size); + _if(size != 0) + ._then([&] { + auto tmp = stack(vlen * sizeof(float)); + tmp.clear(); - // Free the array on stack - add(rsp, vlen * sizeof(float)); + auto tail_size = var(); - L(exit); + tail_size = size; + tail_size <<= static_cast(std::logb(_src_size)) - 1; + copy(tmp.pointer(), src, tail_size); - postamble(); - } + _convert_vec(*this, tmp.pointer(), tmp.pointer()); - void foreach(const Xbyak::Reg64& idx, - size_t step, - const Xbyak::Reg64& end, - std::function && fn) { - Label loop, exit; - - L(loop); - cmp(idx, end); - jge(exit); - - fn(idx); - - add(idx, step); - jmp(loop); - L(exit); - } - - void copy(const Xbyak::Reg64& dst, - const Xbyak::Reg64& src, - const Xbyak::Reg64& size, - size_t item_size) { - push(rsi); - push(r15); - - xor_(rsi, rsi); - - auto address_frame = [this](size_t size) -> const AddressFrame& { - switch (size) { - case 1: return byte; - case 2: return word; - case 4: return dword; - case 8: return qword; - default: - break; - } - return ptr; - }; - - const auto & addr_frame = address_frame(item_size); - - foreach(rsi, 1, size, [&, this](const Xbyak::Reg64& idx) { - mov(r15, addr_frame[src + idx * item_size]); - mov(addr_frame[dst + idx * item_size], r15); + tail_size = size; + tail_size <<= static_cast(std::logb(_dst_size)) - 1; + copy(dst, tmp.pointer(), tail_size); }); - pop(r15); - pop(rsi); + postamble(); } public: @@ -179,7 +122,8 @@ public: template static fn_t get() { - if (mayiuse(avx2) && cpu().has(util::Cpu::tF16C)) { + if (mayiuse(cpu_isa_t::avx2) + && dnnl::impl::cpu::x64::cpu().has(Xbyak::util::Cpu::tF16C)) { static jit_convert_array converter(convert_vec, sizeof(src_t), sizeof(dst_t)); auto & generator = 
static_cast(converter); generator.create_kernel(); @@ -216,7 +160,7 @@ struct PrecisionInfo { template <> struct PrecisionInfo { - using value_type = bfloat16_t; + using value_type = MKLDNNPlugin::bfloat16_t; }; template <> @@ -232,7 +176,7 @@ struct PrecisionInfo { template::value - || std::is_same::value, + || std::is_same::value, float, T>::type> struct Range { const std::tuple & fit(const Precision & prec); @@ -250,8 +194,8 @@ const std::tuple & Range::fit(const Precision & prec) { double lbound, ubound; switch (prec) { case Precision::BF16: - lbound = static_cast(std::numeric_limits::lowest()); - ubound = static_cast(std::numeric_limits::max()); + lbound = static_cast(std::numeric_limits::lowest()); + ubound = static_cast(std::numeric_limits::max()); break; case Precision::FP16: lbound = static_cast(std::numeric_limits::lowest()); @@ -366,20 +310,20 @@ struct ConvertPrecision> { }; template<> -struct ConvertPrecision> { +struct ConvertPrecision> { void operator()(ConvertContext & ctx) { auto src = static_cast(ctx.srcPtr); - auto dst = static_cast(ctx.dstPtr); + auto dst = static_cast(ctx.dstPtr); if (ctx.interimPrc.is_float()) { parallel_for(ctx.size, [&](size_t i) { - dst[i] = static_cast(src[i]); + dst[i] = static_cast(src[i]); }); } else { float lbound, ubound; std::tie(lbound, ubound) = ctx.range(); parallel_for(ctx.size, [&](size_t i) { - dst[i] = static_cast(std::trunc(std::max(std::min(src[i], ubound), lbound))); + dst[i] = static_cast(std::trunc(std::max(std::min(src[i], ubound), lbound))); }); } @@ -388,9 +332,9 @@ struct ConvertPrecision> { }; template<> -struct ConvertPrecision> { +struct ConvertPrecision> { void operator()(ConvertContext & ctx) { - auto src = static_cast(ctx.srcPtr); + auto src = static_cast(ctx.srcPtr); auto dst = static_cast(ctx.dstPtr); if (ctx.interimPrc.is_float()) { @@ -399,7 +343,7 @@ struct ConvertPrecision> { }); } else { float lbound, ubound; - std::tie(lbound, ubound) = ctx.range(); + std::tie(lbound, ubound) = 
ctx.range(); parallel_for(ctx.size, [&](size_t i) { dst[i] = std::trunc(std::max(std::min(static_cast(src[i]), ubound), lbound)); }); From 4dbc9ae2e779dd07e3c9e04470bec7bd0a537a7f Mon Sep 17 00:00:00 2001 From: Fedor Zharinov Date: Thu, 30 Dec 2021 19:09:12 +0300 Subject: [PATCH 37/78] benchmark_app with dynamic reshapes and API 2.0 (#8609) * API 2.0 changes * stylefix * Update samples/cpp/benchmark_app/main.cpp Co-authored-by: Nadezhda Ageeva * Update samples/cpp/benchmark_app/infer_request_wrap.hpp Co-authored-by: Ilya Churaev * Update samples/cpp/benchmark_app/utils.cpp Co-authored-by: Ilya Churaev * fixes * fix for: gpu headers are moved to another folder... yet again * fix for mac build paranoia * function,classes and files renames/change logic to work with inputs() * stylefix * 2nd portion of fixes * stylefix * Batch warnings Co-authored-by: Nadezhda Ageeva Co-authored-by: Ilya Churaev --- .../cpp/benchmark_app/infer_request_wrap.hpp | 43 +- samples/cpp/benchmark_app/inputs_filling.cpp | 467 +++++++++--------- samples/cpp/benchmark_app/inputs_filling.hpp | 19 +- samples/cpp/benchmark_app/main.cpp | 250 +++++----- .../benchmark_app/remote_blobs_filling.cpp | 186 ------- .../benchmark_app/remote_tensors_filling.cpp | 162 ++++++ ...filling.hpp => remote_tensors_filling.hpp} | 17 +- .../benchmark_app/shared_blob_allocator.hpp | 43 -- .../benchmark_app/shared_tensor_allocator.hpp | 42 ++ .../cpp/benchmark_app/statistics_report.cpp | 52 +- .../cpp/benchmark_app/statistics_report.hpp | 4 +- samples/cpp/benchmark_app/utils.cpp | 450 ++++++++++++++--- samples/cpp/benchmark_app/utils.hpp | 247 +++------ .../utils/include/samples/args_helper.hpp | 5 + .../common/utils/include/samples/common.hpp | 137 +++-- samples/cpp/common/utils/src/args_helper.cpp | 99 ++++ 16 files changed, 1284 insertions(+), 939 deletions(-) delete mode 100644 samples/cpp/benchmark_app/remote_blobs_filling.cpp create mode 100644 samples/cpp/benchmark_app/remote_tensors_filling.cpp rename 
samples/cpp/benchmark_app/{remote_blobs_filling.hpp => remote_tensors_filling.hpp} (81%) delete mode 100644 samples/cpp/benchmark_app/shared_blob_allocator.hpp create mode 100644 samples/cpp/benchmark_app/shared_tensor_allocator.hpp diff --git a/samples/cpp/benchmark_app/infer_request_wrap.hpp b/samples/cpp/benchmark_app/infer_request_wrap.hpp index 774ef8f2153..a26119cf7af 100644 --- a/samples/cpp/benchmark_app/infer_request_wrap.hpp +++ b/samples/cpp/benchmark_app/infer_request_wrap.hpp @@ -11,14 +11,14 @@ #include #include #include +#include #include #include #include // clang-format off -#include "inference_engine.hpp" -#include "remote_blobs_filling.hpp" +#include "remote_tensors_filling.hpp" #include "statistics_report.hpp" #include "utils.hpp" // clang-format on @@ -33,13 +33,14 @@ public: ~InferReqWrap() = default; - explicit InferReqWrap(InferenceEngine::ExecutableNetwork& net, size_t id, QueueCallbackFunction callbackQueue) - : _request(net.CreateInferRequest()), + explicit InferReqWrap(ov::runtime::CompiledModel& model, size_t id, QueueCallbackFunction callbackQueue) + : _request(model.create_infer_request()), _id(id), _lat_group_id(0), _callbackQueue(callbackQueue), outputClBuffer() { - _request.SetCompletionCallback([&]() { + _request.set_callback([&](const std::exception_ptr& ptr) { + // TODO: Add exception ptr rethrow in proper thread _endTime = Time::now(); _callbackQueue(_id, _lat_group_id, getExecutionTimeInMilliseconds()); }); @@ -47,30 +48,35 @@ public: void startAsync() { _startTime = Time::now(); - _request.StartAsync(); + _request.start_async(); } void wait() { - _request.Wait(InferenceEngine::InferRequest::RESULT_READY); + _request.wait(); } void infer() { _startTime = Time::now(); - _request.Infer(); + _request.infer(); _endTime = Time::now(); _callbackQueue(_id, _lat_group_id, getExecutionTimeInMilliseconds()); } - std::map getPerformanceCounts() { - return _request.GetPerformanceCounts(); + std::vector getPerformanceCounts() { + return 
_request.get_profiling_info(); } - InferenceEngine::Blob::Ptr getBlob(const std::string& name) { - return _request.GetBlob(name); + void setShape(const std::string& name, const ov::Shape& dims) { + // TODO check return status + _request.get_tensor(name).set_shape(dims); } - void setBlob(const std::string& name, const InferenceEngine::Blob::Ptr& data) { - _request.SetBlob(name, data); + ov::runtime::Tensor getTensor(const std::string& name) { + return _request.get_tensor(name); + } + + void setTensor(const std::string& name, const ov::runtime::Tensor& data) { + _request.set_tensor(name, data); } double getExecutionTimeInMilliseconds() const { @@ -90,7 +96,7 @@ public: } private: - InferenceEngine::InferRequest _request; + ov::runtime::InferRequest _request; Time::time_point _startTime; Time::time_point _endTime; size_t _id; @@ -101,13 +107,10 @@ private: class InferRequestsQueue final { public: - InferRequestsQueue(InferenceEngine::ExecutableNetwork& net, - size_t nireq, - size_t lat_group_n, - bool enable_lat_groups) + InferRequestsQueue(ov::runtime::CompiledModel& model, size_t nireq, size_t lat_group_n, bool enable_lat_groups) : enable_lat_groups(enable_lat_groups) { for (size_t id = 0; id < nireq; id++) { - requests.push_back(std::make_shared(net, + requests.push_back(std::make_shared(model, id, std::bind(&InferRequestsQueue::putIdleRequest, this, diff --git a/samples/cpp/benchmark_app/inputs_filling.cpp b/samples/cpp/benchmark_app/inputs_filling.cpp index 93a8e3895ef..1d78f375578 100644 --- a/samples/cpp/benchmark_app/inputs_filling.cpp +++ b/samples/cpp/benchmark_app/inputs_filling.cpp @@ -2,6 +2,8 @@ // SPDX-License-Identifier: Apache-2.0 // +#include "inputs_filling.hpp" + #include #include #include @@ -11,41 +13,9 @@ #include #include -// clang-format off -#include "samples/slog.hpp" #include "format_reader_ptr.h" - -#include "inputs_filling.hpp" -#include "shared_blob_allocator.hpp" +#include "shared_tensor_allocator.hpp" #include "utils.hpp" -// 
clang-format on - -using namespace InferenceEngine; - -#ifdef USE_OPENCV -static const std::vector supported_image_extensions = - {"bmp", "dib", "jpeg", "jpg", "jpe", "jp2", "png", "pbm", "pgm", "ppm", "sr", "ras", "tiff", "tif"}; -#else -static const std::vector supported_image_extensions = {"bmp"}; -#endif -static const std::vector supported_binary_extensions = {"bin"}; - -std::vector filterFilesByExtensions(const std::vector& filePaths, - const std::vector& extensions) { - std::vector filtered; - auto getExtension = [](const std::string& name) { - auto extensionPosition = name.rfind('.', name.size()); - return extensionPosition == std::string::npos ? "" : name.substr(extensionPosition + 1, name.size() - 1); - }; - for (auto& filePath : filePaths) { - auto extension = getExtension(filePath); - std::transform(extension.begin(), extension.end(), extension.begin(), ::tolower); - if (std::find(extensions.begin(), extensions.end(), extension) != extensions.end()) { - filtered.push_back(filePath); - } - } - return filtered; -} template using uniformDistribution = typename std::conditional< @@ -54,20 +24,30 @@ using uniformDistribution = typename std::conditional< typename std::conditional::value, std::uniform_int_distribution, void>::type>::type; template -InferenceEngine::Blob::Ptr createBlobFromImage(const std::vector& files, - size_t inputId, - size_t batchSize, - const benchmark_app::InputInfo& inputInfo, - std::string* filenames_used = nullptr) { - size_t blob_size = - std::accumulate(inputInfo.dataShape.begin(), inputInfo.dataShape.end(), 1, std::multiplies()); - T* data = new T[blob_size]; +ov::runtime::Tensor createTensorFromImage(const std::vector& files, + size_t inputId, + size_t batchSize, + const benchmark_app::InputInfo& inputInfo, + const std::string& inputName, + std::string* filenames_used = nullptr) { + size_t tensor_size = + std::accumulate(inputInfo.dataShape.begin(), inputInfo.dataShape.end(), 1, std::multiplies()); + auto allocator = 
std::make_shared(tensor_size * sizeof(T)); + auto data = reinterpret_cast(allocator->getBuffer()); /** Collect images data ptrs **/ std::vector> vreader; vreader.reserve(batchSize); - for (size_t b = 0; b < batchSize; ++b) { + size_t imgBatchSize = 1; + if (!inputInfo.layout.empty() && ov::layout::has_batch(inputInfo.layout)) { + imgBatchSize = batchSize; + } else { + slog::warn << inputName << ": layout does not contain batch dimension. Assuming bath 1 for this input" + << slog::endl; + } + + for (size_t b = 0; b < imgBatchSize; ++b) { auto inputIndex = (inputId + b) % files.size(); if (filenames_used) { *filenames_used += (filenames_used->empty() ? "" : ", ") + files[inputIndex]; @@ -90,7 +70,7 @@ InferenceEngine::Blob::Ptr createBlobFromImage(const std::vector& f const size_t width = inputInfo.width(); const size_t height = inputInfo.height(); /** Iterate over all input images **/ - for (size_t b = 0; b < batchSize; ++b) { + for (size_t b = 0; b < imgBatchSize; ++b) { /** Iterate over all width **/ for (size_t w = 0; w < width; ++w) { /** Iterate over all height **/ @@ -112,24 +92,30 @@ InferenceEngine::Blob::Ptr createBlobFromImage(const std::vector& f } } - InferenceEngine::TensorDesc tDesc(inputInfo.precision, inputInfo.dataShape, inputInfo.originalLayout); - auto blob = - InferenceEngine::make_shared_blob(tDesc, - std::make_shared>(data, blob_size * sizeof(T))); - blob->allocate(); - return blob; + auto tensor = ov::runtime::Tensor(inputInfo.type, inputInfo.dataShape, ov::runtime::Allocator(allocator)); + return tensor; } template -InferenceEngine::Blob::Ptr createBlobImInfo(const std::pair& image_size, - size_t batchSize, - const benchmark_app::InputInfo& inputInfo) { - size_t blob_size = - std::accumulate(inputInfo.dataShape.begin(), inputInfo.dataShape.end(), 1, std::multiplies()); - T* data = new T[blob_size]; +ov::runtime::Tensor createTensorImInfo(const std::pair& image_size, + size_t batchSize, + const benchmark_app::InputInfo& inputInfo, + const 
std::string& inputName) { + size_t tensor_size = + std::accumulate(inputInfo.dataShape.begin(), inputInfo.dataShape.end(), 1, std::multiplies()); + auto allocator = std::make_shared(tensor_size * sizeof(T)); + auto data = reinterpret_cast(allocator->getBuffer()); - for (size_t b = 0; b < batchSize; b++) { - size_t iminfoSize = blob_size / batchSize; + size_t infoBatchSize = 1; + if (!inputInfo.layout.empty() && ov::layout::has_batch(inputInfo.layout)) { + infoBatchSize = batchSize; + } else { + slog::warn << inputName << ": layout is not set or does not contain batch dimension. Assuming batch 1. " + << slog::endl; + } + + for (size_t b = 0; b < infoBatchSize; b++) { + size_t iminfoSize = tensor_size / infoBatchSize; for (size_t i = 0; i < iminfoSize; i++) { size_t index = b * iminfoSize + i; if (0 == i) @@ -141,35 +127,32 @@ InferenceEngine::Blob::Ptr createBlobImInfo(const std::pair& ima } } - InferenceEngine::TensorDesc tDesc(inputInfo.precision, inputInfo.dataShape, inputInfo.originalLayout); - InferenceEngine::Blob::Ptr blob = - InferenceEngine::make_shared_blob(tDesc, - std::make_shared>(data, blob_size * sizeof(T))); - blob->allocate(); - return blob; + auto tensor = ov::runtime::Tensor(inputInfo.type, inputInfo.dataShape, ov::runtime::Allocator(allocator)); + return tensor; } template -InferenceEngine::Blob::Ptr createBlobFromBinary(const std::vector& files, - size_t inputId, - size_t batchSize, - const benchmark_app::InputInfo& inputInfo, - std::string* filenames_used = nullptr) { - size_t blob_size = - std::accumulate(inputInfo.dataShape.begin(), inputInfo.dataShape.end(), 1, std::multiplies()); - char* data = new char[blob_size * sizeof(T)]; - - // adjust batch size - std::stringstream ss; - ss << inputInfo.originalLayout; - std::string layout = ss.str(); - if (layout.find("N") == std::string::npos) { - batchSize = 1; - } else if (inputInfo.batch() != batchSize) { - batchSize = inputInfo.batch(); +ov::runtime::Tensor createTensorFromBinary(const 
std::vector& files, + size_t inputId, + size_t batchSize, + const benchmark_app::InputInfo& inputInfo, + const std::string& inputName, + std::string* filenames_used = nullptr) { + size_t tensor_size = + std::accumulate(inputInfo.dataShape.begin(), inputInfo.dataShape.end(), 1, std::multiplies()); + auto allocator = std::make_shared(tensor_size * sizeof(T)); + char* data = allocator->getBuffer(); + size_t binaryBatchSize = 1; + if (!inputInfo.layout.empty() && ov::layout::has_batch(inputInfo.layout)) { + binaryBatchSize = batchSize; + } else { + slog::warn << inputName + << ": layout is not set or does not contain batch dimension. Assuming that binary " + "data read from file contains data for all batches." + << slog::endl; } - for (size_t b = 0; b < batchSize; ++b) { + for (size_t b = 0; b < binaryBatchSize; ++b) { size_t inputIndex = (inputId + b) % files.size(); std::ifstream binaryFile(files[inputIndex], std::ios_base::binary | std::ios_base::ate); if (!binaryFile) { @@ -181,7 +164,7 @@ InferenceEngine::Blob::Ptr createBlobFromBinary(const std::vector& if (!binaryFile.good()) { IE_THROW() << "Can not read " << files[inputIndex]; } - auto inputSize = blob_size * sizeof(T) / batchSize; + auto inputSize = tensor_size * sizeof(T) / binaryBatchSize; if (fileSize != inputSize) { IE_THROW() << "File " << files[inputIndex] << " contains " << std::to_string(fileSize) << " bytes " @@ -193,7 +176,7 @@ InferenceEngine::Blob::Ptr createBlobFromBinary(const std::vector& binaryFile.read(&data[b * inputSize], inputSize); } else { for (int i = 0; i < inputInfo.channels(); i++) { - binaryFile.read(&data[(i * batchSize + b) * sizeof(T)], sizeof(T)); + binaryFile.read(&data[(i * binaryBatchSize + b) * sizeof(T)], sizeof(T)); } } @@ -202,128 +185,171 @@ InferenceEngine::Blob::Ptr createBlobFromBinary(const std::vector& } } - InferenceEngine::TensorDesc tDesc(inputInfo.precision, inputInfo.dataShape, inputInfo.originalLayout); - InferenceEngine::Blob::Ptr blob = - 
InferenceEngine::make_shared_blob(tDesc, - std::make_shared>((T*)data, blob_size * sizeof(T))); - blob->allocate(); - return blob; + auto tensor = ov::runtime::Tensor(inputInfo.type, inputInfo.dataShape, ov::runtime::Allocator(allocator)); + return tensor; } template -InferenceEngine::Blob::Ptr createBlobRandom(const benchmark_app::InputInfo& inputInfo, - T rand_min = std::numeric_limits::min(), - T rand_max = std::numeric_limits::max()) { - size_t blob_size = - std::accumulate(inputInfo.dataShape.begin(), inputInfo.dataShape.end(), 1, std::multiplies()); - T* data = new T[blob_size]; +ov::runtime::Tensor createTensorRandom(const benchmark_app::InputInfo& inputInfo, + T rand_min = std::numeric_limits::min(), + T rand_max = std::numeric_limits::max()) { + size_t tensor_size = + std::accumulate(inputInfo.dataShape.begin(), inputInfo.dataShape.end(), 1, std::multiplies()); + auto allocator = std::make_shared(tensor_size * sizeof(T)); + auto data = reinterpret_cast(allocator->getBuffer()); std::mt19937 gen(0); uniformDistribution distribution(rand_min, rand_max); - for (size_t i = 0; i < blob_size; i++) { + for (size_t i = 0; i < tensor_size; i++) { data[i] = static_cast(distribution(gen)); } - InferenceEngine::TensorDesc tDesc(inputInfo.precision, inputInfo.dataShape, inputInfo.originalLayout); - InferenceEngine::Blob::Ptr blob = - InferenceEngine::make_shared_blob(tDesc, - std::make_shared>(data, blob_size * sizeof(T))); - blob->allocate(); - return blob; + auto tensor = ov::runtime::Tensor(inputInfo.type, inputInfo.dataShape, ov::runtime::Allocator(allocator)); + return tensor; } -InferenceEngine::Blob::Ptr getImageBlob(const std::vector& files, - size_t inputId, - size_t batchSize, - const std::pair& inputInfo, - std::string* filenames_used = nullptr) { - auto precision = inputInfo.second.precision; - if (precision == InferenceEngine::Precision::FP32) { - return createBlobFromImage(files, inputId, batchSize, inputInfo.second, filenames_used); - } else if (precision 
== InferenceEngine::Precision::FP16) { - return createBlobFromImage(files, inputId, batchSize, inputInfo.second, filenames_used); - } else if (precision == InferenceEngine::Precision::I32) { - return createBlobFromImage(files, inputId, batchSize, inputInfo.second, filenames_used); - } else if (precision == InferenceEngine::Precision::I64) { - return createBlobFromImage(files, inputId, batchSize, inputInfo.second, filenames_used); - } else if (precision == InferenceEngine::Precision::U8) { - return createBlobFromImage(files, inputId, batchSize, inputInfo.second, filenames_used); +ov::runtime::Tensor getImageTensor(const std::vector& files, + size_t inputId, + size_t batchSize, + const std::pair& inputInfo, + std::string* filenames_used = nullptr) { + auto type = inputInfo.second.type; + if (type == ov::element::f32) { + return createTensorFromImage(files, + inputId, + batchSize, + inputInfo.second, + inputInfo.first, + filenames_used); + } else if (type == ov::element::f16) { + return createTensorFromImage(files, + inputId, + batchSize, + inputInfo.second, + inputInfo.first, + filenames_used); + } else if (type == ov::element::i32) { + return createTensorFromImage(files, + inputId, + batchSize, + inputInfo.second, + inputInfo.first, + filenames_used); + } else if (type == ov::element::i64) { + return createTensorFromImage(files, + inputId, + batchSize, + inputInfo.second, + inputInfo.first, + filenames_used); + } else if (type == ov::element::u8) { + return createTensorFromImage(files, + inputId, + batchSize, + inputInfo.second, + inputInfo.first, + filenames_used); } else { - IE_THROW() << "Input precision is not supported for " << inputInfo.first; + IE_THROW() << "Input type is not supported for " << inputInfo.first; } } -InferenceEngine::Blob::Ptr getImInfoBlob(const std::pair& image_size, - size_t batchSize, - const std::pair& inputInfo) { - auto precision = inputInfo.second.precision; - if (precision == InferenceEngine::Precision::FP32) { - return 
createBlobImInfo(image_size, batchSize, inputInfo.second); - } else if (precision == InferenceEngine::Precision::FP16) { - return createBlobImInfo(image_size, batchSize, inputInfo.second); - } else if (precision == InferenceEngine::Precision::I32) { - return createBlobImInfo(image_size, batchSize, inputInfo.second); - } else if (precision == InferenceEngine::Precision::I64) { - return createBlobImInfo(image_size, batchSize, inputInfo.second); +ov::runtime::Tensor getImInfoTensor(const std::pair& image_size, + size_t batchSize, + const std::pair& inputInfo) { + auto type = inputInfo.second.type; + if (type == ov::element::f32) { + return createTensorImInfo(image_size, batchSize, inputInfo.second, inputInfo.first); + } else if (type == ov::element::f16) { + return createTensorImInfo(image_size, batchSize, inputInfo.second, inputInfo.first); + } else if (type == ov::element::i32) { + return createTensorImInfo(image_size, batchSize, inputInfo.second, inputInfo.first); + } else if (type == ov::element::i64) { + return createTensorImInfo(image_size, batchSize, inputInfo.second, inputInfo.first); } else { - IE_THROW() << "Input precision is not supported for " << inputInfo.first; + IE_THROW() << "Input type is not supported for " << inputInfo.first; } } -InferenceEngine::Blob::Ptr getBinaryBlob(const std::vector& files, - size_t inputId, - size_t batchSize, - const std::pair& inputInfo, - std::string* filenames_used = nullptr) { - auto precision = inputInfo.second.precision; - if (precision == InferenceEngine::Precision::FP32) { - return createBlobFromBinary(files, inputId, batchSize, inputInfo.second, filenames_used); - } else if (precision == InferenceEngine::Precision::FP16) { - return createBlobFromBinary(files, inputId, batchSize, inputInfo.second, filenames_used); - } else if (precision == InferenceEngine::Precision::I32) { - return createBlobFromBinary(files, inputId, batchSize, inputInfo.second, filenames_used); - } else if (precision == 
InferenceEngine::Precision::I64) { - return createBlobFromBinary(files, inputId, batchSize, inputInfo.second, filenames_used); - } else if ((precision == InferenceEngine::Precision::U8) || (precision == InferenceEngine::Precision::BOOL)) { - return createBlobFromBinary(files, inputId, batchSize, inputInfo.second, filenames_used); +ov::runtime::Tensor getBinaryTensor(const std::vector& files, + size_t inputId, + size_t batchSize, + const std::pair& inputInfo, + std::string* filenames_used = nullptr) { + const auto& type = inputInfo.second.type; + if (type == ov::element::f32) { + return createTensorFromBinary(files, + inputId, + batchSize, + inputInfo.second, + inputInfo.first, + filenames_used); + } else if (type == ov::element::f16) { + return createTensorFromBinary(files, + inputId, + batchSize, + inputInfo.second, + inputInfo.first, + filenames_used); + } else if (type == ov::element::i32) { + return createTensorFromBinary(files, + inputId, + batchSize, + inputInfo.second, + inputInfo.first, + filenames_used); + } else if (type == ov::element::i64) { + return createTensorFromBinary(files, + inputId, + batchSize, + inputInfo.second, + inputInfo.first, + filenames_used); + } else if ((type == ov::element::u8) || (type == ov::element::boolean)) { + return createTensorFromBinary(files, + inputId, + batchSize, + inputInfo.second, + inputInfo.first, + filenames_used); } else { - IE_THROW() << "Input precision is not supported for " << inputInfo.first; + IE_THROW() << "Input type is not supported for " << inputInfo.first; } } -InferenceEngine::Blob::Ptr getRandomBlob(const std::pair& inputInfo) { - auto precision = inputInfo.second.precision; - if (precision == InferenceEngine::Precision::FP32) { - return createBlobRandom(inputInfo.second); - } else if (precision == InferenceEngine::Precision::FP16) { - return createBlobRandom(inputInfo.second); - } else if (precision == InferenceEngine::Precision::I32) { - return createBlobRandom(inputInfo.second); - } else if 
(precision == InferenceEngine::Precision::I64) { - return createBlobRandom(inputInfo.second); - } else if (precision == InferenceEngine::Precision::U8) { +ov::runtime::Tensor getRandomTensor(const std::pair& inputInfo) { + auto type = inputInfo.second.type; + if (type == ov::element::f32) { + return createTensorRandom(inputInfo.second); + } else if (type == ov::element::f16) { + return createTensorRandom(inputInfo.second); + } else if (type == ov::element::i32) { + return createTensorRandom(inputInfo.second); + } else if (type == ov::element::i64) { + return createTensorRandom(inputInfo.second); + } else if (type == ov::element::u8) { // uniform_int_distribution is not allowed in the C++17 // standard and vs2017/19 - return createBlobRandom(inputInfo.second); - } else if (precision == InferenceEngine::Precision::I8) { + return createTensorRandom(inputInfo.second); + } else if (type == ov::element::i8) { // uniform_int_distribution is not allowed in the C++17 standard // and vs2017/19 - return createBlobRandom(inputInfo.second); - } else if (precision == InferenceEngine::Precision::U16) { - return createBlobRandom(inputInfo.second); - } else if (precision == InferenceEngine::Precision::I16) { - return createBlobRandom(inputInfo.second); - } else if (precision == InferenceEngine::Precision::BOOL) { - return createBlobRandom(inputInfo.second, 0, 1); + return createTensorRandom(inputInfo.second); + } else if (type == ov::element::u16) { + return createTensorRandom(inputInfo.second); + } else if (type == ov::element::i16) { + return createTensorRandom(inputInfo.second); + } else if (type == ov::element::boolean) { + return createTensorRandom(inputInfo.second, 0, 1); } else { - IE_THROW() << "Input precision is not supported for " << inputInfo.first; + IE_THROW() << "Input type is not supported for " << inputInfo.first; } } std::string getTestInfoStreamHeader(benchmark_app::InputInfo& inputInfo) { std::stringstream strOut; - strOut << "(" << inputInfo.layout << ", " << 
inputInfo.precision << ", " << getShapeString(inputInfo.dataShape) - << ", "; + strOut << "(" << inputInfo.layout.to_string() << ", " << inputInfo.type.get_type_name() << ", " + << getShapeString(inputInfo.dataShape) << ", "; if (inputInfo.partialShape.is_dynamic()) { strOut << std::string("dyn:") << inputInfo.partialShape << "):\t"; } else { @@ -332,16 +358,15 @@ std::string getTestInfoStreamHeader(benchmark_app::InputInfo& inputInfo) { return strOut.str(); } -std::map> getBlobs( - std::map>& inputFiles, - std::vector& app_inputs_info) { - std::map> blobs; +std::map getTensors(std::map> inputFiles, + std::vector& app_inputs_info) { + std::map tensors; if (app_inputs_info.empty()) { throw std::logic_error("Inputs Info for network is empty!"); } if (!inputFiles.empty() && inputFiles.size() != app_inputs_info[0].size()) { - throw std::logic_error("Number of inputs specified in -i must be equal number of network inputs!"); + throw std::logic_error("Number of inputs specified in -i must be equal to number of network inputs!"); } // count image type inputs of network @@ -378,7 +403,7 @@ std::map> getBlobs( } if (files.second.empty()) { - slog::warn << "No suitable files for input found! Random data will be used for input " << input_name + slog::warn << "No suitable files for input were found! Random data will be used for input " << input_name << slog::endl; files.second = {"random"}; } @@ -438,30 +463,30 @@ std::map> getBlobs( size_t inputId = m_file % files.second.size(); auto input_info = app_inputs_info[n_shape % app_inputs_info.size()].at(input_name); - std::string blob_src_info; + std::string tensor_src_info; if (files.second[0] == "random") { // Fill random - blob_src_info = + tensor_src_info = "random (" + std::string((input_info.isImage() ? 
"image" : "binary data")) + " is expected)"; - blobs[input_name].push_back(getRandomBlob({input_name, input_info})); + tensors[input_name].push_back(getRandomTensor({input_name, input_info})); } else if (files.second[0] == "image_info") { // Most likely it is image info: fill with image information auto image_size = net_input_im_sizes.at(n_shape % app_inputs_info.size()); - blob_src_info = - "Image size blob " + std::to_string(image_size.first) + " x " + std::to_string(image_size.second); - blobs[input_name].push_back(getImInfoBlob(image_size, batchSize, {input_name, input_info})); + tensor_src_info = + "Image size tensor " + std::to_string(image_size.first) + " x " + std::to_string(image_size.second); + tensors[input_name].push_back(getImInfoTensor(image_size, batchSize, {input_name, input_info})); } else if (input_info.isImage()) { // Fill with Images - blobs[input_name].push_back( - getImageBlob(files.second, inputId, batchSize, {input_name, input_info}, &blob_src_info)); + tensors[input_name].push_back( + getImageTensor(files.second, inputId, batchSize, {input_name, input_info}, &tensor_src_info)); } else { // Fill with binary files - blobs[input_name].push_back( - getBinaryBlob(files.second, inputId, batchSize, {input_name, input_info}, &blob_src_info)); + tensors[input_name].push_back( + getBinaryTensor(files.second, inputId, batchSize, {input_name, input_info}, &tensor_src_info)); } // Preparing info - std::string strOut = getTestInfoStreamHeader(input_info) + blob_src_info; + std::string strOut = getTestInfoStreamHeader(input_info) + tensor_src_info; if (n_shape >= logOutput.size()) { logOutput.resize(n_shape + 1); } @@ -486,19 +511,18 @@ std::map> getBlobs( } } - return blobs; + return tensors; } -std::map> getBlobsStaticCase( - const std::vector& inputFiles, - const size_t& batchSize, - benchmark_app::InputsInfo& app_inputs_info, - size_t requestsNum) { - std::map> blobs; +std::map getTensorsStaticCase(const std::vector& inputFiles, + const size_t& 
batchSize, + benchmark_app::InputsInfo& app_inputs_info, + size_t requestsNum) { + std::map blobs; std::vector> net_input_im_sizes; for (auto& item : app_inputs_info) { - if (item.second.isImage()) { + if (item.second.partialShape.is_static() && item.second.isImage()) { net_input_im_sizes.push_back(std::make_pair(item.second.width(), item.second.height())); } } @@ -606,8 +630,11 @@ std::map> getBlobsStaticCas if (input_info.isImage()) { if (!imageFiles.empty()) { // Fill with Images - blobs[input_name].push_back( - getImageBlob(files.second, imageInputId, batchSize, {input_name, input_info}, &blob_src_info)); + blobs[input_name].push_back(getImageTensor(files.second, + imageInputId, + batchSize, + {input_name, input_info}, + &blob_src_info)); imageInputId = (imageInputId + batchSize) % files.second.size(); logOutput[i][input_name] += getTestInfoStreamHeader(input_info) + blob_src_info; continue; @@ -615,11 +642,11 @@ std::map> getBlobsStaticCas } else { if (!binaryFiles.empty()) { // Fill with binary files - blobs[input_name].push_back(getBinaryBlob(files.second, - binaryInputId, - batchSize, - {input_name, input_info}, - &blob_src_info)); + blobs[input_name].push_back(getBinaryTensor(files.second, + binaryInputId, + batchSize, + {input_name, input_info}, + &blob_src_info)); binaryInputId = (binaryInputId + batchSize) % files.second.size(); logOutput[i][input_name] += getTestInfoStreamHeader(input_info) + blob_src_info; continue; @@ -629,7 +656,7 @@ std::map> getBlobsStaticCas auto image_size = net_input_im_sizes.at(0); blob_src_info = "Image size blob " + std::to_string(image_size.first) + " x " + std::to_string(image_size.second); - blobs[input_name].push_back(getImInfoBlob(image_size, batchSize, {input_name, input_info})); + blobs[input_name].push_back(getImInfoTensor(image_size, batchSize, {input_name, input_info})); logOutput[i][input_name] += getTestInfoStreamHeader(input_info) + blob_src_info; continue; } @@ -637,7 +664,7 @@ std::map> getBlobsStaticCas // 
Fill random blob_src_info = "random (" + std::string((input_info.isImage() ? "image" : "binary data")) + " is expected)"; - blobs[input_name].push_back(getRandomBlob({input_name, input_info})); + blobs[input_name].push_back(getRandomTensor({input_name, input_info})); logOutput[i][input_name] += getTestInfoStreamHeader(input_info) + blob_src_info; } } @@ -659,31 +686,11 @@ std::map> getBlobsStaticCas return blobs; } -void copyBlobData(InferenceEngine::Blob::Ptr& dst, const InferenceEngine::Blob::Ptr& src) { - if (src->getTensorDesc() != dst->getTensorDesc()) { +void copyTensorData(ov::runtime::Tensor& dst, const ov::runtime::Tensor& src) { + if (src.get_shape() != dst.get_shape() || src.get_byte_size() != dst.get_byte_size()) { throw std::runtime_error( - "Source and destination blobs tensor descriptions are expected to be equal for data copying."); + "Source and destination tensors shapes and byte sizes are expected to be equal for data copying."); } - InferenceEngine::MemoryBlob::Ptr srcMinput = as(src); - if (!srcMinput) { - IE_THROW() << "We expect source blob to be inherited from MemoryBlob in " - "fillBlobImage, " - << "but by fact we were not able to cast source blob to MemoryBlob"; - } - // locked memory holder should be alive all time while access to its buffer - // happens - auto srcMinputHolder = srcMinput->wmap(); - auto srcBlobData = srcMinputHolder.as(); - - InferenceEngine::MemoryBlob::Ptr dstMinput = as(dst); - if (!dstMinput) { - IE_THROW() << "We expect destination blob to be inherited from MemoryBlob in " - "fillBlobImage, " - << "but by fact we were not able to cast destination blob to MemoryBlob"; - } - auto dstMinputHolder = dstMinput->wmap(); - auto dstBlobData = dstMinputHolder.as(); - - std::memcpy(dstBlobData, srcBlobData, src->byteSize()); + memcpy(dst.data(), src.data(), src.get_byte_size()); } diff --git a/samples/cpp/benchmark_app/inputs_filling.hpp b/samples/cpp/benchmark_app/inputs_filling.hpp index fdbcce07fb3..07251cb16b2 100644 --- 
a/samples/cpp/benchmark_app/inputs_filling.hpp +++ b/samples/cpp/benchmark_app/inputs_filling.hpp @@ -4,24 +4,21 @@ #pragma once +#include #include #include // clang-format off -#include "inference_engine.hpp" - #include "infer_request_wrap.hpp" #include "utils.hpp" // clang-format on -std::map> getBlobs( - std::map>& inputFiles, - std::vector& app_inputs_info); +std::map getTensors(std::map> inputFiles, + std::vector& app_inputs_info); -std::map> getBlobsStaticCase( - const std::vector& inputFiles, - const size_t& batchSize, - benchmark_app::InputsInfo& app_inputs_info, - size_t requestsNum); +std::map getTensorsStaticCase(const std::vector& inputFiles, + const size_t& batchSize, + benchmark_app::InputsInfo& app_inputs_info, + size_t requestsNum); -void copyBlobData(InferenceEngine::Blob::Ptr& dst, const InferenceEngine::Blob::Ptr& src); +void copyTensorData(ov::runtime::Tensor& dst, const ov::runtime::Tensor& src); diff --git a/samples/cpp/benchmark_app/main.cpp b/samples/cpp/benchmark_app/main.cpp index 0debab7af1b..68f138f2aa5 100644 --- a/samples/cpp/benchmark_app/main.cpp +++ b/samples/cpp/benchmark_app/main.cpp @@ -11,7 +11,7 @@ #include // clang-format off -#include "inference_engine.hpp" +#include "openvino/pass/serialize.hpp" #include "gna/gna_config.hpp" #include "gpu/gpu_config.hpp" @@ -25,13 +25,11 @@ #include "infer_request_wrap.hpp" #include "inputs_filling.hpp" #include "progress_bar.hpp" -#include "remote_blobs_filling.hpp" +#include "remote_tensors_filling.hpp" #include "statistics_report.hpp" #include "utils.hpp" // clang-format on -using namespace InferenceEngine; - static const size_t progressBarDefaultTotalCount = 1000; bool ParseAndCheckCommandLine(int argc, char* argv[]) { @@ -114,7 +112,7 @@ static void next_step(const std::string additional_info = "") { int main(int argc, char* argv[]) { std::shared_ptr statistics; try { - ExecutableNetwork exeNetwork; + ov::runtime::CompiledModel compiledModel; // ----------------- 1. 
Parsing and validating input arguments // ------------------------------------------------- @@ -172,13 +170,13 @@ int main(int argc, char* argv[]) { // ----------------------------------------------------------- next_step(); - Core ie; + ov::runtime::Core core; if (FLAGS_d.find("CPU") != std::string::npos && !FLAGS_l.empty()) { // CPU (MKLDNN) extensions is loaded as a shared library and passed as a // pointer to base extension const auto extension_ptr = std::make_shared(FLAGS_l); - ie.AddExtension(extension_ptr); + core.add_extension(extension_ptr); slog::info << "CPU (MKLDNN) extensions is loaded " << FLAGS_l << slog::endl; } @@ -191,13 +189,13 @@ int main(int argc, char* argv[]) { } if (config.count("GPU") && config.at("GPU").count(CONFIG_KEY(CONFIG_FILE))) { auto ext = config.at("GPU").at(CONFIG_KEY(CONFIG_FILE)); - ie.SetConfig({{CONFIG_KEY(CONFIG_FILE), ext}}, "GPU"); + core.set_config({{CONFIG_KEY(CONFIG_FILE), ext}}, "GPU"); slog::info << "GPU extensions is loaded " << ext << slog::endl; } - slog::info << "InferenceEngine: " << GetInferenceEngineVersion() << slog::endl; + slog::info << "OpenVINO: " << ov::get_openvino_version() << slog::endl; slog::info << "Device info: " << slog::endl; - slog::info << ie.GetVersions(device_name) << slog::endl; + slog::info << core.get_versions(device_name) << slog::endl; // ----------------- 3. Setting device configuration // ----------------------------------------------------------- @@ -269,7 +267,7 @@ int main(int argc, char* argv[]) { if (device_nstreams.count(device)) { // set to user defined value std::vector supported_config_keys = - ie.GetMetric(device, METRIC_KEY(SUPPORTED_CONFIG_KEYS)); + core.get_metric(device, METRIC_KEY(SUPPORTED_CONFIG_KEYS)); if (std::find(supported_config_keys.begin(), supported_config_keys.end(), key) == supported_config_keys.end()) { throw std::logic_error("Device " + device + " doesn't support config key '" + key + "'! 
" + @@ -342,7 +340,7 @@ int main(int argc, char* argv[]) { device_config[GNA_CONFIG_KEY(LIB_N_THREADS)] = std::to_string(FLAGS_nthreads); } else { std::vector supported_config_keys = - ie.GetMetric(device, METRIC_KEY(SUPPORTED_CONFIG_KEYS)); + core.get_metric(device, METRIC_KEY(SUPPORTED_CONFIG_KEYS)); auto supported = [&](const std::string& key) { return std::find(std::begin(supported_config_keys), std::end(supported_config_keys), key) != std::end(supported_config_keys); @@ -360,21 +358,22 @@ int main(int argc, char* argv[]) { } for (auto&& item : config) { - ie.SetConfig(item.second, item.first); + core.set_config(item.second, item.first); } size_t batchSize = FLAGS_b; - Precision precision = Precision::UNSPECIFIED; + ov::element::Type type = ov::element::undefined; std::string topology_name = ""; std::vector app_inputs_info; std::string output_name; // Takes priority over config from file if (!FLAGS_cache_dir.empty()) { - ie.SetConfig({{CONFIG_KEY(CACHE_DIR), FLAGS_cache_dir}}); + core.set_config({{CONFIG_KEY(CACHE_DIR), FLAGS_cache_dir}}); } bool isDynamicNetwork = false; + if (FLAGS_load_from_file && !isNetworkCompiled) { next_step(); slog::info << "Skipping the step for loading network from file" << slog::endl; @@ -383,22 +382,24 @@ int main(int argc, char* argv[]) { next_step(); slog::info << "Skipping the step for loading network from file" << slog::endl; auto startTime = Time::now(); - exeNetwork = ie.LoadNetwork(FLAGS_m, device_name); + compiledModel = core.compile_model(FLAGS_m, device_name); auto duration_ms = double_to_string(get_duration_ms_till_now(startTime)); slog::info << "Load network took " << duration_ms << " ms" << slog::endl; if (statistics) statistics->addParameters(StatisticsReport::Category::EXECUTION_RESULTS, {{"load network time (ms)", duration_ms}}); - app_inputs_info = getInputsInfo(FLAGS_shape, - FLAGS_layout, - batchSize, - FLAGS_data_shape, - FLAGS_iscale, - FLAGS_imean, - exeNetwork.GetInputsInfo()); + app_inputs_info = 
getInputsInfo(FLAGS_shape, + FLAGS_layout, + batchSize, + FLAGS_data_shape, + inputFiles, + FLAGS_iscale, + FLAGS_imean, + compiledModel.inputs()); if (batchSize == 0) { batchSize = 1; } + } else if (!isNetworkCompiled) { // ----------------- 4. Reading the Intermediate Representation network // ---------------------------------------- @@ -407,14 +408,14 @@ int main(int argc, char* argv[]) { slog::info << "Loading network files" << slog::endl; auto startTime = Time::now(); - CNNNetwork cnnNetwork = ie.ReadNetwork(FLAGS_m); + auto model = core.read_model(FLAGS_m); auto duration_ms = double_to_string(get_duration_ms_till_now(startTime)); slog::info << "Read network took " << duration_ms << " ms" << slog::endl; if (statistics) statistics->addParameters(StatisticsReport::Category::EXECUTION_RESULTS, {{"read network time (ms)", duration_ms}}); - const InputsDataMap inputInfo(cnnNetwork.getInputsInfo()); + const auto& inputInfo = std::const_pointer_cast(model)->inputs(); if (inputInfo.empty()) { throw std::logic_error("no inputs info is provided"); } @@ -424,28 +425,56 @@ int main(int argc, char* argv[]) { next_step(); // Parse input shapes if specified bool reshape = false; - app_inputs_info = getInputsInfo(FLAGS_shape, - FLAGS_layout, - FLAGS_b, - FLAGS_data_shape, - FLAGS_iscale, - FLAGS_imean, - inputInfo, - reshape); + app_inputs_info = getInputsInfo(FLAGS_shape, + FLAGS_layout, + FLAGS_b, + FLAGS_data_shape, + inputFiles, + FLAGS_iscale, + FLAGS_imean, + inputInfo, + reshape); if (reshape) { benchmark_app::PartialShapes shapes = {}; for (auto& item : app_inputs_info[0]) shapes[item.first] = item.second.partialShape; slog::info << "Reshaping network: " << getShapesString(shapes) << slog::endl; startTime = Time::now(); - cnnNetwork.reshape(shapes); + model->reshape(shapes); duration_ms = double_to_string(get_duration_ms_till_now(startTime)); slog::info << "Reshape network took " << duration_ms << " ms" << slog::endl; if (statistics) 
statistics->addParameters(StatisticsReport::Category::EXECUTION_RESULTS, {{"reshape network time (ms)", duration_ms}}); } - topology_name = cnnNetwork.getName(); + + // ----------------- 6. Configuring inputs and outputs + // ---------------------------------------------------------------------- + next_step(); + auto preproc = ov::preprocess::PrePostProcessor(model); + + processPrecision(*model, FLAGS_ip, FLAGS_op, FLAGS_iop); + for (auto& item : model->inputs()) { + // if precision for input set by user, then set it to app_inputs + const auto& name = item.get_any_name(); + if (!FLAGS_ip.empty() || FLAGS_iop.find(name) != std::string::npos) { + for (auto& info : app_inputs_info) { + info.at(name).type = item.get_element_type(); + } + } else if (app_inputs_info[0].at(name).isImage()) { + // image input, set U8 + for (auto& info : app_inputs_info) { + info.at(name).type = ov::element::u8; + } + } + auto& in = preproc.input(name); + in.tensor().set_element_type(app_inputs_info[0].at(name).type); + + // Explicitly set inputs layout. + in.model().set_layout(app_inputs_info[0].at(name).layout); + } + + model = preproc.build(); // Check if network has dynamic shapes auto input_info = app_inputs_info[0]; @@ -455,37 +484,22 @@ int main(int argc, char* argv[]) { return i.second.partialShape.is_dynamic(); }); + topology_name = model->get_friendly_name(); // use batch size according to provided layout and shapes (static case) - if (batchSize == 0 || !isDynamicNetwork) { - batchSize = (!FLAGS_layout.empty()) ? getBatchSize(app_inputs_info[0]) : cnnNetwork.getBatchSize(); + if (!isDynamicNetwork) { + batchSize = getModelInputBatchSize(*model); + + slog::info << "Network batch size: " << batchSize << slog::endl; + } else if (batchSize == 0) { + batchSize = 1; } - slog::info << (batchSize != 0 ? "Network batch size was changed to: " : "Network batch size: ") << batchSize - << slog::endl; - - // ----------------- 6. 
Configuring inputs and outputs - // ---------------------------------------------------------------------- - next_step(); - - processPrecision(cnnNetwork, FLAGS_ip, FLAGS_op, FLAGS_iop); - for (auto& item : cnnNetwork.getInputsInfo()) { - // if precision for input set by user, then set it to app_inputs - // if it an image, set U8 - if (!FLAGS_ip.empty() || FLAGS_iop.find(item.first) != std::string::npos || - item.second->getPartialShape().is_dynamic()) { - app_inputs_info[0].at(item.first).precision = item.second->getPrecision(); - } else if (app_inputs_info[0].at(item.first).isImage()) { - app_inputs_info[0].at(item.first).precision = Precision::U8; - item.second->setPrecision(app_inputs_info[0].at(item.first).precision); - } - } - - printInputAndOutputsInfo(cnnNetwork); + printInputAndOutputsInfoShort(*model); // ----------------- 7. Loading the model to the device // -------------------------------------------------------- next_step(); startTime = Time::now(); - exeNetwork = ie.LoadNetwork(cnnNetwork, device_name); + compiledModel = core.compile_model(model, device_name); duration_ms = double_to_string(get_duration_ms_till_now(startTime)); slog::info << "Load network took " << duration_ms << " ms" << slog::endl; if (statistics) @@ -502,19 +516,21 @@ int main(int argc, char* argv[]) { // -------------------------------------------------------- next_step(); auto startTime = Time::now(); - exeNetwork = ie.ImportNetwork(FLAGS_m, device_name, {}); + compiledModel = core.compile_model(FLAGS_m, device_name, {}); auto duration_ms = double_to_string(get_duration_ms_till_now(startTime)); slog::info << "Import network took " << duration_ms << " ms" << slog::endl; if (statistics) statistics->addParameters(StatisticsReport::Category::EXECUTION_RESULTS, {{"import network time (ms)", duration_ms}}); - app_inputs_info = getInputsInfo(FLAGS_shape, - FLAGS_layout, - FLAGS_b, - FLAGS_data_shape, - FLAGS_iscale, - FLAGS_imean, - exeNetwork.GetInputsInfo()); + + app_inputs_info = 
getInputsInfo(FLAGS_shape, + FLAGS_layout, + FLAGS_b, + FLAGS_data_shape, + inputFiles, + FLAGS_iscale, + FLAGS_imean, + compiledModel.inputs()); if (batchSize == 0) { batchSize = 1; } @@ -543,11 +559,11 @@ int main(int argc, char* argv[]) { if (!ov_perf_hint.empty()) { for (const auto& device : devices) { std::vector supported_config_keys = - ie.GetMetric(device, METRIC_KEY(SUPPORTED_CONFIG_KEYS)); + core.get_metric(device, METRIC_KEY(SUPPORTED_CONFIG_KEYS)); slog::info << "Device: " << device << slog::endl; for (const auto& cfg : supported_config_keys) { try { - slog::info << " {" << cfg << " , " << exeNetwork.GetConfig(cfg).as(); + slog::info << " {" << cfg << " , " << compiledModel.get_config(cfg).as(); } catch (...) { }; slog::info << " }" << slog::endl; @@ -558,7 +574,7 @@ int main(int argc, char* argv[]) { // Update number of streams for (auto&& ds : device_nstreams) { const std::string key = getDeviceTypeFromName(ds.first) + "_THROUGHPUT_STREAMS"; - device_nstreams[ds.first] = ie.GetConfig(ds.first, key).as(); + device_nstreams[ds.first] = core.get_config(ds.first, key).as(); } // Number of requests @@ -569,11 +585,10 @@ int main(int argc, char* argv[]) { } else { std::string key = METRIC_KEY(OPTIMAL_NUMBER_OF_INFER_REQUESTS); try { - nireq = exeNetwork.GetMetric(key).as(); + nireq = compiledModel.get_metric(key).as(); } catch (const std::exception& ex) { IE_THROW() << "Every device used with the benchmark_app should " - << "support OPTIMAL_NUMBER_OF_INFER_REQUESTS " - "ExecutableNetwork metric. " + << "support OPTIMAL_NUMBER_OF_INFER_REQUESTS metric. 
" << "Failed to query the metric for the " << device_name << " with error:" << ex.what(); } } @@ -618,7 +633,7 @@ int main(int argc, char* argv[]) { {"topology", topology_name}, {"target device", device_name}, {"API", FLAGS_api}, - {"precision", std::string(precision.name())}, + {"precision", std::string(type.get_type_name())}, {"batch size", std::to_string(batchSize)}, {"number of iterations", std::to_string(niter)}, {"number of parallel infer requests", std::to_string(nireq)}, @@ -638,7 +653,7 @@ int main(int argc, char* argv[]) { // ---------------------------------------- next_step(); - InferRequestsQueue inferRequestsQueue(exeNetwork, nireq, app_inputs_info.size(), FLAGS_pcseq); + InferRequestsQueue inferRequestsQueue(compiledModel, nireq, app_inputs_info.size(), FLAGS_pcseq); bool inputHasName = false; if (inputFiles.size() > 0) { @@ -649,33 +664,33 @@ int main(int argc, char* argv[]) { std::vector<::gpu::BufferType> clInputsBuffer; bool useGpuMem = false; - std::map> inputsData; + std::map inputsData; if (isFlagSetInCommandLine("use_device_mem")) { if (device_name.find("GPU") == 0) { - inputsData = ::gpu::getRemoteInputBlobs(inputFiles, app_inputs_info, exeNetwork, clInputsBuffer); + inputsData = ::gpu::getRemoteInputTensors(inputFiles, app_inputs_info, compiledModel, clInputsBuffer); useGpuMem = true; } else if (device_name.find("CPU") == 0) { if (newInputType) { - inputsData = getBlobs(inputFiles, app_inputs_info); + inputsData = getTensors(inputFiles, app_inputs_info); } else { - inputsData = - getBlobsStaticCase(inputFiles.empty() ? std::vector{} : inputFiles.begin()->second, - batchSize, - app_inputs_info[0], - nireq); + inputsData = getTensorsStaticCase( + inputFiles.empty() ? 
std::vector{} : inputFiles.begin()->second, + batchSize, + app_inputs_info[0], + nireq); } } else { IE_THROW() << "Requested device doesn't support `use_device_mem` option."; } } else { if (newInputType) { - inputsData = getBlobs(inputFiles, app_inputs_info); + inputsData = getTensors(inputFiles, app_inputs_info); } else { inputsData = - getBlobsStaticCase(inputFiles.empty() ? std::vector{} : inputFiles.begin()->second, - batchSize, - app_inputs_info[0], - nireq); + getTensorsStaticCase(inputFiles.empty() ? std::vector{} : inputFiles.begin()->second, + batchSize, + app_inputs_info[0], + nireq); } } // ----------------- 10. Measuring performance @@ -726,7 +741,7 @@ int main(int argc, char* argv[]) { slog::info << "Inputs setup stage will be included in performance measurements." << slog::endl; } - // copy prepared data straight into inferRequest->getBlob() + // copy prepared data straight into inferRequest->getTensor() // for inference only mode if (inferenceOnly) { if (nireq < inputsData.begin()->second.size()) @@ -736,23 +751,24 @@ int main(int argc, char* argv[]) { auto inputs = app_inputs_info[i % app_inputs_info.size()]; for (auto& item : inputs) { auto inputName = item.first; - const auto& inputBlob = inputsData.at(inputName)[i % inputsData.at(inputName).size()]; - // for remote blobs setBlob is used, they are already allocated on the device + const auto& inputTensor = inputsData.at(inputName)[i % inputsData.at(inputName).size()]; + // for remote blobs setTensor is used, they are already allocated on the device if (useGpuMem) { - inferRequest->setBlob(inputName, inputBlob); + inferRequest->setTensor(inputName, inputTensor); } else { - InferenceEngine::Blob::Ptr requestBlob = inferRequest->getBlob(inputName); + auto requestTensor = inferRequest->getTensor(inputName); if (isDynamicNetwork) { - requestBlob->setShape(inputBlob->getTensorDesc().getDims()); + requestTensor.set_shape(inputTensor.get_shape()); } - copyBlobData(requestBlob, inputBlob); + 
copyTensorData(requestTensor, inputTensor); } } if (useGpuMem) { - auto outputBlobs = ::gpu::getRemoteOutputBlobs(exeNetwork, inferRequest->getOutputClBuffer()); - for (auto& output : exeNetwork.GetOutputsInfo()) { - inferRequest->setBlob(output.first, outputBlobs[output.first]); + auto outputTensors = + ::gpu::getRemoteOutputTensors(compiledModel, inferRequest->getOutputClBuffer()); + for (auto& output : compiledModel.outputs()) { + inferRequest->setTensor(output.get_any_name(), outputTensors[output.get_any_name()]); } } ++i; @@ -771,13 +787,13 @@ int main(int argc, char* argv[]) { for (auto& item : inputs) { auto inputName = item.first; const auto& data = inputsData.at(inputName)[0]; - inferRequest->setBlob(inputName, data); + inferRequest->setTensor(inputName, data); } if (useGpuMem) { - auto outputBlobs = ::gpu::getRemoteOutputBlobs(exeNetwork, inferRequest->getOutputClBuffer()); - for (auto& output : exeNetwork.GetOutputsInfo()) { - inferRequest->setBlob(output.first, outputBlobs[output.first]); + auto outputTensors = ::gpu::getRemoteOutputTensors(compiledModel, inferRequest->getOutputClBuffer()); + for (auto& output : compiledModel.outputs()) { + inferRequest->setTensor(output.get_any_name(), outputTensors[output.get_any_name()]); } } } @@ -824,18 +840,29 @@ int main(int argc, char* argv[]) { if (isDynamicNetwork) { batchSize = getBatchSize(inputs); + if (!std::any_of(inputs.begin(), + inputs.end(), + [](const std::pair& info) { + return ov::layout::has_batch(info.second.layout); + })) { + slog::warn + << "No batch dimension was found, asssuming batch to be 1. Beware: this might affect " + "FPS calculation." 
+ << slog::endl; + } } for (auto& item : inputs) { auto inputName = item.first; const auto& data = inputsData.at(inputName)[iteration % inputsData.at(inputName).size()]; - inferRequest->setBlob(inputName, data); + inferRequest->setTensor(inputName, data); } if (useGpuMem) { - auto outputBlobs = ::gpu::getRemoteOutputBlobs(exeNetwork, inferRequest->getOutputClBuffer()); - for (auto& output : exeNetwork.GetOutputsInfo()) { - inferRequest->setBlob(output.first, outputBlobs[output.first]); + auto outputTensors = + ::gpu::getRemoteOutputTensors(compiledModel, inferRequest->getOutputClBuffer()); + for (auto& output : compiledModel.outputs()) { + inferRequest->setTensor(output.get_any_name(), outputTensors[output.get_any_name()]); } } } @@ -971,8 +998,9 @@ int main(int argc, char* argv[]) { if (!FLAGS_exec_graph_path.empty()) { try { - CNNNetwork execGraphInfo = exeNetwork.GetExecGraphInfo(); - execGraphInfo.serialize(FLAGS_exec_graph_path); + std::string fileName = fileNameNoExt(FLAGS_exec_graph_path); + ov::pass::Serialize serializer(fileName + ".xml", fileName + ".bin"); + serializer.run_on_model(std::const_pointer_cast(compiledModel.get_runtime_model())); slog::info << "executable graph is stored to " << FLAGS_exec_graph_path << slog::endl; } catch (const std::exception& ex) { slog::err << "Can't get executable graph: " << ex.what() << slog::endl; @@ -980,12 +1008,12 @@ int main(int argc, char* argv[]) { } if (perf_counts) { - std::vector> perfCounts; + std::vector> perfCounts; for (size_t ireq = 0; ireq < nireq; ireq++) { auto reqPerfCounts = inferRequestsQueue.requests[ireq]->getPerformanceCounts(); if (FLAGS_pc) { slog::info << "Performance counts for " << ireq << "-th infer request:" << slog::endl; - printPerformanceCounts(reqPerfCounts, std::cout, getFullDeviceName(ie, FLAGS_d), false); + printPerformanceCounts(reqPerfCounts, std::cout, getFullDeviceName(core, FLAGS_d), false); } perfCounts.push_back(reqPerfCounts); } @@ -1011,7 +1039,7 @@ int main(int argc, 
char* argv[]) { for (auto& item : app_inputs_info[i]) { std::stringstream input_shape; auto shape = item.second.dataShape; - std::copy(shape.begin(), shape.end() - 1, std::ostream_iterator(input_shape, ",")); + std::copy(shape.begin(), shape.end() - 1, std::ostream_iterator(input_shape, ",")); input_shape << shape.back(); slog::info << " " << item.first << " : " << getShapeString(item.second.dataShape); } diff --git a/samples/cpp/benchmark_app/remote_blobs_filling.cpp b/samples/cpp/benchmark_app/remote_blobs_filling.cpp deleted file mode 100644 index 6a98825f87c..00000000000 --- a/samples/cpp/benchmark_app/remote_blobs_filling.cpp +++ /dev/null @@ -1,186 +0,0 @@ -// Copyright (C) 2021 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// - -#include -#include -#include -#include -#include - -// clang-format off -#include - -#include "remote_blobs_filling.hpp" -// clang-format on - -namespace gpu { - -template -using uniformDistribution = typename std::conditional< - std::is_floating_point::value, - std::uniform_real_distribution, - typename std::conditional::value, std::uniform_int_distribution, void>::type>::type; - -template -void fillBufferRandom(void* inputBuffer, - size_t elementsNum, - T rand_min = std::numeric_limits::min(), - T rand_max = std::numeric_limits::max()) { - std::mt19937 gen(0); - uniformDistribution distribution(rand_min, rand_max); - auto inputBufferData = static_cast(inputBuffer); - for (size_t i = 0; i < elementsNum; i++) { - inputBufferData[i] = static_cast(distribution(gen)); - } -} - -void fillBuffer(void* inputBuffer, size_t elementsNum, InferenceEngine::Precision precision) { - if (precision == InferenceEngine::Precision::FP32) { - fillBufferRandom(inputBuffer, elementsNum); - } else if (precision == InferenceEngine::Precision::FP16) { - fillBufferRandom(inputBuffer, elementsNum); - } else if (precision == InferenceEngine::Precision::I32) { - fillBufferRandom(inputBuffer, elementsNum); - } else if (precision == 
InferenceEngine::Precision::I64) { - fillBufferRandom(inputBuffer, elementsNum); - } else if (precision == InferenceEngine::Precision::U8) { - // uniform_int_distribution is not allowed in the C++17 - // standard and vs2017/19 - fillBufferRandom(inputBuffer, elementsNum); - } else if (precision == InferenceEngine::Precision::I8) { - // uniform_int_distribution is not allowed in the C++17 standard - // and vs2017/19 - fillBufferRandom(inputBuffer, elementsNum); - } else if (precision == InferenceEngine::Precision::U16) { - fillBufferRandom(inputBuffer, elementsNum); - } else if (precision == InferenceEngine::Precision::I16) { - fillBufferRandom(inputBuffer, elementsNum); - } else if (precision == InferenceEngine::Precision::BOOL) { - fillBufferRandom(inputBuffer, elementsNum, 0, 1); - } else { - IE_THROW() << "Requested precision is not supported"; - } -} - -size_t getBytesPerElement(InferenceEngine::Precision precision) { - switch (precision) { - case InferenceEngine::Precision::FP32: - return 4; - case InferenceEngine::Precision::FP16: - return 2; - case InferenceEngine::Precision::I32: - return 4; - case InferenceEngine::Precision::I64: - return 8; - case InferenceEngine::Precision::U8: - return 1; - case InferenceEngine::Precision::I8: - return 1; - case InferenceEngine::Precision::U16: - return 2; - case InferenceEngine::Precision::I16: - return 2; - case InferenceEngine::Precision::BOOL: - return 1; - default: - IE_THROW() << "Requested precision is not supported"; - } -} - -std::map> getRemoteInputBlobs( - const std::map>& inputFiles, - const std::vector& app_inputs_info, - const InferenceEngine::ExecutableNetwork& exeNetwork, - std::vector& clBuffer) { -#ifdef HAVE_DEVICE_MEM_SUPPORT - slog::info << "Device memory will be used for input and output blobs" << slog::endl; - if (inputFiles.size()) { - slog::warn << "Device memory supports only random data at this moment, input images will be ignored" - << slog::endl; - } - - std::map> remoteBlobs; - auto context 
= exeNetwork.GetContext(); - auto oclContext = std::dynamic_pointer_cast(context)->get(); - auto oclInstance = std::make_shared(oclContext); - - auto setShared = [&](const std::string name, const InferenceEngine::TensorDesc& desc, bool fillRandom = false) { - cl_int err; - auto inputDims = desc.getDims(); - auto elementsNum = std::accumulate(begin(inputDims), end(inputDims), 1, std::multiplies()); - auto inputSize = elementsNum * getBytesPerElement(desc.getPrecision()); - - clBuffer.push_back(cl::Buffer(oclInstance->_context, CL_MEM_READ_WRITE, (cl::size_type)inputSize, NULL, &err)); - - if (fillRandom) { - void* mappedPtr = oclInstance->_queue.enqueueMapBuffer(clBuffer.back(), - CL_TRUE, - CL_MEM_READ_WRITE, - 0, - (cl::size_type)inputSize); - fillBuffer(mappedPtr, elementsNum, desc.getPrecision()); - oclInstance->_queue.enqueueUnmapMemObject(clBuffer.back(), mappedPtr); - } - - auto blob = InferenceEngine::gpu::make_shared_blob(desc, context, clBuffer.back()); - blob->allocate(); - remoteBlobs[name].push_back(blob); - }; - - for (auto& inputs_info : app_inputs_info) { - for (auto& input : inputs_info) { - // Fill random - slog::info << "Prepare remote blob for input '" << input.first << "' with random values (" - << std::string((input.second.isImage() ? 
"image" : "some binary data")) << " is expected)" - << slog::endl; - setShared(input.first, - InferenceEngine::TensorDesc(input.second.precision, - input.second.dataShape, - getLayoutFromString(input.second.layout)), - true); - } - } - - return remoteBlobs; -#else - IE_THROW() << "Device memory requested for GPU device, but OpenCL was not linked"; -#endif -} - -std::map getRemoteOutputBlobs( - const InferenceEngine::ExecutableNetwork& exeNetwork, - std::map& clBuffer) { -#ifdef HAVE_DEVICE_MEM_SUPPORT - std::map outputBlobs; - for (auto& output : exeNetwork.GetOutputsInfo()) { - cl_int err; - auto context = exeNetwork.GetContext(); - auto oclContext = std::dynamic_pointer_cast(context)->get(); - auto oclInstance = std::make_shared(oclContext); - - auto desc = output.second->getTensorDesc(); - auto inputDims = desc.getDims(); - auto elementsNum = std::accumulate(begin(inputDims), end(inputDims), 1, std::multiplies()); - auto inputSize = elementsNum * getBytesPerElement(desc.getPrecision()); - - cl::size_type bufferSize = 0; - if (clBuffer.find(output.first) == clBuffer.end()) { - clBuffer[output.first] = - cl::Buffer(oclInstance->_context, CL_MEM_READ_WRITE, (cl::size_type)inputSize, NULL, &err); - } else { - auto& buff = clBuffer[output.first]; - buff.getInfo(CL_MEM_SIZE, &bufferSize); - if (inputSize != bufferSize) { - buff = cl::Buffer(oclInstance->_context, CL_MEM_READ_WRITE, (cl::size_type)inputSize, NULL, &err); - } - } - outputBlobs[output.first] = InferenceEngine::gpu::make_shared_blob(desc, context, clBuffer[output.first]); - } - - return outputBlobs; -#else - IE_THROW() << "Device memory requested for GPU device, but OpenCL was not linked"; -#endif -} -} // namespace gpu diff --git a/samples/cpp/benchmark_app/remote_tensors_filling.cpp b/samples/cpp/benchmark_app/remote_tensors_filling.cpp new file mode 100644 index 00000000000..ac1c106e51f --- /dev/null +++ b/samples/cpp/benchmark_app/remote_tensors_filling.cpp @@ -0,0 +1,162 @@ +// Copyright (C) 2021 
Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "remote_tensors_filling.hpp" + +#include +#include +#include +#include +#include +#include + +#ifdef HAVE_DEVICE_MEM_SUPPORT +# include +# include +#endif + +namespace gpu { + +template +using uniformDistribution = typename std::conditional< + std::is_floating_point::value, + std::uniform_real_distribution, + typename std::conditional::value, std::uniform_int_distribution, void>::type>::type; + +template +void fillBufferRandom(void* inputBuffer, + size_t elementsNum, + T rand_min = std::numeric_limits::min(), + T rand_max = std::numeric_limits::max()) { + std::mt19937 gen(0); + uniformDistribution distribution(rand_min, rand_max); + auto inputBufferData = static_cast(inputBuffer); + for (size_t i = 0; i < elementsNum; i++) { + inputBufferData[i] = static_cast(distribution(gen)); + } +} + +void fillBuffer(void* inputBuffer, size_t elementsNum, const ov::element::Type& type) { + if (type == ov::element::f32) { + fillBufferRandom(inputBuffer, elementsNum); + } else if (type == ov::element::f16) { + fillBufferRandom(inputBuffer, elementsNum); + } else if (type == ov::element::i32) { + fillBufferRandom(inputBuffer, elementsNum); + } else if (type == ov::element::i64) { + fillBufferRandom(inputBuffer, elementsNum); + } else if (type == ov::element::u8) { + // uniform_int_distribution is not allowed in the C++17 + // standard and vs2017/19 + fillBufferRandom(inputBuffer, elementsNum); + } else if (type == ov::element::i8) { + // uniform_int_distribution is not allowed in the C++17 standard + // and vs2017/19 + fillBufferRandom(inputBuffer, elementsNum); + } else if (type == ov::element::u16) { + fillBufferRandom(inputBuffer, elementsNum); + } else if (type == ov::element::i16) { + fillBufferRandom(inputBuffer, elementsNum); + } else if (type == ov::element::boolean) { + fillBufferRandom(inputBuffer, elementsNum, 0, 1); + } else { + IE_THROW() << "Requested type is not supported"; + } +} + +std::map 
getRemoteInputTensors( + const std::map>& inputFiles, + const std::vector& app_inputs_info, + const ov::runtime::CompiledModel& compiledModel, + std::vector& clBuffer) { +#ifdef HAVE_DEVICE_MEM_SUPPORT + slog::info << "Device memory will be used for input and output blobs" << slog::endl; + if (inputFiles.size()) { + slog::warn << "Device memory supports only random data at this moment, input images will be ignored" + << slog::endl; + } + + std::map remoteTensors; + auto context = compiledModel.get_context(); + auto& oclContext = static_cast(context); + auto oclInstance = std::make_shared(oclContext.get()); + + for (auto& inputs_info : app_inputs_info) { + for (auto& input : inputs_info) { + // Fill random + slog::info << "Prepare remote blob for input '" << input.first << "' with random values (" + << std::string((input.second.isImage() ? "image" : "some binary data")) << " is expected)" + << slog::endl; + + auto tensor = oclContext.create_tensor(input.second.type, input.second.dataShape, clBuffer.back().get()); + remoteTensors[input.first].push_back(tensor); + + // Creating and filling shared buffers + cl_int err; + auto elementsNum = std::accumulate(begin(input.second.dataShape), + end(input.second.dataShape), + 1, + std::multiplies()); + auto inputSize = elementsNum * input.second.type.bitwidth() / 8; + + clBuffer.push_back( + cl::Buffer(oclInstance->_context, CL_MEM_READ_WRITE, (cl::size_type)inputSize, NULL, &err)); + + void* mappedPtr = oclInstance->_queue.enqueueMapBuffer(clBuffer.back(), + CL_TRUE, + CL_MEM_READ_WRITE, + 0, + (cl::size_type)inputSize); + if (inputFiles.empty()) { + // Filling in random data + fillBuffer(mappedPtr, elementsNum, input.second.type); + } else { + // TODO: add filling with real image data + } + oclInstance->_queue.enqueueUnmapMemObject(clBuffer.back(), mappedPtr); + } + } + + return remoteTensors; +#else + IE_THROW() << "Device memory requested for GPU device, but OpenCL was not linked"; +#endif +} + +std::map 
getRemoteOutputTensors(const ov::runtime::CompiledModel& compiledModel, + std::map& clBuffer) { +#ifdef HAVE_DEVICE_MEM_SUPPORT + std::map outputTensors; + for (auto& output : compiledModel.outputs()) { + auto context = compiledModel.get_context(); + auto& oclContext = static_cast(context); + auto oclInstance = std::make_shared(oclContext.get()); + + cl_int err; + auto elementsNum = + std::accumulate(begin(output.get_shape()), end(output.get_shape()), 1, std::multiplies()); + auto inputSize = elementsNum * output.get_element_type().bitwidth() / 8; + + cl::size_type bufferSize = 0; + if (clBuffer.find(output.get_any_name()) == clBuffer.end()) { + clBuffer[output.get_any_name()] = + cl::Buffer(oclInstance->_context, CL_MEM_READ_WRITE, (cl::size_type)inputSize, NULL, &err); + } else { + auto& buff = clBuffer[output.get_any_name()]; + buff.getInfo(CL_MEM_SIZE, &bufferSize); + if (inputSize != bufferSize) { + buff = cl::Buffer(oclInstance->_context, CL_MEM_READ_WRITE, (cl::size_type)inputSize, NULL, &err); + } + } + outputTensors[output.get_any_name()] = oclContext.create_tensor(output.get_element_type(), + output.get_shape(), + clBuffer[output.get_any_name()].get()); + } + + return outputTensors; +#else + IE_THROW() << "Device memory requested for GPU device, but OpenCL was not linked"; +#endif +} +} // namespace gpu diff --git a/samples/cpp/benchmark_app/remote_blobs_filling.hpp b/samples/cpp/benchmark_app/remote_tensors_filling.hpp similarity index 81% rename from samples/cpp/benchmark_app/remote_blobs_filling.hpp rename to samples/cpp/benchmark_app/remote_tensors_filling.hpp index 73ce0e5e4ab..17293fd2a44 100644 --- a/samples/cpp/benchmark_app/remote_blobs_filling.hpp +++ b/samples/cpp/benchmark_app/remote_tensors_filling.hpp @@ -6,14 +6,9 @@ #if defined(HAVE_GPU_DEVICE_MEM_SUPPORT) # define HAVE_DEVICE_MEM_SUPPORT -# include "gpu/gpu_context_api_ocl.hpp" +# include #endif - -// clang-format off -#include "inference_engine.hpp" - #include "utils.hpp" -// 
clang-format on namespace gpu { @@ -62,14 +57,12 @@ struct OpenCL { using BufferType = void*; #endif -std::map> getRemoteInputBlobs( +std::map getRemoteInputTensors( const std::map>& inputFiles, const std::vector& app_inputs_info, - const InferenceEngine::ExecutableNetwork& exeNetwork, + const ov::runtime::CompiledModel& compiledModel, std::vector& clBuffer); -std::map getRemoteOutputBlobs( - const InferenceEngine::ExecutableNetwork& exeNetwork, - std::map& clBuffer); - +std::map getRemoteOutputTensors(const ov::runtime::CompiledModel& compiledModel, + std::map& clBuffer); } // namespace gpu diff --git a/samples/cpp/benchmark_app/shared_blob_allocator.hpp b/samples/cpp/benchmark_app/shared_blob_allocator.hpp deleted file mode 100644 index 189c522e0d4..00000000000 --- a/samples/cpp/benchmark_app/shared_blob_allocator.hpp +++ /dev/null @@ -1,43 +0,0 @@ -// Copyright (C) 2018-2021 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// - -#pragma once - -#include "ie_allocator.hpp" - -template -class SharedBlobAllocator : public InferenceEngine::IAllocator { -public: - SharedBlobAllocator(const T* data, size_t size) : data(data), size(size){}; - - ~SharedBlobAllocator() { - free((void*)data); - }; - - void* lock(void* handle, InferenceEngine::LockOp op = InferenceEngine::LOCK_FOR_WRITE) noexcept override { - if (handle == data) { - return (void*)data; - } - return nullptr; - } - - void unlock(void* handle) noexcept override{}; - - void* alloc(size_t size) noexcept override { - return size <= this->size ? 
(void*)data : nullptr; - }; - - bool free(void* handle) noexcept override { - if (handle == data) { - delete[] data; - data = nullptr; - return true; - } - return false; - }; - -private: - const T* data; - size_t size; -}; diff --git a/samples/cpp/benchmark_app/shared_tensor_allocator.hpp b/samples/cpp/benchmark_app/shared_tensor_allocator.hpp new file mode 100644 index 00000000000..f9cf5a0056d --- /dev/null +++ b/samples/cpp/benchmark_app/shared_tensor_allocator.hpp @@ -0,0 +1,42 @@ +// Copyright (C) 2018-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "openvino/runtime/allocator.hpp" + +class SharedTensorAllocator : public ov::runtime::AllocatorImpl { +public: + SharedTensorAllocator(size_t sizeBytes) : size(sizeBytes) { + data = new char[size]; + } + + ~SharedTensorAllocator() { + delete[] data; + } + + virtual void* allocate(const size_t bytes, const size_t) override { + return bytes <= this->size ? (void*)data : nullptr; + } + + void deallocate(void* handle, const size_t bytes, const size_t) override { + if (handle == data) { + delete[] data; + data = nullptr; + } + } + + bool is_equal(const AllocatorImpl& other) const override { + auto other_blob_allocator = dynamic_cast(&other); + return other_blob_allocator != nullptr && other_blob_allocator == this; + } + + char* getBuffer() { + return data; + } + +private: + char* data; + size_t size; +}; diff --git a/samples/cpp/benchmark_app/statistics_report.cpp b/samples/cpp/benchmark_app/statistics_report.cpp index 67ab291f55f..5e0abffa5ec 100644 --- a/samples/cpp/benchmark_app/statistics_report.cpp +++ b/samples/cpp/benchmark_app/statistics_report.cpp @@ -56,10 +56,8 @@ void StatisticsReport::dump() { } void StatisticsReport::dumpPerformanceCountersRequest(CsvDumper& dumper, const PerformaceCounters& perfCounts) { - auto performanceMapSorted = perfCountersSorted(perfCounts); - - long long total = 0L; - long long total_cpu = 0L; + std::chrono::microseconds total = 
std::chrono::microseconds::zero(); + std::chrono::microseconds total_cpu = std::chrono::microseconds::zero(); dumper << "layerName" << "execStatus" @@ -69,31 +67,31 @@ void StatisticsReport::dumpPerformanceCountersRequest(CsvDumper& dumper, const P << "cpuTime (ms)"; dumper.endLine(); - for (const auto& layer : performanceMapSorted) { - dumper << layer.first; // layer name + for (const auto& layer : perfCounts) { + dumper << layer.node_name; // layer name - switch (layer.second.status) { - case InferenceEngine::InferenceEngineProfileInfo::EXECUTED: + switch (layer.status) { + case ov::runtime::ProfilingInfo::Status::EXECUTED: dumper << "EXECUTED"; break; - case InferenceEngine::InferenceEngineProfileInfo::NOT_RUN: + case ov::runtime::ProfilingInfo::Status::NOT_RUN: dumper << "NOT_RUN"; break; - case InferenceEngine::InferenceEngineProfileInfo::OPTIMIZED_OUT: + case ov::runtime::ProfilingInfo::Status::OPTIMIZED_OUT: dumper << "OPTIMIZED_OUT"; break; } - dumper << layer.second.layer_type << layer.second.exec_type; - dumper << std::to_string(layer.second.realTime_uSec / 1000.0) << std::to_string(layer.second.cpu_uSec / 1000.0); - total += layer.second.realTime_uSec; - total_cpu += layer.second.cpu_uSec; + dumper << layer.node_type << layer.exec_type; + dumper << std::to_string(layer.real_time.count() / 1000.0) << std::to_string(layer.cpu_time.count() / 1000.0); + total += layer.real_time; + total_cpu += layer.cpu_time; dumper.endLine(); } dumper << "Total" << "" << "" << ""; - dumper << total / 1000.0 << total_cpu / 1000.0; + dumper << total.count() / 1000.0 << total_cpu.count() / 1000.0; dumper.endLine(); dumper.endLine(); } @@ -116,24 +114,28 @@ void StatisticsReport::dumpPerformanceCounters(const std::vector performanceCountersAvg; + std::vector performanceCountersAvg; // iterate over each processed infer request and handle its PM data for (size_t i = 0; i < perfCounts.size(); i++) { - auto performanceMapSorted = perfCountersSorted(perfCounts[i]); // iterate over 
each layer from sorted vector and add required PM data // to the per-layer maps - for (const auto& pm : performanceMapSorted) { - if (performanceCountersAvg.count(pm.first) == 0) { - performanceCountersAvg[pm.first] = perfCounts.at(i).at(pm.first); - } else { - performanceCountersAvg[pm.first].realTime_uSec += perfCounts.at(i).at(pm.first).realTime_uSec; - performanceCountersAvg[pm.first].cpu_uSec += perfCounts.at(i).at(pm.first).cpu_uSec; + for (const auto& pm : perfCounts[i]) { + int idx = 0; + for (; idx < performanceCountersAvg.size(); idx++) { + if (performanceCountersAvg[idx].node_name == pm.node_name) { + performanceCountersAvg[idx].real_time += pm.real_time; + performanceCountersAvg[idx].cpu_time += pm.cpu_time; + break; + } + } + if (idx < performanceCountersAvg.size()) { + performanceCountersAvg.push_back(pm); } } } for (auto& pm : performanceCountersAvg) { - pm.second.realTime_uSec /= perfCounts.size(); - pm.second.cpu_uSec /= perfCounts.size(); + pm.real_time /= perfCounts.size(); + pm.cpu_time /= perfCounts.size(); } return performanceCountersAvg; }; diff --git a/samples/cpp/benchmark_app/statistics_report.hpp b/samples/cpp/benchmark_app/statistics_report.hpp index 9ae3c34143e..6a70c256955 100644 --- a/samples/cpp/benchmark_app/statistics_report.hpp +++ b/samples/cpp/benchmark_app/statistics_report.hpp @@ -10,8 +10,6 @@ #include // clang-format off -#include "inference_engine.hpp" - #include "samples/common.hpp" #include "samples/csv_dumper.hpp" #include "samples/slog.hpp" @@ -74,7 +72,7 @@ private: /// @brief Responsible for collecting of statistics and dumping to .csv file class StatisticsReport { public: - typedef std::map PerformaceCounters; + typedef std::vector PerformaceCounters; typedef std::vector> Parameters; struct Config { diff --git a/samples/cpp/benchmark_app/utils.cpp b/samples/cpp/benchmark_app/utils.cpp index 734c096abde..483aefaa37c 100644 --- a/samples/cpp/benchmark_app/utils.cpp +++ b/samples/cpp/benchmark_app/utils.cpp @@ -2,6 +2,8 
@@ // SPDX-License-Identifier: Apache-2.0 // +#include + #include #include #include @@ -25,33 +27,28 @@ namespace benchmark_app { bool InputInfo::isImage() const { if ((layout != "NCHW") && (layout != "NHWC") && (layout != "CHW") && (layout != "HWC")) return false; - return (channels() == 3); + // If tensor_shape is still empty, assume this is still an Image and tensor shape will be filled later + return (dataShape.empty() || channels() == 3); } bool InputInfo::isImageInfo() const { if (layout != "NC") return false; return (channels() >= 2); } -size_t InputInfo::getDimentionByLayout(char character) const { - size_t pos = layout.find(character); - if (pos == std::string::npos) - throw std::runtime_error("Error: Can't get " + std::string(character, 1) + " from layout " + layout); - return dataShape.at(pos); -} size_t InputInfo::width() const { - return getDimentionByLayout('W'); + return dataShape.at(ov::layout::width_idx(layout)); } size_t InputInfo::height() const { - return getDimentionByLayout('H'); + return dataShape.at(ov::layout::height_idx(layout)); } size_t InputInfo::channels() const { - return getDimentionByLayout('C'); + return dataShape.at(ov::layout::channels_idx(layout)); } size_t InputInfo::batch() const { - return getDimentionByLayout('N'); + return dataShape.at(ov::layout::batch_idx(layout)); } size_t InputInfo::depth() const { - return getDimentionByLayout('D'); + return dataShape.at(ov::layout::depth_idx(layout)); } } // namespace benchmark_app @@ -152,48 +149,36 @@ std::map parseNStreamsValuePerDevice(const std::vector size_t getBatchSize(const benchmark_app::InputsInfo& inputs_info) { size_t batch_size = 0; for (auto& info : inputs_info) { - std::size_t batch_index = info.second.layout.find("N"); - if (batch_index != std::string::npos) { + if (ov::layout::has_batch(info.second.layout)) { if (batch_size == 0) - batch_size = info.second.dataShape[batch_index]; - else if (batch_size != info.second.dataShape[batch_index]) + batch_size = 
info.second.batch(); + else if (batch_size != info.second.batch()) throw std::logic_error("Can't deterimine batch size: batch is " "different for different inputs!"); } } - if (batch_size == 0) + if (batch_size == 0) { batch_size = 1; + } return batch_size; } -InferenceEngine::Layout getLayoutFromString(const std::string& string_layout) { - static const std::unordered_map layouts = { - {"NCHW", InferenceEngine::Layout::NCHW}, - {"NHWC", InferenceEngine::Layout::NHWC}, - {"NCDHW", InferenceEngine::Layout::NCDHW}, - {"NDHWC", InferenceEngine::Layout::NDHWC}, - {"C", InferenceEngine::Layout::C}, - {"CHW", InferenceEngine::Layout::CHW}, - {"HWC", InferenceEngine::Layout::HWC}, - {"HW", InferenceEngine::Layout::HW}, - {"NC", InferenceEngine::Layout::NC}, - {"CN", InferenceEngine::Layout::CN}}; - auto it = layouts.find(string_layout); - if (it != layouts.end()) { - return it->second; +size_t getModelInputBatchSize(const ov::Model& model) { + try { + auto& param = model.get_parameters()[0]; + auto layout = param->get_layout(); + return param->get_shape().at(ov::layout::batch_idx(layout)); + } catch (...) { + slog::warn + << "No batch dimension was found, assuming batch to be 1. Beware: this might affect FPS calculation." 
+ << slog::endl; + return 1; // Default batch value } - IE_THROW() << "Unknown layout with name '" << string_layout << "'."; } -std::string getShapeString(const InferenceEngine::SizeVector& shape) { +std::string getShapeString(const ov::Shape& shape) { std::stringstream ss; - ss << "["; - for (size_t i = 0; i < shape.size(); ++i) { - if (i > 0) - ss << ", "; - ss << shape.at(i); - } - ss << "]"; + ss << shape; return ss.str(); } @@ -207,22 +192,6 @@ std::string getShapesString(const benchmark_app::PartialShapes& shapes) { return ss.str(); } -std::string getShapesString(const InferenceEngine::ICNNNetwork::InputShapes& shapes) { - std::stringstream ss; - for (auto& shape : shapes) { - if (!ss.str().empty()) - ss << ", "; - ss << "\'" << shape.first << "': ["; - for (size_t i = 0; i < shape.second.size(); i++) { - if (i > 0) - ss << ", "; - ss << shape.second.at(i); - } - ss << "]"; - } - return ss.str(); -} - std::map> parseScaleOrMean(const std::string& scale_mean, const benchmark_app::InputsInfo& inputs_info) { // Format: data:[255,255,255],info[255,255,255] @@ -284,9 +253,9 @@ std::vector parsePartialShape(const std::string& partial_shap return shape; } -InferenceEngine::SizeVector parseTensorShape(const std::string& dataShape) { +ov::Shape parseDataShape(const std::string& dataShapeStr) { std::vector shape; - for (auto& dim : split(dataShape, ',')) { + for (auto& dim : split(dataShapeStr, ',')) { shape.push_back(std::stoi(dim)); } return shape; @@ -376,6 +345,317 @@ std::map> parseInputArguments(const std::v return mapped_files; } +std::map> parseInputParameters( + const std::string& parameter_string, + const std::vector>& input_info) { + // Parse parameter string like "input0[value0],input1[value1]" or "[value]" (applied to all + // inputs) + std::map> return_value; + std::string search_string = parameter_string; + auto start_pos = search_string.find_first_of('['); + auto input_name = search_string.substr(0, start_pos); + while (start_pos != std::string::npos) { 
+ auto end_pos = search_string.find_first_of(']'); + if (end_pos == std::string::npos) + break; + if (start_pos) + input_name = search_string.substr(0, start_pos); + auto input_value = search_string.substr(start_pos + 1, end_pos - start_pos - 1); + if (!input_name.empty()) { + return_value[input_name].push_back(input_value); + } else { + for (auto& item : input_info) { + return_value[item.get_any_name()].push_back(input_value); + } + } + search_string = search_string.substr(end_pos + 1); + if (search_string.empty() || (search_string.front() != ',' && search_string.front() != '[')) + break; + if (search_string.front() == ',') + search_string = search_string.substr(1); + start_pos = search_string.find_first_of('['); + } + if (!search_string.empty()) + throw std::logic_error("Can't parse input parameter string: " + parameter_string); + return return_value; +} + +std::vector getInputsInfo(const std::string& shape_string, + const std::string& layout_string, + const size_t batch_size, + const std::string& data_shapes_string, + const std::map>& fileNames, + const std::string& scale_string, + const std::string& mean_string, + const std::vector>& input_info, + bool& reshape_required) { + std::map> shape_map = parseInputParameters(shape_string, input_info); + std::map> data_shapes_map = + parseInputParameters(data_shapes_string, input_info); + std::map> layout_map = parseInputParameters(layout_string, input_info); + + size_t min_size = 1, max_size = 1; + if (!data_shapes_map.empty()) { + min_size = std::min_element(data_shapes_map.begin(), + data_shapes_map.end(), + [](std::pair> a, + std::pair> b) { + return a.second.size() < b.second.size() && a.second.size() != 1; + }) + ->second.size(); + + max_size = std::max_element(data_shapes_map.begin(), + data_shapes_map.end(), + [](std::pair> a, + std::pair> b) { + return a.second.size() < b.second.size(); + }) + ->second.size(); + if (min_size != max_size) { + throw std::logic_error( + "Shapes number for every input should be 
either 1 or should be equal to shapes number of other inputs"); + } + slog::info << "Number of test configurations is calculated basing on -tensor_shape parameter" << slog::endl; + } else if (fileNames.size() > 0) { + slog::info << "Number of test configurations is calculated basing on number of input images" << slog::endl; + min_size = std::min_element(fileNames.begin(), + fileNames.end(), + [](std::pair> a, + std::pair> b) { + return a.second.size() < b.second.size() && a.second.size() != 1; + }) + ->second.size(); + + max_size = std::max_element(fileNames.begin(), + fileNames.end(), + [](std::pair> a, + std::pair> b) { + return a.second.size() < b.second.size(); + }) + ->second.size(); + if (min_size != max_size) { + slog::warn << "Number of input files is different for some inputs, minimal number of files will be used (" + << min_size << ")" << slog::endl; + } + } + + reshape_required = false; + + std::map currentFileCounters; + for (auto& item : input_info) { + currentFileCounters[item.get_any_name()] = 0; + } + + std::vector info_maps; + for (size_t i = 0; i < min_size; ++i) { + benchmark_app::InputsInfo info_map; + + for (auto& item : input_info) { + benchmark_app::InputInfo info; + auto name = item.get_any_name(); + + // Layout + if (layout_map.count(name)) { + if (layout_map.at(name).size() > 1) { + throw std::logic_error( + "layout command line parameter doesn't support multiple layouts for one input."); + } + info.layout = ov::Layout(layout_map.at(name)[0]); + // reshape_required = true; + } else { + info.layout = dynamic_cast(*item.get_node()).get_layout(); + } + + // Calculating default layout values if needed + std::string newLayout = ""; + if (info.layout.empty()) { + switch (item.get_partial_shape().size()) { + case 3: + newLayout = "CHW"; + break; + case 4: + // Rough check for layout type, basing on max number of image channels + newLayout = (item.get_partial_shape()[3].get_max_length() <= 4 && + item.get_partial_shape()[1].get_max_length() > 4) + 
? "NHWC" + : "NCHW"; + break; + } + if (newLayout != "") { + info.layout = ov::Layout(newLayout); + } + if (info_maps.empty()) { // Show warnings only for 1st test case config, as for other test cases + // they will be the same + slog::warn << item.get_node()->get_friendly_name() << ": layout is not set explicitly" + << (newLayout != "" ? std::string(", so it is defaulted to ") + newLayout : "") + << ". It is STRONGLY recommended to set layout manually to avoid further issues." + << slog::endl; + } + } + + // Precision + info.type = item.get_element_type(); + // Partial Shape + if (shape_map.count(name)) { + if (shape_map.at(name).size() > 1) { + throw std::logic_error( + "shape command line parameter doesn't support multiple shapes for one input."); + } + info.partialShape = parsePartialShape(shape_map.at(name)[0]); + reshape_required = true; + } else { + info.partialShape = item.get_partial_shape(); + } + + // Files might be mapped without input name. In case of only one input we may map them to the only input + // directly + std::string filesInputName = + fileNames.size() == 1 && input_info.size() == 1 && fileNames.begin()->first == "" ? "" : name; + + // Tensor Shape + if (info.partialShape.is_dynamic() && data_shapes_map.count(name)) { + info.dataShape = parseDataShape(data_shapes_map.at(name)[i % data_shapes_map.at(name).size()]); + } else if (info.partialShape.is_dynamic() && fileNames.count(filesInputName) && info.isImage()) { + auto& namesVector = fileNames.at(filesInputName); + if (containsBinaries(namesVector)) { + throw std::logic_error("Input files list for input " + item.get_any_name() + + " contains binary file(s) and input shape is dynamic. 
Tensor shape should " + "be defined explicitly (using -tensor_shape)."); + } + + info.dataShape = ov::Shape(info.partialShape.size(), 0); + for (int i = 0; i < info.partialShape.size(); i++) { + auto& dim = info.partialShape[i]; + if (dim.is_static()) { + info.dataShape[i] = dim.get_length(); + } + } + + size_t tensorBatchSize = std::max(batch_size, (size_t)1); + if (ov::layout::has_batch(info.layout)) { + if (info.batch()) { + tensorBatchSize = std::max(tensorBatchSize, info.batch()); + } else { + info.dataShape[ov::layout::batch_idx(info.layout)] = tensorBatchSize; + } + } + + size_t w = 0; + size_t h = 0; + size_t fileIdx = currentFileCounters[item.get_any_name()]; + for (; fileIdx < currentFileCounters[item.get_any_name()] + tensorBatchSize; fileIdx++) { + if (fileIdx >= namesVector.size()) { + throw std::logic_error( + "Not enough files to fill in full batch (number of files should be a multiple of batch " + "size if -tensor_shape parameter is omitted and shape is dynamic)"); + } + FormatReader::ReaderPtr reader(namesVector[fileIdx].c_str()); + if ((w && w != reader->width()) || (h && h != reader->height())) { + throw std::logic_error("Image sizes putting into one batch should be of the same size if input " + "shape is dynamic and -tensor_shape is omitted. Problem file: " + + namesVector[fileIdx]); + } + w = reader->width(); + h = reader->height(); + } + currentFileCounters[item.get_any_name()] = fileIdx; + + if (!info.dataShape[ov::layout::height_idx(info.layout)]) { + info.dataShape[ov::layout::height_idx(info.layout)] = h; + } + if (!info.dataShape[ov::layout::width_idx(info.layout)]) { + info.dataShape[ov::layout::width_idx(info.layout)] = w; + } + + if (std::any_of(info.dataShape.begin(), info.dataShape.end(), [](size_t d) { + return d == 0; + })) { + throw std::logic_error("Not enough information in shape and image to determine tensor shape " + "automatically. 
Input: " + + item.get_node()->get_friendly_name() + + ", File name: " + namesVector[fileIdx - 1]); + } + + } else if (info.partialShape.is_static()) { + info.dataShape = info.partialShape.get_shape(); + if (data_shapes_map.find(name) != data_shapes_map.end()) { + throw std::logic_error( + "Network's input \"" + name + + "\" is static. Use -shape argument for static inputs instead of -data_shape."); + } + } else if (!data_shapes_map.empty()) { + throw std::logic_error("Can't find network input name \"" + name + "\" in \"-data_shape " + + data_shapes_string + "\" command line parameter"); + } else { + throw std::logic_error("-i or -data_shape command line parameter should be set for all inputs in case " + "of network with dynamic shapes."); + } + + // Update shape with batch if needed (only in static shape case) + // Update blob shape only not affecting network shape to trigger dynamic batch size case + if (batch_size != 0) { + if (ov::layout::has_batch(info.layout)) { + std::size_t batch_index = ov::layout::batch_idx(info.layout); + if (info.dataShape.at(batch_index) != batch_size) { + if (info.partialShape.is_static()) { + info.partialShape[batch_index] = batch_size; + } + info.dataShape[batch_index] = batch_size; + reshape_required = true; + } + } else { + slog::warn << "Input '" << item.get_node()->get_friendly_name() + << "' doesn't have batch dimension in layout. -b option will be ignored for this input." 
+ << slog::endl; + } + } + info_map[name] = info; + } + + // Update scale and mean + std::map> scale_map = parseScaleOrMean(scale_string, info_map); + std::map> mean_map = parseScaleOrMean(mean_string, info_map); + + for (auto& item : info_map) { + if (item.second.isImage()) { + item.second.scale.assign({1, 1, 1}); + item.second.mean.assign({0, 0, 0}); + + if (scale_map.count(item.first)) { + item.second.scale = scale_map.at(item.first); + } + if (mean_map.count(item.first)) { + item.second.mean = mean_map.at(item.first); + } + } + } + + info_maps.push_back(info_map); + } + + return info_maps; +} + +std::vector getInputsInfo(const std::string& shape_string, + const std::string& layout_string, + const size_t batch_size, + const std::string& tensors_shape_string, + const std::map>& fileNames, + const std::string& scale_string, + const std::string& mean_string, + const std::vector>& input_info) { + bool reshape_required = false; + return getInputsInfo(shape_string, + layout_string, + batch_size, + tensors_shape_string, + fileNames, + scale_string, + mean_string, + input_info, + reshape_required); +} + #ifdef USE_OPENCV void dump_config(const std::string& filename, const std::map>& config) { auto plugin_to_opencv_format = [](const std::string& str) -> std::string { @@ -429,3 +709,55 @@ void load_config(const std::string& filename, std::map supported_image_extensions = + {"bmp", "dib", "jpeg", "jpg", "jpe", "jp2", "png", "pbm", "pgm", "ppm", "sr", "ras", "tiff", "tif"}; +#else +const std::vector supported_image_extensions = {"bmp"}; +#endif +const std::vector supported_binary_extensions = {"bin"}; + +std::string getExtension(const std::string& name) { + auto extensionPosition = name.rfind('.', name.size()); + return extensionPosition == std::string::npos ? 
"" : name.substr(extensionPosition + 1, name.size() - 1); +}; + +bool isBinaryFile(const std::string& filePath) { + auto extension = getExtension(filePath); + std::transform(extension.begin(), extension.end(), extension.begin(), ::tolower); + return std::find(supported_binary_extensions.begin(), supported_binary_extensions.end(), extension) != + supported_binary_extensions.end(); +} + +bool isImageFile(const std::string& filePath) { + auto extension = getExtension(filePath); + std::transform(extension.begin(), extension.end(), extension.begin(), ::tolower); + return std::find(supported_binary_extensions.begin(), supported_binary_extensions.end(), extension) != + supported_binary_extensions.end(); +} + +bool containsBinaries(const std::vector& filePaths) { + std::vector filtered; + for (auto& filePath : filePaths) { + auto extension = getExtension(filePath); + std::transform(extension.begin(), extension.end(), extension.begin(), ::tolower); + if (std::find(supported_binary_extensions.begin(), supported_binary_extensions.end(), extension) != + supported_binary_extensions.end()) { + return true; + } + } + return false; +} +std::vector filterFilesByExtensions(const std::vector& filePaths, + const std::vector& extensions) { + std::vector filtered; + for (auto& filePath : filePaths) { + auto extension = getExtension(filePath); + std::transform(extension.begin(), extension.end(), extension.begin(), ::tolower); + if (std::find(extensions.begin(), extensions.end(), extension) != extensions.end()) { + filtered.push_back(filePath); + } + } + return filtered; +} \ No newline at end of file diff --git a/samples/cpp/benchmark_app/utils.hpp b/samples/cpp/benchmark_app/utils.hpp index b6c62ff4c21..8851987d9d9 100644 --- a/samples/cpp/benchmark_app/utils.hpp +++ b/samples/cpp/benchmark_app/utils.hpp @@ -7,12 +7,11 @@ #include #include #include +#include #include #include #include -#include "ngraph/partial_shape.hpp" - typedef std::chrono::high_resolution_clock Time; typedef 
std::chrono::nanoseconds ns; @@ -36,21 +35,20 @@ inline std::string double_to_string(const double number) { namespace benchmark_app { struct InputInfo { - InferenceEngine::Precision precision; - ngraph::PartialShape partialShape; - InferenceEngine::SizeVector dataShape; - std::string layout; - InferenceEngine::Layout originalLayout; + ov::element::Type type; + ov::PartialShape partialShape; + ov::Shape dataShape; + ov::Layout layout; std::vector scale; std::vector mean; bool isImage() const; bool isImageInfo() const; - size_t getDimentionByLayout(char character) const; size_t width() const; size_t height() const; size_t channels() const; size_t batch() const; size_t depth() const; + std::vector fileNames; }; using InputsInfo = std::map; using PartialShapes = std::map; @@ -60,214 +58,81 @@ std::vector parseDevices(const std::string& device_string); uint32_t deviceDefaultDeviceDurationInSeconds(const std::string& device); std::map parseNStreamsValuePerDevice(const std::vector& devices, const std::string& values_string); - -InferenceEngine::Layout getLayoutFromString(const std::string& string_layout); -std::string getShapeString(const InferenceEngine::SizeVector& shape); +size_t getModelInputBatchSize(const ov::Model& model); +std::string getShapeString(const ov::Shape& shape); std::string getShapesString(const benchmark_app::PartialShapes& shapes); -std::string getShapesString(const InferenceEngine::ICNNNetwork::InputShapes& shapes); size_t getBatchSize(const benchmark_app::InputsInfo& inputs_info); std::vector split(const std::string& s, char delim); - std::map> parseScaleOrMean(const std::string& scale_mean, const benchmark_app::InputsInfo& inputs_info); std::vector parsePartialShape(const std::string& partial_shape); -InferenceEngine::SizeVector parseTensorShape(const std::string& data_shape); +ov::Shape parseDataShape(const std::string& dataShapeStr); std::pair> parseInputFiles(const std::string& file_paths_string); std::map> parseInputArguments(const 
std::vector& args); -template -std::map> parseInputParameters(const std::string parameter_string, - const std::map& input_info) { - // Parse parameter string like "[value0]", "[value0][value1]" or "input0[value0][value1],input1[value2][value3]" - // (applied to all inputs) - std::map> return_value; - std::string search_string = parameter_string; - auto start_pos = search_string.find_first_of('['); - auto input_name = search_string.substr(0, start_pos); - while (start_pos != std::string::npos) { - auto end_pos = search_string.find_first_of(']'); - if (end_pos == std::string::npos) - break; - if (start_pos) - input_name = search_string.substr(0, start_pos); - auto input_value = search_string.substr(start_pos + 1, end_pos - start_pos - 1); - if (!input_name.empty()) { - return_value[input_name].push_back(input_value); - } else { - for (auto& item : input_info) { - return_value[item.first].push_back(input_value); - } - } - search_string = search_string.substr(end_pos + 1); - if (search_string.empty() || (search_string.front() != ',' && search_string.front() != '[')) - break; - if (search_string.front() == ',') - search_string = search_string.substr(1); - start_pos = search_string.find_first_of('['); - } - if (!search_string.empty()) - throw std::logic_error("Can't parse input parameter string: " + parameter_string); - return return_value; -} +std::map> parseInputParameters(const std::string& parameter_string, + const ov::ParameterVector& input_info); -template +/// +/// Parses command line data and data obtained from the function and returns configuration of each input +/// +/// command-line shape string +/// command-line layout string +/// command-line batch string +/// command-line tensor_shape string +/// command-line iscale string +/// command-line imean string +/// inputs vector obtained from ov::Model +/// returns true to this parameter if reshape is required +/// vector of benchmark_app::InputsInfo elements. 
+/// Each element is a configuration item for every test configuration case +/// (number of cases is calculated basing on tensor_shape and other parameters). +/// Each element is a map (input_name, configuration) containing data for each input std::vector getInputsInfo(const std::string& shape_string, const std::string& layout_string, const size_t batch_size, const std::string& data_shapes_string, + const std::map>& fileNames, const std::string& scale_string, const std::string& mean_string, - const std::map& input_info, - bool& reshape_required) { - std::map> shape_map = parseInputParameters(shape_string, input_info); - std::map> data_shapes_map = - parseInputParameters(data_shapes_string, input_info); - std::map> layout_map = parseInputParameters(layout_string, input_info); + const std::vector>& input_info, + bool& reshape_required); - size_t min_size = 1, max_size = 1; - if (!data_shapes_map.empty()) { - min_size = std::min_element(data_shapes_map.begin(), - data_shapes_map.end(), - [](std::pair> a, - std::pair> b) { - return a.second.size() < b.second.size() && a.second.size() != 1; - }) - ->second.size(); - - max_size = std::max_element(data_shapes_map.begin(), - data_shapes_map.end(), - [](std::pair> a, - std::pair> b) { - return a.second.size() < b.second.size(); - }) - ->second.size(); - if (min_size != max_size) { - throw std::logic_error( - "Shapes number for every input should be either 1 or should be equal to shapes number of other inputs"); - } - } - - reshape_required = false; - - std::vector info_maps; - - for (size_t i = 0; i < min_size; ++i) { - benchmark_app::InputsInfo info_map; - for (auto& item : input_info) { - benchmark_app::InputInfo info; - auto name = item.first; - auto descriptor = item.second->getTensorDesc(); - // Precision - info.precision = descriptor.getPrecision(); - // Partial Shape - if (shape_map.count(name)) { - std::vector parsed_shape; - if (shape_map.at(name).size() > 1) { - throw std::logic_error( - "shape command line 
parameter doesn't support multiple shapes for one input."); - } - info.partialShape = parsePartialShape(shape_map.at(name)[0]); - reshape_required = true; - } else { - info.partialShape = item.second->getPartialShape(); - } - - if (info.partialShape.is_dynamic() && info.isImage()) { - throw std::logic_error( - "benchmark_app supports only binary and random data as input for dynamic models at this moment."); - } - - // Tensor Shape - if (info.partialShape.is_dynamic() && data_shapes_map.count(name)) { - info.dataShape = parseTensorShape(data_shapes_map.at(name)[i % data_shapes_map.at(name).size()]); - } else if (info.partialShape.is_static()) { - info.dataShape = info.partialShape.get_shape(); - if (data_shapes_map.find(name) != data_shapes_map.end()) { - throw std::logic_error( - "Network's input \"" + name + - "\" is static. Use -shape argument for static inputs instead of -data_shape."); - } - } else if (!data_shapes_map.empty()) { - throw std::logic_error("Can't find network input name \"" + name + "\" in \"-data_shape " + - data_shapes_string + "\" command line parameter"); - } else { - throw std::logic_error( - "data_shape command line parameter should be set in case of network with dynamic shapes."); - } - - // Layout - info.originalLayout = descriptor.getLayout(); - if (layout_map.count(name)) { - if (layout_map.at(name).size() > 1) { - throw std::logic_error( - "layout command line parameter doesn't support multiple layouts for one input."); - } - info.layout = layout_map.at(name)[0]; - std::transform(info.layout.begin(), info.layout.end(), info.layout.begin(), ::toupper); - } else { - std::stringstream ss; - ss << descriptor.getLayout(); - info.layout = ss.str(); - } - // Update shape with batch if needed (only in static shape case) - // Update blob shape only not affecting network shape to trigger dynamic batch size case - if (batch_size != 0) { - std::size_t batch_index = info.layout.find("N"); - if ((batch_index != std::string::npos) && 
(info.dataShape.at(batch_index) != batch_size)) { - if (info.partialShape.is_static()) { - info.partialShape[batch_index] = batch_size; - } - info.dataShape[batch_index] = batch_size; - reshape_required = true; - } - } - info_map[name] = info; - } - - // Update scale and mean - std::map> scale_map = parseScaleOrMean(scale_string, info_map); - std::map> mean_map = parseScaleOrMean(mean_string, info_map); - - for (auto& item : info_map) { - if (item.second.isImage()) { - item.second.scale.assign({1, 1, 1}); - item.second.mean.assign({0, 0, 0}); - - if (scale_map.count(item.first)) { - item.second.scale = scale_map.at(item.first); - } - if (mean_map.count(item.first)) { - item.second.mean = mean_map.at(item.first); - } - } - } - - info_maps.push_back(info_map); - } - - return info_maps; -} - -template +/// +/// Parses command line data and data obtained from the function and returns configuration of each input +/// +/// command-line shape string +/// command-line layout string +/// command-line batch string +/// command-line tensor_shape string +/// command-line iscale string +/// command-line imean string +/// inputs vector obtained from ov::Model +/// returns true to this parameter if reshape is required +/// vector of benchmark_app::InputsInfo elements. +/// Each element is a configuration item for every test configuration case +/// (number of cases is calculated basing on tensor_shape and other parameters). 
+/// Each element is a map (input_name, configuration) containing data for each +/// input std::vector getInputsInfo(const std::string& shape_string, const std::string& layout_string, const size_t batch_size, const std::string& data_shapes_string, + const std::map>& fileNames, const std::string& scale_string, const std::string& mean_string, - const std::map& input_info) { - bool reshape_required = false; - return getInputsInfo(shape_string, - layout_string, - batch_size, - data_shapes_string, - scale_string, - mean_string, - input_info, - reshape_required); -} + const std::vector>& input_info); #ifdef USE_OPENCV void dump_config(const std::string& filename, const std::map>& config); void load_config(const std::string& filename, std::map>& config); #endif + +extern const std::vector supported_image_extensions; +extern const std::vector supported_binary_extensions; + +bool isBinaryFile(const std::string& filePath); +bool isImageFile(const std::string& filePath); +bool containsBinaries(const std::vector& filePaths); +std::vector filterFilesByExtensions(const std::vector& filePaths, + const std::vector& extensions); diff --git a/samples/cpp/common/utils/include/samples/args_helper.hpp b/samples/cpp/common/utils/include/samples/args_helper.hpp index eea28cbe394..6a893dd3ad2 100644 --- a/samples/cpp/common/utils/include/samples/args_helper.hpp +++ b/samples/cpp/common/utils/include/samples/args_helper.hpp @@ -54,3 +54,8 @@ void configurePrePostProcessing(std::shared_ptr& function, const std::string& iml, const std::string& oml, const std::string& ioml); + +//--- API 2.0 ------------------------------------------------------------------------- +void printInputAndOutputsInfo(const ov::Model& network); +void printInputAndOutputsInfoShort(const ov::Model& network); +void processPrecision(const ov::Model& network, const std::string& ip, const std::string& op, const std::string& iop); diff --git a/samples/cpp/common/utils/include/samples/common.hpp 
b/samples/cpp/common/utils/include/samples/common.hpp index c2f37fc8f8d..68450e97c47 100644 --- a/samples/cpp/common/utils/include/samples/common.hpp +++ b/samples/cpp/common/utils/include/samples/common.hpp @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include @@ -572,35 +573,18 @@ static UNUSED bool writeOutputBmp(unsigned char* data, size_t height, size_t wid return true; } -static std::vector> perfCountersSorted( - std::map perfMap) { - using perfItem = std::pair; - std::vector sorted; - for (auto& kvp : perfMap) - sorted.push_back(kvp); - - std::stable_sort(sorted.begin(), sorted.end(), [](const perfItem& l, const perfItem& r) { - return l.second.execution_index < r.second.execution_index; - }); - - return sorted; -} - -static UNUSED void printPerformanceCounts( - const std::map& performanceMap, - std::ostream& stream, - std::string deviceName, - bool bshowHeader = true) { - long long totalTime = 0; +static UNUSED void printPerformanceCounts(const std::map& performanceMap, + std::ostream& stream, + std::string deviceName, + bool bshowHeader = true) { + std::chrono::microseconds totalTime = std::chrono::microseconds::zero(); // Print performance counts if (bshowHeader) { stream << std::endl << "performance counts:" << std::endl << std::endl; } std::ios::fmtflags fmt(std::cout.flags()); - auto performanceMapSorted = perfCountersSorted(performanceMap); - - for (const auto& it : performanceMapSorted) { + for (const auto& it : performanceMap) { std::string toPrint(it.first); const int maxLayerName = 30; @@ -611,38 +595,39 @@ static UNUSED void printPerformanceCounts( stream << std::setw(maxLayerName) << std::left << toPrint; switch (it.second.status) { - case InferenceEngine::InferenceEngineProfileInfo::EXECUTED: + case ov::runtime::ProfilingInfo::Status::EXECUTED: stream << std::setw(15) << std::left << "EXECUTED"; break; - case InferenceEngine::InferenceEngineProfileInfo::NOT_RUN: + case ov::runtime::ProfilingInfo::Status::NOT_RUN: 
stream << std::setw(15) << std::left << "NOT_RUN"; break; - case InferenceEngine::InferenceEngineProfileInfo::OPTIMIZED_OUT: + case ov::runtime::ProfilingInfo::Status::OPTIMIZED_OUT: stream << std::setw(15) << std::left << "OPTIMIZED_OUT"; break; } - stream << std::setw(30) << std::left << "layerType: " + std::string(it.second.layer_type) + " "; - stream << std::setw(20) << std::left << "realTime: " + std::to_string(it.second.realTime_uSec); - stream << std::setw(20) << std::left << "cpu: " + std::to_string(it.second.cpu_uSec); + stream << std::setw(30) << std::left << "layerType: " + std::string(it.second.node_type) + " "; + stream << std::setw(20) << std::left << "realTime: " + std::to_string(it.second.real_time.count()); + stream << std::setw(20) << std::left << "cpu: " + std::to_string(it.second.cpu_time.count()); stream << " execType: " << it.second.exec_type << std::endl; - if (it.second.realTime_uSec > 0) { - totalTime += it.second.realTime_uSec; + if (it.second.real_time.count() > 0) { + totalTime += it.second.real_time; } } - stream << std::setw(20) << std::left << "Total time: " + std::to_string(totalTime) << " microseconds" << std::endl; + stream << std::setw(20) << std::left << "Total time: " + std::to_string(totalTime.count()) << " microseconds" + << std::endl; std::cout << std::endl; std::cout << "Full device name: " << deviceName << std::endl; std::cout << std::endl; std::cout.flags(fmt); } -static UNUSED void printPerformanceCounts(InferenceEngine::InferRequest request, - std::ostream& stream, - std::string deviceName, - bool bshowHeader = true) { - auto performanceMap = request.GetPerformanceCounts(); - printPerformanceCounts(performanceMap, stream, deviceName, bshowHeader); -} +// static UNUSED void printPerformanceCounts(InferenceEngine::InferRequest request, +// std::ostream& stream, +// std::string deviceName, +// bool bshowHeader = true) { +// auto performanceMap = request.GetPerformanceCounts(); +// printPerformanceCounts(performanceMap, 
stream, deviceName, bshowHeader); +//} inline std::map getMapFullDevicesNames(InferenceEngine::Core& ie, std::vector devices) { @@ -679,15 +664,6 @@ inline std::string getFullDeviceName(InferenceEngine::Core& ie, std::string devi } } -inline std::string getFullDeviceName(ov::runtime::Core& ie, std::string device) { - InferenceEngine::Parameter p; - try { - p = ie.get_metric(device, METRIC_KEY(FULL_DEVICE_NAME)); - return p.as(); - } catch (InferenceEngine::Exception&) { - return ""; - } -} /** * @brief This class represents an object that is found by an object detection net */ @@ -1155,3 +1131,68 @@ inline void showAvailableDevices() { * @param comment - lines starting with symbol `comment` are skipped */ std::map parseConfig(const std::string& configName, char comment = '#'); + +//--- API 2.0 -------------------------------------------------------------------------------------- +inline std::string getFullDeviceName(ov::runtime::Core& core, std::string device) { + InferenceEngine::Parameter p; + try { + p = core.get_metric(device, METRIC_KEY(FULL_DEVICE_NAME)); + return p.as(); + } catch (InferenceEngine::Exception&) { + return ""; + } +} + +static UNUSED void printPerformanceCounts(std::vector performanceData, + std::ostream& stream, + std::string deviceName, + bool bshowHeader = true) { + std::chrono::microseconds totalTime = std::chrono::microseconds::zero(); + // Print performance counts + if (bshowHeader) { + stream << std::endl << "performance counts:" << std::endl << std::endl; + } + + for (const auto& it : performanceData) { + std::string toPrint(it.node_name); + const int maxLayerName = 30; + + if (it.node_name.length() >= maxLayerName) { + toPrint = it.node_name.substr(0, maxLayerName - 4); + toPrint += "..."; + } + + stream << std::setw(maxLayerName) << std::left << toPrint; + switch (it.status) { + case ov::runtime::ProfilingInfo::Status::EXECUTED: + stream << std::setw(15) << std::left << "EXECUTED"; + break; + case 
ov::runtime::ProfilingInfo::Status::NOT_RUN: + stream << std::setw(15) << std::left << "NOT_RUN"; + break; + case ov::runtime::ProfilingInfo::Status::OPTIMIZED_OUT: + stream << std::setw(15) << std::left << "OPTIMIZED_OUT"; + break; + } + stream << std::setw(30) << std::left << "layerType: " + std::string(it.node_type) + " "; + stream << std::setw(20) << std::left << "realTime: " + std::to_string(it.real_time.count()); + stream << std::setw(20) << std::left << "cpu: " + std::to_string(it.cpu_time.count()); + stream << " execType: " << it.exec_type << std::endl; + if (it.real_time.count() > 0) { + totalTime += it.real_time; + } + } + stream << std::setw(20) << std::left << "Total time: " + std::to_string(totalTime.count()) << " microseconds" + << std::endl; + std::cout << std::endl; + std::cout << "Full device name: " << deviceName << std::endl; + std::cout << std::endl; +} + +static UNUSED void printPerformanceCounts(ov::runtime::InferRequest request, + std::ostream& stream, + std::string deviceName, + bool bshowHeader = true) { + auto performanceMap = request.get_profiling_info(); + printPerformanceCounts(performanceMap, stream, deviceName, bshowHeader); +} diff --git a/samples/cpp/common/utils/src/args_helper.cpp b/samples/cpp/common/utils/src/args_helper.cpp index 50cd936662d..eb2c331665a 100644 --- a/samples/cpp/common/utils/src/args_helper.cpp +++ b/samples/cpp/common/utils/src/args_helper.cpp @@ -367,6 +367,21 @@ void printInputAndOutputsInfo(const InferenceEngine::CNNNetwork& network) { } } +//--- API 2.0 ------------------------------------------------------------------------------------- +void printInputAndOutputsInfoShort(const ov::Model& network) { + std::cout << "Network inputs:" << std::endl; + for (auto&& param : network.get_parameters()) { + auto l = param->get_layout(); + std::cout << " " << param->get_friendly_name() << " : " << param->get_element_type() << " / " + << param->get_layout().to_string() << std::endl; + } + std::cout << "Network 
outputs:" << std::endl; + for (auto&& result : network.get_results()) { + std::cout << " " << result->get_friendly_name() << " : " << result->get_element_type() << " / " + << result->get_layout().to_string() << std::endl; + } +} + void printInputAndOutputsInfo(const ov::Model& network) { slog::info << "model name: " << network.get_friendly_name() << slog::endl; @@ -530,3 +545,87 @@ void configurePrePostProcessing(std::shared_ptr& model, model = preprocessor.build(); } + +ov::element::Type getPrecision(std::string value, + const std::unordered_map& supported_precisions) { + std::transform(value.begin(), value.end(), value.begin(), ::toupper); + + const auto precision = supported_precisions.find(value); + if (precision == supported_precisions.end()) { + throw std::logic_error("\"" + value + "\"" + " is not a valid precision"); + } + + return precision->second; +} + +ov::element::Type getPrecision2(const std::string& value) { + static const std::unordered_map supported_precisions = { + {"FP32", ov::element::f32}, + {"FP16", ov::element::f16}, + {"BF16", ov::element::bf16}, + {"U64", ov::element::u64}, + {"I64", ov::element::i64}, + {"U32", ov::element::u32}, + {"I32", ov::element::i32}, + {"U16", ov::element::u16}, + {"I16", ov::element::i16}, + {"U8", ov::element::u8}, + {"I8", ov::element::i8}, + {"BOOL", ov::element::boolean}, + }; + + return getPrecision(value, supported_precisions); +} + +void setPrecisions(const ov::Model& network, const std::string& iop) { + const auto user_precisions_map = parseArgMap(iop); + + for (auto&& item : user_precisions_map) { + const auto& layer_name = item.first; + const auto& user_precision = item.second; + + auto& params = network.get_parameters(); + auto& results = network.get_results(); + + const auto input = + std::find_if(params.begin(), params.end(), [&item](const std::shared_ptr& a) { + return a->get_friendly_name() == item.first; + }); + const auto output = + std::find_if(results.begin(), results.end(), [&layer_name](const 
std::shared_ptr& a) { + return a->get_friendly_name() == layer_name; + }); + + if (input != params.end()) { + (*input)->set_element_type(getPrecision2(user_precision)); + } else if (output != results.end()) { + for (int i = 0; i < (*output)->get_output_size(); i++) { + (*output)->set_output_type(i, getPrecision2(user_precision), (*output)->get_output_shape(i)); + } + } else { + throw std::logic_error(layer_name + " is not an input neither output"); + } + } +} + +void processPrecision(const ov::Model& network, const std::string& ip, const std::string& op, const std::string& iop) { + if (!ip.empty()) { + const auto user_precision = getPrecision2(ip); + for (auto&& layer : network.get_parameters()) { + layer->set_element_type(user_precision); + } + } + + if (!op.empty()) { + auto user_precision = getPrecision2(op); + for (auto&& layer : network.get_results()) { + for (int i = 0; i < layer->get_output_size(); i++) { + layer->set_output_type(i, user_precision, layer->get_output_shape(i)); + } + } + } + + if (!iop.empty()) { + setPrecisions(network, iop); + } +} From 50a33436d48eb3221cc487a2eb66f8d68f7893b3 Mon Sep 17 00:00:00 2001 From: Mateusz Tabaka Date: Thu, 30 Dec 2021 18:13:11 +0100 Subject: [PATCH 38/78] Add MatMulMultiplyFusion (#9023) * Add MatMulMultiplyFusion MatMulMultiplyFusion replaces following subgraph: MatMul->Multiply (with const) to following: Multiply->MatMul where Multiply is applied to MatMul's second input. 
--- .../src/pass/insert_movebroadcast.cpp | 18 +- .../matmul_multiply_fusion.hpp | 64 ++++++ .../common_optimizations.cpp | 22 +- .../matmul_multiply_fusion.cpp | 183 +++++++++++++++ .../moc_transformations.cpp | 22 +- .../pull_transpose_through_fq.cpp | 6 +- .../tests/frontend/shared/src/op_fuzzy.cpp | 2 +- .../snippets/movebroadcast.cpp | 25 +-- .../matmul_multiply_fusion.cpp | 177 +++++++++++++++ .../subgraph_tests/matmul_multiply_fusion.cpp | 208 ++++++++++++++++++ .../subgraph_tests/matmul_multiply_fusion.hpp | 18 ++ .../subgraph/matmul_multiply_fusion.hpp | 48 ++++ .../src/subgraph/matmul_multiply_fusion.cpp | 138 ++++++++++++ 13 files changed, 884 insertions(+), 47 deletions(-) create mode 100644 src/common/transformations/include/transformations/common_optimizations/matmul_multiply_fusion.hpp create mode 100644 src/common/transformations/src/transformations/common_optimizations/matmul_multiply_fusion.cpp create mode 100644 src/tests/functional/inference_engine/transformations/matmul_multiply_fusion.cpp create mode 100644 src/tests/functional/plugin/cpu/shared_tests_instances/subgraph_tests/matmul_multiply_fusion.cpp create mode 100644 src/tests/functional/plugin/shared/include/subgraph_tests/matmul_multiply_fusion.hpp create mode 100644 src/tests/functional/shared_test_classes/include/shared_test_classes/subgraph/matmul_multiply_fusion.hpp create mode 100644 src/tests/functional/shared_test_classes/src/subgraph/matmul_multiply_fusion.cpp diff --git a/src/common/snippets/src/pass/insert_movebroadcast.cpp b/src/common/snippets/src/pass/insert_movebroadcast.cpp index 727046a54c9..0800da5266a 100644 --- a/src/common/snippets/src/pass/insert_movebroadcast.cpp +++ b/src/common/snippets/src/pass/insert_movebroadcast.cpp @@ -31,18 +31,18 @@ std::shared_ptr numpy_broadcast_node(const ngraph::Output value.get_shape().size(); + if (!do_broadcast) { + for (size_t index = 0; index < output_shape.size(); ++index) { + if (source_shape.at(index) == 1 && 
output_shape.at(index) != 1) { + do_broadcast = true; + break; + } } } remark(2) << "Insert explicit broadcast " << value.get_node()->get_type_name() - << " " << broadcast_axes << " " << broadcasted_node->get_shape() << " -> " << output_shape << std::endl; + << " " << broadcasted_node->get_shape() << " -> " << output_shape << std::endl; // it shouldn't be a probrem for now since we don't consider StridedSlice and Broadcast here if (auto constant = ngraph::as_type_ptr(broadcasted_node)) { @@ -63,7 +63,7 @@ std::shared_ptr numpy_broadcast_node(const ngraph::Output(broadcasted_node, output_shape); } diff --git a/src/common/transformations/include/transformations/common_optimizations/matmul_multiply_fusion.hpp b/src/common/transformations/include/transformations/common_optimizations/matmul_multiply_fusion.hpp new file mode 100644 index 00000000000..05d8955f52d --- /dev/null +++ b/src/common/transformations/include/transformations/common_optimizations/matmul_multiply_fusion.hpp @@ -0,0 +1,64 @@ +// Copyright (C) 2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include + +#include + +namespace ngraph { +namespace pass { + +class TRANSFORMATIONS_API MatMulMultiplyFusion; + +} // namespace pass +} // namespace ngraph + +/** + * @ingroup ie_transformation_common_api + * @brief MatMulMultiplyFusion transformation matches following graph: + * + * +----------+ +----------+ + * | A | | B | + * +----------+ +----------+ + * | | + * ----------- ---------- + * | | + * v v + * +--------+ + * | MatMul | + * +--------+ + * | + * v + * +----------+ +----------+ + * | Multiply |<----| Constant | + * +----------+ +----------+ + * + * + * and replaces with: + * + * +-------+ +----------+ + * | B | | Constant | + * +-------+ +----------+ + * | | + * ------ ------ + * | | + * v v + * +----------+ +----------+ + * | A | | Multiply | + * +----------+ +----------+ + * | | + * ----------- ---------- + * | | + * v v + * +--------+ + * | MatMul | + * 
+--------+ + */ +class ngraph::pass::MatMulMultiplyFusion: public ngraph::pass::MatcherPass { +public: + NGRAPH_RTTI_DECLARATION; + MatMulMultiplyFusion(); +}; diff --git a/src/common/transformations/src/transformations/common_optimizations/common_optimizations.cpp b/src/common/transformations/src/transformations/common_optimizations/common_optimizations.cpp index a8213292b08..d548824fe06 100644 --- a/src/common/transformations/src/transformations/common_optimizations/common_optimizations.cpp +++ b/src/common/transformations/src/transformations/common_optimizations/common_optimizations.cpp @@ -52,6 +52,7 @@ #include "transformations/common_optimizations/interpolate_sequence_fusion.hpp" #include "transformations/common_optimizations/convert_compression_only_to_legacy.hpp" #include +#include "transformations/common_optimizations/matmul_multiply_fusion.hpp" #include "transformations/op_conversions/bidirectional_sequences_decomposition.hpp" #include "transformations/op_conversions/convert_pad_to_group_conv.hpp" #include "transformations/op_conversions/convert_divide.hpp" @@ -166,16 +167,17 @@ bool ngraph::pass::CommonOptimizations::run_on_model(const std::shared_ptr(); manager.register_pass(); - auto conv_fusions = manager.register_pass(); - conv_fusions->add_matcher(); - conv_fusions->add_matcher(); - conv_fusions->add_matcher(); - conv_fusions->add_matcher(); - conv_fusions->add_matcher(); - conv_fusions->add_matcher(); - conv_fusions->add_matcher(); - conv_fusions->add_matcher(); - conv_fusions->set_name("ngraph::pass::ConvFusions"); + auto multiply_fusions = manager.register_pass(); + multiply_fusions->add_matcher(); + multiply_fusions->add_matcher(); + multiply_fusions->add_matcher(); + multiply_fusions->add_matcher(); + multiply_fusions->add_matcher(); + multiply_fusions->add_matcher(); + multiply_fusions->add_matcher(); + multiply_fusions->add_matcher(); + multiply_fusions->add_matcher(); + multiply_fusions->set_name("ngraph::pass::MultiplyFusions"); 
manager.register_pass(); manager.register_pass(); // not plugins implemented gather8 diff --git a/src/common/transformations/src/transformations/common_optimizations/matmul_multiply_fusion.cpp b/src/common/transformations/src/transformations/common_optimizations/matmul_multiply_fusion.cpp new file mode 100644 index 00000000000..becf2e2d67e --- /dev/null +++ b/src/common/transformations/src/transformations/common_optimizations/matmul_multiply_fusion.cpp @@ -0,0 +1,183 @@ +// Copyright (C) 2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "transformations/common_optimizations/matmul_multiply_fusion.hpp" +#include "transformations/utils/utils.hpp" + +#include +#include +#include +#include "itt.hpp" + +using namespace ngraph; + +NGRAPH_RTTI_DEFINITION(pass::MatMulMultiplyFusion, "MatMulMultiplyFusion", 0); + +static std::shared_ptr fuse_const_to_weights(const std::shared_ptr& matmul, + const Output& weights, + std::shared_ptr mul_const, + const op::AutoBroadcastSpec& autob) { + auto const_shape = mul_const->get_shape(); + auto const_rank = static_cast(const_shape.size()); + const auto& weights_shape = weights.get_shape(); + int64_t weights_rank = static_cast(weights_shape.size()); + + // Fuse if const is a scalar + if (ngraph::is_scalar(const_shape)) { + return std::make_shared(weights, mul_const); + } + + // Disallow consts that have rank greater than weights' rank when MatMul has dynamic rank. 
+ // Or if MatMul result rank is static - disallow constant that extends MatMul rank + const auto& matmul_rank = matmul->get_output_partial_shape(0).rank(); + if (matmul_rank.is_dynamic()) { + if (const_rank > weights_rank) { + return nullptr; + } + } else if (matmul_rank.get_length() < const_rank) { + return nullptr; + } + + // Disallow const with shapes other than (a, b, ..., 1, z) + if (const_rank > 1 && const_shape[const_rank - 2] != 1) { + return nullptr; + } + + // If weights is not a constant node - disallow Multiply constant + // that extends weights rank. This is LPT requirement in case where + // weights are meant to be quantized. + if (const_rank > weights_rank && !ov::is_type(weights.get_node())) { + return nullptr; + } + + auto matmul_casted = std::dynamic_pointer_cast(matmul); + if (!matmul_casted) { + return nullptr; + } + + // Check if const shape matches weights + if (shape_size(const_shape) > 1) { + if (const_shape.back() > 1) { + // Check if const's last dimension matches last weights dimension + if (matmul_casted->get_transpose_b()) { + if (weights_rank > 1 && const_shape.back() != weights_shape[weights_rank - 2]) { + return nullptr; + } + } else if (const_shape.back() != weights_shape.back()) { + return nullptr; + } + } + + // Check if Multiply constant broadcasts MatMul input or weights. + // If it broadcasts both, we're dealing with case like: + // MatMul({1, 1, n, m}, const{1, 1, m, k}) -> mm{1, 1, n, k} + // Multiply(mm{1, 1, n, k}, const{x, y, 1, k}) + // + // After fusion, it'd look like: + // MatMul({1, 1, n, m}, const{x, y, m, k}) -> mm{x, y, n, k} + // In general, x * y elementwise multiples of size {n, k} should be cheaper than x * y matrix multiplies + // of size {n, m} x {m, k}, so the fusion should be disallowed in that case. 
+ if (const_rank > 2) { + bool const_broadcasts_weights = weights_rank < const_rank; + for (int64_t i = 3; i <= const_rank; i++) { + if (const_shape[const_rank - i] != 1) { + const_broadcasts_weights = const_broadcasts_weights || + ((weights_rank - i >= 0) && (weights_shape[weights_rank - i] != const_shape[const_rank - i])); + } + } + bool const_broadcasts_input = true; + const auto& input_shape = matmul->get_input_partial_shape(0); + if (input_shape.rank().is_static()) { + const auto& input_rank = input_shape.rank().get_length(); + const_broadcasts_input = input_rank < const_rank; + for (int64_t i = 3; i <= const_rank; i++) { + if (const_shape[const_rank - i] != 1) { + const_broadcasts_input = const_broadcasts_input || + ((input_rank - i >= 0) && (input_shape[input_rank - i] != const_shape[const_rank - i])); + } + } + } + if (const_broadcasts_input && const_broadcasts_weights) { + return nullptr; + } + } + } + + auto transpose_const = [] (const std::shared_ptr& mul_const) -> std::shared_ptr { + auto const_shape = mul_const->get_shape(); + auto const_rank = const_shape.size(); + if (shape_size(const_shape) == 1 || (const_rank > 1 && const_shape[const_rank - 2] == 1 && const_shape[const_rank - 1] == 1)) { + // Nothing to transpose - constant has shape (..., 1, 1) + return mul_const; + } + std::shared_ptr new_const = mul_const; + // Scalars were fused before, it suffices to check for 1D shape here + if (const_rank == 1) { + const_shape.insert(const_shape.begin(), 1); + new_const = std::make_shared(mul_const, + opset8::Constant::create(element::u64, Shape{const_shape.size()}, const_shape), false); + } + std::vector perm(const_shape.size()); + std::iota(perm.begin(), perm.end(), 0); + std::swap(*(perm.end() - 1), *(perm.end() - 2)); + auto transpose = std::make_shared(new_const, + opset8::Constant::create(element::i64, Shape{perm.size()}, perm)); + return get_constant_from_source(transpose); + }; + + // If weights meant to be transposed - we need to also transpose 
constant + if (matmul_casted->get_transpose_b()) { + auto transpose = transpose_const(mul_const); + if (!transpose) + return nullptr; + return std::make_shared(weights, transpose); + } + return std::make_shared(weights, mul_const); +} + +pass::MatMulMultiplyFusion::MatMulMultiplyFusion() { + MATCHER_SCOPE(MatMulMultiplyFusion); + auto input_pattern = pattern::any_input(); + auto weights_pattern = pattern::any_input(pattern::has_static_shape()); + auto mul_const_pattern = pattern::wrap_type(); + auto matmul_pattern = pattern::wrap_type({input_pattern, weights_pattern}); + auto mul_pattern = pattern::wrap_type({matmul_pattern, mul_const_pattern}); + + matcher_pass_callback callback = [=](pattern::Matcher& m) { + const auto& pattern_map = m.get_pattern_value_map(); + const auto& weights = pattern_map.at(weights_pattern); + auto mul = std::dynamic_pointer_cast(pattern_map.at(mul_pattern).get_node_shared_ptr()); + if (!mul) + return false; + auto mul_const = std::dynamic_pointer_cast(pattern_map.at(mul_const_pattern).get_node_shared_ptr()); + if (!mul_const) + return false; + auto matmul = pattern_map.at(matmul_pattern).get_node_shared_ptr(); + + auto new_weights = fuse_const_to_weights(matmul, weights, mul_const, mul->get_autob()); + if (!new_weights) + return false; + + // Constantfold new weights, only if old weights is a constant node. + // To make sure that subgraphs with e.g. FakeQuantize don't get constant folded here. 
+ if (ov::is_type(weights.get_node())) { + if (auto constant = get_constant_from_source(new_weights)) { + new_weights = constant; + } + } + + const auto& input = pattern_map.at(input_pattern); + auto new_mm = matmul->clone_with_new_inputs({input, new_weights}); + new_mm->set_friendly_name(mul->get_friendly_name()); + + register_new_node(new_mm); + copy_runtime_info({mul, weights.get_node_shared_ptr(), matmul}, {new_weights, new_mm}); + replace_node(mul, new_mm); + + return true; + }; + + auto m = std::make_shared(mul_pattern, matcher_name); + this->register_matcher(m, callback); +} diff --git a/src/common/transformations/src/transformations/common_optimizations/moc_transformations.cpp b/src/common/transformations/src/transformations/common_optimizations/moc_transformations.cpp index e2bb9998125..a954d85935f 100644 --- a/src/common/transformations/src/transformations/common_optimizations/moc_transformations.cpp +++ b/src/common/transformations/src/transformations/common_optimizations/moc_transformations.cpp @@ -57,6 +57,7 @@ #include #include #include +#include NGRAPH_RTTI_DEFINITION(ngraph::pass::MOCTransformations, "MOCTransformations", 0); @@ -164,16 +165,17 @@ bool ngraph::pass::MOCTransformations::run_on_model(const std::shared_ptr(); - auto conv_fusions = manager.register_pass(); - conv_fusions->add_matcher(); - conv_fusions->add_matcher(); - conv_fusions->add_matcher(); - conv_fusions->add_matcher(); - conv_fusions->add_matcher(); - conv_fusions->add_matcher(); - conv_fusions->add_matcher(); - conv_fusions->add_matcher(); - conv_fusions->set_name("ngraph::pass::ConvFusions"); + auto multiply_fusions = manager.register_pass(); + multiply_fusions->add_matcher(); + multiply_fusions->add_matcher(); + multiply_fusions->add_matcher(); + multiply_fusions->add_matcher(); + multiply_fusions->add_matcher(); + multiply_fusions->add_matcher(); + multiply_fusions->add_matcher(); + multiply_fusions->add_matcher(); + multiply_fusions->add_matcher(); + 
multiply_fusions->set_name("ngraph::pass::MultiplyFusions"); manager.register_pass(); diff --git a/src/common/transformations/src/transformations/common_optimizations/pull_transpose_through_fq.cpp b/src/common/transformations/src/transformations/common_optimizations/pull_transpose_through_fq.cpp index d4f61e7e728..1cb067146ac 100644 --- a/src/common/transformations/src/transformations/common_optimizations/pull_transpose_through_fq.cpp +++ b/src/common/transformations/src/transformations/common_optimizations/pull_transpose_through_fq.cpp @@ -11,6 +11,7 @@ #include #include #include +#include #include NGRAPH_RTTI_DEFINITION(ngraph::pass::PullTransposeThroughFQUp, "PullTransposeThroughFQUp", 0); @@ -61,7 +62,10 @@ ngraph::pass::PullTransposeThroughFQUp::PullTransposeThroughFQUp() { opset1::Constant::create(element::i64, Shape{unsqueeze_axes.size()}, unsqueeze_axes)); new_ops.push_back(fq_input.get_node_shared_ptr()); } - fq_input = op::util::make_try_fold(fq_input, transpose->input_value(1)); + fq_input = std::make_shared(fq_input, transpose->input_value(1)); + if (auto constant = get_constant_from_source(fq_input)) { + fq_input = constant; + } ngraph::copy_runtime_info(transpose, fq_input.get_node_shared_ptr()); fq_inputs.push_back(fq_input); } diff --git a/src/core/tests/frontend/shared/src/op_fuzzy.cpp b/src/core/tests/frontend/shared/src/op_fuzzy.cpp index 9b8f2716b03..73c5c76348d 100644 --- a/src/core/tests/frontend/shared/src/op_fuzzy.cpp +++ b/src/core/tests/frontend/shared/src/op_fuzzy.cpp @@ -106,7 +106,7 @@ void FrontEndFuzzyOpTest::runConvertedModel(const std::shared_ptr f(nullptr), f_ref(nullptr); +TEST_F(TransformationTestsF, InsertBroadcastMove) { { auto data0 = std::make_shared(element::f32, Shape{2, 3}); - auto data1 = std::make_shared(element::f32, Shape{1, 3}); + auto data1 = std::make_shared(element::f32, Shape{1, 1, 3}); auto add = std::make_shared(data0, data1); - f = std::make_shared(NodeVector{add}, ParameterVector{data0, data1}); + function = 
std::make_shared(NodeVector{add}, ParameterVector{data0, data1}); - pass::Manager m; - m.register_pass(); - m.register_pass(); - m.run_passes(f); - ASSERT_NO_THROW(check_rt_info(f)); + manager.register_pass(); } { auto data0 = std::make_shared(element::f32, Shape{2, 3}); - auto data1 = std::make_shared(element::f32, Shape{1, 3}); - auto move = std::make_shared(data1, data0->output(0).get_shape()); - auto add = std::make_shared(data0, move); - f_ref = std::make_shared(NodeVector{add}, ParameterVector{data0, data1}); + auto data1 = std::make_shared(element::f32, Shape{1, 1, 3}); + auto move0 = std::make_shared(data0, Shape{1, 2, 3}); + auto move1 = std::make_shared(data1, Shape{1, 2, 3}); + auto add = std::make_shared(move0, move1); + function_ref = std::make_shared(NodeVector{add}, ParameterVector{data0, data1}); } - - auto res = compare_functions(f, f_ref); - ASSERT_TRUE(res.first) << res.second; } diff --git a/src/tests/functional/inference_engine/transformations/matmul_multiply_fusion.cpp b/src/tests/functional/inference_engine/transformations/matmul_multiply_fusion.cpp new file mode 100644 index 00000000000..88a898e934b --- /dev/null +++ b/src/tests/functional/inference_engine/transformations/matmul_multiply_fusion.cpp @@ -0,0 +1,177 @@ +// Copyright (C) 2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include + +#include +#include +#include +#include + +#include "common_test_utils/ngraph_test_utils.hpp" + +using namespace ngraph; + +TEST_F(TransformationTestsF, MatMulMultiplyFusionConstantWeightsScalarConstant) { + { + auto data = std::make_shared(element::f32, Shape{4, 3}); + auto weights = opset8::Constant::create(element::f32, Shape{3, 2}, {1, 2, 3, 4, 5, 6}); + auto matmul = std::make_shared(data, weights); + auto mul_const = opset8::Constant::create(element::f32, Shape{}, {2}); + auto mul = std::make_shared(matmul, mul_const); + function = std::make_shared(NodeVector{mul}, ParameterVector{data}); + + manager.register_pass(); + } + + { 
+ auto data = std::make_shared(element::f32, Shape{4, 3}); + auto weights = opset8::Constant::create(element::f32, Shape{3, 2}, {2, 4, 6, 8, 10, 12}); + auto matmul = std::make_shared(data, weights); + function_ref = std::make_shared(NodeVector{matmul}, ParameterVector{data}); + } + comparator.enable(FunctionsComparator::CmpValues::CONST_VALUES); +} + +TEST_F(TransformationTestsF, MatMulMultiplyFusionConstantWeightsNonScalarConstant) { + { + auto data = std::make_shared(element::f32, Shape{1, 2, 4, 3}); + auto weights = opset8::Constant::create(element::f32, Shape{3, 2}, {1, 2, 3, 4, 5, 6}); + auto matmul = std::make_shared(data, weights); + auto mul_const = opset8::Constant::create(element::f32, Shape{1, 1, 1, 2}, {2, 3}); + auto mul = std::make_shared(matmul, mul_const); + function = std::make_shared(NodeVector{mul}, ParameterVector{data}); + + manager.register_pass(); + } + + { + auto data = std::make_shared(element::f32, Shape{1, 2, 4, 3}); + auto weights = opset8::Constant::create(element::f32, Shape{1, 1, 3, 2}, {2, 6, 6, 12, 10, 18}); + auto matmul = std::make_shared(data, weights); + function_ref = std::make_shared(NodeVector{matmul}, ParameterVector{data}); + } + comparator.enable(FunctionsComparator::CmpValues::CONST_VALUES); +} + +TEST_F(TransformationTestsF, MatMulMultiplyFusionConstantTransposedWeightsNonScalarConstant) { + { + auto data = std::make_shared(element::f32, Shape{1, 2, 4, 3}); + auto weights = opset8::Constant::create(element::f32, Shape{2, 3}, {1, 2, 3, 4, 5, 6}); + auto matmul = std::make_shared(data, weights, false, true); + auto mul_const = opset8::Constant::create(element::f32, Shape{1, 1, 1, 2}, {2, 3}); + auto mul = std::make_shared(matmul, mul_const); + function = std::make_shared(NodeVector{mul}, ParameterVector{data}); + + manager.register_pass(); + } + + { + auto data = std::make_shared(element::f32, Shape{1, 2, 4, 3}); + auto weights = opset8::Constant::create(element::f32, Shape{1, 1, 2, 3}, {2, 4, 6, 12, 15, 18}); + auto 
matmul = std::make_shared(data, weights, false, true); + function_ref = std::make_shared(NodeVector{matmul}, ParameterVector{data}); + } + comparator.enable(FunctionsComparator::CmpValues::CONST_VALUES); +} + +TEST_F(TransformationTestsF, MatMulMultiplyFusionNonConstantTransposedWeightsNonScalarConstant) { + { + auto data = std::make_shared(element::f32, Shape{2, 3}); + auto weights = std::make_shared(element::f32, Shape{2, 3}); + auto matmul = std::make_shared(data, weights, false, true); + auto mul_const = opset8::Constant::create(element::f32, Shape{1, 2}, {4, 5}); + auto mul = std::make_shared(matmul, mul_const); + function = std::make_shared(NodeVector{mul}, ParameterVector{data, weights}); + + manager.register_pass(); + } + + { + auto data = std::make_shared(element::f32, Shape{2, 3}); + auto weights = std::make_shared(element::f32, Shape{2, 3}); + auto mul_const = opset8::Constant::create(element::f32, Shape{2, 1}, {4, 5}); + auto mul = std::make_shared(weights, mul_const); + auto matmul = std::make_shared(data, mul, false, true); + function_ref = std::make_shared(NodeVector{matmul}, ParameterVector{data, weights}); + } + comparator.enable(FunctionsComparator::CmpValues::CONST_VALUES); +} + +using MatMulMultiplyFusionParams = std::tuple; + +class MatMulMultiplyFusionDynamicShapes + : public testing::WithParamInterface, + public TransformationTestsF { +}; + +TEST_P(MatMulMultiplyFusionDynamicShapes, FusionTest) { + auto params = GetParam(); + const auto& input_shape = std::get<0>(params); + const auto& weights_shape = std::get<1>(params); + bool transpose_b = std::get<2>(params); + const auto& const_shape = std::get<3>(params); + const auto& new_weights_shape = std::get<4>(params); + bool can_fuse = std::get<5>(params); + + { + auto data = std::make_shared(element::f32, input_shape); + auto weights = opset8::Constant::create(element::f32, weights_shape, {2}); + auto matmul = std::make_shared(data, weights, false, transpose_b); + auto mul_const = 
opset8::Constant::create(element::f32, const_shape, {4}); + auto mul = std::make_shared(matmul, mul_const); + function = std::make_shared(NodeVector{mul}, ParameterVector{data}); + + manager.register_pass(); + } + + if (can_fuse) { + auto data = std::make_shared(element::f32, input_shape); + auto weights = opset8::Constant::create(element::f32, new_weights_shape, {8}); + auto matmul = std::make_shared(data, weights, false, transpose_b); + function_ref = std::make_shared(NodeVector{matmul}, ParameterVector{data}); + } + comparator.enable(FunctionsComparator::CmpValues::CONST_VALUES); +} + +std::vector params = { + MatMulMultiplyFusionParams(PartialShape::dynamic(), {2, 3}, false, {}, {2, 3}, true), + MatMulMultiplyFusionParams(PartialShape::dynamic(), {2, 3}, false, {1}, {2, 3}, true), + MatMulMultiplyFusionParams(PartialShape::dynamic(), {2, 3}, false, {1, 3}, {2, 3}, true), + MatMulMultiplyFusionParams(PartialShape::dynamic(), {3, 2}, true, {1, 3}, {3, 2}, true), + MatMulMultiplyFusionParams(PartialShape::dynamic(), {4, 2, 3}, false, {1, 3}, {4, 2, 3}, true), + MatMulMultiplyFusionParams(PartialShape::dynamic(), {4, 3, 2}, true, {1, 3}, {4, 3, 2}, true), + MatMulMultiplyFusionParams(PartialShape::dynamic(), {4, 2, 3}, false, {1, 1, 3}, {4, 2, 3}, true), + MatMulMultiplyFusionParams(PartialShape::dynamic(), {4, 3, 2}, true, {1, 1, 3}, {4, 3, 2}, true), + MatMulMultiplyFusionParams(PartialShape::dynamic(), {4, 2, 3}, false, {4, 1, 3}, {4, 2, 3}, true), + MatMulMultiplyFusionParams(PartialShape::dynamic(), {4, 3, 2}, true, {4, 1, 3}, {4, 3, 2}, true), + MatMulMultiplyFusionParams(PartialShape::dynamic(), {4, 3, 2, 3}, false, {4, 3, 1, 3}, {4, 3, 2, 3}, true), + MatMulMultiplyFusionParams(PartialShape::dynamic(), {4, 3, 3, 2}, true, {4, 3, 1, 3}, {4, 3, 3, 2}, true), + MatMulMultiplyFusionParams(PartialShape::dynamic(2), {2, 3}, false, {1, 3}, {2, 3}, true), + MatMulMultiplyFusionParams(PartialShape::dynamic(2), {3, 2}, true, {1, 3}, {3, 2}, true), + 
MatMulMultiplyFusionParams(PartialShape::dynamic(4), {2, 3}, false, {1, 1, 1, 3}, {1, 1, 2, 3}, true), + MatMulMultiplyFusionParams(PartialShape::dynamic(4), {3, 2}, true, {1, 1, 1, 3}, {1, 1, 3, 2}, true), + MatMulMultiplyFusionParams(PartialShape::dynamic(4), {2, 3}, false, {1, 1, 3}, {1, 2, 3}, true), + MatMulMultiplyFusionParams(PartialShape::dynamic(4), {3, 2}, true, {1, 1, 3}, {1, 3, 2}, true), + MatMulMultiplyFusionParams(PartialShape::dynamic(4), {4, 3, 2, 3}, false, {1}, {4, 3, 2, 3}, true), + MatMulMultiplyFusionParams(PartialShape::dynamic(4), {4, 3, 2, 3}, false, {1, 3}, {4, 3, 2, 3}, true), + MatMulMultiplyFusionParams(PartialShape::dynamic(4), {4, 3, 3, 2}, true, {1, 3}, {4, 3, 3, 2}, true), + MatMulMultiplyFusionParams(PartialShape::dynamic(4), {4, 3, 2, 3}, false, {1, 1, 3}, {4, 3, 2, 3}, true), + MatMulMultiplyFusionParams(PartialShape::dynamic(4), {4, 3, 3, 2}, true, {1, 1, 3}, {4, 3, 3, 2}, true), + MatMulMultiplyFusionParams(PartialShape::dynamic(4), {4, 3, 2, 3}, false, {1, 1, 1, 3}, {4, 3, 2, 3}, true), + MatMulMultiplyFusionParams(PartialShape::dynamic(4), {4, 3, 3, 2}, true, {1, 1, 1, 3}, {4, 3, 3, 2}, true), + MatMulMultiplyFusionParams(PartialShape::dynamic(4), {4, 3, 2, 3}, false, {4, 1, 1, 3}, {4, 3, 2, 3}, true), + MatMulMultiplyFusionParams(PartialShape::dynamic(4), {4, 3, 3, 2}, true, {1, 3, 1, 3}, {4, 3, 3, 2}, true), + MatMulMultiplyFusionParams({2, Dimension::dynamic(), Dimension::dynamic(), Dimension::dynamic()}, {2, 3}, false, {2, 1, 1, 3}, {2, 1, 2, 3}, true), + MatMulMultiplyFusionParams({Dimension::dynamic(), 3, Dimension::dynamic(), Dimension::dynamic()}, {2, 3}, false, {1, 3, 1, 3}, {1, 3, 2, 3}, true), + MatMulMultiplyFusionParams({2, 3, Dimension::dynamic(), Dimension::dynamic()}, {2, 3}, false, {2, 3, 1, 3}, {2, 3, 2, 3}, true), + // negative cases + MatMulMultiplyFusionParams(PartialShape::dynamic(), {2, 3}, false, {1, 1, 1}, {}, false), + MatMulMultiplyFusionParams(PartialShape::dynamic(2), {2, 3}, false, {1, 1, 1}, {}, 
false), + MatMulMultiplyFusionParams(PartialShape::dynamic(), {1, 2, 3}, false, {3, 1, 3}, {}, false), + MatMulMultiplyFusionParams(PartialShape::dynamic(3), {1, 2, 3}, false, {3, 1, 3}, {}, false), + MatMulMultiplyFusionParams({1, 1, Dimension::dynamic(), Dimension::dynamic()}, {2, 3}, false, {2, 3, 1, 3}, {}, false), +}; + +INSTANTIATE_TEST_SUITE_P(TransformationTests, MatMulMultiplyFusionDynamicShapes, ::testing::ValuesIn(params)); diff --git a/src/tests/functional/plugin/cpu/shared_tests_instances/subgraph_tests/matmul_multiply_fusion.cpp b/src/tests/functional/plugin/cpu/shared_tests_instances/subgraph_tests/matmul_multiply_fusion.cpp new file mode 100644 index 00000000000..6755cb12a6e --- /dev/null +++ b/src/tests/functional/plugin/cpu/shared_tests_instances/subgraph_tests/matmul_multiply_fusion.cpp @@ -0,0 +1,208 @@ +// Copyright (C) 2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "subgraph_tests/matmul_multiply_fusion.hpp" + +using namespace SubgraphTestsDefinitions; + +namespace { +std::vector shape_params = { + {{2, 2}, {2, 2}, false, {}}, + {{2, 2}, {2, 2}, false, {1}}, + {{2, 2}, {2, 2}, false, {1, 2}}, + {{2, 2}, {2, 2}, true, {1, 2}}, + {{5}, {5}, false, {}}, + {{5}, {5, 1}, false, {}}, + {{5}, {5, 1}, false, {1}}, + {{5}, {5, 3}, false, {3}}, + {{5}, {3, 5}, true, {3}}, + {{5, 10}, {10, 7}, false, {}}, + {{5, 10}, {7, 10}, true, {}}, + {{5, 10}, {10, 7}, false, {7}}, + {{5, 10}, {7, 10}, true, {7}}, + {{5, 10}, {10, 7}, false, {1, 7}}, + {{5, 10}, {7, 10}, true, {1, 7}}, + {{5, 10}, {2, 10, 7}, false, {2, 1, 7}}, + {{5, 10}, {2, 7, 10}, true, {2, 1, 7}}, + {{5, 10}, {2, 3, 10, 7}, false, {7}}, + {{5, 10}, {2, 3, 7, 10}, true, {7}}, + {{5, 10}, {2, 3, 10, 7}, false, {1, 7}}, + {{5, 10}, {2, 3, 7, 10}, true, {1, 7}}, + {{5, 10}, {2, 3, 10, 7}, false, {1, 1, 7}}, + {{5, 10}, {2, 3, 7, 10}, true, {1, 1, 7}}, + {{5, 10}, {2, 3, 10, 7}, false, {1, 1, 1, 7}}, + {{5, 10}, {2, 3, 7, 10}, true, {1, 1, 1, 7}}, + {{5, 10}, {2, 3, 
10, 7}, false, {1, 3, 1, 7}}, + {{5, 10}, {2, 3, 7, 10}, true, {1, 3, 1, 7}}, + {{5, 10}, {2, 3, 10, 7}, false, {2, 3, 1, 7}}, + {{5, 10}, {2, 3, 7, 10}, true, {2, 3, 1, 7}}, + {{5, 10}, {10}, false, {}}, + {{5, 10}, {10}, false, {1}}, + {{2, 3, 5, 10}, {10, 7}, false, {1}}, + {{2, 3, 5, 10}, {7, 10}, true, {1}}, + {{2, 3, 5, 10}, {10, 7}, false, {1, 1, 7}}, + {{2, 3, 5, 10}, {7, 10}, true, {1, 1, 7}}, + {{2, 3, 5, 10}, {10, 7}, false, {1, 1, 1, 7}}, + {{2, 3, 5, 10}, {7, 10}, true, {1, 1, 1, 7}}, + {{2, 3, 5, 10}, {10, 7}, false, {2, 3, 1, 7}}, + {{2, 3, 5, 10}, {7, 10}, true, {2, 3, 1, 7}}, + {{1, 1, 5, 10}, {10, 7}, false, {1, 1, 1, 7}}, + {{1, 1, 5, 10}, {7, 10}, true, {1, 1, 1, 7}}, + {{2, 3, 5, 10}, {3, 10, 7}, false, {7}}, + {{2, 3, 5, 10}, {3, 7, 10}, true, {7}}, + {{2, 3, 5, 10}, {3, 10, 7}, false, {1, 1, 1, 7}}, + {{2, 3, 5, 10}, {3, 7, 10}, true, {1, 1, 1, 7}}, + {{2, 3, 5, 10}, {2, 3, 10, 7}, false, {}}, + {{2, 3, 5, 10}, {2, 3, 7, 10}, true, {}}, + {{2, 3, 5, 10}, {2, 3, 10, 7}, false, {7}}, + {{2, 3, 5, 10}, {2, 3, 7, 10}, true, {7}}, + {{2, 3, 5, 10}, {2, 3, 10, 7}, false, {1, 7}}, + {{2, 3, 5, 10}, {2, 3, 7, 10}, true, {1, 7}}, + {{2, 3, 5, 10}, {2, 3, 10, 7}, false, {1, 1, 1, 7}}, + {{2, 3, 5, 10}, {2, 3, 7, 10}, true, {1, 1, 1, 7}}, + {{2, 3, 5, 10}, {2, 3, 10, 7}, false, {1, 3, 1, 7}}, + {{2, 3, 5, 10}, {2, 3, 7, 10}, true, {1, 3, 1, 7}}, + {{2, 3, 5, 10}, {2, 3, 10, 7}, false, {2, 3, 1, 7}}, + {{2, 3, 5, 10}, {2, 3, 7, 10}, true, {2, 3, 1, 7}}, +}; + +INSTANTIATE_TEST_SUITE_P(smoke_MatMulMultiplyFusion, MatMulMultiplyFusion, + ::testing::Combine( + ::testing::ValuesIn(shape_params), + ::testing::Values(true), // can be fused + ::testing::Values(CommonTestUtils::DEVICE_CPU)), + MatMulMultiplyFusion::getTestCaseName); + +std::vector negative_shape_params = { + {{5}, {5}, false, {1}}, + {{5}, {5}, false, {5}}, + {{5}, {5}, false, {5, 1}}, + {{5}, {5, 3}, false, {1, 3}}, + {{2, 2}, {2, 2}, false, {2, 2}}, + {{2, 2}, {2, 2}, true, {2, 2}}, + {{5, 5}, 
{5, 5}, false, {5, 5}}, + {{5, 5}, {5, 5}, true, {5, 5}}, + {{5, 10}, {10}, false, {5, 1}}, + {{5, 10}, {10, 7}, false, {5, 7}}, + {{5, 10}, {7, 10}, true, {5, 7}}, + {{5, 10}, {10, 5}, false, {5, 5}}, + {{5, 10}, {5, 10}, true, {5, 5}}, + {{1, 1, 5, 10}, {10, 7}, false, {2, 3, 1, 7}}, + {{1, 1, 5, 10}, {7, 10}, true, {2, 3, 1, 7}}, + {{1, 1, 5, 10}, {1, 1, 10, 7}, false, {2, 3, 1, 7}}, + {{1, 1, 5, 10}, {1, 1, 7, 10}, true, {2, 3, 1, 7}}, + {{2, 1, 5, 10}, {1, 1, 10, 7}, false, {2, 3, 1, 7}}, + {{1, 1, 5, 10}, {1, 3, 10, 7}, false, {2, 3, 1, 7}}, + {{2, 3, 5, 10}, {10}, false, {5}}, + {{2, 3, 5, 10}, {10, 7}, false, {2, 3, 5, 7}}, + {{2, 3, 5, 10}, {7, 10}, true, {2, 3, 5, 7}}, + {{2, 3, 5, 10}, {3, 10, 7}, false, {5, 7}}, + {{2, 3, 5, 10}, {3, 7, 10}, true, {5, 7}}, + {{2, 3, 5, 10}, {3, 10, 7}, false, {1, 3, 5, 7}}, + {{2, 3, 5, 10}, {3, 7, 10}, true, {1, 3, 5, 7}}, + {{2, 3, 5, 10}, {3, 10, 7}, false, {2, 3, 5, 7}}, + {{2, 3, 5, 10}, {3, 7, 10}, true, {2, 3, 5, 7}}, + {{2, 3, 5, 10}, {2, 3, 10, 7}, false, {2, 3, 5, 7}}, + {{2, 3, 5, 10}, {2, 3, 7, 10}, true, {2, 3, 5, 7}}, + {{2, 3, 5, 10}, {2, 3, 10, 7}, false, {1, 1, 1, 1, 7}}, +}; + +INSTANTIATE_TEST_SUITE_P(smoke_NegativeMatMulMultiplyFusion, MatMulMultiplyFusion, + ::testing::Combine( + ::testing::ValuesIn(negative_shape_params), + ::testing::Values(false), // cannot be fused + ::testing::Values(CommonTestUtils::DEVICE_CPU)), + MatMulMultiplyFusion::getTestCaseName); + +std::vector shape_params2 = { + {{2, 2}, {2, 2}, false, {}}, + {{2, 2}, {2, 2}, false, {1}}, + {{2, 2}, {2, 2}, false, {1, 2}}, + {{2, 2}, {2, 2}, true, {1, 2}}, + {{5, 10}, {10, 7}, false, {}}, + {{5, 10}, {7, 10}, true, {}}, + {{5, 10}, {10, 7}, false, {7}}, + {{5, 10}, {7, 10}, true, {7}}, + {{5, 10}, {10, 7}, false, {1, 7}}, + {{5, 10}, {7, 10}, true, {1, 7}}, + {{5, 10}, {2, 10, 7}, false, {2, 1, 7}}, + {{5, 10}, {2, 7, 10}, true, {2, 1, 7}}, + {{5, 10}, {2, 3, 10, 7}, false, {7}}, + {{5, 10}, {2, 3, 7, 10}, true, {7}}, + {{5, 10}, {2, 
3, 10, 7}, false, {1, 7}}, + {{5, 10}, {2, 3, 7, 10}, true, {1, 7}}, + {{5, 10}, {2, 3, 10, 7}, false, {1, 1, 7}}, + {{5, 10}, {2, 3, 7, 10}, true, {1, 1, 7}}, + {{5, 10}, {2, 3, 10, 7}, false, {1, 1, 1, 7}}, + {{5, 10}, {2, 3, 7, 10}, true, {1, 1, 1, 7}}, + {{5, 10}, {2, 3, 10, 7}, false, {1, 3, 1, 7}}, + {{5, 10}, {2, 3, 7, 10}, true, {1, 3, 1, 7}}, + {{5, 10}, {2, 3, 10, 7}, false, {2, 3, 1, 7}}, + {{5, 10}, {2, 3, 7, 10}, true, {2, 3, 1, 7}}, + {{2, 3, 5, 10}, {10, 7}, false, {1}}, + {{2, 3, 5, 10}, {10, 7}, false, {1, 1}}, + {{2, 3, 5, 10}, {3, 10, 7}, false, {7}}, + {{2, 3, 5, 10}, {3, 7, 10}, true, {7}}, + {{2, 3, 5, 10}, {2, 3, 10, 7}, false, {}}, + {{2, 3, 5, 10}, {2, 3, 7, 10}, true, {}}, + {{2, 3, 5, 10}, {2, 3, 10, 7}, false, {7}}, + {{2, 3, 5, 10}, {2, 3, 7, 10}, true, {7}}, + {{2, 3, 5, 10}, {2, 3, 10, 7}, false, {1, 7}}, + {{2, 3, 5, 10}, {2, 3, 7, 10}, true, {1, 7}}, + {{2, 3, 5, 10}, {2, 3, 10, 7}, false, {1, 1, 1, 7}}, + {{2, 3, 5, 10}, {2, 3, 7, 10}, true, {1, 1, 1, 7}}, + {{2, 3, 5, 10}, {2, 3, 10, 7}, false, {1, 3, 1, 7}}, + {{2, 3, 5, 10}, {2, 3, 7, 10}, true, {1, 3, 1, 7}}, + {{2, 3, 5, 10}, {2, 3, 10, 7}, false, {2, 3, 1, 7}}, + {{2, 3, 5, 10}, {2, 3, 7, 10}, true, {2, 3, 1, 7}}, +}; + +INSTANTIATE_TEST_SUITE_P(smoke_QuantizedMatMulMultiplyFusion, QuantizedMatMulMultiplyFusion, + ::testing::Combine( + ::testing::ValuesIn(shape_params2), + ::testing::Values(true), // can be fused + ::testing::Values(CommonTestUtils::DEVICE_CPU)), + QuantizedMatMulMultiplyFusion::getTestCaseName); + +std::vector negative_shape_params2 = { + {{2, 2}, {2, 2}, false, {2, 2}}, + {{2, 2}, {2, 2}, true, {2, 2}}, + {{5, 5}, {5, 5}, false, {5, 5}}, + {{5, 5}, {5, 5}, true, {5, 5}}, + {{5, 10}, {10, 7}, false, {5, 7}}, + {{5, 10}, {7, 10}, true, {5, 7}}, + {{5, 10}, {10, 5}, false, {5, 5}}, + {{5, 10}, {5, 10}, true, {5, 5}}, + {{2, 3, 5, 10}, {10, 7}, false, {1, 1, 1}}, + {{1, 1, 5, 10}, {10, 7}, false, {2, 3, 1, 7}}, + {{1, 1, 5, 10}, {7, 10}, true, {2, 3, 1, 7}}, + 
{{2, 3, 5, 10}, {10, 7}, false, {2, 3, 5, 7}}, + {{2, 3, 5, 10}, {7, 10}, true, {2, 3, 5, 7}}, + {{2, 3, 5, 10}, {10, 7}, false, {1, 1, 1, 1}}, + {{2, 3, 5, 10}, {10, 7}, false, {1, 1, 7}}, + {{2, 3, 5, 10}, {7, 10}, true, {1, 1, 7}}, + {{2, 3, 5, 10}, {10, 7}, false, {1, 1, 1, 7}}, + {{2, 3, 5, 10}, {7, 10}, true, {1, 1, 1, 7}}, + {{2, 3, 5, 10}, {10, 7}, false, {2, 3, 1, 7}}, + {{2, 3, 5, 10}, {7, 10}, true, {2, 3, 1, 7}}, + {{1, 1, 5, 10}, {10, 7}, false, {1, 1, 1, 7}}, + {{1, 1, 5, 10}, {7, 10}, true, {1, 1, 1, 7}}, + {{2, 3, 5, 10}, {3, 10, 7}, false, {5, 7}}, + {{2, 3, 5, 10}, {3, 7, 10}, true, {5, 7}}, + {{2, 3, 5, 10}, {3, 10, 7}, false, {1, 1, 1, 7}}, + {{2, 3, 5, 10}, {3, 7, 10}, true, {1, 1, 1, 7}}, + {{2, 3, 5, 10}, {3, 10, 7}, false, {1, 3, 5, 7}}, + {{2, 3, 5, 10}, {3, 7, 10}, true, {1, 3, 5, 7}}, + {{2, 3, 5, 10}, {3, 10, 7}, false, {2, 3, 5, 7}}, + {{2, 3, 5, 10}, {3, 7, 10}, true, {2, 3, 5, 7}}, +}; + +INSTANTIATE_TEST_SUITE_P(smoke_NegativeQuantizedMatMulMultiplyFusion, QuantizedMatMulMultiplyFusion, + ::testing::Combine( + ::testing::ValuesIn(negative_shape_params2), + ::testing::Values(false), // cannot be fused + ::testing::Values(CommonTestUtils::DEVICE_CPU)), + QuantizedMatMulMultiplyFusion::getTestCaseName); + +} // namespace diff --git a/src/tests/functional/plugin/shared/include/subgraph_tests/matmul_multiply_fusion.hpp b/src/tests/functional/plugin/shared/include/subgraph_tests/matmul_multiply_fusion.hpp new file mode 100644 index 00000000000..fb1c2ff3e79 --- /dev/null +++ b/src/tests/functional/plugin/shared/include/subgraph_tests/matmul_multiply_fusion.hpp @@ -0,0 +1,18 @@ +// Copyright (C) 2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "shared_test_classes/subgraph/matmul_multiply_fusion.hpp" + +namespace SubgraphTestsDefinitions { + +TEST_P(MatMulMultiplyFusion, CompareWithRefs) { + Run(); +} + +TEST_P(QuantizedMatMulMultiplyFusion, CompareWithRefs) { + Run(); +} +} // namespace 
SubgraphTestsDefinitions diff --git a/src/tests/functional/shared_test_classes/include/shared_test_classes/subgraph/matmul_multiply_fusion.hpp b/src/tests/functional/shared_test_classes/include/shared_test_classes/subgraph/matmul_multiply_fusion.hpp new file mode 100644 index 00000000000..d4c76e2278b --- /dev/null +++ b/src/tests/functional/shared_test_classes/include/shared_test_classes/subgraph/matmul_multiply_fusion.hpp @@ -0,0 +1,48 @@ +// Copyright (C) 2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include +#include "shared_test_classes/base/layer_test_utils.hpp" +#include + +namespace SubgraphTestsDefinitions { + +struct MatMulMultiplyFusionShapeParams { + ngraph::Shape input_shape; + ngraph::Shape weights_shape; + bool trans_b; + ngraph::Shape const_shape; +}; + +typedef std::tuple< + MatMulMultiplyFusionShapeParams, + bool, // whether Mul can be fused to MatMul in this case + std::string // Device name + > MatMulMultiplyFusionParams; + +class MatMulMultiplyFusion + : public testing::WithParamInterface, + virtual public LayerTestsUtils::LayerTestsCommon { +public: + static std::string getTestCaseName(const testing::TestParamInfo &obj); + +protected: + void SetUp() override; +}; + +class QuantizedMatMulMultiplyFusion + : public testing::WithParamInterface, + virtual public LayerTestsUtils::LayerTestsCommon { +public: + static std::string getTestCaseName(const testing::TestParamInfo &obj); + +protected: + void SetUp() override; + void TearDown() override; +}; + +} // namespace SubgraphTestsDefinitions diff --git a/src/tests/functional/shared_test_classes/src/subgraph/matmul_multiply_fusion.cpp b/src/tests/functional/shared_test_classes/src/subgraph/matmul_multiply_fusion.cpp new file mode 100644 index 00000000000..52c08ca4398 --- /dev/null +++ b/src/tests/functional/shared_test_classes/src/subgraph/matmul_multiply_fusion.cpp @@ -0,0 +1,138 @@ +// Copyright (C) 2021 Intel Corporation +// SPDX-License-Identifier: 
Apache-2.0 +// + +#include "transformations/common_optimizations/matmul_multiply_fusion.hpp" +#include "shared_test_classes/subgraph/matmul_multiply_fusion.hpp" +#include "ngraph_functions/builders.hpp" +#include + +namespace SubgraphTestsDefinitions { + +using namespace ngraph; + +std::string MatMulMultiplyFusion::getTestCaseName(const testing::TestParamInfo &obj) { + MatMulMultiplyFusionShapeParams shape_params; + std::string device; + std::tie(shape_params, std::ignore, device) = obj.param; + std::ostringstream results; + + results << "input=" << shape_params.input_shape << "_"; + results << "weights=" << shape_params.weights_shape << "_"; + results << "transB=" << std::boolalpha << shape_params.trans_b << "_"; + results << "const=" << shape_params.const_shape << "_"; + results << "dev=" << device; + return results.str(); +} + +void MatMulMultiplyFusion::SetUp() { + MatMulMultiplyFusionShapeParams shape_params; + element::Type precision = element::f32; + bool can_be_fused; + std::tie(shape_params, can_be_fused, targetDevice) = GetParam(); + + const auto& input_shape = shape_params.input_shape; + const auto& weights_shape = shape_params.weights_shape; + const auto& const_shape = shape_params.const_shape; + + auto param = std::make_shared(precision, input_shape); + auto weights = opset8::Constant::create(precision, weights_shape, {0.5}); + auto matmul = std::make_shared(param, weights, false, shape_params.trans_b); + auto mul_const = opset8::Constant::create(precision, const_shape, {2.0}); + auto mul = std::make_shared(matmul, mul_const); + function = std::make_shared(OutputVector{mul}, ParameterVector{param}); + + auto transformed_function = clone_function(*function); + pass::Manager manager; + manager.register_pass(); + manager.run_passes(transformed_function); + + bool functions_equal; + auto orig_function = clone_function(*function); + std::tie(functions_equal, std::ignore) = compare_functions(transformed_function, orig_function, true); + if (can_be_fused) { + 
ASSERT_FALSE(functions_equal); + } else { + ASSERT_TRUE(functions_equal); + } +} + +std::string QuantizedMatMulMultiplyFusion::getTestCaseName(const testing::TestParamInfo &obj) { + MatMulMultiplyFusionShapeParams shape_params; + std::string device; + std::tie(shape_params, std::ignore, device) = obj.param; + std::ostringstream results; + + results << "input=" << shape_params.input_shape << "_"; + results << "weights=" << shape_params.weights_shape << "_"; + results << "transB=" << std::boolalpha << shape_params.trans_b << "_"; + results << "const=" << shape_params.const_shape << "_"; + results << "dev=" << device; + return results.str(); +} + +void QuantizedMatMulMultiplyFusion::SetUp() { + MatMulMultiplyFusionShapeParams shape_params; + element::Type precision = element::f32; + bool can_be_fused; + std::tie(shape_params, can_be_fused, targetDevice) = GetParam(); + + const auto& input_shape = shape_params.input_shape; + auto weights_shape = shape_params.weights_shape; + const auto& const_shape = shape_params.const_shape; + + auto param = std::make_shared(precision, input_shape); + auto low = opset8::Constant::create(precision, {1}, {-2}); + auto high = opset8::Constant::create(precision, {1}, {2}); + auto input_fq = std::make_shared(param, low, high, low, high, 256); + std::shared_ptr weights = opset8::Constant::create(precision, weights_shape, {0.5}); + weights = std::make_shared(weights, low, high, low, high, 255); + if (shape_params.trans_b) { + std::vector perm(weights_shape.size(), 0); + std::iota(perm.begin(), perm.end(), 0); + std::swap(*(perm.end() - 2), *(perm.end() - 1)); + auto perm_const = opset8::Constant::create(element::i32, {perm.size()}, perm); + weights = std::make_shared(weights, perm_const); + } + auto matmul = std::make_shared(input_fq, weights); + auto mul_const = opset8::Constant::create(precision, const_shape, {2}); + auto mul = std::make_shared(matmul, mul_const); + function = std::make_shared(OutputVector{mul}, ParameterVector{param}); + 
+ auto transformed_function = clone_function(*function); + pass::Manager manager; + manager.register_pass(); + manager.run_passes(transformed_function); + + bool functions_equal; + auto orig_function = clone_function(*function); + std::tie(functions_equal, std::ignore) = compare_functions(transformed_function, orig_function, true); + if (can_be_fused) { + ASSERT_FALSE(functions_equal); + } else { + ASSERT_TRUE(functions_equal); + } +} + +void QuantizedMatMulMultiplyFusion::TearDown() { + auto get_layer_type = [] (const std::shared_ptr& node) -> const std::string& { + const auto& rt_info = node->get_rt_info(); + auto it = rt_info.find(ExecGraphInfoSerialization::LAYER_TYPE); + IE_ASSERT(it != rt_info.end()); + return it->second.as(); + }; + + auto runtime_function = executableNetwork.GetExecGraphInfo().getFunction(); + int ops_found = 0; + for (const auto& node : runtime_function->get_ordered_ops()) { + const auto& layer_type = get_layer_type(node); + if (layer_type == "FullyConnected" || layer_type == "MatMul") { + ops_found++; + auto inputs = node->input_values(); + ASSERT_EQ(element::u8, inputs[0].get_element_type()); + ASSERT_EQ(element::i8, inputs[1].get_element_type()); + } + } + ASSERT_GT(ops_found, 0); +} +} // namespace SubgraphTestsDefinitions From 738a5717422b6597e896bddb9dc18462ef409cde Mon Sep 17 00:00:00 2001 From: Maxim Vafin Date: Thu, 30 Dec 2021 21:01:55 +0300 Subject: [PATCH 39/78] Use layouts in legacy pre-processing (#9355) * Use layouts in legacy pre-processing * Use layouts for batch in MO * Unify code for channel and batch dimension * Fix issue when idx is None --- .../tools/mo/back/ReverseInputChannels.py | 62 +++++++------ .../openvino/tools/mo/front/common/layout.py | 39 ++++++++ .../tools/mo/middle/AddMeanScaleValues.py | 17 ++-- .../openvino/tools/mo/middle/passes/infer.py | 21 +++-- .../mo/back/ReverseInputChannels_test.py | 71 +++++++++++++-- .../mo/middle/AddMeanScaleValues_test.py | 90 +++++++++++++++++++ 6 files changed, 250 
insertions(+), 50 deletions(-) diff --git a/tools/mo/openvino/tools/mo/back/ReverseInputChannels.py b/tools/mo/openvino/tools/mo/back/ReverseInputChannels.py index fe12c7dad42..3413fdd4808 100644 --- a/tools/mo/openvino/tools/mo/back/ReverseInputChannels.py +++ b/tools/mo/openvino/tools/mo/back/ReverseInputChannels.py @@ -5,16 +5,17 @@ import logging as log import numpy as np -from openvino.tools.mo.ops.gather import Gather -from openvino.tools.mo.ops.split import Split from openvino.tools.mo.back.replacement import BackReplacementPattern +from openvino.tools.mo.front.common.layout import get_dim_from_layout, get_features_dim from openvino.tools.mo.front.common.partial_infer.utils import int64_array from openvino.tools.mo.front.common.partial_infer.utils import mo_array from openvino.tools.mo.front.tf.graph_utils import create_op_with_const_inputs from openvino.tools.mo.graph.graph import Graph from openvino.tools.mo.graph.graph import Node from openvino.tools.mo.ops.concat import Concat +from openvino.tools.mo.ops.gather import Gather from openvino.tools.mo.ops.op import Op, PermuteAttrs +from openvino.tools.mo.ops.split import Split class ReverseChannels(Op): @@ -52,50 +53,47 @@ class InsertReverseChannels(BackReplacementPattern): enabled = False @staticmethod - def get_fw_index(node: Node, idx: int) -> int: - if not node.has_valid('rt_info'): + def get_channel_index(node: Node) -> int: + guessed_layout = 'NCHW' + if node.has_valid('rt_info'): + rt_info = node.rt_info + if rt_info.contains('old_api_map_order'): + old_api_map_version = rt_info.get_attribute_version('old_api_map_order') + old_api_map = rt_info.info['old_api_map_order', old_api_map_version] + if 'inverse_order' in old_api_map.info: + order = old_api_map.info['inverse_order'] + assert len(order) == len(guessed_layout) + guessed_layout = np.array(list(guessed_layout))[order] + guessed_layout = ''.join(guessed_layout) + idx, has_layout = get_dim_from_layout(node, 'C') + if has_layout: return idx - - 
rt_info = node.rt_info - if not rt_info.contains('old_api_map_order'): - return idx - - old_api_map_version = rt_info.get_attribute_version('old_api_map_order') - old_api_map = rt_info.info['old_api_map_order', old_api_map_version] - if 'inverse_order' not in old_api_map.info: - return idx - - order = old_api_map.info['inverse_order'] - node_name = node.soft_get('name', node.id) - - if idx < 0: - assert not node.out_port(0).disconnected(), 'Cannot normalize negative axis {} in node {} ' \ - 'as out port is disconnected.'.format(idx, node_name) - data_rank = len(list(node.out_port(0).data.get_shape())) - idx = data_rank + idx - - assert len(order) > idx >= 0, \ - 'Channel index {} is incompatible with old_api_map in node {}.'.format(idx, node_name) - return list(order).index(idx) + else: + return get_features_dim(guessed_layout, len(node.shape)) def find_and_replace_pattern(self, graph: Graph): all_params = [(p.soft_get('name', p.id), p, list(p.out_port(0).data.get_shape())) for p in graph.get_op_nodes(type='Parameter')] - suitable_params = [(name, p, shape) for name, p, shape in all_params if - len(shape) == 4 and shape[self.get_fw_index(p, 1)] == 3] + suitable_params = [] + for name, p, shape in all_params: + if len(shape) == 4: + idx = self.get_channel_index(p) + if idx is not None and shape[idx] == 3: + suitable_params.append((name, p, shape, idx)) + log.debug('All network inputs: {}'.format({name: shape for name, _, shape in all_params})) - log.debug('Will reverse input channels for: {}'.format({name: shape for name, _, shape in suitable_params})) + log.debug('Will reverse input channels for: {}'.format({name: shape for name, _, shape, _ in suitable_params})) if len(suitable_params) < len(all_params): log.error('Network has {} inputs overall, but only {} of them are suitable for input channels reversing.\n' 'Suitable for input channel reversing inputs are 4-dimensional with 3 channels\nAll inputs: {}\n' 'Suitable inputs {}'.format(len(all_params), 
len(suitable_params), {name: shape for name, _, shape in all_params}, - {name: shape for name, _, shape in suitable_params}), + {name: shape for name, _, shape, _ in suitable_params}), extra={'is_warning': True}) - for name, parameter, _ in suitable_params: - reverse_index = int64_array(self.get_fw_index(parameter, 1)) + for name, parameter, _, idx in suitable_params: + reverse_index = int64_array(idx) if parameter.out_port(0).disconnected(): continue diff --git a/tools/mo/openvino/tools/mo/front/common/layout.py b/tools/mo/openvino/tools/mo/front/common/layout.py index a6a1e37c1d8..f2747524e80 100644 --- a/tools/mo/openvino/tools/mo/front/common/layout.py +++ b/tools/mo/openvino/tools/mo/front/common/layout.py @@ -5,6 +5,7 @@ import numpy as np from openvino.tools.mo.front.common.partial_infer.utils import dynamic_dimension_value from openvino.tools.mo.front.common.partial_infer.utils import mo_array, int64_array +from openvino.tools.mo.graph.graph import Node from openvino.tools.mo.utils.error import Error nchw_to_nhwc_permute = int64_array([0, 2, 3, 1]) @@ -112,3 +113,41 @@ def shape_for_layout(layout: str, **kwargs): if depth is not None: output_shape[get_depth_dim(layout, shape_len)] = depth return output_shape + + +def get_dim_from_layout(node: Node, dim: str): + """ + Gets index of dimension from layout specified for node. + :param node: node to get dim for. + :param dim: name of dimension to get index for. + :return: tuple with index of the dimension and bool flag if the node has layout specified or no. 
+ """ + layout = None + graph = node.graph + if 'layout_values' in graph.graph['cmd_params'] and graph.graph['cmd_params'].layout_values: + layout_values = graph.graph['cmd_params'].layout_values.copy() + if '' in layout_values: + in_nodes = graph.get_op_nodes(op='Parameter') + if len(in_nodes) == 1: + in_node = in_nodes[0] + layout_values[in_node.soft_get('name', in_node.id)] = layout_values[''] + del layout_values[''] + name = node.soft_get('name', node.id) + if name in layout_values: + if layout_values[name]['source_layout']: + layout = layout_values[name]['source_layout'] + + if layout: + from openvino.runtime import Layout # pylint: disable=no-name-in-module,import-error + + layout_parsed = Layout(layout) + has_dim = layout_parsed.has_name(dim) + if has_dim: + idx = layout_parsed.get_index_by_name(dim) + if idx < 0: + idx = len(node.shape) + idx + return idx, True + else: + return None, True + else: + return None, False diff --git a/tools/mo/openvino/tools/mo/middle/AddMeanScaleValues.py b/tools/mo/openvino/tools/mo/middle/AddMeanScaleValues.py index 6e95697839d..7d6bf6fe383 100644 --- a/tools/mo/openvino/tools/mo/middle/AddMeanScaleValues.py +++ b/tools/mo/openvino/tools/mo/middle/AddMeanScaleValues.py @@ -5,13 +5,13 @@ import logging as log import numpy as np -from openvino.tools.mo.ops.elementwise import Add, Mul -from openvino.tools.mo.front.common.layout import get_features_dim +from openvino.tools.mo.front.common.layout import get_dim_from_layout, get_features_dim from openvino.tools.mo.front.common.partial_infer.utils import compatible_dims from openvino.tools.mo.front.extractor import get_node_id_with_ports from openvino.tools.mo.front.tf.graph_utils import create_op_with_const_inputs from openvino.tools.mo.graph.graph import Graph, Node from openvino.tools.mo.middle.replacement import MiddleReplacementPattern +from openvino.tools.mo.ops.elementwise import Add, Mul from openvino.tools.mo.utils.cli_parser import get_node_name_with_port_from_input_value 
from openvino.tools.mo.utils.error import Error from openvino.tools.mo.utils.utils import refer_to_faq_msg @@ -42,14 +42,21 @@ class AddMeanScaleValues(MiddleReplacementPattern): if all([x == optimize_value for x in value]): return assert input_node.has_valid('shape') - features_dim_idx = get_features_dim(graph.graph['layout'], len(input_node.shape)) - assert compatible_dims(value.size, input_node.shape[features_dim_idx]) or value.size == 1 + in_name = input_node.soft_get('name', input_node.id) + features_dim_idx, has_layout = get_dim_from_layout(input_node, 'C') + if features_dim_idx is None: + if has_layout: + log.warning('Layout for input {} doesn\'t have channel ("C") dimension to apply {} preprocessing. ' + 'Skipping this input.'.format(in_name, preprocessing_name)) + features_dim_idx = get_features_dim(graph.graph['layout'], len(input_node.shape)) + assert compatible_dims(value.size, input_node.shape[features_dim_idx]) or value.size == 1, \ + "Incompatible layout, please specify correct layout for the node" shape = np.ones(len(input_node.shape), dtype=np.int64) shape[features_dim_idx] = value.size value = value.reshape(shape) - name = input_node.soft_get('name', input_node.id) + '/' + preprocessing_name + name = in_name + '/' + preprocessing_name preprocessing = create_op_with_const_inputs(graph, op=op, port_value_dict={1: value}, op_attrs={'name': name}) if input_node.op == 'Parameter' and input_node.has_and_set('data_type'): diff --git a/tools/mo/openvino/tools/mo/middle/passes/infer.py b/tools/mo/openvino/tools/mo/middle/passes/infer.py index bf63ab4302f..0a327913012 100644 --- a/tools/mo/openvino/tools/mo/middle/passes/infer.py +++ b/tools/mo/openvino/tools/mo/middle/passes/infer.py @@ -6,6 +6,7 @@ from typing import List import networkx as nx +from openvino.tools.mo.front.common.layout import get_dim_from_layout from openvino.tools.mo.front.common.partial_infer.utils import dynamic_dimension from openvino.tools.mo.graph.graph import Node, Graph, 
dict_includes from openvino.tools.mo.utils.error import Error @@ -220,10 +221,20 @@ def override_batch(graph: Graph, batch: int): batch: user defined integer value to override batch """ if batch is not None: - for node_id, data in graph.nodes(data=True): - if 'op' in data and data['op'] == 'Parameter' and not data.get('fixed_batch', False): - validate_batch_in_shape(data['shape'], data['name']) - data['shape'][0] = batch + in_nodes = graph.get_op_nodes(op='Parameter') + for node in in_nodes: + if not node.soft_get('fixed_batch', False): + name = node.soft_get('name', node.id) + idx, has_layout = get_dim_from_layout(node, 'N') + if has_layout: + if idx is not None: + node['shape'][idx] = batch + else: + log.warning( + 'Layout for input {} doesn\'t have batch dimension. Skipping this input.'.format(name)) + else: + validate_batch_in_shape(node['shape'], name) + node['shape'][0] = batch def validate_batch_in_shape(shape, layer_name: str): @@ -242,6 +253,7 @@ def validate_batch_in_shape(shape, layer_name: str): 'dimension or not.\n\n For example, you want to set batch dimension equals 100 ' + 'for the input layer "data" with shape (10,34). Although you can not use --batch, ' + 'you should pass --input_shape (100,34) instead of --batch 100. \n\n' + + 'You can also tell Model Optimizer where batch dimension is located by specifying --layout. 
\n\n' + refer_to_faq_msg(39)) .format(layer_name, shape)) @@ -328,4 +340,3 @@ def reverse_infer(graph: Graph, nodes: list): if node.has_valid('reverse_infer'): log.debug("Executed reverse infer for node '{}'".format(node.soft_get('name', node.id))) node.reverse_infer(node) - diff --git a/tools/mo/unit_tests/mo/back/ReverseInputChannels_test.py b/tools/mo/unit_tests/mo/back/ReverseInputChannels_test.py index 3ffabbf1697..ed45849dd01 100644 --- a/tools/mo/unit_tests/mo/back/ReverseInputChannels_test.py +++ b/tools/mo/unit_tests/mo/back/ReverseInputChannels_test.py @@ -2,6 +2,7 @@ # SPDX-License-Identifier: Apache-2.0 import unittest +from argparse import Namespace import numpy as np @@ -53,6 +54,22 @@ nodes3 = { } +def get_nodes(shape, axis=1): + return { + **regular_op_with_shaped_data('placeholder1', shape, + {'type': 'Parameter', 'shape': shape, 'rt_info': RTInfo()}), + **regular_op_with_shaped_data('placeholder2', [1, 1, 1, 1], {'type': 'Parameter', 'shape': [1, 1, 1, 1]}), + + **regular_op_with_shaped_data('mul', shape, {'type': 'Multiply'}), + **regular_op_with_shaped_data('reverse_channels', shape, + {'op': 'ReverseChannels', 'type': None, 'axis': int64_array(axis)}), + + **regular_op_with_shaped_data('pad', shape, {'type': 'Pad'}), + + **result('result'), + } + + class ReverseInputChannelsTest(unittest.TestCase): def check_graph_attrs(self, graph: Graph, parameter_node_names: list): for node in graph.get_op_nodes(): @@ -229,15 +246,53 @@ class ReverseInputChannelsTest(unittest.TestCase): self.assertTrue(reverse_channels.axis == 1) self.assertTrue(type(reverse_channels.axis) == np.ndarray) - def test_get_fw_index(self): - graph = build_graph(nodes, [*connect('placeholder1', 'result')]) + def test_insert(self): + graph = build_graph(get_nodes([1, 3, 10, 10]), + [*connect('placeholder1', '0:mul'), *connect('placeholder2', '1:mul'), + *connect('mul', 'result')], nodes_with_edges_only=True, + cli=Namespace(reverse_input_channels=True)) + + 
InsertReverseChannels().find_and_replace_pattern(graph) + graph_ref = build_graph(get_nodes([1, 3, 10, 10]), + [*connect('placeholder1', 'reverse_channels'), *connect('reverse_channels', '0:mul'), + *connect('placeholder2', '1:mul'), *connect('mul', 'result')]) + (flag, resp) = compare_graphs(graph, graph_ref, 'result', check_op_attrs=True) + self.assertTrue(flag, resp) + + def test_insert_old_api_map(self): + graph = build_graph(get_nodes([1, 10, 10, 3]), + [*connect('placeholder1', '0:mul'), *connect('placeholder2', '1:mul'), + *connect('mul', 'result')], nodes_with_edges_only=True, + cli=Namespace(reverse_input_channels=True)) + node = Node(graph, 'placeholder1') old_api_map = OldAPIMapOrder(version=0) node.rt_info.info[('old_api_map_order', old_api_map.get_version())] = old_api_map node.rt_info.info[('old_api_map_order', old_api_map.get_version())].old_api_transpose_parameter([0, 2, 3, 1]) - self.assertTrue(InsertReverseChannels.get_fw_index(node, 0) == 0) - self.assertTrue(InsertReverseChannels.get_fw_index(node, 1) == 3) - self.assertTrue(InsertReverseChannels.get_fw_index(node, 2) == 1) - self.assertTrue(InsertReverseChannels.get_fw_index(node, 3) == 2) - self.assertTrue(InsertReverseChannels.get_fw_index(node, -2) == 1) - self.assertTrue(type(InsertReverseChannels.get_fw_index(node, 0)) == int) + + InsertReverseChannels().find_and_replace_pattern(graph) + graph_ref = build_graph(get_nodes([1, 10, 10, 3], 3), + [*connect('placeholder1', 'reverse_channels'), *connect('reverse_channels', '0:mul'), + *connect('placeholder2', '1:mul'), *connect('mul', 'result')]) + + node2 = Node(graph_ref, 'placeholder1') + node2.rt_info = node.rt_info + + (flag, resp) = compare_graphs(graph, graph_ref, 'result', check_op_attrs=True) + self.assertTrue(flag, resp) + + def test_insert_layout(self): + graph = build_graph(get_nodes([1, 10, 10, 3]), + [*connect('placeholder1', '0:mul'), *connect('placeholder2', '1:mul'), + *connect('mul', 'result')], nodes_with_edges_only=True, + 
cli=Namespace(reverse_input_channels=True, + layout_values={ + 'placeholder1': {'source_layout': 'nhwc', 'target_layout': None}})) + + InsertReverseChannels().find_and_replace_pattern(graph) + graph_ref = build_graph(get_nodes([1, 10, 10, 3], 3), + [*connect('placeholder1', 'reverse_channels'), *connect('reverse_channels', '0:mul'), + *connect('placeholder2', '1:mul'), *connect('mul', 'result')]) + + (flag, resp) = compare_graphs(graph, graph_ref, 'result', check_op_attrs=True) + self.assertTrue(flag, resp) diff --git a/tools/mo/unit_tests/mo/middle/AddMeanScaleValues_test.py b/tools/mo/unit_tests/mo/middle/AddMeanScaleValues_test.py index 42d95e3b30a..6dea0b06355 100644 --- a/tools/mo/unit_tests/mo/middle/AddMeanScaleValues_test.py +++ b/tools/mo/unit_tests/mo/middle/AddMeanScaleValues_test.py @@ -375,3 +375,93 @@ class AddMeanScaleValuesTest(unittest.TestCase): self.check_graph_attrs(graph, graph_ref, []) add_node = graph.get_op_nodes(type="Add")[0] self.assertTrue(add_node.in_port(1).get_connection().get_source().node['value'].dtype == np.float32) + + def test_mean_values_explicit_and_optimized_layout(self): + graph_ref = build_graph(nodes, [ + *connect('parameter', '0:add_mean'), + *connect('mean', '1:add_mean'), + *connect('add_mean', 'result'), + *connect('parameter_2', 'result_2'), + ]) + + argv = Namespace(mean_scale_values={'parameter': {'mean': np.array([1., 2., 3.])}, + 'parameter_2': {'mean': np.array([0., 0., 0.])}}, + layout_values={'parameter': {'source_layout': 'nchw', 'target_layout': None}, + 'parameter_2': {'source_layout': 'nchw', 'target_layout': None}} + ) + graph = build_graph(nodes, [*connect('parameter', 'result'), *connect('parameter_2', 'result_2')], + nodes_with_edges_only=True, cli=argv) + self.set_graph_attrs(graph, ['parameter', 'parameter_2']) + self.set_graph_attrs(graph_ref, ['parameter', 'parameter_2']) + graph.graph['layout'] = 'NHWC' + + AddMeanScaleValues().find_and_replace_pattern(graph) + (flag, resp) = compare_graphs(graph, 
graph_ref, 'result', check_op_attrs=True) + self.assertTrue(flag, resp) + (flag, resp) = compare_graphs(graph, graph_ref, 'result_2', check_op_attrs=True) + self.assertTrue(flag, resp) + self.check_graph_attrs(graph, graph_ref, ['parameter', 'parameter_2']) + + def test_mean_values_explicit_and_scale_values_optimized_layout(self): + graph_ref = build_graph(nodes, [ + *connect('parameter', '0:add_mean'), + *connect('mean', '1:add_mean'), + *connect('add_mean', 'result'), + ]) + + argv = Namespace(mean_scale_values={'parameter': {'scale': np.array([1.]), 'mean': np.array([1., 2., 3.])}}, + layout_values={'': {'source_layout': 'nchw', 'target_layout': None}} + ) + graph = build_graph(nodes, [*connect('parameter', 'result')], nodes_with_edges_only=True, cli=argv) + self.set_graph_attrs(graph, ['parameter']) + self.set_graph_attrs(graph_ref, ['parameter']) + graph.graph['layout'] = 'NHWC' + + AddMeanScaleValues().find_and_replace_pattern(graph) + (flag, resp) = compare_graphs(graph, graph_ref, 'result', check_op_attrs=True) + self.assertTrue(flag, resp) + self.check_graph_attrs(graph, graph_ref, ['parameter']) + + def test_mean_values_optimized_and_scale_values_explicit_layout(self): + graph_ref = build_graph(nodes, [ + *connect('parameter', '0:mul_scale'), + *connect('scale', '1:mul_scale'), + *connect('mul_scale', 'result'), + ]) + + argv = Namespace( + mean_scale_values={'parameter': {'scale': np.array([1., 2., 3.]), 'mean': np.array([0., 0., 0.])}}, + layout_values={'': {'source_layout': 'nchw', 'target_layout': None}} + ) + graph = build_graph(nodes, [*connect('parameter', 'result')], nodes_with_edges_only=True, cli=argv) + self.set_graph_attrs(graph, ['parameter']) + self.set_graph_attrs(graph_ref, ['parameter']) + graph.graph['layout'] = 'NHWC' + + AddMeanScaleValues().find_and_replace_pattern(graph) + (flag, resp) = compare_graphs(graph, graph_ref, 'result', check_op_attrs=True) + self.assertTrue(flag, resp) + self.check_graph_attrs(graph, graph_ref, 
['parameter']) + + def test_mean_values_explicit_and_scale_values_explicit_layout(self): + graph_ref = build_graph(nodes, [ + *connect('parameter', '0:add_mean'), + *connect('mean', '1:add_mean'), + *connect('add_mean', '0:mul_scale'), + *connect('scale', '1:mul_scale'), + *connect('mul_scale', 'result'), + ]) + + argv = Namespace(mean_scale_values=[[np.array([1., 2., 3.]), np.array([1., 2., 3.])]], + layout_values={'': {'source_layout': 'nchw', 'target_layout': None}} + ) + graph = build_graph(nodes, [*connect('parameter', 'result')], + nodes_with_edges_only=True, cli=argv) + self.set_graph_attrs(graph, ['parameter']) + self.set_graph_attrs(graph_ref, ['parameter']) + graph.graph['layout'] = 'NHWC' + + AddMeanScaleValues().find_and_replace_pattern(graph) + (flag, resp) = compare_graphs(graph, graph_ref, 'result', check_op_attrs=True) + self.assertTrue(flag, resp) + self.check_graph_attrs(graph, graph_ref, ['parameter']) From c26a904c6e0438a5c0ea075165ba5e61dc872d88 Mon Sep 17 00:00:00 2001 From: Taylor Yeonbok Lee Date: Mon, 3 Jan 2022 09:58:16 +0900 Subject: [PATCH 40/78] Disable blocked layout selection for quantize when the user is gemm beacuse no gemm kernels are using blocked format currently (#9425) --- .../intel_gpu/src/graph/layout_optimizer.cpp | 21 ++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/src/plugins/intel_gpu/src/graph/layout_optimizer.cpp b/src/plugins/intel_gpu/src/graph/layout_optimizer.cpp index e4486e32e9c..17feb992489 100644 --- a/src/plugins/intel_gpu/src/graph/layout_optimizer.cpp +++ b/src/plugins/intel_gpu/src/graph/layout_optimizer.cpp @@ -17,6 +17,7 @@ #include "pooling_inst.h" #include "one_hot_inst.h" #include "permute_inst.h" +#include "gemm_inst.h" #include "quantize_inst.h" #include "mvn_inst.h" #include "depth_to_space_inst.h" @@ -1470,7 +1471,25 @@ format layout_optimizer::get_preferred_format(program_node& node) { layout{ data_types::f32, format::bfyx, tensor{} }).format; } else if 
(node.is_type()) { auto layout = node.get_output_layout(); - if (layout.format.spatial_num() == 2 && + + std::function only_gemm_users = [&](const program_node& node) { + bool all_users_gemm = true; + + for (auto user : node.get_users()) { + if (user->is_type() || user->is_type()) + all_users_gemm &= only_gemm_users(*user); + else if (user->is_type()) + all_users_gemm &= true; + else + return false; + } + + return all_users_gemm; + }; + if (only_gemm_users(node)) { + // TODO: Gemm is not supporting fsv layouts + expected = format::bfyx; + } else if (layout.format.spatial_num() == 2 && (layout.data_type == data_types::i8 || layout.data_type == data_types::u8) && layout.size.batch[0] % 16 == 0) { if (use_onednn_impls && layout.size.batch[0] % 32 == 0) { From 326289265b2adacdc98bc4c05b05bbd3a22d1337 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tomasz=20Do=C5=82bniak?= Date: Mon, 3 Jan 2022 14:04:45 +0100 Subject: [PATCH 41/78] MO: progress bar implementation for new frontends (#9381) --- .../python/src/openvino/frontend/__init__.py | 1 + .../src/pyopenvino/frontend/extensions.cpp | 24 ++++++ .../src/pyopenvino/frontend/extensions.hpp | 1 + .../python/src/pyopenvino/pyopenvino.cpp | 1 + src/core/tests/CMakeLists.txt | 5 +- src/core/tests/frontend/progress_reporter.cpp | 75 +++++++++++++++++++ .../include/common/extension_holder.hpp | 18 +++++ .../extension/progress_reporter_extension.hpp | 39 ++++++++++ .../extension/progress_reporter_extension.cpp | 22 ++++++ .../openvino/frontend/onnx/frontend.hpp | 5 +- .../onnx/frontend/src/core/graph.cpp | 32 +++++--- .../onnx/frontend/src/core/graph.hpp | 14 ++-- src/frontends/onnx/frontend/src/editor.cpp | 18 ++--- src/frontends/onnx/frontend/src/editor.hpp | 12 +-- src/frontends/onnx/frontend/src/frontend.cpp | 15 ++-- .../onnx/frontend/src/input_model.cpp | 24 +++--- .../onnx/frontend/src/input_model.hpp | 16 ++-- .../onnx/frontend/src/utils/onnx_internal.cpp | 13 ++-- .../onnx/frontend/src/utils/onnx_internal.hpp | 30 ++++---- 
tools/mo/openvino/tools/mo/main.py | 5 +- tools/mo/openvino/tools/mo/utils/logger.py | 35 +++++++++ 21 files changed, 312 insertions(+), 93 deletions(-) create mode 100644 src/core/tests/frontend/progress_reporter.cpp create mode 100644 src/frontends/common/include/common/extension_holder.hpp create mode 100644 src/frontends/common/include/openvino/frontend/extension/progress_reporter_extension.hpp create mode 100644 src/frontends/common/src/extension/progress_reporter_extension.cpp diff --git a/src/bindings/python/src/openvino/frontend/__init__.py b/src/bindings/python/src/openvino/frontend/__init__.py index 156b240e00a..8e94c76ca9f 100644 --- a/src/bindings/python/src/openvino/frontend/__init__.py +++ b/src/bindings/python/src/openvino/frontend/__init__.py @@ -43,6 +43,7 @@ from openvino.pyopenvino import Place from openvino.pyopenvino import TelemetryExtension from openvino.pyopenvino import DecoderTransformationExtension from openvino.pyopenvino import JsonConfigExtension +from openvino.pyopenvino import ProgressReporterExtension # exceptions from openvino.pyopenvino import NotImplementedFailure diff --git a/src/bindings/python/src/pyopenvino/frontend/extensions.cpp b/src/bindings/python/src/pyopenvino/frontend/extensions.cpp index 45285d86079..5c89dcabe6b 100644 --- a/src/bindings/python/src/pyopenvino/frontend/extensions.cpp +++ b/src/bindings/python/src/pyopenvino/frontend/extensions.cpp @@ -11,6 +11,7 @@ #include "manager.hpp" #include "openvino/frontend/exception.hpp" #include "openvino/frontend/extension/decoder_transformation.hpp" +#include "openvino/frontend/extension/progress_reporter_extension.hpp" #include "openvino/frontend/extension/telemetry.hpp" #include "pyopenvino/graph/model.hpp" @@ -54,3 +55,26 @@ void regclass_frontend_JsonConfigExtension(py::module m) { return std::make_shared(path); })); } + +void regclass_frontend_ProgressReporterExtension(py::module m) { + py::class_, ov::Extension> ext{ + m, + "ProgressReporterExtension", + 
py::dynamic_attr()}; + + ext.doc() = "An extension class intented to use as progress reporting utility"; + + ext.def(py::init([]() { + return std::make_shared(); + })); + + ext.def(py::init([](const ProgressReporterExtension::progress_notifier_callback& callback) { + return std::make_shared(callback); + })); + + ext.def(py::init([](ProgressReporterExtension::progress_notifier_callback&& callback) { + return std::make_shared(std::move(callback)); + })); + + ext.def("report_progress", &ProgressReporterExtension::report_progress); +} diff --git a/src/bindings/python/src/pyopenvino/frontend/extensions.hpp b/src/bindings/python/src/pyopenvino/frontend/extensions.hpp index 03133fd0be6..36a03cb669a 100644 --- a/src/bindings/python/src/pyopenvino/frontend/extensions.hpp +++ b/src/bindings/python/src/pyopenvino/frontend/extensions.hpp @@ -11,3 +11,4 @@ namespace py = pybind11; void regclass_frontend_TelemetryExtension(py::module m); void regclass_frontend_DecoderTransformationExtension(py::module m); void regclass_frontend_JsonConfigExtension(py::module m); +void regclass_frontend_ProgressReporterExtension(py::module m); diff --git a/src/bindings/python/src/pyopenvino/pyopenvino.cpp b/src/bindings/python/src/pyopenvino/pyopenvino.cpp index 572d5f27e29..07cca64d1d7 100644 --- a/src/bindings/python/src/pyopenvino/pyopenvino.cpp +++ b/src/bindings/python/src/pyopenvino/pyopenvino.cpp @@ -140,6 +140,7 @@ PYBIND11_MODULE(pyopenvino, m) { regclass_frontend_TelemetryExtension(m); regclass_frontend_DecoderTransformationExtension(m); regclass_frontend_JsonConfigExtension(m); + regclass_frontend_ProgressReporterExtension(m); regmodule_offline_transformations(m); } diff --git a/src/core/tests/CMakeLists.txt b/src/core/tests/CMakeLists.txt index 98cc6dea4d1..b8db1d00cda 100644 --- a/src/core/tests/CMakeLists.txt +++ b/src/core/tests/CMakeLists.txt @@ -467,7 +467,7 @@ set(MULTI_TEST_SRC backend/sqrt.in.cpp ) -set(OP_EVAL_TEST_SRC +set(OP_EVAL_TEST_SRC # It should be a part of template 
plugin op_eval/binary_convolution.cpp op_eval/bucketize.cpp @@ -518,7 +518,8 @@ endif() # SOURCE FOR FRONTEND TESTING file(GLOB FRONTEND_TESTS_SRC ${CMAKE_CURRENT_SOURCE_DIR}/frontend/frontend_manager.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/frontend/decoder_transformation_extension.cpp) + ${CMAKE_CURRENT_SOURCE_DIR}/frontend/decoder_transformation_extension.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/frontend/progress_reporter.cpp) list(APPEND SRC ${FRONTEND_TESTS_SRC}) foreach(src IN LISTS SRC MULTI_TEST_SRC OP_EVAL_TEST_SRC) diff --git a/src/core/tests/frontend/progress_reporter.cpp b/src/core/tests/frontend/progress_reporter.cpp new file mode 100644 index 00000000000..98f92fc7bc7 --- /dev/null +++ b/src/core/tests/frontend/progress_reporter.cpp @@ -0,0 +1,75 @@ +// Copyright (C) 2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "gtest/gtest.h" +#include "openvino/frontend/exception.hpp" +#include "openvino/frontend/extension/progress_reporter_extension.hpp" + +using namespace ov::frontend; + +TEST(ProgressReporter_Callables, LambdaReporter) { + const auto lambda = [](float progress, unsigned int total, unsigned int completed) { + EXPECT_NEAR(progress, 0.5, 0.0001); + EXPECT_EQ(total, 100); + EXPECT_EQ(completed, 50); + }; + + ProgressReporterExtension ext{lambda}; + ext.report_progress(0.5, 100, 50); +} + +TEST(ProgressReporter_Callables, RvalueLambdaReporter) { + ProgressReporterExtension ext{[](float progress, unsigned int total, unsigned int completed) { + EXPECT_NEAR(progress, 0.5, 0.0001); + EXPECT_EQ(total, 100); + EXPECT_EQ(completed, 50); + }}; + + ext.report_progress(0.5, 100, 50); +} + +TEST(ProgressReporter_Callables, StructReporter) { + struct ProgressConsumer { + void operator()(float progress, unsigned int total, unsigned int completed) { + EXPECT_NEAR(progress, 0.5675, 0.0001); + EXPECT_EQ(total, 37); + EXPECT_EQ(completed, 21); + } + }; + + ProgressConsumer consumer; + + ProgressReporterExtension ext{consumer}; + 
ext.report_progress(0.5675, 37, 21); +} + +namespace { +void function_reporter(float progress, unsigned int total, unsigned int completed) { + EXPECT_NEAR(progress, 0.2574, 0.0001); + EXPECT_EQ(total, 101); + EXPECT_EQ(completed, 26); +} + +void reporter_stub(float, unsigned int, unsigned int) {} +} // namespace + +TEST(ProgressReporter_Callables, FunctionReporter) { + ProgressReporterExtension ext{function_reporter}; + ext.report_progress(0.2574, 101, 26); +} + +TEST(ProgressReporter, ReportMoreStepsThanTotal) { + ProgressReporterExtension ext{reporter_stub}; + EXPECT_THROW(ext.report_progress(0.0, 100, 101), ov::frontend::GeneralFailure); +} + +TEST(ProgressReporter, ReportMoreThan100Percent) { + ProgressReporterExtension ext{reporter_stub}; + EXPECT_THROW(ext.report_progress(1.00001, 100, 50), ov::frontend::GeneralFailure); +} + +TEST(ProgressReporter, ReportLessThanZeroPercent) { + ProgressReporterExtension ext{reporter_stub}; + EXPECT_THROW(ext.report_progress(-100.0, 100, 50), ov::frontend::GeneralFailure); +} diff --git a/src/frontends/common/include/common/extension_holder.hpp b/src/frontends/common/include/common/extension_holder.hpp new file mode 100644 index 00000000000..30569fc05db --- /dev/null +++ b/src/frontends/common/include/common/extension_holder.hpp @@ -0,0 +1,18 @@ +// Copyright (C) 2018-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "openvino/frontend/extension/progress_reporter_extension.hpp" +#include "openvino/frontend/extension/telemetry.hpp" + +namespace ov { +namespace frontend { +struct ExtensionHolder { + ExtensionHolder() : progress_reporter{std::make_shared()} {} + std::shared_ptr progress_reporter; + std::shared_ptr telemetry; +}; +} // namespace frontend +} // namespace ov diff --git a/src/frontends/common/include/openvino/frontend/extension/progress_reporter_extension.hpp b/src/frontends/common/include/openvino/frontend/extension/progress_reporter_extension.hpp new file mode 100644 
index 00000000000..d02595f9991 --- /dev/null +++ b/src/frontends/common/include/openvino/frontend/extension/progress_reporter_extension.hpp @@ -0,0 +1,39 @@ +// Copyright (C) 2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "openvino/core/extension.hpp" +#include "openvino/frontend/visibility.hpp" + +namespace ov { +namespace frontend { +class FRONTEND_API ProgressReporterExtension : public ov::Extension { +public: + /// \brief A progress reporting callback signature. A FunctionObject that matches this signature should be passed + /// to the constructor of this extension. The extension will then invoke this as a callback each time the + /// progress needs to be reported. The callback itself is responsible for consuming the reported values. + /// + /// \param progress A float value in the range [0.0, 1.0] indicating the total progress of an operation. + /// \param total_steps The total number of steps that a given instance of this extension is tracking + /// \param completed_completed The current number of completed steps (out of the total number of steps to take) + using progress_notifier_callback = std::function; + + /// \brief The default constructor which creates a reporter that doesn't report progress + ProgressReporterExtension() : m_callback{[](float, unsigned int, unsigned int) {}} {} + ProgressReporterExtension(const progress_notifier_callback& callback) : m_callback{callback} {} + ProgressReporterExtension(progress_notifier_callback&& callback) : m_callback{std::move(callback)} {} + + /// \brief The main method of this extension used to report the progress. + /// This method forwards its arguments to the callback stored in this class. + /// \param progress A float value in the range [0.0, 1.0] indicating the total progress of an operation. 
+ /// \param total_steps The total number of steps that a given instance of this extension is tracking + /// \param completed_steps The current number of completed steps (out of the total number of steps to take) + void report_progress(float progress, unsigned int total_steps, unsigned int completed_steps) const; + +private: + progress_notifier_callback m_callback; +}; +} // namespace frontend +} // namespace ov diff --git a/src/frontends/common/src/extension/progress_reporter_extension.cpp b/src/frontends/common/src/extension/progress_reporter_extension.cpp new file mode 100644 index 00000000000..db3154d8206 --- /dev/null +++ b/src/frontends/common/src/extension/progress_reporter_extension.cpp @@ -0,0 +1,22 @@ +// Copyright (C) 2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "openvino/frontend/extension/progress_reporter_extension.hpp" + +#include "openvino/frontend/exception.hpp" + +namespace ov { +namespace frontend { +void ProgressReporterExtension::report_progress(float progress, + unsigned int total_steps, + unsigned int completed_steps) const { + FRONT_END_GENERAL_CHECK(completed_steps <= total_steps, + "When reporting the progress, the number of completed steps can be at most equal to the " + "number of total steps."); + FRONT_END_GENERAL_CHECK(progress >= 0.0f && progress <= 1.0f, + "The reported progress needs to be a value between 0.0 and 1.0"); + m_callback(progress, total_steps, completed_steps); +} +} // namespace frontend +} // namespace ov diff --git a/src/frontends/onnx/frontend/include/openvino/frontend/onnx/frontend.hpp b/src/frontends/onnx/frontend/include/openvino/frontend/onnx/frontend.hpp index cfa4016e4b5..e0465661c1e 100644 --- a/src/frontends/onnx/frontend/include/openvino/frontend/onnx/frontend.hpp +++ b/src/frontends/onnx/frontend/include/openvino/frontend/onnx/frontend.hpp @@ -4,10 +4,9 @@ #pragma once +#include #include -#include "openvino/frontend/extension/telemetry.hpp" - #ifdef OPENVINO_STATIC_LIBRARY 
# define ONNX_FRONTEND_API # define ONNX_FRONTEND_C_API @@ -38,7 +37,7 @@ protected: InputModel::Ptr load_impl(const std::vector& params) const override; private: - std::shared_ptr m_telemetry; + ExtensionHolder m_extensions; }; } // namespace onnx diff --git a/src/frontends/onnx/frontend/src/core/graph.cpp b/src/frontends/onnx/frontend/src/core/graph.cpp index be2f5799ca4..56056fc9237 100644 --- a/src/frontends/onnx/frontend/src/core/graph.cpp +++ b/src/frontends/onnx/frontend/src/core/graph.cpp @@ -63,17 +63,17 @@ bool common_node_for_all_outputs(const OutputVector& outputs) { }; } // namespace detail -Graph::Graph(const std::shared_ptr& model_proto, - const std::shared_ptr& telemetry) - : Graph(model_proto, common::make_unique(), telemetry) {} +Graph::Graph(const std::shared_ptr& model_proto, ov::frontend::ExtensionHolder extensions) + : Graph(model_proto, common::make_unique(), std::move(extensions)) {} Graph::Graph(const std::shared_ptr& model_proto, std::unique_ptr&& cache, - const std::shared_ptr& telemetry) + ov::frontend::ExtensionHolder extensions) : m_model{common::make_unique(model_proto)}, m_cache{std::move(cache)}, - m_telemetry(telemetry) { + m_extensions{std::move(extensions)} { std::map initializers; + // Process all initializers in the graph for (const auto& initializer_tensor : m_model->get_graph().initializer()) { if (initializer_tensor.has_name()) { @@ -116,7 +116,7 @@ Graph::Graph(const std::shared_ptr& model_proto, std::map> unknown_operators; std::map op_statistics; for (const auto& node_proto : m_model->get_graph().node()) { - if (telemetry) { + if (m_extensions.telemetry) { op_statistics[node_proto.op_type()]++; } if (!m_model->is_operator_available(node_proto)) { @@ -127,9 +127,9 @@ Graph::Graph(const std::shared_ptr& model_proto, } } - if (telemetry) { + if (m_extensions.telemetry) { for (const auto& op : op_statistics) { - telemetry->send_event("op_count", "onnx_" + op.first, op.second); + m_extensions.telemetry->send_event("op_count", 
"onnx_" + op.first, op.second); } } @@ -149,6 +149,8 @@ Graph::Graph(const std::shared_ptr& model_proto, } void Graph::convert_to_ngraph_nodes() { + const float total = static_cast(m_model->get_graph().node().size()); + unsigned int completed = 0u; // Process ONNX graph nodes, convert to nGraph nodes for (const auto& node_proto : m_model->get_graph().node()) { const Node node{node_proto, *this}; @@ -160,6 +162,8 @@ void Graph::convert_to_ngraph_nodes() { } } OutputVector ng_nodes{make_ng_nodes(node)}; + ++completed; + m_extensions.progress_reporter->report_progress(completed / total, total, completed); } } @@ -198,6 +202,8 @@ std::shared_ptr Graph::convert() { } void Graph::decode_to_framework_nodes() { + const float total = static_cast(m_model->get_graph().node().size()); + unsigned int completed = 0u; // Process ONNX graph nodes, convert to nGraph nodes for (const auto& node_proto : m_model->get_graph().node()) { const Node node{node_proto, *this}; @@ -230,6 +236,8 @@ void Graph::decode_to_framework_nodes() { for (std::size_t i{0}; i < node.get_outputs_size(); ++i) { m_cache->emplace_node(node.output(i), std::move(ng_nodes.at(i))); } + ++completed; + m_extensions.progress_reporter->report_progress(completed / total, total, completed); } } @@ -350,8 +358,12 @@ const OpsetImports& Graph::get_opset_imports() const { } Subgraph::Subgraph(std::shared_ptr model_proto, const Graph* parent_graph) - : Graph(model_proto, common::make_unique(), parent_graph->get_telemetry()), - m_parent_graph(parent_graph) {} + : Graph(model_proto, common::make_unique()), + m_parent_graph(parent_graph) { + // do not copy a pre-configured progress reporter extension to the subgraph, copy just the telemetry + // (do not report subgraph conversion progress) + m_extensions.telemetry = parent_graph->get_extensions().telemetry; +} bool Subgraph::is_ng_node_in_cache(const std::string& name) const { if (m_cache->contains(name)) { diff --git a/src/frontends/onnx/frontend/src/core/graph.hpp 
b/src/frontends/onnx/frontend/src/core/graph.hpp index 35d9a20d2b7..a80b151573a 100644 --- a/src/frontends/onnx/frontend/src/core/graph.hpp +++ b/src/frontends/onnx/frontend/src/core/graph.hpp @@ -10,19 +10,19 @@ #include #include +#include "common/extension_holder.hpp" #include "core/graph_cache.hpp" #include "core/model.hpp" #include "ngraph/function.hpp" #include "ngraph/op/parameter.hpp" #include "onnx_import/core/operator_set.hpp" -#include "openvino/frontend/extension/telemetry.hpp" namespace ngraph { namespace onnx_import { class Graph : public std::enable_shared_from_this { public: - Graph(const std::shared_ptr& model_proto2, - const std::shared_ptr& telemetry = {}); + Graph(const std::shared_ptr& model_proto, + ov::frontend::ExtensionHolder extensions = {}); Graph() = delete; Graph(const Graph&) = delete; @@ -45,14 +45,14 @@ public: const OpsetImports& get_opset_imports() const; virtual ~Graph() = default; - const std::shared_ptr& get_telemetry() const { - return m_telemetry; + const ov::frontend::ExtensionHolder& get_extensions() const { + return m_extensions; } protected: Graph(const std::shared_ptr& model, std::unique_ptr&& cache, - const std::shared_ptr& telemetry = {}); + ov::frontend::ExtensionHolder extensions = {}); void set_friendly_names(const Node& onnx_node, const OutputVector& ng_subgraph_outputs) const; @@ -65,10 +65,10 @@ protected: ParameterVector m_parameters; std::unique_ptr m_model; std::unique_ptr m_cache; + ov::frontend::ExtensionHolder m_extensions = {}; private: std::vector m_nodes; - std::shared_ptr m_telemetry; }; /// \brief Representation of ONNX subgraph. It is used for example by ONNX Loop op. 
diff --git a/src/frontends/onnx/frontend/src/editor.cpp b/src/frontends/onnx/frontend/src/editor.cpp index f09fba33646..0fffd63c53e 100644 --- a/src/frontends/onnx/frontend/src/editor.cpp +++ b/src/frontends/onnx/frontend/src/editor.cpp @@ -221,19 +221,17 @@ struct onnx_editor::ONNXModelEditor::Impl { #endif }; -onnx_editor::ONNXModelEditor::ONNXModelEditor(const std::string& model_path, - const std::shared_ptr& telemetry) +onnx_editor::ONNXModelEditor::ONNXModelEditor(const std::string& model_path, frontend::ExtensionHolder extensions) : m_model_path{model_path}, - m_telemetry(telemetry), + m_extensions{std::move(extensions)}, m_pimpl{new ONNXModelEditor::Impl{model_path}, [](Impl* impl) { delete impl; }} {} #if defined(OPENVINO_ENABLE_UNICODE_PATH_SUPPORT) && defined(_WIN32) -onnx_editor::ONNXModelEditor::ONNXModelEditor(const std::wstring& model_path, - const std::shared_ptr& telemetry) +onnx_editor::ONNXModelEditor::ONNXModelEditor(const std::wstring& model_path, frontend::ExtensionHolder extensions) : m_model_path{ngraph::file_util::wstring_to_string(model_path)}, - m_telemetry(telemetry), + m_extensions{std::move(extensions)}, m_pimpl{new ONNXModelEditor::Impl{model_path}, [](Impl* impl) { delete impl; }} {} @@ -241,9 +239,9 @@ onnx_editor::ONNXModelEditor::ONNXModelEditor(const std::wstring& model_path, onnx_editor::ONNXModelEditor::ONNXModelEditor(std::istream& model_stream, const std::string& model_path, - const std::shared_ptr& telemetry) + frontend::ExtensionHolder extensions) : m_model_path{model_path}, - m_telemetry(telemetry), + m_extensions{std::move(extensions)}, m_pimpl{new ONNXModelEditor::Impl{model_stream}, [](Impl* impl) { delete impl; }} {} @@ -436,7 +434,7 @@ std::string onnx_editor::ONNXModelEditor::model_string() const { } std::shared_ptr onnx_editor::ONNXModelEditor::get_function() const { - return ngraph::onnx_import::detail::import_onnx_model(m_pimpl->m_model_proto, m_model_path, m_telemetry); + return 
ngraph::onnx_import::detail::import_onnx_model(m_pimpl->m_model_proto, m_model_path, m_extensions); } void onnx_editor::ONNXModelEditor::set_input_values( @@ -625,7 +623,7 @@ std::vector onnx_editor::ONNXModelEditor::get_output_ports(const Ed } std::shared_ptr onnx_editor::ONNXModelEditor::decode() { - return ngraph::onnx_import::detail::decode_to_framework_nodes(m_pimpl->m_model_proto, m_model_path, m_telemetry); + return ngraph::onnx_import::detail::decode_to_framework_nodes(m_pimpl->m_model_proto, m_model_path, m_extensions); } void onnx_editor::ONNXModelEditor::add_output(const OutputEdge& output_edge) const { diff --git a/src/frontends/onnx/frontend/src/editor.hpp b/src/frontends/onnx/frontend/src/editor.hpp index 393ab4418ee..09e8939afac 100644 --- a/src/frontends/onnx/frontend/src/editor.hpp +++ b/src/frontends/onnx/frontend/src/editor.hpp @@ -8,12 +8,14 @@ #include #include +#include "common/extension_holder.hpp" #include "editor_types.hpp" #include "ngraph/function.hpp" #include "ngraph/op/constant.hpp" #include "ngraph/partial_shape.hpp" #include "ngraph/type/element_type.hpp" #include "onnx_import/onnx_importer_visibility.hpp" +#include "openvino/frontend/extension/progress_reporter_extension.hpp" #include "openvino/frontend/extension/telemetry.hpp" namespace ov { @@ -29,11 +31,9 @@ public: /// is parsed and loaded into the m_model_proto member variable. /// /// \param model_path Path to the file containing the model. - ONNXModelEditor(const std::string& model_path, - const std::shared_ptr& telemetry = {}); + ONNXModelEditor(const std::string& model_path, frontend::ExtensionHolder extensions = {}); #if defined(OPENVINO_ENABLE_UNICODE_PATH_SUPPORT) && defined(_WIN32) - ONNXModelEditor(const std::wstring& model_path, - const std::shared_ptr& telemetry = {}); + ONNXModelEditor(const std::wstring& model_path, frontend::ExtensionHolder extensions = {}); #endif /// \brief Creates an editor from a model stream. 
The stream is parsed and loaded @@ -44,7 +44,7 @@ public: /// for ONNX external weights feature support. ONNXModelEditor(std::istream& model_stream, const std::string& path = "", - const std::shared_ptr& telemetry = {}); + frontend::ExtensionHolder extensions = {}); /// \brief Modifies the in-memory representation of the model by setting /// custom input types for all inputs specified in the provided map. @@ -296,7 +296,7 @@ public: private: void update_mapper_if_needed() const; - std::shared_ptr m_telemetry; + frontend::ExtensionHolder m_extensions; const std::string m_model_path; struct Impl; diff --git a/src/frontends/onnx/frontend/src/frontend.cpp b/src/frontends/onnx/frontend/src/frontend.cpp index 9f7180b86ce..813c3949aa8 100644 --- a/src/frontends/onnx/frontend/src/frontend.cpp +++ b/src/frontends/onnx/frontend/src/frontend.cpp @@ -12,7 +12,6 @@ #include #include "onnx_common/onnx_model_validator.hpp" -#include "openvino/frontend/extension/telemetry.hpp" using namespace ov; using namespace ov::frontend::onnx; @@ -36,27 +35,27 @@ InputModel::Ptr FrontEnd::load_impl(const std::vector& variants) const } if (variants[0].is()) { const auto path = variants[0].as(); - return std::make_shared(path, m_telemetry); + return std::make_shared(path, m_extensions); } #if defined(OPENVINO_ENABLE_UNICODE_PATH_SUPPORT) && defined(_WIN32) if (variants[0].is()) { const auto path = variants[0].as(); - return std::make_shared(path, m_telemetry); + return std::make_shared(path, m_extensions); } #endif if (variants[0].is()) { const auto stream = variants[0].as(); if (variants.size() > 1 && variants[1].is()) { const auto path = variants[0].as(); - return std::make_shared(*stream, path, m_telemetry); + return std::make_shared(*stream, path, m_extensions); } #if defined(OPENVINO_ENABLE_UNICODE_PATH_SUPPORT) && defined(_WIN32) if (variants.size() > 1 && variants[1].is()) { const auto path = variants[1].as(); - return std::make_shared(*stream, path, m_telemetry); + return 
std::make_shared(*stream, path, m_extensions); } #endif - return std::make_shared(*stream, m_telemetry); + return std::make_shared(*stream, m_extensions); } return nullptr; } @@ -135,6 +134,8 @@ bool FrontEnd::supported_impl(const std::vector& variants) const { void FrontEnd::add_extension(const std::shared_ptr& extension) { if (auto telemetry = std::dynamic_pointer_cast(extension)) { - m_telemetry = telemetry; + m_extensions.telemetry = telemetry; + } else if (auto progress_reporter = std::dynamic_pointer_cast(extension)) { + m_extensions.progress_reporter = progress_reporter; } } diff --git a/src/frontends/onnx/frontend/src/input_model.cpp b/src/frontends/onnx/frontend/src/input_model.cpp index c1072b48fff..d712b6c3f8f 100644 --- a/src/frontends/onnx/frontend/src/input_model.cpp +++ b/src/frontends/onnx/frontend/src/input_model.cpp @@ -14,27 +14,23 @@ using namespace ov::frontend::onnx; NGRAPH_SUPPRESS_DEPRECATED_START -InputModel::InputModel(const std::string& path, const std::shared_ptr& telemetry) - : m_editor{std::make_shared(path, telemetry)} {} +InputModel::InputModel(const std::string& path, frontend::ExtensionHolder extensions) + : m_editor{std::make_shared(path, std::move(extensions))} {} #if defined(OPENVINO_ENABLE_UNICODE_PATH_SUPPORT) && defined(_WIN32) -InputModel::InputModel(const std::wstring& path, const std::shared_ptr& telemetry) - : m_editor{std::make_shared(path, telemetry)} {} +InputModel::InputModel(const std::wstring& path, frontend::ExtensionHolder extensions) + : m_editor{std::make_shared(path, std::move(extensions))} {} #endif -InputModel::InputModel(std::istream& model_stream, const std::shared_ptr& telemetry) - : m_editor{std::make_shared(model_stream, "", telemetry)} {} +InputModel::InputModel(std::istream& model_stream, frontend::ExtensionHolder extensions) + : m_editor{std::make_shared(model_stream, "", std::move(extensions))} {} -InputModel::InputModel(std::istream& model_stream, - const std::string& path, - const std::shared_ptr& 
telemetry) - : m_editor{std::make_shared(model_stream, path, telemetry)} {} +InputModel::InputModel(std::istream& model_stream, const std::string& path, frontend::ExtensionHolder extensions) + : m_editor{std::make_shared(model_stream, path, std::move(extensions))} {} #ifdef OPENVINO_ENABLE_UNICODE_PATH_SUPPORT -InputModel::InputModel(std::istream& model_stream, - const std::wstring& path, - const std::shared_ptr& telemetry) - : InputModel(model_stream, ov::util::wstring_to_string(path), telemetry) {} +InputModel::InputModel(std::istream& model_stream, const std::wstring& path, frontend::ExtensionHolder extensions) + : InputModel(model_stream, ov::util::wstring_to_string(path), std::move(extensions)) {} #endif std::vector InputModel::get_inputs() const { diff --git a/src/frontends/onnx/frontend/src/input_model.hpp b/src/frontends/onnx/frontend/src/input_model.hpp index 64d94b16129..863a253f02e 100644 --- a/src/frontends/onnx/frontend/src/input_model.hpp +++ b/src/frontends/onnx/frontend/src/input_model.hpp @@ -8,26 +8,24 @@ #include #include +#include "common/extension_holder.hpp" + namespace ov { namespace frontend { namespace onnx { class InputModel : public ov::frontend::InputModel { public: - InputModel(const std::string& path, const std::shared_ptr& telemetry = {}); + InputModel(const std::string& path, ExtensionHolder extensions = {}); #if defined(OPENVINO_ENABLE_UNICODE_PATH_SUPPORT) && defined(_WIN32) - InputModel(const std::wstring& path, const std::shared_ptr& telemetry = {}); + InputModel(const std::wstring& path, ExtensionHolder extensions = {}); #endif - InputModel(std::istream& model_stream, const std::shared_ptr& telemetry = {}); + InputModel(std::istream& model_stream, ExtensionHolder extensions = {}); // The path can be required even if the model is passed as a stream because it is necessary // for ONNX external data feature - InputModel(std::istream& model_stream, - const std::string& path, - const std::shared_ptr& telemetry = {}); + 
InputModel(std::istream& model_stream, const std::string& path, ExtensionHolder extensions = {}); #ifdef OPENVINO_ENABLE_UNICODE_PATH_SUPPORT - InputModel(std::istream& model_stream, - const std::wstring& path, - const std::shared_ptr& telemetry = {}); + InputModel(std::istream& model_stream, const std::wstring& path, ExtensionHolder extensions = {}); #endif std::vector get_inputs() const override; diff --git a/src/frontends/onnx/frontend/src/utils/onnx_internal.cpp b/src/frontends/onnx/frontend/src/utils/onnx_internal.cpp index aff727c9fa8..a8de21fa440 100644 --- a/src/frontends/onnx/frontend/src/utils/onnx_internal.cpp +++ b/src/frontends/onnx/frontend/src/utils/onnx_internal.cpp @@ -89,18 +89,17 @@ void convert_decoded_function(std::shared_ptr function) { std::shared_ptr import_onnx_model(std::shared_ptr model_proto, const std::string& model_path, - const std::shared_ptr& telemetry) { + ov::frontend::ExtensionHolder extensions) { apply_transformations(*model_proto, model_path); - Graph graph{model_proto, telemetry}; + Graph graph{model_proto, extensions}; return graph.convert(); } -std::shared_ptr decode_to_framework_nodes( - std::shared_ptr model_proto, - const std::string& model_path, - const std::shared_ptr& telemetry) { +std::shared_ptr decode_to_framework_nodes(std::shared_ptr model_proto, + const std::string& model_path, + ov::frontend::ExtensionHolder extensions) { apply_transformations(*model_proto, model_path); - auto graph = std::make_shared(model_proto, telemetry); + auto graph = std::make_shared(model_proto, extensions); return graph->decode(); } } // namespace detail diff --git a/src/frontends/onnx/frontend/src/utils/onnx_internal.hpp b/src/frontends/onnx/frontend/src/utils/onnx_internal.hpp index c564cfe7210..7a839f35146 100644 --- a/src/frontends/onnx/frontend/src/utils/onnx_internal.hpp +++ b/src/frontends/onnx/frontend/src/utils/onnx_internal.hpp @@ -7,8 +7,8 @@ #include #include +#include "common/extension_holder.hpp" #include 
"ngraph/function.hpp" -#include "openvino/frontend/extension/telemetry.hpp" namespace ONNX_NAMESPACE { class ModelProto; @@ -25,32 +25,30 @@ namespace detail { /// library can cause segfaults. If stream parsing fails or the ONNX model /// contains unsupported ops, the function throws an ngraph_error exception. /// -/// \param[in] model_proto Reference to a GraphProto object. -/// \param[in] model_path The path to the imported onnx model. -/// It is required if the imported model uses data saved in -/// external files. +/// \param model_proto Reference to a GraphProto object. +/// \param model_path The path to the imported onnx model. +/// It is required if the imported model uses data saved in external files. +/// \param extensions An object containing a collection of frontend extensions to use during the import process /// /// \return An nGraph function that represents a single output from the created /// graph. std::shared_ptr import_onnx_model(std::shared_ptr model_proto, const std::string& model_path, - const std::shared_ptr& telemetry = {}); + ov::frontend::ExtensionHolder extensions = {}); /// \brief Decode ONNX model to nGraph function with ONNXFrameworkNode(s) /// -/// \param[in] model_proto Reference to a GraphProto object. -/// \param[in] model_path The path to the imported onnx model. -/// It is required if the imported model uses data saved in -/// external files. +/// \param model_proto Reference to a GraphProto object. +/// \param model_path The path to the imported onnx model. +/// It is required if the imported model uses data saved in external files. 
+/// \param extensions An object containing a collection of frontend extensions to use during the import process /// /// \return A nGraph function with ONNXFrameworkNodes -std::shared_ptr decode_to_framework_nodes( - std::shared_ptr model_proto, - const std::string& model_path, - const std::shared_ptr& telemetry = {}); +std::shared_ptr decode_to_framework_nodes(std::shared_ptr model_proto, + const std::string& model_path, + ov::frontend::ExtensionHolder extensions = {}); -/// \brief Converts a nGraph function (onnx model decoded to function with -/// ONNXFrameworkNode(s)) +/// \brief Converts a nGraph function (onnx model decoded to function with ONNXFrameworkNode(s)) /// to a complete function with actual compute operations /// /// \return A nGraph function. diff --git a/tools/mo/openvino/tools/mo/main.py b/tools/mo/openvino/tools/mo/main.py index 339077b2251..2866564171e 100644 --- a/tools/mo/openvino/tools/mo/main.py +++ b/tools/mo/openvino/tools/mo/main.py @@ -33,7 +33,7 @@ from openvino.tools.mo.utils.error import Error, FrameworkError from openvino.tools.mo.utils.find_ie_version import find_ie_version from openvino.tools.mo.utils.get_ov_update_message import get_ov_update_message from openvino.tools.mo.utils.guess_framework import deduce_framework_by_namespace -from openvino.tools.mo.utils.logger import init_logger +from openvino.tools.mo.utils.logger import init_logger, progress_printer from openvino.tools.mo.utils.model_analysis import AnalysisResults from openvino.tools.mo.utils.utils import refer_to_faq_msg from openvino.tools.mo.utils.telemetry_utils import send_params_info, send_framework_info @@ -43,7 +43,7 @@ from openvino.tools.mo.utils.telemetry_utils import get_tid from openvino.tools.mo.front.common.partial_infer.utils import mo_array # pylint: disable=no-name-in-module,import-error -from openvino.frontend import FrontEndManager, TelemetryExtension +from openvino.frontend import FrontEndManager, ProgressReporterExtension, TelemetryExtension def 
replace_ext(name: str, old: str, new: str): @@ -322,6 +322,7 @@ def prepare_ir(argv): if moc_front_end: t.send_event("mo", "conversion_method", moc_front_end.get_name() + "_frontend") moc_front_end.add_extension(TelemetryExtension("mo", t.send_event, t.send_error, t.send_stack_trace)) + moc_front_end.add_extension(ProgressReporterExtension(progress_printer(argv))) ngraph_function = moc_pipeline(argv, moc_front_end) else: t.send_event("mo", "conversion_method", "mo_legacy") diff --git a/tools/mo/openvino/tools/mo/utils/logger.py b/tools/mo/openvino/tools/mo/utils/logger.py index 5a96aace2d5..96a6a84e103 100644 --- a/tools/mo/openvino/tools/mo/utils/logger.py +++ b/tools/mo/openvino/tools/mo/utils/logger.py @@ -6,6 +6,7 @@ import logging as log import os import re import sys +from argparse import Namespace # WA for abseil bug that affects logging while importing TF starting 1.14 version # Link to original issue: https://github.com/abseil/abseil-py/issues/99 @@ -111,3 +112,37 @@ def progress_bar(function: callable): function(*args, **kwargs) return wrapper + +def progress_printer(argv: Namespace): + """ + A higher-order factory function returning a configurable callback displaying a progress bar + Depending on the configuration stored in 'argv' the progress bar can be one-line, multi-line, or silent. + """ + def _progress_bar(progress, total, completed, endline): + bar_len = 20 + + def dots(): + return '.' 
* int(progress * bar_len) + + print('\rProgress: [{:{}}]{:>7.2f}% done'.format(dots(), bar_len, progress*100), end=endline) + sys.stdout.flush() + + def no_progress_bar(progress, total, completed): + """ A 'dummy' progressbar which doesn't print anything """ + pass + + def oneline_progress_bar(progress, total, completed): + """ A callback that always prints the progress in the same line (mimics real GUI progress bar)""" + _progress_bar(progress, total, completed, '') + + def newline_progress_bar(progress, total, completed): + """ A callback that prints an updated progress bar in separate lines """ + _progress_bar(progress, total, completed, '\n') + + if "progress" in argv and argv.progress: + if "stream_output" in argv and argv.stream_output: + return newline_progress_bar + else: + return oneline_progress_bar + else: + return no_progress_bar From 339849fde8535042d66d1919e15c8e394a4cd8d1 Mon Sep 17 00:00:00 2001 From: Alexey Lebedev Date: Tue, 4 Jan 2022 13:54:15 +0300 Subject: [PATCH 42/78] [PYTHON API] add_extension api (#9339) * Pass ov::extension in core.add_extension * fix code style * revert mock and inherit extension on python side * Fix code style * Enable add_extension test * Fix code style * Move Extension to openvino.runtime * use static_cast instead c-cast * Fix code style * fix test --- .../python/src/openvino/runtime/__init__.py | 1 + .../python/src/pyopenvino/core/core.cpp | 19 +++-- .../python/src/pyopenvino/core/extension.cpp | 1 + .../tests/test_inference_engine/test_core.py | 79 +++++++++++-------- 4 files changed, 59 insertions(+), 41 deletions(-) diff --git a/src/bindings/python/src/openvino/runtime/__init__.py b/src/bindings/python/src/openvino/runtime/__init__.py index 850d55bc9bb..8424f2dbd5d 100644 --- a/src/bindings/python/src/openvino/runtime/__init__.py +++ b/src/bindings/python/src/openvino/runtime/__init__.py @@ -70,6 +70,7 @@ from openvino.runtime.ie_api import OVAny from openvino.pyopenvino import Version from openvino.pyopenvino 
import Parameter from openvino.pyopenvino import Tensor +from openvino.pyopenvino import Extension from openvino.pyopenvino import ProfilingInfo from openvino.pyopenvino import get_version from openvino.pyopenvino import get_batch diff --git a/src/bindings/python/src/pyopenvino/core/core.cpp b/src/bindings/python/src/pyopenvino/core/core.cpp index 2201ea00aad..6441e42bfd5 100644 --- a/src/bindings/python/src/pyopenvino/core/core.cpp +++ b/src/bindings/python/src/pyopenvino/core/core.cpp @@ -127,12 +127,19 @@ void regclass_Core(py::module m) { py::arg("device_name"), py::arg("config") = py::dict()); - cls.def( - "add_extension", - [](ov::runtime::Core& self, const std::string& library_path) { - return self.add_extension(library_path); - }, - py::arg("library_path")); + cls.def("add_extension", + static_cast(&ov::runtime::Core::add_extension), + py::arg("library_path")); + + cls.def("add_extension", + static_cast&)>( + &ov::runtime::Core::add_extension), + py::arg("extension")); + + cls.def("add_extension", + static_cast>&)>( + &ov::runtime::Core::add_extension), + py::arg("extensions")); cls.def_property_readonly("available_devices", &ov::runtime::Core::get_available_devices); } diff --git a/src/bindings/python/src/pyopenvino/core/extension.cpp b/src/bindings/python/src/pyopenvino/core/extension.cpp index 0d4bafd28e8..c86d1c08d3c 100644 --- a/src/bindings/python/src/pyopenvino/core/extension.cpp +++ b/src/bindings/python/src/pyopenvino/core/extension.cpp @@ -13,4 +13,5 @@ namespace py = pybind11; void regclass_Extension(py::module m) { py::class_> ext(m, "Extension", py::dynamic_attr()); + ext.def(py::init<>()); } diff --git a/src/bindings/python/tests/test_inference_engine/test_core.py b/src/bindings/python/tests/test_inference_engine/test_core.py index 12436b06738..b275309cf94 100644 --- a/src/bindings/python/tests/test_inference_engine/test_core.py +++ b/src/bindings/python/tests/test_inference_engine/test_core.py @@ -8,7 +8,8 @@ from sys import platform from 
pathlib import Path import openvino.runtime.opset8 as ov -from openvino.runtime import Model, Core, CompiledModel, Tensor, tensor_from_file, compile_model +from openvino.runtime import Model, Core, CompiledModel, Tensor, PartialShape, Extension,\ + tensor_from_file, compile_model from ..conftest import model_path, model_onnx_path, plugins_path, read_image @@ -244,48 +245,46 @@ def test_unregister_plugin(device): assert f"Device with '{device}' name is not registered in the InferenceEngine" in str(e.value) -@pytest.mark.skip(reason="dlSym cannot locate method 'create_extensions': libtemplate_extension.so") @pytest.mark.template_extension -def test_add_extension(device): - model = bytes(b""" +def test_add_extension_template_extension(device): + ir = bytes(b""" - + - - 2 - 2 - 2 + 1 + 3 + 22 + 22 - - + - 2 - 2 - 2 1 + 3 + 22 + 22 - - 2 - 2 - 2 + 1 + 3 + 22 + 22 - 2 - 2 - 2 1 + 3 + 22 + 22 @@ -298,23 +297,33 @@ def test_add_extension(device): core = Core() if platform == "win32": - core.add_extension(library_path="template_extension.dll") + core.add_extension(library_path="ov_template_extension.dll") else: - core.add_extension(library_path="libtemplate_extension.so") - func = core.read_model(model=model, init_from_buffer=True) - assert isinstance(func, Model) + core.add_extension(library_path="libov_template_extension.so") + model = core.read_model(model=ir) + assert isinstance(model, Model) - # input_blob = next(iter(network.input_info)) - # n, c, h, w = network.input_info[input_blob].input_data.shape + before_reshape = PartialShape([1, 3, 22, 22]) + after_reshape = PartialShape([8, 9, 33, 66]) + new_shapes = {"in_data": after_reshape} + assert model.input().partial_shape == before_reshape + model.reshape(new_shapes) + assert model.input().partial_shape == after_reshape - # input_values = np.ndarray(buffer=np.array([1, 2, 3, 4, 5, 6, 7, 8]), shape = (n, c, h, w), dtype=int) - # expected = np.ndarray(buffer=np.array([12, 13, 14, 15, 16, 17, 18, 19]), - # shape = (n, c, 
h, w), dtype=int) - # - # exec_network = core.compile_model(func, device) - # computed = exec_network.infer_new_request(inputs={input_blob : input_values}) - # output_blob = next(iter(network.outputs)) - # assert np.allclose(expected, computed[output_blob], atol=1e-2, rtol=1e-2) + # CVS-74584 + del model + + +def test_add_extension(): + class EmptyExtension(Extension): + def __init__(self) -> None: + super().__init__() + + core = Core() + core.add_extension(EmptyExtension()) + core.add_extension([EmptyExtension(), EmptyExtension()]) + model = core.read_model(model=test_net_xml, weights=test_net_bin) + assert isinstance(model, Model) def test_read_model_from_buffer_no_weights(device): From 2945232982fd8c9d9db980ea7ff8e1f7d7786afa Mon Sep 17 00:00:00 2001 From: Jade Cho Date: Tue, 4 Jan 2022 20:22:11 +0900 Subject: [PATCH 43/78] Split hard sigmoid into eltw linear and eltw clip (#9500) + cldnn supports hard sigmoid activation function but onednn doesn't. + split it into eltwise linear and eltwise clip in add_onednn_optimization_attributes pass. 
--- src/plugins/intel_gpu/src/graph/program_node.cpp | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/src/plugins/intel_gpu/src/graph/program_node.cpp b/src/plugins/intel_gpu/src/graph/program_node.cpp index 767d1e80218..d8687afde9e 100644 --- a/src/plugins/intel_gpu/src/graph/program_node.cpp +++ b/src/plugins/intel_gpu/src/graph/program_node.cpp @@ -781,14 +781,21 @@ void program_node::init_onednn_primitive_attributes() { auto node = cldnn_post_ops[idx].node; if (node->is_type()) { - auto& a_node = node->as(); - if (!a_node.get_primitive()->additional_params_input.empty()) { + auto fused_desc = node->as().get_primitive();; + if (fused_desc->activation_function == cldnn::activation_func::relu_negative_slope + && !fused_desc->additional_params_input.empty()) { auto dep_idx = cldnn_post_ops[idx].dep_start_idx; int oc_dim = node->get_output_layout().size.feature.size(); post_ops.append_prelu(1 << oc_dim); update_onednn_post_op_list(onednn_post_op_type::binary_relu, dep_idx); + } else if (fused_desc->activation_function == cldnn::activation_func::hard_sigmoid) { + // Splits hard_sigmoid activation into eltwise_linear, min and max. 
+ post_ops.append_eltwise(1.0f, dnnl::algorithm::eltwise_linear, + fused_desc->additional_params.a, fused_desc->additional_params.b); + post_ops.append_eltwise(1.0f, dnnl::algorithm::eltwise_clip, 0.0f, 1.0f); + update_onednn_post_op_list(onednn_post_op_type::eltwise_linear, empty_mem); + update_onednn_post_op_list(onednn_post_op_type::eltwise_clip, empty_mem); } else { - auto fused_desc = node->as().get_primitive(); dnnl::algorithm alg = onednn::convert_activation_func(fused_desc->activation_function); post_ops.append_eltwise(1.0f, alg, fused_desc->additional_params.a, fused_desc->additional_params.b); update_onednn_post_op_list(onednn_post_op_type::eltwise_act, empty_mem); From f255c195c5fd9354dfd86379667f99f2f63776cb Mon Sep 17 00:00:00 2001 From: "Min, Byungil" Date: Tue, 4 Jan 2022 22:09:23 +0900 Subject: [PATCH 44/78] [GPU] Add functional and pass test-cases (#8998) Signed-off-by: Min, Byungil --- .../test_module_fusing_reorder.cpp | 288 ++++++++++++++++++ .../module_tests/test_program_helpers.cpp | 87 ++++++ 2 files changed, 375 insertions(+) create mode 100644 inference-engine/thirdparty/clDNN/tests/module_tests/test_module_fusing_reorder.cpp create mode 100644 inference-engine/thirdparty/clDNN/tests/module_tests/test_program_helpers.cpp diff --git a/inference-engine/thirdparty/clDNN/tests/module_tests/test_module_fusing_reorder.cpp b/inference-engine/thirdparty/clDNN/tests/module_tests/test_module_fusing_reorder.cpp new file mode 100644 index 00000000000..a36b3c8ad22 --- /dev/null +++ b/inference-engine/thirdparty/clDNN/tests/module_tests/test_module_fusing_reorder.cpp @@ -0,0 +1,288 @@ +// Copyright (C) 2018-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +#include "test_utils.h" +#include "program_helpers.h" +#include "layout_optimizer.h" + +#include +#include +#include "intel_gpu/primitives/reorder.hpp" +#include + +#include 
+#include + +using namespace cldnn; +using namespace ::tests; +using namespace testing; + + +static void setting_node(program::ptr prog, const primitive_id& id, layout new_layout) { + auto itr = prog->get_processing_order().begin(); + while (itr != prog->get_processing_order().end()) { + auto node_ptr = *itr++; + if (node_ptr->id() == id) + node_ptr->set_output_layout(new_layout); + } +} + +// To test removal of reorder for mixed precision of Onednn conv kernel (conv: u8->fp32) +TEST(test_can_fuse_reorder, reorder_for_mixed_type_convolution_fsv32_onednn) +{ + build_options build_opt; + topology topology; +#ifdef ENABLE_ONEDNN_FOR_GPU + auto& engine = get_onednn_test_engine(); +#else + auto& engine = get_test_engine(); +#endif + + layout reorder_layout(data_types::u8, format::b_fs_yx_fsv32, {1, 32, 2, 2}, padding({0, }, 0)); + auto input = engine.allocate_memory({ data_types::u8, format::bfyx, {1, 3, 2, 2} }); + auto weights = engine.allocate_memory({ data_types::u8, format::bfyx, {1, 3, 2, 2} }); + auto bias = engine.allocate_memory({ data_types::u8, format::bfyx, {1, 3, 1, 1} }); + + topology.add(input_layout("input", input->get_layout())); + topology.add(data("weights", weights)); + topology.add(data("bias", bias)); + topology.add(reorder("reorder_input", "input", format::b_fs_yx_fsv32, data_types::u8)); + topology.add(cldnn::convolution("conv", { "reorder_input" }, { "weights" }, { "bias"}, 1, tensor{1}, tensor{0}, tensor{1}, {1, 32, 2, 2}, data_types::f32, false)); + topology.add(reorder("reorder_conv", "conv", reorder_layout)); + + program::ptr prog = program::build_program(engine, topology, build_opt, false, true); + layout_optimizer lo = layout_optimizer(); + lo.set_optimization_attribute(layout_optimizer::optimization_attributes_type::use_onednn_impls, true); + + auto itr = prog->get_processing_order().begin(); + while (itr != prog->get_processing_order().end()) { + auto node_ptr = *itr++; + if (!node_ptr->is_type() || node_ptr->id() != "reorder_input") // 
target reorder + continue; + + auto& node = node_ptr->as(); + auto& input = node.input(); + for (auto usr : node_ptr->get_users()) { + auto temp = usr->get_output_layout(); + EXPECT_EQ(false, lo.can_fuse_reorder(input, *usr, node.input().get_output_layout().format, usr->get_output_layout().format)); + } + } +} + +// To test mixed precision of Cldnn conv kernel (conv: u8->fp32) +TEST(test_can_fuse_reorder, reorder_for_mixed_type_convolution_fsv32_cldnn) +{ + build_options build_opt; + topology topology; +#ifdef ENABLE_ONEDNN_FOR_GPU + auto& engine = get_onednn_test_engine(); +#else + auto& engine = get_test_engine(); +#endif + + layout reorder_layout(data_types::u8, format::b_fs_yx_fsv32, {1, 32, 2, 2}, padding({0, }, 0)); + auto input = engine.allocate_memory({ data_types::u8, format::bfyx, {1, 3, 2, 2} }); + auto weights = engine.allocate_memory({ data_types::u8, format::bfyx, {1, 3, 2, 2} }); + auto bias = engine.allocate_memory({ data_types::u8, format::bfyx, {1, 3, 1, 1} }); + + topology.add(input_layout("input", input->get_layout())); + topology.add(data("weights", weights)); + topology.add(data("bias", bias)); + topology.add(reorder("reorder_input", "input", format::b_fs_yx_fsv32, data_types::u8)); + topology.add(cldnn::convolution("conv", { "reorder_input" }, { "weights" }, { "bias"}, 1, tensor{1}, tensor{0}, tensor{1}, {1, 32, 2, 2}, data_types::f32, false)); + topology.add(reorder("reorder_conv", "conv", reorder_layout)); + + program::ptr prog = program::build_program(engine, topology, build_opt, false, true); + layout_optimizer lo = layout_optimizer(); + lo.set_optimization_attribute(layout_optimizer::optimization_attributes_type::use_onednn_impls, false); + + auto itr = prog->get_processing_order().begin(); + while (itr != prog->get_processing_order().end()) { + auto node_ptr = *itr++; + if (!node_ptr->is_type() || node_ptr->id() != "reorder_input") // target reorder + continue; + + auto& node = node_ptr->as(); + auto& input = node.input(); + for (auto 
usr : node_ptr->get_users()) { + auto temp = usr->get_output_layout(); + EXPECT_EQ(true, lo.can_fuse_reorder(input, *usr, node.input().get_output_layout().format, usr->get_output_layout().format)); + } + } +} + + +struct reorder_test_param { + format input_format; + format output_format; + data_types input_data_type; + data_types output_data_type; + tensor in_shape; + tensor out_shape; + tensor weight_shape; + tensor stride; + tensor pad; + data_types weights_type; + format weights_format; + bool expected_result; +}; + +template +class ReorderTest : public ::testing::TestWithParam { +public: +#ifdef ENABLE_ONEDNN_FOR_GPU + cldnn::engine& engine = get_onednn_test_engine(); +#else + cldnn::engine& engine = get_test_engine(); +#endif + + layout get_input_layout(T& p) { + auto pad = p.pad; + std::vector pad_ = { 0, 0, pad.spatial[0], pad.spatial[1] }; + return layout{ p.data_type, p.input_format, p.in_shape, padding{pad_} }; + } + + bool check_supports_immad() { + return this->engine.get_device_info().supports_immad; + } +}; + +// Not to fuse a reorder if the next conv has deep depth input +class test_fused_reorder_deep_depth : public ReorderTest {}; +TEST_P(test_fused_reorder_deep_depth, no_removal_for_deep_depth_conv) +{ + build_options build_opt; + topology topology; + auto p = GetParam(); + + layout conv_layout(p.input_data_type, p.output_format, p.out_shape, padding({0, }, 0)); + layout reorder_layout(p.output_data_type, p.output_format, p.out_shape, padding({0, }, 0)); + auto input = engine.allocate_memory({ p.input_data_type, p.input_format, p.in_shape }); + auto weights = engine.allocate_memory({ p.input_data_type, p.input_format, p.weight_shape }); + auto bias = engine.allocate_memory({ p.input_data_type, p.input_format, p.weight_shape }); + + topology.add(input_layout("input", input->get_layout())); + topology.add(data("weights", weights)); + topology.add(reorder("reorder_input", "input", p.output_format, p.input_data_type)); + 
topology.add(cldnn::convolution("conv", { "reorder_input" }, { "weights" })); + topology.add(reorder("reorder_conv", "conv", reorder_layout)); + + program::ptr prog = program::build_program(engine, topology, build_opt, false, true); + layout_optimizer lo = layout_optimizer(); + lo.set_optimization_attribute(layout_optimizer::optimization_attributes_type::use_onednn_impls, true); + setting_node(prog, "conv", conv_layout); + + auto itr = prog->get_processing_order().begin(); + while (itr != prog->get_processing_order().end()) { + auto node_ptr = *itr++; + if (!node_ptr->is_type() || node_ptr->id() != "reorder_input") // target reorder + continue; + + auto& node = node_ptr->as(); + auto& input = node.input(); + for (auto usr : node_ptr->get_users()) { + auto temp = usr->get_output_layout(); + EXPECT_EQ(p.expected_result, lo.can_fuse_reorder(input, *usr, node.input().get_output_layout().format, usr->get_output_layout().format)); + } + } +} + +INSTANTIATE_TEST_SUITE_P(testing_deep_depth_conv, test_fused_reorder_deep_depth, + ::testing::ValuesIn(std::vector{ + reorder_test_param{format::bfyx, format::b_fs_yx_fsv32, data_types::u8, data_types::u8, {1, 32, 8, 8}, {1, 32, 8, 8}, {1, 32, 1, 1}, + tensor{1}, tensor{0}, data_types::u8, format::goiyx, false}, + reorder_test_param{format::bfyx, format::b_fs_yx_fsv16, data_types::f16, data_types::f16, {1, 32, 8, 8}, {1, 32, 8, 8}, {1, 32, 1, 1}, + tensor{1}, tensor{0}, data_types::f16, format::goiyx, false}, + reorder_test_param{format::bfyx, format::bs_fs_yx_bsv32_fsv32, data_types::u8, data_types::u8, {32, 32, 8, 8}, {32, 32, 8, 8}, {1, 32, 1, 1}, + tensor{1}, tensor{0}, data_types::u8, format::goiyx, false}, + reorder_test_param{format::bfyx, format::bs_fs_yx_bsv32_fsv16, data_types::f16, data_types::f16, {32, 32, 8, 8}, {32, 32, 8, 8}, {1, 32, 1, 1}, + tensor{1}, tensor{0}, data_types::f16, format::goiyx, false}, + })); + +// To test removal of reorder for first convolution optimizing in cldnn kernel (shallow input depth to 
deep output depth) +class test_can_fuse_reorder_first_conv : public ReorderTest {}; +TEST_P(test_can_fuse_reorder_first_conv, reorder_for_firstconv_cldnn) +{ + build_options build_opt; + topology topology; + auto p = GetParam(); + + layout reorder_layout(p.output_data_type, p.output_format, p.out_shape, padding({0, }, 0)); + auto input = engine.allocate_memory({ p.input_data_type, p.input_format, p.in_shape }); + auto weights = engine.allocate_memory({ p.input_data_type, p.input_format, p.weight_shape }); + auto bias = engine.allocate_memory({ p.input_data_type, p.input_format, p.weight_shape }); + + topology.add(input_layout("input", input->get_layout())); + topology.add(data("weights", weights)); + topology.add(data("bias", bias)); + topology.add(reorder("reorder_input", "input", p.output_format, p.input_data_type)); + topology.add(cldnn::convolution("conv2", { "reorder_input" }, { "weights" }, { "bias"}, 1, tensor{1}, tensor{0}, tensor{1}, p.out_shape, p.input_data_type, false)); + topology.add(reorder("reorder_conv", "conv2", reorder_layout)); + + program::ptr prog = program::build_program(engine, topology, build_opt, false, true); + layout_optimizer lo = layout_optimizer(); + lo.set_optimization_attribute(layout_optimizer::optimization_attributes_type::use_onednn_impls, false); + + auto itr = prog->get_processing_order().begin(); + while (itr != prog->get_processing_order().end()) { + auto node_ptr = *itr++; + if (!node_ptr->is_type() || node_ptr->id() != "reorder_input") // target reorder + continue; + + auto& node = node_ptr->as(); + auto& input = node.input(); + for (auto usr : node_ptr->get_users()) { + auto temp = usr->get_output_layout(); + EXPECT_EQ(p.expected_result, lo.can_fuse_reorder(input, *usr, node.input().get_output_layout().format, usr->get_output_layout().format)); + } + } +} + +// To test removal of reorder for first convolution optimizing in onednn kernel (shallow input depth to deep output depth) +TEST_P(test_can_fuse_reorder_first_conv, 
reorder_for_firstconv_onednn) +{ + build_options build_opt; + topology topology; + auto p = GetParam(); + + layout conv_layout(p.input_data_type, p.output_format, p.out_shape, padding({0, }, 0)); + layout reorder_layout(p.output_data_type, p.output_format, p.out_shape, padding({0, }, 0)); + auto input = engine.allocate_memory({ p.input_data_type, p.input_format, p.in_shape }); + auto weights = engine.allocate_memory({ p.input_data_type, p.input_format, p.weight_shape }); + + topology.add(input_layout("input", input->get_layout())); + topology.add(data("weights", weights)); + topology.add(reorder("reorder_input", "input", p.output_format, p.input_data_type)); + topology.add(cldnn::convolution("conv", { "reorder_input" }, { "weights" })); + topology.add(reorder("reorder_conv", "conv", reorder_layout)); + + program::ptr prog = program::build_program(engine, topology, build_opt, false, true); + layout_optimizer lo = layout_optimizer(); + lo.set_optimization_attribute(layout_optimizer::optimization_attributes_type::use_onednn_impls, true); + setting_node(prog, "conv", conv_layout); + + auto itr = prog->get_processing_order().begin(); + while (itr != prog->get_processing_order().end()) { + auto node_ptr = *itr++; + if (!node_ptr->is_type() || node_ptr->id() != "reorder_input") // target reorder + continue; + + auto& node = node_ptr->as(); + auto& input = node.input(); + for (auto usr : node_ptr->get_users()) { + auto temp = usr->get_output_layout(); + EXPECT_EQ(p.expected_result, lo.can_fuse_reorder(input, *usr, node.input().get_output_layout().format, usr->get_output_layout().format)); + } + } +} + +INSTANTIATE_TEST_SUITE_P(testing_can_fuse_reorder_first_conv, test_can_fuse_reorder_first_conv, + ::testing::ValuesIn(std::vector{ + reorder_test_param{format::bfyx, format::b_fs_yx_fsv32, data_types::u8, data_types::u8, {1, 3, 8, 8}, {1, 32, 8, 8}, {1, 3, 1, 1}, + tensor{1}, tensor{0}, data_types::u8, format::goiyx, true}, + reorder_test_param{format::bfyx, 
format::b_fs_yx_fsv16, data_types::f16, data_types::f16, {1, 3, 8, 8}, {1, 32, 8, 8}, {1, 3, 1, 1}, + tensor{1}, tensor{0}, data_types::f16, format::goiyx, true}, + })); diff --git a/inference-engine/thirdparty/clDNN/tests/module_tests/test_program_helpers.cpp b/inference-engine/thirdparty/clDNN/tests/module_tests/test_program_helpers.cpp new file mode 100644 index 00000000000..4ab72718b95 --- /dev/null +++ b/inference-engine/thirdparty/clDNN/tests/module_tests/test_program_helpers.cpp @@ -0,0 +1,87 @@ +// Copyright (C) 2018-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +#include "test_utils.h" +#include "program_helpers.h" + +#include +#include +#include "intel_gpu/primitives/reorder.hpp" +#include "intel_gpu/primitives/crop.hpp" +#include + +#include +#include + +using namespace cldnn; +using namespace ::tests; +using namespace testing; + +struct test_param { + tensor in_shape; + tensor out_shape; + format input_format; + format output_format; + padding pad1; + padding pad2; + data_types input_data_type; + data_types output_data_type; +}; + +template +class ReorderTest : public ::testing::TestWithParam {}; + +class functional_test : public ReorderTest {}; +TEST_P(functional_test, test_are_layouts_identical) { + auto p = GetParam(); + + layout in_layout(p.input_data_type, p.input_format, p.in_shape, p.pad1); + layout in_layout_no_pad(p.input_data_type, p.input_format, p.in_shape, p.pad2); + layout in_bfyx_layout(p.input_data_type, format::bfyx, p.in_shape, p.pad2); + layout out_layout(p.output_data_type, p.output_format, p.out_shape, p.pad1); + layout out_layout_no_pad(p.output_data_type, p.output_format, p.out_shape, p.pad2); + layout out_bfyx_layout(p.output_data_type, format::bfyx, p.out_shape, p.pad2); + + auto test1 = program_helpers::are_layouts_identical(in_layout, out_layout); + EXPECT_EQ(true, test1.first); + EXPECT_EQ(true, 
test1.second); + auto test2 = program_helpers::are_layouts_identical(in_layout, layout(data_types::f32, p.output_format, p.out_shape, p.pad1)); + EXPECT_EQ(false, test2.first); + EXPECT_EQ(false, test2.second); + auto test3 = program_helpers::are_layouts_identical(in_bfyx_layout, out_bfyx_layout); + EXPECT_EQ(true, test3.first); + EXPECT_EQ(true, test3.second); + auto test4 = program_helpers::are_layouts_identical(in_bfyx_layout, layout(p.input_data_type, format::bfzyx, p.in_shape, p.pad2)); + EXPECT_EQ(false, test4.first); + EXPECT_EQ(true, test4.second); + auto test5 = program_helpers::are_layouts_identical(in_bfyx_layout, layout(p.input_data_type, format::bfzyx, p.in_shape, p.pad1)); + EXPECT_EQ(false, test5.first); + EXPECT_EQ(false, test5.second); + auto test6 = program_helpers::are_layouts_identical(in_layout, layout(p.input_data_type, p.input_format, {1, 32, 16, 16}, p.pad1)); + EXPECT_EQ(false, test6.first); + EXPECT_EQ(false, test6.second); + auto test7 = program_helpers::are_layouts_identical(in_bfyx_layout, layout(p.input_data_type, format::b_fs_yx_fsv32, p.in_shape, p.pad2)); + EXPECT_EQ(false, test7.first); + EXPECT_EQ(false, test7.second); + auto test8 = program_helpers::are_layouts_identical(layout(p.input_data_type, format::b_fs_yx_fsv16, p.in_shape, p.pad2), in_bfyx_layout); + EXPECT_EQ(false, test8.first); + EXPECT_EQ(false, test8.second); + auto test9 = program_helpers::are_layouts_identical(in_layout, layout(p.input_data_type, p.input_format, p.in_shape, p.pad2)); + EXPECT_EQ(false, test9.first); + EXPECT_EQ(false, test9.second); + tensor temp = p.in_shape; + temp = temp.sub(p.pad1.lower_size()); + auto test10 = program_helpers::are_layouts_identical(in_bfyx_layout, layout(p.input_data_type, format::bfzyx, temp, p.pad1)); + EXPECT_EQ(false, test10.first); + EXPECT_EQ(false, test10.second); +} + +INSTANTIATE_TEST_SUITE_P(same_in_out, + functional_test, + ::testing::ValuesIn(std::vector{ + test_param{{1, 32, 4, 4}, {1, 32, 4, 4}, 
format::b_fs_yx_fsv32, format::b_fs_yx_fsv32, padding({0, 0, 1, 1}, 0), padding({0, 0, 0, 0}, 0), data_types::f16, data_types::f16}, + test_param{{1, 32, 4, 4}, {1, 32, 4, 4}, format::bfyx, format::bfyx, padding({0, 0, 1, 1}, 0), padding({0, 0, 0, 0}, 0), data_types::u8, data_types::u8} + })); From e89db1c6de8eb551949330114d476a2a4be499ed Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dawid=20Ko=C5=BCykowski?= Date: Wed, 5 Jan 2022 12:10:16 +0100 Subject: [PATCH 45/78] Fix "Unexpected number of outputs after override_all_outputs" (#9454) --- .../test_frontend_onnx_editor.py | 113 ++++++++++++++++-- .../onnx/frontend/src/input_model.cpp | 79 ++++++++++-- .../onnx/frontend/src/input_model.hpp | 1 + 3 files changed, 173 insertions(+), 20 deletions(-) diff --git a/src/bindings/python/tests/test_frontend/test_frontend_onnx_editor.py b/src/bindings/python/tests/test_frontend/test_frontend_onnx_editor.py index fe02533d05e..2e9acaf9021 100644 --- a/src/bindings/python/tests/test_frontend/test_frontend_onnx_editor.py +++ b/src/bindings/python/tests/test_frontend/test_frontend_onnx_editor.py @@ -49,6 +49,16 @@ from openvino.frontend import FrontEndManager # | | # out1 out2 # +# +# ------Test input model 3------ +# in1 in2 +# | / \ +# +--------+ +------+ +# | Add | | Relu | +# +--------+ +------+ +# | | +# out1 out2 +# def create_test_onnx_models(): models = {} # Input model 1 @@ -91,6 +101,23 @@ def create_test_onnx_models(): models["input_model_2.onnx"] = make_model(graph, producer_name="ONNX Importer", opset_imports=[onnx.helper.make_opsetid("", 13)]) + # Input model 3 + add_2 = onnx.helper.make_node("Add", inputs=["in1", "in2"], outputs=["out1"], name="onnx_add_op") + relu_2 = onnx.helper.make_node("Relu", inputs=["in2"], outputs=["out2"]) + + input_tensors = [ + make_tensor_value_info("in1", onnx.TensorProto.FLOAT, (2, 2)), + make_tensor_value_info("in2", onnx.TensorProto.FLOAT, (2, 2)), + ] + output_tensors = [ + make_tensor_value_info("out1", onnx.TensorProto.FLOAT, (2, 2)), 
+ make_tensor_value_info("out1", onnx.TensorProto.FLOAT, (2, 2)), + make_tensor_value_info("out2", onnx.TensorProto.FLOAT, (2, 2)), + ] + graph = make_graph([add_2, relu_2], "test_graph_3", input_tensors, output_tensors) + models["input_model_3.onnx"] = make_model(graph, producer_name="ONNX Importer", + opset_imports=[onnx.helper.make_opsetid("", 13)]) + # Expected for extract_subgraph input_tensors = [ make_tensor_value_info("in1", onnx.TensorProto.FLOAT, (2, 2)), @@ -188,6 +215,19 @@ def create_test_onnx_models(): models["test_override_all_outputs_2.onnx"] = make_model(graph, producer_name="ONNX Importer", opset_imports=[onnx.helper.make_opsetid("", 13)]) + # Expected for test_override_all_outputs 3 + input_tensors = [ + make_tensor_value_info("in1", onnx.TensorProto.FLOAT, (2, 2)), + make_tensor_value_info("in2", onnx.TensorProto.FLOAT, (2, 2)), + ] + output_tensors = [ + make_tensor_value_info("out1", onnx.TensorProto.FLOAT, (2, 2)), + make_tensor_value_info("out1", onnx.TensorProto.FLOAT, (2, 2)), + ] + graph = make_graph([add_2], "test_graph_3", input_tensors, output_tensors) + models["test_override_all_outputs_3.onnx"] = make_model(graph, producer_name="ONNX Importer", + opset_imports=[onnx.helper.make_opsetid("", 13)]) + # Expected for test_override_all_inputs input_tensors = [ make_tensor_value_info("in3", onnx.TensorProto.FLOAT, (2, 2)), @@ -594,6 +634,50 @@ def test_override_all_outputs_2(): assert res +def test_override_all_outputs_3(): + skip_if_onnx_frontend_is_disabled() + fe = fem.load_by_framework(framework=ONNX_FRONTEND_NAME) + assert fe + + model = fe.load("input_model_3.onnx") + assert model + + place1 = model.get_place_by_tensor_name(tensor_name="out1") + place2 = model.get_place_by_tensor_name(tensor_name="out1") + model.override_all_outputs(outputs=[place1, place2]) + result_func = fe.convert(model) + + expected_model = fe.load("test_override_all_outputs_3.onnx") + expected_func = fe.convert(expected_model) + + res = 
compare_functions(result_func, expected_func) + assert res + + +def test_override_all_outputs_invalid_place(): + skip_if_onnx_frontend_is_disabled() + fe = fem.load_by_framework(framework=ONNX_FRONTEND_NAME) + assert fe + + model = fe.load("input_model_3.onnx") + assert model + + model2 = fe.load("input_model.onnx") + assert model2 + invalid_place = model2.get_place_by_tensor_name(tensor_name="out3") + + place1 = model.get_place_by_tensor_name(tensor_name="out1") + place2 = model.get_place_by_tensor_name(tensor_name="out1") + model.override_all_outputs(outputs=[place1, place2, invalid_place]) + result_func = fe.convert(model) + + expected_model = fe.load("test_override_all_outputs_3.onnx") + expected_func = fe.convert(expected_model) + + res = compare_functions(result_func, expected_func) + assert res + + def test_override_all_inputs(): skip_if_onnx_frontend_is_disabled() fe = fem.load_by_framework(framework=ONNX_FRONTEND_NAME) @@ -618,26 +702,31 @@ def test_override_all_inputs(): assert res -def test_override_all_inputs_exceptions(): +def test_override_all_inputs_invalid_place(): skip_if_onnx_frontend_is_disabled() fe = fem.load_by_framework(framework=ONNX_FRONTEND_NAME) assert fe - model = fe.load("input_model.onnx") + model = fe.load("input_model_3.onnx") assert model - place1 = model.get_place_by_tensor_name(tensor_name="in1") - place2 = model.get_place_by_tensor_name(tensor_name="in2") - place3 = model.get_place_by_operation_name_and_input_port(operation_name="split1", input_port_index=0) - place4 = model.get_place_by_tensor_name(tensor_name="in3") + model2 = fe.load("input_model.onnx") + assert model2 - with pytest.raises(Exception) as e: - model.override_all_inputs(inputs=[place1, place2]) - assert "Unexpected number of inputs after override_all_inputs" in str(e) + out3_tensor = model2.get_place_by_tensor_name(tensor_name="out3") + invalid_place = out3_tensor.get_producing_operation().get_input_port(input_port_index=0) - with pytest.raises(Exception) as e: - 
model.override_all_inputs(inputs=[place3, place4]) - assert "Unexpected number of inputs after override_all_inputs" in str(e) + out1_tensor = model.get_place_by_tensor_name(tensor_name="out1") + place1 = out1_tensor.get_producing_operation().get_input_port(input_port_index=0) + place2 = out1_tensor.get_producing_operation().get_input_port(input_port_index=1) + model.override_all_inputs(inputs=[place1, place2, invalid_place]) + result_func = fe.convert(model) + + expected_model = fe.load("input_model_3.onnx") + expected_func = fe.convert(expected_model) + + res = compare_functions(result_func, expected_func) + assert res def test_is_input_output(): diff --git a/src/frontends/onnx/frontend/src/input_model.cpp b/src/frontends/onnx/frontend/src/input_model.cpp index d712b6c3f8f..81cef6f947f 100644 --- a/src/frontends/onnx/frontend/src/input_model.cpp +++ b/src/frontends/onnx/frontend/src/input_model.cpp @@ -7,6 +7,7 @@ #include #include +#include "ngraph/log.hpp" #include "place.hpp" using namespace ov; @@ -202,28 +203,90 @@ std::shared_ptr InputModel::convert() { } // Editor features +bool InputModel::is_correct_place(const ov::frontend::Place::Ptr& place) const { + if (const auto tensor = std::dynamic_pointer_cast(place)) { + return m_editor->is_correct_tensor_name(tensor->get_names()[0]); + } + if (const auto op = std::dynamic_pointer_cast(place)) { + return m_editor->is_correct_and_unambiguous_node(op->get_editor_node()); + } + if (const auto input_edge = std::dynamic_pointer_cast(place)) { + if (auto tensor = std::dynamic_pointer_cast(input_edge->get_source_tensor())) { + return m_editor->is_correct_tensor_name(tensor->get_names()[0]); + } + } + if (const auto output_edge = std::dynamic_pointer_cast(place)) { + if (auto tensor = std::dynamic_pointer_cast(output_edge->get_target_tensor())) { + return m_editor->is_correct_tensor_name(tensor->get_names()[0]); + } + } + return false; +} + void InputModel::override_all_outputs(const std::vector& outputs) { - 
extract_subgraph({}, outputs); - NGRAPH_CHECK(m_editor->model_outputs().size() == outputs.size(), - "Unexpected number of outputs after override_all_outputs"); - NGRAPH_CHECK(std::all_of(std::begin(outputs), - std::end(outputs), + std::vector expected_valid_outputs; + for (const auto& output : outputs) { + bool is_correct = is_correct_place(output); + if (!is_correct) + NGRAPH_WARN << "Name " << output->get_names().at(0) + << " of output node is not a correct node name. Ignoring this parameter."; + else + expected_valid_outputs.push_back(output); + } + + extract_subgraph({}, expected_valid_outputs); + + NGRAPH_CHECK(std::all_of(std::begin(expected_valid_outputs), + std::end(expected_valid_outputs), [](const ov::frontend::Place::Ptr& place) { return place->is_output(); }), "Not all provided arguments of override_all_outputs are new outputs of the model"); + + const auto current_outputs = get_outputs(); + NGRAPH_CHECK(std::all_of(std::begin(current_outputs), + std::end(current_outputs), + [&](const Place::Ptr& current_out) { + return std::find_if(std::begin(expected_valid_outputs), + std::end(expected_valid_outputs), + [&](const Place::Ptr& expected_out) { + return expected_out->is_equal(current_out); + }) != std::end(current_outputs); + }), + "Some other than expected outputs were created during override_all_outputs"); } void InputModel::override_all_inputs(const std::vector& inputs) { + std::vector expected_valid_inputs; + for (const auto& input : inputs) { + bool is_correct = is_correct_place(input); + if (!is_correct) + NGRAPH_WARN << "Name " << input->get_names().at(0) + << " of input node is not a correct node. 
Ignoring this parameter."; + else + expected_valid_inputs.push_back(input); + } + const auto outputs_before_extraction = m_editor->model_outputs(); - extract_subgraph({inputs}, {}); + extract_subgraph({expected_valid_inputs}, {}); + NGRAPH_CHECK(std::equal(std::begin(outputs_before_extraction), std::end(outputs_before_extraction), std::begin(m_editor->model_outputs())), "All outputs should be preserved after override_all_inputs. Provided inputs does " "not satisfy all outputs"); - NGRAPH_CHECK(m_editor->model_inputs().size() == inputs.size(), - "Unexpected number of inputs after override_all_inputs"); + + const auto current_inputs = get_inputs(); + NGRAPH_CHECK(std::all_of(std::begin(current_inputs), + std::end(current_inputs), + [&](const Place::Ptr& current_in) { + return std::find_if(std::begin(expected_valid_inputs), + std::end(expected_valid_inputs), + [&](const Place::Ptr& expected_in) { + return expected_in->is_equal(current_in); + }) != std::end(current_inputs); + }), + "Some other than expected inputs were created during override_all_inputs"); } void InputModel::extract_subgraph(const std::vector& inputs, diff --git a/src/frontends/onnx/frontend/src/input_model.hpp b/src/frontends/onnx/frontend/src/input_model.hpp index 863a253f02e..e8fe258259a 100644 --- a/src/frontends/onnx/frontend/src/input_model.hpp +++ b/src/frontends/onnx/frontend/src/input_model.hpp @@ -78,6 +78,7 @@ public: private: std::shared_ptr m_editor; + bool is_correct_place(const ov::frontend::Place::Ptr& place) const; std::unordered_map> m_additional_tensor_names; void add_tensor_names(std::shared_ptr& model); From 89f48e0558d523a5397583b9e6f27132203fd88b Mon Sep 17 00:00:00 2001 From: hyunback kim Date: Thu, 6 Jan 2022 12:41:09 +0900 Subject: [PATCH 46/78] [GPU] Enable implicit concat batch1 in oneDNN. (#9424) * [GPU] Enable implicit concat batch1 in oneDNN. * Use gpu_usm memory offset, enable implicit concat batch1 in oneDNN. 
And optimized_out node doesn't always have to be mutable input, so add to check whether mutable input is existed in optimized node. * Update to check use_usm condition in implicit concat. * Add the condition for implicit concat. * implicit concat's dependency should not be fused_op with eltwise. * Buffer reuse is required for onednn sum post operation, output padding did the buffer reuse failure. Signed-off-by: hyunback --- .../include/intel_gpu/runtime/memory.hpp | 2 +- .../graph_optimizer/add_required_reorders.cpp | 36 ++++++++++++++---- .../graph_optimizer/prepare_buffer_fusing.cpp | 37 +++++++++++++++--- .../impls/onednn/primitive_onednn_base.h | 10 +++-- .../src/graph/impls/onednn/utils.cpp | 31 +++++++++++++++ .../src/graph/impls/onednn/utils.hpp | 2 + .../intel_gpu/src/graph/layout_optimizer.cpp | 38 ++++++++++++++++--- src/plugins/intel_gpu/src/graph/network.cpp | 21 ++++++++-- .../intel_gpu/src/runtime/ocl/ocl_memory.cpp | 7 ++-- .../intel_gpu/src/runtime/ocl/ocl_memory.hpp | 4 +- 10 files changed, 156 insertions(+), 32 deletions(-) diff --git a/src/plugins/intel_gpu/include/intel_gpu/runtime/memory.hpp b/src/plugins/intel_gpu/include/intel_gpu/runtime/memory.hpp index 1b8d845242a..74ef33affff 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/runtime/memory.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/runtime/memory.hpp @@ -68,7 +68,7 @@ struct memory { virtual event::ptr copy_from(stream& /* stream */, const void* /* host_ptr */) = 0; #ifdef ENABLE_ONEDNN_FOR_GPU - virtual dnnl::memory get_onednn_memory(dnnl::memory::desc /* desc */) { + virtual dnnl::memory get_onednn_memory(dnnl::memory::desc /* desc */, int64_t offset = 0) { throw std::runtime_error("[CLDNN] Can't convert memory object to onednn"); } #endif diff --git a/src/plugins/intel_gpu/src/graph/graph_optimizer/add_required_reorders.cpp b/src/plugins/intel_gpu/src/graph/graph_optimizer/add_required_reorders.cpp index 1acc27df9d3..d16d0592ef3 100644 --- 
a/src/plugins/intel_gpu/src/graph/graph_optimizer/add_required_reorders.cpp +++ b/src/plugins/intel_gpu/src/graph/graph_optimizer/add_required_reorders.cpp @@ -51,7 +51,6 @@ void add_required_reorders::run(program& p) { continue; // only nodes with dependencies if (usr->is_type()) continue; - if (usr->type()->does_an_implementation_exist(*usr)) { if (usr->get_preferred_impl_type() != impl_types::onednn) { continue; @@ -62,17 +61,40 @@ void add_required_reorders::run(program& p) { if (!input.is_in_data_flow() || input.is_constant()) continue; - if (static_cast(input.get_output_layout().data_padding)) { - cldnn::layout layout_wo_padding = input.get_output_layout(); - layout_wo_padding.data_padding = cldnn::padding{}; - auto new_reorder = std::make_shared(input.id() + "_padding_reorder_" + usr->id(), input.id(), layout_wo_padding); - auto& new_reorder_node = p.get_or_create(new_reorder); - p.add_intermediate(new_reorder_node, *usr, i); + auto in_padding = input.get_output_layout().data_padding; + if (static_cast(in_padding)) { + bool spatial_padding = false; + for (size_t i = 0; i < in_padding.lower_size().spatial.size(); ++i) { + spatial_padding |= (in_padding.lower_size().spatial[i] != 0); + } + for (size_t i = 0; i < in_padding.upper_size().spatial.size(); ++i) { + spatial_padding |= (in_padding.upper_size().spatial[i] != 0); + } + bool batch_padding = false; + for (size_t i = 0; i < in_padding.lower_size().batch.size(); ++i) { + batch_padding |= (in_padding.lower_size().batch[i] != 0); + } + for (size_t i = 0; i < in_padding.upper_size().batch.size(); ++i) { + batch_padding |= (in_padding.upper_size().batch[i] != 0); + } + if (spatial_padding || batch_padding) { + cldnn::layout layout_padding = input.get_output_layout(); + cldnn::layout layout_wo_padding = input.get_output_layout(); + layout_wo_padding.data_padding = cldnn::padding{}; + layout_wo_padding.data_padding.lower_size().feature = layout_padding.data_padding.lower_size().feature; + 
layout_wo_padding.data_padding.upper_size().feature = layout_padding.data_padding.upper_size().feature; + auto new_reorder = std::make_shared(input.id() + "_padding_reorder_" + usr->id(), input.id(), layout_wo_padding); + auto& new_reorder_node = p.get_or_create(new_reorder); + p.add_intermediate(new_reorder_node, *usr, i); + } else { + continue; + } } } continue; } } + bool correct_layout_selected = false; bool weights_data = (usr->is_type() || usr->is_type() || usr->is_type() || usr->is_type()); diff --git a/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_buffer_fusing.cpp b/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_buffer_fusing.cpp index f77234357d1..e51dcc507a4 100644 --- a/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_buffer_fusing.cpp +++ b/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_buffer_fusing.cpp @@ -87,17 +87,42 @@ bool concat_in_place_optimization::match(concatenation_node& node) { if (node.has_fused_primitives() || !node.get_fused_activations_funcs().empty()) return false; + bool is_onednn_impl = false; + + for (auto& input : node.get_dependencies()) { + if (input->get_preferred_impl_type() == impl_types::onednn) { + for (auto& fused_op : input->get_fused_primitives()) { + if (fused_op.node->is_type() && fused_op.deps.size() == 1) { + auto& eltw_in = input->get_dependency(fused_op.dep_start_idx); + auto eltw_in_layout = eltw_in.get_output_layout(); + auto out_layout = input->get_output_layout(); + + if (!fused_op.node->as().get_primitive()->needs_onednn_sum_post_op(eltw_in_layout)) + continue; + if (program_helpers::are_layouts_identical_for_onednn_sum_post_op(eltw_in_layout, out_layout)) + return false; + } + } + is_onednn_impl = true; + } + } + + // Implicit concat for onednn only when use_usm and batch 1. 
+ if (is_onednn_impl) { + bool use_usm = node.get_program().get_engine().use_unified_shared_memory(); + layout out_l = node.get_output_layout(); + + if (!use_usm) + return false; + if (out_l.size.batch[0] > 1) + return false; + } + // For in place concatenation input layouts and data types must match. auto output_format = node.get_output_layout().format; auto output_datatype = node.get_output_layout().data_type; auto concat_axis = node.get_primitive()->axis; - // oneDNN doens't support paddings and such concat optimizations - for (auto& input : node.get_dependencies()) { - if (input->get_preferred_impl_type() == impl_types::onednn) - return false; - } - for (auto& input : node.get_dependencies()) { if (input->is_type()) // reshapes should be optimized out. diff --git a/src/plugins/intel_gpu/src/graph/impls/onednn/primitive_onednn_base.h b/src/plugins/intel_gpu/src/graph/impls/onednn/primitive_onednn_base.h index fcb6d356194..dc708609562 100644 --- a/src/plugins/intel_gpu/src/graph/impls/onednn/primitive_onednn_base.h +++ b/src/plugins/intel_gpu/src/graph/impls/onednn/primitive_onednn_base.h @@ -156,12 +156,14 @@ protected: { auto& input = instance.input_memory(0); - args.insert({DNNL_ARG_SRC, input.get_onednn_memory(_pd.dnnl::primitive_desc_base::src_desc(0))}); + auto offset = onednn::get_offset(_pd.dnnl::primitive_desc_base::src_desc(0)); + args.insert({DNNL_ARG_SRC, input.get_onednn_memory(_pd.dnnl::primitive_desc_base::src_desc(0), offset)}); } { auto& output = instance.output_memory(); - args.insert({DNNL_ARG_DST, output.get_onednn_memory(_pd.dnnl::primitive_desc_base::dst_desc(0))}); + auto offset = onednn::get_offset(_pd.dnnl::primitive_desc_base::dst_desc(0)); + args.insert({DNNL_ARG_DST, output.get_onednn_memory(_pd.dnnl::primitive_desc_base::dst_desc(0), offset)}); } configure_post_ops_arguments(instance, args); @@ -200,7 +202,9 @@ protected: event = stream.create_user_event(false); } - _prim.execute(stream.get_onednn_stream(), _args[net_id]); + if 
(!instance.can_be_optimized()) { + _prim.execute(stream.get_onednn_stream(), _args[net_id]); + } if (profiling) { stream.finish(); diff --git a/src/plugins/intel_gpu/src/graph/impls/onednn/utils.cpp b/src/plugins/intel_gpu/src/graph/impls/onednn/utils.cpp index 72e2effc0e1..57beae44e24 100644 --- a/src/plugins/intel_gpu/src/graph/impls/onednn/utils.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/onednn/utils.cpp @@ -108,6 +108,37 @@ void combine_bf_with_first_spatial_dim(cldnn::layout& l) { l.size.spatial[last_spatial_dim_idx] = 1; } +int64_t get_offset(dnnl::memory::desc desc) { + int64_t offset = 0; + int32_t padded_idx = -1; + for (int32_t i = 0; i < DNNL_MAX_NDIMS; ++i) { + if (desc.data.padded_offsets[i] > 0) { + padded_idx = i; + break; + } + } + if (padded_idx > -1) { + if (padded_idx != 1) + throw std::runtime_error(std::string("onednn only support feature padding. Unsupported padded_idx: ") + std::to_string(padded_idx)); + offset = desc.data.padded_offsets[padded_idx]; + for (int32_t i = padded_idx + 1; i < desc.data.ndims; ++i) { + offset *= desc.data.padded_dims[i]; + } + } + switch (desc.data.data_type) { + case dnnl_data_type_t::dnnl_s8: + case dnnl_data_type_t::dnnl_u8: + return offset; + case dnnl_data_type_t::dnnl_f16: + case dnnl_data_type_t::dnnl_bf16: + return (offset * 2); + case dnnl_data_type_t::dnnl_f32: + case dnnl_data_type_t::dnnl_s32: + return (offset * 4); + default: throw std::runtime_error(std::string("Unsupported offset for dnnl_data_type_t ") + dnnl_dt2str(desc.data.data_type)); + } +} + dnnl::memory::desc layout_to_memory_desc(cldnn::layout l, dnnl::memory::format_tag target_fmt, bool flatten) { dnnl::memory::dims dims; dnnl::memory::dims padded_dims; diff --git a/src/plugins/intel_gpu/src/graph/impls/onednn/utils.hpp b/src/plugins/intel_gpu/src/graph/impls/onednn/utils.hpp index c89da4d8a34..1af7bb98881 100644 --- a/src/plugins/intel_gpu/src/graph/impls/onednn/utils.hpp +++ b/src/plugins/intel_gpu/src/graph/impls/onednn/utils.hpp 
@@ -34,6 +34,8 @@ dnnl::algorithm convert_activation_func(cldnn::activation_func func); // onednn -> cldnn cldnn::format convert_format(dnnl::memory::format_tag fmt, bool is_grouped = false); +int64_t get_offset(dnnl::memory::desc desc); + // If the values in the tensor are identical, make it as per-tensor value template void make_per_tensor_if_possible(cldnn::data_node& node); diff --git a/src/plugins/intel_gpu/src/graph/layout_optimizer.cpp b/src/plugins/intel_gpu/src/graph/layout_optimizer.cpp index 17feb992489..10a85f71a6d 100644 --- a/src/plugins/intel_gpu/src/graph/layout_optimizer.cpp +++ b/src/plugins/intel_gpu/src/graph/layout_optimizer.cpp @@ -1178,13 +1178,39 @@ bool layout_optimizer::are_data_types_suitable_for_onednn(program_node& node) { } bool layout_optimizer::are_layouts_suitable_for_onednn(program_node& node) { - auto in_layout = node.get_dependencies().front()->get_output_layout(); - auto out_layout = node.get_output_layout(); + auto in_padding = node.get_dependencies().front()->get_output_layout().data_padding; + auto out_padding = node.get_output_layout().data_padding; // Check if padding exists - if (node.get_preferred_impl_type() == impl_types::onednn && (in_layout.data_padding || out_layout.data_padding)) - return false; - else - return true; + if (node.get_preferred_impl_type() == impl_types::onednn && (in_padding || out_padding)) { + bool no_spatial_padding = true; + for (size_t i = 0; i < in_padding.lower_size().spatial.size(); ++i) { + no_spatial_padding &= (in_padding.lower_size().spatial[i] == 0); + } + for (size_t i = 0; i < in_padding.upper_size().spatial.size(); ++i) { + no_spatial_padding &= (in_padding.upper_size().spatial[i] == 0); + } + for (size_t i = 0; i < out_padding.lower_size().spatial.size(); ++i) { + no_spatial_padding &= (out_padding.lower_size().spatial[i] == 0); + } + for (size_t i = 0; i < out_padding.upper_size().spatial.size(); ++i) { + no_spatial_padding &= (out_padding.upper_size().spatial[i] == 0); + } + bool 
no_batch_padding = true; + for (size_t i = 0; i < in_padding.lower_size().batch.size(); ++i) { + no_batch_padding &= (in_padding.lower_size().batch[i] == 0); + } + for (size_t i = 0; i < in_padding.upper_size().batch.size(); ++i) { + no_batch_padding &= (in_padding.upper_size().batch[i] == 0); + } + for (size_t i = 0; i < out_padding.lower_size().batch.size(); ++i) { + no_batch_padding &= (out_padding.lower_size().batch[i] == 0); + } + for (size_t i = 0; i < out_padding.upper_size().batch.size(); ++i) { + no_batch_padding &= (out_padding.upper_size().batch[i] == 0); + } + return (no_spatial_padding && no_batch_padding); + } + return true; } impl_types layout_optimizer::get_preferred_impl_type(program_node& node, format preferred_format) { diff --git a/src/plugins/intel_gpu/src/graph/network.cpp b/src/plugins/intel_gpu/src/graph/network.cpp index 22438b529a6..c48c18fcff6 100644 --- a/src/plugins/intel_gpu/src/graph/network.cpp +++ b/src/plugins/intel_gpu/src/graph/network.cpp @@ -36,6 +36,7 @@ #include #include #include +#include #ifdef GPU_DEBUG_CONFIG #include @@ -835,11 +836,23 @@ void network::allocate_primitive_instance(program_node const& node) { return; auto inst = node.type()->create_instance(*this, node); - for (auto& dep : node.get_dependencies()) { - if (dep->is_type() || dep->is_type() || dep->can_be_optimized()) { - inst->set_mutable_input(true); - break; + + std::function is_mutable_input = [&is_mutable_input](const program_node& node) { + for (auto& dep : node.get_dependencies()) { + if (dep->is_type() || dep->is_type()) { + return true; + } + if (dep->can_be_optimized()) { + if (is_mutable_input(*dep)) { + return true; + } + } } + return false; + }; + + if (is_mutable_input(node)) { + inst->set_mutable_input(true); } _primitives[node.id()] = inst; diff --git a/src/plugins/intel_gpu/src/runtime/ocl/ocl_memory.cpp b/src/plugins/intel_gpu/src/runtime/ocl/ocl_memory.cpp index 07e89ad5cfc..fb9adf2c6d5 100644 --- 
a/src/plugins/intel_gpu/src/runtime/ocl/ocl_memory.cpp +++ b/src/plugins/intel_gpu/src/runtime/ocl/ocl_memory.cpp @@ -111,7 +111,7 @@ event::ptr gpu_buffer::copy_from(stream& stream, const void* host_ptr) { } #ifdef ENABLE_ONEDNN_FOR_GPU -dnnl::memory gpu_buffer::get_onednn_memory(dnnl::memory::desc desc) { +dnnl::memory gpu_buffer::get_onednn_memory(dnnl::memory::desc desc, int64_t offset) { auto onednn_engine = _engine->get_onednn_engine(); dnnl::memory dnnl_mem(desc, onednn_engine, DNNL_MEMORY_NONE); dnnl::ocl_interop::set_mem_object(dnnl_mem, _buffer.get()); @@ -396,9 +396,10 @@ event::ptr gpu_usm::copy_from(stream& stream, const void* host_ptr) { } #ifdef ENABLE_ONEDNN_FOR_GPU -dnnl::memory gpu_usm::get_onednn_memory(dnnl::memory::desc desc) { +dnnl::memory gpu_usm::get_onednn_memory(dnnl::memory::desc desc, int64_t offset) { auto onednn_engine = _engine->get_onednn_engine(); - dnnl::memory dnnl_mem = dnnl::ocl_interop::make_memory(desc, onednn_engine, dnnl::ocl_interop::memory_kind::usm, _buffer.get()); + dnnl::memory dnnl_mem = dnnl::ocl_interop::make_memory(desc, onednn_engine, dnnl::ocl_interop::memory_kind::usm, + reinterpret_cast(_buffer.get()) + offset); return dnnl_mem; } #endif diff --git a/src/plugins/intel_gpu/src/runtime/ocl/ocl_memory.hpp b/src/plugins/intel_gpu/src/runtime/ocl/ocl_memory.hpp index d37bd12d27b..f38a30c0640 100644 --- a/src/plugins/intel_gpu/src/runtime/ocl/ocl_memory.hpp +++ b/src/plugins/intel_gpu/src/runtime/ocl/ocl_memory.hpp @@ -43,7 +43,7 @@ struct gpu_buffer : public lockable_gpu_mem, public memory { event::ptr copy_from(stream& stream, const memory& other) override; event::ptr copy_from(stream& stream, const void* host_ptr) override; #ifdef ENABLE_ONEDNN_FOR_GPU - dnnl::memory get_onednn_memory(dnnl::memory::desc /* desc */) override; + dnnl::memory get_onednn_memory(dnnl::memory::desc /* desc */, int64_t offset = 0) override; #endif protected: @@ -116,7 +116,7 @@ struct gpu_usm : public lockable_gpu_mem, public memory { 
event::ptr copy_from(stream& stream, const void* host_ptr) override; #ifdef ENABLE_ONEDNN_FOR_GPU - dnnl::memory get_onednn_memory(dnnl::memory::desc desc) override; + dnnl::memory get_onednn_memory(dnnl::memory::desc /* desc */, int64_t offset = 0) override; #endif protected: From 2a476f6906df36de8805ad1b1dd307b0021e73d8 Mon Sep 17 00:00:00 2001 From: hyunback kim Date: Fri, 7 Jan 2022 18:38:07 +0900 Subject: [PATCH 47/78] [GPU] Enable unet2d enable on DG2 (#9522) * [GPU] Enable unet2d enable on DG2 Add to support is_os_yx_isa2_osa8_isv8_osv2 format, which is used in weight reorder. Signed-off-by: hyunback --- .../include/intel_gpu/runtime/tensor.hpp | 2 + .../graph/impls/onednn/convolution_onednn.cpp | 2 +- .../impls/onednn/deconvolution_onednn.cpp | 2 +- .../impls/onednn/fully_connected_onednn.cpp | 2 +- .../src/graph/impls/onednn/utils.cpp | 65 +++++++++++++------ .../src/graph/impls/onednn/utils.hpp | 5 +- .../src/graph/kernel_selector_helper.cpp | 4 ++ .../kernel_selector/common/tensor_type.cpp | 2 + .../src/kernel_selector/common/tensor_type.h | 1 + .../include/batch_headers/fetch_weights.cl | 16 +++++ .../core/cl_kernels/reorder_weights.cl | 2 + .../core/kernel_selector_common.cpp | 1 + 12 files changed, 76 insertions(+), 28 deletions(-) diff --git a/src/plugins/intel_gpu/include/intel_gpu/runtime/tensor.hpp b/src/plugins/intel_gpu/include/intel_gpu/runtime/tensor.hpp index aeea86c190e..2ef73d13810 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/runtime/tensor.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/runtime/tensor.hpp @@ -162,6 +162,7 @@ struct format { os_is_yx_osa2_isa8_osv8_isv2, os_is_yx_osa2_isa8_osv16_isv2, os_is_yx_osa2_isa8_osv16_isv4, + is_os_yx_isa2_osa8_isv8_osv2, is_o_yx_isv32, ///< format for weights for 1x1 MMAD convolutions is_o32_yx_isv32_swizzled_by_4, ///< format for weights for 1x1 MMAD convolutions os_is_y_x8_osv8_isv4, ///< format for weights for 1x1 MMAD convolutions @@ -301,6 +302,7 @@ struct format { { 
os_is_zyx_isa8_osv16_isv4, { 1, 1, 3, 0, "oizyx", "oixyz", {{1, 8}, {0, 16}, {1, 4}}}}, { os_is_yx_osa4_isa8_osv8_isv4_swizzled_by_4, { 1, 1, 2, 0, "oiyx", "oixy?", {{0, 32}, {1, 32}}}}, { os_is_zyx_osa4_isa8_osv8_isv4_swizzled_by_4, { 1, 1, 3, 0, "oizyx", "oixyz", {{0, 32}, {1, 32}}}}, + { is_os_yx_isa2_osa8_isv8_osv2, { 1, 1, 2, 0, "ioyx", "ioxy?", {{1, 16}, {0, 16}}}}, { is_o_yx_isv32, { 1, 1, 2, 0, "oyxi", "oixy?", {{1, 32}}}}, { is_o32_yx_isv32_swizzled_by_4, { 1, 1, 2, 0, "oyxi", "oixy?", {}}}, { os_is_y_x8_osv8_isv4, { 1, 1, 2, 0, "oyxi", "oixy?", {}}}, diff --git a/src/plugins/intel_gpu/src/graph/impls/onednn/convolution_onednn.cpp b/src/plugins/intel_gpu/src/graph/impls/onednn/convolution_onednn.cpp index 54e0328fdc9..1fccd2beba0 100644 --- a/src/plugins/intel_gpu/src/graph/impls/onednn/convolution_onednn.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/onednn/convolution_onednn.cpp @@ -117,7 +117,7 @@ protected: auto cldnn_prim = arg.get_primitive(); auto weights_layout = arg.get_dependency(1).get_output_layout(); auto grouped_weights = format::is_grouped(weights_layout.format) || arg.get_primitive()->grouped_weights_shape; - cldnn::format out_fmt = onednn::convert_format(onednn::get_format_by_desc(pd.weights_desc(0)), grouped_weights); + cldnn::format out_fmt = onednn::find_format(pd.weights_desc(0), grouped_weights); kernel_selector::WeightsLayout reqLayout = to_weights_layout(out_fmt, cldnn_prim->grouped_weights_shape); set_params(arg, r_params); diff --git a/src/plugins/intel_gpu/src/graph/impls/onednn/deconvolution_onednn.cpp b/src/plugins/intel_gpu/src/graph/impls/onednn/deconvolution_onednn.cpp index 6b65c181acd..e81c1124a73 100644 --- a/src/plugins/intel_gpu/src/graph/impls/onednn/deconvolution_onednn.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/onednn/deconvolution_onednn.cpp @@ -75,7 +75,7 @@ protected: auto cldnn_prim = arg.get_primitive(); auto weights_layout = arg.get_dependency(1).get_output_layout(); auto grouped_weights = 
format::is_grouped(weights_layout.format) || arg.get_primitive()->grouped_weights_shape; - cldnn::format out_fmt = onednn::convert_format(onednn::get_format_by_desc(pd.weights_desc(0)), grouped_weights); + cldnn::format out_fmt = onednn::find_format(pd.weights_desc(0), grouped_weights); kernel_selector::WeightsLayout reqLayout = to_weights_layout(out_fmt, cldnn_prim->grouped_weights_shape); set_params(arg, r_params); diff --git a/src/plugins/intel_gpu/src/graph/impls/onednn/fully_connected_onednn.cpp b/src/plugins/intel_gpu/src/graph/impls/onednn/fully_connected_onednn.cpp index 81a0fcf32a3..5ef1616c07b 100644 --- a/src/plugins/intel_gpu/src/graph/impls/onednn/fully_connected_onednn.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/onednn/fully_connected_onednn.cpp @@ -64,7 +64,7 @@ protected: auto cldnn_prim = arg.get_primitive(); auto weights_layout = arg.get_dependency(1).get_output_layout(); - cldnn::format out_fmt = onednn::convert_format(onednn::get_format_by_desc(pd.weights_desc(0))); + cldnn::format out_fmt = onednn::find_format(pd.weights_desc(0)); kernel_selector::WeightsLayout req_layout = to_weights_layout(out_fmt, false); // set engine info & forcing diff --git a/src/plugins/intel_gpu/src/graph/impls/onednn/utils.cpp b/src/plugins/intel_gpu/src/graph/impls/onednn/utils.cpp index 57beae44e24..f9b2407b68a 100644 --- a/src/plugins/intel_gpu/src/graph/impls/onednn/utils.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/onednn/utils.cpp @@ -222,7 +222,7 @@ static bool isSame(dnnl::memory::desc desc, dnnl::memory::format_tag fmt) { return true; } -dnnl::memory::format_tag get_format_by_desc(dnnl::memory::desc desc) { +static dnnl::memory::format_tag get_format_by_desc(dnnl::memory::desc desc) { // TODO [OneDNN]: Previously it was a field of tdesc, but now the brute // force search here. Please avoid of using this method. 
const auto ndims = desc.dims().size(); @@ -239,25 +239,8 @@ dnnl::memory::format_tag get_format_by_desc(dnnl::memory::desc desc) { return dnnl::memory::format_tag::undef; } -dnnl::algorithm convert_activation_func(cldnn::activation_func func) { - switch (func) { - case cldnn::activation_func::relu: return dnnl::algorithm::eltwise_relu; - case cldnn::activation_func::relu_negative_slope: return dnnl::algorithm::eltwise_relu; - case cldnn::activation_func::gelu: return dnnl::algorithm::eltwise_gelu; - case cldnn::activation_func::elu: return dnnl::algorithm::eltwise_elu; - case cldnn::activation_func::mish: return dnnl::algorithm::eltwise_mish; - case cldnn::activation_func::swish: return dnnl::algorithm::eltwise_swish; - case cldnn::activation_func::hswish: return dnnl::algorithm::eltwise_hardswish; - case cldnn::activation_func::abs: return dnnl::algorithm::eltwise_abs; - case cldnn::activation_func::exp: return dnnl::algorithm::eltwise_exp; - case cldnn::activation_func::logistic: return dnnl::algorithm::eltwise_logistic; - case cldnn::activation_func::clamp: return dnnl::algorithm::eltwise_clip; - case cldnn::activation_func::hyperbolic_tan: return dnnl::algorithm::eltwise_tanh; - default: throw std::runtime_error("Unsupported activation func for onednn primitive " + std::to_string(static_cast(func))); - } -} - -cldnn::format convert_format(dnnl::memory::format_tag fmt, bool is_grouped) { +// onednn -> cldnn +static cldnn::format convert_format(dnnl::memory::format_tag fmt, bool is_grouped) { if (is_grouped) { switch (fmt) { case dnnl::memory::format_tag::abcde: return cldnn::format::goiyx; @@ -278,7 +261,7 @@ cldnn::format convert_format(dnnl::memory::format_tag fmt, bool is_grouped) { switch (fmt) { case dnnl::memory::format_tag::ab: return cldnn::format::oiyx; case dnnl::memory::format_tag::abcd: return cldnn::format::oiyx; - case dnnl::memory::format_tag::bacd: return cldnn::format::oiyx; + case dnnl::memory::format_tag::bacd: return cldnn::format::ioyx; case 
dnnl::memory::format_tag::BAcd16b16a: return cldnn::format::is_os_yx_isv16_osv16; case dnnl::memory::format_tag::ABcd16b16a: return cldnn::format::os_is_yx_isv16_osv16; case dnnl::memory::format_tag::abcde: return cldnn::format::oizyx; @@ -299,6 +282,46 @@ cldnn::format convert_format(dnnl::memory::format_tag fmt, bool is_grouped) { } } +cldnn::format find_format(dnnl::memory::desc desc, bool is_grouped) { + auto onednn_desc = get_format_by_desc(desc); + + if (onednn_desc != dnnl::memory::format_tag::undef) { + return convert_format(onednn_desc, is_grouped); + } else { + if (is_grouped) { + throw std::runtime_error(std::string("Unsupported grouped onednn dnnl::memory::desc find_format")); + } else { + auto blk = desc.data.format_desc.blocking; + + if (desc.data.ndims == 4 && desc.data.format_desc.blocking.inner_nblks == 4 + && blk.inner_blks[0] == 2 && blk.inner_blks[1] == 8 && blk.inner_blks[2] == 8 && blk.inner_blks[3] == 2 + && blk.inner_idxs[0] == 1 && blk.inner_idxs[1] == 0 && blk.inner_idxs[2] == 1 && blk.inner_idxs[3] == 0) { + return cldnn::format::is_os_yx_isa2_osa8_isv8_osv2; + } else { + throw std::runtime_error(std::string("Unsupported onednn dnnl::memory::desc find_format")); + } + } + } +} + +dnnl::algorithm convert_activation_func(cldnn::activation_func func) { + switch (func) { + case cldnn::activation_func::relu: return dnnl::algorithm::eltwise_relu; + case cldnn::activation_func::relu_negative_slope: return dnnl::algorithm::eltwise_relu; + case cldnn::activation_func::gelu: return dnnl::algorithm::eltwise_gelu; + case cldnn::activation_func::elu: return dnnl::algorithm::eltwise_elu; + case cldnn::activation_func::mish: return dnnl::algorithm::eltwise_mish; + case cldnn::activation_func::swish: return dnnl::algorithm::eltwise_swish; + case cldnn::activation_func::hswish: return dnnl::algorithm::eltwise_hardswish; + case cldnn::activation_func::abs: return dnnl::algorithm::eltwise_abs; + case cldnn::activation_func::exp: return 
dnnl::algorithm::eltwise_exp; + case cldnn::activation_func::logistic: return dnnl::algorithm::eltwise_logistic; + case cldnn::activation_func::clamp: return dnnl::algorithm::eltwise_clip; + case cldnn::activation_func::hyperbolic_tan: return dnnl::algorithm::eltwise_tanh; + default: throw std::runtime_error("Unsupported activation func for onednn primitive " + std::to_string(static_cast(func))); + } +} + template void make_per_tensor_if_possible(cldnn::data_node& node) { auto ptr = node.get_attached_memory_ptr(); diff --git a/src/plugins/intel_gpu/src/graph/impls/onednn/utils.hpp b/src/plugins/intel_gpu/src/graph/impls/onednn/utils.hpp index 1af7bb98881..5534b376bc7 100644 --- a/src/plugins/intel_gpu/src/graph/impls/onednn/utils.hpp +++ b/src/plugins/intel_gpu/src/graph/impls/onednn/utils.hpp @@ -28,11 +28,8 @@ dnnl::memory::dims flatten_tensor(cldnn::tensor t); dnnl::memory::data_type convert_data_type(cldnn::data_types dt); dnnl::memory::format_tag convert_data_format(cldnn::format fmt); dnnl::memory::desc layout_to_memory_desc(cldnn::layout l, dnnl::memory::format_tag target_fmt = dnnl::memory::format_tag::undef, bool flatten = false); -dnnl::memory::format_tag get_format_by_desc(dnnl::memory::desc desc); dnnl::algorithm convert_activation_func(cldnn::activation_func func); - -// onednn -> cldnn -cldnn::format convert_format(dnnl::memory::format_tag fmt, bool is_grouped = false); +cldnn::format find_format(dnnl::memory::desc desc, bool is_grouped = false); int64_t get_offset(dnnl::memory::desc desc); diff --git a/src/plugins/intel_gpu/src/graph/kernel_selector_helper.cpp b/src/plugins/intel_gpu/src/graph/kernel_selector_helper.cpp index 540e84a81ea..afb3db8ce6f 100644 --- a/src/plugins/intel_gpu/src/graph/kernel_selector_helper.cpp +++ b/src/plugins/intel_gpu/src/graph/kernel_selector_helper.cpp @@ -389,6 +389,8 @@ kernel_selector::weights_layout to_weights_layout(format f, bool is_grouped) { return kernel_selector::weights_layout::g_os_zyx_is_osv32_isv16; case 
format::g_os_zyx_is_osv32_isv32: return kernel_selector::weights_layout::g_os_zyx_is_osv32_isv32; + case format::is_os_yx_isa2_osa8_isv8_osv2: + return kernel_selector::weights_layout::is_os_yx_isa2_osa8_isv8_osv2; default: throw std::invalid_argument("Unable to convert tensor layout " + fmt_to_str(f) + " to weights layout"); } @@ -506,6 +508,8 @@ cldnn::format::type from_weights_layout(kernel_selector::weights_layout l) { return cldnn::format::is_os_zyx_isv16_osv16; case kernel_selector::weights_layout::is_os_yx_isv16_osv16: return cldnn::format::is_os_yx_isv16_osv16; + case kernel_selector::weights_layout::is_os_yx_isa2_osa8_isv8_osv2: + return cldnn::format::is_os_yx_isa2_osa8_isv8_osv2; case kernel_selector::weights_layout::os_is_yx_osv8_isv2: return cldnn::format::os_is_yx_osv8_isv2; case kernel_selector::weights_layout::os_is_yx_osv8_isv4: diff --git a/src/plugins/intel_gpu/src/kernel_selector/common/tensor_type.cpp b/src/plugins/intel_gpu/src/kernel_selector/common/tensor_type.cpp index ce7ec16ad4e..e4d6be68a7c 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/common/tensor_type.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/common/tensor_type.cpp @@ -117,6 +117,7 @@ WeightsTensor::WeightsChannelArray WeightsTensor::weightsChannelArray {{ { WeightsLayout::os_is_yx_isv16_osv16, { 0, 1, -1, 2, 3, -1 } }, { WeightsLayout::is_os_zyx_isv16_osv16, { 0, 1, 2, 4, 3, -1 } }, { WeightsLayout::is_os_yx_isv16_osv16, { 0, 1, -1, 3, 2, -1 } }, + { WeightsLayout::is_os_yx_isa2_osa8_isv8_osv2, { 0, 1, -1, 3, 2, -1 } }, { WeightsLayout::os_is_osv32_isv32_swizzled_by_4, { -1, -1, -1, 0, 1, -1 } }, { WeightsLayout::os_is_zyx_isv8_osv16_isv2, { 0, 1, 2, 3, 4, -1 } }, { WeightsLayout::os_is_yx_isv8_osv16_isv2, { 0, 1, -1, 2, 3, -1 } }, @@ -534,6 +535,7 @@ NDims WeightsTensor::GetSimpleDims(const std::vector& d, WeightsLayout l newDims[3] = RoundUp(newDims[3], 32); break; case os_is_yx_osa2_isa8_osv8_isv2: + case is_os_yx_isa2_osa8_isv8_osv2: newDims[2] = 
RoundUp(newDims[2], 16); newDims[3] = RoundUp(newDims[3], 16); break; diff --git a/src/plugins/intel_gpu/src/kernel_selector/common/tensor_type.h b/src/plugins/intel_gpu/src/kernel_selector/common/tensor_type.h index fb57e4592dc..8e47e885356 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/common/tensor_type.h +++ b/src/plugins/intel_gpu/src/kernel_selector/common/tensor_type.h @@ -116,6 +116,7 @@ enum WeightsLayout { os_is_yx_osa2_isa8_osv8_isv2, os_is_yx_osa2_isa8_osv16_isv4, os_is_yx_osa2_isa8_osv16_isv2, + is_os_yx_isa2_osa8_isv8_osv2, g_os_is_yx_osa2_isa8_osv16_isv4, g_os_is_yx_osa2_isa8_osv16_isv2, os_is_yx_osa4_isa8_osv8_isv4_swizzled_by_4, // for MMAD convolution swizzled from ofm 0..7 to 0,4,8,12,16,20,24,28, diff --git a/src/plugins/intel_gpu/src/kernel_selector/core/cl_kernels/include/batch_headers/fetch_weights.cl b/src/plugins/intel_gpu/src/kernel_selector/core/cl_kernels/include/batch_headers/fetch_weights.cl index 03a3ad913b8..6d1f0e89525 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/core/cl_kernels/include/batch_headers/fetch_weights.cl +++ b/src/plugins/intel_gpu/src/kernel_selector/core/cl_kernels/include/batch_headers/fetch_weights.cl @@ -742,6 +742,12 @@ inline uint get_g_os_is_yx_osa2_isa8_osv16_isv2(uint g, uint o, uint i, uint y, return idx; } +inline uint get_g_is_os_yx_isa2_osa8_isv8_osv2(uint g, uint o, uint i, uint z, uint y, uint x, + uint size_x, uint size_y, uint size_z, uint size_ifm, uint size_ofm, uint offset) +{ + return get_g_os_is_yx_osa2_isa8_osv8_isv2(g, i, o, z, y, x, size_x, size_y, size_z, size_ofm, size_ifm, offset); +} + #define GET_FILTER_OS_IS_YX_OSA4_ISA8_OSV8_ISV4_INDEX(prefix, o, i, y, x) \ get_g_os_is_yx_osa4_isa8_osv8_isv4( \ 0, o, i, 0, y, x, \ @@ -895,6 +901,16 @@ inline uint get_g_os_is_yx_osa2_isa8_osv16_isv2(uint g, uint o, uint i, uint y, CAT(prefix, _OFM_NUM), \ CAT(prefix, _OFFSET)) +#define GET_FILTER_IS_OS_YX_ISA2_OSA8_ISV8_OSV2_INDEX(prefix, o, i, y, x) \ + 
get_g_is_os_yx_isa2_osa8_isv8_osv2( \ + 0, o, i, 0, y, x, \ + CAT(prefix, _SIZE_X), \ + CAT(prefix, _SIZE_Y), \ + 1, \ + CAT(prefix, _IFM_NUM), \ + CAT(prefix, _OFM_NUM), \ + CAT(prefix, _OFFSET)) + inline uint get_is_o_yx_isv32_index(uint o, uint i, uint y, uint x, uint i_size, uint o_size, uint x_size, uint y_size) { diff --git a/src/plugins/intel_gpu/src/kernel_selector/core/cl_kernels/reorder_weights.cl b/src/plugins/intel_gpu/src/kernel_selector/core/cl_kernels/reorder_weights.cl index 57c06239a7c..11d587c9fbd 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/core/cl_kernels/reorder_weights.cl +++ b/src/plugins/intel_gpu/src/kernel_selector/core/cl_kernels/reorder_weights.cl @@ -203,6 +203,8 @@ inline uint FUNC(get_output_index)(uint g, uint o, uint i, uint z, uint y, uint return GET_FILTER_OS_IS_YX_ISA8_OSV8_ISV4_SWIZZLED_BY_4_INDEX(OUTPUT, g, o, i, y, x); #elif defined OUTPUT_LAYOUT_OS_IS_YX_OSA2_ISA8_OSV8_ISV2 return GET_FILTER_OS_IS_YX_OSA2_ISA8_OSV8_ISV2_INDEX(OUTPUT, o, i, y, x); +#elif defined OUTPUT_LAYOUT_IS_OS_YX_ISA2_OSA8_ISV8_OSV2 + return GET_FILTER_IS_OS_YX_ISA2_OSA8_ISV8_OSV2_INDEX(OUTPUT, o, i, y, x); #elif defined OUTPUT_LAYOUT_OS_IS_YX_OSA4_ISA8_OSV8_ISV2 return GET_FILTER_OS_IS_YX_OSA4_ISA8_OSV8_ISV2_INDEX(OUTPUT, o, i, y, x); #elif defined OUTPUT_LAYOUT_OS_IS_ZYX_OSA4_ISA8_OSV8_ISV2 diff --git a/src/plugins/intel_gpu/src/kernel_selector/core/kernel_selector_common.cpp b/src/plugins/intel_gpu/src/kernel_selector/core/kernel_selector_common.cpp index 75349b31f3e..24b50546b06 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/core/kernel_selector_common.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/core/kernel_selector_common.cpp @@ -395,6 +395,7 @@ std::string toString(WeightsLayout layout) { case WeightsLayout::os_is_yx_osa2_isa8_osv16_isv2: return "OS_IS_YX_OSA2_ISA8_OSV16_ISV2"; case WeightsLayout::g_os_is_yx_osa2_isa8_osv16_isv2: return "G_OS_IS_YX_OSA2_ISA8_OSV16_ISV2"; case WeightsLayout::os_is_yx_osa2_isa8_osv8_isv2: 
return "OS_IS_YX_OSA2_ISA8_OSV8_ISV2"; + case WeightsLayout::is_os_yx_isa2_osa8_isv8_osv2: return "IS_OS_YX_ISA2_OSA8_ISV8_OSV2"; case WeightsLayout::g_os_is_yx_isv16_osv16: return "G_OS_IS_YX_ISV16_OSV16"; case WeightsLayout::g_os_is_yx_osv16_isv4: return "G_OS_IS_YX_OSV16_ISV4"; case WeightsLayout::g_os_is_zyx_osv16_isv16: return "G_OS_IS_ZYX_OSV16_ISV16"; From 9e41208791aecbf12a000f5b739683631e246551 Mon Sep 17 00:00:00 2001 From: Andrei Molotkov Date: Mon, 10 Jan 2022 12:07:12 +0300 Subject: [PATCH 48/78] [GPU] QueryNetwork method correction to work with dynamic shapes (#9462) --- src/inference/src/ie_core.cpp | 21 +++- src/plugins/intel_gpu/src/plugin/plugin.cpp | 124 +++----------------- 2 files changed, 36 insertions(+), 109 deletions(-) diff --git a/src/inference/src/ie_core.cpp b/src/inference/src/ie_core.cpp index 29c543f97c7..86f3fcb89dc 100644 --- a/src/inference/src/ie_core.cpp +++ b/src/inference/src/ie_core.cpp @@ -699,10 +699,25 @@ public: opNames.emplace(op->get_friendly_name()); for (const auto& op : func->get_ops()) { - if (opNames.find(op->get_friendly_name()) == opNames.end() || - (!res.supportedLayersMap.count(op->get_friendly_name()) && - std::dynamic_pointer_cast(op))) + if (opNames.find(op->get_friendly_name()) == opNames.end()) { res.supportedLayersMap[op->get_friendly_name()] = defDevice; + } + } + + for (const auto& op : func->get_ops()) { + if (!res.supportedLayersMap.count(op->get_friendly_name()) && + std::dynamic_pointer_cast(op)) { + bool are_all_users_supported = true; + for (const auto& user : op->output(0).get_target_inputs()) { + if (!res.supportedLayersMap.count(user.get_node()->get_friendly_name())) { + are_all_users_supported = false; + break; + } + } + if (are_all_users_supported) { + res.supportedLayersMap[op->get_friendly_name()] = defDevice; + } + } } return res; } diff --git a/src/plugins/intel_gpu/src/plugin/plugin.cpp b/src/plugins/intel_gpu/src/plugin/plugin.cpp index f402d362c27..9adca79a60a 100644 --- 
a/src/plugins/intel_gpu/src/plugin/plugin.cpp +++ b/src/plugins/intel_gpu/src/plugin/plugin.cpp @@ -380,44 +380,18 @@ QueryNetworkResult Plugin::QueryNetwork(const CNNNetwork& network, std::unordered_set supported; std::unordered_set unsupported; - std::unordered_set splitNames; - std::unordered_set concatNames; std::unordered_set constantsNames; - std::unordered_set depLayerNames; - - std::vector> splits; - std::vector> concats; std::vector> constants; - std::vector> nextLayerDependent; auto layerIsSupported = [&](std::shared_ptr node) { if (node->is_dynamic()) { return false; } - if (ngraph::is_type(node) || - ngraph::is_type(node) || + if (ngraph::is_type(node) || ngraph::is_type(node) || ngraph::is_type(node)) { return false; } - if (ngraph::is_type(node)) { - splitNames.emplace(node->get_friendly_name()); - splits.push_back(node); - return false; - } - if (ngraph::is_type(node)) { - concatNames.emplace(node->get_friendly_name()); - concats.push_back(node); - return false; - } - if (ngraph::is_type(node) || - ngraph::is_type(node) || - ngraph::is_type(node) || - ngraph::is_type(node)) { - depLayerNames.emplace(node->get_friendly_name()); - nextLayerDependent.push_back(node); - return false; - } if (ngraph::is_type(node)) { constantsNames.emplace(node->get_friendly_name()); constants.push_back(node); @@ -431,10 +405,18 @@ QueryNetworkResult Plugin::QueryNetwork(const CNNNetwork& network, // Get ops after transformations and check if it's supported // Transformations might lead to the situation when single node is merged to multiple operations, // so we mark original op as supported only if all nodes that it was merged into are supported + bool wasNodeAlreadyChecked = false; + bool isSupported = false; for (auto&& op : ops) { + wasNodeAlreadyChecked = false; + isSupported = false; for (auto&& fusedLayerName : ngraph::getFusedNamesVector(op)) { if (InferenceEngine::details::contains(originalOpNames, fusedLayerName)) { - if (layerIsSupported(op)) { + if 
(!wasNodeAlreadyChecked) { + isSupported = layerIsSupported(op); + wasNodeAlreadyChecked = true; + } + if (isSupported) { supported.emplace(fusedLayerName); } else { unsupported.emplace(fusedLayerName); @@ -450,77 +432,7 @@ QueryNetworkResult Plugin::QueryNetwork(const CNNNetwork& network, } unsupported.clear(); - // Check set of heuristics to produce more efficient hetero sub-graph. Note: checks order is important. - // 1. Split is marked as supported when all output ops can be offloaded to GPU - for (const auto & op : splits) { - bool is_supported = true; - for (size_t i = 0; i < op->get_output_size(); i++) { - auto outTensors = op->get_output_target_inputs(i); - for (auto& t : outTensors) { - auto output = t.get_node(); - const auto& name = output->get_friendly_name(); - if (!InferenceEngine::details::contains(supported, name) && - !InferenceEngine::details::contains(depLayerNames, name) && - !InferenceEngine::details::contains(concatNames, name) && - !InferenceEngine::details::contains(splitNames, name)) { - is_supported = false; - break; - } - } - } - if (is_supported) { - supported.emplace(op->get_friendly_name()); - } - } - - // 2. Concat is marked as supported when all inputs can be offloaded to GPU - for (const auto& op : concats) { - bool is_supported = true; - for (size_t i = 0; i < op->get_input_size(); i++) { - auto input = op->get_input_node_shared_ptr(i); - const auto& name = input->get_friendly_name(); - if (!InferenceEngine::details::contains(supported, name) && - !InferenceEngine::details::contains(depLayerNames, name) && - !InferenceEngine::details::contains(concatNames, name)) { - is_supported = false; - break; - } - } - if (is_supported) { - supported.emplace(op->get_friendly_name()); - } - } - - // 3. 
Some layers are marked as supported when all inputs and outputs can be offloaded to GPU - for (const auto& op : nextLayerDependent) { - bool is_supported = true; - // both inputs and output should be GPU to remain on GPU - for (size_t i = 0; i < op->get_input_size(); i++) { - auto input = op->get_input_node_shared_ptr(i); - const auto& name = input->get_friendly_name(); - // All inputs must be supported or be a constant - if (!InferenceEngine::details::contains(supported, name) && !InferenceEngine::details::contains(constantsNames, name)) { - is_supported = false; - break; - } - } - for (size_t i = 0; i < op->get_output_size(); i++) { - auto outTensors = op->get_output_target_inputs(i); - for (auto& t : outTensors) { - auto output = t.get_node(); - const auto& name = output->get_friendly_name(); - if (!InferenceEngine::details::contains(supported, name)) { - is_supported = false; - break; - } - } - } - if (is_supported) { - supported.emplace(op->get_friendly_name()); - } - } - - // 4. Constants are marked as supported when all outputs can be offloaded to GPU + // 1. 
Constants are marked as supported when all outputs can be offloaded to GPU for (const auto& op : constants) { bool is_supported = true; for (size_t i = 0; i < op->get_output_size(); i++) { @@ -558,14 +470,14 @@ QueryNetworkResult Plugin::QueryNetwork(const CNNNetwork& network, } if (ngraph::op::is_constant(node) || ngraph::op::is_parameter(node)) { - if (!InferenceEngine::details::contains(supported, node->output(0).get_target_inputs().begin()->get_node()->get_friendly_name())) { - supported.erase(node->get_friendly_name()); - } - } else if (ngraph::op::is_output(node)) { - if (!InferenceEngine::details::contains(supported, node->input_values().begin()->get_node()->get_friendly_name())) { - supported.erase(node->get_friendly_name()); - } + if (!InferenceEngine::details::contains(supported, node->output(0).get_target_inputs().begin()->get_node()->get_friendly_name())) { + supported.erase(node->get_friendly_name()); } + } else if (ngraph::op::is_output(node)) { + if (!InferenceEngine::details::contains(supported, node->input_values().begin()->get_node()->get_friendly_name())) { + supported.erase(node->get_friendly_name()); + } + } } for (auto&& layerName : supported) { From cb0084718a1aac159244be2c1da1e093a6d0fd0d Mon Sep 17 00:00:00 2001 From: Ilya Sharikov Date: Mon, 10 Jan 2022 13:50:03 +0300 Subject: [PATCH 49/78] Update list of OMZ models for stress tests (#9391) --- .../desktop_references_config.xml | 48 -------------- .../nightly_configs/desktop_test_config.xml | 12 +--- .../nightly_configs/myriad_test_config.xml | 12 +--- .../weekly_configs/desktop_test_config.xml | 62 ++----------------- 4 files changed, 8 insertions(+), 126 deletions(-) diff --git a/tests/stress_tests/.automation/memcheck_tests/nightly_configs/desktop_references_config.xml b/tests/stress_tests/.automation/memcheck_tests/nightly_configs/desktop_references_config.xml index 69b444db0a1..8e98c96a534 100644 --- 
a/tests/stress_tests/.automation/memcheck_tests/nightly_configs/desktop_references_config.xml +++ b/tests/stress_tests/.automation/memcheck_tests/nightly_configs/desktop_references_config.xml @@ -224,18 +224,6 @@ - - - - - - - - - - - - @@ -272,18 +260,6 @@ - - - - - - - - - - - - @@ -486,18 +462,6 @@ - - - - - - - - - - - - @@ -558,18 +522,6 @@ - - - - - - - - - - - - diff --git a/tests/stress_tests/.automation/memcheck_tests/nightly_configs/desktop_test_config.xml b/tests/stress_tests/.automation/memcheck_tests/nightly_configs/desktop_test_config.xml index 98b47574b56..f8d171c3f67 100644 --- a/tests/stress_tests/.automation/memcheck_tests/nightly_configs/desktop_test_config.xml +++ b/tests/stress_tests/.automation/memcheck_tests/nightly_configs/desktop_test_config.xml @@ -12,17 +12,14 @@ - + - - - @@ -30,7 +27,6 @@ - @@ -52,17 +48,14 @@ - + - - - @@ -70,7 +63,6 @@ - diff --git a/tests/stress_tests/.automation/memcheck_tests/nightly_configs/myriad_test_config.xml b/tests/stress_tests/.automation/memcheck_tests/nightly_configs/myriad_test_config.xml index 7409c6287da..6e8337c369b 100644 --- a/tests/stress_tests/.automation/memcheck_tests/nightly_configs/myriad_test_config.xml +++ b/tests/stress_tests/.automation/memcheck_tests/nightly_configs/myriad_test_config.xml @@ -11,17 +11,14 @@ - + - - - @@ -29,7 +26,6 @@ - @@ -51,17 +47,14 @@ - + - - - @@ -69,7 +62,6 @@ - diff --git a/tests/stress_tests/.automation/memcheck_tests/weekly_configs/desktop_test_config.xml b/tests/stress_tests/.automation/memcheck_tests/weekly_configs/desktop_test_config.xml index 1dd9795ffb4..ed189715b2a 100644 --- a/tests/stress_tests/.automation/memcheck_tests/weekly_configs/desktop_test_config.xml +++ b/tests/stress_tests/.automation/memcheck_tests/weekly_configs/desktop_test_config.xml @@ -7,7 +7,6 @@ - @@ -15,35 +14,24 @@ - - - - - - - - - - - - + @@ -52,21 +40,15 @@ - - - - - - @@ -85,27 +67,18 @@ - - + - - - - - - - - @@ -167,7 +140,6 @@ - @@ -175,35 +147,24 @@ - - - - - - - - - - - - + 
@@ -212,21 +173,15 @@ - - - - - - @@ -245,28 +200,19 @@ - - + - - - - - - - - From 371eaba7cd8937cfa46070deea58009314dc5298 Mon Sep 17 00:00:00 2001 From: Ilya Znamenskiy Date: Mon, 10 Jan 2022 14:04:04 +0300 Subject: [PATCH 50/78] [GPU] Fix of gws/lws inconsistency for some reorder cases (#9467) --- .../reorder/reorder_kernel_base.cpp | 19 ++++++++++--------- .../core/common/kernel_selector_utils.cpp | 13 ++++++++++++- 2 files changed, 22 insertions(+), 10 deletions(-) diff --git a/src/plugins/intel_gpu/src/kernel_selector/core/actual_kernels/reorder/reorder_kernel_base.cpp b/src/plugins/intel_gpu/src/kernel_selector/core/actual_kernels/reorder/reorder_kernel_base.cpp index 2762223ef83..8b34b37fb70 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/core/actual_kernels/reorder/reorder_kernel_base.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/core/actual_kernels/reorder/reorder_kernel_base.cpp @@ -152,10 +152,13 @@ ReorderKernelBase::DispatchData ReorderKernelBase::SetDefault(const reorder_para DispatchData dispatchData; auto& input = params.inputs[0]; + auto& output = params.output; + auto input_l = input.GetLayout(); + auto output_l = output.GetLayout(); DataTensor input_tensor = input; // Image formats reorders use read_image and write_image functions that operate on 4 channels at once, and support only single batch, // make sure that reorder size is equal to spatials sizes only - if (params.inputs[0].GetLayout() == DataLayout::image_2d_rgba || params.output.GetLayout() == DataLayout::image_2d_rgba) { + if (input_l == DataLayout::image_2d_rgba || output_l == DataLayout::image_2d_rgba) { std::vector input_sizes(4, 1); input_sizes[0] = input.X().v; input_sizes[1] = input.Y().v; @@ -165,7 +168,7 @@ ReorderKernelBase::DispatchData ReorderKernelBase::SetDefault(const reorder_para dispatchData.gws = GetTensorFriendlyWorkGroups(input_tensor); dispatchData.lws = GetOptimalLocalWorkGroupSizes(dispatchData.gws, params.engineInfo); - if 
(params.inputs[0].GetLayout() == DataLayout::fs_b_yx_fsv32) { + if (input_l == DataLayout::fs_b_yx_fsv32) { std::vector sizes = { 32, 16, 8, 4 }; for (auto& s : sizes) { if (dispatchData.gws[2] % s == 0) { @@ -175,13 +178,11 @@ ReorderKernelBase::DispatchData ReorderKernelBase::SetDefault(const reorder_para break; } } - } - - if ((params.output.GetLayout() == DataLayout::bs_fs_yx_bsv16_fsv16 || - params.output.GetLayout() == DataLayout::bs_fs_yx_bsv32_fsv32 || - params.output.GetLayout() == DataLayout::b_fs_yx_fsv16 || - params.output.GetLayout() == DataLayout::bs_fs_yx_bsv32_fsv16) && - params.inputs[0].Feature().v % 16 == 0) { + } else if ((output_l == DataLayout::bs_fs_yx_bsv16_fsv16 || + output_l == DataLayout::bs_fs_yx_bsv32_fsv32 || + output_l == DataLayout::b_fs_yx_fsv16 || + output_l == DataLayout::bs_fs_yx_bsv32_fsv16) && + input.Feature().v % 16 == 0 && dispatchData.gws[1] % 16 == 0) { dispatchData.lws[0] = 1; dispatchData.lws[1] = 16; dispatchData.lws[2] = 1; diff --git a/src/plugins/intel_gpu/src/kernel_selector/core/common/kernel_selector_utils.cpp b/src/plugins/intel_gpu/src/kernel_selector/core/common/kernel_selector_utils.cpp index 82ef6112177..fcd2fcb0e48 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/core/common/kernel_selector_utils.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/core/common/kernel_selector_utils.cpp @@ -180,12 +180,23 @@ JitConstants GetTensorFriendlyWorkGroupsJit(const DataTensor& t) { std::vector GetTensorFriendlyWorkGroups(const DataTensor& t) { std::vector sizes; + auto x = DataTensor::Channelndex(t.GetLayout(), Tensor::DataChannelName::X); auto y = DataTensor::Channelndex(t.GetLayout(), Tensor::DataChannelName::Y); auto z = DataTensor::Channelndex(t.GetLayout(), Tensor::DataChannelName::Z); auto w = DataTensor::Channelndex(t.GetLayout(), Tensor::DataChannelName::W); + + auto primary_spatial_axis = x; + if (y < primary_spatial_axis && y != -1) primary_spatial_axis = y; + if (z < primary_spatial_axis && z != -1) 
primary_spatial_axis = z; + if (w < primary_spatial_axis && w != -1) primary_spatial_axis = w; + for (size_t i = 0; i < t.GetDims().size(); i++) { const auto& o = t.GetDims()[i]; - if (y == static_cast(i) || z == static_cast(i) || w == static_cast(i)) { + auto cur_axis_is_spatial = x == static_cast(i) || + y == static_cast(i) || + z == static_cast(i) || + w == static_cast(i); + if (cur_axis_is_spatial && primary_spatial_axis != static_cast(i)) { sizes.back() *= o.v; } else { sizes.push_back(o.v); From aecbd549f8dd52abf42308d01d19f1b45d41c67f Mon Sep 17 00:00:00 2001 From: Anastasia Popova Date: Mon, 10 Jan 2022 14:36:36 +0300 Subject: [PATCH 51/78] Support of partial shapes with boundaries in MO IR reader. (#9223) * Added support of partial shapes boundaries in MO IR reader. * Added comments. --- .../convert_model/Converting_Model.md | 5 +++- .../ir_reader/extenders/parameter_extender.py | 27 +++++++++++++++++-- 2 files changed, 29 insertions(+), 3 deletions(-) diff --git a/docs/MO_DG/prepare_model/convert_model/Converting_Model.md b/docs/MO_DG/prepare_model/convert_model/Converting_Model.md index a8a22189f18..f1ddbc33539 100644 --- a/docs/MO_DG/prepare_model/convert_model/Converting_Model.md +++ b/docs/MO_DG/prepare_model/convert_model/Converting_Model.md @@ -75,7 +75,10 @@ Framework-agnostic parameters: shape to the layout required by Inference Engine (N,C,H,W). The shape should not contain undefined dimensions (? or -1) and should fit the dimensions - defined in the input operation of the graph. If there + defined in the input operation of the graph. Boundaries + of undefined dimension can be specified with ellipsis, + for example [1,1..10,128,128]. One boundary can be undefined, + for example [1,..100] or [1,3,1..,1..]. 
If there are multiple inputs in the model, --input_shape should contain definition of shape for each input separated by a comma, for example: [1,3,227,227],[2,4] for a diff --git a/tools/mo/openvino/tools/mo/utils/ir_reader/extenders/parameter_extender.py b/tools/mo/openvino/tools/mo/utils/ir_reader/extenders/parameter_extender.py index 23a04c6e488..488c4a79b4e 100644 --- a/tools/mo/openvino/tools/mo/utils/ir_reader/extenders/parameter_extender.py +++ b/tools/mo/openvino/tools/mo/utils/ir_reader/extenders/parameter_extender.py @@ -3,6 +3,7 @@ from openvino.tools.mo.front.common.partial_infer.utils import int64_array, shape_array, dynamic_dimension_value from openvino.tools.mo.middle.passes.convert_data_type import destination_type_to_np_data_type +from openvino.tools.mo.utils.cli_parser import parse_dimension from openvino.tools.mo.utils.graph import Node from openvino.tools.mo.utils.ir_reader.extender import Extender @@ -18,7 +19,29 @@ class Parameter_extender(Extender): op.shape = int64_array([]) else: Extender.attr_to_list(op, 'shape') + shape = op.shape.copy() + has_shapes_with_boundaries = False for i, dim in enumerate(op.shape): if dim == -1 or (isinstance(dim, str) and ".." in dim): - op.shape[i] = -1 - op.shape = shape_array([d if d != -1 else dynamic_dimension_value for d in op.shape]) + shape[i] = -1 + if ".." in dim: + has_shapes_with_boundaries = True + shape = shape_array([d if d != -1 else dynamic_dimension_value for d in shape]) + + if has_shapes_with_boundaries: + shape_list = [] + for i, dim in enumerate(op.shape): + if not isinstance(dim, str): + shape_list.append(dim) + else: + shape_list.append(parse_dimension(dim)) + + # This value is used only for serialization of partial shapes with boundaries + # for Parameter node. + # 'user_shape' is not used in shape inference, as propagation of partial shapes with boundaries + # is not implemented in MO. 
+ op['user_shape'] = tuple(shape_list) + + # If 'user_shape' is not set, 'shape' attribute is used for serialization. + # 'shape' is also used for shape inference. + op.shape = shape From b6951bfb2c3bff8346969a3111e7046e9c2de6e4 Mon Sep 17 00:00:00 2001 From: Yuan Hu Date: Mon, 10 Jan 2022 19:54:21 +0800 Subject: [PATCH 52/78] change INT8 to VPUX as first priority (#9261) Signed-off-by: Hu, Yuan2 --- src/plugins/auto/plugin.cpp | 9 +++++++-- src/tests/unit/auto/select_device_test.cpp | 4 ++-- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/src/plugins/auto/plugin.cpp b/src/plugins/auto/plugin.cpp index 0588142b2a3..ac754e87aaa 100644 --- a/src/plugins/auto/plugin.cpp +++ b/src/plugins/auto/plugin.cpp @@ -442,8 +442,13 @@ DeviceInformation MultiDeviceInferencePlugin::SelectDevice(const std::vector VPUX > iGPU > MYRIAD > CPU std::list devices; - devices.splice(devices.end(), dGPU); - devices.splice(devices.end(), VPUX); + if (networkPrecision == "INT8") { + devices.splice(devices.end(), VPUX); + devices.splice(devices.end(), dGPU); + } else { + devices.splice(devices.end(), dGPU); + devices.splice(devices.end(), VPUX); + } devices.splice(devices.end(), iGPU); devices.splice(devices.end(), MYRIAD); devices.splice(devices.end(), CPU); diff --git a/src/tests/unit/auto/select_device_test.cpp b/src/tests/unit/auto/select_device_test.cpp index ac019375b81..526c5470b70 100644 --- a/src/tests/unit/auto/select_device_test.cpp +++ b/src/tests/unit/auto/select_device_test.cpp @@ -48,7 +48,7 @@ const DeviceInformation MYRIAD_INFO = {CommonTestUtils::DEVICE_MYRIAD, {}, 2, "0 const DeviceInformation KEEMBAY_INFO = {CommonTestUtils::DEVICE_KEEMBAY, {}, 2, "01", "VPUX_01" }; const std::vector fp32DeviceVector = {DGPU_INFO, IGPU_INFO, CPU_INFO, MYRIAD_INFO}; const std::vector fp16DeviceVector = {DGPU_INFO, IGPU_INFO, MYRIAD_INFO, CPU_INFO}; -const std::vector int8DeviceVector = {KEEMBAY_INFO, CPU_INFO}; +const std::vector int8DeviceVector = {KEEMBAY_INFO, DGPU_INFO, 
IGPU_INFO, CPU_INFO}; const std::vector binDeviceVector = {DGPU_INFO, IGPU_INFO, CPU_INFO}; const std::vector batchedblobDeviceVector = {DGPU_INFO, IGPU_INFO}; std::map> devicesMap = {{"FP32", fp32DeviceVector}, @@ -161,7 +161,7 @@ public: plugin->SetCore(core); IE_SET_METRIC(OPTIMIZATION_CAPABILITIES, cpuCability, {"FP32", "FP16", "INT8", "BIN"}); - IE_SET_METRIC(OPTIMIZATION_CAPABILITIES, gpuCability, {"FP32", "FP16", "BATCHED_BLOB", "BIN"}); + IE_SET_METRIC(OPTIMIZATION_CAPABILITIES, gpuCability, {"FP32", "FP16", "BATCHED_BLOB", "BIN", "INT8"}); IE_SET_METRIC(OPTIMIZATION_CAPABILITIES, myriadCability, {"FP16"}); IE_SET_METRIC(OPTIMIZATION_CAPABILITIES, vpuxCability, {"INT8"}); From 00361b761750b96e0fd94abf65faaf083b621a5a Mon Sep 17 00:00:00 2001 From: Yuan Hu Date: Mon, 10 Jan 2022 19:57:37 +0800 Subject: [PATCH 53/78] try to fix klocwork issue (#9207) Signed-off-by: Hu, Yuan2 --- .../interface/ie_iexecutable_network_internal.hpp | 2 +- .../cpp_interfaces/interface/ie_iinfer_request_internal.hpp | 2 +- src/plugins/auto/executable_network.hpp | 2 +- src/plugins/auto/utils/log.hpp | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/inference/dev_api/cpp_interfaces/interface/ie_iexecutable_network_internal.hpp b/src/inference/dev_api/cpp_interfaces/interface/ie_iexecutable_network_internal.hpp index bd0c074eb4c..eda5f574677 100644 --- a/src/inference/dev_api/cpp_interfaces/interface/ie_iexecutable_network_internal.hpp +++ b/src/inference/dev_api/cpp_interfaces/interface/ie_iexecutable_network_internal.hpp @@ -149,7 +149,7 @@ public: virtual std::shared_ptr GetContext() const; protected: - ~IExecutableNetworkInternal() = default; + virtual ~IExecutableNetworkInternal() = default; /** * @brief Creates an inference request internal implementation. 
diff --git a/src/inference/dev_api/cpp_interfaces/interface/ie_iinfer_request_internal.hpp b/src/inference/dev_api/cpp_interfaces/interface/ie_iinfer_request_internal.hpp index c771c2bbc60..82d2ac3331b 100644 --- a/src/inference/dev_api/cpp_interfaces/interface/ie_iinfer_request_internal.hpp +++ b/src/inference/dev_api/cpp_interfaces/interface/ie_iinfer_request_internal.hpp @@ -245,7 +245,7 @@ protected: /** * @brief Destroys the object. */ - ~IInferRequestInternal(); + virtual ~IInferRequestInternal(); /** * @brief Checks and executes input data pre-processing if needed. diff --git a/src/plugins/auto/executable_network.hpp b/src/plugins/auto/executable_network.hpp index 2c963d912d8..c6c218eda35 100644 --- a/src/plugins/auto/executable_network.hpp +++ b/src/plugins/auto/executable_network.hpp @@ -143,7 +143,7 @@ private: private: std::shared_ptr _core; InferenceEngine::IStreamsExecutor::Ptr _executor; - MultiDeviceInferencePlugin* _multiPlugin; + MultiDeviceInferencePlugin* _multiPlugin = nullptr; AutoContext _context; bool _workModeIsAUTO = {false}; mutable std::once_flag _oc; diff --git a/src/plugins/auto/utils/log.hpp b/src/plugins/auto/utils/log.hpp index 7f7c8eec2c9..76d388950f0 100644 --- a/src/plugins/auto/utils/log.hpp +++ b/src/plugins/auto/utils/log.hpp @@ -199,7 +199,7 @@ inline void Log::doLog(bool on, bool isTraceCallStack, LogLevel level, const cha } char buffer[255]; std::string compatibleString = "%s" + std::string(fmt); - std::snprintf (&buffer[0], sizeof(buffer), compatibleString.c_str(), "", args...); + std::snprintf(&buffer[0], sizeof(buffer), compatibleString.c_str(), "", args...); stream << ' ' << buffer << suffix << colorEnd(level); std::lock_guard autoLock(mutex); print(stream); From 8d8ceeb5d7ee801213590db3d00ee1c148152313 Mon Sep 17 00:00:00 2001 From: Vladimir Paramuzov Date: Mon, 10 Jan 2022 15:08:11 +0300 Subject: [PATCH 54/78] [GPU] Fixed invalid vector element access in reduce test (#9538) --- 
.../shared_test_classes/src/subgraph/reduce_eltwise.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/tests/functional/shared_test_classes/src/subgraph/reduce_eltwise.cpp b/src/tests/functional/shared_test_classes/src/subgraph/reduce_eltwise.cpp index 14c2cf95ff6..9778b16ec47 100644 --- a/src/tests/functional/shared_test_classes/src/subgraph/reduce_eltwise.cpp +++ b/src/tests/functional/shared_test_classes/src/subgraph/reduce_eltwise.cpp @@ -56,7 +56,8 @@ void ReduceEltwiseTest::SetUp() { auto reduce = std::make_shared(paramOuts[0], reductionAxesNode, keepDims); - std::vector constShape(reduce.get()->get_output_size(), 1); + std::vector constShape(reduce.get()->get_output_partial_shape(0).rank().get_length(), 1); + ASSERT_GT(constShape.size(), 2); constShape[2] = inputShape.back(); auto constant = ngraph::builder::makeConstant(ngPrc, constShape, {}, true); auto eltw = ngraph::builder::makeEltwise(reduce, constant, ngraph::helpers::EltwiseTypes::MULTIPLY); From 04386bb667ea92bf47ad3f162b9b0a73fe2a2e62 Mon Sep 17 00:00:00 2001 From: Svetlana Dolinina Date: Mon, 10 Jan 2022 17:15:21 +0300 Subject: [PATCH 55/78] fixed tensor shapes to work correctly if shape of the first arg less then shape of the second arg; (#9368) added according unit test --- src/core/src/op/divide.cpp | 9 +++++++-- src/core/tests/type_prop/reshape.cpp | 18 ++++++++++++++++-- 2 files changed, 23 insertions(+), 4 deletions(-) diff --git a/src/core/src/op/divide.cpp b/src/core/src/op/divide.cpp index e6ae6a0fc0e..d49cce71378 100644 --- a/src/core/src/op/divide.cpp +++ b/src/core/src/op/divide.cpp @@ -66,6 +66,11 @@ bool evaluate_bound(const Node* node, const HostTensorVector& output_values, boo const auto& input1 = node->input_value(0); const auto& input2 = node->input_value(1); + // broadcast shapes to allocate tensors of correct size for operations with both inputs + PartialShape input_shape = input1.get_partial_shape(); + 
NGRAPH_CHECK(PartialShape::broadcast_merge_into(input_shape, input2.get_partial_shape(), node->get_autob()), + "Argument shapes in divide operation are inconsistent."); + const auto& input2_low = input2.get_tensor().get_lower_value(); if (input2_low == nullptr) return false; @@ -103,7 +108,7 @@ bool evaluate_bound(const Node* node, const HostTensorVector& output_values, boo return status; if (!is_upper) { - auto value1 = std::make_shared(input1.get_element_type(), input1.get_shape()); + auto value1 = std::make_shared(input1.get_element_type(), input_shape); status = op::v1::Select().evaluate({value1}, {input2_positive_up_mask, input1_low, input1_up}); if (!status) return status; @@ -130,7 +135,7 @@ bool evaluate_bound(const Node* node, const HostTensorVector& output_values, boo if (!status) return status; } else { - auto value1 = std::make_shared(input1.get_element_type(), input1.get_shape()); + auto value1 = std::make_shared(input1.get_element_type(), input_shape); status = op::v1::Select().evaluate({value1}, {input2_positive_up_mask, input1_up, input1_low}); if (!status) return status; diff --git a/src/core/tests/type_prop/reshape.cpp b/src/core/tests/type_prop/reshape.cpp index 6eab452ef6c..80d847fa3de 100644 --- a/src/core/tests/type_prop/reshape.cpp +++ b/src/core/tests/type_prop/reshape.cpp @@ -166,7 +166,7 @@ TEST(type_prop, interval_value_propagation_mul_div_rhs_scalar) { ASSERT_EQ(r->get_output_partial_shape(0), PartialShape({Dimension(2, 8), Dimension(8, 32), 4})); } -TEST(type_prop, interval_value_propagation_mul_div_lhs_1D) { +TEST(type_prop, interval_value_propagation_mul_lhs_1D_div) { auto param = make_shared(element::f32, PartialShape{Dimension(2, 8), Dimension(4, 16), 6}); auto shape_of = make_shared(param); auto cast_fp = make_shared(shape_of, element::f32); @@ -180,7 +180,7 @@ TEST(type_prop, interval_value_propagation_mul_div_lhs_1D) { ASSERT_EQ(r->get_output_partial_shape(0), PartialShape({Dimension(2, 8), Dimension(8, 32), 4})); } 
-TEST(type_prop, interval_value_propagation_mul_div_rhs_1D) { +TEST(type_prop, interval_value_propagation_mul_rhs_1D_div) { auto param = make_shared(element::f32, PartialShape{Dimension(2, 8), Dimension(4, 16), 6}); auto shape_of = make_shared(param); auto cast_fp = make_shared(shape_of, element::f32); @@ -194,6 +194,20 @@ TEST(type_prop, interval_value_propagation_mul_div_rhs_1D) { ASSERT_EQ(r->get_output_partial_shape(0), PartialShape({Dimension(2, 8), Dimension(8, 32), 4})); } +TEST(type_prop, interval_value_propagation_mul_div_lhs_1D) { + auto param = make_shared(element::f32, PartialShape{Dimension(2, 8), Dimension(4, 16), 6}); + auto shape_of = make_shared(param); + auto cast_fp = make_shared(shape_of, element::f32); + auto mul = make_shared(cast_fp, op::Constant::create(element::f32, {1}, {2})); + auto div = make_shared(op::Constant::create(element::f32, {}, {192}), mul); + auto cast_int = make_shared(div, element::i32); + + auto r = make_shared(param, cast_int, false); + + ASSERT_EQ(r->get_element_type(), element::f32); + ASSERT_EQ(r->get_output_partial_shape(0), PartialShape({Dimension(12, 48), Dimension(6, 24), 16})); +} + TEST(type_prop, interval_value_propagation_reduce) { auto param = make_shared(element::f32, PartialShape{Dimension(1, 8), 2, 3}); auto shape_of = make_shared(param); From 8fe5484645ab2194daf585b61680e64fdb78e749 Mon Sep 17 00:00:00 2001 From: Anastasia Kuporosova Date: Mon, 10 Jan 2022 17:32:55 +0300 Subject: [PATCH 56/78] [Python API] Remove offline transformations from old python api (#9121) * [Python API] Remove offline transformations from old python api * try to fix import error * try to fix pylint * try to fix pylint2 * Use new api in graph_utils * Fix pylint * Try to fix pylint * Use serialize from pass manager * try to skip tests * try to use new ir Co-authored-by: AlexeyLebedev1 --- .../ie_bridges/python/CMakeLists.txt | 1 - .../offline_transformations/CMakeLists.txt | 64 ------------- .../offline_transformations/__init__.py | 
32 ------- .../offline_transformations_api.pyx | 51 ---------- .../offline_transformations_api_impl.cpp | 94 ------------------- .../offline_transformations_api_impl.hpp | 34 ------- .../offline_transformations_api_impl_defs.pxd | 27 ------ .../python/tests/test_offline_api.py | 63 ------------- .../ie_bridges/python/wheel/CMakeLists.txt | 2 +- .../openvino/tools/pot/graph/graph_utils.py | 15 +-- .../multiple_outputs_net_example_dldt.xml | 4 +- 11 files changed, 12 insertions(+), 375 deletions(-) delete mode 100644 inference-engine/ie_bridges/python/src/openvino/offline_transformations/CMakeLists.txt delete mode 100644 inference-engine/ie_bridges/python/src/openvino/offline_transformations/__init__.py delete mode 100644 inference-engine/ie_bridges/python/src/openvino/offline_transformations/offline_transformations_api.pyx delete mode 100644 inference-engine/ie_bridges/python/src/openvino/offline_transformations/offline_transformations_api_impl.cpp delete mode 100644 inference-engine/ie_bridges/python/src/openvino/offline_transformations/offline_transformations_api_impl.hpp delete mode 100644 inference-engine/ie_bridges/python/src/openvino/offline_transformations/offline_transformations_api_impl_defs.pxd delete mode 100644 inference-engine/ie_bridges/python/tests/test_offline_api.py diff --git a/inference-engine/ie_bridges/python/CMakeLists.txt b/inference-engine/ie_bridges/python/CMakeLists.txt index df85a0897f0..404def10358 100644 --- a/inference-engine/ie_bridges/python/CMakeLists.txt +++ b/inference-engine/ie_bridges/python/CMakeLists.txt @@ -64,7 +64,6 @@ endfunction() set (PYTHON_BRIDGE_SRC_ROOT ${CMAKE_CURRENT_SOURCE_DIR}) add_subdirectory (src/openvino/inference_engine) -add_subdirectory (src/openvino/offline_transformations) if(ENABLE_WHEEL) add_subdirectory(wheel) diff --git a/inference-engine/ie_bridges/python/src/openvino/offline_transformations/CMakeLists.txt b/inference-engine/ie_bridges/python/src/openvino/offline_transformations/CMakeLists.txt 
deleted file mode 100644 index e8526a96258..00000000000 --- a/inference-engine/ie_bridges/python/src/openvino/offline_transformations/CMakeLists.txt +++ /dev/null @@ -1,64 +0,0 @@ -# Copyright (C) 2018-2021 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 -# - -set(TARGET_NAME "offline_transformations_api") - -set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${PYTHON_BRIDGE_OUTPUT_DIRECTORY}/offline_transformations) -set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${PYTHON_BRIDGE_OUTPUT_DIRECTORY}/offline_transformations) -set(CMAKE_COMPILE_PDB_OUTPUT_DIRECTORY ${PYTHON_BRIDGE_OUTPUT_DIRECTORY}/offline_transformations) -set(CMAKE_PDB_OUTPUT_DIRECTORY ${PYTHON_BRIDGE_OUTPUT_DIRECTORY}/offline_transformations) - -set(SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/offline_transformations_api_impl_defs.pxd - ${CMAKE_CURRENT_SOURCE_DIR}/offline_transformations_api.pyx - ${CMAKE_CURRENT_SOURCE_DIR}/offline_transformations_api_impl.hpp - ${CMAKE_CURRENT_SOURCE_DIR}/offline_transformations_api_impl.cpp) - -set_source_files_properties(${CMAKE_CURRENT_SOURCE_DIR}/offline_transformations_api.pyx - PROPERTIES CYTHON_IS_CXX ON) - -# create target - -cython_add_module(${TARGET_NAME} ${SOURCES}) - -add_dependencies(${TARGET_NAME} ie_api) -ov_python_disable_intel_warnings(${TARGET_NAME}) - -if(COMMAND ie_add_vs_version_file) - ie_add_vs_version_file(NAME ${TARGET_NAME} - FILEDESCRIPTION "Offline Transformatoins Python library") -endif() - -if(InferenceEngineDeveloperPackage_FOUND) - list(APPEND link_libraries IE::offline_transformations) -else() - list(APPEND link_libraries offline_transformations) -endif() - -target_include_directories(${TARGET_NAME} SYSTEM PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}/../inference_engine") -target_include_directories(${TARGET_NAME} PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}") -target_link_libraries(${TARGET_NAME} PRIVATE openvino::runtime ${link_libraries}) - -# Compatibility with python 2.7 which has deprecated "register" specifier -if(CMAKE_CXX_COMPILER_ID STREQUAL "Clang") - 
target_compile_options(${TARGET_NAME} PRIVATE "-Wno-error=register") -endif() - -add_clang_format_target(${TARGET_NAME}_clang FOR_TARGETS ${TARGET_NAME} - EXCLUDE_PATTERNS ".*\\.cxx;.*\\.pxd;.*\\.pyx") - -# perform copy -add_custom_command(TARGET ${TARGET_NAME} - POST_BUILD - COMMAND ${CMAKE_COMMAND} -E copy ${PYTHON_BRIDGE_SRC_ROOT}/src/openvino/offline_transformations/__init__.py ${CMAKE_LIBRARY_OUTPUT_DIRECTORY}/__init__.py -) - -# install - -install(TARGETS ${TARGET_NAME} - RUNTIME DESTINATION ${PYTHON_BRIDGE_CPACK_PATH}/${PYTHON_VERSION}/openvino/offline_transformations COMPONENT ${PYTHON_COMPONENT} - LIBRARY DESTINATION ${PYTHON_BRIDGE_CPACK_PATH}/${PYTHON_VERSION}/openvino/offline_transformations COMPONENT ${PYTHON_COMPONENT}) - -install(PROGRAMS __init__.py - DESTINATION ${PYTHON_BRIDGE_CPACK_PATH}/${PYTHON_VERSION}/openvino/offline_transformations - COMPONENT ${PYTHON_COMPONENT}) diff --git a/inference-engine/ie_bridges/python/src/openvino/offline_transformations/__init__.py b/inference-engine/ie_bridges/python/src/openvino/offline_transformations/__init__.py deleted file mode 100644 index 9b0d2fd8f91..00000000000 --- a/inference-engine/ie_bridges/python/src/openvino/offline_transformations/__init__.py +++ /dev/null @@ -1,32 +0,0 @@ -# -*- coding: utf-8 -*- -# Copyright (C) 2018-2021 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -import os -import sys - -if sys.platform == 'win32': - # Installer, yum, pip installs openvino dlls to the different directories - # and those paths need to be visible to the openvino modules - # - # If you're using a custom installation of openvino, - # add the location of openvino dlls to your system PATH. - # - # looking for the libs in the pip installation path by default. - openvino_libs = [os.path.join(os.path.dirname(__file__), '..', '..', 'openvino', 'libs')] - # setupvars.bat script set all libs paths to OPENVINO_LIB_PATHS environment variable. 
- openvino_libs_installer = os.getenv('OPENVINO_LIB_PATHS') - if openvino_libs_installer: - openvino_libs.extend(openvino_libs_installer.split(';')) - for lib in openvino_libs: - lib_path = os.path.join(os.path.dirname(__file__), lib) - if os.path.isdir(lib_path): - # On Windows, with Python >= 3.8, DLLs are no longer imported from the PATH. - if (3, 8) <= sys.version_info: - os.add_dll_directory(os.path.abspath(lib_path)) - else: - os.environ['PATH'] = os.path.abspath(lib_path) + ';' + os.environ['PATH'] - -from .offline_transformations_api import * - -__all__ = ['ApplyMOCTransformations'] diff --git a/inference-engine/ie_bridges/python/src/openvino/offline_transformations/offline_transformations_api.pyx b/inference-engine/ie_bridges/python/src/openvino/offline_transformations/offline_transformations_api.pyx deleted file mode 100644 index 4b42bee7805..00000000000 --- a/inference-engine/ie_bridges/python/src/openvino/offline_transformations/offline_transformations_api.pyx +++ /dev/null @@ -1,51 +0,0 @@ -# Copyright (C) 2018-2021 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -from .cimport offline_transformations_api_impl_defs as C -from ..inference_engine.ie_api cimport IENetwork - -from libcpp cimport bool -from libcpp.string cimport string -from libcpp.map cimport map -from libc.stdint cimport int64_t - - -def ApplyMOCTransformations(IENetwork network, bool cf): - C.ApplyMOCTransformations(network.impl, cf) - - -def ApplyPOTTransformations(IENetwork network, string device): - C.ApplyPOTTransformations(network.impl, device) - - -def ApplyMakeStatefulTransformation(IENetwork network, param_res_names : dict): - cdef map[string, string] c_param_res_names - for param_name, res_name in param_res_names.items(): - if type(param_name) != str or type(res_name) != str: - raise TypeError("Only string keys and values are allowed!") - c_param_res_names[param_name.encode()] = res_name.encode() - C.ApplyMakeStatefulTransformation(network.impl, c_param_res_names) - - 
-def ApplyLowLatencyTransformation(IENetwork network, bool use_const_initializer = True): - C.ApplyLowLatencyTransformation(network.impl, use_const_initializer) - - -def CompressModelTransformation(IENetwork network): - C.CompressModelTransformation(network.impl) - - -def ApplyPruningTransformation(IENetwork network): - C.ApplyPruningTransformation(network.impl) - - -def GenerateMappingFile(IENetwork network, string path, bool extract_names): - C.GenerateMappingFile(network.impl, path, extract_names) - - -def Serialize(IENetwork network, string path_to_xml, string path_to_bin): - C.Serialize(network.impl, path_to_xml, path_to_bin) - - -def CheckAPI(): - C.CheckAPI() diff --git a/inference-engine/ie_bridges/python/src/openvino/offline_transformations/offline_transformations_api_impl.cpp b/inference-engine/ie_bridges/python/src/openvino/offline_transformations/offline_transformations_api_impl.cpp deleted file mode 100644 index 8ffb098e296..00000000000 --- a/inference-engine/ie_bridges/python/src/openvino/offline_transformations/offline_transformations_api_impl.cpp +++ /dev/null @@ -1,94 +0,0 @@ -// Copyright (C) 2018-2021 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// - -#include "offline_transformations_api_impl.hpp" - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -void InferenceEnginePython::ApplyMOCTransformations(InferenceEnginePython::IENetwork network, bool cf) { - ngraph::pass::Manager manager; - manager.register_pass(cf); - manager.run_passes(network.actual->getFunction()); -} - -void InferenceEnginePython::ApplyPOTTransformations(InferenceEnginePython::IENetwork network, std::string device) { - ngraph::pass::Manager manager; - manager.register_pass(std::move(device)); - manager.run_passes(network.actual->getFunction()); -} - -void InferenceEnginePython::ApplyLowLatencyTransformation(InferenceEnginePython::IENetwork network, - bool 
use_const_initializer) { - ngraph::pass::Manager manager; - manager.register_pass(use_const_initializer); - manager.run_passes(network.actual->getFunction()); -} - -void InferenceEnginePython::ApplyMakeStatefulTransformation(InferenceEnginePython::IENetwork network, - std::map& param_res_names) { - ngraph::pass::Manager manager; - manager.register_pass(param_res_names); - manager.run_passes(network.actual->getFunction()); -} - -void InferenceEnginePython::ApplyPruningTransformation(InferenceEnginePython::IENetwork network) { - ngraph::pass::Manager manager; - manager.register_pass(); - manager.run_passes(network.actual->getFunction()); -} - -void InferenceEnginePython::GenerateMappingFile(InferenceEnginePython::IENetwork network, - std::string path, - bool extract_names) { - ngraph::pass::Manager manager; - manager.register_pass(path, extract_names); - manager.run_passes(network.actual->getFunction()); -} - -void InferenceEnginePython::CompressModelTransformation(InferenceEnginePython::IENetwork network) { - ngraph::pass::Manager manager; - manager.register_pass(); - manager.register_pass(); - manager.register_pass(); - manager.run_passes(network.actual->getFunction()); -} - -void InferenceEnginePython::Serialize(InferenceEnginePython::IENetwork network, - std::string path_to_xml, - std::string path_to_bin) { - ngraph::pass::Manager manager; - manager.register_pass(path_to_xml, path_to_bin); - manager.run_passes(network.actual->getFunction()); -} - -void InferenceEnginePython::CheckAPI() { - std::shared_ptr f; - { - auto input = std::make_shared(ngraph::element::f32, ngraph::Shape{1, 1000, 4}); - auto reshape = - std::make_shared(input, std::make_shared(input), true); - f = std::make_shared(ngraph::NodeVector{reshape}, ngraph::ParameterVector{input}); - } - ngraph::pass::Manager m; - m.register_pass(); - m.run_passes(f); - - assert(f->get_results().size() == 1); - auto reshape = f->get_result()->input_value(0).get_node_shared_ptr(); - 
assert(std::dynamic_pointer_cast(reshape->input_value(0).get_node_shared_ptr())); - assert(std::dynamic_pointer_cast(reshape->input_value(1).get_node_shared_ptr())); -} diff --git a/inference-engine/ie_bridges/python/src/openvino/offline_transformations/offline_transformations_api_impl.hpp b/inference-engine/ie_bridges/python/src/openvino/offline_transformations/offline_transformations_api_impl.hpp deleted file mode 100644 index 5de87dc2999..00000000000 --- a/inference-engine/ie_bridges/python/src/openvino/offline_transformations/offline_transformations_api_impl.hpp +++ /dev/null @@ -1,34 +0,0 @@ -// Copyright (C) 2018-2021 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// - -#pragma once - -#include -#include - -#include "Python.h" -#include "ie_api_impl.hpp" - -namespace InferenceEnginePython { - -void ApplyMOCTransformations(InferenceEnginePython::IENetwork network, bool cf); - -void ApplyPOTTransformations(InferenceEnginePython::IENetwork network, std::string device); - -void ApplyLowLatencyTransformation(InferenceEnginePython::IENetwork network, bool use_const_initializer = true); - -void ApplyMakeStatefulTransformation(InferenceEnginePython::IENetwork network, - std::map& param_res_names); - -void ApplyPruningTransformation(InferenceEnginePython::IENetwork network); - -void GenerateMappingFile(InferenceEnginePython::IENetwork network, std::string path, bool extract_names); - -void CompressModelTransformation(InferenceEnginePython::IENetwork network); - -void Serialize(InferenceEnginePython::IENetwork network, std::string path_to_xml, std::string path_to_bin); - -void CheckAPI(); - -}; // namespace InferenceEnginePython diff --git a/inference-engine/ie_bridges/python/src/openvino/offline_transformations/offline_transformations_api_impl_defs.pxd b/inference-engine/ie_bridges/python/src/openvino/offline_transformations/offline_transformations_api_impl_defs.pxd deleted file mode 100644 index 41755a0b2e0..00000000000 --- 
a/inference-engine/ie_bridges/python/src/openvino/offline_transformations/offline_transformations_api_impl_defs.pxd +++ /dev/null @@ -1,27 +0,0 @@ -# Copyright (C) 2018-2021 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -from libcpp cimport bool -from libcpp.string cimport string -from libcpp.map cimport map - -from ..inference_engine.ie_api_impl_defs cimport IENetwork - -cdef extern from "offline_transformations_api_impl.hpp" namespace "InferenceEnginePython": - cdef void ApplyMOCTransformations(IENetwork network, bool cf) - - cdef void ApplyPOTTransformations(IENetwork network, string device) - - cdef void ApplyLowLatencyTransformation(IENetwork network, bool use_const_initializer) - - cdef void ApplyMakeStatefulTransformation(IENetwork network, map[string, string]& in_out_names) - - cdef void ApplyPruningTransformation(IENetwork network) - - cdef void CompressModelTransformation(IENetwork network) - - cdef void GenerateMappingFile(IENetwork network, string path, bool extract_names) - - cdef void Serialize(IENetwork network, string path_to_xml, string path_to_bin) - - cdef void CheckAPI() diff --git a/inference-engine/ie_bridges/python/tests/test_offline_api.py b/inference-engine/ie_bridges/python/tests/test_offline_api.py deleted file mode 100644 index 4c6b7f88415..00000000000 --- a/inference-engine/ie_bridges/python/tests/test_offline_api.py +++ /dev/null @@ -1,63 +0,0 @@ -# Copyright (C) 2018-2021 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -from openvino.inference_engine import IECore, IENetwork -from openvino.offline_transformations import ApplyMOCTransformations, ApplyLowLatencyTransformation, \ - ApplyPruningTransformation, ApplyMakeStatefulTransformation - -import ngraph as ng -from ngraph.impl.op import Parameter -from ngraph.impl import Function, Shape, Type - -from conftest import model_path - - -test_net_xml, test_net_bin = model_path() - -def get_test_cnnnetwork(): - param = ng.parameter(Shape([1, 3, 22, 22]), 
name="parameter") - relu = ng.relu(param) - res = ng.result(relu, name='result') - func = Function([res], [param], 'test') - caps = Function.to_capsule(func) - - cnnNetwork = IENetwork(caps) - assert cnnNetwork != None - return cnnNetwork - - -def test_moc_transformations(): - net = get_test_cnnnetwork() - ApplyMOCTransformations(net, False) - - f = ng.function_from_cnn(net) - assert f != None - assert len(f.get_ops()) == 3 - - -def test_low_latency_transformations(): - net = get_test_cnnnetwork() - ApplyLowLatencyTransformation(net) - - f = ng.function_from_cnn(net) - assert f != None - assert len(f.get_ops()) == 3 - - -def test_make_stateful_transformations(): - net = get_test_cnnnetwork() - ApplyMakeStatefulTransformation(net, {"parameter": "result"}) - - f = ng.function_from_cnn(net) - assert f != None - assert len(f.get_parameters()) == 0 - assert len(f.get_results()) == 0 - - -def test_pruning_transformations(): - net = get_test_cnnnetwork() - ApplyPruningTransformation(net) - - f = ng.function_from_cnn(net) - assert f != None - assert len(f.get_ops()) == 3 diff --git a/inference-engine/ie_bridges/python/wheel/CMakeLists.txt b/inference-engine/ie_bridges/python/wheel/CMakeLists.txt index 7bb7bc2ab79..69e49deffb6 100644 --- a/inference-engine/ie_bridges/python/wheel/CMakeLists.txt +++ b/inference-engine/ie_bridges/python/wheel/CMakeLists.txt @@ -20,7 +20,7 @@ endif() # create target for openvino.wheel -set(openvino_wheel_deps ie_api offline_transformations_api) +set(openvino_wheel_deps ie_api) foreach(_target ov_runtime_libraries ie_plugins _pyngraph pyopenvino) if(TARGET ${_target}) list(APPEND openvino_wheel_deps ${_target}) diff --git a/tools/pot/openvino/tools/pot/graph/graph_utils.py b/tools/pot/openvino/tools/pot/graph/graph_utils.py index 795a3a6caa5..01b7c8de530 100644 --- a/tools/pot/openvino/tools/pot/graph/graph_utils.py +++ b/tools/pot/openvino/tools/pot/graph/graph_utils.py @@ -7,14 +7,16 @@ from copy import deepcopy from 
openvino.tools.mo.graph.graph import Graph from openvino.tools.mo.utils.ir_reader.restore_graph import restore_graph_from_ir, save_restored_graph from openvino.tools.mo.utils.logger import init_logger -from openvino.inference_engine import IECore # pylint: disable=E0611 -from openvino.offline_transformations import ApplyPOTTransformations # pylint: disable=import-error,no-name-in-module +from openvino.runtime import Core # pylint: disable=E0401,E0611 +from openvino.runtime.passes import Manager # pylint: disable=E0401,E0611 +from openvino.offline_transformations_pybind import apply_pot_transformations # pylint: disable=import-error,no-name-in-module from ..graph.passes import ModelPreprocessor, remove_converts, add_removed_converts from ..utils.logger import stdout_redirect init_logger('ERROR', False) -ie = IECore() +core = Core() +pass_manager = Manager() def load_graph(model_config, target_device='ANY'): @@ -28,11 +30,12 @@ def load_graph(model_config, target_device='ANY'): xml_path = model_config.model if target_device in special_transform_devices: - network = ie.read_network(model=xml_path, weights=bin_path) - ApplyPOTTransformations(network, target_device.encode('utf-8')) + model = core.read_model(model=xml_path, weights=bin_path) + apply_pot_transformations(model, target_device.encode('utf-8')) bin_path = serialized_bin_path xml_path = serialized_xml_path - network.serialize(xml_path, bin_path) + pass_manager.register_pass(pass_name="Serialize", xml_path=xml_path, bin_path=bin_path) + pass_manager.run_passes(model) if not os.path.exists(xml_path): raise RuntimeError('Input model xml should link to an existing file. 
Please, provide a correct path.') diff --git a/tools/pot/tests/data/reference_models/multiple_outputs_net_example_dldt.xml b/tools/pot/tests/data/reference_models/multiple_outputs_net_example_dldt.xml index 1d8619041a5..caba76d4fa8 100644 --- a/tools/pot/tests/data/reference_models/multiple_outputs_net_example_dldt.xml +++ b/tools/pot/tests/data/reference_models/multiple_outputs_net_example_dldt.xml @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:76959e68b61e35bca1e4d0a815a2fdd2a50fbfa1ca6d4cb6218d6d57f76603b2 -size 23366 +oid sha256:5ed7c8ba0078d053ca7408709468b5f8450c8c4236938bc4107911f969a6f1ed +size 23441 From af105b86f8e78cdaa5df16a0c3f4913f1745f383 Mon Sep 17 00:00:00 2001 From: Alexandra Sidorova Date: Mon, 10 Jan 2022 17:51:33 +0300 Subject: [PATCH 57/78] [CPU] Fixed Replicate via ov::Model (#9252) --- src/bindings/python/tests/__init__.py | 2 -- .../python/tests/test_onnx/test_zoo_models.py | 2 -- .../python/tests_compatibility/__init__.py | 2 -- .../test_onnx/test_zoo_models.py | 2 -- src/plugins/intel_cpu/src/mkldnn_graph.cpp | 19 +++++----- src/plugins/intel_cpu/src/mkldnn_graph.h | 2 +- .../subgraph_tests/simple_if.cpp | 25 +++++++++++++ .../subgraph/simple_if.hpp | 5 +++ .../src/subgraph/simple_if.cpp | 35 +++++++++++++++++++ 9 files changed, 74 insertions(+), 20 deletions(-) diff --git a/src/bindings/python/tests/__init__.py b/src/bindings/python/tests/__init__.py index ebb26410104..6f04c205919 100644 --- a/src/bindings/python/tests/__init__.py +++ b/src/bindings/python/tests/__init__.py @@ -60,8 +60,6 @@ xfail_issue_38708 = xfail_test(reason="RuntimeError: While validating ONNX node xfail_issue_38710 = xfail_test(reason="RuntimeError: data has zero dimension which is not allowed") xfail_issue_38713 = xfail_test(reason="RuntimeError: nGraph does not support the following ONNX operations: " "ai.onnx.preview.training.Momentum") -xfail_issue_45457 = xfail_test(reason="RuntimeError: Unsupported dynamic ops: v5::Loop " - "Not 
constant termination condition body output is not supported") xfail_issue_38724 = xfail_test(reason="RuntimeError: While validating ONNX node '': " "tf_crop_and_resize - this type of coordinate transformation mode " "is not supported. Choose one of the following modes: " diff --git a/src/bindings/python/tests/test_onnx/test_zoo_models.py b/src/bindings/python/tests/test_onnx/test_zoo_models.py index d012e9ced3a..4d9199cb1db 100644 --- a/src/bindings/python/tests/test_onnx/test_zoo_models.py +++ b/src/bindings/python/tests/test_onnx/test_zoo_models.py @@ -13,7 +13,6 @@ from tests.test_onnx.utils.model_importer import ModelImportRunner from tests import ( xfail_issue_67415, xfail_issue_38701, - xfail_issue_45457, xfail_issue_37957, xfail_issue_39669, xfail_issue_37973, @@ -193,7 +192,6 @@ if len(zoo_models) > 0: (xfail_issue_39669, "test_MSFT_opset9_cgan_cgan_cpu"), (xfail_issue_47495, "test_MSFT_opset10_BERT_Squad_bertsquad10_cpu"), - (xfail_issue_45457, "test_MSFT_opset10_mlperf_ssd_resnet34_1200_ssd_resnet34_mAP_20.2_cpu"), (xfail_issue_63643, "test_MSFT_opset10_mlperf_ssd_mobilenet_300_ssd_mobilenet_v1_coco_2018_01_28_cpu"), ] for test_case in import_xfail_list + execution_xfail_list: diff --git a/src/bindings/python/tests_compatibility/__init__.py b/src/bindings/python/tests_compatibility/__init__.py index 1cb86e1a3b7..2081db27c98 100644 --- a/src/bindings/python/tests_compatibility/__init__.py +++ b/src/bindings/python/tests_compatibility/__init__.py @@ -65,8 +65,6 @@ xfail_issue_38708 = xfail_test(reason="RuntimeError: While validating ONNX node xfail_issue_38710 = xfail_test(reason="RuntimeError: data has zero dimension which is not allowed") xfail_issue_38713 = xfail_test(reason="RuntimeError: nGraph does not support the following ONNX operations: " "ai.onnx.preview.training.Momentum") -xfail_issue_45457 = xfail_test(reason="RuntimeError: Unsupported dynamic ops: v5::Loop " - "Not constant termination condition body output is not supported") 
xfail_issue_38722 = xfail_test(reason="RuntimeError: While validating ONNX nodes MatMulInteger " "and QLinearMatMul " "Input0 scale and input0 zero point shape must be same and 1") diff --git a/src/bindings/python/tests_compatibility/test_onnx/test_zoo_models.py b/src/bindings/python/tests_compatibility/test_onnx/test_zoo_models.py index f24c16e5a28..baa6f853c34 100644 --- a/src/bindings/python/tests_compatibility/test_onnx/test_zoo_models.py +++ b/src/bindings/python/tests_compatibility/test_onnx/test_zoo_models.py @@ -12,7 +12,6 @@ from tests_compatibility.test_onnx.utils.model_importer import ModelImportRunner from tests_compatibility import ( xfail_issue_38701, - xfail_issue_45457, xfail_issue_37957, xfail_issue_38084, xfail_issue_39669, @@ -183,7 +182,6 @@ if len(zoo_models) > 0: (xfail_issue_39669, "test_MSFT_opset9_cgan_cgan_cpu"), (xfail_issue_47495, "test_MSFT_opset10_BERT_Squad_bertsquad10_cpu"), - (xfail_issue_45457, "test_MSFT_opset10_mlperf_ssd_resnet34_1200_ssd_resnet34_mAP_20.2_cpu"), (xfail_issue_63643, "test_MSFT_opset10_mlperf_ssd_mobilenet_300_ssd_mobilenet_v1_coco_2018_01_28_cpu"), ] for test_case in import_xfail_list + execution_xfail_list: diff --git a/src/plugins/intel_cpu/src/mkldnn_graph.cpp b/src/plugins/intel_cpu/src/mkldnn_graph.cpp index f9374c48610..56ebf892143 100644 --- a/src/plugins/intel_cpu/src/mkldnn_graph.cpp +++ b/src/plugins/intel_cpu/src/mkldnn_graph.cpp @@ -85,7 +85,7 @@ template void MKLDNNGraph::CreateGraph(const std::shared_ptr &subgraph, const MKLDNNExtensionManager::Ptr& extMgr) { +void MKLDNNGraph::Replicate(const std::shared_ptr &subgraph, const MKLDNNExtensionManager::Ptr& extMgr) { this->_name = "subgraph"; this->reuse_io_tensors = false; @@ -93,7 +93,7 @@ void MKLDNNGraph::Replicate(const std::shared_ptr &subgr ngraph::pass::low_precision::LowPrecision::isFunctionQuantized(subgraph); // Map data object onto producer node - std::map, std::pair> op2node; + std::map, MKLDNNNodePtr> op2node; // nodes which has no 
consumers (output or just unused). But doesn't marked as graph output. // Will be stored as fake output separately. @@ -130,13 +130,13 @@ void MKLDNNGraph::Replicate(const std::shared_ptr &subgr outputNodesMap[inputID] = node; } + op2node[op] = node; + for (size_t port = 0; port < op->get_input_size(); port++) { auto parentOp = op->get_input_node_shared_ptr(port); + auto parentNode = op2node[parentOp]; - auto portInfo = op2node[parentOp]; - auto parentNode = portInfo.first; - - MKLDNNEdgePtr edge(new MKLDNNEdge(parentNode, node, getParentOutputPort(op, parentOp, port), port)); + MKLDNNEdgePtr edge(new MKLDNNEdge(parentNode, node, getParentOutputPort(op, parentOp, port), static_cast(port))); node->addEdge(edge); graphEdges.push_back(edge); } @@ -145,9 +145,7 @@ void MKLDNNGraph::Replicate(const std::shared_ptr &subgr ngraph::op::v0::Result::get_type_info_static(), ngraph::op::v3::Assign::get_type_info_static(), ngraph::op::v6::Assign::get_type_info_static())) { - int outPortIdx = 0; for (int oi = 0; oi < op->get_output_size(); oi++) { - op2node[op->output(oi).get_node_shared_ptr()] = {node, outPortIdx++}; if (op->get_output_target_inputs(oi).empty()) { unusedOutputs.push_back(op->output(oi)); } @@ -157,9 +155,8 @@ void MKLDNNGraph::Replicate(const std::shared_ptr &subgr // Add stub output node for unused data for (auto unusedOutput : unusedOutputs) { - auto portInfo = op2node[unusedOutput.get_node_shared_ptr()]; - auto parentNode = portInfo.first; - auto port = portInfo.second; + auto parentNode = op2node[unusedOutput.get_node_shared_ptr()]; + const auto port = unusedOutput.get_index(); const auto nodeName = std::string("stub_") + std::to_string(unusedOutput.get_index()) + "_" + parentNode->getName(); const MKLDNNNodePtr outNode = std::make_shared(parentNode->outputShapes[port], parentNode->getOriginalOutputPrecisionAtPort(port), diff --git a/src/plugins/intel_cpu/src/mkldnn_graph.h b/src/plugins/intel_cpu/src/mkldnn_graph.h index 93f1b9b1bbc..16e45427d7f 100644 --- 
a/src/plugins/intel_cpu/src/mkldnn_graph.h +++ b/src/plugins/intel_cpu/src/mkldnn_graph.h @@ -220,7 +220,7 @@ protected: static mkldnn::engine eng; void Replicate(const InferenceEngine::CNNNetwork &network, const MKLDNNExtensionManager::Ptr& extMgr); - void Replicate(const std::shared_ptr &subgraph, const MKLDNNExtensionManager::Ptr& extMgr); + void Replicate(const std::shared_ptr &subgraph, const MKLDNNExtensionManager::Ptr& extMgr); void InitGraph(); void InitNodes(); void InitDescriptors(); diff --git a/src/tests/functional/plugin/cpu/shared_tests_instances/subgraph_tests/simple_if.cpp b/src/tests/functional/plugin/cpu/shared_tests_instances/subgraph_tests/simple_if.cpp index 5678295d0ca..f9607230c3f 100644 --- a/src/tests/functional/plugin/cpu/shared_tests_instances/subgraph_tests/simple_if.cpp +++ b/src/tests/functional/plugin/cpu/shared_tests_instances/subgraph_tests/simple_if.cpp @@ -152,4 +152,29 @@ TEST_P(SimpleIfNotConstConditionAndDimsIncreaseTest, CompareWithRefs) { run(); }; +// the axis of split in test suit "SimpleIfNotConstConditionUnusedOutputPortsTest" is hardcoded as 1, so shape[axis] should be static +std::vector> inputShapes_4 = { + { + {{}, {{5, 7}}}, + }, + { + { + {-1, 5, -1}, + {{10, 5, 10}, {2, 5, 5}, {1, 5, 5}} + }, + }, +}; + +INSTANTIATE_TEST_SUITE_P(smoke_If, SimpleIfNotConstConditionUnusedOutputPortsTest, + ::testing::Combine( + ::testing::ValuesIn(inputShapes_4), + ::testing::ValuesIn(inTypes), + ::testing::ValuesIn(conditions), + ::testing::Values(CommonTestUtils::DEVICE_CPU)), + SimpleIfNotConstConditionUnusedOutputPortsTest::getTestCaseName); + +TEST_P(SimpleIfNotConstConditionUnusedOutputPortsTest, CompareWithRefs) { + run(); +}; + } // namespace diff --git a/src/tests/functional/shared_test_classes/include/shared_test_classes/subgraph/simple_if.hpp b/src/tests/functional/shared_test_classes/include/shared_test_classes/subgraph/simple_if.hpp index 61aab99df09..94c97aef512 100644 --- 
a/src/tests/functional/shared_test_classes/include/shared_test_classes/subgraph/simple_if.hpp +++ b/src/tests/functional/shared_test_classes/include/shared_test_classes/subgraph/simple_if.hpp @@ -58,4 +58,9 @@ protected: void compare(const std::vector &expected, const std::vector &actual) override; }; +class SimpleIfNotConstConditionUnusedOutputPortsTest : public SimpleIfNotConstConditionTest { +protected: + void SetUp() override; +}; + } // namespace SubgraphTestsDefinitions diff --git a/src/tests/functional/shared_test_classes/src/subgraph/simple_if.cpp b/src/tests/functional/shared_test_classes/src/subgraph/simple_if.cpp index 4cc469d17b2..acaa0cbcb05 100644 --- a/src/tests/functional/shared_test_classes/src/subgraph/simple_if.cpp +++ b/src/tests/functional/shared_test_classes/src/subgraph/simple_if.cpp @@ -250,4 +250,39 @@ void SimpleIfNotConstConditionAndDimsIncreaseTest::compare(const std::vector shapes; + ov::test::ElementType inType; + std::tie(shapes, inType, condition, targetDevice) = this->GetParam(); + + init_input_shapes(shapes); + for (auto &target : targetStaticShapes) + target.emplace_back(ov::Shape{}); + auto params = ngraph::builder::makeDynamicParams(inType, inputDynamicShapes); + params.emplace_back(std::make_shared(ov::element::Type_t::boolean, ov::Shape{})); + + auto p1 = std::make_shared(inType, inputDynamicShapes[0]); + auto p2 = std::make_shared(inType, inputDynamicShapes[0]); + + const size_t axis = 1; + const size_t dim = inputDynamicShapes[0][axis].get_length(); // should be static for this test suit + auto thenOp = ngraph::builder::makeSplit(p1, inType, dim, axis); + auto thenRes = std::make_shared(thenOp->output(dim / 2)); + + auto elseOp = ngraph::builder::makeSplit(p2, inType, dim, axis); + auto elseRes = std::make_shared(elseOp->output(dim - 1)); + + auto thenBody = std::make_shared(ov::OutputVector{thenRes}, ov::ParameterVector{p1}); + auto elseBody = std::make_shared(ov::OutputVector{elseRes}, ov::ParameterVector{p2}); + + auto 
ifOp = std::make_shared(params[1]); + ifOp->set_then_body(thenBody); + ifOp->set_else_body(elseBody); + ifOp->set_input(params[0], p1, p2); + auto ifRes = ifOp->set_output(thenRes, elseRes); + + ov::ResultVector results{std::make_shared(ifRes)}; + function = std::make_shared(results, params, "SimpleIfNotConstConditionUnusedOutputPortsTest"); +} + } // namespace SubgraphTestsDefinitions From c1206ef447722b5d764bf15860f4b0a67ea85140 Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Mon, 10 Jan 2022 23:46:57 +0800 Subject: [PATCH 58/78] [CPU] SoftMax cache (#9480) * [CPUCache]SoftMax cache * [CpuCache]fix bf16 tests * [CPUCache]apply review comments * [CPUCache]fix compilation --- .../src/nodes/mkldnn_softmax_node.cpp | 82 +++- .../plugin/cpu/single_layer_tests/softmax.cpp | 433 +++++++++--------- 2 files changed, 276 insertions(+), 239 deletions(-) diff --git a/src/plugins/intel_cpu/src/nodes/mkldnn_softmax_node.cpp b/src/plugins/intel_cpu/src/nodes/mkldnn_softmax_node.cpp index 3732640e09e..e4c3af91dc6 100644 --- a/src/plugins/intel_cpu/src/nodes/mkldnn_softmax_node.cpp +++ b/src/plugins/intel_cpu/src/nodes/mkldnn_softmax_node.cpp @@ -10,11 +10,45 @@ #include #include #include "memory_desc/dnnl_blocked_memory_desc.h" +#include using namespace mkldnn; using namespace MKLDNNPlugin; using namespace InferenceEngine; +namespace { +struct SoftmaxKey { + DnnlMemoryDescCPtr inp0; + impl_desc_type implType; + size_t axis; + + size_t hash() const; + bool operator==(const SoftmaxKey& rhs) const; +}; + +size_t SoftmaxKey::hash() const { + using namespace dnnl::impl; + using namespace dnnl::impl::primitive_hashing; + + size_t seed = 0; + + seed = hash_combine(seed, get_md_hash(inp0->getDnnlDesc().data)); + seed = hash_combine(seed, implType); + seed = hash_combine(seed, axis); + return seed; +} + +bool SoftmaxKey::operator==(const SoftmaxKey& rhs) const { + bool retVal = true; + if (inp0 != rhs.inp0) { + retVal = retVal && inp0 && rhs.inp0 && inp0->getDnnlDesc() == 
rhs.inp0->getDnnlDesc(); + } + + retVal = retVal && implType == rhs.implType && axis == rhs.axis; + return retVal; +} +} // namespace + bool MKLDNNSoftMaxNode::isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept { try { if (!std::dynamic_pointer_cast(op)) { @@ -108,32 +142,44 @@ void MKLDNNSoftMaxNode::createDescriptor(const std::vector &input void MKLDNNSoftMaxNode::prepareParams() { auto inpDesc = getParentEdgeAt(0)->getMemory().GetDescWithType(); - const auto& in_candidate = inpDesc->getDnnlDesc(); - MKLDNNDescriptor desc(std::shared_ptr( - new softmax_forward::desc(prop_kind::forward_scoring, in_candidate, axis))); + const NodeDesc* selected_pd = getSelectedPrimitiveDescriptor(); - const NodeDesc *selected_pd = getSelectedPrimitiveDescriptor(); if (selected_pd == nullptr) IE_THROW() << "Preferable primitive descriptor is not set for node " << getName() << "."; - softmax_forward::primitive_desc prim_desc; - primitive_desc_iterator itpd = desc.createPrimitiveDescriptorIterator(getEngine()); + SoftmaxKey key = {inpDesc, selected_pd->getImplementationType(), axis}; + auto engine = getEngine(); + auto builder = [&engine](const SoftmaxKey& key) -> std::shared_ptr { + softmax_forward::primitive_desc prim_desc; + MKLDNNDescriptor desc(std::shared_ptr( + new softmax_forward::desc(prop_kind::forward_scoring, key.inp0->getDnnlDesc(), key.axis))); + primitive_desc_iterator itpd = desc.createPrimitiveDescriptorIterator(engine); - while (itpd) { - impl_desc_type impl_type = parse_impl_name(itpd.impl_info_str()); - if (impl_type == selected_pd->getImplementationType() || - // At least for oneDNN v2.4 the softmax primitive is optimized for the cases where the dimension of the softmax axis is physically dense. 
- // There could be situations where it is not possible to detect the optimized case in advance in case of dynamic shapes, but - // in runtime the shape could be suitable for the optimized implementation, so we have to select the optimized one. - (ref_any == selected_pd->getImplementationType() && (impl_type & jit))) { - prim_desc = itpd.get(); - break; + while (itpd) { + impl_desc_type impl_type = parse_impl_name(itpd.impl_info_str()); + if (impl_type == key.implType || + // At least for oneDNN v2.4 the softmax primitive is optimized for the cases where the dimension of the + // softmax axis is physically dense. There could be situations where it is not possible to detect the + // optimized case in advance in case of dynamic shapes, but in runtime the shape could be suitable for + // the optimized implementation, so we have to select the optimized one. + (ref_any == key.implType && (impl_type & jit))) { + prim_desc = itpd.get(); + break; + } + if (!itpd.next_impl()) + return nullptr; } - if (!itpd.next_impl()) - IE_THROW() << "Primitive descriptor was not found for node " << getName() << "."; + return std::make_shared(prim_desc); + }; + + auto cache = getRuntimeCache(); + auto result = cache->getOrCreate(key, builder); + + if (!result.first) { + IE_THROW() << "Primitive descriptor was not found for node " << getName() << "."; } - prim.reset(new softmax_forward(prim_desc)); + prim = result.first; auto src = getParentEdgesAtPort(0)[0]->getMemoryPtr()->GetPrimitive(); auto dst = getChildEdgesAtPort(0)[0]->getMemoryPtr()->GetPrimitive(); diff --git a/src/tests/functional/plugin/cpu/single_layer_tests/softmax.cpp b/src/tests/functional/plugin/cpu/single_layer_tests/softmax.cpp index e7bf7e0d93a..a74f006bd3b 100644 --- a/src/tests/functional/plugin/cpu/single_layer_tests/softmax.cpp +++ b/src/tests/functional/plugin/cpu/single_layer_tests/softmax.cpp @@ -1,222 +1,213 @@ -//// Copyright (C) 2018-2021 Intel Corporation -//// SPDX-License-Identifier: Apache-2.0 -//// +// 
Copyright (C) 2018-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 // -//#include -//#include "test_utils/cpu_test_utils.hpp" -// -//using namespace InferenceEngine; -//using namespace CPUTestUtils; -// -//namespace CPULayerTestsDefinitions { -//using ShapesDefenition = std::pair, std::vector>>; -// -//struct SoftMaxConfig { -// ShapesDefenition inputShapes; -// size_t axis; -//}; -// -//typedef std::tuple< -// InferenceEngine::Precision, // netPrecision -// SoftMaxConfig, // softmaxTestConfig -// std::string, // targetDevice -// CPUSpecificParams -//> softmaxCPUTestParams; -// -//class SoftMaxLayerCPUTest : public testing::WithParamInterface, -// virtual public LayerTestsUtils::LayerTestsCommon, public CPUTestsBase { -//public: -// static std::string getTestCaseName(const testing::TestParamInfo& obj) { -// CPUSpecificParams cpuParams; -// InferenceEngine::Precision netPrecision; -// SoftMaxConfig config; -// std::string targetDevice; -// std::tie(netPrecision, config, targetDevice, cpuParams) = obj.param; -// -// std::ostringstream result; -// result << "netPRC=" << netPrecision.name() << "_"; -// if (!config.inputShapes.first.empty()) { -// result << "IS=" << CommonTestUtils::partialShape2str(config.inputShapes.first) << "_"; -// } -// result << "TS="; -// for (const auto& shape : config.inputShapes.second) { -// result << "("; -// if (!shape.empty()) { -// auto itr = shape.begin(); -// do { -// result << CommonTestUtils::vec2str(*itr); -// } while (++itr != shape.end() && result << "_"); -// } -// result << ")_"; -// } -// result << "axis=" << config.axis << "_"; -// result << "trgDev=" << targetDevice; -// result << CPUTestsBase::getTestCaseName(cpuParams); -// -// return result.str(); -// } -// -//protected: -// void SetUp() override { -// InferenceEngine::Precision netPrecision; -// SoftMaxConfig config; -// CPUSpecificParams cpuParams; -// std::tie(netPrecision, config, targetDevice, cpuParams) = this->GetParam(); -// -// inPrc = outPrc = 
netPrecision; -// -// std::tie(inFmts, outFmts, priority, selectedType) = cpuParams; -// if (selectedType.empty()) { -// selectedType = getPrimitiveType(); -// } -// selectedType.push_back('_'); -// selectedType += inPrc.name(); -// -// const auto ngPrc = FuncTestUtils::PrecisionUtils::convertIE2nGraphPrc(netPrecision); -// -// targetStaticShapes = config.inputShapes.second; -// inputDynamicShapes = config.inputShapes.first; -// -// auto inputShape = targetStaticShapes.front().front(); -// -// auto params = ngraph::builder::makeParams(ngPrc, {inputShape}); -// -// const auto paramOuts = -// ngraph::helpers::convert2OutputVector(ngraph::helpers::castOps2Nodes(params)); -// -// const auto softMax = std::make_shared(paramOuts.at(0), config.axis); -// -// function = makeNgraphFunction(ngPrc, params, softMax, "SoftMax"); -// } -//}; -// -//TEST_P(SoftMaxLayerCPUTest, CompareWithRefs) { -// SKIP_IF_CURRENT_TEST_IS_DISABLED() -// -// Run(); -// CheckPluginRelatedResults(executableNetwork, "Softmax"); -//} -// -//namespace { -////not optimized cpu spec -//const auto notOptimizedCPUSpec = CPUSpecificParams{{}, {}, {}, "ref_any"}; -// -//const std::vector optimizedConfigsFP32 = { -// //Static shapes -// {ShapesDefenition{{}, {{{1, 100}}}}, 1}, -// {ShapesDefenition{{}, {{{10, 10}}}}, 1}, -// {ShapesDefenition{{}, {{{100, 1}}}}, 0}, -// {ShapesDefenition{{}, {{{100, 1}}}}, 1}, -// {ShapesDefenition{{}, {{{5, 5, 1}}}}, 1}, -// {ShapesDefenition{{}, {{{5, 5, 5}}}}, 2}, -// {ShapesDefenition{{}, {{{5, 5, 5, 5}}}}, 0}, -// {ShapesDefenition{{}, {{{5, 5, 1, 1}}}}, 1}, -// {ShapesDefenition{{}, {{{5, 5, 5, 5}}}}, 1}, -// {ShapesDefenition{{}, {{{5, 5, 5, 1}}}}, 2}, -// {ShapesDefenition{{}, {{{5, 5, 5, 5}}}}, 2}, -// {ShapesDefenition{{}, {{{5, 5, 5, 5}}}}, 3}, -// {ShapesDefenition{{}, {{{5, 5, 5, 5, 5}}}}, 0}, -// {ShapesDefenition{{}, {{{5, 5, 1, 1, 1}}}}, 1}, -// {ShapesDefenition{{}, {{{5, 5, 5, 5, 5}}}}, 1}, -// {ShapesDefenition{{}, {{{5, 5, 5, 1, 1}}}}, 2}, -// 
{ShapesDefenition{{}, {{{5, 5, 5, 5, 5}}}}, 2}, -// {ShapesDefenition{{}, {{{5, 5, 5, 1, 1}}}}, 3}, -// {ShapesDefenition{{}, {{{5, 5, 5, 5, 5}}}}, 3}, -// {ShapesDefenition{{}, {{{5, 5, 5, 5, 1}}}}, 4}, -// {ShapesDefenition{{}, {{{5, 5, 5, 5, 5}}}}, 4}, -// //Dynamic shapes -// {ShapesDefenition{ -// { //dynamic shape -// {-1, -1} -// }, -// { //target static shapes -// {{10, 10}}, -// {{15, 15}}, -// {{10, 5}} -// }}, 1}, -// {ShapesDefenition{ -// { //dynamic shape -// {{1, 100}, {1, 100}} -// }, -// { //target static shapes -// {{10, 10}}, -// {{15, 15}}, -// {{10, 5}} -// }}, 1}, -// {ShapesDefenition{ -// { //dynamic shape -// {-1, -1, 1, 1, 1} -// }, -// { //target static shapes -// {{5, 5, 1, 1, 1}}, -// {{10, 7, 1, 1, 1}} -// }}, 1}, -//}; -// -//const std::vector notOptimizedConfigsFP32 { -// //Static shapes -// {ShapesDefenition{{}, {{{1, 100}}}}, 0}, -// {ShapesDefenition{{}, {{{10, 10}}}}, 0}, -// {ShapesDefenition{{}, {{{10, 10, 10}}}}, 0}, -// {ShapesDefenition{{}, {{{10, 10, 10}}}}, 1}, -// //Dynamic shapes -// {ShapesDefenition{ -// { //dynamic shape -// {-1, -1} -// }, -// { //target static shapes -// {{10, 1}}, {{15, 15}}, {{10, 5}} -// }}, 0}, -// {ShapesDefenition{ -// { //dynamic shape -// {{1, 100}, {1, 100}, -1} -// }, -// { //target static shapes -// {{10, 10, 10}}, {{10, 10, 1}}, {{10, 5, 10}} -// }}, 1}, -//}; -// -//const std::vector unsupportedConfigsFP32 { -// //Static shapes -// {ShapesDefenition{{}, {{{5, 5, 5, 5, 5, 5}}}}, 0}, -// {ShapesDefenition{{}, {{{5, 5, 5, 5, 5, 5}}}}, 1}, -// {ShapesDefenition{{}, {{{5, 5, 5, 5, 5, 5}}}}, 2}, -// {ShapesDefenition{{}, {{{5, 5, 5, 5, 5, 5}}}}, 3}, -// {ShapesDefenition{{}, {{{5, 5, 5, 5, 5, 5}}}}, 4}, -// {ShapesDefenition{{}, {{{5, 5, 5, 5, 5, 5}}}}, 5}, -// //Dynamic shapes -// {ShapesDefenition{ -// { //dynamic shape -// {-1, -1, -1, -1, -1, -1} -// }, -// { //target static shapes -// {{5, 5, 5, 5, 5, 5}}, {{7, 7, 7, 7, 7, 7}} -// }}, 4}, -//}; -// -//const auto OptimizedParams = 
testing::Combine( -// testing::Values(Precision::FP32, Precision::BF16), -// testing::ValuesIn(optimizedConfigsFP32), -// testing::Values(CommonTestUtils::DEVICE_CPU), -// testing::Values(emptyCPUSpec)); -// -//INSTANTIATE_TEST_SUITE_P(smoke_SoftMax_Optimized_CPU, SoftMaxLayerCPUTest, OptimizedParams, SoftMaxLayerCPUTest::getTestCaseName); -// -//const auto NotOptimizedParams = testing::Combine( -// testing::Values(Precision::FP32, Precision::BF16), -// testing::ValuesIn(notOptimizedConfigsFP32), -// testing::Values(CommonTestUtils::DEVICE_CPU), -// testing::Values(notOptimizedCPUSpec)); -// -//INSTANTIATE_TEST_SUITE_P(smoke_SoftMax_CPU, SoftMaxLayerCPUTest, NotOptimizedParams, SoftMaxLayerCPUTest::getTestCaseName); -// -//const auto UnsupportedParams = testing::Combine( -// testing::Values(Precision::FP32, Precision::BF16), -// testing::ValuesIn(unsupportedConfigsFP32), -// testing::Values(CommonTestUtils::DEVICE_CPU), -// testing::Values(notOptimizedCPUSpec)); -// -//INSTANTIATE_TEST_SUITE_P(smoke_SoftMax_Unsupported_CPU, SoftMaxLayerCPUTest, UnsupportedParams, SoftMaxLayerCPUTest::getTestCaseName); -// -//} // namespace -//} // namespace CPULayerTestsDefinitions + +#include + +#include "shared_test_classes/base/ov_subgraph.hpp" +#include "test_utils/cpu_test_utils.hpp" + +using namespace InferenceEngine; +using namespace CPUTestUtils; +using namespace ov::test; + +namespace CPULayerTestsDefinitions { + +struct SoftMaxConfig { + ov::test::InputShape inputShape; + size_t axis; +}; + +typedef std::tuple + softmaxCPUTestParams; + +class SoftMaxLayerCPUTest : public testing::WithParamInterface, + virtual public SubgraphBaseTest, + public CPUTestsBase { +public: + static std::string getTestCaseName(const testing::TestParamInfo& obj) { + CPUSpecificParams cpuParams; + ElementType inType; + SoftMaxConfig config; + std::string targetDevice; + std::tie(inType, config, targetDevice, cpuParams) = obj.param; + + std::ostringstream result; + result << "netPRC=" << inType << 
"_"; + result << "IS=" << CommonTestUtils::partialShape2str({config.inputShape.first}) << "_"; + result << "TS="; + for (const auto& shape : config.inputShape.second) { + result << "("; + result << CommonTestUtils::vec2str(shape); + result << ")_"; + } + result << "axis=" << config.axis << "_"; + result << "trgDev=" << targetDevice; + result << CPUTestsBase::getTestCaseName(cpuParams); + + return result.str(); + } + +protected: + void SetUp() override { + ElementType inType; + SoftMaxConfig config; + CPUSpecificParams cpuParams; + std::tie(inType, config, targetDevice, cpuParams) = this->GetParam(); + + std::tie(inFmts, outFmts, priority, selectedType) = cpuParams; + if (selectedType.empty()) { + selectedType = getPrimitiveType(); + } + + if (inType == ElementType::bf16) { + rel_threshold = 1e-2f; + } + selectedType = makeSelectedTypeStr(selectedType, inType); + init_input_shapes({config.inputShape}); + auto params = ngraph::builder::makeDynamicParams(inType, inputDynamicShapes); + + const auto paramOuts = + ngraph::helpers::convert2OutputVector(ngraph::helpers::castOps2Nodes(params)); + + const auto softMax = std::make_shared(paramOuts.at(0), config.axis); + + function = makeNgraphFunction(inType, params, softMax, "SoftMax"); + } +}; + +TEST_P(SoftMaxLayerCPUTest, CompareWithRefs) { + SKIP_IF_CURRENT_TEST_IS_DISABLED() + run(); + CheckPluginRelatedResults(executableNetwork, "Softmax"); +} + +namespace { +// not optimized cpu spec +const auto notOptimizedCPUSpec = CPUSpecificParams{{}, {}, {"ref_any"}, "ref_any"}; + +const std::vector optimizedConfigsFP32 = { + // Static shapes + {ov::test::InputShape{ov::PartialShape{1, 100}, {ov::Shape{1, 100}}}, 1}, + {ov::test::InputShape{ov::PartialShape{10, 10}, {ov::Shape{10, 10}}}, 1}, + {ov::test::InputShape{ov::PartialShape{100, 1}, {ov::Shape{100, 1}}}, 0}, + {ov::test::InputShape{ov::PartialShape{100, 1}, {ov::Shape{100, 1}}}, 1}, + {ov::test::InputShape{ov::PartialShape{5, 5, 1}, {ov::Shape{5, 5, 1}}}, 1}, + 
{ov::test::InputShape{ov::PartialShape{5, 5, 5}, {ov::Shape{5, 5, 5}}}, 2}, + {ov::test::InputShape{ov::PartialShape{5, 5, 5, 5}, {ov::Shape{5, 5, 5, 5}}}, 0}, + {ov::test::InputShape{ov::PartialShape{5, 5, 1, 1}, {ov::Shape{5, 5, 1, 1}}}, 1}, + {ov::test::InputShape{ov::PartialShape{5, 5, 5, 5}, {ov::Shape{5, 5, 5, 5}}}, 1}, + {ov::test::InputShape{ov::PartialShape{5, 5, 5, 1}, {ov::Shape{5, 5, 5, 1}}}, 2}, + {ov::test::InputShape{ov::PartialShape{5, 5, 5, 5}, {ov::Shape{5, 5, 5, 5}}}, 2}, + {ov::test::InputShape{ov::PartialShape{5, 5, 5, 5}, {ov::Shape{5, 5, 5, 5}}}, 3}, + {ov::test::InputShape{ov::PartialShape{5, 5, 5, 5, 5}, {ov::Shape{5, 5, 5, 5, 5}}}, 0}, + {ov::test::InputShape{ov::PartialShape{5, 5, 1, 1, 1}, {ov::Shape{5, 5, 1, 1, 1}}}, 1}, + {ov::test::InputShape{ov::PartialShape{5, 5, 5, 5, 5}, {ov::Shape{5, 5, 5, 5, 5}}}, 1}, + {ov::test::InputShape{ov::PartialShape{5, 5, 5, 1, 1}, {ov::Shape{5, 5, 5, 1, 1}}}, 2}, + {ov::test::InputShape{ov::PartialShape{5, 5, 5, 5, 5}, {ov::Shape{5, 5, 5, 5, 5}}}, 2}, + {ov::test::InputShape{ov::PartialShape{5, 5, 5, 1, 1}, {ov::Shape{5, 5, 5, 1, 1}}}, 3}, + {ov::test::InputShape{ov::PartialShape{5, 5, 5, 5, 5}, {ov::Shape{5, 5, 5, 5, 5}}}, 3}, + {ov::test::InputShape{ov::PartialShape{5, 5, 5, 5, 1}, {ov::Shape{5, 5, 5, 5, 1}}}, 4}, + {ov::test::InputShape{ov::PartialShape{5, 5, 5, 5, 5}, {ov::Shape{5, 5, 5, 5, 5}}}, 4}, + // Dynamic shapes + {ov::test::InputShape{// dynamic shape + ov::PartialShape{-1, -1}, + {// target static shapes + ov::Shape{10, 10}, + ov::Shape{15, 15}, + ov::Shape{10, 10}, + ov::Shape{10, 5}}}, + 1}, + {ov::test::InputShape{// dynamic shape + ov::PartialShape{-1, -1, 1, 1, 1}, + {// target static shapes + ov::Shape{5, 5, 1, 1, 1}, + ov::Shape{10, 7, 1, 1, 1}, + ov::Shape{5, 5, 1, 1, 1}}}, + 1}, +}; + +const std::vector notOptimizedConfigsFP32{ + // Static shapes + {ov::test::InputShape{ov::PartialShape{1, 100}, {ov::Shape{1, 100}}}, 0}, + {ov::test::InputShape{ov::PartialShape{10, 10}, 
{ov::Shape{10, 10}}}, 0}, + {ov::test::InputShape{ov::PartialShape{10, 10, 10}, {ov::Shape{10, 10, 10}}}, 0}, + {ov::test::InputShape{ov::PartialShape{10, 10, 10}, {ov::Shape{10, 10, 10}}}, 1}, + // Dynamic shapes + {ov::test::InputShape{// dynamic shape + ov::PartialShape{-1, -1}, + {// target static shapes + ov::Shape{10, 1}, + ov::Shape{15, 15}, + ov::Shape{10, 5}, + ov::Shape{15, 15}}}, + 0}, + {ov::test::InputShape{// dynamic shape + ov::PartialShape{ov::Dimension{1, 100}, ov::Dimension{1, 100}, -1}, + {// target static shapes + ov::Shape{10, 10, 10}, + ov::Shape{10, 10, 1}, + ov::Shape{10, 5, 10}, + ov::Shape{10, 10, 1}}}, + 1}, +}; + +const std::vector unsupportedConfigsFP32{ + // Static shapes + {ov::test::InputShape{ov::PartialShape{5, 5, 5, 5, 5, 5}, {ov::Shape{5, 5, 5, 5, 5, 5}}}, 0}, + {ov::test::InputShape{ov::PartialShape{5, 5, 5, 5, 5, 5}, {ov::Shape{5, 5, 5, 5, 5, 5}}}, 1}, + {ov::test::InputShape{ov::PartialShape{5, 5, 5, 5, 5, 5}, {ov::Shape{5, 5, 5, 5, 5, 5}}}, 2}, + {ov::test::InputShape{ov::PartialShape{5, 5, 5, 5, 5, 5}, {ov::Shape{5, 5, 5, 5, 5, 5}}}, 3}, + {ov::test::InputShape{ov::PartialShape{5, 5, 5, 5, 5, 5}, {ov::Shape{5, 5, 5, 5, 5, 5}}}, 4}, + {ov::test::InputShape{ov::PartialShape{5, 5, 5, 5, 5, 5}, {ov::Shape{5, 5, 5, 5, 5, 5}}}, 5}, + // Dynamic shapes + {ov::test::InputShape{// dynamic shape + ov::PartialShape{-1, -1, -1, -1, -1, -1}, + {// target static shapes + ov::Shape{5, 5, 5, 5, 5, 5}, + ov::Shape{7, 7, 7, 7, 7, 7}, + ov::Shape{5, 5, 5, 5, 5, 5}}}, + 4}, +}; + +const auto avx512 = CPUSpecificParams{{}, {}, {"jit_avx512"}, "jit_avx512"}; +const auto avx2 = CPUSpecificParams{{}, {}, {"jit_avx2"}, "jit_avx2"}; +const auto sse42 = CPUSpecificParams{{}, {}, {"jit_sse42"}, "jit_sse42"}; +const auto ref = CPUSpecificParams{{}, {}, {"ref_any"}, "ref_any"}; + +const std::vector vecCpuConfigs = {ref, sse42, avx2, avx512}; +const auto OptimizedParams = testing::Combine(testing::Values(ElementType::f32, ElementType::bf16), + 
testing::ValuesIn(optimizedConfigsFP32), + testing::Values(CommonTestUtils::DEVICE_CPU), + testing::ValuesIn(filterCPUInfoForDevice(vecCpuConfigs))); + +INSTANTIATE_TEST_SUITE_P(smoke_SoftMax_Optimized_CPU, + SoftMaxLayerCPUTest, + OptimizedParams, + SoftMaxLayerCPUTest::getTestCaseName); + +const auto NotOptimizedParams = testing::Combine(testing::Values(ElementType::f32, ElementType::bf16), + testing::ValuesIn(notOptimizedConfigsFP32), + testing::Values(CommonTestUtils::DEVICE_CPU), + testing::Values(notOptimizedCPUSpec)); + +INSTANTIATE_TEST_SUITE_P(smoke_SoftMax_CPU, + SoftMaxLayerCPUTest, + NotOptimizedParams, + SoftMaxLayerCPUTest::getTestCaseName); + +const auto UnsupportedParams = testing::Combine(testing::Values(ElementType::f32, ElementType::bf16), + testing::ValuesIn(unsupportedConfigsFP32), + testing::Values(CommonTestUtils::DEVICE_CPU), + testing::Values(notOptimizedCPUSpec)); + +INSTANTIATE_TEST_SUITE_P(smoke_SoftMax_Unsupported_CPU, + SoftMaxLayerCPUTest, + UnsupportedParams, + SoftMaxLayerCPUTest::getTestCaseName); + +} // namespace +} // namespace CPULayerTestsDefinitions From 0c2b53eba3d68d5a1b33faa3897836f3cbcb5993 Mon Sep 17 00:00:00 2001 From: Vladimir Paramuzov Date: Mon, 10 Jan 2022 20:14:22 +0300 Subject: [PATCH 59/78] [GPU] Moved several module tests to proper folder (#9544) --- .../tests/module_tests/test_module_fusing_reorder.cpp | 4 +++- .../intel_gpu}/tests/module_tests/test_program_helpers.cpp | 0 .../{src => }/tests/test_cases/convert_color_gpu_test.cpp | 0 .../experimental_detectron_roi_feature_extractor_gpu_test.cpp | 0 .../{src => }/tests/test_cases/random_uniform_gpu_test.cpp | 0 .../intel_gpu/{src => }/tests/test_cases/range_gpu_test.cpp | 0 src/plugins/intel_gpu/{src => }/tests/test_cases/slice.cpp | 0 7 files changed, 3 insertions(+), 1 deletion(-) rename {inference-engine/thirdparty/clDNN => src/plugins/intel_gpu}/tests/module_tests/test_module_fusing_reorder.cpp (99%) rename {inference-engine/thirdparty/clDNN => 
src/plugins/intel_gpu}/tests/module_tests/test_program_helpers.cpp (100%) rename src/plugins/intel_gpu/{src => }/tests/test_cases/convert_color_gpu_test.cpp (100%) rename src/plugins/intel_gpu/{src => }/tests/test_cases/experimental_detectron_roi_feature_extractor_gpu_test.cpp (100%) rename src/plugins/intel_gpu/{src => }/tests/test_cases/random_uniform_gpu_test.cpp (100%) rename src/plugins/intel_gpu/{src => }/tests/test_cases/range_gpu_test.cpp (100%) rename src/plugins/intel_gpu/{src => }/tests/test_cases/slice.cpp (100%) diff --git a/inference-engine/thirdparty/clDNN/tests/module_tests/test_module_fusing_reorder.cpp b/src/plugins/intel_gpu/tests/module_tests/test_module_fusing_reorder.cpp similarity index 99% rename from inference-engine/thirdparty/clDNN/tests/module_tests/test_module_fusing_reorder.cpp rename to src/plugins/intel_gpu/tests/module_tests/test_module_fusing_reorder.cpp index a36b3c8ad22..8e32a0ec603 100644 --- a/inference-engine/thirdparty/clDNN/tests/module_tests/test_module_fusing_reorder.cpp +++ b/src/plugins/intel_gpu/tests/module_tests/test_module_fusing_reorder.cpp @@ -114,7 +114,7 @@ TEST(test_can_fuse_reorder, reorder_for_mixed_type_convolution_fsv32_cldnn) } } - +namespace { struct reorder_test_param { format input_format; format output_format; @@ -130,6 +130,8 @@ struct reorder_test_param { bool expected_result; }; +} // namespace namespace + template class ReorderTest : public ::testing::TestWithParam { public: diff --git a/inference-engine/thirdparty/clDNN/tests/module_tests/test_program_helpers.cpp b/src/plugins/intel_gpu/tests/module_tests/test_program_helpers.cpp similarity index 100% rename from inference-engine/thirdparty/clDNN/tests/module_tests/test_program_helpers.cpp rename to src/plugins/intel_gpu/tests/module_tests/test_program_helpers.cpp diff --git a/src/plugins/intel_gpu/src/tests/test_cases/convert_color_gpu_test.cpp b/src/plugins/intel_gpu/tests/test_cases/convert_color_gpu_test.cpp similarity index 100% rename from 
src/plugins/intel_gpu/src/tests/test_cases/convert_color_gpu_test.cpp rename to src/plugins/intel_gpu/tests/test_cases/convert_color_gpu_test.cpp diff --git a/src/plugins/intel_gpu/src/tests/test_cases/experimental_detectron_roi_feature_extractor_gpu_test.cpp b/src/plugins/intel_gpu/tests/test_cases/experimental_detectron_roi_feature_extractor_gpu_test.cpp similarity index 100% rename from src/plugins/intel_gpu/src/tests/test_cases/experimental_detectron_roi_feature_extractor_gpu_test.cpp rename to src/plugins/intel_gpu/tests/test_cases/experimental_detectron_roi_feature_extractor_gpu_test.cpp diff --git a/src/plugins/intel_gpu/src/tests/test_cases/random_uniform_gpu_test.cpp b/src/plugins/intel_gpu/tests/test_cases/random_uniform_gpu_test.cpp similarity index 100% rename from src/plugins/intel_gpu/src/tests/test_cases/random_uniform_gpu_test.cpp rename to src/plugins/intel_gpu/tests/test_cases/random_uniform_gpu_test.cpp diff --git a/src/plugins/intel_gpu/src/tests/test_cases/range_gpu_test.cpp b/src/plugins/intel_gpu/tests/test_cases/range_gpu_test.cpp similarity index 100% rename from src/plugins/intel_gpu/src/tests/test_cases/range_gpu_test.cpp rename to src/plugins/intel_gpu/tests/test_cases/range_gpu_test.cpp diff --git a/src/plugins/intel_gpu/src/tests/test_cases/slice.cpp b/src/plugins/intel_gpu/tests/test_cases/slice.cpp similarity index 100% rename from src/plugins/intel_gpu/src/tests/test_cases/slice.cpp rename to src/plugins/intel_gpu/tests/test_cases/slice.cpp From b744c11b88ba4c0a3956df587c8d734fec5b9b53 Mon Sep 17 00:00:00 2001 From: Vladimir Zinoviev Date: Mon, 10 Jan 2022 21:09:10 +0300 Subject: [PATCH 60/78] [LPT] INT16, INT32 leftovers (#7653) --- .../low_precision/layer_transformation.hpp | 69 ++++++++++++++----- .../src/fake_quantize_decomposition.cpp | 8 +-- .../src/layer_transformation.cpp | 52 +++++++------- .../src/low_precision.cpp | 10 ++- .../src/network_helper.cpp | 9 ++- .../src/quantization_details.cpp | 8 ++- 
src/plugins/intel_cpu/src/mkldnn_plugin.cpp | 9 +-- 7 files changed, 101 insertions(+), 64 deletions(-) diff --git a/src/common/low_precision_transformations/include/low_precision/layer_transformation.hpp b/src/common/low_precision_transformations/include/low_precision/layer_transformation.hpp index dfd87cac8a4..7befc214a7d 100644 --- a/src/common/low_precision_transformations/include/low_precision/layer_transformation.hpp +++ b/src/common/low_precision_transformations/include/low_precision/layer_transformation.hpp @@ -38,15 +38,34 @@ namespace ngraph { namespace pass { namespace low_precision { - +namespace precision_set { + const std::vector int8_support = { + ngraph::element::u8, ngraph::element::i8 + }; + const std::vector int8_int16_int32_support = { + ngraph::element::u8, ngraph::element::i8, + ngraph::element::u16, ngraph::element::i16, + ngraph::element::u32, ngraph::element::i32 + }; +} +enum levels : size_t { + int4 = 16, + int4_narrow_range = 15, + int8 = 256, + int8_narrow_range = 255, + int16 = 65536, + int16_narrow_range = 65535, + int32 = size_t(4294967296), // for ARM and ia32 platforms where this number bigger than size_t but never used + int32_narrow_range = 4294967295 +}; class LP_TRANSFORMATIONS_API DataPrecision { public: DataPrecision() : precision(element::undefined), min(0.f), max(0.f), hasZeroPoint(false) {} explicit DataPrecision(const element::Type& precision) { this->precision = precision; - min = getMinValue(precision, 256); - max = getMaxValue(precision, 256); + min = getMinValue(precision, levels::int8); + max = getMaxValue(precision, levels::int8); hasZeroPoint = false; } @@ -66,7 +85,7 @@ public: element::i16, element::u16, element::i32, element::u32 }; - return lowPrecision.count(precision) == 1; + return lowPrecision.find(precision) != lowPrecision.end(); } static float getMinValue(const element::Type precision, const size_t levels) { @@ -80,17 +99,31 @@ public: return -8.f; case element::i8: switch (levels) { - case 16: + case 
low_precision::levels::int4: return -8.f; - case 255: - return -127.f; - default: + case low_precision::levels::int4_narrow_range: + return -7.f; + case low_precision::levels::int8: return -128.f; + case low_precision::levels::int8_narrow_range: + return -127.f; } case element::i16: - return levels == 65535 ? -32767.f : -32768.f; + switch (levels) { + case low_precision::levels::int16: + return -32768.f; + case low_precision::levels::int16_narrow_range: + return -32767.f; + } + break; case element::i32: - return -2147483647.f; // -2147483647.f == -2147483648.f + switch (levels) { + case low_precision::levels::int32: + return -2147483648.f; + case low_precision::levels::int32_narrow_range: + return -2147483647.f; + } + break; case element::f16: return -1.0e15f; case element::f32: @@ -140,14 +173,14 @@ public: // Return maximum value for quantization level. Quantization level is maximum value for precision. static float getMaxValue(const size_t maxLevelsForPrecision) { - if (maxLevelsForPrecision == 255ul) { - return 254.f; - } else if (maxLevelsForPrecision == 256ul) { - return 255.f; - } else if (maxLevelsForPrecision == 16ul) { - return 15.f; - } else if (maxLevelsForPrecision == 15ul) { - return 14.f; + std::set validLevels = { + levels::int4, levels::int4_narrow_range, + levels::int8, levels::int8_narrow_range, + levels::int16, levels::int16_narrow_range, + levels::int32, levels::int32_narrow_range + }; + if (validLevels.find(maxLevelsForPrecision) != validLevels.end()) { + return maxLevelsForPrecision - 1.f; } else { THROW_TRANSFORMATION_EXCEPTION << "unexpected quantization level " << maxLevelsForPrecision; } diff --git a/src/common/low_precision_transformations/src/fake_quantize_decomposition.cpp b/src/common/low_precision_transformations/src/fake_quantize_decomposition.cpp index 6bbb398bca5..43b67d205f3 100644 --- a/src/common/low_precision_transformations/src/fake_quantize_decomposition.cpp +++ 
b/src/common/low_precision_transformations/src/fake_quantize_decomposition.cpp @@ -122,12 +122,12 @@ DataPrecision getDataPrecisionByOutputPort(std::shared_ptr const auto& precisions = precisionsAttribute.as().value(); std::vector precisionsForLevels{}; switch (levels) { - case 65536: - case 65535: + case low_precision::levels::int16: + case low_precision::levels::int16_narrow_range: precisionsForLevels = {element::u16, element::i16}; break; - case static_cast(4294967296): - case 4294967295: + case low_precision::levels::int32: + case low_precision::levels::int32_narrow_range: precisionsForLevels = {element::u32, element::i32}; break; default: diff --git a/src/common/low_precision_transformations/src/layer_transformation.cpp b/src/common/low_precision_transformations/src/layer_transformation.cpp index fb7c91b1b21..1ba9700255e 100644 --- a/src/common/low_precision_transformations/src/layer_transformation.cpp +++ b/src/common/low_precision_transformations/src/layer_transformation.cpp @@ -24,7 +24,7 @@ namespace low_precision { constexpr char LayerTransformation::originalLayerPostfix[]; // order defines default precision -std::vector LayerTransformation::defaultPrecisions = { ngraph::element::u8, ngraph::element::i8 }; +std::vector LayerTransformation::defaultPrecisions = precision_set::int8_support; std::mutex LayerTransformation::defaultPrecisionsMutex; LayerTransformation::LayerTransformation(const Params& params) : @@ -210,6 +210,9 @@ LayerTransformation::PrecisionDetails LayerTransformation::getPrecisionDetails( bool hasZeroPoint = false; bool thereIsAtLeastOneNormalValue = false; + + std::vector fullRangeLevels = { levels::int4, levels::int8, levels::int16, levels::int32 }; + for (size_t i = 0; i < outputLowValues.size(); ++i) { if ((std::fabs(outputLowValues[i]) < zeroThreshold) && (std::fabs(outputHighValues[i]) < zeroThreshold)) { // both values are too small to identify preferable precision @@ -226,9 +229,8 @@ LayerTransformation::PrecisionDetails 
LayerTransformation::getPrecisionDetails( hasNegative = true; if (outputHighValues[i] != 0.f) { - const float expectedRatio = - (quantizationLevels == 16 || quantizationLevels == 256 || - quantizationLevels == 65536 || quantizationLevels == 4294967296) ? asymmetricIntervalSideRatio : -1.f; + auto it = std::find(fullRangeLevels.begin(), fullRangeLevels.end(), quantizationLevels); + const float expectedRatio = it != fullRangeLevels.end() ? asymmetricIntervalSideRatio : -1.f; const float actualRatio = outputLowValues[i] / outputHighValues[i]; const float actual = std::fabs((actualRatio - expectedRatio) / std::min(actualRatio, expectedRatio)); if (actual > quantizationIntervalAsymmetryThreshold) { @@ -272,37 +274,35 @@ LayerTransformation::PrecisionDetails LayerTransformation::getPrecisionDetails( if (!hasZeroPoint) { if (signedPrecision && (!unsignedPrecision)) { switch (quantizationLevels) { - case 256: - case 255: - case 16: + case levels::int4: + case levels::int8: + case levels::int8_narrow_range: resultPrecision = element::i8; break; - case 65536: - case 65535: + case levels::int16: + case levels::int16_narrow_range: resultPrecision = element::i16; break; - case static_cast(4294967296): - case 4294967295: + case levels::int32: + case levels::int32_narrow_range: resultPrecision = element::i32; - break; } } if ((!signedPrecision) && unsignedPrecision) { switch (quantizationLevels) { - case 256: - case 255: - case 16: + case levels::int4: + case levels::int8: + case levels::int8_narrow_range: resultPrecision = element::u8; break; - case 65536: - case 65535: + case levels::int16: + case levels::int16_narrow_range: resultPrecision = element::u16; break; - case static_cast(4294967296): - case 4294967295: + case levels::int32: + case levels::int32_narrow_range: resultPrecision = element::u32; - break; } } } @@ -337,16 +337,16 @@ DataPrecision LayerTransformation::getDataPrecision( std::vector resultPrecisions = precisions; std::vector FQPrecisions; switch 
(quantizationDetails.levels) { - case 255: - case 256: + case levels::int8: + case levels::int8_narrow_range: FQPrecisions = {element::u8, element::i8}; break; - case 65535: - case 65536: + case levels::int16: + case levels::int16_narrow_range: FQPrecisions = {element::u16, element::i16}; break; - case 4294967295: - case static_cast(4294967296): + case levels::int32: + case levels::int32_narrow_range: FQPrecisions = {element::u32, element::i32}; } resultPrecisions = NetworkHelper::precisionIntersection(precisions, FQPrecisions); diff --git a/src/common/low_precision_transformations/src/low_precision.cpp b/src/common/low_precision_transformations/src/low_precision.cpp index 5d21e3d4bd3..5d5bb3a02a5 100644 --- a/src/common/low_precision_transformations/src/low_precision.cpp +++ b/src/common/low_precision_transformations/src/low_precision.cpp @@ -292,12 +292,10 @@ bool ngraph::pass::low_precision::LowPrecision::isFQLevelsPresent( const std::set& levels) { std::vector> nodes = function->get_ops(); for (auto& node : nodes) { - for (size_t i = 0; i < node->inputs().size(); ++i) { - const auto fakeQuantize = as_type_ptr(node); - if (fakeQuantize != nullptr) { - if (levels.count(fakeQuantize->get_levels()) == 1) { - return true; - } + const auto fakeQuantize = as_type_ptr(node); + if (fakeQuantize != nullptr) { + if (levels.count(fakeQuantize->get_levels()) == 1) { + return true; } } } diff --git a/src/common/low_precision_transformations/src/network_helper.cpp b/src/common/low_precision_transformations/src/network_helper.cpp index d7b5a9890d0..492de2f0e47 100644 --- a/src/common/low_precision_transformations/src/network_helper.cpp +++ b/src/common/low_precision_transformations/src/network_helper.cpp @@ -22,6 +22,7 @@ #include "low_precision/rt_info/precision_preserved_attribute.hpp" #include "low_precision/rt_info/intervals_alignment_attribute.hpp" #include "low_precision/rt_info/quantization_alignment_attribute.hpp" +#include "ngraph/opsets/opset6.hpp" namespace ngraph { 
namespace pass { @@ -61,7 +62,9 @@ bool NetworkHelper::isConstantPath(const std::shared_ptr& op) { ov::is_type(node) || ov::is_type(node) || ov::is_type(node) || - ov::is_type(node); + ov::is_type(node) || + ov::is_type(node) || + ov::is_type(node); }; if (isNotConstantPathOperation(op)) { @@ -1730,8 +1733,8 @@ bool NetworkHelper::checkZeroPoint(const std::shared_ptr& node, const Data const auto intNode = ov::is_type(parent) ? parent : node; const auto type = intNode->get_input_element_type(0); if (type == element::u8 || type == element::i8) { - min = DataPrecision::getMinValue(type, 256) - 0.5f; - max = DataPrecision::getMaxValue(type, 256) + 0.5f; + min = DataPrecision::getMinValue(type, levels::int8) - 0.5f; + max = DataPrecision::getMaxValue(type, levels::int8) + 0.5f; } else { return type == element::f32 || type == element::f16; } diff --git a/src/common/low_precision_transformations/src/quantization_details.cpp b/src/common/low_precision_transformations/src/quantization_details.cpp index ee5ed04ad26..cec290ae3c6 100644 --- a/src/common/low_precision_transformations/src/quantization_details.cpp +++ b/src/common/low_precision_transformations/src/quantization_details.cpp @@ -19,6 +19,7 @@ #include #include +#include namespace ngraph { namespace pass { @@ -162,7 +163,12 @@ bool QuantizationDetails::empty() const noexcept { } bool QuantizationDetails::isSupportedLevel(const size_t level) { - static const std::unordered_set supported_levels = { 16, 255, 256, 65536, 65535, static_cast(4294967296), 4294967295 }; + static const std::unordered_set supported_levels = { + levels::int4, levels::int4_narrow_range, + levels::int8, levels::int8_narrow_range, + levels::int16, levels::int16_narrow_range, + levels::int32, levels::int32_narrow_range + }; return supported_levels.find(level) != supported_levels.end(); } diff --git a/src/plugins/intel_cpu/src/mkldnn_plugin.cpp b/src/plugins/intel_cpu/src/mkldnn_plugin.cpp index 6b3640290d8..9f189e69bb1 100644 --- 
a/src/plugins/intel_cpu/src/mkldnn_plugin.cpp +++ b/src/plugins/intel_cpu/src/mkldnn_plugin.cpp @@ -446,14 +446,11 @@ static void TransformationUpToCPUSpecificOpSet(std::shared_ptr bool updatePrecision = true; bool hasINT16orINT32Levels = ngraph::pass::low_precision::LowPrecision::isFQLevelsPresent( nGraphFunc, - {65535, 65536, 4294967295, 4294967296}); + {levels::int16, levels::int16_narrow_range, + levels::int32, levels::int32_narrow_range}); if (hasINT16orINT32Levels) { updatePrecision = false; - LowPrecision::setDefaultPrecisions({ - ngraph::element::u8, ngraph::element::i8, - ngraph::element::u16, ngraph::element::i16, - ngraph::element::u32, ngraph::element::i32, - }); + LowPrecision::setDefaultPrecisions(precision_set::int8_int16_int32_support); supportedPrecisions = std::vector({}); } From b4bd4e743bb0c8f59cc37363ee7f723e604f2063 Mon Sep 17 00:00:00 2001 From: Vladimir Paramuzov Date: Mon, 10 Jan 2022 21:44:11 +0300 Subject: [PATCH 61/78] [GPU] Fixed uninitialized field issue in pooling (#9542) --- src/plugins/intel_gpu/include/intel_gpu/primitives/pooling.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/plugins/intel_gpu/include/intel_gpu/primitives/pooling.hpp b/src/plugins/intel_gpu/include/intel_gpu/primitives/pooling.hpp index b228f9bb601..2f672f4c17e 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/primitives/pooling.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/primitives/pooling.hpp @@ -233,7 +233,7 @@ struct pooling : public primitive_base { /// @brief first dimension of input that should be used to calculate the upper bound of index output int64_t axis; /// @brief type of index output - data_types index_element_type; + data_types index_element_type = data_types::i32; bool maxPoolOpset8Features{false}; protected: From fc4185e92a7310e5accbcf53dc30169ba7b38567 Mon Sep 17 00:00:00 2001 From: Fedor Zharinov Date: Mon, 10 Jan 2022 23:37:46 +0300 Subject: [PATCH 62/78] Compiled network loading is fixed (#9547) * compiled 
network loading is fixed * StyleFix --- samples/cpp/benchmark_app/main.cpp | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/samples/cpp/benchmark_app/main.cpp b/samples/cpp/benchmark_app/main.cpp index 68f138f2aa5..7e32df4320f 100644 --- a/samples/cpp/benchmark_app/main.cpp +++ b/samples/cpp/benchmark_app/main.cpp @@ -516,7 +516,14 @@ int main(int argc, char* argv[]) { // -------------------------------------------------------- next_step(); auto startTime = Time::now(); - compiledModel = core.compile_model(FLAGS_m, device_name, {}); + + std::ifstream modelStream(FLAGS_m); + if (!modelStream.is_open()) { + throw std::runtime_error("Cannot open model file " + FLAGS_m); + } + compiledModel = core.import_model(modelStream, device_name, {}); + modelStream.close(); + auto duration_ms = double_to_string(get_duration_ms_till_now(startTime)); slog::info << "Import network took " << duration_ms << " ms" << slog::endl; if (statistics) From e0485c1ad2c5093fd2f217d16349d3bbbd7fc4cf Mon Sep 17 00:00:00 2001 From: "Min, Byungil" Date: Tue, 11 Jan 2022 13:18:40 +0900 Subject: [PATCH 63/78] Add reshape in front of a fully connected node for using bf input (#9449) Signed-off-by: Min, Byungil --- .../graph/graph_optimizer/prepare_padding.cpp | 2 +- .../graph/graph_optimizer/reorder_inputs.cpp | 24 ++++++++++++++++--- .../test_cases/fully_connected_gpu_test.cpp | 3 ++- 3 files changed, 24 insertions(+), 5 deletions(-) diff --git a/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_padding.cpp b/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_padding.cpp index 190a32824dd..9daaba2df21 100644 --- a/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_padding.cpp +++ b/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_padding.cpp @@ -28,7 +28,7 @@ void prepare_padding::run(program& p) { continue; auto add_required_padding = [&p](program_node& node, padding& needed_padding) { - // Add extra reorder for cldnn primitive to handle required 
padding if needed + // Add extra reorder if a previous node or one of its user nodes is an onednn kernel not to add padding to the onednn kernel auto& input = node.get_dependency(0); bool is_usr_onednn = false; for (auto& input_usr : input.get_users()) diff --git a/src/plugins/intel_gpu/src/graph/graph_optimizer/reorder_inputs.cpp b/src/plugins/intel_gpu/src/graph/graph_optimizer/reorder_inputs.cpp index 4e2b1892b60..71a31fa3246 100644 --- a/src/plugins/intel_gpu/src/graph/graph_optimizer/reorder_inputs.cpp +++ b/src/plugins/intel_gpu/src/graph/graph_optimizer/reorder_inputs.cpp @@ -13,6 +13,7 @@ #include "binary_convolution_inst.h" #include "mvn_inst.h" #include "to_string_utils.h" +#include "reshape_inst.h" #include #include @@ -575,14 +576,31 @@ void reorder_inputs::run(program& p, layout_optimizer& lo, reorder_factory& rf) } }; + const auto reorder_input_fully_connected = [&p, &lo, &rf](typed_program_node& fc_node) { + auto& weights = fc_node.weights(); + auto& input = fc_node.input(); + auto input_layout = input.get_output_layout(); + // Change input data of fully-connected node from bx to bf + if (format::is_simple_data_format(input_layout.format) && weights.is_constant() && input_layout.format.dimension() == 4 && + input_layout.size.feature[0] == 1 && input_layout.size.spatial[0] != 1 && input_layout.size.spatial[1] == 1) { + auto new_tensor = input_layout.size; + new_tensor.feature[0] = input_layout.size.spatial[0]; + new_tensor.spatial[0] = 1; + auto new_reshape = std::make_shared("reorder:Reshape_bf_" + fc_node.id() + "_for_input", input.id(), new_tensor); + auto& new_reorder_node = p.get_or_create(new_reshape); + p.add_intermediate(new_reorder_node, fc_node, 0); + } + }; + for (auto& prim : p.get_processing_order()) { - program_helpers::do_for_types( + program_helpers::do_for_types( *prim, reorder_input_detection_output, reorder_input_binary_convolution, reorder_input_and_weights_deconvolution, - reorder_weights_convolution); - } + 
reorder_weights_convolution, + reorder_input_fully_connected); + } for (auto n : p.get_processing_order()) { if (n->is_in_data_flow() && fmt_map.count(n) != 0) { diff --git a/src/plugins/intel_gpu/tests/test_cases/fully_connected_gpu_test.cpp b/src/plugins/intel_gpu/tests/test_cases/fully_connected_gpu_test.cpp index 22844430c86..d42cbd86683 100644 --- a/src/plugins/intel_gpu/tests/test_cases/fully_connected_gpu_test.cpp +++ b/src/plugins/intel_gpu/tests/test_cases/fully_connected_gpu_test.cpp @@ -1618,8 +1618,9 @@ TEST(fully_connected_onednn_gpu, no_biases_int8) { auto& engine = get_onednn_test_engine(); + // Change input data of fully-connected node from bx to bf auto input_prim = engine.allocate_memory({ data_types::f32, format::bfyx, { input_b, 1, input_x, 1 } }); - auto weights_prim = engine.allocate_memory({ data_types::i8, format::bfyx, { weight_b, 1, weight_x, 1 } }); + auto weights_prim = engine.allocate_memory({ data_types::i8, format::bfyx, { weight_b, weight_x, 1, 1 } }); set_values(input_prim, { 8.4f, 2.3f, -4.49f }); set_values(weights_prim, { 2, 1, 0, -3, -2, 1, 0, -2, -4, -5, 10, 8 }); From e095a90cdfd50c8c7ea4983b0dd55653d54abab9 Mon Sep 17 00:00:00 2001 From: Ilya Churaev Date: Tue, 11 Jan 2022 09:36:13 +0300 Subject: [PATCH 64/78] Handle names collisions for old IR with new API (#9388) * Handle names collisions for old IR with new API * Fixed load model * Try to fix tests * Try to fix tests * Try to fix build * Try to fix tests * Fixed tests * Revert "Fixed tests" This reverts commit 35da3072104c7b4337842b16f72ff864f390f7d8. 
* Refactoring * Fixed functional test * Try to fix CPU tests Co-authored-by: Ilya Lavrenov --- .../interface/ie_iplugin_internal.hpp | 3 +- .../interface/ie_iplugin_internal.cpp | 43 ++++++--- src/inference/src/ie_core.cpp | 4 +- src/inference/src/ie_network_reader.cpp | 54 +++++++---- .../rt_info_deserialization.cpp | 90 +++++++++++++++++++ .../executable_network/exec_network_base.hpp | 49 +++++++++- .../ov_executable_network/exec_graph_info.hpp | 2 +- .../exec_network_base.hpp | 45 ++++++++++ 8 files changed, 258 insertions(+), 32 deletions(-) diff --git a/src/inference/dev_api/cpp_interfaces/interface/ie_iplugin_internal.hpp b/src/inference/dev_api/cpp_interfaces/interface/ie_iplugin_internal.hpp index 39232e0ef86..0d20e90e87e 100644 --- a/src/inference/dev_api/cpp_interfaces/interface/ie_iplugin_internal.hpp +++ b/src/inference/dev_api/cpp_interfaces/interface/ie_iplugin_internal.hpp @@ -92,7 +92,8 @@ INFERENCE_ENGINE_API_CPP(OutputsDataMap) copyInfo(const OutputsDataMap& networkO */ INFERENCE_ENGINE_API_CPP(void) SetExeNetworkInfo(const std::shared_ptr& exeNetwork, - const std::shared_ptr& function); + const std::shared_ptr& function, + bool new_api); /** * @interface IInferencePlugin diff --git a/src/inference/src/cpp_interfaces/interface/ie_iplugin_internal.cpp b/src/inference/src/cpp_interfaces/interface/ie_iplugin_internal.cpp index 1933b50dded..6c0875337af 100644 --- a/src/inference/src/cpp_interfaces/interface/ie_iplugin_internal.cpp +++ b/src/inference/src/cpp_interfaces/interface/ie_iplugin_internal.cpp @@ -286,18 +286,21 @@ void IInferencePlugin::SetExeNetworkInfo(const std::shared_ptr& exeNetwork, const std::shared_ptr& function) { - InferenceEngine::SetExeNetworkInfo(exeNetwork, function); + bool newAPI = this->GetCore() && this->GetCore()->isNewAPI(); + InferenceEngine::SetExeNetworkInfo(exeNetwork, function, newAPI); exeNetwork->SetPointerToPlugin(shared_from_this()); } void SetExeNetworkInfo(const std::shared_ptr& exeNetwork, - const 
std::shared_ptr& function) { + const std::shared_ptr& function, + bool new_api) { OPENVINO_ASSERT(exeNetwork != nullptr); OPENVINO_ASSERT(function != nullptr); std::vector> const_params; std::vector> const_results; + std::unordered_set leaf_names; bool add_operation_names = false; const auto& rt_info = function->get_rt_info(); const auto it = rt_info.find("version"); @@ -307,6 +310,14 @@ void SetExeNetworkInfo(const std::shared_ptr& exeNet // getInputs / getOutputs. Since these functions are designed to be used in new API only // always need to add operation names for IR v10 add_operation_names = ir_version == 10; + + for (const auto& vals : {function->inputs(), function->outputs()}) { + for (const auto& val : vals) { + for (const auto& name : val.get_names()) { + leaf_names.insert(name); + } + } + } } const auto& inputsInfo = exeNetwork->GetInputsInfo(); @@ -315,14 +326,21 @@ void SetExeNetworkInfo(const std::shared_ptr& exeNet OPENVINO_ASSERT(outputsInfo.size() == function->get_output_size()); for (const auto& param : function->get_parameters()) { + const auto& param_name = param->get_friendly_name(); auto new_param = ov::as_type_ptr(param->copy_with_new_inputs({})); - new_param->set_friendly_name(param->get_friendly_name()); - if (add_operation_names) - new_param->output(0).get_tensor().add_names({new_param->get_friendly_name()}); + new_param->set_friendly_name(param_name); + if (add_operation_names) { + OPENVINO_ASSERT(!new_api || leaf_names.find(param_name) == leaf_names.end() || + param->output(0).get_names().find(param_name) != param->output(0).get_names().end(), + "Model operation names have collisions with tensor names.", + " Please use MO to generate new IR version, it should allow to avoid the issue"); + leaf_names.insert(param_name); + new_param->output(0).get_tensor().add_names({param_name}); + } // WA: use CNNNetwork's precisions since plugins sometimes override their precisions // after transformation pipeline is run new_param->set_element_type( - 
InferenceEngine::details::convertPrecision(inputsInfo.at(new_param->get_friendly_name())->getPrecision())); + InferenceEngine::details::convertPrecision(inputsInfo.at(param_name)->getPrecision())); new_param->set_layout(param->get_layout()); new_param->output(0).get_rt_info() = param->output(0).get_rt_info(); new_param->validate_and_infer_types(); @@ -331,15 +349,20 @@ void SetExeNetworkInfo(const std::shared_ptr& exeNet for (const auto& result : function->get_results()) { auto fake_param = std::make_shared(result->get_output_element_type(0), result->get_output_partial_shape(0)); - const std::string param_name = ngraph::op::util::create_ie_output_name(result->input_value(0)); - fake_param->set_friendly_name(param_name); + const std::string res_name = ngraph::op::util::create_ie_output_name(result->input_value(0)); + fake_param->set_friendly_name(res_name); fake_param->set_element_type( - InferenceEngine::details::convertPrecision(outputsInfo.at(param_name)->getPrecision())); + InferenceEngine::details::convertPrecision(outputsInfo.at(res_name)->getPrecision())); fake_param->validate_and_infer_types(); auto new_result = result->copy_with_new_inputs({fake_param}); new_result->set_friendly_name(result->get_friendly_name()); if (add_operation_names) { - new_result->output(0).get_tensor().add_names({fake_param->get_friendly_name()}); + OPENVINO_ASSERT(!new_api || leaf_names.find(res_name) == leaf_names.end() || + result->output(0).get_names().find(res_name) != result->output(0).get_names().end(), + "Model operation names have collisions with tensor names.", + " Please use MO to generate new IR version, it should allow to avoid the issue"); + leaf_names.insert(res_name); + new_result->output(0).get_tensor().add_names({res_name}); } auto r = std::dynamic_pointer_cast(new_result); OPENVINO_ASSERT(r, "Internal error. 
SetNetworkInfo failure casting output copy to Result"); diff --git a/src/inference/src/ie_core.cpp b/src/inference/src/ie_core.cpp index 86f3fcb89dc..ec9bf929eef 100644 --- a/src/inference/src/ie_core.cpp +++ b/src/inference/src/ie_core.cpp @@ -512,7 +512,7 @@ public: res = compile_model_impl(network, plugin, parsed._config, context, hash); } else { // Temporary workaround until all plugins support caching of original model inputs - InferenceEngine::SetExeNetworkInfo(res._ptr, network.getFunction()); + InferenceEngine::SetExeNetworkInfo(res._ptr, network.getFunction(), isNewAPI()); } } else { res = compile_model_impl(network, plugin, parsed._config, context, {}); @@ -589,7 +589,7 @@ public: res = compile_model_impl(network, plugin, parsed._config, nullptr, hash, {}, forceDisableCache); } else { // Temporary workaround until all plugins support caching of original model inputs - InferenceEngine::SetExeNetworkInfo(res._ptr, network.getFunction()); + InferenceEngine::SetExeNetworkInfo(res._ptr, network.getFunction(), isNewAPI()); } } else { res = compile_model_impl(network, plugin, parsed._config, nullptr, {}, {}, forceDisableCache); diff --git a/src/inference/src/ie_network_reader.cpp b/src/inference/src/ie_network_reader.cpp index bd971a0dfb7..ccd720e24a3 100644 --- a/src/inference/src/ie_network_reader.cpp +++ b/src/inference/src/ie_network_reader.cpp @@ -302,11 +302,32 @@ CNNNetwork convert_to_cnnnetwork(std::shared_ptr& function, const int64_t ir_version = it->second.as(); if (ir_version == 10 && newAPI) { + std::unordered_set leaf_names; const auto inputs = function->inputs(); for (size_t i = 0; i < inputs.size(); ++i) { const auto ngraph_type = inputs[i].get_element_type(); const auto legacy_type = details::toLegacyType(ngraph_type, true); prepost.input(i).tensor().set_element_type(legacy_type); + for (const auto& name : inputs[i].get_names()) { + OPENVINO_ASSERT(leaf_names.find(name) == leaf_names.end(), + "Model tensor names have collisions.", + " Please use 
MO to generate new IR version, it should allow to avoid the issue"); + leaf_names.insert(name); + } + } + + const auto outputs = function->outputs(); + for (size_t i = 0; i < outputs.size(); ++i) { + const auto ngraph_type = outputs[i].get_element_type(); + const auto legacy_type = details::toLegacyType(ngraph_type, false); + + prepost.output(i).tensor().set_element_type(legacy_type); + for (const auto& name : outputs[i].get_names()) { + OPENVINO_ASSERT(leaf_names.find(name) == leaf_names.end(), + "Model tensor names have collisions.", + " Please use MO to generate new IR version, it should allow to avoid the issue"); + leaf_names.insert(name); + } } // in order to support the following scenarios for IR v10 cases: @@ -317,29 +338,28 @@ CNNNetwork convert_to_cnnnetwork(std::shared_ptr& function, // f.reshape({ { "input_operation_name", ov::PartialShape{} } }); // we need to add operation names as tensor names for inputs and outputs { - std::vector result_names; - std::vector> prevPorts; - result_names.reserve(function->get_results().size()); - prevPorts.reserve(function->get_results().size()); - for (const auto& result : function->get_results()) { - result_names.emplace_back(ngraph::op::util::create_ie_output_name(result->input_value(0))); - result->output(0).get_tensor().add_names({result_names.back()}); - prevPorts.emplace_back(result->input_value(0)); + auto res_name = ngraph::op::util::create_ie_output_name(result->input_value(0)); + OPENVINO_ASSERT( + leaf_names.find(res_name) == leaf_names.end() || + result->output(0).get_names().find(res_name) != result->output(0).get_names().end(), + "Model operation names have collisions with tensor names.", + " Please use MO to generate new IR version, it should allow to avoid the issue"); + leaf_names.insert(res_name); + result->output(0).get_tensor().add_names({res_name}); } for (const auto& param : function->get_parameters()) { - param->output(0).get_tensor().add_names({param->get_friendly_name()}); + auto param_name = 
param->get_friendly_name(); + OPENVINO_ASSERT( + leaf_names.find(param_name) == leaf_names.end() || + param->output(0).get_names().find(param_name) != param->output(0).get_names().end(), + "Model operation names have collisions with tensor names.", + " Please use MO to generate new IR version, it should allow to avoid the issue"); + leaf_names.insert(param_name); + param->output(0).get_tensor().add_names({param_name}); } } - const auto outputs = function->outputs(); - for (size_t i = 0; i < outputs.size(); ++i) { - const auto ngraph_type = outputs[i].get_element_type(); - const auto legacy_type = details::toLegacyType(ngraph_type, false); - - prepost.output(i).tensor().set_element_type(legacy_type); - } - function = prepost.build(); // Set version to 10 diff --git a/src/tests/functional/inference_engine/ir_serialization/rt_info_deserialization.cpp b/src/tests/functional/inference_engine/ir_serialization/rt_info_deserialization.cpp index c6727ad2311..191684934ba 100644 --- a/src/tests/functional/inference_engine/ir_serialization/rt_info_deserialization.cpp +++ b/src/tests/functional/inference_engine/ir_serialization/rt_info_deserialization.cpp @@ -202,6 +202,96 @@ TEST_F(RTInfoDeserialization, NodeV10) { } } +TEST_F(RTInfoDeserialization, NamesCollisionV10) { + std::string model = R"V0G0N( + + + + + + + + + + + + + + + 1 + 3 + 22 + 22 + + + + + + + + + + + 1 + 3 + 22 + 22 + + + + + 1 + 3 + 22 + 22 + + + + + + + 1 + 3 + 22 + 22 + + + + + + + + + +)V0G0N"; + auto f = getWithIRFrontend(model); + ASSERT_NE(nullptr, f); + + auto check_version = [](const std::shared_ptr& f, int version_ref) { + auto& rt_info = f->get_rt_info(); + ASSERT_TRUE(rt_info.count("version")); + ASSERT_TRUE(rt_info.at("version").is()); + ASSERT_EQ(rt_info.at("version").as(), version_ref); + }; + check_version(f, 10); + + // read IR v10 with old API + { + InferenceEngine::Core core; + auto f_10 = core.ReadNetwork(model, InferenceEngine::Blob::CPtr()); + ASSERT_NE(nullptr, f_10.getFunction()); + + 
auto res = compare_functions(f, f_10.getFunction()); + EXPECT_TRUE(res.first) << res.second; + } + + // read IR v10 with new API and check that CNNNetwork precision conversions are applied + { + ov::runtime::Core core; + EXPECT_THROW(core.read_model(model, ov::runtime::Tensor()), ov::Exception); + } +} + TEST_F(RTInfoDeserialization, InputAndOutputV10) { std::string model = R"V0G0N( diff --git a/src/tests/functional/plugin/shared/include/behavior/executable_network/exec_network_base.hpp b/src/tests/functional/plugin/shared/include/behavior/executable_network/exec_network_base.hpp index 29796844f2c..f131557f26c 100644 --- a/src/tests/functional/plugin/shared/include/behavior/executable_network/exec_network_base.hpp +++ b/src/tests/functional/plugin/shared/include/behavior/executable_network/exec_network_base.hpp @@ -6,6 +6,7 @@ #include "base/behavior_test_utils.hpp" #include "common_test_utils/ngraph_test_utils.hpp" #include "common_test_utils/file_utils.hpp" +#include "openvino/core/model.hpp" namespace BehaviorTestsDefinitions { class ExecutableNetworkBaseTest : public testing::WithParamInterface, @@ -316,4 +317,50 @@ TEST_P(ExecNetSetPrecision, canSetOutputPrecisionForNetwork) { outputs_info.begin()->second->setPrecision(netPrecision); ASSERT_NO_THROW(ie->LoadNetwork(cnnNet, targetDevice, configuration)); } -} // namespace BehaviorTestsDefinitions \ No newline at end of file +TEST_P(ExecutableNetworkBaseTest, loadIncorrectV10Model) { + // Skip test according to plugin specific disabledTestPatterns() (if any) + SKIP_IF_CURRENT_TEST_IS_DISABLED() + ov::runtime::CompiledModel execNet; + + // Create simple function + { + auto param1 = std::make_shared(ov::element::Type_t::f32, ov::Shape({1, 3, 24, 24})); + param1->set_friendly_name("param1"); + param1->output(0).get_tensor().set_names({"data1"}); + auto relu = std::make_shared(param1); + relu->set_friendly_name("data1"); + relu->output(0).get_tensor().set_names({"relu"}); + auto result = std::make_shared(relu); + 
result->set_friendly_name("result"); + function = std::make_shared(ov::ResultVector{result}, ov::ParameterVector{param1}); + function->get_rt_info()["version"] = int64_t(10); + function->set_friendly_name("SimpleReLU"); + } + InferenceEngine::CNNNetwork cnnNet(function); + EXPECT_NO_THROW(ie->LoadNetwork(cnnNet, targetDevice, configuration)); +} + +TEST_P(ExecutableNetworkBaseTest, loadIncorrectV11Model) { + // Skip test according to plugin specific disabledTestPatterns() (if any) + SKIP_IF_CURRENT_TEST_IS_DISABLED() + ov::runtime::CompiledModel execNet; + + // Create simple function + { + auto param1 = std::make_shared(ov::element::Type_t::f32, ov::Shape({1, 3, 24, 24})); + param1->set_friendly_name("param1"); + param1->output(0).get_tensor().set_names({"data1"}); + auto relu = std::make_shared(param1); + relu->set_friendly_name("data1"); + relu->output(0).get_tensor().set_names({"relu"}); + auto result = std::make_shared(relu); + result->set_friendly_name("result"); + function = std::make_shared(ov::ResultVector{result}, ov::ParameterVector{param1}); + function->get_rt_info()["version"] = int64_t(11); + function->set_friendly_name("SimpleReLU"); + } + InferenceEngine::CNNNetwork cnnNet(function); + EXPECT_NO_THROW(ie->LoadNetwork(cnnNet, targetDevice, configuration)); +} + +} // namespace BehaviorTestsDefinitions diff --git a/src/tests/functional/plugin/shared/include/behavior/ov_executable_network/exec_graph_info.hpp b/src/tests/functional/plugin/shared/include/behavior/ov_executable_network/exec_graph_info.hpp index 2478213bd7d..b550b8293b6 100644 --- a/src/tests/functional/plugin/shared/include/behavior/ov_executable_network/exec_graph_info.hpp +++ b/src/tests/functional/plugin/shared/include/behavior/ov_executable_network/exec_graph_info.hpp @@ -236,7 +236,7 @@ TEST_P(OVExecGraphImportExportTest, importExportedIENetwork) { std::shared_ptr ie = ::PluginCache::get().ie(); InferenceEngine::ExecutableNetwork execNet; -// Create simple function + // Create simple 
function { auto param1 = std::make_shared(elementType, ngraph::Shape({1, 3, 24, 24})); param1->set_friendly_name("param1"); diff --git a/src/tests/functional/plugin/shared/include/behavior/ov_executable_network/exec_network_base.hpp b/src/tests/functional/plugin/shared/include/behavior/ov_executable_network/exec_network_base.hpp index 85b5aeb182d..3c397bd3741 100644 --- a/src/tests/functional/plugin/shared/include/behavior/ov_executable_network/exec_network_base.hpp +++ b/src/tests/functional/plugin/shared/include/behavior/ov_executable_network/exec_network_base.hpp @@ -653,6 +653,51 @@ TEST_P(OVExecutableNetworkBaseTest, getCompiledModelFromInferRequest) { ASSERT_NO_THROW(another_req.infer()); } } + +TEST_P(OVExecutableNetworkBaseTest, loadIncorrectV10Model) { + // Skip test according to plugin specific disabledTestPatterns() (if any) + SKIP_IF_CURRENT_TEST_IS_DISABLED() + ov::runtime::CompiledModel execNet; + + // Create simple function + { + auto param1 = std::make_shared(element::Type_t::f32, ngraph::Shape({1, 3, 24, 24})); + param1->set_friendly_name("param1"); + param1->output(0).get_tensor().set_names({"data1"}); + auto relu = std::make_shared(param1); + relu->set_friendly_name("data1"); + relu->output(0).get_tensor().set_names({"relu"}); + auto result = std::make_shared(relu); + result->set_friendly_name("result"); + function = std::make_shared(ngraph::ResultVector{result}, ngraph::ParameterVector{param1}); + function->get_rt_info()["version"] = int64_t(10); + function->set_friendly_name("SimpleReLU"); + } + EXPECT_THROW(core->compile_model(function, targetDevice, configuration), ov::Exception); +} + +TEST_P(OVExecutableNetworkBaseTest, loadIncorrectV11Model) { + // Skip test according to plugin specific disabledTestPatterns() (if any) + SKIP_IF_CURRENT_TEST_IS_DISABLED() + ov::runtime::CompiledModel execNet; + + // Create simple function + { + auto param1 = std::make_shared(element::Type_t::f32, ngraph::Shape({1, 3, 24, 24})); + 
param1->set_friendly_name("param1"); + param1->output(0).get_tensor().set_names({"data1"}); + auto relu = std::make_shared(param1); + relu->set_friendly_name("data1"); + relu->output(0).get_tensor().set_names({"relu"}); + auto result = std::make_shared(relu); + result->set_friendly_name("result"); + function = std::make_shared(ngraph::ResultVector{result}, ngraph::ParameterVector{param1}); + function->get_rt_info()["version"] = int64_t(11); + function->set_friendly_name("SimpleReLU"); + } + EXPECT_NO_THROW(core->compile_model(function, targetDevice, configuration)); +} + } // namespace behavior } // namespace test } // namespace ov From 28e52a047592a4a536a5bba5a843bbd0f17d8511 Mon Sep 17 00:00:00 2001 From: Jade Cho Date: Tue, 11 Jan 2022 16:37:27 +0900 Subject: [PATCH 65/78] [GPU] Fix a bug of logical padding of convolution (#9518) * [GPU] Fix a bug of logical padding of convolution + Transforms logical padding with wrong axis. * Fix a typo bug. --- src/plugins/intel_gpu/src/plugin/ops/convolution.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/plugins/intel_gpu/src/plugin/ops/convolution.cpp b/src/plugins/intel_gpu/src/plugin/ops/convolution.cpp index 248a0d3758b..3388c44f6e4 100644 --- a/src/plugins/intel_gpu/src/plugin/ops/convolution.cpp +++ b/src/plugins/intel_gpu/src/plugin/ops/convolution.cpp @@ -23,14 +23,14 @@ namespace ov { namespace runtime { namespace intel_gpu { -struct ConvoltuionParameters { +struct ConvolutionParameters { cldnn::tensor stride; cldnn::tensor padding; cldnn::tensor dilation; uint32_t groups; }; -static ConvoltuionParameters GetConvolutionParameters(const ngraph::CoordinateDiff& pads_begin, +static ConvolutionParameters GetConvolutionParameters(const ngraph::CoordinateDiff& pads_begin, const ngraph::Strides& dilations, const ngraph::Strides& strides, uint32_t groups) { @@ -52,9 +52,9 @@ static ConvoltuionParameters GetConvolutionParameters(const ngraph::CoordinateDi break; } case 1: { - stride = 
cldnn::tensor(cldnn::batch(1), cldnn::feature(1), cldnn::spatial(strides[0], 1, 1)); - padding = cldnn::tensor({0, 0, TensorValue(pads_begin[0]), 0}, 0); - dilation = cldnn::tensor(cldnn::batch(1), cldnn::feature(1), cldnn::spatial(dilations[0], 1, 1)); + stride = cldnn::tensor(cldnn::batch(1), cldnn::feature(1), cldnn::spatial(1, strides[0], 1)); + padding = cldnn::tensor({0, 0, 0, TensorValue(pads_begin[0])}, 0); + dilation = cldnn::tensor(cldnn::batch(1), cldnn::feature(1), cldnn::spatial(1, dilations[0], 1)); break; } default: IE_THROW() << "Unsupported convolve parameters size. Only 1d, 2d, and 3d cases are supported"; @@ -239,7 +239,7 @@ static void CreateGroupConvolutionBackpropDataOp(Program& p, const std::shared_p static void DeformableConvolutionImpl(Program& p, const std::shared_ptr& op, - const ConvoltuionParameters& params, + const ConvolutionParameters& params, std::int64_t deformableGroupsNum, bool bilinearInterpolationPad = false) { auto inputs = p.GetInputPrimitiveIDs(op); From 1a3d0adb3e897d4d47b0777d459529f232339a10 Mon Sep 17 00:00:00 2001 From: Ilya Sharikov Date: Tue, 11 Jan 2022 10:56:50 +0300 Subject: [PATCH 66/78] Change omz model (#9551) --- tests/conditional_compilation/test_config.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/conditional_compilation/test_config.yml b/tests/conditional_compilation/test_config.yml index 97a073427b8..a274aa21c93 100644 --- a/tests/conditional_compilation/test_config.yml +++ b/tests/conditional_compilation/test_config.yml @@ -41,7 +41,7 @@ precision: FP32 - - model: - name: octave-resnext-101-0.25 + name: octave-resnet-26-0.25 type: omz precision: FP32 - @@ -56,6 +56,6 @@ precision: FP32 - - model: - name: densenet-201 + name: densenet-121 type: omz precision: FP32 From 986f0eaac671bba7f20e320cbb1b54f54997a2f9 Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Tue, 11 Jan 2022 16:03:10 +0800 Subject: [PATCH 67/78] [CPU] Impl extract_image_patches cache (#9525) --- 
.../mkldnn_extract_image_patches_node.cpp | 64 +++++++++++++++++-- .../nodes/mkldnn_extract_image_patches_node.h | 3 +- .../extract_image_patches.cpp | 4 +- 3 files changed, 62 insertions(+), 9 deletions(-) diff --git a/src/plugins/intel_cpu/src/nodes/mkldnn_extract_image_patches_node.cpp b/src/plugins/intel_cpu/src/nodes/mkldnn_extract_image_patches_node.cpp index ccd98c5c2ae..9045058f38d 100644 --- a/src/plugins/intel_cpu/src/nodes/mkldnn_extract_image_patches_node.cpp +++ b/src/plugins/intel_cpu/src/nodes/mkldnn_extract_image_patches_node.cpp @@ -12,6 +12,7 @@ #include "list.hpp" #include #include "caseless.hpp" +#include using namespace MKLDNNPlugin; using namespace InferenceEngine; @@ -290,6 +291,40 @@ bool MKLDNNExtractImagePatchesNode::isSupportedOperation(const std::shared_ptr& op, const mkldnn::engine& eng, MKLDNNWeightsSharing::Ptr &cache) : MKLDNNNode(op, eng, cache) { std::string errorMessage; @@ -340,11 +375,30 @@ void MKLDNNExtractImagePatchesNode::prepareParams() { const auto& in_dims = getParentEdgeAt(0)->getMemory().getStaticDims(); const auto& out_dims = getChildEdgesAtPort(0)[0]->getMemory().getStaticDims(); const auto prcSize = getOriginalInputPrecisionAtPort(0).size(); - if (mayiuse(x64::sse41)) { - execPtr = std::make_shared(in_dims, out_dims, _ksizes, _strides, _rates, _auto_pad, prcSize); - } else { - execPtr = std::make_shared(in_dims, out_dims, _ksizes, _strides, _rates, _auto_pad, prcSize); - } + ExtractImagePatchesKey key = {in_dims, out_dims, _ksizes, _strides, _rates, _auto_pad, prcSize}; + const auto isJit = mayiuse(x64::sse41); + auto buildExecutor = [&isJit](const ExtractImagePatchesKey& key) -> executorPtr { + if (isJit) { + return std::make_shared(key.inDims, + key.outDims, + key.kSizes, + key.strides, + key.rates, + key.padType, + key.prcSize); + } else { + return std::make_shared(key.inDims, + key.outDims, + key.kSizes, + key.strides, + key.rates, + key.padType, + key.prcSize); + } + }; + auto cache = getRuntimeCache(); + auto 
result = cache->getOrCreate(key, buildExecutor); + execPtr = result.first; } void MKLDNNExtractImagePatchesNode::initSupportedPrimitiveDescriptors() { diff --git a/src/plugins/intel_cpu/src/nodes/mkldnn_extract_image_patches_node.h b/src/plugins/intel_cpu/src/nodes/mkldnn_extract_image_patches_node.h index d35d9f48d3b..6a96fd48a30 100644 --- a/src/plugins/intel_cpu/src/nodes/mkldnn_extract_image_patches_node.h +++ b/src/plugins/intel_cpu/src/nodes/mkldnn_extract_image_patches_node.h @@ -52,14 +52,13 @@ public: void prepareParams() override; static bool isSupportedOperation(const std::shared_ptr& op, std::string& errorMessage) noexcept; - -private: enum class ExtImgPatcherPadType { VALID, SAME_LOWER, SAME_UPPER }; +private: std::vector _ksizes; std::vector _strides; std::vector _rates; diff --git a/src/tests/functional/plugin/cpu/single_layer_tests/extract_image_patches.cpp b/src/tests/functional/plugin/cpu/single_layer_tests/extract_image_patches.cpp index 2c58b36392f..e8a76305fd2 100755 --- a/src/tests/functional/plugin/cpu/single_layer_tests/extract_image_patches.cpp +++ b/src/tests/functional/plugin/cpu/single_layer_tests/extract_image_patches.cpp @@ -79,13 +79,13 @@ const std::vector inputShapes = { // dynamic {-1, -1, -1, -1}, // static - {{2, 3, 13, 37}, {6, 4, 14, 14}, {8, 12, 15, 16}} + {{2, 3, 13, 37}, {6, 4, 14, 14}, {8, 12, 15, 16}, {2, 3, 13, 37}} }, InputShape{ // dynamic {{5, 15}, {6, 17}, {10, 15}, {13, 16}}, // static - {{5, 17, 10, 15}, {15, 10, 12, 13}, {10, 10, 15, 16}} + {{5, 17, 10, 15}, {15, 10, 12, 13}, {10, 10, 15, 16}, {5, 17, 10, 15}} }, }; From 2c6078e96c2f5288b674a206b8e8e4891e31ce81 Mon Sep 17 00:00:00 2001 From: Edward Shogulin Date: Tue, 11 Jan 2022 11:51:15 +0300 Subject: [PATCH 68/78] [LPT] Documentation (developer guide) (#7444) * [LPT] Documentation * 1) ToC was removed 2) SVG => PNG temporary conversion * [LPT] Refactoring + developer guide * [LPT] attribute doxygen documentation was added * [LPT] Developer Guide to Reference API 
links were added * [LPT] comments fixes * [LPT] Reference API to Developer Guide links were added * [LPT] titles were changed * [LPT] comments fixes #2 * [LPT] root document was moved to Plugin DG * [LPT] Documentation: image link quick fix * [LPT] Docummentation: PrecisionsAttribute description quick fix * fix comments from Karol * fixes * movement * directive was added * movement #2 * LPT reference in Executable Network rollback * snippets were updated ini accordance with new API --- docs/IE_PLUGIN_DG/ExecutableNetwork.md | 2 +- docs/IE_PLUGIN_DG/Intro.md | 1 + docs/IE_PLUGIN_DG/layout.xml | 73 +++- .../PluginTransformationPipeline.md | 17 + .../avg_pool_precision_preserved.md | 11 + .../attributes/intervals_alignment.md | 11 + .../attributes/per_tensor_quantization.md | 11 + .../attributes/precision_preserved.md | 11 + .../attributes/precisions.md | 11 + .../attributes/quantization_alignment.md | 11 + .../low_precision_transformation_pipeline.png | 3 + .../low_precision_transformation_pipeline.svg | 1 + .../img/model_fq_and_convolution.common.png | 3 + .../img/model_fq_and_convolution.common.svg | 1 + .../model_fq_and_convolution.transformed.png | 3 + .../model_fq_and_convolution.transformed.svg | 1 + .../model_fq_fq_and_convolution.common.png | 3 + .../model_fq_fq_and_convolution.common.svg | 1 + .../img/model_qdq_and_convolution.common.png | 3 + .../img/model_qdq_and_convolution.common.svg | 1 + .../low_precision_transformations/lpt.md | 319 +++++++++++++++ .../lpt_attributes.md | 56 +++ .../pipeline/img/step2_markup1.png | 3 + .../pipeline/img/step2_markup1.svg | 1 + .../pipeline/img/step2_markup2.png | 3 + .../pipeline/img/step2_markup2.svg | 1 + .../pipeline/img/step2_markup3.png | 3 + .../pipeline/img/step2_markup3.svg | 1 + .../pipeline/img/step2_markup4.png | 3 + .../pipeline/img/step2_markup4.svg | 1 + .../pipeline/img/step2_markup5.png | 3 + .../pipeline/img/step2_markup5.svg | 1 + .../pipeline/img/step2_markup6.png | 3 + 
.../pipeline/img/step2_markup6.svg | 1 + .../pipeline/img/step2_markup7.png | 3 + .../pipeline/img/step2_markup7.svg | 1 + .../pipeline/img/step2_markup_original.png | 3 + .../pipeline/img/step2_markup_original.svg | 1 + .../pipeline/img/step3_original.png | 3 + .../pipeline/img/step3_original.svg | 1 + .../pipeline/img/step3_transformed.png | 3 + .../pipeline/img/step3_transformed.svg | 1 + .../pipeline/step1_prerequisites.md | 6 + .../pipeline/step2_markup.md | 140 +++++++ .../pipeline/step3_main.md | 49 +++ .../pipeline/step4_cleanup.md | 8 + .../quantization/img/fq.common.png | 3 + .../quantization/img/fq.common.svg | 1 + .../quantization/img/fq.transformed.png | 3 + .../quantization/img/fq.transformed.svg | 1 + .../convert_subtract_constant.md | 3 + .../lin_op_sequence_fusion.md | 5 + .../pull_reshape_through_dequantization.md | 3 + .../pull_transpose_through_dequantization.md | 3 + .../align_quantization_intervals.md | 3 + .../align_quantization_parameters.md | 3 + .../step2_markup/create_attribute.md | 3 + .../create_precisions_dependent_attribute.md | 3 + .../markup_avg_pool_precision_preserved.md | 3 + .../step2_markup/markup_can_be_quantized.md | 3 + .../markup_per_tensor_quantization.md | 3 + .../step2_markup/markup_precisions.md | 3 + .../step2_markup/propagate_precisions.md | 3 + .../step2_markup/propagate_shared_value.md | 3 + .../propagate_through_precision_preserved.md | 3 + .../step2_markup/propagate_to_input.md | 3 + .../update_shared_precision_preserved.md | 3 + .../step3_main/activation/clamp.md | 3 + .../step3_main/activation/prelu.md | 3 + .../step3_main/activation/relu.md | 3 + .../step3_main/arithmetic/add.md | 57 +++ .../step3_main/arithmetic/img/add.common.png | 3 + .../step3_main/arithmetic/img/add.common.svg | 1 + .../arithmetic/img/add.transformed.png | 3 + .../arithmetic/img/add.transformed.svg | 1 + .../step3_main/arithmetic/multiply.md | 3 + .../step3_main/arithmetic/subtract.md | 3 + .../step3_main/convolution/convolution.md | 34 ++ 
.../convolution/convolution_backprop_data.md | 3 + .../convolution/group_convolution.md | 3 + .../img/fq_and_convolution.common.png | 3 + .../img/fq_and_convolution.common.svg | 1 + .../img/fq_and_convolution.transformed.png | 3 + .../img/fq_and_convolution.transformed.svg | 1 + .../img/fq_fq_and_convolution.common.png | 3 + .../img/fq_fq_and_convolution.common.svg | 1 + .../step3_main/image/interpolate.md | 3 + .../step3_main/matrix/mat_mul.md | 3 + .../step3_main/movement/concat.md | 3 + .../step3_main/movement/depth_to_space.md | 3 + .../step3_main/movement/pad.md | 3 + .../step3_main/movement/shuffle_channels.md | 3 + .../step3_main/movement/split.md | 3 + .../step3_main/movement/strided_slice.md | 3 + .../step3_main/movement/transpose.md | 3 + .../step3_main/movement/variadic_split.md | 3 + .../step3_main/normalization/mvn.md | 3 + .../step3_main/normalization/normalize_l2.md | 3 + .../step3_main/pooling/avg_pool.md | 3 + .../step3_main/pooling/max_pool.md | 3 + .../step3_main/quantization/fake_quantize.md | 3 + .../quantization/fold_fake_quantize.md | 3 + .../step3_main/reduction/reduce_max.md | 3 + .../step3_main/reduction/reduce_mean.md | 3 + .../step3_main/reduction/reduce_min.md | 3 + .../step3_main/reduction/reduce_sum.md | 3 + .../step3_main/shape/reshape.md | 3 + .../step3_main/shape/squeeze.md | 3 + .../step3_main/shape/unsqueeze.md | 3 + .../fake_quantize_decomposition.md | 3 + .../step4_cleanup/fold_convert.md | 3 + .../step4_cleanup/fuse_convert.md | 3 + .../fuse_multiply_to_fake_quantize.md | 3 + .../fuse_subtract_to_fake_quantize.md | 3 + .../multiply_to_group_convolution.md | 3 + docs/documentation.md | 1 + docs/doxygen/ie_docs.xml | 383 ++++++++++++++++++ docs/snippets/lpt_mkldnn_plugin.cpp | 221 ++++++++++ .../include/low_precision/add.hpp | 9 + .../align_quantization_intervals.hpp | 9 + .../align_quantization_parameters.hpp | 9 + .../include/low_precision/avg_pool.hpp | 8 + .../include/low_precision/clamp.hpp | 8 + 
.../include/low_precision/concat.hpp | 8 + .../convert_subtract_constant.hpp | 9 + .../include/low_precision/convolution.hpp | 8 + .../convolution_backprop_data.hpp | 8 + .../low_precision/create_attribute.hpp | 7 + .../create_precisions_dependent_attribute.hpp | 9 + .../include/low_precision/depth_to_space.hpp | 8 + .../eltwise_base_transformation.hpp | 4 + .../include/low_precision/fake_quantize.hpp | 8 + .../fake_quantize_decomposition.hpp | 9 + .../include/low_precision/fold_convert.hpp | 8 + .../low_precision/fold_fake_quantize.hpp | 8 + .../include/low_precision/fuse_convert.hpp | 8 + .../fuse_multiply_to_fake_quantize.hpp | 8 + .../fuse_subtract_to_fake_quantize.hpp | 8 + .../low_precision/group_convolution.hpp | 8 + .../include/low_precision/interpolate.hpp | 8 + .../low_precision/layer_transformation.hpp | 5 +- .../markup_avg_pool_precision_preserved.hpp | 8 + .../low_precision/markup_can_be_quantized.hpp | 10 + .../markup_per_tensor_quantization.hpp | 9 + .../low_precision/markup_precisions.hpp | 11 + .../include/low_precision/mat_mul.hpp | 8 + .../include/low_precision/max_pool.hpp | 8 + .../include/low_precision/multiply.hpp | 8 + .../multiply_to_group_convolution.hpp | 8 + .../include/low_precision/mvn.hpp | 8 + .../include/low_precision/normalize_l2.hpp | 8 + .../include/low_precision/pad.hpp | 8 + .../include/low_precision/prelu.hpp | 8 + .../low_precision/propagate_precisions.hpp | 8 + .../low_precision/propagate_shared_value.hpp | 9 + .../propagate_through_precision_preserved.hpp | 9 + .../low_precision/propagate_to_input.hpp | 9 + .../pull_reshape_through_dequantization.hpp | 9 + .../pull_transpose_through_dequantization.hpp | 9 + .../reduce_base_transformation.hpp | 10 +- .../include/low_precision/reduce_max.hpp | 8 + .../include/low_precision/reduce_mean.hpp | 8 + .../include/low_precision/reduce_min.hpp | 8 + .../include/low_precision/reduce_sum.hpp | 8 + .../include/low_precision/relu.hpp | 8 + .../include/low_precision/reshape.hpp | 8 + 
...avg_pool_precision_preserved_attribute.hpp | 9 + .../rt_info/intervals_alignment_attribute.hpp | 12 + .../per_tensor_quantization_attribute.hpp | 7 + .../rt_info/precision_preserved_attribute.hpp | 8 + .../rt_info/precisions_attribute.hpp | 8 +- .../quantization_alignment_attribute.hpp | 8 + .../rt_info/shared_value_attribute.hpp | 6 + .../low_precision/shuffle_channels.hpp | 8 + .../include/low_precision/split.hpp | 8 + .../include/low_precision/squeeze.hpp | 8 + .../include/low_precision/strided_slice.hpp | 8 + .../include/low_precision/subtract.hpp | 8 + .../low_precision/transformation_context.hpp | 4 + .../transparent_base_transformation.hpp | 4 + .../include/low_precision/transpose.hpp | 8 + .../include/low_precision/unsqueeze.hpp | 8 + .../update_shared_precision_preserved.hpp | 11 +- .../include/low_precision/variadic_split.hpp | 8 + .../weightable_layer_transformation.hpp | 4 + .../lin_op_sequence_fusion.hpp | 4 + 186 files changed, 2216 insertions(+), 10 deletions(-) create mode 100644 docs/IE_PLUGIN_DG/plugin_transformation_pipeline/PluginTransformationPipeline.md create mode 100644 docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/attributes/avg_pool_precision_preserved.md create mode 100644 docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/attributes/intervals_alignment.md create mode 100644 docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/attributes/per_tensor_quantization.md create mode 100644 docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/attributes/precision_preserved.md create mode 100644 docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/attributes/precisions.md create mode 100644 docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/attributes/quantization_alignment.md create mode 100644 
docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/img/low_precision_transformation_pipeline.png create mode 100644 docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/img/low_precision_transformation_pipeline.svg create mode 100644 docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/img/model_fq_and_convolution.common.png create mode 100644 docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/img/model_fq_and_convolution.common.svg create mode 100644 docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/img/model_fq_and_convolution.transformed.png create mode 100644 docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/img/model_fq_and_convolution.transformed.svg create mode 100644 docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/img/model_fq_fq_and_convolution.common.png create mode 100644 docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/img/model_fq_fq_and_convolution.common.svg create mode 100644 docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/img/model_qdq_and_convolution.common.png create mode 100644 docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/img/model_qdq_and_convolution.common.svg create mode 100644 docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/lpt.md create mode 100644 docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/lpt_attributes.md create mode 100644 docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/pipeline/img/step2_markup1.png create mode 100644 docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/pipeline/img/step2_markup1.svg create mode 100644 docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/pipeline/img/step2_markup2.png 
create mode 100644 docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/pipeline/img/step2_markup2.svg create mode 100644 docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/pipeline/img/step2_markup3.png create mode 100644 docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/pipeline/img/step2_markup3.svg create mode 100644 docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/pipeline/img/step2_markup4.png create mode 100644 docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/pipeline/img/step2_markup4.svg create mode 100644 docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/pipeline/img/step2_markup5.png create mode 100644 docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/pipeline/img/step2_markup5.svg create mode 100644 docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/pipeline/img/step2_markup6.png create mode 100644 docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/pipeline/img/step2_markup6.svg create mode 100644 docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/pipeline/img/step2_markup7.png create mode 100644 docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/pipeline/img/step2_markup7.svg create mode 100644 docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/pipeline/img/step2_markup_original.png create mode 100644 docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/pipeline/img/step2_markup_original.svg create mode 100644 docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/pipeline/img/step3_original.png create mode 100644 docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/pipeline/img/step3_original.svg create mode 100644 
docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/pipeline/img/step3_transformed.png create mode 100644 docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/pipeline/img/step3_transformed.svg create mode 100644 docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/pipeline/step1_prerequisites.md create mode 100644 docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/pipeline/step2_markup.md create mode 100644 docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/pipeline/step3_main.md create mode 100644 docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/pipeline/step4_cleanup.md create mode 100644 docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/quantization/img/fq.common.png create mode 100644 docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/quantization/img/fq.common.svg create mode 100644 docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/quantization/img/fq.transformed.png create mode 100644 docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/quantization/img/fq.transformed.svg create mode 100644 docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step1_prerequisites/convert_subtract_constant.md create mode 100644 docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step1_prerequisites/lin_op_sequence_fusion.md create mode 100644 docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step1_prerequisites/pull_reshape_through_dequantization.md create mode 100644 docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step1_prerequisites/pull_transpose_through_dequantization.md create mode 100644 
docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step2_markup/align_quantization_intervals.md create mode 100644 docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step2_markup/align_quantization_parameters.md create mode 100644 docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step2_markup/create_attribute.md create mode 100644 docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step2_markup/create_precisions_dependent_attribute.md create mode 100644 docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step2_markup/markup_avg_pool_precision_preserved.md create mode 100644 docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step2_markup/markup_can_be_quantized.md create mode 100644 docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step2_markup/markup_per_tensor_quantization.md create mode 100644 docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step2_markup/markup_precisions.md create mode 100644 docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step2_markup/propagate_precisions.md create mode 100644 docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step2_markup/propagate_shared_value.md create mode 100644 docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step2_markup/propagate_through_precision_preserved.md create mode 100644 docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step2_markup/propagate_to_input.md create mode 100644 
docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step2_markup/update_shared_precision_preserved.md create mode 100644 docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step3_main/activation/clamp.md create mode 100644 docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step3_main/activation/prelu.md create mode 100644 docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step3_main/activation/relu.md create mode 100644 docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step3_main/arithmetic/add.md create mode 100644 docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step3_main/arithmetic/img/add.common.png create mode 100644 docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step3_main/arithmetic/img/add.common.svg create mode 100644 docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step3_main/arithmetic/img/add.transformed.png create mode 100644 docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step3_main/arithmetic/img/add.transformed.svg create mode 100644 docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step3_main/arithmetic/multiply.md create mode 100644 docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step3_main/arithmetic/subtract.md create mode 100644 docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step3_main/convolution/convolution.md create mode 100644 docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step3_main/convolution/convolution_backprop_data.md create mode 100644 
docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step3_main/convolution/group_convolution.md create mode 100644 docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step3_main/convolution/img/fq_and_convolution.common.png create mode 100644 docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step3_main/convolution/img/fq_and_convolution.common.svg create mode 100644 docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step3_main/convolution/img/fq_and_convolution.transformed.png create mode 100644 docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step3_main/convolution/img/fq_and_convolution.transformed.svg create mode 100644 docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step3_main/convolution/img/fq_fq_and_convolution.common.png create mode 100644 docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step3_main/convolution/img/fq_fq_and_convolution.common.svg create mode 100644 docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step3_main/image/interpolate.md create mode 100644 docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step3_main/matrix/mat_mul.md create mode 100644 docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step3_main/movement/concat.md create mode 100644 docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step3_main/movement/depth_to_space.md create mode 100644 docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step3_main/movement/pad.md create mode 100644 
docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step3_main/movement/shuffle_channels.md create mode 100644 docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step3_main/movement/split.md create mode 100644 docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step3_main/movement/strided_slice.md create mode 100644 docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step3_main/movement/transpose.md create mode 100644 docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step3_main/movement/variadic_split.md create mode 100644 docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step3_main/normalization/mvn.md create mode 100644 docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step3_main/normalization/normalize_l2.md create mode 100644 docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step3_main/pooling/avg_pool.md create mode 100644 docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step3_main/pooling/max_pool.md create mode 100644 docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step3_main/quantization/fake_quantize.md create mode 100644 docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step3_main/quantization/fold_fake_quantize.md create mode 100644 docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step3_main/reduction/reduce_max.md create mode 100644 docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step3_main/reduction/reduce_mean.md create mode 100644 
docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step3_main/reduction/reduce_min.md create mode 100644 docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step3_main/reduction/reduce_sum.md create mode 100644 docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step3_main/shape/reshape.md create mode 100644 docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step3_main/shape/squeeze.md create mode 100644 docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step3_main/shape/unsqueeze.md create mode 100644 docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step4_cleanup/fake_quantize_decomposition.md create mode 100644 docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step4_cleanup/fold_convert.md create mode 100644 docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step4_cleanup/fuse_convert.md create mode 100644 docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step4_cleanup/fuse_multiply_to_fake_quantize.md create mode 100644 docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step4_cleanup/fuse_subtract_to_fake_quantize.md create mode 100644 docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step4_cleanup/multiply_to_group_convolution.md create mode 100644 docs/doxygen/ie_docs.xml create mode 100644 docs/snippets/lpt_mkldnn_plugin.cpp diff --git a/docs/IE_PLUGIN_DG/ExecutableNetwork.md b/docs/IE_PLUGIN_DG/ExecutableNetwork.md index d9fc8af11ab..5f703bcd880 100644 --- a/docs/IE_PLUGIN_DG/ExecutableNetwork.md +++ b/docs/IE_PLUGIN_DG/ExecutableNetwork.md @@ -37,7 +37,7 @@ The implementation 
`CompileNetwork` is fully device-specific. The function accepts a const shared pointer to `ngraph::Function` object and performs the following steps: -1. Applies ngraph passes using `TransformNetwork` function, which defines plugin-specific conversion pipeline. +1. Applies ngraph passes using `TransformNetwork` function, which defines plugin-specific conversion pipeline. To support low precision inference, the pipeline can include Low Precision Transformations. These transformations are usually hardware specific. You can find how to use and configure Low Precisions Transformations in [Low Precision Transformations](@ref openvino_docs_IE_DG_lpt) guide. 2. Maps the transformed graph to a backend specific graph representation (for example, to MKLDNN graph for Intel CPU). 3. Allocates and fills memory for graph weights, backend specific memory handles and so on. diff --git a/docs/IE_PLUGIN_DG/Intro.md b/docs/IE_PLUGIN_DG/Intro.md index 8979d4c74a9..5a85573e543 100644 --- a/docs/IE_PLUGIN_DG/Intro.md +++ b/docs/IE_PLUGIN_DG/Intro.md @@ -52,6 +52,7 @@ Detailed guides * [Build](@ref openvino_docs_ie_plugin_dg_plugin_build) a plugin library using CMake\* * Plugin and its components [testing](@ref openvino_docs_ie_plugin_dg_plugin_testing) * [Quantized networks](@ref openvino_docs_ie_plugin_dg_quantized_networks) +* [Low precision transformations](@ref openvino_docs_IE_DG_lpt) guide * [Writing nGraph transformations](@ref ngraph_transformation) guide API References diff --git a/docs/IE_PLUGIN_DG/layout.xml b/docs/IE_PLUGIN_DG/layout.xml index 3dc629d959c..bba21ddd206 100644 --- a/docs/IE_PLUGIN_DG/layout.xml +++ b/docs/IE_PLUGIN_DG/layout.xml @@ -4,7 +4,78 @@ - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/PluginTransformationPipeline.md 
b/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/PluginTransformationPipeline.md new file mode 100644 index 00000000000..7e13077f44f --- /dev/null +++ b/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/PluginTransformationPipeline.md @@ -0,0 +1,17 @@ +# Plugin Transformation Pipeline {#openvino_docs_IE_DG_plugin_transformation_pipeline} + +@sphinxdirective + +.. toctree:: + :maxdepth: 1 + :caption: Executable Network + :hidden: + + Low Precision Transformations + +@endsphinxdirective + +Typical plugin transformation pipeline includes steps: + 1. Common transformations + 2. [Low precision transformations](@ref openvino_docs_IE_DG_lpt) + 3. Plugin specific transformations \ No newline at end of file diff --git a/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/attributes/avg_pool_precision_preserved.md b/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/attributes/avg_pool_precision_preserved.md new file mode 100644 index 00000000000..30f7411cbd9 --- /dev/null +++ b/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/attributes/avg_pool_precision_preserved.md @@ -0,0 +1,11 @@ +# AvgPoolPrecisionPreserved attribute {#openvino_docs_IE_DG_lpt_AvgPoolPrecisionPreserved} + +ngraph::AvgPoolPrecisionPreservedAttribute class represents the `AvgPoolPrecisionPreserved` attribute. + +Utility attribute, which is used only during `AvgPool` operation, precision preserved property definition. 
+ +| Property name | Values | +|---------------|----------------------------------------------| +| Required | Yes | +| Defined | Operation | +| Properties | value (boolean) | \ No newline at end of file diff --git a/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/attributes/intervals_alignment.md b/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/attributes/intervals_alignment.md new file mode 100644 index 00000000000..b977fd4a325 --- /dev/null +++ b/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/attributes/intervals_alignment.md @@ -0,0 +1,11 @@ +# IntervalsAlignment attribute {#openvino_docs_IE_DG_lpt_IntervalsAlignment} + +ngraph::IntervalsAlignmentAttribute class represents the `IntervalsAlignment` attribute. + +The attribute defines a subgraph with the same quantization intervals alignment. `FakeQuantize` operations are included. The attribute is used by quantization operations. + +| Property name | Values | +|---------------|----------------------------------------------| +| Required | Yes | +| Defined | Operation | +| Properties | combined interval, minimal interval, minimal levels, preferable precisions | \ No newline at end of file diff --git a/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/attributes/per_tensor_quantization.md b/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/attributes/per_tensor_quantization.md new file mode 100644 index 00000000000..03a8a672177 --- /dev/null +++ b/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/attributes/per_tensor_quantization.md @@ -0,0 +1,11 @@ +# PerTensorQuantization attribute {#openvino_docs_IE_DG_lpt_PerTensorQuantization} + +ngraph::PerTensorQuantizationAttribute class represents the `PerTensorQuantization` attribute. + +The attribute defines if the operation input port requires per-tensor quantization. 
+ +| Property name | Values | +|---------------|----------------------------------------------| +| Required | Yes | +| Defined | Operation, input ports | +| Properties | | \ No newline at end of file diff --git a/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/attributes/precision_preserved.md b/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/attributes/precision_preserved.md new file mode 100644 index 00000000000..cf75ecc61c6 --- /dev/null +++ b/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/attributes/precision_preserved.md @@ -0,0 +1,11 @@ +# PrecisionPreserved attribute {#openvino_docs_IE_DG_lpt_PrecisionPreserved} + +ngraph::PrecisionPreservedAttribute class represents the `PrecisionPreserved` attribute. + +The attribute defines a precision preserved operation. If the attribute is absent, then an operation is not precision preserved. + +| Property name | Values | +|---------------|----------------------------------------------| +| Required | Yes | +| Defined | Operation | +| Properties | value (boolean) | \ No newline at end of file diff --git a/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/attributes/precisions.md b/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/attributes/precisions.md new file mode 100644 index 00000000000..0b0c27a4801 --- /dev/null +++ b/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/attributes/precisions.md @@ -0,0 +1,11 @@ +# Precisions attribute {#openvino_docs_IE_DG_lpt_Precisions} + +ngraph::PrecisionsAttribute class represents the `Precisions` attribute. + +The attribute defines precision which is required for input/output port or an operation. 
+ +| Property name | Values | +|---------------|----------------------------------------------| +| Required | Yes | +| Defined | Operation, input port, output port | +| Properties | precisions | \ No newline at end of file diff --git a/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/attributes/quantization_alignment.md b/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/attributes/quantization_alignment.md new file mode 100644 index 00000000000..66747a63ecd --- /dev/null +++ b/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/attributes/quantization_alignment.md @@ -0,0 +1,11 @@ +# QuantizationAlignment attribute {#openvino_docs_IE_DG_lpt_QuantizationAlignment} + +ngraph::QuantizationAlignmentAttribute class represents the `QuantizationAlignment` attribute. + +The attribute defines a subgraph with the same quantization alignment. `FakeQuantize` operations are not included. The attribute is used by quantization operations. 
+ +| Property name | Values | +|---------------|----------------------------------------------| +| Required | Yes | +| Defined | Operation | +| Properties | value (boolean) | \ No newline at end of file diff --git a/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/img/low_precision_transformation_pipeline.png b/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/img/low_precision_transformation_pipeline.png new file mode 100644 index 00000000000..749a83c1015 --- /dev/null +++ b/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/img/low_precision_transformation_pipeline.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3ee64e2c942110b8dbbc7cb3d200ed7061da6a12a55c0f379378e31db9ae2180 +size 366513 diff --git a/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/img/low_precision_transformation_pipeline.svg b/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/img/low_precision_transformation_pipeline.svg new file mode 100644 index 00000000000..9292ce92a6a --- /dev/null +++ b/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/img/low_precision_transformation_pipeline.svg @@ -0,0 +1 @@ +Step 1PrerequisitesStep 2Markup transformationsStep 3Main transformationsStep 4Cleanup 
transformationsPullReshapeThroughDequantizationPullTransposeThroughDequantizationngraph::pass::LinOpSequenceFusionMarkupCanBeQuantizedMarkupPrecisionsMarkupPerTensorQuantizationMarkupAvgPoolPrecisionPreservedPropagatePrecisionsAlignQuantizationInttervalsAlignQuantizationParametersAddTransformationAvgPoolTransformationClampTransformationConcatTransformationConvolutionTransformationConvolutionBackpropDataTransformationDepthToSpaceTransformationFakeQuantizeDecompositionTransformationFakeQuantizeTransformationInterpolateTransformationGroupConvolutionTransformationMatMulTransformationMaxPoolTransformationMultiplyTransformationMVNTransformationNormalizeL2TransformationPReluTransformationReduceMaxTransformationReduceMeanTransformationReduceMinTransformationReduceSumTransformationReluTransformationReshapeTransformationSqueezeTransformationShuffleChannelsTransformationSplitTransformationStridedSliceTransformationTransposeTransformationUnsqueezeTransformationVariadicSplitTransformationFoldConvertTransformationFuseConvertTransformationFuseSubtractToFakeQuantizeTransformationFuseMultiplyToFakeQuantizeTransformationMultiplyToGroupConvolutionTransformationSubtractMultiplyToMultiplyAddTransformationFoldFakeQuantizeTransformation \ No newline at end of file diff --git a/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/img/model_fq_and_convolution.common.png b/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/img/model_fq_and_convolution.common.png new file mode 100644 index 00000000000..37d7e97184a --- /dev/null +++ b/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/img/model_fq_and_convolution.common.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b1d9a68912b2dde17c731ed31b090077e6812a84231544ce3d212c0e02b13dfb +size 204085 diff --git a/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/img/model_fq_and_convolution.common.svg 
b/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/img/model_fq_and_convolution.common.svg new file mode 100644 index 00000000000..af34cbfa239 --- /dev/null +++ b/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/img/model_fq_and_convolution.common.svg @@ -0,0 +1 @@ +FP32 Convolution with quantized weightsFakeQuantizeFakeQuantizelevels: 256{f32} {1, 3, 299, 299}Parameter{f32} {1, 3, 299, 299}Convolution{f32} {1, 6, 299, 299}Constant{f32} {1, 1, 1, 1}Value:[-12.8]Constant{f32} {1, 1, 1, 1}Value:[12.7]Constant{f32} {1, 1, 1, 1}Value:[-12.8]Constant{f32} {1, 1, 1, 1}Value:[12.7]ResultConstant{i8} {6, 3, 1, 1}Dequantization on weightsMultiply{f32} {6, 3, 1, 1}Convert{f32} {6, 3, 1, 1}Constant{f32} {6, 1, 1, 1}Subtract{f32} {6, 3, 1, 1}Constant{i8} {6, 1, 1, 1}Convert{f32} {6, 1, 1, 1} \ No newline at end of file diff --git a/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/img/model_fq_and_convolution.transformed.png b/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/img/model_fq_and_convolution.transformed.png new file mode 100644 index 00000000000..07fb2213a90 --- /dev/null +++ b/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/img/model_fq_and_convolution.transformed.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:79b2fd14f9ff7655e4a5abe7e71748e153a095fe1f5eb07c168f53cb12fbb406 +size 216703 diff --git a/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/img/model_fq_and_convolution.transformed.svg b/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/img/model_fq_and_convolution.transformed.svg new file mode 100644 index 00000000000..f1f18e7b94e --- /dev/null +++ b/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/img/model_fq_and_convolution.transformed.svg @@ -0,0 +1 @@ +DequantizationINT8 Convolution with 
zero pointQuantizationFakeQuantizelevels: 256{u8} {1, 3, 299, 299}Parameter{f32} {1, 3, 299, 299}Subtract{f32} {1, 3, 299, 299}Multiply{f32} {1, 6, 299, 299}Convolution{f32} {1, 6, 299, 299}Constant{f32} {1, 1, 1, 1}value:[-12.8]Constant{f32} {1, 1, 1, 1}value:[12.7]Constant{f32} {1, 1, 1, 1}value:[0]Constant{f32} {1, 1, 1, 1}value:[255]Constant{i8} {6, 3, 1, 1}Constant{u8} {}Constant{f32} {1, 6, 1, 1}ResultSubtract{f32} {1, 3, 299, 299}Constant{i8} {6, 1, 1, 1}Zero point on activationsZero point on weights \ No newline at end of file diff --git a/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/img/model_fq_fq_and_convolution.common.png b/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/img/model_fq_fq_and_convolution.common.png new file mode 100644 index 00000000000..e12e47a748b --- /dev/null +++ b/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/img/model_fq_fq_and_convolution.common.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4d3e9a9eddfdcd50eedb035c500848b982b9317ba23f28809a831bbe66300bec +size 167226 diff --git a/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/img/model_fq_fq_and_convolution.common.svg b/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/img/model_fq_fq_and_convolution.common.svg new file mode 100644 index 00000000000..0505b70097f --- /dev/null +++ b/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/img/model_fq_fq_and_convolution.common.svg @@ -0,0 +1 @@ +FP32 ConvolutionFakeQuantizeFakeQuantizelevels: 256{f32} {1, 3, 299, 299}Parameter{f32} {1, 3, 299, 299}Convolution{f32} {1, 6, 299, 299}Constant{f32} {1, 1, 1, 1}Value:[-12.8]Constant{f32} {1, 1, 1, 1}Value:[12.7]Constant{f32} {1, 1, 1, 1}Value:[-12.8]Constant{f32} {1, 1, 1, 1}Value:[12.7]ResultFakeQuantizelevels: 255{f32} {6, 3, 299, 299}Constant{i8} {6, 3, 1, 1}Constant{f32} {1, 1, 
1, 1}Value:[-12.8]Constant{f32} {1, 1, 1, 1}Value:[12.7]Constant{f32} {1, 1, 1, 1}Value:[-12.8]Constant{f32} {1, 1, 1, 1}Value:[12.7] \ No newline at end of file diff --git a/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/img/model_qdq_and_convolution.common.png b/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/img/model_qdq_and_convolution.common.png new file mode 100644 index 00000000000..e70b6f920e8 --- /dev/null +++ b/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/img/model_qdq_and_convolution.common.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ec31aa62c0e1da3caf1531f2d92270f321857aca3044445ec242f33ee224f91b +size 297353 diff --git a/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/img/model_qdq_and_convolution.common.svg b/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/img/model_qdq_and_convolution.common.svg new file mode 100644 index 00000000000..76ac5325a4f --- /dev/null +++ b/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/img/model_qdq_and_convolution.common.svg @@ -0,0 +1 @@ +DequantizationFP32 Convolution with quantized weightsQuantizationFakeQuantizelevels: 256{f32} {1, 3, 299, 299}Parameter{f32} {1, 3, 299, 299}Multiply{f32} {1, 3, 299, 299}Convolution{f32} {1, 6, 299, 299}Constant{f32} {1, 1, 1, 1}Value:[-12.8]Constant{f32} {1, 1, 1, 1}Value:[12.7]Constant{f32} {1, 1, 1, 1}Value:[0]Constant{f32} {1, 1, 1, 1}Value:[255]Constant{f32} {}ResultConvert{f32} {1, 3, 299, 299}Convert{u8} {1, 3, 299, 299}Subtract{f32} {1, 3, 299, 299}Constant{u8} {}Convert{f32} {}Dequantization on weightsConstant{i8} {6, 3, 1, 1}Multiply{f32} {6, 3, 1, 1}Convert{f32} {6, 3, 1, 1}Constant{f32} {6, 1, 1, 1}Subtract{f32} {6, 3, 1, 1}Constant{i8} {6, 1, 1, 1}Convert{f32} {6, 1, 1, 1} \ No newline at end of file diff --git 
a/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/lpt.md b/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/lpt.md new file mode 100644 index 00000000000..0267e801004 --- /dev/null +++ b/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/lpt.md @@ -0,0 +1,319 @@ +# OpenVINO™ Low Precision Transformations {#openvino_docs_IE_DG_lpt} + +@sphinxdirective + +.. toctree:: + :maxdepth: 1 + :caption: Low Precision Transformations + :hidden: + + Low Precision Transformations + + Attributes + Step 1. Prerequisites transformations + Step 2. Markup transformations + Step 3. Main transformations + Step 4. Cleanup transformations + +@endsphinxdirective + +## Introduction +Low precision transformations (known as LPT) are a set of nGraph transformations, which are combined in one library. The library is a mandatory part of OpenVINO to infer quantized models in low precision with the maximum performance on Intel CPU, GPU and ARM platforms. The library includes more than 45 transformations and supports more than 30 operations. Some transformations are mandatory, some of them are optional and developed for a specific device. + +The goal of Low Precision Transformations (LPT) is to transform a quantized model from its original precision (FP16 or FP32) to a low precision (INT8: `signed int8` or `unsigned int8`), so that it is prepared for low precision inference in OpenVINO™ plugin. It is achieved by two main principles: +1. `FakeQuantize` operation decomposition to two parts: + - part #1: quantize operation - new `FakeQuantize` operation with output quantization intervals in low precision range (signed int8: [-128, 127] or [-127, 127], unsigned int8: [0, 255] or [0, 256]) and with low precision output (`signed int8` or `unsigned int8`), + - part #2: dequantization operations with low precision input and original precision output. +2. 
Propagation of the dequantization operation through original model's operations. It is done to avoid dequantization operations before original model operations, thus the quantize operations with low precision output remain before the original model operations. + +As result, operation input tensor precisions will be changed from original to low precision and operations can be inferred by OpenVINO™ plugin in low precision. + +For a more detailed description on how to quantize a model, see the [Low precision tools](#low-precision-tools) section below. For more information about model quantization, refer to **Brief History of Lower Precision in Deep Learning** section in [this whitepaper](https://software.intel.com/en-us/articles/lower-numerical-precision-deep-learning-inference-and-training). + +## Input model requirements + +LPT transformations propagate dequantization operations through the following operations: +* [Add-1](@ref openvino_docs_ops_arithmetic_Add_1) +* [AvgPool-1](@ref openvino_docs_ops_pooling_AvgPool_1) +* [Clamp-1](@ref openvino_docs_ops_activation_Clamp_1) +* [Concat-1](@ref openvino_docs_ops_movement_Concat_1) +* [Convolution-1](@ref openvino_docs_ops_convolution_Convolution_1) +* [ConvolutionBackpropData-1](@ref openvino_docs_ops_convolution_ConvolutionBackpropData_1) +* [DepthToSpace-1](@ref openvino_docs_ops_movement_DepthToSpace_1) +* [FakeQuantize-1](@ref openvino_docs_ops_quantization_FakeQuantize_1) +* [GroupConvolution-1](@ref openvino_docs_ops_convolution_GroupConvolution_1) +* [Interpolate-1](@ref openvino_docs_ops_image_Interpolate_1) +* [Interpolate-4](@ref openvino_docs_ops_image_Interpolate_4) +* [MatMul-1](@ref openvino_docs_ops_matrix_MatMul_1) +* [MaxPool-1](@ref openvino_docs_ops_pooling_MaxPool_1) +* [Multiply-1](@ref openvino_docs_ops_arithmetic_Multiply_1) +* [MVN-1](@ref openvino_docs_ops_normalization_MVN_1) +* [NormalizeL2-1](@ref openvino_docs_ops_normalization_NormalizeL2_1) +* [PRelu-1](@ref 
openvino_docs_ops_activation_PReLU_1) +* [ReduceMax-1](@ref openvino_docs_ops_reduction_ReduceMax_1) +* [ReduceMean-1](@ref openvino_docs_ops_reduction_ReduceMean_1) +* [ReduceMin-1](@ref openvino_docs_ops_reduction_ReduceMin_1) +* [ReduceSum-1](@ref openvino_docs_ops_reduction_ReduceSum_1) +* [Relu-1](@ref openvino_docs_ops_activation_ReLU_1) +* [Reshape-1](@ref openvino_docs_ops_shape_Reshape_1) +* [Split-1](@ref openvino_docs_ops_movement_Split_1) +* [Squeeze-1](@ref openvino_docs_ops_shape_Reshape_1) +* [StridedSlice-1](@ref openvino_docs_ops_movement_StridedSlice_1) +* [Transpose-1](@ref openvino_docs_ops_movement_Transpose_1) +* [Unsqueeze-1](@ref openvino_docs_ops_shape_Unsqueeze_1) +* [VariadicSplit-1](@ref openvino_docs_ops_movement_VariadicSplit_1) + +If operation is not supported by LPT then dequantization operation will not be propagated, input tensor precisions will not be changed to low precision and operation will be executed in original precision. + +For example, if you would like to infer a model with `Convolution` operation in low precision then the model can look as on picture below: + +![Quantized Convolution](img/model_fq_and_convolution.common.png) + +> There are several supported quantization approaches on activations and on weights. All supported approaches are described in [Quantization approaches](#quantization-approaches) section below. In demonstrated model [FakeQuantize operation quantization](#fakequantize-operation) approach is used. + +### Low precision tools +There are two tools to quantize a model: +1. [Post-Training Optimization Toolkit](@ref pot_docs_LowPrecisionOptimizationGuide) (POT) +2. [Neural Network Compression Framework](https://github.com/openvinotoolkit/nncf) (NNCF) + +Additionally, low precision transformations can handle ONNX quantized models. + +## Quantization approaches +LPT transformations support two quantization approaches: +1. `FakeQuantize` operation, +2. 
Quantize and dequantization operations + +Let's explore both approaches in detail on `Convolution` operation. ### FakeQuantize operation +In this case `FakeQuantize` operation is used on activations and quantized constant on weights. Original input model: + +![Original model with FakeQuantize](img/model_fq_and_convolution.common.png) + +### Quantize and dequantization operations +In this case `FakeQuantize` operation and `Convert` are used as quantize operation and return quantized low precision tensor. After quantize operation on activations there are `Convert` and dequantization operations to compensate decomposition. Original input model: + +![Original model with Q/DQ](img/model_qdq_and_convolution.common.png) + +In both cases the result is the same. In LPT result model you can see that: +1. if necessary, `FakeQuantize` operations on activations were decomposed to two parts: + - new `FakeQuantize` operation with updated output intervals in low precision range and low precision output, + - dequantization operations on activations; +2. if necessary, an existing `FakeQuantize` decomposition can be reworked to get better precision; +3. dequantization operations were propagated through `Convolution`. + +LPT result model: + +![Result model](img/model_fq_and_convolution.transformed.png) + +### Low precision transformations pipeline +LPT transformation pipeline has several steps. For each transformation inside one step pattern matcher is unique per transformation, but each operation can be assigned to several transformations. + +![Low precision transformations pipeline](img/low_precision_transformation_pipeline.png) + +Inside each step LPT transformations handle input model operation by operation, applying transformation matching pattern for each transformation from the step to an operation, and execute transformation if pattern is matched. Decomposition transformation decomposes `FakeQuantize` to quantize and dequantization operations. 
Dequantization operations from the previous transformation result are used for the current one and so on, until the end of the model is achieved. + +As a result, usually all operations are inferred by plugin in low precision. If plugin doesn't support an operation inference in low precision, then corresponding LPT transformation can be disabled, and input tensor precisions for the operation will not be changed. In this case the operation is inferred in the original precision. + +Low precision transformations pipeline includes four steps: +* [Step #1: Prerequisites](@ref openvino_docs_IE_DG_lpt_step1_prerequisites) +* [Step #2: Markup transformations](@ref openvino_docs_IE_DG_lpt_step2_markup) +* [Step #3: Main transformations](@ref openvino_docs_IE_DG_lpt_step3_main) +* [Step #4: Cleanup transformations](@ref openvino_docs_IE_DG_lpt_step4_cleanup) + +### Step 1. Prerequisites +This step fuses and propagates some operations in the model to prepare for the next step. It is required for OpenVINO plugins. Transformations: +* [PullReshapeThroughDequantization](@ref openvino_docs_IE_DG_lpt_PullReshapeThroughDequantization) +* [PullTransposeThroughDequantization](@ref openvino_docs_IE_DG_lpt_PullTransposeThroughDequantization) +* [LinOpSequenceFusion](@ref openvino_docs_IE_DG_lpt_LinOpSequenceFusion) + +The model on this step is changed. There are more details in developer guide [Prerequisites transformations](@ref openvino_docs_IE_DG_lpt_step1_prerequisites). + +### Step 2. Markup +This step creates runtime attributes for operations. These attributes will be used in the next step. 
Transformations: +* [MarkupCanBeQuantized](@ref openvino_docs_IE_DG_lpt_MarkupCanBeQuantized) +* [MarkupPrecisions](@ref openvino_docs_IE_DG_lpt_MarkupPrecisions) +* [MarkupPerTensorQuantization](@ref openvino_docs_IE_DG_lpt_MarkupPerTensorQuantization) +* [MarkupAvgPoolPrecisionPreserved](@ref openvino_docs_IE_DG_lpt_MarkupAvgPoolPrecisionPreserved) +* [PropagatePrecisions](@ref openvino_docs_IE_DG_lpt_PropagatePrecisions) +* [AlignQuantizationIntervals](@ref openvino_docs_IE_DG_lpt_AlignQuantizationIntervals) +* [AlignQuantizationParameters](@ref openvino_docs_IE_DG_lpt_AlignQuantizationParameters) + +The model on this step is changed: only new attributes are added to some operations. There are more details in developer guide [Markup transformations](@ref openvino_docs_IE_DG_lpt_step2_markup). + +### Step 3. Main transformations, FakeQuantize decomposition and dequantization operations handling +This step has the most transformations. These transformations can be separated in two groups: decomposition transformation and dequantization operations handling. There are more details in developer guide [Main transformations](@ref openvino_docs_IE_DG_lpt_step3_main). 
Transformations: +* [AddTransformation](@ref openvino_docs_IE_DG_lpt_AddTransformation) +* [AvgPoolTransformation](@ref openvino_docs_IE_DG_lpt_AvgPoolTransformation) +* [ClampTransformation](@ref openvino_docs_IE_DG_lpt_ClampTransformation) +* [ConcatTransformation](@ref openvino_docs_IE_DG_lpt_ConcatTransformation) +* [ConvolutionTransformation](@ref openvino_docs_IE_DG_lpt_ConvolutionTransformation) +* [ConvolutionBackpropDataTransformation](@ref openvino_docs_IE_DG_lpt_ConvolutionBackpropDataTransformation) +* [DepthToSpaceTransformation](@ref openvino_docs_IE_DG_lpt_DepthToSpaceTransformation) +* [FakeQuantizeDecompositionTransformation](@ref openvino_docs_IE_DG_lpt_FakeQuantizeDecompositionTransformation) +* [FakeQuantizeTransformation](@ref openvino_docs_IE_DG_lpt_FakeQuantizeTransformation) +* [InterpolateTransformation](@ref openvino_docs_IE_DG_lpt_InterpolateTransformation) +* [GroupConvolutionTransformation](@ref openvino_docs_IE_DG_lpt_GroupConvolutionTransformation) +* [MatMulTransformation](@ref openvino_docs_IE_DG_lpt_MatMulTransformation) +* [MaxPoolTransformation](@ref openvino_docs_IE_DG_lpt_MaxPoolTransformation) +* [MultiplyTransformation](@ref openvino_docs_IE_DG_lpt_MultiplyTransformation) +* [MVNTransformation](@ref openvino_docs_IE_DG_lpt_MVNTransformation) +* [NormalizeL2Transformation](@ref openvino_docs_IE_DG_lpt_NormalizeL2Transformation) +* [PReluTransformation](@ref openvino_docs_IE_DG_lpt_PReluTransformation) +* [ReduceMaxTransformation](@ref openvino_docs_IE_DG_lpt_ReduceMaxTransformation) +* [ReduceMeanTransformation](@ref openvino_docs_IE_DG_lpt_ReduceMeanTransformation) +* [ReduceMinTransformation](@ref openvino_docs_IE_DG_lpt_ReduceMinTransformation) +* [ReduceSumTransformation](@ref openvino_docs_IE_DG_lpt_ReduceSumTransformation) +* [ReluTransformation](@ref openvino_docs_IE_DG_lpt_ReluTransformation) +* [ReshapeTransformation](@ref openvino_docs_IE_DG_lpt_ReshapeTransformation) +* [SqueezeTransformation](@ref 
openvino_docs_IE_DG_lpt_SqueezeTransformation) +* [ShuffleChannelsTransformation](@ref openvino_docs_IE_DG_lpt_ShuffleChannelsTransformation) +* [SplitTransformation](@ref openvino_docs_IE_DG_lpt_SplitTransformation) +* [StridedSliceTransformation](@ref openvino_docs_IE_DG_lpt_StridedSliceTransformation) +* [TransposeTransformation](@ref openvino_docs_IE_DG_lpt_TransposeTransformation) +* [UnsqueezeTransformation](@ref openvino_docs_IE_DG_lpt_UnsqueezeTransformation) +* [VariadicSplitTransformation](@ref openvino_docs_IE_DG_lpt_VariadicSplitTransformation) + +#### Decomposition transformations +Decomposition transformations decompose the `FakeQuantize` operation to: quantize (`FakeQuantize` with low precision output) and dequantization operations (opposite to quantize, with low precision input and the original precision output). For dequantization operations LPT uses three operations: `Convert`, `Subtract` and `Multiply`. Element-wise operations `Subtract` and `Multiply` have constants on the second branches. If dequantization operations are not handled at the end of LPT pipeline, then they will be fused back to the `FakeQuantize`. + + +Original `FakeQuantize`: +![FakeQuantize operation before LPT](quantization/img/fq.common.png) + + +`FakeQuantize` after decomposition to quantization and dequantization operations: +![FakeQuantize operation after LPT](quantization/img/fq.transformed.png) + + +#### Dequantization operations handling transformations + +In this step, LPT transformations fuse dequantization operations or move them through existing model operations as much as possible. 
+ +Original `Convolution` operation in FP32 with dequantization operations before: +![Convolution operation before LPT](img/model_fq_and_convolution.common.png) + +`Convolution` operation in INT8 after decomposition and dequantization operations handling: +![Convolution operation after LPT](img/model_fq_and_convolution.transformed.png) + +### Step 4: Cleanup of the result model +LPT cleanup transformations is final stage in LPT pipeline. In this step LPT transformations clean up the result model to avoid not handled dequantization operations: fuse dequantization operations if possible (fuse at least `Convert` operations if not) to other model operations to cleanup result model. Transformations: +* [FoldConvertTransformation](@ref openvino_docs_IE_DG_lpt_FoldConvertTransformation) +* [FoldFakeQuantizeTransformation](@ref openvino_docs_IE_DG_lpt_FoldFakeQuantizeTransformation) +* [FuseConvertTransformation](@ref openvino_docs_IE_DG_lpt_FuseConvertTransformation) +* [FuseMultiplyToFakeQuantizeTransformation](@ref openvino_docs_IE_DG_lpt_FuseMultiplyToFakeQuantizeTransformation) +* [FuseSubtractToFakeQuantizeTransformation](@ref openvino_docs_IE_DG_lpt_FuseSubtractToFakeQuantizeTransformation) +* [MultiplyToGroupConvolutionTransformation](@ref openvino_docs_IE_DG_lpt_MultiplyToGroupConvolutionTransformation) + +There are more details in developer guide [Cleanup transformations](@ref openvino_docs_IE_DG_lpt_step4_cleanup). + +`FakeQuantize` operation with not handled dequantization operations: +![TODO: FakeQuantize operation with dequantization operations before LPT](quantization/img/fq.transformed.png) + +`FakeQuantize` operation with fused dequantization operations: +![TODO: FakeQuantize operation with fused operations after LPT](quantization/img/fq.common.png) + + + +## Low precision transformations in plugin transformation pipeline +Typical transformation pipeline described below. + +### Step 1. 
Common optimizations +This step is optional for LPT but typically is presented in OpenVINO™ plugins. The step doesn't use any LPT transformation. Firstly, the step disables dequantization operations constant folding on constant subgraph on weights to prevent the loss of dequantization info on the next plugin transformations. After that, it optimizes the nGraph function and converts operations to operation set 1. Typically, usage of this step is the simplest way to meet LPT requirements for the input quantized model. If plugin can guarantee that LPT input requirements are met, then this step can be skipped. + +@snippet snippets/lpt_mkldnn_plugin.cpp lpt_common + +### Step 2. Low precision transformations execution +This step is mandatory. It configures and runs LPT transformations. + +@snippet snippets/lpt_mkldnn_plugin.cpp lpt_execution + +### Step 3. Plugin-specific transformations +This step is optional. It modifies the nGraph function to a device-specific operation set. + +@snippet snippets/lpt_mkldnn_plugin.cpp lpt_device + +## Result model overview + +Let's explore quantized [TensorFlow* implementation of ResNet-50](https://github.com/openvinotoolkit/open_model_zoo/tree/master/models/public/resnet-50-tf) model. Use [Model Downloader](@ref omz_tools_downloader) tool to download the `fp16` model from [OpenVINO™ Toolkit - Open Model Zoo repository](https://github.com/openvinotoolkit/open_model_zoo): +```sh +./downloader.py --name resnet-50-tf --precisions FP16-INT8 +``` +After that you should quantize model by the [Model Quantizer](@ref omz_tools_downloader) tool. +```sh +./quantizer.py --model_dir public/resnet-50-tf --dataset_dir <DATASET_DIR> --precisions=FP16-INT8 +``` + +### Inference + +The simplest way to infer the model and collect performance counters is [Benchmark Application](../../../../samples/cpp/benchmark_app/README.md). 
+```sh +./benchmark_app -m resnet-50-tf.xml -d CPU -niter 1 -api sync -report_type average_counters -report_folder pc_report_dir +``` +If you infer the model with the OpenVINO™ CPU plugin and collect performance counters, all operations (except last not quantized SoftMax) are executed in INT8 precision. + +### Results analysis + +Result model depends on different factors: +* The original model quantization possibility and quantization quality. For some models, some operations are not possible to be quantized by POT and NNCF tools. In this case `FakeQuantize` operations are absent before these operations and they will be inferred in original precision. +* LPT customization and plugin supported operations. If plugin doesn't support INT8 inference for some operation then corresponding LPT transformation should be disabled and the operation will be inferred in original precision. + + +Information about layer precision is stored in the performance counters that are +available from the Inference Engine API. 
For example, the part of performance counters table for quantized [TensorFlow* implementation of ResNet-50](https://github.com/openvinotoolkit/open_model_zoo/tree/master/models/public/resnet-50-tf) model inference on CPU Plugin looks as follows: + + +| layerName | execStatus | layerType | execType | realTime (ms) | cpuTime (ms) | +| --------------------------------------------------------- | ---------- | ------------ | -------------------- | ------------- | ------------ | +| resnet\_model/batch\_normalization\_15/FusedBatchNorm/Add | EXECUTED | Convolution | jit\_avx512\_1x1\_I8 | 0.377 | 0.377 | +| resnet\_model/conv2d\_16/Conv2D/fq\_input\_0 | NOT\_RUN | FakeQuantize | undef | 0 | 0 | +| resnet\_model/batch\_normalization\_16/FusedBatchNorm/Add | EXECUTED | Convolution | jit\_avx512\_I8 | 0.499 | 0.499 | +| resnet\_model/conv2d\_17/Conv2D/fq\_input\_0 | NOT\_RUN | FakeQuantize | undef | 0 | 0 | +| resnet\_model/batch\_normalization\_17/FusedBatchNorm/Add | EXECUTED | Convolution | jit\_avx512\_1x1\_I8 | 0.399 | 0.399 | +| resnet\_model/add\_4/fq\_input\_0 | NOT\_RUN | FakeQuantize | undef | 0 | 0 | +| resnet\_model/add\_4 | NOT\_RUN | Eltwise | undef | 0 | 0 | +| resnet\_model/add\_5/fq\_input\_1 | NOT\_RUN | FakeQuantize | undef | 0 | 0 | + + +> The `execStatus` column of the table includes possible values: +> - `EXECUTED` - layer was executed by standalone primitive, +> - `NOT_RUN` - layer was not executed by standalone primitive or was fused with another operation and executed in another layer primitive. +> +> The `execType` column of the table includes inference primitives with specific suffixes. The layers have the following marks: +> * Suffix `I8` for layers that had 8-bit data type input and were computed in 8-bit precision +> * Suffix `FP32` for layers computed in 32-bit precision + +As result all operations (except not quantized `SoftMax` at the end of the model) in OpenVINO™ CPU plugin are inferred in low precision. 
Note, please, in the result model there are `FakeQuantize` operations in FP32 but the plugin responsibility is fuse these operations with previous operations. OpenVINO™ CPU plugin achieves maximum optimized inference for all operations by fusing INT8 `Convolution` with FP32 output with `FakeQuantize` operation with FP32 input and INT8 output. In this case OpenVINO™ CPU plugin uses INT8 and FP32 vectorized instructions but reports about one INT8 kernel usage for inference, which is the most optimized for this case. + +## Mixed precision +If LPT input model operation output has `fp16` precision then dequantization computations still occurs in `fp32` precision. This approach is used to avoid accuracy loss in `fp16` arithmetic computations. Note, the latest dequantization operation output has `fp16` precision. + +## Customization +Low Precision Transformations can be customizable. Build-in customization options: +* operation precision restrictions, +* operation per tensor quantization restrictions, +* update precisions, +* dequantization precision. + + +### Operation precision restrictions +This option defines precisions which allowed for the operation input ports. The option value is passed as input argument for `LowPrecision` constructor. For example: + +@snippet snippets/lpt_mkldnn_plugin.cpp lpt_supported_precisions + +In provided example in result model `Convolution` operation inputs must have specific precisions: `u8` (unsigned int8) precision on input 0 (on activations) and `i8` (signed int8) precision on input 1 (on weights). + +### Operation per tensor quantization restrictions +This option defines if operation supports per-tensor quantization only. The option value is passed as input argument for `LowPrecision` constructor. For example: + +@snippet snippets/lpt_mkldnn_plugin.cpp per_tensor_quantization + +In provided example in result model `Convolution` operations must have per-tensor quantization on input 0 (on activations). 
+ +### Update precisions +This option defines if each LPT transformation updates precision or not. The option value is boolean and is passed as `updatePrecisions` member of `LayerTransformation::Params` which is input argument for `LowPrecision` constructor. All transformations are affected. If `true` then low precision transformations update precisions to low precision and doesn't if `false`. Typically this option is used for plugin debugging. + +### Typical customization use cases + +Plugin specific customization can be implemented via nGraph transformation callbacks. For example: asymmetric quantization support can be easily customizable via `LayerTransformation::isAsymmetricQuantization` and `WeightableLayerTransformation::isAsymmetricOnWeights` methods usage in callbacks. For example: + +@snippet snippets/lpt_mkldnn_plugin.cpp asymmetric_quantization diff --git a/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/lpt_attributes.md b/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/lpt_attributes.md new file mode 100644 index 00000000000..ce567c746e7 --- /dev/null +++ b/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/lpt_attributes.md @@ -0,0 +1,56 @@ +# Attributes {#openvino_docs_IE_DG_lpt_attributes} + +@sphinxdirective + +.. 
toctree:: + :maxdepth: 1 + :caption: Attributes + :hidden: + + AvgPoolPrecisionPreserved + IntervalsAlignment + PerTensorQuantization + PrecisionPreserved + Precisions + QuantizationAlignment + +@endsphinxdirective + +## Introduction + +| Name | Target | Required | Mutable | +|-------------------------------------------------------------------------------------|------------------------|----------|---------| +| [AvgPoolPrecisionPreserved](@ref openvino_docs_IE_DG_lpt_AvgPoolPrecisionPreserved) | Precision | No | Yes | +| [IntervalsAlignment](@ref openvino_docs_IE_DG_lpt_IntervalsAlignment) | Quantization interval | Yes | Yes | +| [PerTensorQuantization](@ref openvino_docs_IE_DG_lpt_PerTensorQuantization) | Precision | Yes | No | +| [PrecisionPreserved](@ref openvino_docs_IE_DG_lpt_PrecisionPreserved) | Precision | Yes | Yes | +| [Precisions](@ref openvino_docs_IE_DG_lpt_Precisions) | Precision | Yes | Yes | +| [QuantizationAlignment](@ref openvino_docs_IE_DG_lpt_QuantizationAlignment) | Quantization alignment | Yes | Yes | + +> `Target` attribute group defines attribute usage during model transformation for the best performance: +> - `Precision` - the attribute defines the most optimal output port precision. +> - `Quantization interval` - the attribute defines quantization interval. +> - `Quantization alignment` - the attribute defines quantization alignment: per-channel or per-tensor quantization. +> +> `Required` attribute group defines if attribute usage is required to get an optimal model during transformation: +> - `Yes` - the attribute is used by all OpenVINO plugins for low-precision optimization. +> - `No` - the attribute is used in a specific OpenVINO plugin. +> +> `Mutable` attribute group defines if transformation can update an existing attribute: +> - `Yes` - the attribute can be updated by the next transformations in the pipeline. But attribute update order is still important. +> - `No` - existing attribute can not be updated by the next transformation. 
Previous handled transformation has optimized a model according to the current value. + +`FakeQuantize` decomposition is a mandatory part of low precision transformations. Attributes used during decomposition are mandatory. Optional attributes are required only for certain operations. + +Attributes usage by transformations: + +| Attribute name | Created by transformations | Used by transformations | +|---------------------------|---------------------------------------------------|-----------------------------------------------------------------------------------------------------------------------------------| +| PrecisionPreserved | MarkupPrecisions, MarkupAvgPoolPrecisionPreserved | AlignQuantizationIntervals, AlignQuantizationParameters, FakeQuantizeDecompositionTransformation, MarkupAvgPoolPrecisionPreserved | +| AvgPoolPrecisionPreserved | MarkupAvgPoolPrecisionPreserved | | +| Precisions | MarkupCanBeQuantized, MarkupPrecisions | FakeQuantizeDecompositionTransformation | +| PerTensorQuantization | MarkupPerTensorQuantization | | +| IntervalsAlignment | AlignQuantizationIntervals | FakeQuantizeDecompositionTransformation | +| QuantizationAlignment | AlignQuantizationParameters | FakeQuantizeDecompositionTransformation | + +> **Note:** the same type of attribute instances can be created in different transformations. This approach is the result of the transformation single-responsibility principle. For example, `Precision` attribute instances are created in `MarkupCanBeQuantized` and `MarkupPrecisions` transformations, but the reasons for their creation are different. 
\ No newline at end of file diff --git a/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/pipeline/img/step2_markup1.png b/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/pipeline/img/step2_markup1.png new file mode 100644 index 00000000000..813625f420b --- /dev/null +++ b/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/pipeline/img/step2_markup1.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3a79d152dae50fd3afaa78d8e18de7d279bb1c79b3e4d5c68fffed52a7c51b18 +size 383875 diff --git a/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/pipeline/img/step2_markup1.svg b/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/pipeline/img/step2_markup1.svg new file mode 100644 index 00000000000..21359eda17a --- /dev/null +++ b/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/pipeline/img/step2_markup1.svg @@ -0,0 +1 @@ +FakeQuantizename: fakeQuantize1levels: 256{f32} {1, 3, 299, 299}Parametername: input1{f32} {1, 3, 299, 299}Constant{f32} {1, 1, 1, 1}value: [-0.42667]Constant{f32} {1, 1, 1, 1}value: [0.42333]Constant{f32} {1, 1, 1, 1}value: [-0.42667]Constant{f32} {1, 1, 1, 1}value: [0.42333]ResultConcatname: concat1{f32} {1, 6, 299, 299}FakeQuantizename: fakeQuantize2levels: 256{f32} {1, 3, 299, 299}Parametername: input2{f32} {1, 3, 299, 299}Constant{f32} {1, 1, 1, 1}value: [-0.64]Constant{f32} {1, 1, 1, 1}value: [0.635]Constant{f32} {1, 1, 1, 1}value: [-0.64]Constant{f32} {1, 1, 1, 1}value: [0.635]FakeQuantizename: fakeQuantize3levels: 256{f32} {1, 3, 299, 299}Parametername: input3{f32} {1, 3, 299, 299}Constant{f32} {1, 1, 1, 1}value: -1.28]Constant{f32} {1, 1, 1, 1}value: [12.7]Constant{f32} {1, 1, 1, 1}value: [-1.28]Constant{f32} {1, 1, 1, 1}value: [1.27]ResultConcatname: concat2{f32} {1, 6, 299, 299}AvgPoolname: maxPool{f32} {1, 6, 299, 299}Convolutionname: 
convolution2{f32} {1, 9, 299, 299}Constant{i8} {9, 6, 1, 1}Dequantization on weightsMultiply{f32} {9, 6, 1, 1}Convert{f32} {9, 6, 1, 1}Constant{f32} {9, 1, 1, 1}Subtract{f32} {9, 6, 1, 1}Constant{i8} {9, 1, 1, 1}Convert{f32} {9, 1, 1, 1}Convolutionname: convolution1in0: Precisions {precisions: {}}In1: Precisions {precisions: {}}{f32} {1, 9, 299, 299}Constant{f32} {9, 6, 1, 1} \ No newline at end of file diff --git a/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/pipeline/img/step2_markup2.png b/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/pipeline/img/step2_markup2.png new file mode 100644 index 00000000000..a6ac9efadab --- /dev/null +++ b/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/pipeline/img/step2_markup2.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d54234622f538249dd5ccb5156cc10dd9b5bb40e800f6d1d906a0ff44ecabcf4 +size 388893 diff --git a/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/pipeline/img/step2_markup2.svg b/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/pipeline/img/step2_markup2.svg new file mode 100644 index 00000000000..d8d323becca --- /dev/null +++ b/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/pipeline/img/step2_markup2.svg @@ -0,0 +1 @@ +FakeQuantizename: fakeQuantize1levels: 256{f32} {1, 3, 299, 299}Parametername: input1{f32} {1, 3, 299, 299}Constant{f32} {1, 1, 1, 1}value: [-0.42667]Constant{f32} {1, 1, 1, 1}value: [0.42333]Constant{f32} {1, 1, 1, 1}value: [-0.42667]Constant{f32} {1, 1, 1, 1}value: [0.42333]ResultConcatname: concat1rt info: PrecisionPreserved{value: true}{f32} {1, 6, 299, 299}FakeQuantizename: fakeQuantize2levels: 256{f32} {1, 3, 299, 299}Parametername: input2{f32} {1, 3, 299, 299}Constant{f32} {1, 1, 1, 1}value: [-0.64]Constant{f32} {1, 1, 1, 1}value: [0.635]Constant{f32} {1, 1, 1, 1}value: 
[-0.64]Constant{f32} {1, 1, 1, 1}value: [0.635]FakeQuantizename: fakeQuantize3levels: 256{f32} {1, 3, 299, 299}Parametername: input3{f32} {1, 3, 299, 299}Constant{f32} {1, 1, 1, 1}value: -1.28]Constant{f32} {1, 1, 1, 1}value: [12.7]Constant{f32} {1, 1, 1, 1}value: [-1.28]Constant{f32} {1, 1, 1, 1}value: [1.27]ResultConcatname: concat2rt info: PrecisionPreserved{value: true}{f32} {1, 6, 299, 299}AvgPoolname: maxPool{f32} {1, 6, 299, 299}Convolutionname: convolution2in0: Precisions {precisions: {u8}}In1: Precisions {precisions: {i8}}{f32} {1, 9, 299, 299}Constant{i8} {9, 6, 1, 1}Dequantization on weightsMultiply{f32} {9, 6, 1, 1}Convert{f32} {9, 6, 1, 1}Constant{f32} {9, 1, 1, 1}Subtract{f32} {9, 6, 1, 1}Constant{i8} {9, 1, 1, 1}Convert{f32} {9, 1, 1, 1}Convolutionname: convolution1in0: Precisions {precisions: {}}In1: Precisions {precisions: {}}{f32} {1, 9, 299, 299}Constant{f32} {9, 6, 1, 1} \ No newline at end of file diff --git a/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/pipeline/img/step2_markup3.png b/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/pipeline/img/step2_markup3.png new file mode 100644 index 00000000000..cdf276757ed --- /dev/null +++ b/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/pipeline/img/step2_markup3.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3132bad01388adf7f788592538194bceb6b94f76f1c3788ffb73b76b19a74990 +size 393300 diff --git a/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/pipeline/img/step2_markup3.svg b/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/pipeline/img/step2_markup3.svg new file mode 100644 index 00000000000..80f3f0dea20 --- /dev/null +++ b/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/pipeline/img/step2_markup3.svg @@ -0,0 +1 @@ +FakeQuantizename: fakeQuantize1levels: 256{f32} {1, 3, 299, 
299}Parametername: input1{f32} {1, 3, 299, 299}Constant{f32} {1, 1, 1, 1}value: [-0.42667]Constant{f32} {1, 1, 1, 1}value: [0.42333]Constant{f32} {1, 1, 1, 1}value: [-0.42667]Constant{f32} {1, 1, 1, 1}value: [0.42333]ResultConcatname: concat1rt info: PrecisionPreserved{value: true}{f32} {1, 6, 299, 299}FakeQuantizename: fakeQuantize2levels: 256{f32} {1, 3, 299, 299}Parametername: input2{f32} {1, 3, 299, 299}Constant{f32} {1, 1, 1, 1}value: [-0.64]Constant{f32} {1, 1, 1, 1}value: [0.635]Constant{f32} {1, 1, 1, 1}value: [-0.64]Constant{f32} {1, 1, 1, 1}value: [0.635]FakeQuantizename: fakeQuantize3levels: 256{f32} {1, 3, 299, 299}Parametername: input3{f32} {1, 3, 299, 299}Constant{f32} {1, 1, 1, 1}value: -1.28]Constant{f32} {1, 1, 1, 1}value: [12.7]Constant{f32} {1, 1, 1, 1}value: [-1.28]Constant{f32} {1, 1, 1, 1}value: [1.27]ResultConcatname: concat2rt info: PrecisionPreserved{value: true}{f32} {1, 6, 299, 299}AvgPoolname: maxPool{f32} {1, 6, 299, 299}Convolutionname: convolution2in0: PerTensorQuantization, Precisions {precisions: {u8}}in1: Precisions {precisions: {i8}}{f32} {1, 9, 299, 299}Constant{i8} {9, 6, 1, 1}Dequantization on weightsMultiply{f32} {9, 6, 1, 1}Convert{f32} {9, 6, 1, 1}Constant{f32} {9, 1, 1, 1}Subtract{f32} {9, 6, 1, 1}Constant{i8} {9, 1, 1, 1}Convert{f32} {9, 1, 1, 1}Convolutionname: convolution1in0: PerTensorQuantization, Precisions {precisions: {}}in1: Precisions {precisions: {}}{f32} {1, 9, 299, 299}Constant{f32} {9, 6, 1, 1} \ No newline at end of file diff --git a/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/pipeline/img/step2_markup4.png b/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/pipeline/img/step2_markup4.png new file mode 100644 index 00000000000..f3164acd100 --- /dev/null +++ b/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/pipeline/img/step2_markup4.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:4f5a98e0ae8dc1f21dd0458ad9ed61de68b134e1128279c3e8b4e700ff3648f8 +size 398967 diff --git a/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/pipeline/img/step2_markup4.svg b/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/pipeline/img/step2_markup4.svg new file mode 100644 index 00000000000..60ecb5f9673 --- /dev/null +++ b/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/pipeline/img/step2_markup4.svg @@ -0,0 +1 @@ +FakeQuantizename: fakeQuantize1levels: 256{f32} {1, 3, 299, 299}Parametername: input1{f32} {1, 3, 299, 299}Constant{f32} {1, 1, 1, 1}value: [-0.42667]Constant{f32} {1, 1, 1, 1}value: [0.42333]Constant{f32} {1, 1, 1, 1}value: [-0.42667]Constant{f32} {1, 1, 1, 1}value: [0.42333]ResultConcatname: concat1rt info: PrecisionPreserved{value: true}{f32} {1, 6, 299, 299}FakeQuantizename: fakeQuantize2levels: 256{f32} {1, 3, 299, 299}Parametername: input2{f32} {1, 3, 299, 299}Constant{f32} {1, 1, 1, 1}value: [-0.64]Constant{f32} {1, 1, 1, 1}value: [0.635]Constant{f32} {1, 1, 1, 1}value: [-0.64]Constant{f32} {1, 1, 1, 1}value: [0.635]FakeQuantizename: fakeQuantize3levels: 256{f32} {1, 3, 299, 299}Parametername: input3{f32} {1, 3, 299, 299}Constant{f32} {1, 1, 1, 1}value: -1.28]Constant{f32} {1, 1, 1, 1}value: [12.7]Constant{f32} {1, 1, 1, 1}value: [-1.28]Constant{f32} {1, 1, 1, 1}value: [1.27]ResultConcatname: concat2rt info: PrecisionPreserved{value: true}{f32} {1, 6, 299, 299}AvgPoolname: maxPoolrt info: AvgPoolPrecisionPreserved{value: true}, PrecisionPreserved{value: true}{f32} {1, 6, 299, 299}Convolutionname: convolution2in0: PerTensorQuantization, Precisions{precisions: {u8}}In1: Precisions{precisions: {i8}}{f32} {1, 9, 299, 299}Constant{i8} {9, 6, 1, 1}Dequantization on weightsMultiply{f32} {9, 6, 1, 1}Convert{f32} {9, 6, 1, 1}Constant{f32} {9, 1, 1, 1}Subtract{f32} {9, 6, 1, 1}Constant{i8} {9, 1, 1, 1}Convert{f32} {9, 1, 1, 1}Convolutionname: convolution1in0: 
PerTensorQuantization, Precisions{precisions: {}}In1: Precisions{precisions: {}}{f32} {1, 9, 299, 299}Constant{f32} {9, 6, 1, 1} \ No newline at end of file diff --git a/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/pipeline/img/step2_markup5.png b/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/pipeline/img/step2_markup5.png new file mode 100644 index 00000000000..15231207176 --- /dev/null +++ b/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/pipeline/img/step2_markup5.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2618a80fd1be4d25dfc1f7e57e046a7844c9933a6fed316a0660c3051325557e +size 474998 diff --git a/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/pipeline/img/step2_markup5.svg b/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/pipeline/img/step2_markup5.svg new file mode 100644 index 00000000000..358a3ceb5c6 --- /dev/null +++ b/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/pipeline/img/step2_markup5.svg @@ -0,0 +1 @@ +FakeQuantizename: fakeQuantize1levels: 256{f32} {1,3,299,299} Precisions {precisions: {u8}}Parametername: input1{f32} {1, 3, 299, 299}Constant{f32} {1, 1, 1, 1}value: [-0.42667]Constant{f32} {1, 1, 1, 1}value: [0.42333]Constant{f32} {1, 1, 1, 1}value: [-0.42667]Constant{f32} {1, 1, 1, 1}value: [0.42333]ResultConcatname: concat1rt info: PrecisionPreserved{value: true}, Precisions {precisions: {u8}{f32} {1, 6, 299, 299}FakeQuantizename: fakeQuantize2levels: 256{f32} {1, 3, 299, 299} Precisions {precisions: {u8}}Parametername: input2{f32} {1, 3, 299, 299}Constant{f32} {1, 1, 1, 1}value: [-0.64]Constant{f32} {1, 1, 1, 1}value: [0.635]Constant{f32} {1, 1, 1, 1}value: [-0.64]Constant{f32} {1, 1, 1, 1}value: [0.635]FakeQuantizename: fakeQuantize3levels: 256{f32} {1, 3, 299, 299} Precisions {precisions: {u8}}Parametername: input3{f32} {1, 
3, 299, 299}Constant{f32} {1, 1, 1, 1}value: -1.28]Constant{f32} {1, 1, 1, 1}value: [12.7]Constant{f32} {1, 1, 1, 1}value: [-1.28]Constant{f32} {1, 1, 1, 1}value: [1.27]ResultConcatname: concat2rt info: PrecisionPreserved{value: true}, Precisions {precisions: {u8}{f32} {1, 6, 299, 299}AvgPoolname: maxPoolrt info: AvgPoolPrecisionPreserved{value: true}, PrecisionPreserved{value: true}, Precisions {precisions: {u8}}{f32} {1, 6, 299, 299}Convolutionname: convolution2in0: PerTensorQuantization, Precisions {precisions: {u8}}in1: Precisions {precisions: {i8}}{f32} {1, 9, 299, 299}Constant{i8} {9, 6, 1, 1}Dequantization on weightsMultiply{f32} {9, 6, 1, 1}Convert{f32} {9, 6, 1, 1}Constant{f32} {9, 1, 1, 1}Subtract{f32} {9, 6, 1, 1}Constant{i8} {9, 1, 1, 1}Convert{f32} {9, 1, 1, 1}Convolutionname: convolution1in0: PerTensorQuantization, Precisions {precisions: {}}in1: Precisions {precisions: {}}{f32} {1, 9, 299, 299}Constant{f32} {9, 6, 1, 1} \ No newline at end of file diff --git a/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/pipeline/img/step2_markup6.png b/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/pipeline/img/step2_markup6.png new file mode 100644 index 00000000000..00a33774ce6 --- /dev/null +++ b/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/pipeline/img/step2_markup6.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3b7750b3424540912ec590aa5b56cba9e4f2f9db6d45c23aed1d78d094321230 +size 488940 diff --git a/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/pipeline/img/step2_markup6.svg b/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/pipeline/img/step2_markup6.svg new file mode 100644 index 00000000000..c8834585723 --- /dev/null +++ b/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/pipeline/img/step2_markup6.svg @@ -0,0 +1 @@ +FakeQuantizename: 
fakeQuantize1levels: 256rt info: IntervalsAlignment{combined: { -1.28, 1.27 }, preferablePrecisions: {i8}}{f32} {1,3,299,299} Precisions{precisions: {u8}}Parametername: input1{f32} {1, 3, 299, 299}Constant{f32} {1, 1, 1, 1}value: [-0.42667]Constant{f32} {1, 1, 1, 1}value: [0.42333]Constant{f32} {1, 1, 1, 1}value: [-0.42667]Constant{f32} {1, 1, 1, 1}value: [0.42333]ResultConcatname: concat1rt info: IntervalsAlignment{combined: { -1.28, 1.27 }, preferablePrecisions: {i8}}, PrecisionPreserved{value: true}, Precisions {precisions: {u8}{f32} {1, 6, 299, 299}FakeQuantizename: fakeQuantize2levels: 256rt info: IntervalsAlignment{combined: { -1.28, 1.27 }, preferablePrecisions: {i8}}{f32} {1, 3, 299, 299} Precisions{precisions: {u8}}Parametername: input2{f32} {1, 3, 299, 299}Constant{f32} {1, 1, 1, 1}value: [-0.64]Constant{f32} {1, 1, 1, 1}value: [0.635]Constant{f32} {1, 1, 1, 1}value: [-0.64]Constant{f32} {1, 1, 1, 1}value: [0.635]FakeQuantizename: fakeQuantize3levels: 256rt info: IntervalsAlignment{combined: { -1.28, 1.27 }, preferablePrecisions: {i8}}{f32} {1, 3, 299, 299} Precisions{precisions: {u8}}Parametername: input3{f32} {1, 3, 299, 299}Constant{f32} {1, 1, 1, 1}value: -1.28]Constant{f32} {1, 1, 1, 1}value: [12.7]Constant{f32} {1, 1, 1, 1}value: [-1.28]Constant{f32} {1, 1, 1, 1}value: [1.27]ResultConcatname: concat2rt info: IntervalsAlignment{combined: { -1.28, 1.27 }, preferablePrecisions: {i8}}, PrecisionPreserved{value: true},Precisions{precisions: {u8}{f32} {1, 6, 299, 299}AvgPoolname: maxPoolrt info: IntervalsAlignment{combined: { -1.28, 1.27 }, preferablePrecisions: {i8}},AvgPoolPrecisionPreserved{value: true}, PrecisionPreserved{value: true}, Precisions{precisions: {u8}{f32} {1, 6, 299, 299}Convolutionname: convolution2in0: PerTensorQuantization, Precisions{precisions: {u8}}In1: Precisions {precisions: {i8}}{f32} {1, 9, 299, 299}Constant{i8} {9, 6, 1, 1}Dequantization on weightsMultiply{f32} {9, 6, 1, 1}Convert{f32} {9, 6, 1, 1}Constant{f32} {9, 1, 1, 
1}Subtract{f32} {9, 6, 1, 1}Constant{i8} {9, 1, 1, 1}Convert{f32} {9, 1, 1, 1}Convolutionname: convolution1in0: PerTensorQuantization, Precisions{precisions: {}}In1: Precisions {precisions: {}}{f32} {1, 9, 299, 299}Constant{f32} {9, 6, 1, 1} \ No newline at end of file diff --git a/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/pipeline/img/step2_markup7.png b/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/pipeline/img/step2_markup7.png new file mode 100644 index 00000000000..2724d138642 --- /dev/null +++ b/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/pipeline/img/step2_markup7.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7836c25a0db5a5f08adf5539fb5ee29f52bc7923148dc42f4c78d3354b7b8464 +size 520539 diff --git a/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/pipeline/img/step2_markup7.svg b/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/pipeline/img/step2_markup7.svg new file mode 100644 index 00000000000..625792de5c0 --- /dev/null +++ b/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/pipeline/img/step2_markup7.svg @@ -0,0 +1 @@ +FakeQuantizename: fakeQuantize1rt info: IntervalsAlignment{combined: { -1.28, 1.27 }, preferablePrecisions: {i8}}{f32} {1, 3, 299, 299}Precisions {precisions: {u8}}Parametername: input1{f32} {1, 3, 299, 299}Constant{f32} {1, 1, 1, 1}value: [-0.42667]Constant{f32} {1, 1, 1, 1}value: [0.42333]Constant{f32} {1, 1, 1, 1}value: [-0.42667]Constant{f32} {1, 1, 1, 1}value: [0.42333]ResultConcatname: concat1rt info: IntervalsAlignment{combined: { -1.28, 1.27 }, preferablePrecisions: {i8}}PrecisionPreserved{value: true},Precisions{precisions: {u8}},QuantizationAlignment{value: false}{f32} {1, 6, 299, 299}FakeQuantizename: fakeQuantize2rt info: IntervalsAlignment{combined: { -1.28, 1.27 }, preferablePrecisions: {i8}}{f32} {1, 3, 
299, 299}Precisions {precisions: {u8}}Parametername: input2{f32} {1, 3, 299, 299}Constant{f32} {1, 1, 1, 1}value: [-0.64]Constant{f32} {1, 1, 1, 1}value: [0.635]Constant{f32} {1, 1, 1, 1}value: [-0.64]Constant{f32} {1, 1, 1, 1}value: [0.635]FakeQuantizename: fakeQuantize3rt info: IntervalsAlignment{combined: { -1.28, 1.27 }, preferablePrecisions: {i8}}{f32} {1, 3, 299, 299}Precisions {precisions: {u8}}Parametername: input3{f32} {1, 3, 299, 299}Constant{f32} {1, 1, 1, 1}value: -1.28]Constant{f32} {1, 1, 1, 1}value: [12.7]Constant{f32} {1, 1, 1, 1}value: [-1.28]Constant{f32} {1, 1, 1, 1}value: [1.27]ResultConcatname: concat2rt info: IntervalsAlignment{combined: { -1.28, 1.27 }, preferablePrecisions: {i8}}PrecisionPreserved{value: true},Precisions {precisions: {u8}},QuantizationAlignment{value: true}{f32} {1, 6, 299, 299}AvgPoolname: maxPoolrt info: IntervalsAlignment{combined: { -1.28, 1.27 }, preferablePrecisions: {i8}}PrecisionPreserved{value: true}Precisions {precisions: {u8}}QuantizationAlignment{value: true}{f32} {1, 6, 299, 299}Convolutionname: convolution2in0: {f32}[1,6,7,7]: PerTensorQuantization, Precisions {precisions: {u8}}in1: {f32}[9,6,1,1]: Precisions {precisions: {i8}}{f32} {1, 6, 299, 299}Constant{i8} {9, 6, 1, 1}Dequantization on weightsMultiply{f32} {9, 6, 1, 1}Convert{f32} {9, 6, 1, 1}Constant{f32} {6, 1, 1, 1}Subtract{f32} {9, 6, 1, 1}Constant{i8} {6, 1, 1, 1}Convert{f32} {6, 1, 1, 1}Convolutionname: convolution1{f32} {1, 9, 299, 299}Constant{f32} {9, 6, 1, 1} \ No newline at end of file diff --git a/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/pipeline/img/step2_markup_original.png b/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/pipeline/img/step2_markup_original.png new file mode 100644 index 00000000000..3d0a7abe126 --- /dev/null +++ b/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/pipeline/img/step2_markup_original.png @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:911d9730e6762a9919fe3a48f0c87a44a5aeac97468f2d28c5174c13c69ad74b +size 351583 diff --git a/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/pipeline/img/step2_markup_original.svg b/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/pipeline/img/step2_markup_original.svg new file mode 100644 index 00000000000..3663d9f8898 --- /dev/null +++ b/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/pipeline/img/step2_markup_original.svg @@ -0,0 +1 @@ +FakeQuantizename: fakeQuantize1levels: 256{f32} {1, 3, 299, 299}Parametername: input1{f32} {1, 3, 299, 299}Constant{f32} {1, 1, 1, 1}value: [-0.42667]Constant{f32} {1, 1, 1, 1}value: [0.42333]Constant{f32} {1, 1, 1, 1}value: [-0.42667]Constant{f32} {1, 1, 1, 1}value: [0.42333]ResultConcatname: concat1{f32} {1, 6, 299, 299}FakeQuantizename: fakeQuantize2levels: 256{f32} {1, 3, 299, 299}Parametername: input2{f32} {1, 3, 299, 299}Constant{f32} {1, 1, 1, 1}value: [-0.64]Constant{f32} {1, 1, 1, 1}value: [0.635]Constant{f32} {1, 1, 1, 1}value: [-0.64]Constant{f32} {1, 1, 1, 1}value: [0.635]FakeQuantizename: fakeQuantize3levels: 256{f32} {1, 3, 299, 299}Parametername: input3{f32} {1, 3, 299, 299}Constant{f32} {1, 1, 1, 1}value: -1.28]Constant{f32} {1, 1, 1, 1}value: [12.7]Constant{f32} {1, 1, 1, 1}value: [-1.28]Constant{f32} {1, 1, 1, 1}value: [1.27]ResultConcatname: concat2{f32} {1, 6, 299, 299}AvgPoolname: maxPool{f32} {1, 6, 299, 299}Convolutionname: convolution2{f32} {1, 9, 299, 299}Constant{i8} {9, 6, 1, 1}Dequantization on weightsMultiply{f32} {9, 6, 1, 1}Convert{f32} {9, 6, 1, 1}Constant{f32} {9, 1, 1, 1}Subtract{f32} {9, 6, 1, 1}Constant{i8} {9, 1, 1, 1}Convert{f32} {9, 1, 1, 1}Convolutionname: convolution1{f32} {1, 9, 299, 299}Constant{f32} {9, 6, 1, 1} \ No newline at end of file diff --git 
a/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/pipeline/img/step3_original.png b/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/pipeline/img/step3_original.png new file mode 100644 index 00000000000..7c06e5b0f1f --- /dev/null +++ b/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/pipeline/img/step3_original.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:06caa4dc97b00f150395abc230bc90822f3bfa4e0bb3b65019f111a5a40e1d1c +size 520155 diff --git a/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/pipeline/img/step3_original.svg b/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/pipeline/img/step3_original.svg new file mode 100644 index 00000000000..69717ff6fce --- /dev/null +++ b/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/pipeline/img/step3_original.svg @@ -0,0 +1 @@ +FakeQuantizename: fakeQuantize1rt info: IntervalsAlignment{combined: { -1.28, 1.27 }, preferablePrecisions: {i8}}{f32} {1, 3, 299, 299}Precisions {precisions: {u8}}Parametername: input1{f32} {1, 3, 299, 299}Constant{f32} {1, 1, 1, 1}value: [-0.42667]Constant{f32} {1, 1, 1, 1}value: [0.42333]Constant{f32} {1, 1, 1, 1}value: [-0.42667]Constant{f32} {1, 1, 1, 1}value: [0.42333]ResultConcatname: concat1rt info: IntervalsAlignment{combined: { -1.28, 1.27 }, preferablePrecisions: {i8}}PrecisionPreserved{value: true},Precisions{precisions: {u8}},QuantizationAlignment{value: false}{f32} {1, 6, 299, 299}FakeQuantizename: fakeQuantize2rt info: IntervalsAlignment{combined: { -1.28, 1.27 }, preferablePrecisions: {i8}}{f32} {1, 3, 299, 299}Precisions {precisions: {u8}}Parametername: input2{f32} {1, 3, 299, 299}Constant{f32} {1, 1, 1, 1}value: [-0.64]Constant{f32} {1, 1, 1, 1}value: [0.635]Constant{f32} {1, 1, 1, 1}value: [-0.64]Constant{f32} {1, 1, 1, 1}value: [0.635]FakeQuantizename: fakeQuantize3rt 
info: IntervalsAlignment{combined: { -1.28, 1.27 }, preferablePrecisions: {i8}}{f32} {1, 3, 299, 299}Precisions {precisions: {u8}}Parametername: input3{f32} {1, 3, 299, 299}Constant{f32} {1, 1, 1, 1}value: -1.28]Constant{f32} {1, 1, 1, 1}value: [12.7]Constant{f32} {1, 1, 1, 1}value: [-1.28]Constant{f32} {1, 1, 1, 1}value: [1.27]ResultConcatname: concat2rt info: IntervalsAlignment{combined: { -1.28, 1.27 }, preferablePrecisions: {i8}}PrecisionPreserved{value: true},Precisions {precisions: {u8}},QuantizationAlignment{value: true}{f32} {1, 6, 299, 299}AvgPoolname: maxPoolrt info: IntervalsAlignment{combined: { -1.28, 1.27 }, preferablePrecisions: {i8}}PrecisionPreserved{value: true}Precisions {precisions: {u8}}QuantizationAlignment{value: true}{f32} {1, 6, 299, 299}Convolutionname: convolutionin0: {f32}[1,6,7,7]: PerTensorQuantization, Precisions {precisions: {u8}}in1: {f32}[9,6,1,1]: Precisions {precisions: {i8}}{f32} {1, 6, 299, 299}Constant{i8} {9, 6, 1, 1}Dequantization on weightsMultiply{f32} {9, 6, 1, 1}Convert{f32} {9, 6, 1, 1}Constant{f32} {6, 1, 1, 1}Subtract{f32} {9, 6, 1, 1}Constant{i8} {6, 1, 1, 1}Convert{f32} {6, 1, 1, 1}Convolutionname: convolution1{f32} {1, 9, 299, 299}Constant{f32} {9, 6, 1, 1} \ No newline at end of file diff --git a/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/pipeline/img/step3_transformed.png b/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/pipeline/img/step3_transformed.png new file mode 100644 index 00000000000..cf65091c10b --- /dev/null +++ b/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/pipeline/img/step3_transformed.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8f19d8f068afa4aa62fc04cfa0d2678e6bfe3f90c164a08f588bff9685854030 +size 661189 diff --git a/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/pipeline/img/step3_transformed.svg 
b/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/pipeline/img/step3_transformed.svg new file mode 100644 index 00000000000..3b90c028118 --- /dev/null +++ b/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/pipeline/img/step3_transformed.svg @@ -0,0 +1 @@ +Dequantizations on branch #2INT8 ConvolutionFakeQuantizename: fakeQuantize1rt info: IntervalsAlignment{combined: { -1.28, 1.27 }, preferablePrecisions: {i8}}{u8} {1, 3, 299, 299}Precisions {precisions: {u8}}Parametername: input1{f32} {1, 3, 299, 299}Constant{f32} {1, 1, 1, 1}value: [-0.42667]Constant{f32} {1, 1, 1, 1}value: [0.42333]Constant{f32} {1, 1, 1, 1}value: [0.0]Constant{f32} {1, 1, 1, 1}value: [255.0]ResultConcatname: concat1rt info: IntervalsAlignment{combined: { -1.28, 1.27 }, preferablePrecisions: {i8}}PrecisionPreserved{value: true},Precisions{precisions: {u8}},QuantizationAlignment{value: false}{u8} {1, 6, 299, 299}FakeQuantizename: fakeQuantize2rt info: IntervalsAlignment{combined: { -1.28, 1.27 }, preferablePrecisions: {i8}}{u8} {1, 3, 299, 299}Precisions {precisions: {u8}}Parametername: input2{f32} {1, 3, 299, 299}Constant{f32} {1, 1, 1, 1}value: [-0.64]Constant{f32} {1, 1, 1, 1}value: [0.635]Constant{f32} {1, 1, 1, 1}value: [64]Constant{f32} {1, 1, 1, 1}value: [192]FakeQuantizename: fakeQuantize3rt info: IntervalsAlignment{combined: { -1.28, 1.27 }, preferablePrecisions: {i8}}{u8} {1, 3, 299, 299}Precisions {precisions: {u8}}Parametername: input3{f32} {1, 3, 299, 299}Constant{f32} {1, 1, 1, 1}value: -1.28]Constant{f32} {1, 1, 1, 1}value: [12.7]Constant{f32} {1, 1, 1, 1}value: [0.0]Constant{f32} {1, 1, 1, 1}value: [255.0]ResultConcatname: concat2rt info: IntervalsAlignment{combined: { -1.28, 1.27 }, preferablePrecisions: {i8}}PrecisionPreserved{value: true},Precisions {precisions: {u8}},QuantizationAlignment{value: true}{u8} {1, 6, 299, 299}AvgPoolname: maxPoolrt info: IntervalsAlignment{combined: { -1.28, 1.27 }, preferablePrecisions: 
{i8}}PrecisionPreserved{value: true}Precisions {precisions: {u8}}QuantizationAlignment{value: true}{u8} {1, 6, 299, 299}Convolutionname: convolutionin0: {f32}[1,6,7,7]: PerTensorQuantization, Precisions {precisions: {u8}}in1: {f32}[9,6,1,1]: Precisions {precisions: {i8}}{f32} {1, 6, 299, 299}Constant{i8} {9, 6, 1, 1}Convolutionname: convolution1{f32} {1, 9, 299, 299}Constant{f32} {9, 6, 1, 1}Dequantizations on branch #1Multiply{f32} {1, 6, 299, 299}Convert{f32} {1, 6, 299, 299}Constant{f32} {1, 6, 1, 1}Subtract{f32} {1, 6, 299, 299}Subtract{f32} {1, 6, 299, 299}Constant{u8} {1, 6, 1, 1}Constant{f32} {1, 6, 1, 1}Multiply{f32} {1, 6, 299, 299}Constant{f32} {1, 6, 1, 1}Subtract{f32} {9, 6, 1, 1}Constant{i8} {6, 1, 1, 1}Zero point on activationsZero point on weights \ No newline at end of file diff --git a/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/pipeline/step1_prerequisites.md b/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/pipeline/step1_prerequisites.md new file mode 100644 index 00000000000..71d082054cd --- /dev/null +++ b/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/pipeline/step1_prerequisites.md @@ -0,0 +1,6 @@ +# Step 1. Prerequisites Transformations {#openvino_docs_IE_DG_lpt_step1_prerequisites} + +Prerequisites transformations are optional. The transformations prepare a model before running other low precision transformations. The transformations do not operate with dequantization operations or update precisions. 
Prerequisites transformations include: +* [PullReshapeThroughDequantization](@ref openvino_docs_IE_DG_lpt_PullReshapeThroughDequantization) +* [PullTransposeThroughDequantization](@ref openvino_docs_IE_DG_lpt_PullTransposeThroughDequantization) +* [LinOpSequenceFusion](@ref openvino_docs_IE_DG_lpt_LinOpSequenceFusion) \ No newline at end of file diff --git a/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/pipeline/step2_markup.md b/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/pipeline/step2_markup.md new file mode 100644 index 00000000000..8d32ffef000 --- /dev/null +++ b/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/pipeline/step2_markup.md @@ -0,0 +1,140 @@ +# Step 2. Markup Transformations {#openvino_docs_IE_DG_lpt_step2_markup} + +This step defines the optimal `FakeQuantize` decomposition precisions for the best inference performance via operations markup with runtime attribute instances. Attributes are created for input and output ports and operations. Transformations do not change the operation output port precisions. A model markup low precision logic is decomposed and implemented into the following common markup transformations. The order of transformations is important: + +1. [MarkupCanBeQuantized](@ref openvino_docs_IE_DG_lpt_MarkupCanBeQuantized) +2. [MarkupPrecisions](@ref openvino_docs_IE_DG_lpt_MarkupPrecisions) +3. [MarkupPerTensorQuantization](@ref openvino_docs_IE_DG_lpt_MarkupPerTensorQuantization) +4. [MarkupAvgPoolPrecisionPreserved](@ref openvino_docs_IE_DG_lpt_MarkupAvgPoolPrecisionPreserved) +5. [PropagatePrecisions](@ref openvino_docs_IE_DG_lpt_PropagatePrecisions) +6. [AlignQuantizationIntervals](@ref openvino_docs_IE_DG_lpt_AlignQuantizationIntervals) +7. 
[AlignQuantizationParameters](@ref openvino_docs_IE_DG_lpt_AlignQuantizationParameters) + +The table of transformations and used attributes: + +| Transformation name | Create attributes | Use attributes | +|---------------------------------|-------------------------------|-------------------------------------------| +| MarkupCanBeQuantized | Precisions | | +| MarkupPrecisions | Precisions,PrecisionPreserved | | +| MarkupPerTensorQuantization | PerTensorQuantization | | +| MarkupAvgPoolPrecisionPreserved | AvgPoolPrecisionPreserved | Precisions, PrecisionPreserved | +| PropagatePrecisions | Precisions | Precisions, PrecisionPreserved | +| AlignQuantizationIntervals | IntervalsAlignment | PrecisionPreserved | +| AlignQuantizationParameters | QuantizationAlignment | PrecisionPreserved, PerTensorQuantization | + +> **Note:** the same type of attribute instances can be created in different transformations. This approach is the result of the transformation single-responsibility principle. For example, `Precision` attribute instances are created in `MarkupCanBeQuantized` and `MarkupPrecisions` transformations, but the reasons for their creation are different + +Common markup transformations can be decomposed into simpler utility markup transformations. 
The order of Markup utility transformations is not important: +* [CreateAttribute](@ref openvino_docs_IE_DG_lpt_CreateAttribute) +* [CreatePrecisionsDependentAttribute](@ref openvino_docs_IE_DG_lpt_CreatePrecisionsDependentAttribute) +* [PropagateThroughPrecisionPreserved](@ref openvino_docs_IE_DG_lpt_PropagateThroughPrecisionPreserved) +* [PropagateToInput](@ref openvino_docs_IE_DG_lpt_PropagateToInput) +* [UpdateSharedPrecisionPreserved](@ref openvino_docs_IE_DG_lpt_UpdateSharedPrecisionPreserved) + +Let's explore all transformations and their relations in detail, using one and the same model: + +![](img/step2_markup_original.png) + +The original model key features: +* The first `concat1` concatenation operation has not quantized `convolution1` consumer. +* The second `concat2` concatenation operation has quantized `convolution2` consumer with requirements: + - support `unsigned int8` on activations, + - per-tensor quantization. +* Between the `concat2` concatenation operation and `Convolution` there is an `AvgPool` operation, which mathematically should return an `f32` tensor. But the `MarkupAvgPoolPrecisionPreserved` transformation is active. This allows the low precision transformation, that goes after the `AvgPool`, to propagate low precision tensor to the next consumer. + +Transformations are run with the following parameters: + +@snippet snippets/lpt_mkldnn_plugin.cpp lpt_markup_pipeline + +## 1. MarkupCanBeQuantized +The transformation marks operations that cannot be quantized. No attributes are required before the transformation. + +Changes in the example model after `MarkupCanBeQuantized` transformation: +* Not quantized `convolution1` operation is marked by the `Precisions` attribute with empty values. This attribute allows the next transformation to ignore not quantized operation. 
+ +Result model: + +![MarkupCanBeQuantized](img/step2_markup1.png) + +Model display features (here and below): +* The attributes added by the current transformation are marked in bold. +* If attributes do not fit into one line, then one line consists of only one attribute. + +## 2. MarkupPrecisions +The transformation is required and includes two tasks: +1. Mark operation input ports (create `Precision` attribute instance) by provided restrictions: input port index and required precisions. Restrictions are provided as input argument in `ngraph::pass::low_precision::LowPrecision` constructor. +2. Mark precision preserved operations. + +No attributes are required before the transformation. Changes in the example model after `MarkupPrecisions` transformation: +* Both concatenation operations are marked as precision preserved operations. It allows to propagate precision via these operations. +* Quantized `convolution2` operation is marked by the `Precisions` attribute with `u8` precision on activations and `i8` precisions on weights according to the provided restrictions. This attribute instance allows to specify which precisions are required for quantized `Convolution` operation. + +Result model: + +![MarkupPrecisions result](img/step2_markup2.png) + +## 3. MarkupPerTensorQuantization +The transformation is required and marks operations (create `PerTensorQuantization` attribute instance) by provided restrictions: an operation that requires per-tensor quantization. No attributes are required before the transformation. + +Changes in the example model after `MarkupPerTensorQuantization` transformation: +* both `Convolution` operations are marked by `PerTensorQuantization` + +Result model: + +![MarkupPerTensorQuantization result](img/step2_markup3.png) + +## 4. MarkupAvgPoolPrecisionPreserved +The transformation is optional. `MarkupAvgPoolPrecisionPreserved` marks `AvgPool` operations as precision preserved or not precision preserved. 
`AvgPool` operation is precision preserved if the next non-precision-preserved operation can be inferred in low precision. In other words, `AvgPool` operations become precision preserved operations to speed up model inference. The transformation uses `PrecisionPreserved` attributes created before. The transformation is combined and uses: +* CreatePrecisionsDependentAttribute +* PropagateThroughPrecisionPreserved +* UpdateSharedPrecisionPreserved + +Changes in the example model after `MarkupAvgPoolPrecisionPreserved` transformation: +* `AvgPool` operations are marked by `PrecisionPreserved` and `AvgPoolPrecisionPreserved` (not used below). + +Result model: + +![MarkupAvgPoolPrecisionPreserved](img/step2_markup4.png) + +## 5. PropagatePrecisions +The transformation is required. `PropagatePrecisions` is a key transformation in the markup pipeline, which marks `FakeQuantize` output port precisions. The transformation uses `PrecisionPreserved` attribute instances created before. The transformation is combined and uses: + +* CreateAttribute +* PropagateThroughPrecisionPreserved +* PropagateToInput + +Changes in the example model after `PropagatePrecisions` transformation: +* All precision preserved operations are marked by the `Precisions` attribute instance, which defines the required precision for the operation. +* `FakeQuantize` operation output ports are marked by `Precisions` attribute instances, which define target precision for decomposition. In the sample model, `FakeQuantize` operations have signed intervals, but the `Precisions` attributes are initialized by `u8` (`unsigned int8`) values as a result of the restrictions applied during transformations for `Convolution` operations. + +Result model: + +![PropagatePrecisions](img/step2_markup5.png) + +> **NOTE**: `AlignQuantizationIntervals` and `AlignQuantizationParameters` transformations are required if the model has quantized concatenation operations. + +## 6.
AlignQuantizationIntervals +The transformation is required for models with the quantized operation. The transformation marks `FakeQuantize` operation and precision preserved consumers to combine quantization information from different `FakeQuantize` operations for future quantization intervals alignment. The transformation is combined and uses: +* CreateAttribute +* PropagateThroughPrecisionPreserved + +Changes in the example model after `AlignQuantizationIntervals` transformation: +* All `FakeQuantize` operations and their precision preserved consumers are marked by the `IntervalsAlignment` attribute instance. + +Result model: + +![AlignQuantizationIntervals](img/step2_markup6.png) + +## 7. AlignQuantizationParameters +The transformation is required for models with quantized concatenation operation. The transformation marks `FakeQuantize` precision preserved consumers to align quantization intervals. The transformation is combined and uses: +* CreateAttribute +* PropagateThroughPrecisionPreserved +* UpdateSharedPrecisionPreserved + + +Changes in the example model after `AlignQuantizationParameters` transformation: +* All `FakeQuantize` precision preserved consumers are marked by `QuantizationAlignment` attribute instance. `convolution1` input ports are marked by `Precisions` attribute instances with empty precisions collection. As a result, the `convolution1` operation was detected as not quantized, and the `QuantizationAlignment` attribute default value `false` does not change. `convolution2` input ports are marked by `Precisions` attribute instances with not empty precisions collection. `convolution2` operation was detected as quantized with the `PerTensorQuantization` attribute, and the `QuantizationAlignment` attribute default value changed to `true`. 
+ +Final model: + +![AlignQuantizationParameters](img/step2_markup7.png) diff --git a/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/pipeline/step3_main.md b/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/pipeline/step3_main.md new file mode 100644 index 00000000000..81a07a82125 --- /dev/null +++ b/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/pipeline/step3_main.md @@ -0,0 +1,49 @@ +# Step 3. Main Transformations {#openvino_docs_IE_DG_lpt_step3_main} + +Main transformations are the majority of low precision transformations. Transformations operate with dequantization operations. Main transformations include: +* [AddTransformation](@ref openvino_docs_IE_DG_lpt_AddTransformation) +* [AvgPoolTransformation](@ref openvino_docs_IE_DG_lpt_AvgPoolTransformation) +* [ClampTransformation](@ref openvino_docs_IE_DG_lpt_ClampTransformation) +* [ConcatTransformation](@ref openvino_docs_IE_DG_lpt_ConcatTransformation) +* [ConvolutionTransformation](@ref openvino_docs_IE_DG_lpt_ConvolutionTransformation) +* [ConvolutionBackpropDataTransformation](@ref openvino_docs_IE_DG_lpt_ConvolutionBackpropDataTransformation) +* [DepthToSpaceTransformation](@ref openvino_docs_IE_DG_lpt_DepthToSpaceTransformation) +* [FakeQuantizeDecompositionTransformation](@ref openvino_docs_IE_DG_lpt_FakeQuantizeDecompositionTransformation) +* [FakeQuantizeTransformation](@ref openvino_docs_IE_DG_lpt_FakeQuantizeTransformation) +* [InterpolateTransformation](@ref openvino_docs_IE_DG_lpt_InterpolateTransformation) +* [GroupConvolutionTransformation](@ref openvino_docs_IE_DG_lpt_GroupConvolutionTransformation) +* [MatMulTransformation](@ref openvino_docs_IE_DG_lpt_MatMulTransformation) +* [MaxPoolTransformation](@ref openvino_docs_IE_DG_lpt_MaxPoolTransformation) +* [MultiplyTransformation](@ref openvino_docs_IE_DG_lpt_MultiplyTransformation) +* [MVNTransformation](@ref
openvino_docs_IE_DG_lpt_MVNTransformation) +* [NormalizeL2Transformation](@ref openvino_docs_IE_DG_lpt_NormalizeL2Transformation) +* [PReluTransformation](@ref openvino_docs_IE_DG_lpt_PReluTransformation) +* [ReduceMaxTransformation](@ref openvino_docs_IE_DG_lpt_ReduceMaxTransformation) +* [ReduceMeanTransformation](@ref openvino_docs_IE_DG_lpt_ReduceMeanTransformation) +* [ReduceMinTransformation](@ref openvino_docs_IE_DG_lpt_ReduceMinTransformation) +* [ReduceSumTransformation](@ref openvino_docs_IE_DG_lpt_ReduceSumTransformation) +* [ReluTransformation](@ref openvino_docs_IE_DG_lpt_ReluTransformation) +* [ReshapeTransformation](@ref openvino_docs_IE_DG_lpt_ReshapeTransformation) +* [SqueezeTransformation](@ref openvino_docs_IE_DG_lpt_SqueezeTransformation) +* [ShuffleChannelsTransformation](@ref openvino_docs_IE_DG_lpt_ShuffleChannelsTransformation) +* [SplitTransformation](@ref openvino_docs_IE_DG_lpt_SplitTransformation) +* [StridedSliceTransformation](@ref openvino_docs_IE_DG_lpt_StridedSliceTransformation) +* [TransposeTransformation](@ref openvino_docs_IE_DG_lpt_TransposeTransformation) +* [UnsqueezeTransformation](@ref openvino_docs_IE_DG_lpt_UnsqueezeTransformation) +* [VariadicSplitTransformation](@ref openvino_docs_IE_DG_lpt_VariadicSplitTransformation) + +Let's explore some main transformations on the example model. Original model: + +![Original model](img/step3_original.png) + +Result model after main transformations: + +![Original model](img/step3_transformed.png) + +Changes in the example model after main transformation: +* All `FakeQuantize` operations (`fakeQuantize1`, `fakeQuantize2` and `fakeQuantize3`) were decomposed: + - original `FakeQuantize` operations were replaced with new operations with other output intervals and output port precision, + - dequantization operations. +* Dequantization operations were moved via precision preserved (`concat1` and `concat2`) and quantized (`convolution2`) operations. 
+ +> **Note:** the left branch (branch #1) does not require per-tensor quantization. As a result, the `fakeQuantize1` output interval is [0, 255]. But quantized `convolution2` requires per-tensor quantization on the right branch (branch #2). Then all connected `FakeQuantize` interval operations (`fakeQuantize1` and `fakeQuantize2`) are aligned to have per-tensor quantization after the concatenation (`concat2`) operation. diff --git a/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/pipeline/step4_cleanup.md b/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/pipeline/step4_cleanup.md new file mode 100644 index 00000000000..0b4913273c6 --- /dev/null +++ b/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/pipeline/step4_cleanup.md @@ -0,0 +1,8 @@ +# Step 4. Cleanup Transformations {#openvino_docs_IE_DG_lpt_step4_cleanup} + +* [FoldConvertTransformation](@ref openvino_docs_IE_DG_lpt_FoldConvertTransformation) +* [FoldFakeQuantizeTransformation](@ref openvino_docs_IE_DG_lpt_FoldFakeQuantizeTransformation) +* [FuseConvertTransformation](@ref openvino_docs_IE_DG_lpt_FuseConvertTransformation) +* [FuseMultiplyToFakeQuantizeTransformation](@ref openvino_docs_IE_DG_lpt_FuseMultiplyToFakeQuantizeTransformation) +* [FuseSubtractToFakeQuantizeTransformation](@ref openvino_docs_IE_DG_lpt_FuseSubtractToFakeQuantizeTransformation) +* [MultiplyToGroupConvolutionTransformation](@ref openvino_docs_IE_DG_lpt_MultiplyToGroupConvolutionTransformation) \ No newline at end of file diff --git a/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/quantization/img/fq.common.png b/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/quantization/img/fq.common.png new file mode 100644 index 00000000000..7cacc57924a --- /dev/null +++ b/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/quantization/img/fq.common.png @@ -0,0 +1,3
@@ +version https://git-lfs.github.com/spec/v1 +oid sha256:288dec05908449cc3fa5e07700fac5cbdff17bb4b4035a4ee83c44cbc6c22c70 +size 59664 diff --git a/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/quantization/img/fq.common.svg b/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/quantization/img/fq.common.svg new file mode 100644 index 00000000000..056a1424ba7 --- /dev/null +++ b/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/quantization/img/fq.common.svg @@ -0,0 +1 @@ +FakeQuantizeFakeQuantizelevels: 256{f32} {1, 3, 299, 299}Parameter{f32} {1, 3, 299, 299}Constant{f32} {1, 1, 1, 1}value:[-12.8]Constant{f32} {1, 1, 1, 1}value:[12.7]Constant{f32} {1, 1, 1, 1}value:[-12.8]Constant{f32} {1, 1, 1, 1}value:[12.7]Result \ No newline at end of file diff --git a/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/quantization/img/fq.transformed.png b/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/quantization/img/fq.transformed.png new file mode 100644 index 00000000000..34967b7e05d --- /dev/null +++ b/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/quantization/img/fq.transformed.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8e345c0b2b5fe365ed298d40d3add4b06a8106096186f68dccb5131c01194e72 +size 102546 diff --git a/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/quantization/img/fq.transformed.svg b/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/quantization/img/fq.transformed.svg new file mode 100644 index 00000000000..2eae59b9572 --- /dev/null +++ b/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/quantization/img/fq.transformed.svg @@ -0,0 +1 @@ +DequantizationQuantizationFakeQuantizelevels: 256{u8} {1, 3, 299, 299}Parameter{f32} {1, 3, 299, 299}Multiply{f32} {1, 3, 
299, 299}Constant{f32} {1, 1, 1, 1}value:[-12.8]Constant{f32} {1, 1, 1, 1}value:[12.7]Constant{f32} {1, 1, 1, 1}value:[0]Constant{f32} {1, 1, 1, 1}value:[255]Constant{f32} {}ResultConvert{f32} {1, 3, 299, 299}Subtract{f32} {1, 3, 299, 299}Constant{f32} {} \ No newline at end of file diff --git a/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step1_prerequisites/convert_subtract_constant.md b/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step1_prerequisites/convert_subtract_constant.md new file mode 100644 index 00000000000..49011c482f5 --- /dev/null +++ b/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step1_prerequisites/convert_subtract_constant.md @@ -0,0 +1,3 @@ +# ConvertSubtractConstant transformation {#openvino_docs_IE_DG_lpt_ConvertSubtractConstant} + +ngraph::pass::low_precision::ConvertSubtractConstant class represents the `ConvertSubtractConstant` transformation. diff --git a/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step1_prerequisites/lin_op_sequence_fusion.md b/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step1_prerequisites/lin_op_sequence_fusion.md new file mode 100644 index 00000000000..14e23a61758 --- /dev/null +++ b/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step1_prerequisites/lin_op_sequence_fusion.md @@ -0,0 +1,5 @@ +# LinOpSequenceFusion transformation {#openvino_docs_IE_DG_lpt_LinOpSequenceFusion} + +ngraph::pass::LinOpSequenceFusion class represents the `LinOpSequenceFusion` transformation. + +`LinOpSequenceFusion` is a common nGraph transformation. 
diff --git a/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step1_prerequisites/pull_reshape_through_dequantization.md b/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step1_prerequisites/pull_reshape_through_dequantization.md new file mode 100644 index 00000000000..214e8ac9993 --- /dev/null +++ b/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step1_prerequisites/pull_reshape_through_dequantization.md @@ -0,0 +1,3 @@ +# PullReshapeThroughDequantization transformation {#openvino_docs_IE_DG_lpt_PullReshapeThroughDequantization} + +ngraph::pass::low_precision::PullReshapeThroughDequantization class represents the `PullReshapeThroughDequantization` transformation. \ No newline at end of file diff --git a/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step1_prerequisites/pull_transpose_through_dequantization.md b/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step1_prerequisites/pull_transpose_through_dequantization.md new file mode 100644 index 00000000000..1acd058af16 --- /dev/null +++ b/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step1_prerequisites/pull_transpose_through_dequantization.md @@ -0,0 +1,3 @@ +# PullTransposeThroughDequantization transformation {#openvino_docs_IE_DG_lpt_PullTransposeThroughDequantization} + +ngraph::pass::low_precision::PullTransposeThroughDequantization class represents the `PullTransposeThroughDequantization` transformation. 
diff --git a/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step2_markup/align_quantization_intervals.md b/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step2_markup/align_quantization_intervals.md new file mode 100644 index 00000000000..b41afd0b8f3 --- /dev/null +++ b/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step2_markup/align_quantization_intervals.md @@ -0,0 +1,3 @@ +# AlignQuantizationIntervals transformation {#openvino_docs_IE_DG_lpt_AlignQuantizationIntervals} + +ngraph::pass::low_precision::AlignQuantizationIntervals class represents the `AlignQuantizationIntervals` transformation. diff --git a/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step2_markup/align_quantization_parameters.md b/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step2_markup/align_quantization_parameters.md new file mode 100644 index 00000000000..7477d96dbbf --- /dev/null +++ b/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step2_markup/align_quantization_parameters.md @@ -0,0 +1,3 @@ +# AlignQuantizationParameters transformation {#openvino_docs_IE_DG_lpt_AlignQuantizationParameters} + +ngraph::pass::low_precision::AlignQuantizationParameters class represents the `AlignQuantizationParameters` transformation. 
diff --git a/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step2_markup/create_attribute.md b/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step2_markup/create_attribute.md new file mode 100644 index 00000000000..118ce14305e --- /dev/null +++ b/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step2_markup/create_attribute.md @@ -0,0 +1,3 @@ +# CreateAttribute transformation {#openvino_docs_IE_DG_lpt_CreateAttribute} + +ngraph::pass::low_precision::CreateAttribute class represents the `CreateAttribute` transformation. \ No newline at end of file diff --git a/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step2_markup/create_precisions_dependent_attribute.md b/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step2_markup/create_precisions_dependent_attribute.md new file mode 100644 index 00000000000..c747462e4c9 --- /dev/null +++ b/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step2_markup/create_precisions_dependent_attribute.md @@ -0,0 +1,3 @@ +# CreatePrecisionsDependentAttribute transformation {#openvino_docs_IE_DG_lpt_CreatePrecisionsDependentAttribute} + +ngraph::pass::low_precision::CreatePrecisionsDependentAttribute class represents the `CreatePrecisionsDependentAttribute` transformation. 
diff --git a/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step2_markup/markup_avg_pool_precision_preserved.md b/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step2_markup/markup_avg_pool_precision_preserved.md new file mode 100644 index 00000000000..4d9a97ffc47 --- /dev/null +++ b/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step2_markup/markup_avg_pool_precision_preserved.md @@ -0,0 +1,3 @@ +# MarkupAvgPoolPrecisionPreserved transformation {#openvino_docs_IE_DG_lpt_MarkupAvgPoolPrecisionPreserved} + +ngraph::pass::low_precision::MarkupAvgPoolPrecisionPreserved class represents the `MarkupAvgPoolPrecisionPreserved` transformation. diff --git a/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step2_markup/markup_can_be_quantized.md b/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step2_markup/markup_can_be_quantized.md new file mode 100644 index 00000000000..1bd149e332a --- /dev/null +++ b/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step2_markup/markup_can_be_quantized.md @@ -0,0 +1,3 @@ +# MarkupCanBeQuantized transformation {#openvino_docs_IE_DG_lpt_MarkupCanBeQuantized} + +ngraph::pass::low_precision::MarkupCanBeQuantized class represents the `MarkupCanBeQuantized` transformation. 
diff --git a/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step2_markup/markup_per_tensor_quantization.md b/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step2_markup/markup_per_tensor_quantization.md new file mode 100644 index 00000000000..d915ef73183 --- /dev/null +++ b/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step2_markup/markup_per_tensor_quantization.md @@ -0,0 +1,3 @@ +# MarkupPerTensorQuantization transformation {#openvino_docs_IE_DG_lpt_MarkupPerTensorQuantization} + +ngraph::pass::low_precision::MarkupPerTensorQuantization class represents the `MarkupPerTensorQuantization` transformation. diff --git a/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step2_markup/markup_precisions.md b/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step2_markup/markup_precisions.md new file mode 100644 index 00000000000..673a8932529 --- /dev/null +++ b/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step2_markup/markup_precisions.md @@ -0,0 +1,3 @@ +# MarkupPrecisions transformation {#openvino_docs_IE_DG_lpt_MarkupPrecisions} + +ngraph::pass::low_precision::MarkupPrecisions class represents the `MarkupPrecisions` transformation. 
diff --git a/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step2_markup/propagate_precisions.md b/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step2_markup/propagate_precisions.md new file mode 100644 index 00000000000..50dcc23ce96 --- /dev/null +++ b/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step2_markup/propagate_precisions.md @@ -0,0 +1,3 @@ +# PropagatePrecisions transformation {#openvino_docs_IE_DG_lpt_PropagatePrecisions} + +ngraph::pass::low_precision::PropagatePrecisions class represents the `PropagatePrecisions` transformation. diff --git a/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step2_markup/propagate_shared_value.md b/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step2_markup/propagate_shared_value.md new file mode 100644 index 00000000000..e7f93dd64f0 --- /dev/null +++ b/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step2_markup/propagate_shared_value.md @@ -0,0 +1,3 @@ +# PropagateSharedValue transformation {#openvino_docs_IE_DG_lpt_PropagateSharedValue} + +ngraph::pass::low_precision::PropagateSharedValue class represents the `PropagateSharedValue` transformation. 
diff --git a/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step2_markup/propagate_through_precision_preserved.md b/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step2_markup/propagate_through_precision_preserved.md new file mode 100644 index 00000000000..e183b5265d7 --- /dev/null +++ b/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step2_markup/propagate_through_precision_preserved.md @@ -0,0 +1,3 @@ +# PropagateThroughPrecisionPreserved transformation {#openvino_docs_IE_DG_lpt_PropagateThroughPrecisionPreserved} + +ngraph::pass::low_precision::PropagateThroughPrecisionPreserved class represents the `PropagateThroughPrecisionPreserved` transformation. diff --git a/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step2_markup/propagate_to_input.md b/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step2_markup/propagate_to_input.md new file mode 100644 index 00000000000..08136272cdb --- /dev/null +++ b/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step2_markup/propagate_to_input.md @@ -0,0 +1,3 @@ +# PropagateToInput transformation {#openvino_docs_IE_DG_lpt_PropagateToInput} + +ngraph::pass::low_precision::PropagateToInput class represents the `PropagateToInput` transformation. 
diff --git a/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step2_markup/update_shared_precision_preserved.md b/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step2_markup/update_shared_precision_preserved.md new file mode 100644 index 00000000000..aa18aea07cd --- /dev/null +++ b/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step2_markup/update_shared_precision_preserved.md @@ -0,0 +1,3 @@ +# UpdateSharedPrecisionPreserved transformation {#openvino_docs_IE_DG_lpt_UpdateSharedPrecisionPreserved} + +ngraph::pass::low_precision::UpdateSharedPrecisionPreserved class represents the `UpdateSharedPrecisionPreserved` transformation. diff --git a/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step3_main/activation/clamp.md b/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step3_main/activation/clamp.md new file mode 100644 index 00000000000..5e00b6a3ca0 --- /dev/null +++ b/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step3_main/activation/clamp.md @@ -0,0 +1,3 @@ +# ClampTransformation transformation {#openvino_docs_IE_DG_lpt_ClampTransformation} + +ngraph::pass::low_precision::ClampTransformation class represents the `Clamp` operation transformation. 
diff --git a/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step3_main/activation/prelu.md b/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step3_main/activation/prelu.md new file mode 100644 index 00000000000..4ffcade1647 --- /dev/null +++ b/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step3_main/activation/prelu.md @@ -0,0 +1,3 @@ +# PReluTransformation transformation {#openvino_docs_IE_DG_lpt_PReluTransformation} + +ngraph::pass::low_precision::PReluTransformation class represents the `PRelu` operation transformation. diff --git a/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step3_main/activation/relu.md b/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step3_main/activation/relu.md new file mode 100644 index 00000000000..8831de7aee6 --- /dev/null +++ b/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step3_main/activation/relu.md @@ -0,0 +1,3 @@ +# ReluTransformation transformation {#openvino_docs_IE_DG_lpt_ReluTransformation} + +ngraph::pass::low_precision::ReluTransformation class represents the `Relu` operation transformation. diff --git a/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step3_main/arithmetic/add.md b/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step3_main/arithmetic/add.md new file mode 100644 index 00000000000..337c49a9749 --- /dev/null +++ b/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step3_main/arithmetic/add.md @@ -0,0 +1,57 @@ +# AddTransformation transformation {#openvino_docs_IE_DG_lpt_AddTransformation} + +ngraph::pass::low_precision::AddTransformation class represents the `Add` operation transformation. 
+ +The transformation propagates dequantization subtraction from one input branch to another and propagates dequantization multiplication from the same branch through `Add` operation. In transformation result, one `Add` operation input branch is in low precision without dequantization operations (empty branch), another input branch is in original precision with updated dequantization operations (full branch). + +Criteria for selecting an empty branch in order of priority: + +*Step 1.* If one branch is quantized only, then the quantized branch is an empty branch. + +*Step 2.* If only one branch has `FakeQuantize` before dequantization operations, then another branch is an empty branch. + +*Step 3.* If some `FakeQuantize` has more than one consumer and another has only one, then the branch with `FakeQuantize` with several consumers is an empty branch. + +*Step 4.* Constant branch is in original precision, data branch is an empty branch. In this case, dequantization operations are propagated to a constant branch and will be fused in one constant. + +*Step 5.* If both branches have operations from the following list before `FakeQuantize`: `Convolution`, `GroupConvolution`, and `MatMul`, or do not have any operations from the list, then the branch with larger shape volume is empty. + +*Step 6.* If the operation before `FakeQuantize` has several consumers in any branch, then the branch is empty. + +If dequantization operations on the full branch have a `FakeQuantize` operation parent, then they will be fused with `FakeQuantize` during another low precision transformation. If a `FakeQuantize` operation has a parent operation from the list: `Convolution`, `GroupConvolution`, and `MatMul`, then during inference the `FakeQuantize` can be inferred in one plugin kernel with the parent operation. 
+ +Depending on the plugin instruction set, low precision inference for the `Add` operation can be implemented in two logical steps in one plugin kernel: + + * Inference step #1: Operations in the full branch, for example, `Convolution` and `FakeQuantize` with fused dequantization operations, and `Add` can be inferred in the original precision. + + * Inference step #2: Inference step #1 result can be added with the empty branch tensor in low precision. + +This approach allows inferring the `Add` operation in the optimal way. + +## Subgraph before transformation +The subgraph with quantized `Add` operation before transformation: + +\f[ +y_{ch,i}=(scale1_{ch} * (x1_{ch,i} - shift1_{ch})) + (scale2_{ch} * (x2_{ch,i} - shift2_{ch})) +\f] + +![Add before](img/add.common.png) + +## Subgraph after transformation
The subgraph with the `Add` operation after the transformation: + +\f[ +y_{ch,i}=scale2_{ch} * (scale1_{ch}' * (x1_{ch,i} - shift1_{ch}') + x2_{ch,i}) +\f] + +where: + +\f[ +scale1_{ch}' = scale1_{ch} / scale2_{ch} +\f] + +\f[ +shift1_{ch}' = shift1_{ch} + scale2_{ch} * shift2_{ch} / scale1_{ch} +\f] + +![Add after](img/add.transformed.png) \ No newline at end of file diff --git a/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step3_main/arithmetic/img/add.common.png b/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step3_main/arithmetic/img/add.common.png new file mode 100644 index 00000000000..7d05063836f --- /dev/null +++ b/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step3_main/arithmetic/img/add.common.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d8d3621c4be5d3382cb164a19676253412f85b5f47fac27b024c726f1571647e +size 380663 diff --git a/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step3_main/arithmetic/img/add.common.svg 
b/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step3_main/arithmetic/img/add.common.svg new file mode 100644 index 00000000000..ee254a66659 --- /dev/null +++ b/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step3_main/arithmetic/img/add.common.svg @@ -0,0 +1 @@ +QuantizeQuantizeDequantization on activationsMultiply{f32} {1, 3, 299, 299}Convert{f32} {1, 3, 299, 299}Constant{f32} {1, 1, 1, 1}Subtract{f32} {1, 3, 299, 299}Constant{f32} {1, 1, 1, 1}Dequantization on activationsMultiply{f32} {1, 3, 299, 299}Convert{f32} {1, 3, 299, 299}Constant{f32} {1, 1, 1, 1}Subtract{f32} {1, 3, 299, 299}Constant{f32} {1, 1, 1, 1}Add{f32} {1, 3, 299, 299}INT8 Convolution with zero pointSubtract{f32} {1, 3, 299, 299}Convolution{f32} {1, 6, 299, 299}Constant{i8} {6, 3, 1, 1}Constant{u8} {}Subtract{f32} {1, 3, 299, 299}Constant{i8} {6, 1, 1, 1}FakeQuantizename: fakeQuantize1rt info: IntervalsAlignment{combined: { -1.28, 1.27 }, preferablePrecisions: {i8}}{u8} {1, 3, 299, 299}Precisions {precisions: {u8}}Constant{f32} {1, 1, 1, 1}value: [-0.42667]Constant{f32} {1, 1, 1, 1}value: [0.42333]Constant{f32} {1, 1, 1, 1}value: [0.0]Constant{f32} {1, 1, 1, 1}value: [255.0]Add{f32} {1, 3, 299, 299}FakeQuantizename: fakeQuantize1rt info: IntervalsAlignment{combined: { -1.28, 1.27 }, preferablePrecisions: {i8}}{u8} {1, 3, 299, 299}Precisions {precisions: {u8}}Constant{f32} {1, 1, 1, 1}value: [-0.42667]Constant{f32} {1, 1, 1, 1}value: [0.42333]Constant{f32} {1, 1, 1, 1}value: [0.0]Constant{f32} {1, 1, 1, 1}value: [255.0]Branch#2Branch#1 \ No newline at end of file diff --git a/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step3_main/arithmetic/img/add.transformed.png b/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step3_main/arithmetic/img/add.transformed.png new file mode 100644 index 
00000000000..16a5cc7f127 --- /dev/null +++ b/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step3_main/arithmetic/img/add.transformed.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ff2d26dc0b86f339458a2fafbbd6a88daf3d3dc6fcefb636243f42a6e91bc328 +size 492066 diff --git a/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step3_main/arithmetic/img/add.transformed.svg b/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step3_main/arithmetic/img/add.transformed.svg new file mode 100644 index 00000000000..6c7fc6b7b5f --- /dev/null +++ b/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step3_main/arithmetic/img/add.transformed.svg @@ -0,0 +1 @@ +Dequantization on activationsQuantizeQuantizeDequantization on activationsMultiply{f32} {1, 3, 299, 299}Convert{f32} {1, 3, 299, 299}Constant{f32} {1, 3, 1, 1}Subtract{f32} {1, 3, 299, 299}Constant{f32} {1, 3, 1, 1}Add{f32} {1, 3, 299, 299}INT8 Convolution with zero pointSubtract{f32} {1, 3, 299, 299}Convolution{f32} {1, 6, 299, 299}Constant{i8} {6, 3, 1, 1}Constant{u8} {}Subtract{f32} {1, 3, 299, 299}Constant{i8} {6, 1, 1, 1}FakeQuantizename: fakeQuantize1rt info: IntervalsAlignment{combined: { -1.28, 1.27 }, preferablePrecisions: {i8}}{u8} {1, 3, 299, 299}Precisions {precisions: {u8}}Constant{f32} {1, 1, 1, 1}value: [-0.42667]Constant{f32} {1, 1, 1, 1}value: [0.42333]Constant{f32} {1, 1, 1, 1}value: [0.0]Constant{f32} {1, 1, 1, 1}value: [255.0]Add{f32} {1, 3, 299, 299}FakeQuantizename: fakeQuantize1rt info: IntervalsAlignment{combined: { -1.28, 1.27 }, preferablePrecisions: {i8}}{u8} {1, 3, 299, 299}Precisions {precisions: {u8}}Constant{f32} {1, 1, 1, 1}value: [-0.42667]Constant{f32} {1, 1, 1, 1}value: [0.42333]Constant{f32} {1, 1, 1, 1}value: [0.0]Constant{f32} {1, 1, 1, 1}value: [255.0]Branch#2Branch#1Multiply{f32} 
{1, 3, 299, 299}Constant{f32} {1, 3, 1, 1} \ No newline at end of file diff --git a/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step3_main/arithmetic/multiply.md b/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step3_main/arithmetic/multiply.md new file mode 100644 index 00000000000..43893093944 --- /dev/null +++ b/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step3_main/arithmetic/multiply.md @@ -0,0 +1,3 @@ +# MultiplyTransformation transformation {#openvino_docs_IE_DG_lpt_MultiplyTransformation} + +ngraph::pass::low_precision::MultiplyTransformation class represents the `Multiply` operation transformation. \ No newline at end of file diff --git a/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step3_main/arithmetic/subtract.md b/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step3_main/arithmetic/subtract.md new file mode 100644 index 00000000000..8ba827aaea9 --- /dev/null +++ b/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step3_main/arithmetic/subtract.md @@ -0,0 +1,3 @@ +# SubtractTransformation transformation {#openvino_docs_IE_DG_lpt_SubtractTransformation} + +ngraph::pass::low_precision::SubtractTransformation class represents the `Subtract` operation transformation. 
\ No newline at end of file diff --git a/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step3_main/convolution/convolution.md b/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step3_main/convolution/convolution.md new file mode 100644 index 00000000000..c29aa9c5b29 --- /dev/null +++ b/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step3_main/convolution/convolution.md @@ -0,0 +1,34 @@ +# ConvolutionTransformation transformation {#openvino_docs_IE_DG_lpt_ConvolutionTransformation} + +ngraph::pass::low_precision::ConvolutionTransformation class represents the `Convolution` operation transformation. + +The transformation propagates dequantization operations on activations and weights through the `Convolution` operation. The transformation supports several weights quantization approaches: +* quantized weights in low precision with dequantization operations, +* weights in original precision with `FakeQuantize` operation. + +Result dequantization `Multiply` constant value *result* is calculated as multiplication for dequantization `Multiply` constant value on activations *a* and dequantization `Multiply` constant value on weights *b* : + +\f[ +result_{i} = a_{i} \cdot b_{i} +\f] + +## Limitations + +* Dequantization on activations must be per-tensor. It means that dequantization `Multiply` constant value on activations must be scalar. 
+ +## Subgraph before transformation + +### Quantized weights in low precision with dequantization operations +The subgraph with quantized `Convolution` before transformation with quantized weights in low precision constant and dequantization operations: + +![Convolution before](img/fq_and_convolution.common.png) + +### Weights in original precision with FakeQuantize operation +The subgraph with quantized `Convolution` before transformation with weights in original precision and `FakeQuantize` operation: + +![Convolution before](img/fq_fq_and_convolution.common.png) + +## Subgraph after transformation +The subgraph with `Convolution` operation after the transformation: + +![Convolution after](img/fq_and_convolution.transformed.png) \ No newline at end of file diff --git a/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step3_main/convolution/convolution_backprop_data.md b/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step3_main/convolution/convolution_backprop_data.md new file mode 100644 index 00000000000..aa9af9f28b8 --- /dev/null +++ b/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step3_main/convolution/convolution_backprop_data.md @@ -0,0 +1,3 @@ +# ConvolutionBackpropDataTransformation transformation {#openvino_docs_IE_DG_lpt_ConvolutionBackpropDataTransformation} + +ngraph::pass::low_precision::ConvolutionBackpropDataTransformation class represents the `ConvolutionBackpropData` operation transformation. 
diff --git a/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step3_main/convolution/group_convolution.md b/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step3_main/convolution/group_convolution.md new file mode 100644 index 00000000000..c5571fbada3 --- /dev/null +++ b/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step3_main/convolution/group_convolution.md @@ -0,0 +1,3 @@ +# GroupConvolutionTransformation transformation {#openvino_docs_IE_DG_lpt_GroupConvolutionTransformation} + +ngraph::pass::low_precision::GroupConvolutionTransformation class represents the `GroupConvolution` operation transformation. diff --git a/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step3_main/convolution/img/fq_and_convolution.common.png b/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step3_main/convolution/img/fq_and_convolution.common.png new file mode 100644 index 00000000000..7b686f72935 --- /dev/null +++ b/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step3_main/convolution/img/fq_and_convolution.common.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e5bfd5ca52ea6660e0ff67afefc98d64941eab6e8b464116242a6e044f318f5 +size 207602 diff --git a/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step3_main/convolution/img/fq_and_convolution.common.svg b/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step3_main/convolution/img/fq_and_convolution.common.svg new file mode 100644 index 00000000000..f45ca77e426 --- /dev/null +++ b/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step3_main/convolution/img/fq_and_convolution.common.svg @@ -0,0 
+1 @@ +FP32 Convolution with quantized weightsQuantized weightsDequantization on activationsConvolution{f32} {1, 6, 299, 299}Constant{i8} {6, 3, 1, 1}Dequantization on weightsMultiply{f32} {6, 3, 1, 1}Convert{f32} {6, 3, 1, 1}Constant{f32} {6, 1, 1, 1}Subtract{f32} {6, 3, 1, 1}Constant{i8} {6, 1, 1, 1}Convert{f32} {6, 1, 1, 1}Multiply{f32} {1, 3, 299, 299}Convert{f32} {1, 3, 299, 299}Constant{f32} {1, 1, 1, 1}Subtract{f32} {1, 3, 299, 299}Constant{f32} {1, 1, 1, 1} \ No newline at end of file diff --git a/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step3_main/convolution/img/fq_and_convolution.transformed.png b/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step3_main/convolution/img/fq_and_convolution.transformed.png new file mode 100644 index 00000000000..63fa693e6b0 --- /dev/null +++ b/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step3_main/convolution/img/fq_and_convolution.transformed.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:756c225ee8e1da046e0210bf0696185b3939378f10b4ed6d757e43070d379436 +size 135804 diff --git a/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step3_main/convolution/img/fq_and_convolution.transformed.svg b/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step3_main/convolution/img/fq_and_convolution.transformed.svg new file mode 100644 index 00000000000..57bab915e3a --- /dev/null +++ b/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step3_main/convolution/img/fq_and_convolution.transformed.svg @@ -0,0 +1 @@ +DequantizationINT8 Convolution with zero pointSubtract{f32} {1, 3, 299, 299}Multiply{f32} {1, 6, 299, 299}Convolution{f32} {1, 6, 299, 299}Constant{i8} {6, 3, 1, 1}Constant{u8} {}Constant{f32} {1, 6, 1, 1}Subtract{f32} 
{1, 3, 299, 299}Constant{i8} {6, 1, 1, 1}Zero point on activationsZero point on weights \ No newline at end of file diff --git a/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step3_main/convolution/img/fq_fq_and_convolution.common.png b/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step3_main/convolution/img/fq_fq_and_convolution.common.png new file mode 100644 index 00000000000..b887ab2b764 --- /dev/null +++ b/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step3_main/convolution/img/fq_fq_and_convolution.common.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:08d4116490ab329636fced24c292636fbe00856976b19e5219e433bc2c6e4e16 +size 190590 diff --git a/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step3_main/convolution/img/fq_fq_and_convolution.common.svg b/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step3_main/convolution/img/fq_fq_and_convolution.common.svg new file mode 100644 index 00000000000..c475faa62ac --- /dev/null +++ b/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step3_main/convolution/img/fq_fq_and_convolution.common.svg @@ -0,0 +1 @@ +FP32 Convolution with quantized weightsNot quantized weights in original precisionConvolution{f32} {1, 6, 299, 299}FakeQuantizelevels: 255{f32} {6, 3, 299, 299}Constant{f32} {6, 3, 1, 1}Constant{f32} {1, 1, 1, 1}Value: [-12.8]Constant{f32} {1, 1, 1, 1}Value: [12.7]Constant{f32} {1, 1, 1, 1}Value: [-12.8]Constant{f32} {1, 1, 1, 1}Value: [12.7]Dequantization on activationsMultiply{f32} {1, 3, 299, 299}Convert{f32} {1, 3, 299, 299}Constant{f32} {1, 1, 1, 1}Subtract{f32} {1, 3, 299, 299}Constant{f32} {1, 1, 1, 1} \ No newline at end of file diff --git 
a/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step3_main/image/interpolate.md b/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step3_main/image/interpolate.md new file mode 100644 index 00000000000..c6d3a3fbc25 --- /dev/null +++ b/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step3_main/image/interpolate.md @@ -0,0 +1,3 @@ +# InterpolateTransformation transformation {#openvino_docs_IE_DG_lpt_InterpolateTransformation} + +ngraph::pass::low_precision::InterpolateTransformation class represents the `Interpolate` operation transformation. diff --git a/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step3_main/matrix/mat_mul.md b/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step3_main/matrix/mat_mul.md new file mode 100644 index 00000000000..3a54ca5e574 --- /dev/null +++ b/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step3_main/matrix/mat_mul.md @@ -0,0 +1,3 @@ +# MatMulTransformation transformation {#openvino_docs_IE_DG_lpt_MatMulTransformation} + +ngraph::pass::low_precision::MatMulTransformation class represents the `MatMul` operation transformation. 
diff --git a/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step3_main/movement/concat.md b/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step3_main/movement/concat.md new file mode 100644 index 00000000000..698d3e3cc98 --- /dev/null +++ b/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step3_main/movement/concat.md @@ -0,0 +1,3 @@ +# ConcatTransformation transformation {#openvino_docs_IE_DG_lpt_ConcatTransformation} + +ngraph::pass::low_precision::ConcatTransformation class represents the `Concat` operation transformation. diff --git a/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step3_main/movement/depth_to_space.md b/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step3_main/movement/depth_to_space.md new file mode 100644 index 00000000000..c3ae40d70ba --- /dev/null +++ b/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step3_main/movement/depth_to_space.md @@ -0,0 +1,3 @@ +# DepthToSpaceTransformation transformation {#openvino_docs_IE_DG_lpt_DepthToSpaceTransformation} + +ngraph::pass::low_precision::DepthToSpaceTransformation class represents the `DepthToSpace` operation transformation. 
diff --git a/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step3_main/movement/pad.md b/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step3_main/movement/pad.md new file mode 100644 index 00000000000..feb8561f3c0 --- /dev/null +++ b/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step3_main/movement/pad.md @@ -0,0 +1,3 @@ +# PadTransformation transformation {#openvino_docs_IE_DG_lpt_PadTransformation} + +ngraph::pass::low_precision::PadTransformation class represents the `Pad` operation transformation. diff --git a/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step3_main/movement/shuffle_channels.md b/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step3_main/movement/shuffle_channels.md new file mode 100644 index 00000000000..e41e1c05aa3 --- /dev/null +++ b/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step3_main/movement/shuffle_channels.md @@ -0,0 +1,3 @@ +# ShuffleChannelsTransformation transformation {#openvino_docs_IE_DG_lpt_ShuffleChannelsTransformation} + +ngraph::pass::low_precision::ShuffleChannelsTransformation class represents the `ShuffleChannels` operation transformation. 
diff --git a/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step3_main/movement/split.md b/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step3_main/movement/split.md new file mode 100644 index 00000000000..166ad30e3dc --- /dev/null +++ b/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step3_main/movement/split.md @@ -0,0 +1,3 @@ +# SplitTransformation transformation {#openvino_docs_IE_DG_lpt_SplitTransformation} + +ngraph::pass::low_precision::SplitTransformation class represents the `Split` operation transformation. diff --git a/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step3_main/movement/strided_slice.md b/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step3_main/movement/strided_slice.md new file mode 100644 index 00000000000..4b385dc6e73 --- /dev/null +++ b/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step3_main/movement/strided_slice.md @@ -0,0 +1,3 @@ +# StridedSliceTransformation transformation {#openvino_docs_IE_DG_lpt_StridedSliceTransformation} + +ngraph::pass::low_precision::StridedSliceTransformation class represents the `StridedSlice` operation transformation. 
diff --git a/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step3_main/movement/transpose.md b/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step3_main/movement/transpose.md new file mode 100644 index 00000000000..bcf2ac02c50 --- /dev/null +++ b/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step3_main/movement/transpose.md @@ -0,0 +1,3 @@ +# TransposeTransformation transformation {#openvino_docs_IE_DG_lpt_TransposeTransformation} + +ngraph::pass::low_precision::TransposeTransformation class represents the `Transpose` operation transformation. diff --git a/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step3_main/movement/variadic_split.md b/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step3_main/movement/variadic_split.md new file mode 100644 index 00000000000..10bc02ead1c --- /dev/null +++ b/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step3_main/movement/variadic_split.md @@ -0,0 +1,3 @@ +# VariadicSplitTransformation transformation {#openvino_docs_IE_DG_lpt_VariadicSplitTransformation} + +ngraph::pass::low_precision::VariadicSplitTransformation class represents the `VariadicSplit` operation transformation. 
diff --git a/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step3_main/normalization/mvn.md b/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step3_main/normalization/mvn.md new file mode 100644 index 00000000000..3b712696b54 --- /dev/null +++ b/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step3_main/normalization/mvn.md @@ -0,0 +1,3 @@ +# MVNTransformation transformation {#openvino_docs_IE_DG_lpt_MVNTransformation} + +ngraph::pass::low_precision::MVNTransformation class represents the `MVN` operation transformation. diff --git a/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step3_main/normalization/normalize_l2.md b/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step3_main/normalization/normalize_l2.md new file mode 100644 index 00000000000..6f86660f1a5 --- /dev/null +++ b/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step3_main/normalization/normalize_l2.md @@ -0,0 +1,3 @@ +# NormalizeL2Transformation transformation {#openvino_docs_IE_DG_lpt_NormalizeL2Transformation} + +ngraph::pass::low_precision::NormalizeL2Transformation class represents the `NormalizeL2` operation transformation. 
diff --git a/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step3_main/pooling/avg_pool.md b/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step3_main/pooling/avg_pool.md new file mode 100644 index 00000000000..d53a8e28a78 --- /dev/null +++ b/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step3_main/pooling/avg_pool.md @@ -0,0 +1,3 @@ +# AvgPoolTransformation transformation {#openvino_docs_IE_DG_lpt_AvgPoolTransformation} + +ngraph::pass::low_precision::AvgPoolTransformation class represents the `AvgPool` operation transformation. diff --git a/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step3_main/pooling/max_pool.md b/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step3_main/pooling/max_pool.md new file mode 100644 index 00000000000..ce7f2a28c7c --- /dev/null +++ b/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step3_main/pooling/max_pool.md @@ -0,0 +1,3 @@ +# MaxPoolTransformation transformation {#openvino_docs_IE_DG_lpt_MaxPoolTransformation} + +ngraph::pass::low_precision::MaxPoolTransformation class represents the `MaxPool` operation transformation. 
diff --git a/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step3_main/quantization/fake_quantize.md b/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step3_main/quantization/fake_quantize.md new file mode 100644 index 00000000000..8441554f637 --- /dev/null +++ b/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step3_main/quantization/fake_quantize.md @@ -0,0 +1,3 @@ +# FakeQuantizeTransformation transformation {#openvino_docs_IE_DG_lpt_FakeQuantizeTransformation} + +ngraph::pass::low_precision::FakeQuantizeTransformation class represents the `FakeQuantize` operation transformation. diff --git a/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step3_main/quantization/fold_fake_quantize.md b/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step3_main/quantization/fold_fake_quantize.md new file mode 100644 index 00000000000..34ec1af1b0a --- /dev/null +++ b/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step3_main/quantization/fold_fake_quantize.md @@ -0,0 +1,3 @@ +# FoldFakeQuantizeTransformation transformation {#openvino_docs_IE_DG_lpt_FoldFakeQuantizeTransformation} + +ngraph::pass::low_precision::FoldFakeQuantizeTransformation class represents the `FoldFakeQuantize` operation transformation. 
diff --git a/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step3_main/reduction/reduce_max.md b/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step3_main/reduction/reduce_max.md new file mode 100644 index 00000000000..27153c02125 --- /dev/null +++ b/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step3_main/reduction/reduce_max.md @@ -0,0 +1,3 @@ +# ReduceMaxTransformation transformation {#openvino_docs_IE_DG_lpt_ReduceMaxTransformation} + +ngraph::pass::low_precision::ReduceMaxTransformation class represents the `ReduceMax` operation transformation. diff --git a/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step3_main/reduction/reduce_mean.md b/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step3_main/reduction/reduce_mean.md new file mode 100644 index 00000000000..ca05bd56a8d --- /dev/null +++ b/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step3_main/reduction/reduce_mean.md @@ -0,0 +1,3 @@ +# ReduceMeanTransformation transformation {#openvino_docs_IE_DG_lpt_ReduceMeanTransformation} + +ngraph::pass::low_precision::ReduceMeanTransformation class represents the `ReduceMean` operation transformation. 
diff --git a/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step3_main/reduction/reduce_min.md b/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step3_main/reduction/reduce_min.md new file mode 100644 index 00000000000..0d5d0f74fd7 --- /dev/null +++ b/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step3_main/reduction/reduce_min.md @@ -0,0 +1,3 @@ +# ReduceMinTransformation transformation {#openvino_docs_IE_DG_lpt_ReduceMinTransformation} + +ngraph::pass::low_precision::ReduceMinTransformation class represents the `ReduceMin` operation transformation. diff --git a/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step3_main/reduction/reduce_sum.md b/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step3_main/reduction/reduce_sum.md new file mode 100644 index 00000000000..b67ebf5d3a0 --- /dev/null +++ b/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step3_main/reduction/reduce_sum.md @@ -0,0 +1,3 @@ +# ReduceSumTransformation transformation {#openvino_docs_IE_DG_lpt_ReduceSumTransformation} + +ngraph::pass::low_precision::ReduceSumTransformation class represents the `ReduceSum` operation transformation. 
diff --git a/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step3_main/shape/reshape.md b/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step3_main/shape/reshape.md new file mode 100644 index 00000000000..b4c69a720bc --- /dev/null +++ b/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step3_main/shape/reshape.md @@ -0,0 +1,3 @@ +# ReshapeTransformation transformation {#openvino_docs_IE_DG_lpt_ReshapeTransformation} + +ngraph::pass::low_precision::ReshapeTransformation class represents the `Reshape` operation transformation. diff --git a/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step3_main/shape/squeeze.md b/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step3_main/shape/squeeze.md new file mode 100644 index 00000000000..a409c8ca61c --- /dev/null +++ b/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step3_main/shape/squeeze.md @@ -0,0 +1,3 @@ +# SqueezeTransformation transformation {#openvino_docs_IE_DG_lpt_SqueezeTransformation} + +ngraph::pass::low_precision::SqueezeTransformation class represents the `Squeeze` operation transformation. 
diff --git a/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step3_main/shape/unsqueeze.md b/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step3_main/shape/unsqueeze.md new file mode 100644 index 00000000000..a9ffac0fa4a --- /dev/null +++ b/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step3_main/shape/unsqueeze.md @@ -0,0 +1,3 @@ +# UnsqueezeTransformation transformation {#openvino_docs_IE_DG_lpt_UnsqueezeTransformation} + +ngraph::pass::low_precision::UnsqueezeTransformation class represents the `Unsqueeze` operation transformation. diff --git a/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step4_cleanup/fake_quantize_decomposition.md b/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step4_cleanup/fake_quantize_decomposition.md new file mode 100644 index 00000000000..83c4eb3d9e6 --- /dev/null +++ b/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step4_cleanup/fake_quantize_decomposition.md @@ -0,0 +1,3 @@ +# FakeQuantizeDecompositionTransformation transformation {#openvino_docs_IE_DG_lpt_FakeQuantizeDecompositionTransformation} + +ngraph::pass::low_precision::FakeQuantizeDecompositionTransformation class represents the `FakeQuantizeDecompositionTransformation` transformation. 
diff --git a/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step4_cleanup/fold_convert.md b/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step4_cleanup/fold_convert.md new file mode 100644 index 00000000000..c84e19da98e --- /dev/null +++ b/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step4_cleanup/fold_convert.md @@ -0,0 +1,3 @@ +# FoldConvertTransformation transformation {#openvino_docs_IE_DG_lpt_FoldConvertTransformation} + +ngraph::pass::low_precision::FoldConvertTransformation class represents the `FoldConvertTransformation` transformation. diff --git a/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step4_cleanup/fuse_convert.md b/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step4_cleanup/fuse_convert.md new file mode 100644 index 00000000000..3b720729c7f --- /dev/null +++ b/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step4_cleanup/fuse_convert.md @@ -0,0 +1,3 @@ +# FuseConvertTransformation transformation {#openvino_docs_IE_DG_lpt_FuseConvertTransformation} + +ngraph::pass::low_precision::FuseConvertTransformation class represents the `FuseConvertTransformation` transformation. 
diff --git a/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step4_cleanup/fuse_multiply_to_fake_quantize.md b/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step4_cleanup/fuse_multiply_to_fake_quantize.md new file mode 100644 index 00000000000..10cab1a1788 --- /dev/null +++ b/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step4_cleanup/fuse_multiply_to_fake_quantize.md @@ -0,0 +1,3 @@ +# FuseMultiplyToFakeQuantizeTransformation transformation {#openvino_docs_IE_DG_lpt_FuseMultiplyToFakeQuantizeTransformation} + +ngraph::pass::low_precision::FuseMultiplyToFakeQuantizeTransformation class represents the `FuseMultiplyToFakeQuantizeTransformation` transformation. diff --git a/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step4_cleanup/fuse_subtract_to_fake_quantize.md b/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step4_cleanup/fuse_subtract_to_fake_quantize.md new file mode 100644 index 00000000000..7bd326435d6 --- /dev/null +++ b/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step4_cleanup/fuse_subtract_to_fake_quantize.md @@ -0,0 +1,3 @@ +# FuseSubtractToFakeQuantizeTransformation transformation {#openvino_docs_IE_DG_lpt_FuseSubtractToFakeQuantizeTransformation} + +ngraph::pass::low_precision::FuseSubtractToFakeQuantizeTransformation class represents the `FuseSubtractToFakeQuantizeTransformation` transformation. 
diff --git a/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step4_cleanup/multiply_to_group_convolution.md b/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step4_cleanup/multiply_to_group_convolution.md new file mode 100644 index 00000000000..27742998abd --- /dev/null +++ b/docs/IE_PLUGIN_DG/plugin_transformation_pipeline/low_precision_transformations/transformations/step4_cleanup/multiply_to_group_convolution.md @@ -0,0 +1,3 @@ +# MultiplyToGroupConvolutionTransformation transformation {#openvino_docs_IE_DG_lpt_MultiplyToGroupConvolutionTransformation} + +ngraph::pass::low_precision::MultiplyToGroupConvolutionTransformation class represents the `MultiplyToGroupConvolutionTransformation` transformation. diff --git a/docs/documentation.md b/docs/documentation.md index bd4444f1469..e421351fca7 100644 --- a/docs/documentation.md +++ b/docs/documentation.md @@ -75,6 +75,7 @@ Inference Engine Plugin Developer Guide groupie_dev_api + Plugin Transformation Pipeline .. 
toctree:: :maxdepth: 1 diff --git a/docs/doxygen/ie_docs.xml b/docs/doxygen/ie_docs.xml new file mode 100644 index 00000000000..3a0f6d854eb --- /dev/null +++ b/docs/doxygen/ie_docs.xml @@ -0,0 +1,383 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/docs/snippets/lpt_mkldnn_plugin.cpp b/docs/snippets/lpt_mkldnn_plugin.cpp new file mode 100644 index 00000000000..1808b011c37 --- /dev/null +++ b/docs/snippets/lpt_mkldnn_plugin.cpp @@ -0,0 +1,221 @@ +#include + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace ngraph { +namespace pass { +namespace device { + +class ConvertOpSet1ToDeviceSpecific: public ngraph::pass::FunctionPass { +public: + bool run_on_function(std::shared_ptr f) override { + return true; + } +}; + +} // namespace device +} // pass +} // ngraph + +int main() { +std::shared_ptr nGraphFunc; +ngraph::pass::Manager manager; +auto pass_config = manager.get_pass_config(); +//! 
[lpt_common] +// check if the function is quantized to ignore LPT transformations for not quantized function to speed up model loading +const bool useLpt = ngraph::pass::low_precision::LowPrecision::isFunctionQuantized(nGraphFunc); +if (useLpt) { + // disable constant folding on constant subgraph to use the subgraph for LPT + manager.register_pass(std::vector{ + ngraph::element::i8, ngraph::element::u8, ngraph::element::i4, ngraph::element::u4 + }); +} + +// nGraph common transformations happen here + +if (useLpt) { + // convert subtract constant to INT8 to prevent unnecessary FP16 to FP32 conversion + manager.register_pass(std::vector{ + ngraph::element::i8, ngraph::element::u8, ngraph::element::i4, ngraph::element::u4 }); +} + +// nGraph common transformations happen here + +if (useLpt) { + // convert not supported cases FakeQuantize -> Convert -> Convert -> Subtract -> Multiply to a single FakeQuantize + pass_config->set_callback([](const std::shared_ptr &node) -> bool { + return ngraph::pass::low_precision::NetworkHelper::areQuantizeAndDequantizeSupportedForMultiply(node); + }); + + // convert not supported cases FakeQuantize -> Convert -> Convert -> Subtract -> Multiply to a single FakeQuantize + pass_config->set_callback([](const std::shared_ptr &node) -> bool { + return ngraph::pass::low_precision::NetworkHelper::areQuantizeAndDequantizeSupportedForSubtract(node); + }); +} + +manager.run_passes(nGraphFunc); +//! [lpt_common] + +//! 
[lpt_execution] +using namespace ngraph::pass::low_precision; +if (useLpt) { + // Low precision transformations plugin specific configuration: restrictions definition + auto supportedPrecisions = std::vector({ + OperationPrecisionRestriction::create({ + {0, {ngraph::element::u8}}, + {1, {ngraph::element::i8}}, + }), + OperationPrecisionRestriction::create({ + {0, {ngraph::element::u8, ngraph::element::i8}}, + {1, {ngraph::element::i8}} + }), + OperationPrecisionRestriction::create({ + {0, {ngraph::element::u8}}, + {1, {ngraph::element::i8}} + }), + OperationPrecisionRestriction::create({ + {0, {ngraph::element::u8}}, + {1, {ngraph::element::i8}}, + }), + }); + + // Low precision transformations plugin specific configuration: per-tensor quantization operations definition + auto perTensorQuantization = std::vector({ + OperationPerTensorQuantizationRestriction::create({0}), + OperationPerTensorQuantizationRestriction::create({0}) + }); + + // Low precision transformations instantiation and registration in pass manager + ngraph::pass::Manager lptManager; + lptManager.register_pass(supportedPrecisions, perTensorQuantization); + + // Low precision transformations plugin specific configuration: transformation callbacks definition + lptManager.get_pass_config()->set_callback([](const std::shared_ptr& node) -> bool { + if (const auto multiply = std::dynamic_pointer_cast(node)) { + return !MultiplyToGroupConvolutionTransformation::canBeTransformedToGroupConvolution(multiply); + } + return false; + }); + lptManager.get_pass_config()->set_callback([](const std::shared_ptr& node) -> bool { + return LayerTransformation::isAsymmetricQuantization(node) || WeightableLayerTransformation::isAsymmetricOnWeights(node); + }); + lptManager.get_pass_config()->set_callback([](const std::shared_ptr& node) -> bool { + return MultiplyToGroupConvolutionTransformation::isDynamicOrScalar(node); + }); + + // Low precision transformations execution + lptManager.run_passes(nGraphFunc); +} +//! 
[lpt_execution] + +//! [lpt_device] +ngraph::pass::Manager deviceSpecificManager; +deviceSpecificManager.register_pass(); +deviceSpecificManager.run_passes(nGraphFunc); +//! [lpt_device] + +return 0; +} + +int lpt_supported_precisions() { +std::shared_ptr nGraphFunc; +ngraph::pass::Manager manager; + +using namespace ngraph::pass::low_precision; +//! [lpt_supported_precisions] +auto supportedPrecisions = std::vector({ + OperationPrecisionRestriction::create({ + {0, {ngraph::element::u8}}, + {1, {ngraph::element::i8}}, + }), +}); + +ngraph::pass::Manager lptManager; +lptManager.register_pass(supportedPrecisions); +lptManager.run_passes(nGraphFunc); +//! [lpt_supported_precisions] + +ngraph::pass::Manager deviceSpecificManager; +deviceSpecificManager.register_pass(); +deviceSpecificManager.run_passes(nGraphFunc); + +return 0; +} + +int per_tensor_quantization() { +std::shared_ptr nGraphFunc; +//! [per_tensor_quantization] +using namespace ngraph::pass::low_precision; + +const std::vector emptyRestrictions; + +auto perTensorQuantization = std::vector({ + OperationPerTensorQuantizationRestriction::create({0}) +}); + +ngraph::pass::Manager lptManager; +lptManager.register_pass(emptyRestrictions, perTensorQuantization); +lptManager.run_passes(nGraphFunc); +//! [per_tensor_quantization] + +return 0; +} + +int asymmetric_quantization() { +std::shared_ptr nGraphFunc; +ngraph::pass::Manager manager; +auto pass_config = manager.get_pass_config(); + + +//! [asymmetric_quantization] +using namespace ngraph::pass::low_precision; +ngraph::pass::Manager lptManager; +lptManager.register_pass(); +lptManager.get_pass_config()->set_callback([](const std::shared_ptr& node) -> bool { + return LayerTransformation::isAsymmetricQuantization(node) || WeightableLayerTransformation::isAsymmetricOnWeights(node); +}); +lptManager.run_passes(nGraphFunc); +//! 
[asymmetric_quantization] + +return 0; +} + +int lpt_markup_pipeline() { +std::shared_ptr nGraphFunc; +ngraph::pass::Manager manager; + +using namespace ngraph::pass::low_precision; +//! [lpt_markup_pipeline] +auto supportedPrecisions = std::vector({ + OperationPrecisionRestriction::create({ + {0, {ngraph::element::u8}}, + {1, {ngraph::element::i8}}, + }), +}); + +auto perTensorQuantization = std::vector({ + OperationPerTensorQuantizationRestriction::create({0}) +}); + +ngraph::pass::Manager lptManager; +lptManager.register_pass(supportedPrecisions, perTensorQuantization); +lptManager.run_passes(nGraphFunc); +//! [lpt_markup_pipeline] + +ngraph::pass::Manager deviceSpecificManager; +deviceSpecificManager.register_pass(); +deviceSpecificManager.run_passes(nGraphFunc); + +return 0; +} diff --git a/src/common/low_precision_transformations/include/low_precision/add.hpp b/src/common/low_precision_transformations/include/low_precision/add.hpp index 92caba9f382..f5bfc4d06fb 100644 --- a/src/common/low_precision_transformations/include/low_precision/add.hpp +++ b/src/common/low_precision_transformations/include/low_precision/add.hpp @@ -11,6 +11,15 @@ namespace ngraph { namespace pass { namespace low_precision { +/** + * @ingroup ie_transformation_common_api + * @brief AddTransformation propagates dequantization subtraction from one input branch to another and + * propagates dequantization multiplication from the same branch through Add operation. + * + * For more details about the transformation, refer to + * [AddTransformation](@ref openvino_docs_IE_DG_lpt_AddTransformation) page + * in the Inference Engine Developer Guide. 
+ */ class LP_TRANSFORMATIONS_API AddTransformation : public EltwiseBaseTransformation { public: NGRAPH_RTTI_DECLARATION; diff --git a/src/common/low_precision_transformations/include/low_precision/align_quantization_intervals.hpp b/src/common/low_precision_transformations/include/low_precision/align_quantization_intervals.hpp index 87befcfd24f..63500cf39b6 100644 --- a/src/common/low_precision_transformations/include/low_precision/align_quantization_intervals.hpp +++ b/src/common/low_precision_transformations/include/low_precision/align_quantization_intervals.hpp @@ -18,6 +18,15 @@ class LP_TRANSFORMATIONS_API AlignQuantizationIntervals; } // namespace pass } // namespace ngraph +/** + * @ingroup ie_transformation_common_api + * @brief AlignQuantizationIntervals transformation marks precision preserved operations subgraph by `IntervalsAlignmentAttribute` + * after FakeQuantize operations. + * + * For more details about the transformation, refer to + * [AlignQuantizationIntervals](@ref openvino_docs_IE_DG_lpt_AlignQuantizationIntervals) page + * in the Inference Engine Developer Guide. 
+ */ class ngraph::pass::low_precision::AlignQuantizationIntervals : public ngraph::pass::FunctionPass { public: NGRAPH_RTTI_DECLARATION; diff --git a/src/common/low_precision_transformations/include/low_precision/align_quantization_parameters.hpp b/src/common/low_precision_transformations/include/low_precision/align_quantization_parameters.hpp index 1b354c5fd5c..f45d447cdfe 100644 --- a/src/common/low_precision_transformations/include/low_precision/align_quantization_parameters.hpp +++ b/src/common/low_precision_transformations/include/low_precision/align_quantization_parameters.hpp @@ -19,6 +19,15 @@ class LP_TRANSFORMATIONS_API AlignQuantizationParameters; } // namespace pass } // namespace ngraph +/** + * @ingroup ie_transformation_common_api + * @brief AlignQuantizationParameters transformation marks precision preserved operations subgraph by `QuantizationAlignmentAttribute` + * attribute after FakeQuantize operations. + * + * For more details about the transformation, refer to + * [AlignQuantizationParameters](@ref openvino_docs_IE_DG_lpt_AlignQuantizationParameters) page + * in the Inference Engine Developer Guide. + */ class ngraph::pass::low_precision::AlignQuantizationParameters : public ngraph::pass::FunctionPass { public: NGRAPH_RTTI_DECLARATION; diff --git a/src/common/low_precision_transformations/include/low_precision/avg_pool.hpp b/src/common/low_precision_transformations/include/low_precision/avg_pool.hpp index 12d5eaf7e2a..f6fa113fc8a 100644 --- a/src/common/low_precision_transformations/include/low_precision/avg_pool.hpp +++ b/src/common/low_precision_transformations/include/low_precision/avg_pool.hpp @@ -11,6 +11,14 @@ namespace ngraph { namespace pass { namespace low_precision { +/** + * @ingroup ie_transformation_common_api + * @brief AvgPoolTransformation propagates dequantization operations through AvgPool operation. 
+ * + * For more details about the transformation, refer to + * [AvgPoolTransformation](@ref openvino_docs_IE_DG_lpt_AvgPoolTransformation) page + * in the Inference Engine Developer Guide. + */ class LP_TRANSFORMATIONS_API AvgPoolTransformation : public LayerTransformation { public: NGRAPH_RTTI_DECLARATION; diff --git a/src/common/low_precision_transformations/include/low_precision/clamp.hpp b/src/common/low_precision_transformations/include/low_precision/clamp.hpp index a3cf76a1284..0af98ae690a 100644 --- a/src/common/low_precision_transformations/include/low_precision/clamp.hpp +++ b/src/common/low_precision_transformations/include/low_precision/clamp.hpp @@ -12,6 +12,14 @@ namespace ngraph { namespace pass { namespace low_precision { +/** + * @ingroup ie_transformation_common_api + * @brief ClampTransformation propagates dequantization operations through Clamp operation. + * + * For more details about the transformation, refer to + * [ClampTransformation](@ref openvino_docs_IE_DG_lpt_ClampTransformation) page + * in the Inference Engine Developer Guide. + */ class LP_TRANSFORMATIONS_API ClampTransformation : public LayerTransformation { public: NGRAPH_RTTI_DECLARATION; diff --git a/src/common/low_precision_transformations/include/low_precision/concat.hpp b/src/common/low_precision_transformations/include/low_precision/concat.hpp index c1f752972ad..448b600f994 100644 --- a/src/common/low_precision_transformations/include/low_precision/concat.hpp +++ b/src/common/low_precision_transformations/include/low_precision/concat.hpp @@ -19,6 +19,14 @@ namespace ngraph { namespace pass { namespace low_precision { +/** + * @ingroup ie_transformation_common_api + * @brief ConcatTransformation propagates dequantization operations through Concat operation. + * + * For more details about the transformation, refer to + * [ConcatTransformation](@ref openvino_docs_IE_DG_lpt_ConcatTransformation) page + * in the Inference Engine Developer Guide. 
+ */ class LP_TRANSFORMATIONS_API ConcatTransformation : public LayerTransformation { public: NGRAPH_RTTI_DECLARATION; diff --git a/src/common/low_precision_transformations/include/low_precision/convert_subtract_constant.hpp b/src/common/low_precision_transformations/include/low_precision/convert_subtract_constant.hpp index f9584eb6842..d03b4895538 100644 --- a/src/common/low_precision_transformations/include/low_precision/convert_subtract_constant.hpp +++ b/src/common/low_precision_transformations/include/low_precision/convert_subtract_constant.hpp @@ -20,6 +20,15 @@ class LP_TRANSFORMATIONS_API ConvertSubtractConstant; } // namespace pass } // namespace ngraph +/** + * @ingroup ie_transformation_common_api + * @brief ConvertSubtractConstant marks Convert operations on constant subgraph by DISABLED_CONSTANT_FOLDING attribute + * to prevent constant folding. + * + * For more details about the transformation, refer to + * [ConvertSubtractConstant](@ref openvino_docs_IE_DG_lpt_ConvertSubtractConstant) page + * in the Inference Engine Developer Guide. + */ class ngraph::pass::low_precision::ConvertSubtractConstant : public ngraph::pass::MatcherPass { public: NGRAPH_RTTI_DECLARATION; diff --git a/src/common/low_precision_transformations/include/low_precision/convolution.hpp b/src/common/low_precision_transformations/include/low_precision/convolution.hpp index b49fcc89c4a..c124f1a7bf9 100644 --- a/src/common/low_precision_transformations/include/low_precision/convolution.hpp +++ b/src/common/low_precision_transformations/include/low_precision/convolution.hpp @@ -11,6 +11,14 @@ namespace ngraph { namespace pass { namespace low_precision { +/** + * @ingroup ie_transformation_common_api + * @brief ConvolutionTransformation propagates dequantization operations through Convolution operation. 
+ * + * For more details about the transformation, refer to + * [ConvolutionTransformation](@ref openvino_docs_IE_DG_lpt_ConvolutionTransformation) page + * in the Inference Engine Developer Guide. + */ class LP_TRANSFORMATIONS_API ConvolutionTransformation : public WeightableLayerTransformation { public: NGRAPH_RTTI_DECLARATION; diff --git a/src/common/low_precision_transformations/include/low_precision/convolution_backprop_data.hpp b/src/common/low_precision_transformations/include/low_precision/convolution_backprop_data.hpp index a1176e71eff..c64cc7198c2 100644 --- a/src/common/low_precision_transformations/include/low_precision/convolution_backprop_data.hpp +++ b/src/common/low_precision_transformations/include/low_precision/convolution_backprop_data.hpp @@ -11,6 +11,14 @@ namespace ngraph { namespace pass { namespace low_precision { +/** + * @ingroup ie_transformation_common_api + * @brief ConvolutionBackpropDataTransformation propagates dequantization operations through ConvolutionBackpropData operation. + * + * For more details about the transformation, refer to + * [ConvolutionBackpropDataTransformation](@ref openvino_docs_IE_DG_lpt_ConvolutionBackpropDataTransformation) page in + * the Inference Engine Developer Guide. 
+ */ class LP_TRANSFORMATIONS_API ConvolutionBackpropDataTransformation : public WeightableLayerTransformation { public: ConvolutionBackpropDataTransformation(const Params& params = Params()); diff --git a/src/common/low_precision_transformations/include/low_precision/create_attribute.hpp b/src/common/low_precision_transformations/include/low_precision/create_attribute.hpp index c7b7c468826..8388003778b 100644 --- a/src/common/low_precision_transformations/include/low_precision/create_attribute.hpp +++ b/src/common/low_precision_transformations/include/low_precision/create_attribute.hpp @@ -31,6 +31,13 @@ enum class AttributeSource { OutputPort }; +/** + * @ingroup ie_transformation_common_api + * @brief CreateAttribute transformation marks OperationType operations by AttributeType attribute. + * + * For more details about the transformation, refer to + * [CreateAttribute](@ref openvino_docs_IE_DG_lpt_CreateAttribute) page in the Inference Engine Developer Guide. + */ template class ngraph::pass::low_precision::CreateAttribute : public ngraph::pass::low_precision::BaseMatcherPass { public: diff --git a/src/common/low_precision_transformations/include/low_precision/create_precisions_dependent_attribute.hpp b/src/common/low_precision_transformations/include/low_precision/create_precisions_dependent_attribute.hpp index 2e05cc85761..e157940b12d 100644 --- a/src/common/low_precision_transformations/include/low_precision/create_precisions_dependent_attribute.hpp +++ b/src/common/low_precision_transformations/include/low_precision/create_precisions_dependent_attribute.hpp @@ -29,6 +29,15 @@ class CreatePrecisionsDependentAttribute; } // namespace pass } // namespace ngraph +/** + * @ingroup ie_transformation_common_api + * @brief CreatePrecisionsDependentAttribute transformation marks OperationType operations by + * PrecisionPreservedAttribute and AttributeType attributes with the same shared part. 
+ * + * For more details about the transformation, refer to + * [CreatePrecisionsDependentAttribute](@ref openvino_docs_IE_DG_lpt_CreatePrecisionsDependentAttribute) page + * in the Inference Engine Developer Guide. + */ template class ngraph::pass::low_precision::CreatePrecisionsDependentAttribute : public ngraph::pass::MatcherPass { public: diff --git a/src/common/low_precision_transformations/include/low_precision/depth_to_space.hpp b/src/common/low_precision_transformations/include/low_precision/depth_to_space.hpp index 5a199454eb6..20e21110f56 100644 --- a/src/common/low_precision_transformations/include/low_precision/depth_to_space.hpp +++ b/src/common/low_precision_transformations/include/low_precision/depth_to_space.hpp @@ -10,6 +10,14 @@ namespace ngraph { namespace pass { namespace low_precision { +/** + * @ingroup ie_transformation_common_api + * @brief DepthToSpaceTransformation propagates dequantization operations through DepthToSpace operation. + * + * For more details about the transformation, refer to + * [DepthToSpaceTransformation](@ref openvino_docs_IE_DG_lpt_DepthToSpaceTransformation) page + * in the Inference Engine Developer Guide. + */ class LP_TRANSFORMATIONS_API DepthToSpaceTransformation : public TransparentBaseTransformation { public: NGRAPH_RTTI_DECLARATION; diff --git a/src/common/low_precision_transformations/include/low_precision/eltwise_base_transformation.hpp b/src/common/low_precision_transformations/include/low_precision/eltwise_base_transformation.hpp index c648d6efadc..312dd5af31a 100644 --- a/src/common/low_precision_transformations/include/low_precision/eltwise_base_transformation.hpp +++ b/src/common/low_precision_transformations/include/low_precision/eltwise_base_transformation.hpp @@ -12,6 +12,10 @@ namespace ngraph { namespace pass { namespace low_precision { +/** + * @ingroup ie_transformation_common_api + * @brief EltwiseBaseTransformation is base class for element-wise LPT transformations. 
+ */ class LP_TRANSFORMATIONS_API EltwiseBaseTransformation : public LayerTransformation { public: EltwiseBaseTransformation(const Params& params) : LayerTransformation(params) {} diff --git a/src/common/low_precision_transformations/include/low_precision/fake_quantize.hpp b/src/common/low_precision_transformations/include/low_precision/fake_quantize.hpp index 6a3f84b6b4c..cb564222467 100644 --- a/src/common/low_precision_transformations/include/low_precision/fake_quantize.hpp +++ b/src/common/low_precision_transformations/include/low_precision/fake_quantize.hpp @@ -13,6 +13,14 @@ namespace ngraph { namespace pass { namespace low_precision { +/** + * @ingroup ie_transformation_common_api + * @brief FakeQuantizeTransformation fuses dequantization operations into FakeQuantize operation. + * + * For more details about the transformation, refer to + * [FakeQuantizeTransformation](@ref openvino_docs_IE_DG_lpt_FakeQuantizeTransformation) page + * in the Inference Engine Developer Guide. + */ class LP_TRANSFORMATIONS_API FakeQuantizeTransformation : public LayerTransformation { public: NGRAPH_RTTI_DECLARATION; diff --git a/src/common/low_precision_transformations/include/low_precision/fake_quantize_decomposition.hpp b/src/common/low_precision_transformations/include/low_precision/fake_quantize_decomposition.hpp index 45948ca32b7..7123fbe0157 100644 --- a/src/common/low_precision_transformations/include/low_precision/fake_quantize_decomposition.hpp +++ b/src/common/low_precision_transformations/include/low_precision/fake_quantize_decomposition.hpp @@ -13,6 +13,15 @@ namespace ngraph { namespace pass { namespace low_precision { +/** + * @ingroup ie_transformation_common_api + * @brief FakeQuantizeDecompositionTransformation decomposes FakeQuantize operations to quantize + * (FakeQuantize with changes output intervals and low precision output type) and dequantize operations. 
+ * + * For more details about the transformation, refer to + * [FakeQuantizeDecompositionTransformation](@ref openvino_docs_IE_DG_lpt_FakeQuantizeDecompositionTransformation) page + * in the Inference Engine Developer Guide. + */ class LP_TRANSFORMATIONS_API FakeQuantizeDecompositionTransformation : public LayerTransformation { public: NGRAPH_RTTI_DECLARATION; diff --git a/src/common/low_precision_transformations/include/low_precision/fold_convert.hpp b/src/common/low_precision_transformations/include/low_precision/fold_convert.hpp index 4390b7290e2..0c5fd8cf002 100644 --- a/src/common/low_precision_transformations/include/low_precision/fold_convert.hpp +++ b/src/common/low_precision_transformations/include/low_precision/fold_convert.hpp @@ -12,6 +12,14 @@ namespace ngraph { namespace pass { namespace low_precision { +/** + * @ingroup ie_transformation_common_api + * @brief FoldConvertTransformation evaluates Convert operation on Subtract constant subgraph. + * + * For more details about the transformation, refer to + * [FoldConvertTransformation](@ref openvino_docs_IE_DG_lpt_FoldConvertTransformation) page + * in the Inference Engine Developer Guide. + */ class LP_TRANSFORMATIONS_API FoldConvertTransformation : public LayerTransformation { public: NGRAPH_RTTI_DECLARATION; diff --git a/src/common/low_precision_transformations/include/low_precision/fold_fake_quantize.hpp b/src/common/low_precision_transformations/include/low_precision/fold_fake_quantize.hpp index 7f2862fc942..474fd4dfe8e 100644 --- a/src/common/low_precision_transformations/include/low_precision/fold_fake_quantize.hpp +++ b/src/common/low_precision_transformations/include/low_precision/fold_fake_quantize.hpp @@ -11,6 +11,14 @@ namespace ngraph { namespace pass { namespace low_precision { +/** + * @ingroup ie_transformation_common_api + * @brief FoldFakeQuantizeTransformation evaluate FakeQuantize operations. 
+ * + * For more details about the transformation, refer to + * [FoldFakeQuantizeTransformation](@ref openvino_docs_IE_DG_lpt_FoldFakeQuantizeTransformation) page + * in the Inference Engine Developer Guide. + */ class LP_TRANSFORMATIONS_API FoldFakeQuantizeTransformation : public LayerTransformation { public: NGRAPH_RTTI_DECLARATION; diff --git a/src/common/low_precision_transformations/include/low_precision/fuse_convert.hpp b/src/common/low_precision_transformations/include/low_precision/fuse_convert.hpp index 4ccc59808ad..24ee1ee8994 100644 --- a/src/common/low_precision_transformations/include/low_precision/fuse_convert.hpp +++ b/src/common/low_precision_transformations/include/low_precision/fuse_convert.hpp @@ -12,6 +12,14 @@ namespace ngraph { namespace pass { namespace low_precision { +/** + * @ingroup ie_transformation_common_api + * @brief FuseConvertTransformation fuses Convert operation with Multiply, Subtract or Add operations. + * + * For more details about the transformation, refer to + * [FuseConvertTransformation](@ref openvino_docs_IE_DG_lpt_FuseConvertTransformation) page + * in the Inference Engine Developer Guide. + */ class LP_TRANSFORMATIONS_API FuseConvertTransformation : public LayerTransformation { public: NGRAPH_RTTI_DECLARATION; diff --git a/src/common/low_precision_transformations/include/low_precision/fuse_multiply_to_fake_quantize.hpp b/src/common/low_precision_transformations/include/low_precision/fuse_multiply_to_fake_quantize.hpp index d43aa87441e..335eb292be9 100644 --- a/src/common/low_precision_transformations/include/low_precision/fuse_multiply_to_fake_quantize.hpp +++ b/src/common/low_precision_transformations/include/low_precision/fuse_multiply_to_fake_quantize.hpp @@ -12,6 +12,14 @@ namespace ngraph { namespace pass { namespace low_precision { +/** + * @ingroup ie_transformation_common_api + * @brief FuseMultiplyToFakeQuantizeTransformation fuses Multiply operation to FakeQuantize. 
+ * + * For more details about the transformation, refer to + * [FuseMultiplyToFakeQuantizeTransformation](@ref openvino_docs_IE_DG_lpt_FuseMultiplyToFakeQuantizeTransformation) page + * in the Inference Engine Developer Guide. + */ class LP_TRANSFORMATIONS_API FuseMultiplyToFakeQuantizeTransformation : public LayerTransformation { public: NGRAPH_RTTI_DECLARATION; diff --git a/src/common/low_precision_transformations/include/low_precision/fuse_subtract_to_fake_quantize.hpp b/src/common/low_precision_transformations/include/low_precision/fuse_subtract_to_fake_quantize.hpp index 80d6f22f785..6b06e1505fc 100644 --- a/src/common/low_precision_transformations/include/low_precision/fuse_subtract_to_fake_quantize.hpp +++ b/src/common/low_precision_transformations/include/low_precision/fuse_subtract_to_fake_quantize.hpp @@ -12,6 +12,14 @@ namespace ngraph { namespace pass { namespace low_precision { +/** + * @ingroup ie_transformation_common_api + * @brief FuseSubtractToFakeQuantizeTransformation fuses Subtract operation to FakeQuantize. + * + * For more details about the transformation, refer to + * [FuseSubtractToFakeQuantizeTransformation](@ref openvino_docs_IE_DG_lpt_FuseSubtractToFakeQuantizeTransformation) page + * in the Inference Engine Developer Guide. 
+ */ class LP_TRANSFORMATIONS_API FuseSubtractToFakeQuantizeTransformation : public LayerTransformation { public: NGRAPH_RTTI_DECLARATION; diff --git a/src/common/low_precision_transformations/include/low_precision/group_convolution.hpp b/src/common/low_precision_transformations/include/low_precision/group_convolution.hpp index b54921faf69..2e249fd7947 100644 --- a/src/common/low_precision_transformations/include/low_precision/group_convolution.hpp +++ b/src/common/low_precision_transformations/include/low_precision/group_convolution.hpp @@ -11,6 +11,14 @@ namespace ngraph { namespace pass { namespace low_precision { +/** + * @ingroup ie_transformation_common_api + * @brief GroupConvolutionTransformation propagates dequantization operations through GroupConvolution operation. + * + * For more details about the transformation, refer to + * [GroupConvolutionTransformation](@ref openvino_docs_IE_DG_lpt_GroupConvolutionTransformation) page + * in the Inference Engine Developer Guide. + */ class LP_TRANSFORMATIONS_API GroupConvolutionTransformation : public ConvolutionTransformation { public: NGRAPH_RTTI_DECLARATION; diff --git a/src/common/low_precision_transformations/include/low_precision/interpolate.hpp b/src/common/low_precision_transformations/include/low_precision/interpolate.hpp index 9d454e59542..cfb7d1c3a80 100644 --- a/src/common/low_precision_transformations/include/low_precision/interpolate.hpp +++ b/src/common/low_precision_transformations/include/low_precision/interpolate.hpp @@ -10,6 +10,14 @@ namespace ngraph { namespace pass { namespace low_precision { +/** + * @ingroup ie_transformation_common_api + * @brief InterpolateTransformation propagates dequantization operations through Interpolate operation. + * + * For more details about the transformation, refer to + * [InterpolateTransformation](@ref openvino_docs_IE_DG_lpt_InterpolateTransformation) page + * in the Inference Engine Developer Guide. 
+ */ class LP_TRANSFORMATIONS_API InterpolateTransformation : public LayerTransformation { public: NGRAPH_RTTI_DECLARATION; diff --git a/src/common/low_precision_transformations/include/low_precision/layer_transformation.hpp b/src/common/low_precision_transformations/include/low_precision/layer_transformation.hpp index 7befc214a7d..f7162d8c6fd 100644 --- a/src/common/low_precision_transformations/include/low_precision/layer_transformation.hpp +++ b/src/common/low_precision_transformations/include/low_precision/layer_transformation.hpp @@ -225,7 +225,10 @@ inline std::ostream &operator << (std::ostream &os, const DataPrecision& value) return os; } -// Base class for all LP transformations, holds some common data structures +/** + * @ingroup ie_transformation_common_api + * @brief Base class for low precision transformation. + */ class LP_TRANSFORMATIONS_API LayerTransformation : public ngraph::pass::MatcherPass { static std::vector defaultPrecisions; static std::mutex defaultPrecisionsMutex; diff --git a/src/common/low_precision_transformations/include/low_precision/markup_avg_pool_precision_preserved.hpp b/src/common/low_precision_transformations/include/low_precision/markup_avg_pool_precision_preserved.hpp index eaa9e7878c9..d2d3f6d75c6 100644 --- a/src/common/low_precision_transformations/include/low_precision/markup_avg_pool_precision_preserved.hpp +++ b/src/common/low_precision_transformations/include/low_precision/markup_avg_pool_precision_preserved.hpp @@ -18,6 +18,14 @@ class LP_TRANSFORMATIONS_API MarkupAvgPoolPrecisionPreserved; } // namespace pass } // namespace ngraph +/** + * @ingroup ie_transformation_common_api + * @brief MarkupAvgPoolPrecisionPreserved transformation marks AvgPool operations as precision preserved or not. + * + * For more details about the transformation, refer to + * [MarkupAvgPoolPrecisionPreserved](@ref openvino_docs_IE_DG_lpt_MarkupAvgPoolPrecisionPreserved) page + * in the Inference Engine Developer Guide. 
+ */ class ngraph::pass::low_precision::MarkupAvgPoolPrecisionPreserved : public ngraph::pass::FunctionPass { public: NGRAPH_RTTI_DECLARATION; diff --git a/src/common/low_precision_transformations/include/low_precision/markup_can_be_quantized.hpp b/src/common/low_precision_transformations/include/low_precision/markup_can_be_quantized.hpp index 7e11d856e97..81885274cb1 100644 --- a/src/common/low_precision_transformations/include/low_precision/markup_can_be_quantized.hpp +++ b/src/common/low_precision_transformations/include/low_precision/markup_can_be_quantized.hpp @@ -18,6 +18,16 @@ class LP_TRANSFORMATIONS_API MarkupCanBeQuantized; } // namespace pass } // namespace ngraph +/** + * @ingroup ie_transformation_common_api + * @brief MarkupCanBeQuantized transformation marks Convolution, ConvolutionBackpropData, GroupConvolution and Concat + * operations as able to be quantized or not. If an operation is not quantized, then PrecisionsAttribute attribute instance + * is created with empty precisions. + * + * For more details about the transformation, refer to + * [MarkupCanBeQuantized](@ref openvino_docs_IE_DG_lpt_MarkupCanBeQuantized) page + * in the Inference Engine Developer Guide. 
+ */ class ngraph::pass::low_precision::MarkupCanBeQuantized : public ngraph::pass::FunctionPass { public: NGRAPH_RTTI_DECLARATION; diff --git a/src/common/low_precision_transformations/include/low_precision/markup_per_tensor_quantization.hpp b/src/common/low_precision_transformations/include/low_precision/markup_per_tensor_quantization.hpp index 5cdbe43d018..fda9a25030d 100644 --- a/src/common/low_precision_transformations/include/low_precision/markup_per_tensor_quantization.hpp +++ b/src/common/low_precision_transformations/include/low_precision/markup_per_tensor_quantization.hpp @@ -22,6 +22,15 @@ class LP_TRANSFORMATIONS_API MarkupPerTensorQuantization; } // namespace pass } // namespace ngraph +/** + * @ingroup ie_transformation_common_api + * @brief MarkupPerTensorQuantization transformation marks operations as required per-tensor quantization according to the + * provided restrictions. + * + * For more details about the transformation, refer to + * [MarkupPerTensorQuantization](@ref openvino_docs_IE_DG_lpt_MarkupPerTensorQuantization) page + * in the Inference Engine Developer Guide. 
+ */ class ngraph::pass::low_precision::MarkupPerTensorQuantization : public ngraph::pass::FunctionPass { public: class PerTensorQuantization { diff --git a/src/common/low_precision_transformations/include/low_precision/markup_precisions.hpp b/src/common/low_precision_transformations/include/low_precision/markup_precisions.hpp index 4e5b484f7c4..87c9a0d0563 100644 --- a/src/common/low_precision_transformations/include/low_precision/markup_precisions.hpp +++ b/src/common/low_precision_transformations/include/low_precision/markup_precisions.hpp @@ -23,6 +23,17 @@ class LP_TRANSFORMATIONS_API MarkupPrecisions; } // namespace ngraph // Transformation is used to add customization options runtime +/** + * @ingroup ie_transformation_common_api + * @brief MarkupPrecisions transformation marks: + * 1) not supported operations by PrecisionsAttribute attribute with empty precisions, + * 2) operations with required precisions by PrecisionsAttribute attribute according to the provided restrictions, + * 3) precision preserved operations by PrecisionPreservedAttribute attribute. + * + * For more details about the transformation, refer to + * [MarkupPrecisions](@ref openvino_docs_IE_DG_lpt_MarkupPrecisions) page + * in the Inference Engine Developer Guide. + */ class ngraph::pass::low_precision::MarkupPrecisions : public ngraph::pass::FunctionPass { public: class Restriction { diff --git a/src/common/low_precision_transformations/include/low_precision/mat_mul.hpp b/src/common/low_precision_transformations/include/low_precision/mat_mul.hpp index 067f82ea59b..a97e896bd30 100644 --- a/src/common/low_precision_transformations/include/low_precision/mat_mul.hpp +++ b/src/common/low_precision_transformations/include/low_precision/mat_mul.hpp @@ -11,6 +11,14 @@ namespace ngraph { namespace pass { namespace low_precision { +/** + * @ingroup ie_transformation_common_api + * @brief MatMulTransformation propagates dequantization operations through MatMul operation. 
+ * + * For more details about the transformation, refer to + * [MatMulTransformation](@ref openvino_docs_IE_DG_lpt_MatMulTransformation) page + * in the Inference Engine Developer Guide. + */ class LP_TRANSFORMATIONS_API MatMulTransformation : public LayerTransformation { public: NGRAPH_RTTI_DECLARATION; diff --git a/src/common/low_precision_transformations/include/low_precision/max_pool.hpp b/src/common/low_precision_transformations/include/low_precision/max_pool.hpp index ca2b8a08272..dcea90fca82 100644 --- a/src/common/low_precision_transformations/include/low_precision/max_pool.hpp +++ b/src/common/low_precision_transformations/include/low_precision/max_pool.hpp @@ -12,6 +12,14 @@ namespace ngraph { namespace pass { namespace low_precision { +/** + * @ingroup ie_transformation_common_api + * @brief MaxPoolTransformation propagates dequantization operations through MaxPool operation. + * + * For more details about the transformation, refer to + * [MaxPoolTransformation](@ref openvino_docs_IE_DG_lpt_MaxPoolTransformation) page + * in the Inference Engine Developer Guide. + */ class LP_TRANSFORMATIONS_API MaxPoolTransformation : public LayerTransformation { public: NGRAPH_RTTI_DECLARATION; diff --git a/src/common/low_precision_transformations/include/low_precision/multiply.hpp b/src/common/low_precision_transformations/include/low_precision/multiply.hpp index fee17230569..aeec4e8b9d5 100644 --- a/src/common/low_precision_transformations/include/low_precision/multiply.hpp +++ b/src/common/low_precision_transformations/include/low_precision/multiply.hpp @@ -11,6 +11,14 @@ namespace ngraph { namespace pass { namespace low_precision { +/** + * @ingroup ie_transformation_common_api + * @brief MultiplyTransformation propagates dequantization operations through Multiply operation. + * + * For more details about the transformation, refer to + * [MultiplyTransformation](@ref openvino_docs_IE_DG_lpt_MultiplyTransformation) page + * in the Inference Engine Developer Guide. 
+ */ class LP_TRANSFORMATIONS_API MultiplyTransformation : public EltwiseBaseTransformation { public: NGRAPH_RTTI_DECLARATION; diff --git a/src/common/low_precision_transformations/include/low_precision/multiply_to_group_convolution.hpp b/src/common/low_precision_transformations/include/low_precision/multiply_to_group_convolution.hpp index 5e6bd900d8e..eb0122d390d 100644 --- a/src/common/low_precision_transformations/include/low_precision/multiply_to_group_convolution.hpp +++ b/src/common/low_precision_transformations/include/low_precision/multiply_to_group_convolution.hpp @@ -13,6 +13,14 @@ namespace ngraph { namespace pass { namespace low_precision { +/** + * @ingroup ie_transformation_common_api + * @brief MultiplyToGroupConvolutionTransformation replace quantized Multiply operations to GroupConvolution to speed up inference. + * + * For more details about the transformation, refer to + * [MultiplyToGroupConvolutionTransformation](@ref openvino_docs_IE_DG_lpt_MultiplyToGroupConvolutionTransformation) page + * in the Inference Engine Developer Guide. + */ class LP_TRANSFORMATIONS_API MultiplyToGroupConvolutionTransformation : public LayerTransformation { public: NGRAPH_RTTI_DECLARATION; diff --git a/src/common/low_precision_transformations/include/low_precision/mvn.hpp b/src/common/low_precision_transformations/include/low_precision/mvn.hpp index 42ddd6f0b62..a853ccf8911 100644 --- a/src/common/low_precision_transformations/include/low_precision/mvn.hpp +++ b/src/common/low_precision_transformations/include/low_precision/mvn.hpp @@ -10,6 +10,14 @@ namespace ngraph { namespace pass { namespace low_precision { +/** + * @ingroup ie_transformation_common_api + * @brief MVNTransformation propagates dequantization operations through MVN operation. + * + * For more details about the transformation, refer to + * [MVNTransformation](@ref openvino_docs_IE_DG_lpt_MVNTransformation) page + * in the Inference Engine Developer Guide. 
+ */ class LP_TRANSFORMATIONS_API MVNTransformation : public LayerTransformation { public: NGRAPH_RTTI_DECLARATION; diff --git a/src/common/low_precision_transformations/include/low_precision/normalize_l2.hpp b/src/common/low_precision_transformations/include/low_precision/normalize_l2.hpp index 88a113cb38a..28250fadd21 100644 --- a/src/common/low_precision_transformations/include/low_precision/normalize_l2.hpp +++ b/src/common/low_precision_transformations/include/low_precision/normalize_l2.hpp @@ -10,6 +10,14 @@ namespace ngraph { namespace pass { namespace low_precision { +/** + * @ingroup ie_transformation_common_api + * @brief NormalizeL2Transformation propagates dequantization operations through NormalizeL2 operation. + * + * For more details about the transformation, refer to + * [NormalizeL2Transformation](@ref openvino_docs_IE_DG_lpt_NormalizeL2Transformation) page + * in the Inference Engine Developer Guide. + */ class LP_TRANSFORMATIONS_API NormalizeL2Transformation : public LayerTransformation { public: NGRAPH_RTTI_DECLARATION; diff --git a/src/common/low_precision_transformations/include/low_precision/pad.hpp b/src/common/low_precision_transformations/include/low_precision/pad.hpp index 66691f3871a..ce01ca32b5d 100644 --- a/src/common/low_precision_transformations/include/low_precision/pad.hpp +++ b/src/common/low_precision_transformations/include/low_precision/pad.hpp @@ -12,6 +12,14 @@ namespace ngraph { namespace pass { namespace low_precision { +/** + * @ingroup ie_transformation_common_api + * @brief PadTransformation propagates dequantization operations through Pad operation. + * + * For more details about the transformation, refer to + * [PadTransformation](@ref openvino_docs_IE_DG_lpt_PadTransformation) page + * in the Inference Engine Developer Guide. 
+ */ class LP_TRANSFORMATIONS_API PadTransformation : public LayerTransformation { public: NGRAPH_RTTI_DECLARATION; diff --git a/src/common/low_precision_transformations/include/low_precision/prelu.hpp b/src/common/low_precision_transformations/include/low_precision/prelu.hpp index e58d4b25615..e93d70a9078 100644 --- a/src/common/low_precision_transformations/include/low_precision/prelu.hpp +++ b/src/common/low_precision_transformations/include/low_precision/prelu.hpp @@ -12,6 +12,14 @@ namespace ngraph { namespace pass { namespace low_precision { +/** + * @ingroup ie_transformation_common_api + * @brief PReluTransformation propagates dequantization operations through PRelu operation. + * + * For more details about the transformation, refer to + * [PReluTransformation](@ref openvino_docs_IE_DG_lpt_PReluTransformation) page + * in the Inference Engine Developer Guide. + */ class LP_TRANSFORMATIONS_API PReluTransformation : public LayerTransformation { public: NGRAPH_RTTI_DECLARATION; diff --git a/src/common/low_precision_transformations/include/low_precision/propagate_precisions.hpp b/src/common/low_precision_transformations/include/low_precision/propagate_precisions.hpp index 5ed4f929026..57e8eb07da3 100644 --- a/src/common/low_precision_transformations/include/low_precision/propagate_precisions.hpp +++ b/src/common/low_precision_transformations/include/low_precision/propagate_precisions.hpp @@ -22,6 +22,14 @@ class LP_TRANSFORMATIONS_API PropagatePrecisions; } // namespace pass } // namespace ngraph +/** + * @ingroup ie_transformation_common_api + * @brief PropagatePrecisions transformation propagates PrecisionsAttribute attribute instances through precision preserved operations. + * + * For more details about the transformation, refer to + * [PropagatePrecisions](@ref openvino_docs_IE_DG_lpt_PropagatePrecisions) page + * in the Inference Engine Developer Guide. 
+ */ class ngraph::pass::low_precision::PropagatePrecisions : public ngraph::pass::FunctionPass { public: NGRAPH_RTTI_DECLARATION; diff --git a/src/common/low_precision_transformations/include/low_precision/propagate_shared_value.hpp b/src/common/low_precision_transformations/include/low_precision/propagate_shared_value.hpp index 3f05c0b3bf2..2049d062a53 100644 --- a/src/common/low_precision_transformations/include/low_precision/propagate_shared_value.hpp +++ b/src/common/low_precision_transformations/include/low_precision/propagate_shared_value.hpp @@ -27,6 +27,15 @@ class LP_TRANSFORMATIONS_API PropagateSharedValue; } // namespace pass } // namespace ngraph +/** + * @ingroup ie_transformation_common_api + * @brief PropagateSharedValue transformation propagates shared value AttributeType attribute instances + * through precision preserved operations. + * + * For more details about the transformation, refer to + * [PropagateSharedValue](@ref openvino_docs_IE_DG_lpt_PropagateSharedValue) page + * in the Inference Engine Developer Guide. + */ template class ngraph::pass::low_precision::PropagateSharedValue : public ngraph::pass::FunctionPass { public: diff --git a/src/common/low_precision_transformations/include/low_precision/propagate_through_precision_preserved.hpp b/src/common/low_precision_transformations/include/low_precision/propagate_through_precision_preserved.hpp index 844e23bfb95..cf2512e0a52 100644 --- a/src/common/low_precision_transformations/include/low_precision/propagate_through_precision_preserved.hpp +++ b/src/common/low_precision_transformations/include/low_precision/propagate_through_precision_preserved.hpp @@ -27,6 +27,15 @@ class PropagateThroughPrecisionPreserved; } // namespace pass } // namespace ngraph +/** + * @ingroup ie_transformation_common_api + * @brief PropagateThroughPrecisionPreserved transformation propagates AttributeType attribute instances + * through precision preserved operations. 
+ * + * For more details about the transformation, refer to + * [PropagateThroughPrecisionPreserved](@ref openvino_docs_IE_DG_lpt_PropagateThroughPrecisionPreserved) page + * in the Inference Engine Developer Guide. + */ template class ngraph::pass::low_precision::PropagateThroughPrecisionPreserved : public ngraph::pass::MatcherPass { public: diff --git a/src/common/low_precision_transformations/include/low_precision/propagate_to_input.hpp b/src/common/low_precision_transformations/include/low_precision/propagate_to_input.hpp index 42f840c1573..7bc661f292a 100644 --- a/src/common/low_precision_transformations/include/low_precision/propagate_to_input.hpp +++ b/src/common/low_precision_transformations/include/low_precision/propagate_to_input.hpp @@ -26,6 +26,15 @@ class PropagateToInput; } // namespace pass } // namespace ngraph +/** + * @ingroup ie_transformation_common_api + * @brief PropagateToInput transformation propagates AttributeType shared value attribute instances + * from parent output ports to consumers input ports. + * + * For more details about the transformation, refer to + * [PropagateToInput](@ref openvino_docs_IE_DG_lpt_PropagateToInput) page + * in the Inference Engine Developer Guide. 
+ */ template class ngraph::pass::low_precision::PropagateToInput : public ngraph::pass::MatcherPass { public: diff --git a/src/common/low_precision_transformations/include/low_precision/pull_reshape_through_dequantization.hpp b/src/common/low_precision_transformations/include/low_precision/pull_reshape_through_dequantization.hpp index e8bc2add659..4a872e257b9 100644 --- a/src/common/low_precision_transformations/include/low_precision/pull_reshape_through_dequantization.hpp +++ b/src/common/low_precision_transformations/include/low_precision/pull_reshape_through_dequantization.hpp @@ -19,6 +19,15 @@ class LP_TRANSFORMATIONS_API PullReshapeThroughDequantization; } // namespace pass } // namespace ngraph +/** + * @ingroup ie_transformation_common_api + * @brief PullReshapeThroughDequantization propagates dequantization operations through Reshape operations. + * The transformation is used on constant subgraph weights to prepare a model for the next low precision transformations. + * + * For more details about the transformation, refer to + * [PullReshapeThroughDequantization](@ref openvino_docs_IE_DG_lpt_PullReshapeThroughDequantization) page + * in the Inference Engine Developer Guide. 
+ */ class ngraph::pass::low_precision::PullReshapeThroughDequantization : public ngraph::pass::MatcherPass { public: NGRAPH_RTTI_DECLARATION; diff --git a/src/common/low_precision_transformations/include/low_precision/pull_transpose_through_dequantization.hpp b/src/common/low_precision_transformations/include/low_precision/pull_transpose_through_dequantization.hpp index f9d957389e6..973ec50e3c0 100644 --- a/src/common/low_precision_transformations/include/low_precision/pull_transpose_through_dequantization.hpp +++ b/src/common/low_precision_transformations/include/low_precision/pull_transpose_through_dequantization.hpp @@ -19,6 +19,15 @@ class LP_TRANSFORMATIONS_API PullTransposeThroughDequantization; } // namespace pass } // namespace ngraph +/** + * @ingroup ie_transformation_common_api + * @brief PullTransposeThroughDequantization propagates dequantization operations through Transpose operations. + * The transformation is used on constant subgraph weights to prepare a model for the next low precision transformations. + * + * For more details about the transformation, refer to + * [PullTransposeThroughDequantization](@ref openvino_docs_IE_DG_lpt_PullTransposeThroughDequantization) page + * in the Inference Engine Developer Guide. 
+ */ class ngraph::pass::low_precision::PullTransposeThroughDequantization : public ngraph::pass::MatcherPass { public: NGRAPH_RTTI_DECLARATION; diff --git a/src/common/low_precision_transformations/include/low_precision/reduce_base_transformation.hpp b/src/common/low_precision_transformations/include/low_precision/reduce_base_transformation.hpp index 0b9782e4eb2..26c5eb340db 100644 --- a/src/common/low_precision_transformations/include/low_precision/reduce_base_transformation.hpp +++ b/src/common/low_precision_transformations/include/low_precision/reduce_base_transformation.hpp @@ -13,11 +13,11 @@ namespace pass { namespace low_precision { /** -* @brief ReduceBaseTransformation: base class for Reduce*Transformation -* detects dequantization operations in front of the Reduce* layer and -* propagates them through the Reduce* if possible -* -*/ + * @ingroup ie_transformation_common_api + * @brief ReduceBaseTransformation: base class for Reduce*Transformation, + * detects dequantization operations in front of the Reduce* operation and + * propagates them through the Reduce* if possible. + */ class LP_TRANSFORMATIONS_API ReduceBaseTransformation : public LayerTransformation { public: diff --git a/src/common/low_precision_transformations/include/low_precision/reduce_max.hpp b/src/common/low_precision_transformations/include/low_precision/reduce_max.hpp index b9c2b98253e..a94d6937313 100644 --- a/src/common/low_precision_transformations/include/low_precision/reduce_max.hpp +++ b/src/common/low_precision_transformations/include/low_precision/reduce_max.hpp @@ -14,6 +14,14 @@ namespace ngraph { namespace pass { namespace low_precision { +/** + * @ingroup ie_transformation_common_api + * @brief ReduceMaxTransformation propagates dequantization operations through ReduceMax operation. + * + * For more details about the transformation, refer to + * [ReduceMaxTransformation](@ref openvino_docs_IE_DG_lpt_ReduceMaxTransformation) page + * in the Inference Engine Developer Guide. 
+ */ class LP_TRANSFORMATIONS_API ReduceMaxTransformation : public ReduceBaseTransformation { public: NGRAPH_RTTI_DECLARATION; diff --git a/src/common/low_precision_transformations/include/low_precision/reduce_mean.hpp b/src/common/low_precision_transformations/include/low_precision/reduce_mean.hpp index 31f542a3754..fd2e8cb1e69 100644 --- a/src/common/low_precision_transformations/include/low_precision/reduce_mean.hpp +++ b/src/common/low_precision_transformations/include/low_precision/reduce_mean.hpp @@ -14,6 +14,14 @@ namespace ngraph { namespace pass { namespace low_precision { +/** + * @ingroup ie_transformation_common_api + * @brief ReduceMeanTransformation propagates dequantization operations through ReduceMean operation. + * + * For more details about the transformation, refer to + * [ReduceMeanTransformation](@ref openvino_docs_IE_DG_lpt_ReduceMeanTransformation) page + * in the Inference Engine Developer Guide. + */ class LP_TRANSFORMATIONS_API ReduceMeanTransformation : public ReduceBaseTransformation { public: NGRAPH_RTTI_DECLARATION; diff --git a/src/common/low_precision_transformations/include/low_precision/reduce_min.hpp b/src/common/low_precision_transformations/include/low_precision/reduce_min.hpp index e4ccdeab97e..fa203fec71c 100644 --- a/src/common/low_precision_transformations/include/low_precision/reduce_min.hpp +++ b/src/common/low_precision_transformations/include/low_precision/reduce_min.hpp @@ -14,6 +14,14 @@ namespace ngraph { namespace pass { namespace low_precision { +/** + * @ingroup ie_transformation_common_api + * @brief ReduceMinTransformation propagates dequantization operations through ReduceMin operation. + * + * For more details about the transformation, refer to + * [ReduceMinTransformation](@ref openvino_docs_IE_DG_lpt_ReduceMinTransformation) page + * in the Inference Engine Developer Guide. 
+ */ class LP_TRANSFORMATIONS_API ReduceMinTransformation : public ReduceBaseTransformation { public: NGRAPH_RTTI_DECLARATION; diff --git a/src/common/low_precision_transformations/include/low_precision/reduce_sum.hpp b/src/common/low_precision_transformations/include/low_precision/reduce_sum.hpp index 82e8dd28883..ac37fa47ca6 100644 --- a/src/common/low_precision_transformations/include/low_precision/reduce_sum.hpp +++ b/src/common/low_precision_transformations/include/low_precision/reduce_sum.hpp @@ -14,6 +14,14 @@ namespace ngraph { namespace pass { namespace low_precision { +/** + * @ingroup ie_transformation_common_api + * @brief ReduceSumTransformation propagates dequantization operations through ReduceSum operation. + * + * For more details about the transformation, refer to + * [ReduceSumTransformation](@ref openvino_docs_IE_DG_lpt_ReduceSumTransformation) page + * in the Inference Engine Developer Guide. + */ class LP_TRANSFORMATIONS_API ReduceSumTransformation : public ReduceBaseTransformation { public: NGRAPH_RTTI_DECLARATION; diff --git a/src/common/low_precision_transformations/include/low_precision/relu.hpp b/src/common/low_precision_transformations/include/low_precision/relu.hpp index 1f7489a73d8..fdca5d5cafa 100644 --- a/src/common/low_precision_transformations/include/low_precision/relu.hpp +++ b/src/common/low_precision_transformations/include/low_precision/relu.hpp @@ -12,6 +12,14 @@ namespace ngraph { namespace pass { namespace low_precision { +/** + * @ingroup ie_transformation_common_api + * @brief ReluTransformation propagates dequantization operations through Relu operation. + * + * For more details about the transformation, refer to + * [ReluTransformation](@ref openvino_docs_IE_DG_lpt_ReluTransformation) page + * in the Inference Engine Developer Guide. 
+ */ class LP_TRANSFORMATIONS_API ReluTransformation : public LayerTransformation { public: NGRAPH_RTTI_DECLARATION; diff --git a/src/common/low_precision_transformations/include/low_precision/reshape.hpp b/src/common/low_precision_transformations/include/low_precision/reshape.hpp index cb1b3a28456..53b904d8709 100644 --- a/src/common/low_precision_transformations/include/low_precision/reshape.hpp +++ b/src/common/low_precision_transformations/include/low_precision/reshape.hpp @@ -11,6 +11,14 @@ namespace ngraph { namespace pass { namespace low_precision { +/** + * @ingroup ie_transformation_common_api + * @brief ReshapeTransformation propagates dequantization operations through Reshape operation. + * + * For more details about the transformation, refer to + * [ReshapeTransformation](@ref openvino_docs_IE_DG_lpt_ReshapeTransformation) page + * in the Inference Engine Developer Guide. + */ class LP_TRANSFORMATIONS_API ReshapeTransformation : public LayerTransformation { public: NGRAPH_RTTI_DECLARATION; diff --git a/src/common/low_precision_transformations/include/low_precision/rt_info/avg_pool_precision_preserved_attribute.hpp b/src/common/low_precision_transformations/include/low_precision/rt_info/avg_pool_precision_preserved_attribute.hpp index 9da87cba5ce..2e2a83cd261 100644 --- a/src/common/low_precision_transformations/include/low_precision/rt_info/avg_pool_precision_preserved_attribute.hpp +++ b/src/common/low_precision_transformations/include/low_precision/rt_info/avg_pool_precision_preserved_attribute.hpp @@ -13,6 +13,15 @@ #include "low_precision/rt_info/precision_preserved_attribute.hpp" namespace ngraph { + +/** + * @ingroup ie_transformation_common_api + * @brief AvgPoolPrecisionPreservedAttribute is utility attribute which is used only during `AvgPool` operation precision + * preserved property definition. 
+ * + * For more details about the attribute, refer to + * [AvgPoolPrecisionPreservedAttribute](@ref openvino_docs_IE_DG_lpt_AvgPoolPrecisionPreserved) page in the Inference Engine Developer Guide. + */ class LP_TRANSFORMATIONS_API AvgPoolPrecisionPreservedAttribute : public PrecisionPreservedAttribute { public: OPENVINO_RTTI("LowPrecision::AvgPoolPrecisionPreserved", "", ov::RuntimeAttribute, 0); diff --git a/src/common/low_precision_transformations/include/low_precision/rt_info/intervals_alignment_attribute.hpp b/src/common/low_precision_transformations/include/low_precision/rt_info/intervals_alignment_attribute.hpp index af8664110d3..f24593dfbb9 100644 --- a/src/common/low_precision_transformations/include/low_precision/rt_info/intervals_alignment_attribute.hpp +++ b/src/common/low_precision_transformations/include/low_precision/rt_info/intervals_alignment_attribute.hpp @@ -15,6 +15,10 @@ #include "low_precision/lpt_visibility.hpp" namespace ngraph { +/** + * @ingroup ie_transformation_common_api + * @brief IntervalsAlignmentSharedValue is used by IntervalsAlignmentAttribute as attribute shared value. + */ class LP_TRANSFORMATIONS_API IntervalsAlignmentSharedValue { public: class Interval { @@ -45,6 +49,14 @@ public: #endif }; +/** + * @ingroup ie_transformation_common_api + * @brief IntervalsAlignmentAttribute defines subgraph with the same quantization intervals alignment. + * FakeQuantize operations are included. The attribute is used by quantization operations. + * + * For more details about the attribute, refer to + * [IntervalsAlignmentAttribute](@ref openvino_docs_IE_DG_lpt_IntervalsAlignment) page in the Inference Engine Developer Guide. 
+ */ class LP_TRANSFORMATIONS_API IntervalsAlignmentAttribute : public SharedAttribute { public: OPENVINO_RTTI("LowPrecision::IntervalsAlignment", "", ov::RuntimeAttribute, 0); diff --git a/src/common/low_precision_transformations/include/low_precision/rt_info/per_tensor_quantization_attribute.hpp b/src/common/low_precision_transformations/include/low_precision/rt_info/per_tensor_quantization_attribute.hpp index 9a991dbf684..bef9a3157fd 100644 --- a/src/common/low_precision_transformations/include/low_precision/rt_info/per_tensor_quantization_attribute.hpp +++ b/src/common/low_precision_transformations/include/low_precision/rt_info/per_tensor_quantization_attribute.hpp @@ -14,6 +14,13 @@ #include "attribute_parameters.hpp" namespace ngraph { +/** + * @ingroup ie_transformation_common_api + * @brief PerTensorQuantizationAttribute defines if operation input port requires per-tensor quantization. + * + * For more details about the attribute, refer to + * [PerTensorQuantizationAttribute](@ref openvino_docs_IE_DG_lpt_PerTensorQuantization) page in the Inference Engine Developer Guide. + */ class LP_TRANSFORMATIONS_API PerTensorQuantizationAttribute : public ov::RuntimeAttribute { public: OPENVINO_RTTI("LowPrecision::PerTensorQuantization", "", ov::RuntimeAttribute, 0); diff --git a/src/common/low_precision_transformations/include/low_precision/rt_info/precision_preserved_attribute.hpp b/src/common/low_precision_transformations/include/low_precision/rt_info/precision_preserved_attribute.hpp index 3a82a18d979..3f2bbec9977 100644 --- a/src/common/low_precision_transformations/include/low_precision/rt_info/precision_preserved_attribute.hpp +++ b/src/common/low_precision_transformations/include/low_precision/rt_info/precision_preserved_attribute.hpp @@ -13,6 +13,14 @@ #include "low_precision/rt_info/shared_value_attribute.hpp" namespace ngraph { +/** + * @ingroup ie_transformation_common_api + * @brief PrecisionPreservedAttribute defines the precision preserved operation. 
If the attribute is absent, then an operation is + * not precision preserved. + * + * For more details about the attribute, refer to + * [PrecisionPreservedAttribute](@ref openvino_docs_IE_DG_lpt_PrecisionPreserved) page in the Inference Engine Developer Guide. + */ class LP_TRANSFORMATIONS_API PrecisionPreservedAttribute : public SharedAttribute { public: OPENVINO_RTTI("LowPrecision::PrecisionPreserved", "", ov::RuntimeAttribute, 0); diff --git a/src/common/low_precision_transformations/include/low_precision/rt_info/precisions_attribute.hpp b/src/common/low_precision_transformations/include/low_precision/rt_info/precisions_attribute.hpp index 4945c4e0640..264494427d6 100644 --- a/src/common/low_precision_transformations/include/low_precision/rt_info/precisions_attribute.hpp +++ b/src/common/low_precision_transformations/include/low_precision/rt_info/precisions_attribute.hpp @@ -18,7 +18,13 @@ #include "low_precision/rt_info/shared_value_attribute.hpp" namespace ngraph { - +/** + * @ingroup ie_transformation_common_api + * @brief PrecisionsAttribute defines precision which is required for input/output port or an operation. + * + * For more details about the attribute, refer to + * [PrecisionsAttribute](@ref openvino_docs_IE_DG_lpt_Precisions) page in the Inference Engine Developer Guide. 
+ */ class LP_TRANSFORMATIONS_API PrecisionsAttribute : public SharedAttribute> { public: OPENVINO_RTTI("LowPrecision::Precisions", "", ov::RuntimeAttribute, 0); diff --git a/src/common/low_precision_transformations/include/low_precision/rt_info/quantization_alignment_attribute.hpp b/src/common/low_precision_transformations/include/low_precision/rt_info/quantization_alignment_attribute.hpp index d8a09ccdc54..96f5b9ec02b 100644 --- a/src/common/low_precision_transformations/include/low_precision/rt_info/quantization_alignment_attribute.hpp +++ b/src/common/low_precision_transformations/include/low_precision/rt_info/quantization_alignment_attribute.hpp @@ -18,6 +18,14 @@ #include "attribute_parameters.hpp" namespace ngraph { +/** + * @ingroup ie_transformation_common_api + * @brief QuantizationAlignmentAttribute defines subgraph with the same quantization alignment. + * FakeQuantize operations are not included. The attribute is used by quantization operations. + * + * For more details about the attribute, refer to + * [QuantizationAlignmentAttribute](@ref openvino_docs_IE_DG_lpt_QuantizationAlignment) page in the Inference Engine Developer Guide. 
+ */ class LP_TRANSFORMATIONS_API QuantizationAlignmentAttribute : public SharedAttribute { public: OPENVINO_RTTI("LowPrecision::QuantizationAlignment", "", ov::RuntimeAttribute, 0); diff --git a/src/common/low_precision_transformations/include/low_precision/rt_info/shared_value_attribute.hpp b/src/common/low_precision_transformations/include/low_precision/rt_info/shared_value_attribute.hpp index 5b922c07b5c..5f829d132b7 100644 --- a/src/common/low_precision_transformations/include/low_precision/rt_info/shared_value_attribute.hpp +++ b/src/common/low_precision_transformations/include/low_precision/rt_info/shared_value_attribute.hpp @@ -18,6 +18,12 @@ template class LP_TRANSFORMATIONS_API SharedAttribute : public ov::RuntimeAttribute { public: virtual ~SharedAttribute() = default; + + /** + * @ingroup ie_transformation_common_api + * @brief SharedValueAttribute type for shared value attributes. + * The attribute is used for attribute SharedValue value backward propagation. + */ class LP_TRANSFORMATIONS_API SharedValueAttribute : public std::enable_shared_from_this { public: struct LP_TRANSFORMATIONS_API SharedValue : public std::enable_shared_from_this { diff --git a/src/common/low_precision_transformations/include/low_precision/shuffle_channels.hpp b/src/common/low_precision_transformations/include/low_precision/shuffle_channels.hpp index ab28d754598..f5dd05fc8bc 100644 --- a/src/common/low_precision_transformations/include/low_precision/shuffle_channels.hpp +++ b/src/common/low_precision_transformations/include/low_precision/shuffle_channels.hpp @@ -11,6 +11,14 @@ namespace ngraph { namespace pass { namespace low_precision { +/** + * @ingroup ie_transformation_common_api + * @brief ShuffleChannelsTransformation propagates dequantization operations through ShuffleChannels operation. 
+ * + * For more details about the transformation, refer to + * [ShuffleChannelsTransformation](@ref openvino_docs_IE_DG_lpt_ShuffleChannelsTransformation) page + * in the Inference Engine Developer Guide. + */ class LP_TRANSFORMATIONS_API ShuffleChannelsTransformation : public LayerTransformation { public: NGRAPH_RTTI_DECLARATION; diff --git a/src/common/low_precision_transformations/include/low_precision/split.hpp b/src/common/low_precision_transformations/include/low_precision/split.hpp index d4f2c72b8be..e85b5ed2dde 100644 --- a/src/common/low_precision_transformations/include/low_precision/split.hpp +++ b/src/common/low_precision_transformations/include/low_precision/split.hpp @@ -13,6 +13,14 @@ namespace ngraph { namespace pass { namespace low_precision { +/** + * @ingroup ie_transformation_common_api + * @brief SplitTransformation propagates dequantization operations through Split operation. + * + * For more details about the transformation, refer to + * [SplitTransformation](@ref openvino_docs_IE_DG_lpt_SplitTransformation) page + * in the Inference Engine Developer Guide. + */ class LP_TRANSFORMATIONS_API SplitTransformation : public LayerTransformation { public: NGRAPH_RTTI_DECLARATION; diff --git a/src/common/low_precision_transformations/include/low_precision/squeeze.hpp b/src/common/low_precision_transformations/include/low_precision/squeeze.hpp index fab050564c8..2bac4300c14 100644 --- a/src/common/low_precision_transformations/include/low_precision/squeeze.hpp +++ b/src/common/low_precision_transformations/include/low_precision/squeeze.hpp @@ -11,6 +11,14 @@ namespace ngraph { namespace pass { namespace low_precision { +/** + * @ingroup ie_transformation_common_api + * @brief SqueezeTransformation propagates dequantization operations through Squeeze operation. + * + * For more details about the transformation, refer to + * [SqueezeTransformation](@ref openvino_docs_IE_DG_lpt_SqueezeTransformation) page + * in the Inference Engine Developer Guide. 
+ */ class LP_TRANSFORMATIONS_API SqueezeTransformation : public LayerTransformation { public: NGRAPH_RTTI_DECLARATION; diff --git a/src/common/low_precision_transformations/include/low_precision/strided_slice.hpp b/src/common/low_precision_transformations/include/low_precision/strided_slice.hpp index 5a0520f54ae..cf7bc52f408 100644 --- a/src/common/low_precision_transformations/include/low_precision/strided_slice.hpp +++ b/src/common/low_precision_transformations/include/low_precision/strided_slice.hpp @@ -12,6 +12,14 @@ namespace ngraph { namespace pass { namespace low_precision { +/** + * @ingroup ie_transformation_common_api + * @brief StridedSliceTransformation propagates dequantization operations through StridedSlice operation. + * + * For more details about the transformation, refer to + * [StridedSliceTransformation](@ref openvino_docs_IE_DG_lpt_StridedSliceTransformation) page + * in the Inference Engine Developer Guide. + */ class LP_TRANSFORMATIONS_API StridedSliceTransformation : public LayerTransformation { public: NGRAPH_RTTI_DECLARATION; diff --git a/src/common/low_precision_transformations/include/low_precision/subtract.hpp b/src/common/low_precision_transformations/include/low_precision/subtract.hpp index 56c66d99450..4d15b62c6c2 100644 --- a/src/common/low_precision_transformations/include/low_precision/subtract.hpp +++ b/src/common/low_precision_transformations/include/low_precision/subtract.hpp @@ -11,6 +11,14 @@ namespace ngraph { namespace pass { namespace low_precision { +/** + * @ingroup ie_transformation_common_api + * @brief SubtractTransformation propagates dequantization operations through Subtract operation. + * + * For more details about the transformation, refer to + * [SubtractTransformation](@ref openvino_docs_IE_DG_lpt_SubtractTransformation) page + * in the Inference Engine Developer Guide. 
+ */ class LP_TRANSFORMATIONS_API SubtractTransformation : public LayerTransformation { public: NGRAPH_RTTI_DECLARATION; diff --git a/src/common/low_precision_transformations/include/low_precision/transformation_context.hpp b/src/common/low_precision_transformations/include/low_precision/transformation_context.hpp index 1aad5e55bd6..9a2473a71f6 100644 --- a/src/common/low_precision_transformations/include/low_precision/transformation_context.hpp +++ b/src/common/low_precision_transformations/include/low_precision/transformation_context.hpp @@ -13,6 +13,10 @@ namespace ngraph { namespace pass { namespace low_precision { +/** + * @ingroup ie_transformation_common_api + * @brief TransformationContext instance is used to pass model transformation context data between transformations. + */ class LP_TRANSFORMATIONS_API TransformationContext { public: TransformationContext(); diff --git a/src/common/low_precision_transformations/include/low_precision/transparent_base_transformation.hpp b/src/common/low_precision_transformations/include/low_precision/transparent_base_transformation.hpp index d1f87f92f86..b9a3454b4b7 100644 --- a/src/common/low_precision_transformations/include/low_precision/transparent_base_transformation.hpp +++ b/src/common/low_precision_transformations/include/low_precision/transparent_base_transformation.hpp @@ -12,6 +12,10 @@ namespace ngraph { namespace pass { namespace low_precision { +/** + * @ingroup ie_transformation_common_api + * @brief TransparentBaseTransformation is base type for precision preserved operation transformation. 
+ */ class LP_TRANSFORMATIONS_API TransparentBaseTransformation : public LayerTransformation { public: TransparentBaseTransformation(const Params& params) : LayerTransformation(params) {} diff --git a/src/common/low_precision_transformations/include/low_precision/transpose.hpp b/src/common/low_precision_transformations/include/low_precision/transpose.hpp index d22fcc8ed8c..f9eadc07578 100644 --- a/src/common/low_precision_transformations/include/low_precision/transpose.hpp +++ b/src/common/low_precision_transformations/include/low_precision/transpose.hpp @@ -12,6 +12,14 @@ namespace ngraph { namespace pass { namespace low_precision { +/** + * @ingroup ie_transformation_common_api + * @brief TransposeTransformation propagates dequantization operations through Transpose operation. + * + * For more details about the transformation, refer to + * [TransposeTransformation](@ref openvino_docs_IE_DG_lpt_TransposeTransformation) page + * in the Inference Engine Developer Guide. + */ class LP_TRANSFORMATIONS_API TransposeTransformation : public LayerTransformation { public: NGRAPH_RTTI_DECLARATION; diff --git a/src/common/low_precision_transformations/include/low_precision/unsqueeze.hpp b/src/common/low_precision_transformations/include/low_precision/unsqueeze.hpp index 580c09ad80b..92e4e2671e0 100644 --- a/src/common/low_precision_transformations/include/low_precision/unsqueeze.hpp +++ b/src/common/low_precision_transformations/include/low_precision/unsqueeze.hpp @@ -11,6 +11,14 @@ namespace ngraph { namespace pass { namespace low_precision { +/** + * @ingroup ie_transformation_common_api + * @brief UnsqueezeTransformation propagates dequantization operations through Unsqueeze operation. + * + * For more details about the transformation, refer to + * [UnsqueezeTransformation](@ref openvino_docs_IE_DG_lpt_UnsqueezeTransformation) page + * in the Inference Engine Developer Guide. 
+ */ class LP_TRANSFORMATIONS_API UnsqueezeTransformation : public LayerTransformation { public: NGRAPH_RTTI_DECLARATION; diff --git a/src/common/low_precision_transformations/include/low_precision/update_shared_precision_preserved.hpp b/src/common/low_precision_transformations/include/low_precision/update_shared_precision_preserved.hpp index 42745f8f793..e7899731d9a 100644 --- a/src/common/low_precision_transformations/include/low_precision/update_shared_precision_preserved.hpp +++ b/src/common/low_precision_transformations/include/low_precision/update_shared_precision_preserved.hpp @@ -25,6 +25,15 @@ class UpdateSharedPrecisionPreserved; } // namespace pass } // namespace ngraph +/** + * @ingroup ie_transformation_common_api + * @brief UpdateSharedPrecisionPreserved transformation updates shared AttributeType attribute instance value to true + * for precision preserved operations if ExpectedAttributeType exist. + * + * For more details about the transformation, refer to + * [UpdateSharedPrecisionPreserved](@ref openvino_docs_IE_DG_lpt_UpdateSharedPrecisionPreserved) page + * in the Inference Engine Developer Guide. 
+ */ template class ngraph::pass::low_precision::UpdateSharedPrecisionPreserved : public ngraph::pass::MatcherPass { public: @@ -76,7 +85,7 @@ public: return true; }; - auto matcher = std::make_shared(pattern::any_input(), "PropagateThroughPrecisionPreserved"); + auto matcher = std::make_shared(pattern::any_input(), "UpdateSharedPrecisionPreserved"); this->register_matcher(matcher, callback); } diff --git a/src/common/low_precision_transformations/include/low_precision/variadic_split.hpp b/src/common/low_precision_transformations/include/low_precision/variadic_split.hpp index 014b3775fe7..2b45d001023 100644 --- a/src/common/low_precision_transformations/include/low_precision/variadic_split.hpp +++ b/src/common/low_precision_transformations/include/low_precision/variadic_split.hpp @@ -13,6 +13,14 @@ namespace ngraph { namespace pass { namespace low_precision { +/** + * @ingroup ie_transformation_common_api + * @brief VariadicSplitTransformation propagates dequantization operations through VariadicSplit operation. + * + * For more details about the transformation, refer to + * [VariadicSplitTransformation](@ref openvino_docs_IE_DG_lpt_VariadicSplitTransformation) page + * in the Inference Engine Developer Guide. 
+ */ class LP_TRANSFORMATIONS_API VariadicSplitTransformation : public SplitTransformation { public: NGRAPH_RTTI_DECLARATION; diff --git a/src/common/low_precision_transformations/include/low_precision/weightable_layer_transformation.hpp b/src/common/low_precision_transformations/include/low_precision/weightable_layer_transformation.hpp index e045190aae4..bee3137907b 100644 --- a/src/common/low_precision_transformations/include/low_precision/weightable_layer_transformation.hpp +++ b/src/common/low_precision_transformations/include/low_precision/weightable_layer_transformation.hpp @@ -13,6 +13,10 @@ namespace ngraph { namespace pass { namespace low_precision { +/** + * @ingroup ie_transformation_common_api + * @brief WeightableLayerTransformation is base type for weightable operation transformation. + */ class LP_TRANSFORMATIONS_API WeightableLayerTransformation : public LayerTransformation{ public: WeightableLayerTransformation(const Params& params); diff --git a/src/common/transformations/include/transformations/common_optimizations/lin_op_sequence_fusion.hpp b/src/common/transformations/include/transformations/common_optimizations/lin_op_sequence_fusion.hpp index 9f7f1587d11..eaf9d1c846e 100644 --- a/src/common/transformations/include/transformations/common_optimizations/lin_op_sequence_fusion.hpp +++ b/src/common/transformations/include/transformations/common_optimizations/lin_op_sequence_fusion.hpp @@ -39,6 +39,10 @@ public: MultiplyMultiplyFusion(); }; +/** + * @ingroup ie_transformation_common_api + * @brief LinOpSequenceFusion transformation fuses linear operation sequence. 
+ */ class ngraph::pass::LinOpSequenceFusion: public ngraph::pass::GraphRewrite { public: NGRAPH_RTTI_DECLARATION; From ef390902ece9d0cd1bd70b25a7805c529386f9d2 Mon Sep 17 00:00:00 2001 From: Mingyu Kim Date: Tue, 11 Jan 2022 17:56:36 +0900 Subject: [PATCH 69/78] [GPU] fp16-int8 mixed precision (#9483) * Use fp16-int8 mixed precision, instead of fp32-int8 mixed precision for onednn * Allow quantization fusion into bsv32_fsv16 conv * For conv, do not select bsv16_fsv16. Select bsv32_fsv16 for mixed-layout * depthwise conv is supported even though it is not fp16 * Allow resample kernel to work as cross-layout * test case for cross-layout of resample_opt kernel * Select onednn-friendly format from cldnn conv * Optimization for fp16 mixed precision * Choose mixed layout in case of mixed precision from reorder_inputs * Support for mixed precision from depth_to_space * Do not convert first conv format * Use onednn for FC output of fp16 * Choose bsv8_fsv4 from quantization even when conv kernel size is not 7 * Select cldnn for first conv when input feature depth is 1 * For first conv, use onednn only when kernel size is 7x7 * Use short variable name and added is_i8_u8 helper function Co-authored-by: Kim,SungEun --- .../include/intel_gpu/runtime/layout.hpp | 4 + .../prepare_primitive_fusing.cpp | 4 + .../graph/graph_optimizer/reorder_inputs.cpp | 60 +++++++-- .../src/graph/impls/ocl/depth_to_space.cpp | 8 ++ .../intel_gpu/src/graph/layout_optimizer.cpp | 115 ++++++++++++------ src/plugins/intel_gpu/src/plugin/program.cpp | 2 +- .../src/plugin/transformations_pipeline.cpp | 2 +- .../tests/test_cases/resample_gpu_test.cpp | 8 +- 8 files changed, 150 insertions(+), 53 deletions(-) diff --git a/src/plugins/intel_gpu/include/intel_gpu/runtime/layout.hpp b/src/plugins/intel_gpu/include/intel_gpu/runtime/layout.hpp index 4b694ed0f6c..8bc0263a06e 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/runtime/layout.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/runtime/layout.hpp 
@@ -116,6 +116,10 @@ struct data_type_traits { return (static_cast(data_type) & float_type_mask) != 0; } + static bool is_i8_u8(data_types data_type) { + return data_type == data_types::i8 || data_type == data_types::u8; + } + static size_t align_of(data_types data_type) { switch (data_type) { case data_types::bin: diff --git a/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_primitive_fusing.cpp b/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_primitive_fusing.cpp index 6df79c97974..3fdb453bc09 100644 --- a/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_primitive_fusing.cpp +++ b/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_primitive_fusing.cpp @@ -844,13 +844,17 @@ void prepare_primitive_fusing::fuse_simple_primitives(program &p) { input_data.as().get_primitive()->dilation.spatial[0] == 1 && input_data.as().get_primitive()->dilation.spatial[1] == 1; + auto expected_format = _lo.get_preferred_format(input_data); + should_fuse |= input_data.is_type() && conv_supports_fusings(input_data.as()) && quantize_node.get_scale_shift_opt() && ((out_layout.data_type == data_types::f32 || out_layout.data_type == data_types::f16) || input_data.get_output_layout().format == format::b_fs_yx_fsv16 || + input_data.get_output_layout().format == format::bs_fs_yx_bsv32_fsv16 || (_lo.should_select_b_fs_yx_fsv16_layout(input_data.as(), input_data.get_dependency(1).get_output_layout()) && !is_grouped_conv(input_data.as())) || // Avoid fusing to b_fs_yx_fsv16 (and similar) kernels + expected_format == cldnn::format::bs_fs_yx_bsv32_fsv16 /* Allow quantization fusing for onednn */ || ((input_data.get_dependency(0).get_output_layout().data_type == data_types::u8 || input_data.get_dependency(0).get_output_layout().data_type == data_types::i8) && (out_layout.data_type == data_types::u8 || out_layout.data_type == data_types::i8))); diff --git a/src/plugins/intel_gpu/src/graph/graph_optimizer/reorder_inputs.cpp 
b/src/plugins/intel_gpu/src/graph/graph_optimizer/reorder_inputs.cpp index 71a31fa3246..436c6fb977e 100644 --- a/src/plugins/intel_gpu/src/graph/graph_optimizer/reorder_inputs.cpp +++ b/src/plugins/intel_gpu/src/graph/graph_optimizer/reorder_inputs.cpp @@ -562,16 +562,54 @@ void reorder_inputs::run(program& p, layout_optimizer& lo, reorder_factory& rf) } }; - const auto reorder_weights_convolution = [&p, &lo, &rf](typed_program_node& conv_node) { - auto& weights = conv_node.weights(); - auto weights_layout = weights.get_output_layout(); - if (!format::is_simple_data_format(weights_layout.format) && !weights.is_type() && !weights.is_constant()) { - auto dims = weights_layout.format.dimension(); - auto preferred_format = dims <= 4 ? format::bfyx : dims == 5 ? format::bfzyx : format::bfwzyx; - auto reorder = rf.get_reorder(weights.id(), weights_layout, - layout{ weights_layout.data_type, preferred_format, weights_layout.size }); - if (reorder.first) { - p.add_intermediate(reorder.first, conv_node, 1, !reorder.second); + const auto reorder_convolution = [&p, &lo, &rf](typed_program_node& conv_node) { + { + // reorder weights convolution + auto& weights = conv_node.weights(); + auto weights_layout = weights.get_output_layout(); + if (!format::is_simple_data_format(weights_layout.format) && !weights.is_type() && !weights.is_constant()) { + auto dims = weights_layout.format.dimension(); + auto preferred_format = dims <= 4 ? format::bfyx : dims == 5 ? 
format::bfzyx : format::bfwzyx; + auto reorder = rf.get_reorder(weights.id(), weights_layout, + layout{ weights_layout.data_type, preferred_format, weights_layout.size }); + if (reorder.first) { + p.add_intermediate(reorder.first, conv_node, 1, !reorder.second); + } + } + } + + std::vector wrong_format = {format::b_fs_yx_fsv16, format::bs_fs_yx_bsv32_fsv16}; + std::vector correct_format = {format::b_fs_yx_fsv32, format::bs_fs_yx_bsv32_fsv32}; + for (int i = 0; i < wrong_format.size(); i++) { + // reorder for onednn mixed-precision conv + // If the layouts are like below, change input layout to fsv32. + // From: + // (bsv32_fsv16.u8) --> conv --> (bsv32_fsv16.fp16) + // To: + // (bsv32_fsv16.u8) --> reorder --> (bsv32_fsv32.u8) --> conv --> (bsv32_fsv16.fp16) + // + // Do not apply such change for b=1 first conv + + auto prev_node = conv_node.get_dependencies().front(); + auto old_layout = prev_node->get_output_layout(); + auto conv_layout = conv_node.get_output_layout(); + if (lo.get_optimization_attributes().use_onednn_impls + && conv_layout.format == wrong_format[i] + && data_type_traits::is_i8_u8(old_layout.data_type) + && (old_layout.format == wrong_format[i]) + && !(old_layout.size.batch[0] == 1 && old_layout.size.feature[0] <= 4)) { + auto new_layout = old_layout; + new_layout.format = correct_format[i]; + auto new_input = rf.get_reorder(prev_node->id(), + old_layout, + new_layout); + + if (new_input.first) { + p.add_intermediate(new_input.first, conv_node, 0, !new_input.second); + } + + // Prevent layout propagation as we are using mixed precision for conv + conv_node.get_dependencies().front()->set_output_layout(new_layout, false); } } }; @@ -598,7 +636,7 @@ void reorder_inputs::run(program& p, layout_optimizer& lo, reorder_factory& rf) reorder_input_detection_output, reorder_input_binary_convolution, reorder_input_and_weights_deconvolution, - reorder_weights_convolution, + reorder_convolution, reorder_input_fully_connected); } diff --git 
a/src/plugins/intel_gpu/src/graph/impls/ocl/depth_to_space.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/depth_to_space.cpp index 153e0c77c9e..68dcf1a0683 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl/depth_to_space.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/ocl/depth_to_space.cpp @@ -67,6 +67,14 @@ attach_depth_to_space_impl::attach_depth_to_space_impl() { std::make_tuple(data_types::f16, format::b_fs_yx_fsv32), std::make_tuple(data_types::u8, format::b_fs_yx_fsv32), std::make_tuple(data_types::i8, format::b_fs_yx_fsv32), + std::make_tuple(data_types::f32, format::bs_fs_yx_bsv32_fsv16), + std::make_tuple(data_types::f16, format::bs_fs_yx_bsv32_fsv16), + std::make_tuple(data_types::u8, format::bs_fs_yx_bsv32_fsv16), + std::make_tuple(data_types::i8, format::bs_fs_yx_bsv32_fsv16), + std::make_tuple(data_types::f32, format::bs_fs_yx_bsv32_fsv32), + std::make_tuple(data_types::f16, format::bs_fs_yx_bsv32_fsv32), + std::make_tuple(data_types::u8, format::bs_fs_yx_bsv32_fsv32), + std::make_tuple(data_types::i8, format::bs_fs_yx_bsv32_fsv32), }); } diff --git a/src/plugins/intel_gpu/src/graph/layout_optimizer.cpp b/src/plugins/intel_gpu/src/graph/layout_optimizer.cpp index 10a85f71a6d..fd4cc866cea 100644 --- a/src/plugins/intel_gpu/src/graph/layout_optimizer.cpp +++ b/src/plugins/intel_gpu/src/graph/layout_optimizer.cpp @@ -9,6 +9,7 @@ #include "data_inst.h" #include "reorder_inst.h" +#include "resample_inst.h" #include "reshape_inst.h" #include "generic_layer.hpp" #include @@ -208,6 +209,10 @@ bool layout_optimizer::can_fuse_reorder(program_node& prev, program_node& next, } } + // Ref kernels are the main for depth_to_space and region_yolo. It can do anything. 
+ if (next.is_type() || next.is_type()) + return true; + if (next.is_type()) { // Avoid fusing current reorder to fuse next reorder if (next.get_users().size() == 1 && next.get_users().front()->is_type() && use_onednn_impls) { @@ -221,6 +226,14 @@ bool layout_optimizer::can_fuse_reorder(program_node& prev, program_node& next, return true; } + // resample_opt kernel can work cross-layout between fsv16 and fsv32 + if (next.is_type() && + (fmt_prev == format::b_fs_yx_fsv16 || fmt_prev == format::b_fs_yx_fsv32 + || fmt_prev == format::bs_fs_yx_bsv32_fsv16 || fmt_prev == format::bs_fs_yx_bsv32_fsv32) && + (fmt_next == format::b_fs_yx_fsv16 || fmt_next == format::b_fs_yx_fsv32 + || fmt_next == format::bs_fs_yx_bsv32_fsv16 || fmt_next == format::bs_fs_yx_bsv32_fsv32)) + return true; + if (next.is_type() && (((prev_simple && next_simple) && (prev_dt == next_dt)) || ((fmt_prev == format::b_fs_yx_fsv4 && fmt_next == format::bfyx) && (prev_dt == data_types::u8 || prev_dt == data_types::i8)))) @@ -319,21 +332,29 @@ bool layout_optimizer::can_fuse_reorder(program_node& prev, program_node& next, // Remove Reorder to support mixed format convolutions of bsv32fsv16 or bsv32fsv32 output if (next.is_type() && (prev.is_type() || prev.is_type()) && - (fmt_prev == format::bfyx || fmt_prev == format::bs_fs_yx_bsv4_fsv2) && + (fmt_prev == format::bfyx || fmt_prev == format::bs_fs_yx_bsv4_fsv2 || fmt_prev == format::bs_fs_yx_bsv8_fsv4) && ((fmt_next == format::bs_fs_yx_bsv32_fsv32 && (prev_output_layout.size.feature[0] == 3 || prev_output_layout.size.feature[0] == 4)) || (fmt_next == format::bs_fs_yx_bsv32_fsv16 && (prev_output_layout.size.feature[0] == 3 || prev_output_layout.size.feature[0] == 4)))) return true; // Remove Reorder for Convolution: b_fs_yx_fsv32 (i8/u8) -> b_fs_yx_fsv16 (fp32/fp16) - if (next.is_type() && fmt_prev == format::b_fs_yx_fsv32 && fmt_next == format::b_fs_yx_fsv16 && - !data_type_traits::is_floating_point(prev_dt) && 
data_type_traits::is_floating_point(next_dt)) { - auto& node = prev.get_users().front(); - // Avoid to fuse padding reorder to previous onednn convolution - if (prev.get_preferred_impl_type() == impl_types::onednn && - (node->get_output_layout().data_padding != prev.get_output_layout().data_padding)) - return false; - else - return true; + // b_fs_yx_fsv16 (fp32/fp16) -> b_fs_yx_fsv32 (i8/u8) + if (next.is_type()) { + const bool fsv32_to_fsv16 = (((fmt_prev == format::b_fs_yx_fsv32 && fmt_next == format::b_fs_yx_fsv16) || + (fmt_prev == format::bs_fs_yx_bsv32_fsv32 && fmt_next == format::bs_fs_yx_bsv32_fsv16)) && + data_type_traits::is_i8_u8(prev_dt) && data_type_traits::is_floating_point(next_dt)); + const bool fsv16_to_fsv32 = (((fmt_prev == format::b_fs_yx_fsv16 && fmt_next == format::b_fs_yx_fsv32) || + (fmt_prev == format::bs_fs_yx_bsv32_fsv16 && fmt_next == format::bs_fs_yx_bsv32_fsv32)) && + data_type_traits::is_floating_point(prev_dt) && data_type_traits::is_i8_u8(next_dt)); + if (fsv32_to_fsv16 || fsv16_to_fsv32) { + auto& node = prev.get_users().front(); + // Avoid to fuse padding reorder to previous onednn convolution + if (prev.get_preferred_impl_type() == impl_types::onednn && + (node->get_output_layout().data_padding != prev.get_output_layout().data_padding)) + return false; + else + return true; + } } if (next.is_type()) @@ -367,10 +388,12 @@ bool layout_optimizer::can_fuse_reorder(program_node& prev, program_node& next, } bool layout_optimizer::can_fuse_reorder_to_prev(program_node& prev, program_node* next, format fmt_prev, format fmt_next) { - if (next == nullptr) { - // Ref kernels are the main for depth_to_space and region_yolo. It can do anything - return prev.is_type() || prev.is_type(); - } + // Ref kernels are the main for depth_to_space and region_yolo. It can do anything. Should not see next. 
+ if (prev.is_type() || prev.is_type()) + return true; + + if (next == nullptr) + return false; auto dt_prev = prev.get_output_layout().data_type; auto dt_next = next->get_output_layout().data_type; @@ -379,6 +402,14 @@ bool layout_optimizer::can_fuse_reorder_to_prev(program_node& prev, program_node if (prev.is_type()) return true; + // resample_opt kernel can work cross-layout between fsv16 and fsv32 + if (prev.is_type() && + (fmt_prev == format::b_fs_yx_fsv16 || fmt_prev == format::b_fs_yx_fsv32 + || fmt_prev == format::bs_fs_yx_bsv32_fsv16 || fmt_prev == format::bs_fs_yx_bsv32_fsv32) && + (fmt_next == format::b_fs_yx_fsv16 || fmt_next == format::b_fs_yx_fsv32 + || fmt_next == format::bs_fs_yx_bsv32_fsv16 || fmt_next == format::bs_fs_yx_bsv32_fsv32)) + return true; + if (prev.is_type() && fmt_next == format::b_fs_yx_fsv16) return true; @@ -900,7 +931,7 @@ layout layout_optimizer::get_expected_layout(layout const& current_layout, } } - if (use_onednn_impls) { + if (use_onednn_impls && onednn_valid_post_ops) { std::function has_any_convolutions_below; has_any_convolutions_below = [&](const program_node& node) -> bool { for (auto& usr : node.get_users()) { @@ -913,11 +944,10 @@ layout layout_optimizer::get_expected_layout(layout const& current_layout, /* ***************************** OneDNN impls format selection part ****************************** */ bool valid_grouped = !is_dw && prim->groups > 1 && (ofm_per_group % compute_block == 0 && ifm_per_group % compute_block == 0); - // TODO: uncomment this code when corresponding fsv32 optimizations inside clDNN will be implemented - // bool i8_u8_output = output_layout.data_type == data_types::u8 || output_layout.data_type == data_types::i8; + bool i8_u8_output = data_type_traits::is_i8_u8(output_layout.data_type); // bool is_first_conv = input_layout.size.feature[0] < 4; - if (i8_u8_input) { + if (i8_u8_output) { if ((non_grouped || valid_grouped || valid_int8_dw) && onednn_valid_post_ops && is_2d) { if 
(input_layout.size.batch[0] >= 16) { expected_format = cldnn::format::bs_fs_yx_bsv32_fsv32; @@ -937,23 +967,19 @@ layout layout_optimizer::get_expected_layout(layout const& current_layout, expected_format = imad_case(node); } expected_tensor = current_layout.size; - } else if (input_layout.data_type == data_types::f16 && is_2d) { + } else if ((output_layout.data_type == data_types::f16 || output_layout.data_type == data_types::f32) && is_2d) { expected_tensor = current_layout.size; if (input_layout.size.batch[0] >= 16 && onednn_valid_post_ops) { - if (output_layout.data_type == input_layout.data_type) { - if (non_grouped || valid_grouped || is_dw) { - expected_format = cldnn::format::bs_fs_yx_bsv32_fsv16; - } else { - expected_format = cldnn::format::b_fs_yx_fsv16; - } + if (non_grouped || valid_grouped || is_dw) { + expected_format = cldnn::format::bs_fs_yx_bsv32_fsv16; } else { - expected_format = cldnn::format::bs_fs_yx_bsv16_fsv16; + expected_format = cldnn::format::b_fs_yx_fsv16; } } else { expected_format = cldnn::format::b_fs_yx_fsv16; } - } else if (input_layout.data_type == data_types::f16 && + } else if (output_layout.data_type == data_types::f16 && convolution_bs_fs_yx_bsv16_fsv16_opt(input_layout, output_layout, weights_layout, prim) && (output_layout.data_type == input_layout.data_type || !data_type_traits::is_floating_point(input_layout.data_type)) && is_2d) { @@ -973,7 +999,11 @@ layout layout_optimizer::get_expected_layout(layout const& current_layout, //} } else { /* *************************** Native impls format selection part ************************** */ - if (i8_u8_input) { + if (use_onednn_impls && i8_u8_input) { + // It is here because of post operation condition for onednn. + // Use fsv32 for onednn friendliness. 
+ expected_format = cldnn::format::b_fs_yx_fsv32; + } else if (i8_u8_input) { if ((_optimization_attributes.b_fs_yx_fsv16_network && convolution_b_fs_yx_fsv16_opt(input_layout, output_layout, weights_layout, prim))) { expected_format = cldnn::format::b_fs_yx_fsv16; @@ -1155,10 +1185,11 @@ bool layout_optimizer::are_data_types_suitable_for_onednn(program_node& node) { auto wei_dt = is_conv ? node.as().weights().get_output_layout().data_type : node.as().weights().get_output_layout().data_type; - if ((in_dt == data_types::f16 && wei_dt == data_types::f16) && (out_dt == data_types::f16 || out_dt == data_types::f32 || out_dt == data_types::i8)) + if ((in_dt == data_types::f16 && wei_dt == data_types::f16) && + (out_dt == data_types::f16 || out_dt == data_types::f32 || out_dt == data_types::i8 || out_dt == data_types::u8)) return true; if ((in_dt == data_types::i8 || in_dt == data_types::u8) && wei_dt == data_types::i8 && - (out_dt == data_types::f32 || out_dt == data_types::i32 || out_dt == data_types::i8 || out_dt == data_types::u8)) + (out_dt == data_types::f32 || out_dt == data_types::i32 || out_dt == data_types::f16 || out_dt == data_types::i8 || out_dt == data_types::u8)) return true; } else if (node.is_type()) { auto& fc_node = node.as(); @@ -1170,7 +1201,7 @@ bool layout_optimizer::are_data_types_suitable_for_onednn(program_node& node) { if (in_dt == data_types::f32 && wei_dt == data_types::f32) return true; if ((in_dt == data_types::i8 || in_dt == data_types::u8) && (wei_dt == data_types::i8) && - (out_dt == data_types::i8 || out_dt == data_types::u8 || out_dt == data_types::i32 || out_dt == data_types::f32)) + (out_dt == data_types::i8 || out_dt == data_types::u8 || out_dt == data_types::i32 || out_dt == data_types::f16 || out_dt == data_types::f32)) return true; } @@ -1259,6 +1290,8 @@ impl_types layout_optimizer::get_preferred_impl_type(program_node& node, format auto input_fmt = input_layout.format; auto output_fmt = output_layout.format; + auto input_dt = 
input_layout.data_type; + auto output_dt = output_layout.data_type; preferred_impl = impl_types::onednn; @@ -1273,14 +1306,22 @@ impl_types layout_optimizer::get_preferred_impl_type(program_node& node, format } // Native impl works faster for this type of reorder - if (input_layout.format == format::bfyx && output_layout.format == format::bfyx) { + if (input_fmt == format::bfyx && output_fmt == format::bfyx) { preferred_impl = impl_types::ocl; } // onednn reorder doesn't support different number of dimensions in input and output layouts - if (input_layout.format.dimension() != output_layout.format.dimension()) { + if (input_fmt.dimension() != output_fmt.dimension()) { preferred_impl = impl_types::ocl; } + + // For mixed precision case, onednn is slower than cldnn + if (input_fmt == format::b_fs_yx_fsv16 && data_type_traits::is_i8_u8(input_dt)) + preferred_impl = impl_types::ocl; + if (output_fmt == format::b_fs_yx_fsv16 && data_type_traits::is_i8_u8(output_dt)) + preferred_impl = impl_types::ocl; + if (output_fmt == format::bfyx && output_dt == data_types::f32) + preferred_impl = impl_types::ocl; } else if (node.is_type() || node.is_type() || node.is_type()) { if (!_optimization_attributes.use_onednn_impls) return impl_types::ocl; @@ -1330,13 +1371,11 @@ impl_types layout_optimizer::get_preferred_impl_type(program_node& node, format auto& conv = node.as(); auto input_layout = conv.input().get_output_layout(); auto output_layout = conv.get_output_layout(); - bool fp16_input = input_layout.data_type == data_types::f16; bool has_groups = conv.get_primitive()->groups > 1; bool is_depthwise = conv.get_primitive()->groups == input_layout.size.feature[0]; bool first_conv = input_layout.size.feature[0] <= 4; - bool enable_onednn_dw_fp16_conv = fp16_input && is_depthwise; - if (((has_groups && !enable_onednn_dw_fp16_conv) || first_conv) && - (output_layout.format == format::b_fs_yx_fsv16 || output_layout.format == format::bs_fs_yx_bsv32_fsv16) && + if (((has_groups && 
!is_depthwise) || first_conv) && + (output_layout.format == format::b_fs_yx_fsv16) && !needs_onednn_bfyx_to_blocked(format::bfyx, output_layout.format, input_layout, conv)) impl_candidate = impl_types::ocl; if (conv.get_output_layout().format == format::b_fs_yx_fsv32 && first_conv) @@ -1522,7 +1561,7 @@ format layout_optimizer::get_preferred_format(program_node& node) { if (node.get_users().size() == 1 && node.get_users().front()->is_type()) { auto& conv = node.get_users().front()->as(); auto ws = conv.get_dependency(1).get_output_layout().size; - if (data_type_traits::is_floating_point(conv.get_output_layout().data_type) || ws.spatial[0] != 7 || conv.get_primitive()->groups > 1) + if (ws.spatial[0] != 7 || conv.get_primitive()->groups > 1 || layout.size.feature[0] == 1) expected = format::bfyx; else expected = format::bs_fs_yx_bsv8_fsv4; diff --git a/src/plugins/intel_gpu/src/plugin/program.cpp b/src/plugins/intel_gpu/src/plugin/program.cpp index cc3fb1a6e10..3f04dbe019c 100644 --- a/src/plugins/intel_gpu/src/plugin/program.cpp +++ b/src/plugins/intel_gpu/src/plugin/program.cpp @@ -203,7 +203,7 @@ std::shared_ptr Program::BuildProgram(const std::vector func) { // Conversion to FP32 might be needed for quantized models that face any fp16 related issues (e.g. 
overflow) for non-quantized layers // With this key users can work-around such issues - if (!config.enable_fp16_for_quantized_models || use_onednn) { + if (!config.enable_fp16_for_quantized_models) { ngraph::pass::Manager manager; manager.register_pass(precisions_array {{ ngraph::element::f16, ngraph::element::f32 }}); manager.run_passes(func); diff --git a/src/plugins/intel_gpu/tests/test_cases/resample_gpu_test.cpp b/src/plugins/intel_gpu/tests/test_cases/resample_gpu_test.cpp index 3230d165404..f6a7efa28b8 100644 --- a/src/plugins/intel_gpu/tests/test_cases/resample_gpu_test.cpp +++ b/src/plugins/intel_gpu/tests/test_cases/resample_gpu_test.cpp @@ -2168,10 +2168,13 @@ struct resample_opt_random_test : testing::TestWithParam Date: Tue, 11 Jan 2022 11:33:36 +0200 Subject: [PATCH 70/78] Atomic guard impromenents (#9473) * Read atomic before doing CAS in the constructor. * Typo * Code style fix. Co-authored-by: Roman Slivinskyi --- src/core/src/atomic_guard.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/core/src/atomic_guard.hpp b/src/core/src/atomic_guard.hpp index 9220302e9c7..b7dbb478384 100644 --- a/src/core/src/atomic_guard.hpp +++ b/src/core/src/atomic_guard.hpp @@ -12,7 +12,7 @@ class AtomicGuard { public: AtomicGuard(std::atomic_bool& b) : m_atomic(b) { bool exp = false; - while (!m_atomic.compare_exchange_strong(exp, true)) { + while (m_atomic.load(std::memory_order_relaxed) || !m_atomic.compare_exchange_strong(exp, true)) { exp = false; } } From 062523c9e1383e2f455b4a7785dfb1f6378d56ff Mon Sep 17 00:00:00 2001 From: Vladimir Paramuzov Date: Tue, 11 Jan 2022 12:52:12 +0300 Subject: [PATCH 71/78] [GPU] Split fusion tests into separate files (#9553) --- .../tests/fusions/activation_fusion_test.cpp | 297 + .../fusions/batch_to_space_fusion_test.cpp | 158 + .../binary_convolution_fusion_test.cpp | 267 + .../tests/fusions/concatenate_fusion_test.cpp | 148 + .../tests/fusions/convolution_fusion_test.cpp | 3408 +++++ 
.../fusions/deconvolution_fusion_test.cpp | 819 ++ .../fusions/depth_to_space_fusion_test.cpp | 162 + .../tests/fusions/eltwise_fusion_test.cpp | 534 + .../fusions/fully_connected_fusion_test.cpp | 393 + .../tests/fusions/fusion_test_common.hpp | 230 + .../fusions/gather_elements_fusion_test.cpp | 243 + .../tests/fusions/gather_fusion_test.cpp | 193 + .../tests/fusions/gather_nd_fusion_test.cpp | 231 + .../tests/fusions/gemm_fusion_test.cpp | 370 + .../tests/fusions/lrn_fusion_test.cpp | 292 + .../tests/fusions/mvn_fusion_test.cpp | 291 + .../tests/fusions/normalize_fusion_test.cpp | 120 + .../tests/fusions/permute_fusion_test.cpp | 590 + .../tests/fusions/pooling_fusion_test.cpp | 571 + .../tests/fusions/reduce_fusion_test.cpp | 287 + .../tests/fusions/resample_fusion_test.cpp | 310 + .../scatter_elements_update_fusion_test.cpp | 174 + .../fusions/scatter_nd_update_fusion_test.cpp | 301 + .../fusions/scatter_update_fusion_test.cpp | 242 + .../fusions/space_to_batch_fusion_test.cpp | 157 + .../fusions/space_to_depth_fusion_test.cpp | 162 + .../tests/test_cases/fusings_gpu_test.cpp | 10336 ---------------- 27 files changed, 10950 insertions(+), 10336 deletions(-) create mode 100644 src/plugins/intel_gpu/tests/fusions/activation_fusion_test.cpp create mode 100644 src/plugins/intel_gpu/tests/fusions/batch_to_space_fusion_test.cpp create mode 100644 src/plugins/intel_gpu/tests/fusions/binary_convolution_fusion_test.cpp create mode 100644 src/plugins/intel_gpu/tests/fusions/concatenate_fusion_test.cpp create mode 100644 src/plugins/intel_gpu/tests/fusions/convolution_fusion_test.cpp create mode 100644 src/plugins/intel_gpu/tests/fusions/deconvolution_fusion_test.cpp create mode 100644 src/plugins/intel_gpu/tests/fusions/depth_to_space_fusion_test.cpp create mode 100644 src/plugins/intel_gpu/tests/fusions/eltwise_fusion_test.cpp create mode 100644 src/plugins/intel_gpu/tests/fusions/fully_connected_fusion_test.cpp create mode 100644 
src/plugins/intel_gpu/tests/fusions/fusion_test_common.hpp create mode 100644 src/plugins/intel_gpu/tests/fusions/gather_elements_fusion_test.cpp create mode 100644 src/plugins/intel_gpu/tests/fusions/gather_fusion_test.cpp create mode 100644 src/plugins/intel_gpu/tests/fusions/gather_nd_fusion_test.cpp create mode 100644 src/plugins/intel_gpu/tests/fusions/gemm_fusion_test.cpp create mode 100644 src/plugins/intel_gpu/tests/fusions/lrn_fusion_test.cpp create mode 100644 src/plugins/intel_gpu/tests/fusions/mvn_fusion_test.cpp create mode 100644 src/plugins/intel_gpu/tests/fusions/normalize_fusion_test.cpp create mode 100644 src/plugins/intel_gpu/tests/fusions/permute_fusion_test.cpp create mode 100644 src/plugins/intel_gpu/tests/fusions/pooling_fusion_test.cpp create mode 100644 src/plugins/intel_gpu/tests/fusions/reduce_fusion_test.cpp create mode 100644 src/plugins/intel_gpu/tests/fusions/resample_fusion_test.cpp create mode 100644 src/plugins/intel_gpu/tests/fusions/scatter_elements_update_fusion_test.cpp create mode 100644 src/plugins/intel_gpu/tests/fusions/scatter_nd_update_fusion_test.cpp create mode 100644 src/plugins/intel_gpu/tests/fusions/scatter_update_fusion_test.cpp create mode 100644 src/plugins/intel_gpu/tests/fusions/space_to_batch_fusion_test.cpp create mode 100644 src/plugins/intel_gpu/tests/fusions/space_to_depth_fusion_test.cpp delete mode 100644 src/plugins/intel_gpu/tests/test_cases/fusings_gpu_test.cpp diff --git a/src/plugins/intel_gpu/tests/fusions/activation_fusion_test.cpp b/src/plugins/intel_gpu/tests/fusions/activation_fusion_test.cpp new file mode 100644 index 00000000000..fbff486db55 --- /dev/null +++ b/src/plugins/intel_gpu/tests/fusions/activation_fusion_test.cpp @@ -0,0 +1,297 @@ +// Copyright (C) 2018-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "test_utils.h" +#include "fusion_test_common.hpp" + +#include +#include +#include +#include + +#include + +using namespace cldnn; +using namespace 
::tests; + +namespace { +struct activation_test_params { + tensor input_size; + data_types input_type; + format input_format; + data_types default_type; + format default_format; + size_t expected_fused_primitives; + size_t expected_not_fused_primitives; + std::string kernel_name; +}; + +class ActivationFusingTest : public ::BaseFusingTest { +public: + void execute(activation_test_params& p) { + auto input_prim = get_mem(get_input_layout(p)); + + build_options options; + implementation_desc activation_impl = { p.input_format, p.kernel_name }; + options.set_option(build_option::optimize_data(true)); + options.set_option(build_option::force_implementations({ { "act", activation_impl } })); + network network_fused(this->engine, this->topology_fused, options); + network network_not_fused(this->engine, this->topology_non_fused, bo_not_fused); + + network_fused.set_input_data("input", input_prim); + network_not_fused.set_input_data("input", input_prim); + + compare(network_not_fused, network_fused, p); + } + + layout get_input_layout(activation_test_params& p) { + return layout{ p.input_type, p.input_format, p.input_size }; + } + + layout get_per_channel_layout(activation_test_params& p) { + return layout{ p.default_type, p.default_format, tensor{ 1, p.input_size.feature[0], 1, 1 } }; + } + + format get_input_format(activation_test_params &p) { return p.input_format; } +}; + +} // namespace + +/* ----------------------------------------------------------------------------------------------------- */ +/* -------------------------------- Activation cases --------------------------------------------------- */ +/* ----------------------------------------------------------------------------------------------------- */ + +#define CASE_ACTIVATION_F32_0 { 7, 32, 3, 3 }, data_types::f32, format::bfyx, data_types::f32, format::bfyx +#define CASE_ACTIVATION_F32_1 { 1, 16, 8, 8 }, data_types::f32, format::bfyx, data_types::f32, format::bfyx +#define CASE_ACTIVATION_F32_2 { 7, 3, 7, 7 
}, data_types::f32, format::b_fs_yx_fsv16, data_types::f32, format::bfyx +#define CASE_ACTIVATION_F32_3 { 1, 14, 8, 8 }, data_types::f32, format::b_fs_yx_fsv16, data_types::f32, format::bfyx +#define CASE_ACTIVATION_F32_4 { 1, 17, 31, 29 }, data_types::f32, format::yxfb, data_types::f32, format::bfyx +#define CASE_ACTIVATION_F32_5 { 1, 17, 31, 29 }, data_types::f32, format::b_fs_yx_fsv4, data_types::f32, format::bfyx +#define CASE_ACTIVATION_F32_6 { 1, 17, 31, 29 }, data_types::f32, format::b_fs_yx_fsv32, data_types::f32, format::bfyx +#define CASE_ACTIVATION_F32_7 { 1, 17, 31, 29 }, data_types::f32, format::fyxb, data_types::f32, format::bfyx +#define CASE_ACTIVATION_3D_F32_0 { 3, 16, 13, 13, 13 }, data_types::f32, format::bfzyx, data_types::f32, format::bfzyx +#define CASE_ACTIVATION_3D_F32_1 { 2, 16, 8, 8, 8 }, data_types::f32, format::bfzyx, data_types::f32, format::bfzyx +#define CASE_ACTIVATION_3D_F32_2 { 1, 16, 7, 7, 7 }, data_types::f32, format::b_fs_zyx_fsv16, data_types::f32, format::bfzyx +#define CASE_ACTIVATION_3D_F32_3 { 1, 17, 7, 7, 7 }, data_types::f32, format::b_fs_zyx_fsv32, data_types::f32, format::bfzyx +#define CASE_ACTIVATION_3D_F32_4 { 1, 17, 7, 7, 7 }, data_types::f32, format::bs_fs_yx_bsv16_fsv16, data_types::f32, format::bfzyx +#define CASE_ACTIVATION_3D_F32_5 { 1, 17, 7, 7, 7 }, data_types::f32, format::fs_b_yx_fsv32, data_types::f32, format::bfzyx + +#define CASE_ACTIVATION_F16_0 { 7, 32, 5, 5 }, data_types::f16, format::bfyx, data_types::f32, format::bfyx +#define CASE_ACTIVATION_F16_1 { 1, 16, 8, 8 }, data_types::f16, format::bfyx, data_types::f32, format::bfyx +#define CASE_ACTIVATION_F16_2 { 7, 16, 7, 7 }, data_types::f16, format::b_fs_yx_fsv16, data_types::f32, format::bfyx +#define CASE_ACTIVATION_F16_3 { 1, 14, 8, 8 }, data_types::f16, format::b_fs_yx_fsv16, data_types::f32, format::bfyx +#define CASE_ACTIVATION_F16_4 { 1, 17, 31, 29 }, data_types::f16, format::yxfb, data_types::f32, format::bfyx +#define CASE_ACTIVATION_F16_5 { 
1, 17, 31, 29 }, data_types::f16, format::b_fs_yx_fsv4, data_types::f32, format::bfyx +#define CASE_ACTIVATION_F16_6 { 1, 17, 31, 29 }, data_types::f16, format::b_fs_yx_fsv32, data_types::f32, format::bfyx +#define CASE_ACTIVATION_F16_7 { 1, 17, 31, 29 }, data_types::f16, format::fyxb, data_types::f32, format::bfyx +#define CASE_ACTIVATION_3D_F16_0 { 3, 16, 13, 13, 13 }, data_types::f16, format::bfzyx, data_types::f32, format::bfzyx +#define CASE_ACTIVATION_3D_F16_1 { 2, 16, 8, 8, 8 }, data_types::f16, format::bfzyx, data_types::f32, format::bfzyx +#define CASE_ACTIVATION_3D_F16_2 { 1, 16, 7, 7, 7 }, data_types::f16, format::b_fs_zyx_fsv16, data_types::f32, format::bfzyx +#define CASE_ACTIVATION_3D_F16_3 { 1, 17, 7, 7, 7 }, data_types::f16, format::b_fs_zyx_fsv32, data_types::f32, format::bfzyx +#define CASE_ACTIVATION_3D_F16_4 { 1, 17, 7, 7, 7 }, data_types::f16, format::bs_fs_yx_bsv16_fsv16, data_types::f32, format::bfzyx +#define CASE_ACTIVATION_3D_F16_5 { 1, 17, 7, 7, 7 }, data_types::f16, format::fs_b_yx_fsv32, data_types::f32, format::bfzyx + +#define CASE_ACTIVATION_U8_1 { 1, 16, 8, 8 }, data_types::u8, format::bfyx, data_types::f32, format::bfyx +#define CASE_ACTIVATION_U8_2 { 1, 12, 8, 8 }, data_types::u8, format::b_fs_yx_fsv16, data_types::f32, format::bfyx +#define CASE_ACTIVATION_I8_1 { 1, 16, 8, 8 }, data_types::i8, format::bfyx, data_types::f32, format::bfyx +#define CASE_ACTIVATION_I8_2 { 1, 14, 8, 8 }, data_types::i8, format::b_fs_yx_fsv16, data_types::f32, format::bfyx +#define CASE_ACTIVATION_3D_I8_1 { 1, 17, 8, 8, 8 }, data_types::i8, format::bfzyx, data_types::f32, format::bfzyx + +class activation_quantize_i8 : public ActivationFusingTest {}; +TEST_P(activation_quantize_i8, basic) { + auto p = GetParam(); + create_topologies( + input_layout("input", get_input_layout(p)), + activation("act", "input", activation_func::relu), + data("in_low", get_mem(get_single_element_layout(p), min_random, 0)), + data("in_high", 
get_mem(get_single_element_layout(p), 1, max_random)), + data("out_low", get_mem(get_single_element_layout(p), -127, 0)), + data("out_high", get_mem(get_single_element_layout(p), 0, 127)), + quantize("quant", "act", "in_low", "in_high", "out_low", "out_high", 255, data_types::i8), + reorder("reorder_bfyx", "quant", p.default_format, data_types::f32) + ); + + tolerance = 1.0f; + execute(p); +} + +TEST_P(activation_quantize_i8, per_channel) { + auto p = GetParam(); + create_topologies( + input_layout("input", get_input_layout(p)), + activation("act", "input", activation_func::relu), + data("in_low", get_mem(get_per_channel_layout(p), min_random, 0)), + data("in_high", get_mem(get_per_channel_layout(p), 1, max_random)), + data("out_low", get_mem(get_single_element_layout(p), -127, 0)), + data("out_high", get_mem(get_single_element_layout(p), 0, 127)), + quantize("quant", "act", "in_low", "in_high", "out_low", "out_high", 255, data_types::i8), + reorder("reorder_bfyx", "quant", p.default_format, data_types::f32) + ); + + tolerance = 1.0f; + execute(p); +} + +INSTANTIATE_TEST_SUITE_P(fusings_gpu, activation_quantize_i8, ::testing::ValuesIn(std::vector{ + // InputDataType = FP32 + activation_test_params{ CASE_ACTIVATION_F32_0, 2, 3, "activation_opt" }, + activation_test_params{ CASE_ACTIVATION_F32_1, 2, 3, "activation_opt" }, + activation_test_params{ CASE_ACTIVATION_3D_F32_0, 2, 3, "activation_opt" }, + activation_test_params{ CASE_ACTIVATION_3D_F32_1, 2, 3, "activation_opt" }, + + activation_test_params{ CASE_ACTIVATION_F32_0, 2, 3, "activation_ref" }, + activation_test_params{ CASE_ACTIVATION_F32_1, 2, 3, "activation_ref" }, + activation_test_params{ CASE_ACTIVATION_F32_2, 2, 3, "activation_ref" }, + activation_test_params{ CASE_ACTIVATION_F32_3, 2, 3, "activation_ref" }, + activation_test_params{ CASE_ACTIVATION_F32_4, 2, 3, "activation_ref" }, + activation_test_params{ CASE_ACTIVATION_3D_F32_0, 2, 3, "activation_ref" }, + activation_test_params{ 
CASE_ACTIVATION_3D_F32_1, 2, 3, "activation_ref" }, + activation_test_params{ CASE_ACTIVATION_3D_F32_2, 2, 3, "activation_ref" }, +})); + +INSTANTIATE_TEST_SUITE_P(DISABLED_fusings_gpu, activation_quantize_i8, ::testing::ValuesIn(std::vector{ + activation_test_params{ CASE_ACTIVATION_F32_5, 2, 3, "activation_ref" }, // FIXME - accuracy bug + activation_test_params{ CASE_ACTIVATION_F32_6, 2, 3, "activation_ref" }, // FIXME - accuracy bug + activation_test_params{ CASE_ACTIVATION_F32_7, 2, 3, "activation_ref" }, // FIXME - accuracy bug + activation_test_params{ CASE_ACTIVATION_3D_F32_3, 2, 3, "activation_ref" }, // FIXME - accuracy bug + activation_test_params{ CASE_ACTIVATION_3D_F32_5, 2, 3, "activation_ref" }, // FIXME - accuracy bug +})); + +class activation_scale_activation_quantize_u8 : public ActivationFusingTest {}; +TEST_P(activation_scale_activation_quantize_u8, basic) { + auto p = GetParam(); + create_topologies( + input_layout("input", get_input_layout(p)), + activation("act", "input", activation_func::relu), + data("scale_data", get_mem(get_single_element_layout(p), 1.0f / 255)), + data("in_low", get_mem(get_single_element_layout(p), 0)), + data("in_high", get_mem(get_single_element_layout(p), 1, max_random)), + data("out_low", get_mem(get_single_element_layout(p), -127)), + data("out_high", get_mem(get_single_element_layout(p), 127)), + scale("scale", "act", "scale_data"), + activation("act2", "scale", activation_func::softsign), + quantize("quant", "act2", "in_low", "in_high", "out_low", "out_high", 256, data_types::u8), + reorder("reorder_bfyx", "quant", p.default_format, data_types::f32) + ); + + tolerance = 1.f; + execute(p); +} + +TEST_P(activation_scale_activation_quantize_u8, per_channel) { + auto p = GetParam(); + create_topologies( + input_layout("input", get_input_layout(p)), + activation("act", "input", activation_func::relu), + data("scale_data", get_mem(get_single_element_layout(p), 1.0f / 255)), + data("in_low", 
get_mem(get_per_channel_layout(p), 0)), + data("in_high", get_mem(get_per_channel_layout(p), 1, max_random)), + data("out_low", get_mem(get_single_element_layout(p), -127)), + data("out_high", get_mem(get_single_element_layout(p), 127)), + scale("scale", "act", "scale_data"), + activation("act2", "scale", activation_func::softsign), + quantize("quant", "act2", "in_low", "in_high", "out_low", "out_high", 256, data_types::u8), + reorder("reorder_bfyx", "quant", p.default_format, data_types::f32) + ); + + tolerance = 1.f; + execute(p); +} + +INSTANTIATE_TEST_SUITE_P(fusings_gpu, activation_scale_activation_quantize_u8, ::testing::ValuesIn(std::vector{ + // InputDataType = FP32 + activation_test_params{ CASE_ACTIVATION_F32_0, 2, 5, "activation_opt" }, + activation_test_params{ CASE_ACTIVATION_F32_1, 2, 5, "activation_opt" }, + activation_test_params{ CASE_ACTIVATION_3D_F32_0, 2, 5, "activation_opt" }, + activation_test_params{ CASE_ACTIVATION_3D_F32_1, 2, 5, "activation_opt" }, + + activation_test_params{ CASE_ACTIVATION_F32_0, 2, 5, "activation_ref" }, + activation_test_params{ CASE_ACTIVATION_F32_1, 2, 5, "activation_ref" }, + activation_test_params{ CASE_ACTIVATION_F32_2, 2, 5, "activation_ref" }, + activation_test_params{ CASE_ACTIVATION_F32_3, 2, 5, "activation_ref" }, + activation_test_params{ CASE_ACTIVATION_F32_4, 2, 5, "activation_ref" }, + activation_test_params{ CASE_ACTIVATION_F32_5, 2, 5, "activation_ref" }, + activation_test_params{ CASE_ACTIVATION_F32_6, 2, 5, "activation_ref" }, + activation_test_params{ CASE_ACTIVATION_F32_7, 2, 5, "activation_ref" }, + activation_test_params{ CASE_ACTIVATION_3D_F32_0, 2, 5, "activation_ref" }, + activation_test_params{ CASE_ACTIVATION_3D_F32_1, 2, 5, "activation_ref" }, + activation_test_params{ CASE_ACTIVATION_3D_F32_2, 2, 5, "activation_ref" }, +})); + +INSTANTIATE_TEST_SUITE_P(DISABLED_fusings_gpu, activation_scale_activation_quantize_u8, ::testing::ValuesIn(std::vector{ + activation_test_params{ 
CASE_ACTIVATION_3D_F32_5, 2, 5, "activation_ref" }, // FIXME - accuracy bug +})); + +class activation_scale_activation : public ActivationFusingTest {}; +TEST_P(activation_scale_activation, basic) { + auto p = GetParam(); + create_topologies( + input_layout("input", get_input_layout(p)), + activation("act", "input", activation_func::relu), + data("scale_data", get_mem(get_single_element_layout(p), 1.0f / 255)), + scale("scale", "act", "scale_data"), + activation("act2", "scale", activation_func::exp), + reorder("reorder_bfyx", "act2", p.default_format, data_types::f32) + ); + + tolerance = 1e-05f; + execute(p); +} + +INSTANTIATE_TEST_SUITE_P(fusings_gpu, activation_scale_activation, ::testing::ValuesIn(std::vector{ + // InputDataType = FP32 + activation_test_params{ CASE_ACTIVATION_F32_0, 2, 4, "activation_opt" }, + activation_test_params{ CASE_ACTIVATION_F32_1, 2, 4, "activation_opt" }, + activation_test_params{ CASE_ACTIVATION_3D_F32_0, 2, 4, "activation_opt" }, + activation_test_params{ CASE_ACTIVATION_3D_F32_1, 2, 4, "activation_opt" }, + + activation_test_params{ CASE_ACTIVATION_F32_0, 2, 4, "activation_ref" }, + activation_test_params{ CASE_ACTIVATION_F32_1, 2, 4, "activation_ref" }, + activation_test_params{ CASE_ACTIVATION_F32_2, 2, 4, "activation_ref" }, + activation_test_params{ CASE_ACTIVATION_F32_3, 2, 4, "activation_ref" }, + activation_test_params{ CASE_ACTIVATION_F32_4, 2, 4, "activation_ref" }, + activation_test_params{ CASE_ACTIVATION_F32_5, 2, 4, "activation_ref" }, + activation_test_params{ CASE_ACTIVATION_F32_6, 2, 4, "activation_ref" }, + activation_test_params{ CASE_ACTIVATION_F32_7, 2, 4, "activation_ref" }, + activation_test_params{ CASE_ACTIVATION_3D_F32_0, 2, 4, "activation_ref" }, + activation_test_params{ CASE_ACTIVATION_3D_F32_1, 2, 4, "activation_ref" }, + activation_test_params{ CASE_ACTIVATION_3D_F32_2, 2, 4, "activation_ref" }, + + // InputDataType = FP16 + activation_test_params{ CASE_ACTIVATION_F16_0, 2, 4, "activation_opt" }, + 
activation_test_params{ CASE_ACTIVATION_F16_1, 2, 4, "activation_opt" }, + activation_test_params{ CASE_ACTIVATION_3D_F16_0, 2, 4, "activation_opt" }, + activation_test_params{ CASE_ACTIVATION_3D_F16_1, 2, 4, "activation_opt" }, + + activation_test_params{ CASE_ACTIVATION_F16_0, 2, 4, "activation_ref" }, + activation_test_params{ CASE_ACTIVATION_F16_1, 2, 4, "activation_ref" }, + activation_test_params{ CASE_ACTIVATION_F16_2, 2, 4, "activation_ref" }, + activation_test_params{ CASE_ACTIVATION_F16_3, 2, 4, "activation_ref" }, + activation_test_params{ CASE_ACTIVATION_F16_4, 2, 4, "activation_ref" }, + activation_test_params{ CASE_ACTIVATION_F16_5, 2, 4, "activation_ref" }, + activation_test_params{ CASE_ACTIVATION_F16_6, 2, 4, "activation_ref" }, + activation_test_params{ CASE_ACTIVATION_F16_7, 2, 4, "activation_ref" }, + activation_test_params{ CASE_ACTIVATION_3D_F16_0, 2, 4, "activation_ref" }, + activation_test_params{ CASE_ACTIVATION_3D_F16_1, 2, 4, "activation_ref" }, + activation_test_params{ CASE_ACTIVATION_3D_F16_2, 2, 4, "activation_ref" }, + activation_test_params{ CASE_ACTIVATION_3D_F16_3, 2, 4, "activation_ref" }, + activation_test_params{ CASE_ACTIVATION_3D_F16_4, 2, 4, "activation_ref" }, + + // InputDataType = UINT8 + activation_test_params{ CASE_ACTIVATION_U8_1, 2, 4, "activation_ref" }, + activation_test_params{ CASE_ACTIVATION_U8_2, 2, 4, "activation_ref" }, + + // InputDataType = INT8 + activation_test_params{ CASE_ACTIVATION_I8_1, 2, 4, "activation_opt" }, + activation_test_params{ CASE_ACTIVATION_3D_I8_1, 2, 4, "activation_opt" }, + + activation_test_params{ CASE_ACTIVATION_I8_1, 2, 4, "activation_ref" }, + activation_test_params{ CASE_ACTIVATION_I8_2, 2, 4, "activation_ref" }, + activation_test_params{ CASE_ACTIVATION_3D_I8_1, 2, 4, "activation_ref" } +})); + +INSTANTIATE_TEST_SUITE_P(DISABLED_fusings_gpu, activation_scale_activation, ::testing::ValuesIn(std::vector{ + activation_test_params{ CASE_ACTIVATION_3D_F32_4, 2, 4, "activation_ref" }, 
// FIXME - accuracy bug + activation_test_params{ CASE_ACTIVATION_3D_F32_5, 2, 4, "activation_ref" }, // FIXME - accuracy bug +})); diff --git a/src/plugins/intel_gpu/tests/fusions/batch_to_space_fusion_test.cpp b/src/plugins/intel_gpu/tests/fusions/batch_to_space_fusion_test.cpp new file mode 100644 index 00000000000..c36135c70ab --- /dev/null +++ b/src/plugins/intel_gpu/tests/fusions/batch_to_space_fusion_test.cpp @@ -0,0 +1,158 @@ +// Copyright (C) 2018-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "test_utils.h" +#include "fusion_test_common.hpp" + +#include +#include +#include +#include +#include + +#include + +using namespace cldnn; +using namespace ::tests; + +namespace { +struct batch_to_space_test_params { + tensor input_size; + tensor output_size; + data_types input_type; + format input_format; + tensor block_shape; + tensor crops_begin; + tensor crops_end; + data_types default_type; + format default_format; + size_t expected_fused_primitives; + size_t expected_not_fused_primitives; +}; + +class BatchToSpaceFusingsTest : public ::BaseFusingTest { +public: + void execute(batch_to_space_test_params& p) { + auto input_prim = get_mem(get_input_layout(p)); + + network network_not_fused(this->engine, this->topology_non_fused, bo_not_fused); + network network_fused(this->engine, this->topology_fused, bo_fused); + + network_fused.set_input_data("input", input_prim); + network_not_fused.set_input_data("input", input_prim); + + compare(network_not_fused, network_fused, p); + } + + layout get_input_layout(batch_to_space_test_params& p) { + return layout{ p.input_type, p.input_format, p.input_size }; + } + + layout get_per_channel_layout(batch_to_space_test_params& p) { + return layout{ p.default_type, p.default_format, tensor{ 1, p.output_size.feature[0], 1, 1 } }; + } +}; + +} // namespace + +/* ----------------------------------------------------------------------------------------------------- */ +/* 
---------------------------------------- BatchToSpace cases ----------------------------------------- */ +/* ----------------------------------------------------------------------------------------------------- */ + +#define CASE_BATCH_TO_SPACE_F32_1 { 8, 1, 1, 1 }, { 2, 1, 2, 2 }, data_types::f32, format::bfyx, { 1, 1, 2, 2 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, data_types::f32, format::bfyx +#define CASE_BATCH_TO_SPACE_F32_2 { 64, 16, 2, 2 }, { 2, 112, 4, 4 }, data_types::f32, format::b_fs_yx_fsv16, { 1, 8, 2, 2 }, { 0, 8, 0, 0 }, { 0, 8, 0, 0 }, data_types::f32, format::bfyx +#define CASE_BATCH_TO_SPACE_F16_1 { 16, 4, 1, 2 }, { 2, 12, 1, 2 }, data_types::f16, format::bfyx, { 1, 4, 2, 1 }, { 0, 2, 1, 0 }, { 0, 2, 0, 0 }, data_types::f32, format::bfyx +#define CASE_BATCH_TO_SPACE_F16_2 { 32, 16, 2, 1 }, { 1, 16, 32, 2 }, data_types::f16, format::b_fs_yx_fsv16, { 1, 1, 16, 2 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, data_types::f32, format::bfyx +#define CASE_BATCH_TO_SPACE_U8_1 { 30, 12, 4, 6 }, { 1, 52, 8, 9 }, data_types::u8, format::bfyx, { 1, 5, 2, 3 }, { 0, 8, 0, 9 }, { 0, 0, 0, 0 }, data_types::f32, format::bfyx +#define CASE_BATCH_TO_SPACE_U8_2 { 24, 32, 4, 5 }, { 2, 64, 12, 8 }, data_types::u8, format::b_fs_yx_fsv16, { 1, 2, 3, 2 }, { 0, 0, 0, 2 }, { 0, 0, 0, 0 }, data_types::f32, format::bfyx +#define CASE_BATCH_TO_SPACE_I8_1 { 32, 1, 3, 4 }, { 1, 8, 6, 8 }, data_types::i8, format::bfyx, { 1, 8, 2, 2 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, data_types::f32, format::bfyx +#define CASE_BATCH_TO_SPACE_I8_2 { 16, 16, 2, 1 }, { 2, 32, 4, 2 }, data_types::i8, format::b_fs_yx_fsv16, { 1, 2, 2, 2 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, data_types::f32, format::bfyx + +class batch_to_space_quantize_i8 : public BatchToSpaceFusingsTest {}; +TEST_P(batch_to_space_quantize_i8, basic) { + auto p = GetParam(); + create_topologies( + input_layout("input", get_input_layout(p)), + batch_to_space("batch_to_space", "input", p.block_shape, p.crops_begin, p.crops_end, p.output_size), + 
data("in_low", get_mem(get_per_channel_layout(p), min_random, 0)), + data("in_high", get_mem(get_per_channel_layout(p), 1, max_random)), + data("out_low", get_mem(get_single_element_layout(p), -128)), + data("out_high", get_mem(get_single_element_layout(p), 127)), + quantize("quant", "batch_to_space", "in_low", "in_high", "out_low", "out_high", 256, data_types::i8), + reorder("reorder_bfyx", "quant", p.default_format, data_types::f32) + ); + + tolerance = 1.f; + execute(p); +} + +INSTANTIATE_TEST_SUITE_P(fusings_gpu, batch_to_space_quantize_i8, ::testing::ValuesIn(std::vector{ + batch_to_space_test_params{ CASE_BATCH_TO_SPACE_F32_1, 2, 3 }, + batch_to_space_test_params{ CASE_BATCH_TO_SPACE_F32_2, 2, 3 }, + batch_to_space_test_params{ CASE_BATCH_TO_SPACE_F16_1, 2, 3 }, + batch_to_space_test_params{ CASE_BATCH_TO_SPACE_F16_2, 2, 3 }, +})); + +class batch_to_space_scale_act_eltwise_quantize_u8 : public BatchToSpaceFusingsTest {}; +TEST_P(batch_to_space_scale_act_eltwise_quantize_u8, basic) { + auto p = GetParam(); + create_topologies( + input_layout("input", get_input_layout(p)), + batch_to_space("batch_to_space", "input", p.block_shape, p.crops_begin, p.crops_end, p.output_size), + data("scale1_data", get_mem(get_per_channel_layout(p), -0.125f)), + scale("scale1", "batch_to_space", "scale1_data"), + activation("actv1", "scale1", activation_func::relu), + data("eltw_data", get_mem(layout(p.default_type, p.input_format, p.output_size))), + eltwise("eltw", { "actv1", "eltw_data" }, eltwise_mode::sum, p.default_type), + data("in_low", get_mem(get_per_channel_layout(p), min_random, 0)), + data("in_high", get_mem(get_per_channel_layout(p), 1, max_random)), + data("out_low", get_mem(get_single_element_layout(p), 0)), + data("out_high", get_mem(get_single_element_layout(p), 255)), + quantize("quant", "eltw", "in_low", "in_high", "out_low", "out_high", 256, data_types::u8), + reorder("reorder_bfyx", "quant", p.default_format, data_types::f32) + ); + + tolerance = 1.f; + 
execute(p); +} + +INSTANTIATE_TEST_SUITE_P(fusings_gpu, batch_to_space_scale_act_eltwise_quantize_u8, ::testing::ValuesIn(std::vector{ + batch_to_space_test_params{ CASE_BATCH_TO_SPACE_F32_1, 2, 6 }, + batch_to_space_test_params{ CASE_BATCH_TO_SPACE_F32_2, 2, 6 }, + batch_to_space_test_params{ CASE_BATCH_TO_SPACE_F16_1, 2, 6 }, + batch_to_space_test_params{ CASE_BATCH_TO_SPACE_F16_2, 2, 6 }, + batch_to_space_test_params{ CASE_BATCH_TO_SPACE_U8_1, 2, 6 }, + batch_to_space_test_params{ CASE_BATCH_TO_SPACE_U8_2, 2, 6 }, + batch_to_space_test_params{ CASE_BATCH_TO_SPACE_I8_1, 2, 6 }, + batch_to_space_test_params{ CASE_BATCH_TO_SPACE_I8_2, 2, 6 }, +})); + +class batch_to_space_scale_act_eltw : public BatchToSpaceFusingsTest {}; +TEST_P(batch_to_space_scale_act_eltw, basic) { + auto p = GetParam(); + create_topologies( + input_layout("input", get_input_layout(p)), + batch_to_space("batch_to_space", "input", p.block_shape, p.crops_begin, p.crops_end, p.output_size), + data("scale1_data", get_mem(get_per_channel_layout(p), -0.125f)), + scale("scale1", "batch_to_space", "scale1_data"), + activation("actv1", "scale1", activation_func::relu), + data("eltw_data", get_mem(layout(p.default_type, p.input_format, p.output_size))), + eltwise("eltw", { "actv1", "eltw_data" }, eltwise_mode::sum, p.default_type), + reorder("reorder_bfyx", "eltw", p.default_format, data_types::f32) + ); + + tolerance = 1e-5f; + execute(p); +} + +INSTANTIATE_TEST_SUITE_P(fusings_gpu, batch_to_space_scale_act_eltw, ::testing::ValuesIn(std::vector{ + batch_to_space_test_params{ CASE_BATCH_TO_SPACE_F32_1, 2, 5 }, + batch_to_space_test_params{ CASE_BATCH_TO_SPACE_F32_2, 2, 5 }, + batch_to_space_test_params{ CASE_BATCH_TO_SPACE_F16_1, 2, 5 }, + batch_to_space_test_params{ CASE_BATCH_TO_SPACE_F16_2, 2, 5 }, + batch_to_space_test_params{ CASE_BATCH_TO_SPACE_U8_1, 2, 5 }, + batch_to_space_test_params{ CASE_BATCH_TO_SPACE_U8_2, 2, 5 }, + batch_to_space_test_params{ CASE_BATCH_TO_SPACE_I8_1, 2, 5 }, + 
batch_to_space_test_params{ CASE_BATCH_TO_SPACE_I8_2, 2, 5 }, +})); diff --git a/src/plugins/intel_gpu/tests/fusions/binary_convolution_fusion_test.cpp b/src/plugins/intel_gpu/tests/fusions/binary_convolution_fusion_test.cpp new file mode 100644 index 00000000000..d159503282b --- /dev/null +++ b/src/plugins/intel_gpu/tests/fusions/binary_convolution_fusion_test.cpp @@ -0,0 +1,267 @@ +// Copyright (C) 2018-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "fusion_test_common.hpp" + +#include +#include +#include +#include +#include +#include + +#include + +using namespace cldnn; +using namespace ::tests; + +namespace { + +struct binary_convolution_test_params { + tensor in_shape; + tensor out_shape; + tensor kernel; + tensor stride; + tensor pad; + tensor dilation; + uint32_t groups; + data_types data_type; + format input_format; + data_types weights_type; + format weights_format; + data_types default_type; + format default_format; + size_t expected_fused_primitives; + size_t expected_not_fused_primitives; +}; + +class BinaryConvolutionFusingTest : public BaseFusingTest { +public: + void execute(binary_convolution_test_params& p) { + auto input_prim = get_mem(get_input_layout(p)); + network network_not_fused(this->engine, this->topology_non_fused, bo_not_fused); + network network_fused(this->engine, this->topology_fused, bo_fused); + network_fused.set_input_data("input", input_prim); + network_not_fused.set_input_data("input", input_prim); + + compare(network_not_fused, network_fused, p); + auto find_conv = [](primitive_info& p) -> bool { + if (p.original_id == "conv_prim") + return true; + return false; + }; + + auto pi_fused = network_fused.get_primitives_info(); + auto info_fused = std::find_if(pi_fused.begin(), pi_fused.end(), find_conv); + if (info_fused != pi_fused.end()) + std::cout << "kernel: " << info_fused->kernel_id << std::endl; + } + + layout get_input_layout(binary_convolution_test_params& p) { + auto pad = p.pad; + 
std::vector pad_ = { 0, 0, pad.spatial[0], pad.spatial[1] }; + return layout{ p.data_type, p.input_format, p.in_shape, padding{ pad_ } }; + } + + layout get_per_channel_layout(binary_convolution_test_params& p) { + return layout{ p.default_type, p.default_format, tensor{1, p.out_shape.feature[0], 1, 1} }; + } +}; + +} // namespace + +#define CASE_BIN_CONV1 { 1, 16, 4, 5 }, { 1, 16, 4, 5 }, { 1, 1, 3, 3 }, tensor{ 1 }, tensor{ { 0, 0, 1, 1, 0, 0 }, 0 }, tensor{ 1 }, 1, data_types::bin, format::b_fs_yx_32fp, data_types::bin, format::os_is_yx_osv32_isv32p, data_types::f32, format::bfyx +#define CASE_BIN_CONV2 { 1, 16, 4, 5 }, { 1, 30, 4, 5 }, { 1, 1, 1, 1 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::bin, format::b_fs_yx_32fp, data_types::bin, format::os_is_yx_osv32_isv32p, data_types::f32, format::bfyx +#define CASE_BIN_CONV3 { 1, 184, 12, 21 }, { 1, 224, 12, 21 }, { 1, 1, 1, 1 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::bin, format::b_fs_yx_32fp, data_types::bin, format::os_is_yx_osv32_isv32p, data_types::f32, format::bfyx + +/* ----------------------------------------------------------------------------------------------------- */ +/* -------------------------------------- binary convolution cases ------------------------------------- */ +/* ----------------------------------------------------------------------------------------------------- */ + +class conv_bin_activation : public BinaryConvolutionFusingTest {}; +TEST_P(conv_bin_activation, basic) { + auto p = GetParam(); + create_topologies( + input_layout("input", get_input_layout(p)), + data("weights", get_mem(get_weights_layout(p), -127, 127)), + binary_convolution("bin_conv_prim", "input", { "weights" }, p.stride, p.pad, p.dilation, p.out_shape, p.groups), + activation("activation", "bin_conv_prim", activation_func::relu), + reorder("reorder_bfyx", "activation", p.default_format, data_types::f32) + ); + + tolerance = 1e-5f; + execute(p); +} + +INSTANTIATE_TEST_SUITE_P(fusings_gpu, 
conv_bin_activation, ::testing::ValuesIn(std::vector{ + binary_convolution_test_params{ CASE_BIN_CONV1, 2, 3 }, +})); + +class conv_bin_scale_activation : public BinaryConvolutionFusingTest {}; +TEST_P(conv_bin_scale_activation, basic) { + auto p = GetParam(); + create_topologies( + input_layout("input", get_input_layout(p)), + data("weights", get_mem(get_weights_layout(p), -127, 127)), + data("scale_data", get_mem(get_per_channel_layout(p), 1.0f/p.kernel.count())), + binary_convolution("bin_conv_prim", "input", { "weights" }, p.stride, p.pad, p.dilation, p.out_shape, p.groups), + scale("scale", "bin_conv_prim", "scale_data"), + activation("activation", "scale", activation_func::relu), + reorder("reorder_bfyx", "activation", p.default_format, data_types::f32) + ); + + tolerance = 1e-5f; + execute(p); +} + +INSTANTIATE_TEST_SUITE_P(fusings_gpu, conv_bin_scale_activation, ::testing::ValuesIn(std::vector{ + binary_convolution_test_params{ CASE_BIN_CONV1, 2, 4 }, + binary_convolution_test_params{ CASE_BIN_CONV2, 2, 4 }, +})); + +class conv_bin_quantize_bin : public BinaryConvolutionFusingTest {}; +TEST_P(conv_bin_quantize_bin, channel_wise_quantize) { + auto p = GetParam(); + auto in_thresh = get_mem(get_per_channel_layout(p), min_random, max_random); + create_topologies( + input_layout("input", get_input_layout(p)), + data("weights", get_mem(get_weights_layout(p), -127, 127)), + data("in_lo", in_thresh), + data("in_hi", in_thresh), + data("out_lo", get_mem(get_per_channel_layout(p), -1)), + data("out_hi", get_mem(get_per_channel_layout(p), 1)), + binary_convolution("bin_conv_prim", "input", { "weights" }, p.stride, p.pad, p.dilation, p.out_shape, p.groups), + quantize("quantize_data", "bin_conv_prim", "in_lo", "in_hi", "out_lo", "out_hi", 2, data_types::bin), + reorder("reorder_bfyx", "quantize_data", p.default_format, data_types::f32) + ); + + tolerance = 1e-5f; + execute(p); +} + +TEST_P(conv_bin_quantize_bin, blob_wise_quantize) { + auto p = GetParam(); + auto 
in_thresh = get_mem(get_single_element_layout(p), min_random, max_random); + create_topologies( + input_layout("input", get_input_layout(p)), + data("weights", get_mem(get_weights_layout(p), -127, 127)), + data("in_lo", in_thresh), + data("in_hi", in_thresh), + data("out_lo", get_mem(get_single_element_layout(p), -1)), + data("out_hi", get_mem(get_single_element_layout(p), 1)), + binary_convolution("bin_conv_prim", "input", { "weights" }, p.stride, p.pad, p.dilation, p.out_shape, p.groups), + quantize("quantize_data", "bin_conv_prim", "in_lo", "in_hi", "out_lo", "out_hi", 2, data_types::bin), + reorder("reorder_bfyx", "quantize_data", p.default_format, data_types::f32) + ); + + tolerance = 1e-5f; + execute(p); +} + +INSTANTIATE_TEST_SUITE_P(fusings_gpu, conv_bin_quantize_bin, ::testing::ValuesIn(std::vector{ + binary_convolution_test_params{ CASE_BIN_CONV1, 2, 3 }, + binary_convolution_test_params{ CASE_BIN_CONV2, 2, 3 }, +})); + +class conv_bin_scale_conv_dw : public BinaryConvolutionFusingTest {}; +TEST_P(conv_bin_scale_conv_dw, dw_kernel_3x3_stride2) { + auto p = GetParam(); + auto dw_tensor = cldnn::tensor(group(p.out_shape.feature[0]), batch(1), feature(1), spatial(3, 3)); + auto dw_weights_layout = layout{ p.default_type, format::goiyx, dw_tensor }; + + auto dw_stride = tensor{ 1, 1, 2, 2 }; + create_topologies( + input_layout("input", get_input_layout(p)), + data("weights", get_mem(get_weights_layout(p), -127, 127)), + data("weights_dw", get_mem(dw_weights_layout, -127, 127)), + data("scale_data", get_mem(get_per_channel_layout(p), 1e-1f)), + binary_convolution("bin_conv_prim", "input", { "weights" }, p.stride, p.pad, p.dilation, p.out_shape, p.groups), + scale("scale", "bin_conv_prim", "scale_data"), + convolution("conv_dw", "scale", { "weights_dw" }, p.out_shape.feature[0], dw_stride, p.pad, p.dilation), + reorder("reorder_bfyx", "conv_dw", p.default_format, data_types::f32) + ); + + tolerance = 1e-5f; + execute(p); +} + +TEST_P(conv_bin_scale_conv_dw, 
dw_kernel_3x3_stride1) { + auto p = GetParam(); + auto dw_tensor = cldnn::tensor(group(p.out_shape.feature[0]), batch(1), feature(1), spatial(3, 3)); + auto dw_weights_layout = layout{ p.default_type, format::goiyx, dw_tensor }; + + auto dw_stride = tensor{ 1, 1, 1, 1 }; + create_topologies( + input_layout("input", get_input_layout(p)), + data("weights", get_mem(get_weights_layout(p), -127, 127)), + data("weights_dw", get_mem(dw_weights_layout, -127, 127)), + data("scale_data", get_mem(get_per_channel_layout(p), 1e-1f)), + binary_convolution("bin_conv_prim", "input", { "weights" }, p.stride, p.pad, p.dilation, p.out_shape, p.groups), + scale("scale", "bin_conv_prim", "scale_data"), + convolution("conv_dw", "scale", { "weights_dw" }, p.out_shape.feature[0], dw_stride, p.pad, p.dilation), + reorder("reorder_bfyx", "conv_dw", p.default_format, data_types::f32) + ); + + tolerance = 1e-5f; + execute(p); +} + +INSTANTIATE_TEST_SUITE_P(fusings_gpu, conv_bin_scale_conv_dw, ::testing::ValuesIn(std::vector{ + binary_convolution_test_params{ CASE_BIN_CONV2, 3, 4 }, + binary_convolution_test_params{ CASE_BIN_CONV3, 3, 4 }, +})); + +class conv_bin_scale_conv_dw_prelu : public BinaryConvolutionFusingTest {}; +TEST_P(conv_bin_scale_conv_dw_prelu, dw_kernel_3x3_stride2) { + auto p = GetParam(); + auto dw_tensor = cldnn::tensor(group(p.out_shape.feature[0]), batch(1), feature(1), spatial(3, 3)); + auto dw_weights_layout = layout{ p.default_type, format::goiyx, dw_tensor }; + + auto dw_stride = tensor{ 1, 1, 2, 2 }; + auto in_thresh = get_mem(get_per_channel_layout(p), min_random, max_random); + create_topologies( + input_layout("input", get_input_layout(p)), + data("weights", get_mem(get_weights_layout(p), -127, 127)), + data("weights_dw", get_mem(dw_weights_layout, -127, 127)), + data("scale_data", get_mem(get_per_channel_layout(p), 1e-1f)), + binary_convolution("bin_conv_prim", "input", { "weights" }, p.stride, p.pad, p.dilation, p.out_shape, p.groups), + scale("scale", 
"bin_conv_prim", "scale_data"), + convolution("conv_dw", "scale", { "weights_dw" }, p.out_shape.feature[0], dw_stride, p.pad, p.dilation), + data("slope_data", get_mem(get_per_channel_layout(p))), + activation("activation", "conv_dw", "slope_data", activation_func::relu_negative_slope), + reorder("reorder_bfyx", "activation", p.default_format, data_types::f32) + ); + + tolerance = 1e-5f; + execute(p); +} + +TEST_P(conv_bin_scale_conv_dw_prelu, dw_kernel_3x3_stride1) { + auto p = GetParam(); + auto dw_tensor = cldnn::tensor(group(p.out_shape.feature[0]), batch(1), feature(1), spatial(3, 3)); + auto dw_weights_layout = layout{ p.default_type, format::goiyx, dw_tensor }; + + auto dw_stride = tensor{ 1, 1, 1, 1 }; + auto in_thresh = get_mem(get_per_channel_layout(p), min_random, max_random); + create_topologies( + input_layout("input", get_input_layout(p)), + data("weights", get_mem(get_weights_layout(p), -127, 127)), + data("weights_dw", get_mem(dw_weights_layout, -127, 127)), + data("scale_data", get_mem(get_per_channel_layout(p), 1e-1f)), + binary_convolution("bin_conv_prim", "input", { "weights" }, p.stride, p.pad, p.dilation, p.out_shape, p.groups), + scale("scale", "bin_conv_prim", "scale_data"), + convolution("conv_dw", "scale", { "weights_dw" }, p.out_shape.feature[0], dw_stride, p.pad, p.dilation), + data("slope_data", get_mem(get_per_channel_layout(p))), + activation("activation", "conv_dw", "slope_data", activation_func::relu_negative_slope), + reorder("reorder_bfyx", "activation", p.default_format, data_types::f32) + ); + + tolerance = 1e-5f; + execute(p); +} + +INSTANTIATE_TEST_SUITE_P(fusings_gpu, conv_bin_scale_conv_dw_prelu, ::testing::ValuesIn(std::vector{ + binary_convolution_test_params{ CASE_BIN_CONV2, 3, 5 }, + binary_convolution_test_params{ CASE_BIN_CONV3, 3, 5 }, +})); diff --git a/src/plugins/intel_gpu/tests/fusions/concatenate_fusion_test.cpp b/src/plugins/intel_gpu/tests/fusions/concatenate_fusion_test.cpp new file mode 100644 index 
00000000000..b1ca06332fa --- /dev/null +++ b/src/plugins/intel_gpu/tests/fusions/concatenate_fusion_test.cpp @@ -0,0 +1,148 @@ +// Copyright (C) 2018-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "test_utils.h" +#include "fusion_test_common.hpp" + +#include +#include +#include +#include + +#include + +using namespace cldnn; +using namespace ::tests; + +#ifdef ENABLE_ONEDNN_FOR_GPU +namespace { +struct concat_test_params { + tensor in_shape; + data_types data_type; + format input_format; + data_types default_type; + format default_format; + size_t expected_fused_primitives; + size_t expected_not_fused_primitives; + std::string kernel_name; +}; + +class ConcatOneDNNFusingTest : public ::BaseFusingTest { +public: + void execute(concat_test_params& p) { + // Onednn post operation has issue in a machine that does not support imad. + if (!engine.get_device_info().supports_imad) + return; + + auto input0_prim = get_mem(get_input_layout(p)); + auto input1_prim = get_mem(get_input_layout(p)); + + build_options onednn_options; + build_options cldnn_options; + + onednn_options.set_option(build_option::optimize_data(true)); + cldnn_options.set_option(build_option::optimize_data(true)); + + implementation_desc onednn_impl = { p.input_format, "", impl_types::onednn }; + implementation_desc cldnn_impl = { p.input_format, "", impl_types::ocl }; + onednn_options.set_option(build_option::force_implementations({ { "concat", onednn_impl } })); + cldnn_options.set_option(build_option::force_implementations({ { "concat", cldnn_impl } })); + + // for onednn fusing test, topology_non_fused means cldnn, topology_fused is onednn + network network_fused_cldnn(this->engine, this->topology_non_fused, cldnn_options); + network network_fused_onednn(this->engine, this->topology_fused, onednn_options); + + network_fused_cldnn.set_input_data("input0", input0_prim); + network_fused_cldnn.set_input_data("input1", input1_prim); + 
network_fused_onednn.set_input_data("input0", input0_prim); + network_fused_onednn.set_input_data("input1", input1_prim); + + ASSERT_FALSE(network_fused_cldnn.get_primitives_info().empty()); + ASSERT_FALSE(network_fused_onednn.get_primitives_info().empty()); + + auto find_and_check = [&](primitive_info& p) -> bool { + if (p.original_id == "concat" || p.original_id == "reorder_bfyx") + return true; + return false; + }; + + auto pi_fused_onednn = network_fused_onednn.get_primitives_info(); + auto pi_fused_cldnn = network_fused_cldnn.get_primitives_info(); + auto info_fused_onednn = std::find_if(pi_fused_onednn.begin(), pi_fused_onednn.end(), find_and_check); + auto info_fused_cldnn = std::find_if(pi_fused_cldnn.begin(), pi_fused_cldnn.end(), find_and_check); + + ASSERT_TRUE(info_fused_onednn != pi_fused_onednn.end()); + ASSERT_TRUE(info_fused_cldnn != pi_fused_cldnn.end()); + + compare(network_fused_cldnn, network_fused_onednn, p); + } + + layout get_input_layout(concat_test_params& p) { + return layout{ p.data_type, p.input_format, p.in_shape }; + } + + layout get_per_channel_layout(concat_test_params& p) { + return layout{ p.default_type, p.default_format, tensor{ 1, p.in_shape.feature[0], 1, 1 } }; + } +}; +} // namespace + +/* ----------------------------------------------------------------------------------------------------- */ +/* --------------------------------------- Concat cases ------------------------------------------------ */ +/* ----------------------------------------------------------------------------------------------------- */ +#define CASE_CONCAT_F32_1 { 1, 8, 4, 4 }, data_types::f32, format::bfyx, data_types::f32, format::bfyx +#define CASE_CONCAT_F16_1 { 1, 8, 4, 4 }, data_types::f16, format::bfyx, data_types::f16, format::bfyx + +class concat_onednn_activation : public ConcatOneDNNFusingTest {}; +TEST_P(concat_onednn_activation, along_f) { + auto p = GetParam(); + create_topologies( + input_layout("input0", get_input_layout(p)), + 
input_layout("input1", get_input_layout(p)), + concatenation("concat", + { "input0", "input1" }, + concatenation::concatenation_axis::along_f, + data_types::f16, + "", + padding{ { 0, 0, 0, 0 }, 0 }), + activation("act", "concat", activation_func::relu), + reorder("reorder_bfyx", "act", cldnn::format::bfyx, p.default_type) + ); + + tolerance = 1.f; + execute(p); +} + +class concat_onednn_eltwise : public ConcatOneDNNFusingTest {}; +TEST_P(concat_onednn_eltwise, along_f) { + auto p = GetParam(); + layout data_layout(p.default_type, p.default_format, tensor{ 1, p.in_shape.feature[0]*2, 1, 1 }); + + create_topologies( + input_layout("input0", get_input_layout(p)), + input_layout("input1", get_input_layout(p)), + data("scale_data", get_mem(data_layout, 1.0f / tensor{ 1, 1, 4, 4 }.count())), + concatenation("concat", + { "input0", "input1" }, + concatenation::concatenation_axis::along_f, + data_types::f16, + "", + padding{ { 0, 0, 0, 0 }, 0 }), + eltwise("scale", { "concat", "scale_data" }, eltwise_mode::prod, p.default_type), + reorder("reorder_bfyx", "scale", cldnn::format::bfyx, p.default_type) + ); + + tolerance = 1.f; + execute(p); +} + +INSTANTIATE_TEST_SUITE_P(fusings_gpu, concat_onednn_activation, ::testing::ValuesIn(std::vector{ + concat_test_params{ CASE_CONCAT_F16_1, 3, 3, "" }, +})); + +INSTANTIATE_TEST_SUITE_P(fusings_gpu, concat_onednn_eltwise, ::testing::ValuesIn(std::vector{ + concat_test_params{ CASE_CONCAT_F32_1, 4, 4, "" }, + concat_test_params{ CASE_CONCAT_F16_1, 4, 4, "" }, +})); +#endif diff --git a/src/plugins/intel_gpu/tests/fusions/convolution_fusion_test.cpp b/src/plugins/intel_gpu/tests/fusions/convolution_fusion_test.cpp new file mode 100644 index 00000000000..be63d06e064 --- /dev/null +++ b/src/plugins/intel_gpu/tests/fusions/convolution_fusion_test.cpp @@ -0,0 +1,3408 @@ +// Copyright (C) 2018-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "fusion_test_common.hpp" + +#include +#include +#include +#include 
+#include +#include +#include +#include +#include +#include +#include +#include + +#include + +using namespace cldnn; +using namespace ::tests; + +namespace { + +struct convolution_test_params { + tensor in_shape; + tensor out_shape; + tensor kernel; + tensor stride; + tensor pad; + tensor dilation; + uint32_t groups; + data_types data_type; + format input_format; + data_types weights_type; + format weights_format; + data_types default_type; + format default_format; + size_t expected_fused_primitives; + size_t expected_not_fused_primitives; +}; + +struct bc_force_kernel_params { + tensor in_shape; + tensor out_shape; + tensor kernel; + tensor stride; + tensor pad; + tensor dilation; + uint32_t groups; + data_types data_type; + format input_format; + data_types weights_type; + format weights_format; + data_types default_type; + format default_format; + size_t expected_fused_primitives; + size_t expected_not_fused_primitives; + std::string kernel_name; +}; + +struct conv_eltw_test_params { + tensor in_shape; + tensor out_shape; + tensor eltw_shape; + tensor kernel; + tensor stride; + tensor pad; + tensor dilation; + uint32_t groups; + data_types data_type; + format input_format; + data_types weights_type; + format weights_format; + data_types default_type; + format default_format; + size_t expected_fused_primitives; + size_t expected_not_fused_primitives; +}; + +class ConvFusingTest : public BaseFusingTest { +public: + void execute(convolution_test_params& p) { + auto input_prim = get_mem(get_input_layout(p)); + network network_not_fused(this->engine, this->topology_non_fused, bo_not_fused); + network network_fused(this->engine, this->topology_fused, bo_fused); + network_fused.set_input_data("input", input_prim); + network_not_fused.set_input_data("input", input_prim); + + compare(network_not_fused, network_fused, p); + auto find_conv = [](primitive_info& p) -> bool { + if (p.original_id == "conv_prim") + return true; + return false; + }; + + auto pi_fused = 
network_fused.get_primitives_info(); + auto info_fused = std::find_if(pi_fused.begin(), pi_fused.end(), find_conv); + if (info_fused != pi_fused.end()) + std::cout << "kernel: " << info_fused->kernel_id << std::endl; + } + + layout get_input_layout(convolution_test_params& p) { + auto pad = p.pad; + std::vector pad_ = { 0, 0, pad.spatial[0], pad.spatial[1] }; + return layout{ p.data_type, p.input_format, p.in_shape, padding{ pad_ } }; + } + + layout get_per_channel_layout(convolution_test_params& p) { + return layout{ p.default_type, p.default_format, tensor{1, p.out_shape.feature[0], 1, 1} }; + } +}; + +class ConvEltwTest : public ::BaseFusingTest { +public: + + void execute(conv_eltw_test_params& p) { + auto input_prim = get_mem(get_input_layout(p)); + network network_not_fused(this->engine, this->topology_non_fused, bo_not_fused); + network network_fused(this->engine, this->topology_fused, bo_fused); + network_fused.set_input_data("input", input_prim); + network_not_fused.set_input_data("input", input_prim); + + compare(network_not_fused, network_fused, p); + auto find_prim = [](primitive_info& p) -> bool { + // Add more ids when needed + if (p.original_id == "deconv_prim") + return true; + return false; + }; + + auto pi_fused = network_fused.get_primitives_info(); + auto info_fused = std::find_if(pi_fused.begin(), pi_fused.end(), find_prim); + if (info_fused != pi_fused.end()) + std::cout << "kernel: " << info_fused->kernel_id << std::endl; + } + + layout get_input_layout(conv_eltw_test_params& p) { + auto pad = p.pad; + std::vector pad_ = { 0, 0, pad.spatial[0], pad.spatial[1] }; + return layout{ p.data_type, p.input_format, p.in_shape, padding{ pad_ } }; + } + + layout get_per_channel_layout(conv_eltw_test_params& p) { + return layout{ p.default_type, p.default_format, tensor{1, p.out_shape.feature[0], 1, 1} }; + } +}; + +class ConvFusingForceKernelTest : public BaseFusingTest { + public: + void execute(bc_force_kernel_params& p) { + auto input_prim = 
get_mem(get_input_layout(p)); + build_options options; + options.set_option(build_option::optimize_data(true)); + implementation_desc conv_impl = { p.input_format, p.kernel_name }; + options.set_option(build_option::force_implementations({ { "conv_prim", conv_impl } })); + + network network_not_fused(this->engine, this->topology_non_fused, bo_not_fused); + network network_fused(this->engine, this->topology_fused, options); + network_fused.set_input_data("input", input_prim); + network_not_fused.set_input_data("input", input_prim); + + compare(network_not_fused, network_fused, p); + auto find_conv = [](primitive_info& p) -> bool { + if (p.original_id == "conv_prim") + return true; + return false; + }; + + auto pi_fused = network_fused.get_primitives_info(); + auto info_fused = std::find_if(pi_fused.begin(), pi_fused.end(), find_conv); + if (info_fused != pi_fused.end()) + std::cout << "kernel: " << info_fused->kernel_id << std::endl; + } + + layout get_input_layout(bc_force_kernel_params& p) { + auto pad = p.pad; + std::vector pad_ = { 0, 0, pad.spatial[0], pad.spatial[1] }; + return layout{ p.data_type, p.input_format, p.in_shape, padding{ pad_ } }; + } + + layout get_per_channel_layout(bc_force_kernel_params& p) { + return layout{ p.default_type, p.default_format, tensor{1, p.out_shape.feature[0], 1, 1} }; + } +}; + +#ifdef ENABLE_ONEDNN_FOR_GPU +class WeightsPrimitiveFusingTestOneDNN : public BaseFusingTest { +public: + void execute(convolution_test_params& p) { + // Onednn post operation has issue in a machine that does not support imad. + if (!engine.get_device_info().supports_imad) + return; + + auto input_prim = p.data_type == data_types::u8 ? 
get_mem(get_input_layout(p), 0, 10) : get_mem(get_input_layout(p)); + + auto impl_forcing_bo = bo_fused.get(); + const auto& impl_forcing = impl_forcing_bo->forcing; + + auto forcing_format = p.input_format; + for (auto& forcing : impl_forcing) { + if (forcing.first == "conv_prim") { + forcing_format = forcing.second.output_format; + } + } + + implementation_desc conv_impl = { forcing_format, "", impl_types::onednn }; + bo_fused.set_option(build_option::force_implementations({ { "conv_prim", conv_impl } })); + + network network_not_fused(this->engine, this->topology_non_fused, bo_not_fused); + network network_fused(this->engine, this->topology_fused, bo_fused); + network_fused.set_input_data("input", input_prim); + network_not_fused.set_input_data("input", input_prim); + + compare(network_not_fused, network_fused, p); + auto find_conv = [](primitive_info& p) -> bool { + if (p.original_id == "conv_prim") + return true; + return false; + }; + + auto pi_fused = network_fused.get_primitives_info(); + auto info_fused = std::find_if(pi_fused.begin(), pi_fused.end(), find_conv); + if (info_fused != pi_fused.end()) + std::cout << "kernel: " << info_fused->kernel_id << std::endl; + } + + layout get_input_layout(convolution_test_params& p) { + auto pad = p.pad; + std::vector pad_ = { 0, 0, pad.spatial[0], pad.spatial[1] }; + return layout{ p.data_type, p.input_format, p.in_shape, padding{ pad_ } }; + } + + layout get_per_channel_layout(convolution_test_params& p) { + return layout{ p.default_type, p.default_format, tensor{1, p.out_shape.feature[0], 1, 1} }; + } +}; +#endif // ENABLE_ONEDNN_FOR_GPU + +} // namespace + +// in_shape; out_shape; kernel; stride; pad; dilation; groups; data_type; input_format; weights_type; weights_format; default_type; default_format; +#define CASE_CONV_FP32_1 { 1, 15, 4, 5 }, { 1, 30, 2, 3 }, { 1, 1, 3, 3 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::f32, format::bfyx, data_types::f32, format::oiyx, data_types::f32, format::bfyx 
+#define CASE_CONV_FP32_2 { 1, 16, 4, 5 }, { 1, 32, 2, 3 }, { 1, 1, 3, 3 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::f32, format::b_fs_yx_fsv16, data_types::f32, format::os_is_yx_isv16_osv16, data_types::f32, format::bfyx +#define CASE_CONV_FP32_3 { 1, 16, 4, 5 }, { 1, 32, 4, 5 }, { 1, 1, 1, 1 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::f32, format::b_fs_yx_fsv16, data_types::f32, format::os_is_yx_isv16_osv16, data_types::f32, format::bfyx +#define CASE_CONV_FP32_4 { 1, 32, 4, 5 }, { 1, 32, 4, 5 }, { 1, 1, 3, 3 }, tensor{ 1 }, tensor{ { 0, 0, 1, 1, 0, 0 }, 0 }, tensor{ 1 }, 32, data_types::f32, format::b_fs_yx_fsv16, data_types::f32, format::gs_oiyx_gsv16, data_types::f32, format::bfyx +#define CASE_CONV_FP32_5 { 1, 15, 4, 5 }, { 1, 30, 2, 3 }, { 1, 1, 3, 3 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::f32, format::bfyx, data_types::i8, format::bfyx, data_types::f32, format::bfyx +#define CASE_CONV_FP32_6 { 1, 16, 4, 5, 4 }, { 1, 16, 2, 3, 2 }, { 1, 1, 3, 3, 3 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::f32, format::b_fs_zyx_fsv16, data_types::f32, format::os_is_zyx_isv16_osv16, data_types::f32, format::bfzyx +#define CASE_CONV_FP32_7 { 1, 16, 4, 5, 4 }, { 1, 32, 2, 3, 2 }, { 1, 1, 3, 3, 3 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::f32, format::b_fs_zyx_fsv16, data_types::f32, format::os_is_zyx_isv16_osv16, data_types::f32, format::bfzyx +#define CASE_CONV_FP32_8 { 1, 32, 4, 5, 4 }, { 1, 16, 2, 3, 2 }, { 1, 1, 3, 3, 3 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 2, data_types::f32, format::b_fs_zyx_fsv16, data_types::f32, format::g_os_is_zyx_isv16_osv16, data_types::f32, format::bfzyx +#define CASE_CONV_FP32_9 { 1, 32, 4, 5, 4 }, { 1, 32, 2, 3, 2 }, { 1, 1, 3, 3, 3 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 2, data_types::f32, format::b_fs_zyx_fsv16, data_types::f32, format::g_os_is_zyx_isv16_osv16, data_types::f32, format::bfzyx +#define CASE_CONV_FP32_10 { 32, 16, 4, 5, 4 }, { 32, 32, 4, 5, 4 }, { 1, 1, 
1, 1, 1 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::f32, format::bs_fs_zyx_bsv16_fsv16, data_types::f32, format::bfzyx, data_types::f32, format::bfzyx +#define CASE_CONV_FP32_11 { 1, 32, 4, 5, 4 }, { 1, 16, 2, 3, 2 }, { 1, 1, 3, 3, 3 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 2, data_types::f32, format::b_fs_zyx_fsv16, data_types::f32, format::g_os_is_zyx_isv16_osv16, data_types::f32, format::bfzyx +#define CASE_CONV_FP32_12 { 1, 16, 4, 5, 4 }, { 1, 16, 2, 3, 2 }, { 1, 1, 3, 3, 3 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 2, data_types::f32, format::b_fs_zyx_fsv16, data_types::f32, format::g_os_is_zyx_isv16_osv16, data_types::f32, format::bfzyx +#define CASE_CONV_FP32_13 { 1, 16, 18, 5, 4 }, { 1, 16, 16, 3, 2 }, { 1, 1, 3, 3, 3 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 2, data_types::f32, format::b_fs_zyx_fsv16, data_types::f32, format::g_os_is_zyx_isv16_osv16, data_types::f32, format::bfzyx +#define CASE_CONV_FP32_14 { 1, 3, 4, 5 }, { 1, 30, 2, 3 }, { 1, 1, 3, 3 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::f32, format::bfyx, data_types::f32, format::bfyx, data_types::f32, format::bfyx + + +#define CASE_CONV_FP16_1 { 1, 15, 4, 5 }, { 1, 30, 2, 3 }, { 1, 1, 3, 3 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::f16, format::bfyx, data_types::f16, format::bfyx, data_types::f16, format::bfyx +#define CASE_CONV_FP16_2 { 1, 16, 4, 5 }, { 1, 32, 2, 3 }, { 1, 1, 3, 3 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::f16, format::b_fs_yx_fsv16, data_types::f16, format::os_is_yx_isv16_osv16, data_types::f16, format::bfyx +#define CASE_CONV_FP16_3 { 1, 16, 4, 5 }, { 1, 32, 4, 5 }, { 1, 1, 1, 1 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::f16, format::b_fs_yx_fsv16, data_types::f16, format::os_is_yx_isv16_osv16, data_types::f16, format::bfyx +#define CASE_CONV_FP16_4 { 1, 32, 4, 5 }, { 1, 32, 4, 5 }, { 1, 1, 3, 3 }, tensor{ 1 }, tensor{ { 0, 0, 1, 1, 0, 0 }, 0 }, tensor{ 1 }, 32, data_types::f16, format::b_fs_yx_fsv16, 
data_types::f16, format::gs_oiyx_gsv16, data_types::f16, format::bfyx +#define CASE_CONV_FP16_5 { 1, 15, 4, 5 }, { 1, 30, 2, 3 }, { 1, 1, 3, 3 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::f16, format::bfyx, data_types::i8, format::bfyx, data_types::f16, format::bfyx +#define CASE_CONV_FP16_6 { 1, 16, 4, 5, 4 }, { 1, 16, 2, 3, 2 }, { 1, 1, 3, 3, 3 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::f16, format::b_fs_zyx_fsv16, data_types::f16, format::os_is_zyx_isv16_osv16, data_types::f16, format::bfzyx +#define CASE_CONV_FP16_7 { 1, 16, 4, 5, 4 }, { 1, 32, 2, 3, 2 }, { 1, 1, 3, 3, 3 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::f16, format::b_fs_zyx_fsv16, data_types::f16, format::os_is_zyx_isv16_osv16, data_types::f16, format::bfzyx +#define CASE_CONV_FP16_8 { 1, 32, 4, 5, 4 }, { 1, 16, 2, 3, 2 }, { 1, 1, 3, 3, 3 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 2, data_types::f16, format::b_fs_zyx_fsv16, data_types::f16, format::g_os_is_zyx_isv16_osv16, data_types::f16, format::bfzyx +#define CASE_CONV_FP16_9 { 1, 32, 4, 5, 4 }, { 1, 32, 2, 3, 2 }, { 1, 1, 3, 3, 3 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 2, data_types::f16, format::b_fs_zyx_fsv16, data_types::f16, format::g_os_is_zyx_isv16_osv16, data_types::f16, format::bfzyx +#define CASE_CONV_FP16_10 { 32, 16, 4, 5, 4 }, { 32, 32, 2, 3, 2 }, { 1, 1, 3, 3, 3 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::f16, format::bs_fs_zyx_bsv16_fsv16, data_types::f16, format::bfzyx, data_types::f16, format::bfzyx +#define CASE_CONV_FP16_11 { 1, 32, 4, 5, 4 }, { 1, 16, 2, 3, 2 }, { 1, 1, 3, 3, 3 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 2, data_types::f16, format::b_fs_zyx_fsv16, data_types::f16, format::g_os_is_zyx_isv16_osv16, data_types::f16, format::bfzyx +#define CASE_CONV_FP16_12 { 1, 16, 4, 5, 4 }, { 1, 16, 2, 3, 2 }, { 1, 1, 3, 3, 3 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 2, data_types::f16, format::b_fs_zyx_fsv16, data_types::f16, format::g_os_is_zyx_isv16_osv16, data_types::f16, 
format::bfzyx +#define CASE_CONV_FP16_13 { 16, 32, 4, 5 }, { 16, 64, 2, 3 }, { 1, 1, 3, 3 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::f16, format::fs_b_yx_fsv32, data_types::f16, format::bfyx, data_types::f16, format::bfyx + +#define CASE_CONV_U8S8_1 { 1, 15, 4, 5 }, { 1, 30, 2, 3 }, { 1, 1, 3, 3 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::u8, format::bfyx, data_types::i8, format::bfyx, data_types::f32, format::bfyx +#define CASE_CONV_U8S8_2 { 1, 15, 5, 5 }, { 1, 30, 3, 3 }, { 1, 1, 3, 3 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::u8, format::bfyx, data_types::i8, format::bfyx, data_types::f32, format::bfyx +#define CASE_CONV_U8S8_3 { 1, 16, 4, 5 }, { 1, 32, 4, 5 }, { 1, 1, 1, 1 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::u8, format::bfyx, data_types::i8, format::bfyx, data_types::f32, format::bfyx +#define CASE_CONV_U8S8_4 { 1, 17, 4, 5 }, { 1, 17, 4, 5 }, { 1, 1, 3, 3 }, tensor{ 1 }, tensor{ { 0, 0, 1, 1, 0, 0 }, 0 }, tensor{ 1 }, 17, data_types::u8, format::bfyx, data_types::i8, format::goiyx, data_types::f32, format::bfyx +#define CASE_CONV_U8S8_5 { 1, 16, 5, 5 }, { 1, 32, 5, 5 }, { 1, 1, 1, 1 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::u8, format::bfyx, data_types::i8, format::bfyx, data_types::f32, format::bfyx +#define CASE_CONV_U8S8_6 { 1, 17, 4, 5 }, { 1, 17, 4, 5 }, { 1, 1, 1, 1 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 17, data_types::u8, format::bfyx, data_types::i8, format::goiyx, data_types::f32, format::bfyx +#define CASE_CONV_U8S8_7 { 1, 64, 7, 7 }, { 1, 32, 7, 7 }, { 1, 1, 3, 3 }, tensor{ 1 }, tensor{ { 0, 0, 1, 1, 0, 0 }, 0 }, tensor{ 1 }, 1, data_types::u8, format::bfyx, data_types::i8, format::bfyx, data_types::f32, format::bfyx +#define CASE_CONV_U8S8_8 { 1, 3, 4, 5 }, { 1, 32, 4, 5 }, { 1, 1, 3, 3 }, tensor{ 1 }, tensor{ { 0, 0, 1, 1, 0, 0 }, 0 }, tensor{ 1 }, 1, data_types::u8, format::bfyx, data_types::i8, format::bfyx, data_types::f32, format::bfyx +#define CASE_CONV_U8S8_9 { 
16, 32, 5, 5 }, { 16, 32, 3, 3 }, { 1, 1, 1, 1 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::u8, format::bs_fs_yx_bsv16_fsv16, data_types::i8, format::os_is_yx_osv16_isv16, data_types::f32, format::bfyx +#define CASE_CONV_U8S8_10 { 16, 32, 5, 5 }, { 16, 32, 3, 3 }, { 1, 1, 3, 3 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::u8, format::bs_fs_yx_bsv16_fsv16, data_types::i8, format::os_is_yx_osv16_isv16, data_types::f32, format::bfyx +#define CASE_CONV_U8S8_11 { 32, 15, 4, 5 }, { 32, 30, 2, 3 }, { 1, 1, 3, 3 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::u8, format::bfyx, data_types::i8, format::bfyx, data_types::f32, format::bfyx +#define CASE_CONV_U8S8_12 { 32, 15, 5, 5 }, { 32, 30, 3, 3 }, { 1, 1, 3, 3 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::u8, format::bfyx, data_types::i8, format::bfyx, data_types::f32, format::bfyx +#define CASE_CONV_U8S8_13 { 32, 16, 4, 5 }, { 32, 32, 4, 5 }, { 1, 1, 1, 1 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::u8, format::bfyx, data_types::i8, format::bfyx, data_types::f32, format::bfyx +#define CASE_CONV_U8S8_14 { 32, 17, 4, 5 }, { 32, 17, 4, 5 }, { 1, 1, 3, 3 }, tensor{ 1 }, tensor{ { 0, 0, 1, 1, 0, 0 }, 0 }, tensor{ 1 }, 17, data_types::u8, format::bfyx, data_types::i8, format::goiyx, data_types::f32, format::bfyx +#define CASE_CONV_U8S8_15 { 1, 15, 2, 2 }, { 1, 30, 1, 1 }, { 1, 1, 3, 3 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::u8, format::bfyx, data_types::i8, format::bfyx, data_types::f32, format::bfyx + +#define CASE_CONV_S8S8_1 { 1, 15, 4, 5 }, { 1, 30, 2, 3 }, { 1, 1, 3, 3 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::i8, format::bfyx, data_types::i8, format::bfyx, data_types::f32, format::bfyx +#define CASE_CONV_S8S8_2 { 1, 15, 5, 5 }, { 1, 30, 3, 3 }, { 1, 1, 3, 3 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::i8, format::bfyx, data_types::i8, format::bfyx, data_types::f32, format::bfyx +#define CASE_CONV_S8S8_3 { 1, 16, 4, 5 }, { 1, 
32, 4, 5 }, { 1, 1, 1, 1 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::i8, format::bfyx, data_types::i8, format::bfyx, data_types::f32, format::bfyx +#define CASE_CONV_S8S8_4 { 1, 17, 4, 5 }, { 1, 17, 4, 5 }, { 1, 1, 3, 3 }, tensor{ 1 }, tensor{ { 0, 0, 1, 1, 0, 0 }, 0 }, tensor{ 1 }, 17, data_types::i8, format::bfyx, data_types::i8, format::goiyx, data_types::f32, format::bfyx +#define CASE_CONV_S8S8_5 { 1, 16, 5, 5 }, { 1, 32, 5, 5 }, { 1, 1, 1, 1 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::i8, format::bfyx, data_types::i8, format::bfyx, data_types::f32, format::bfyx +#define CASE_CONV_S8S8_6 { 1, 17, 4, 5 }, { 1, 17, 4, 5 }, { 1, 1, 1, 1 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 17, data_types::i8, format::bfyx, data_types::i8, format::goiyx, data_types::f32, format::bfyx +#define CASE_CONV_S8S8_7 { 1, 64, 7, 7 }, { 1, 32, 7, 7 }, { 1, 1, 3, 3 }, tensor{ 1 }, tensor{ { 0, 0, 1, 1, 0, 0 }, 0 }, tensor{ 1 }, 1, data_types::i8, format::bfyx, data_types::i8, format::bfyx, data_types::f32, format::bfyx +#define CASE_CONV_S8S8_8 { 1, 3, 4, 5 }, { 1, 32, 4, 5 }, { 1, 1, 3, 3 }, tensor{ 1 }, tensor{ { 0, 0, 1, 1, 0, 0 }, 0 }, tensor{ 1 }, 1, data_types::i8, format::bfyx, data_types::i8, format::bfyx, data_types::f32, format::bfyx +#define CASE_CONV_S8S8_9 { 16, 32, 5, 5 }, { 16, 32, 3, 3 }, { 1, 1, 1, 1 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::i8, format::bs_fs_yx_bsv16_fsv16, data_types::i8, format::os_is_yx_osv16_isv16, data_types::f32, format::bfyx +#define CASE_CONV_S8S8_10 { 16, 32, 5, 5 }, { 16, 32, 3, 3 }, { 1, 1, 3, 3 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::i8, format::bs_fs_yx_bsv16_fsv16, data_types::i8, format::os_is_yx_osv16_isv16, data_types::f32, format::bfyx +#define CASE_CONV_S8S8_11 { 1, 4, 1280, 720 }, { 1, 4, 1280, 720 }, { 1, 1, 5, 5 }, tensor{ 1 }, tensor{ { 0, 0, 2, 2 }, 0 }, tensor{ 1 }, 1, data_types::i8, format::b_fs_yx_fsv4, data_types::i8, format::os_is_yx_osv16_isv4, data_types::f32, 
format::bfyx +#define CASE_CONV_S8S8_12 { 32, 15, 4, 5 }, { 32, 30, 2, 3 }, { 1, 1, 3, 3 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::i8, format::bfyx, data_types::i8, format::bfyx, data_types::f32, format::bfyx +#define CASE_CONV_S8S8_13 { 32, 15, 5, 5 }, { 32, 30, 3, 3 }, { 1, 1, 3, 3 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::i8, format::bfyx, data_types::i8, format::bfyx, data_types::f32, format::bfyx +#define CASE_CONV_S8S8_14 { 32, 16, 4, 5 }, { 32, 32, 4, 5 }, { 1, 1, 1, 1 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::i8, format::bfyx, data_types::i8, format::bfyx, data_types::f32, format::bfyx +#define CASE_CONV_S8S8_15 { 32, 17, 4, 5 }, { 32, 17, 4, 5 }, { 1, 1, 3, 3 }, tensor{ 1 }, tensor{ { 0, 0, 1, 1, 0, 0 }, 0 }, tensor{ 1 }, 17, data_types::i8, format::bfyx, data_types::i8, format::goiyx, data_types::f32, format::bfyx + +#define CASE_CONV3D_U8S8_1 { 1, 15, 5, 4, 5 }, { 1, 30, 3, 2, 3 }, { 1, 1, 3, 3, 3 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::u8, format::bfzyx, data_types::i8, format::bfzyx, data_types::f32, format::bfzyx +#define CASE_CONV3D_U8S8_2 { 1, 15, 5, 5, 5 }, { 1, 30, 3, 3, 3 }, { 1, 1, 3, 3, 3 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::u8, format::bfzyx, data_types::i8, format::bfzyx, data_types::f32, format::bfzyx +#define CASE_CONV3D_U8S8_3 { 1, 16, 5, 4, 5 }, { 1, 32, 5, 4, 5 }, { 1, 1, 1, 1, 1 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::u8, format::bfzyx, data_types::i8, format::bfzyx, data_types::f32, format::bfzyx +#define CASE_CONV3D_U8S8_4 { 1, 17, 5, 4, 5 }, { 1, 17, 5, 4, 5 }, { 1, 1, 3, 3, 3 }, tensor{ 1 }, tensor{ { 0, 0, 1, 1, 1 }, 0 }, tensor{ 1 }, 17, data_types::u8, format::bfzyx, data_types::i8, format::goizyx, data_types::f32, format::bfzyx +#define CASE_CONV3D_U8S8_5 { 1, 3, 5, 4, 5 }, { 1, 32, 5, 4, 5 }, { 1, 1, 3, 3, 3 }, tensor{ 1 }, tensor{ { 0, 0, 1, 1, 1 }, 0 }, tensor{ 1 }, 1, data_types::u8, format::bfzyx, data_types::i8, 
format::bfzyx, data_types::f32, format::bfzyx + +#define CASE_CONV3D_S8S8_1 { 1, 15, 5, 4, 5 }, { 1, 30, 3, 2, 3 }, { 1, 1, 3, 3, 3 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::i8, format::bfzyx, data_types::i8, format::bfzyx, data_types::f32, format::bfzyx +#define CASE_CONV3D_S8S8_2 { 1, 15, 5, 5, 5 }, { 1, 30, 3, 3, 3 }, { 1, 1, 3, 3, 3 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::i8, format::bfzyx, data_types::i8, format::bfzyx, data_types::f32, format::bfzyx +#define CASE_CONV3D_S8S8_3 { 1, 16, 5, 4, 5 }, { 1, 32, 5, 4, 5 }, { 1, 1, 1, 1, 1 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::i8, format::bfzyx, data_types::i8, format::bfzyx, data_types::f32, format::bfzyx +#define CASE_CONV3D_S8S8_4 { 1, 17, 5, 4, 5 }, { 1, 17, 5, 4, 5 }, { 1, 1, 3, 3, 3 }, tensor{ 1 }, tensor{ { 0, 0, 1, 1, 1 }, 0 }, tensor{ 1 }, 17, data_types::i8, format::bfzyx, data_types::i8, format::goizyx, data_types::f32, format::bfzyx +#define CASE_CONV3D_S8S8_5 { 1, 3, 5, 4, 5 }, { 1, 18, 5, 4, 5 }, { 1, 1, 3, 3, 3 }, tensor{ 1 }, tensor{ { 0, 0, 1, 1, 1 }, 0 }, tensor{ 1 }, 1, data_types::i8, format::bfzyx, data_types::i8, format::bfzyx, data_types::f32, format::bfzyx + +// in_shape; out_shape; eltw_shape; kernel; stride; pad; dilation; groups; data_type; input_format; weights_type; weights_format; default_type; default_format; +#define CASE_CONV_ELTW_FP32_1 { 1, 16, 4, 5 }, { 1, 32, 2, 3 }, { 1, 32, 1, 1 }, { 1, 1, 3, 3 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::f32, format::b_fs_yx_fsv16, data_types::f32, format::oiyx, data_types::f32, format::bfyx +#define CASE_CONV_ELTW_FP32_2 { 1, 16, 4, 5 }, { 1, 32, 2, 3 }, { 1, 1, 1, 1 }, { 1, 1, 3, 3 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::f32, format::b_fs_yx_fsv16, data_types::f32, format::os_is_yx_isv16_osv16, data_types::f32, format::bfyx +#define CASE_CONV_ELTW_FP32_3 { 1, 16, 4, 5 }, { 1, 32, 4, 5 }, { 1, 32, 4, 5 }, { 1, 1, 1, 1 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 1, 
data_types::f32, format::b_fs_yx_fsv16, data_types::f32, format::os_is_yx_isv16_osv16, data_types::f32, format::bfyx +#define CASE_CONV_ELTW_FP32_4 { 1, 32, 4, 5 }, { 1, 32, 4, 5 }, { 1, 32, 1, 1 }, { 1, 1, 3, 3 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 32, data_types::f32, format::b_fs_yx_fsv16, data_types::f32, format::gs_oiyx_gsv16, data_types::f32, format::bfyx +#define CASE_CONV_ELTW_FP32_5 { 1, 32, 4, 5, 4 }, { 1, 32, 2, 3, 2 }, { 1, 32, 2, 1, 1 }, { 1, 1, 3, 3, 3 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 2, data_types::f32, format::b_fs_zyx_fsv16, data_types::f32, format::g_os_is_zyx_isv16_osv16, data_types::f32, format::bfzyx +#define CASE_CONV_ELTW_FP32_6 { 1, 32, 4, 5, 4 }, { 1, 16, 2, 3, 2 }, { 1, 16, 2, 1, 1 }, { 1, 1, 3, 3, 3 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 2, data_types::f32, format::b_fs_zyx_fsv16, data_types::f32, format::g_os_is_zyx_isv16_osv16, data_types::f32, format::bfzyx +#define CASE_CONV_ELTW_FP32_7 { 1, 16, 3, 5 }, { 1, 32, 1, 3 }, { 1, 32, 3, 1 }, { 1, 1, 3, 3 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::f32, format::b_fs_yx_fsv16, data_types::f32, format::os_is_yx_isv16_osv16, data_types::f32, format::bfyx +#define CASE_CONV_ELTW_FP32_8 { 1, 32, 3, 5, 4 }, { 1, 16, 1, 3, 2 }, { 1, 1, 2, 1, 1 }, { 1, 1, 3, 3, 3 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 2, data_types::f32, format::b_fs_zyx_fsv16, data_types::f32, format::g_os_is_zyx_isv16_osv16, data_types::f32, format::bfzyx + +#define CASE_CONV_ELTW_i8_1 { 1, 16, 3, 5 }, { 1, 32, 1, 3 }, { 1, 32, 3, 1 }, { 1, 1, 3, 3 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::i8, format::b_fs_yx_fsv16, data_types::i8, format::os_is_yx_osv16_isv16, data_types::f32, format::bfyx +#define CASE_CONV_ELTW_i8_2 { 1, 16, 3, 5, 3 }, { 1, 32, 2, 4, 2 }, { 1, 1, 2, 4, 2 }, { 1, 1, 2, 2, 2 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::i8, format::bfzyx, data_types::i8, format::oiyx, data_types::f32, format::bfzyx +#define CASE_CONV_ELTW_i8_3 { 1, 1, 1, 1, 1 }, { 1, 1, 
1, 1, 1 }, { 1, 1, 1, 1, 1 }, { 1, 1, 1, 1, 1 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::i8, format::bfzyx, data_types::i8, format::oiyx, data_types::f32, format::bfzyx +#define CASE_CONV_ELTW_i8_4 { 1, 16, 1, 4 }, { 1, 16, 1, 2 }, { 1, 16, 1, 1 }, { 1, 1, 1, 3 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::i8, format::b_fs_yx_fsv16, data_types::i8, format::os_is_yx_osv16_isv16, data_types::f32, format::bfyx +#define CASE_CONV_ELTW_i8_5 { 1, 16, 1, 4, 1 }, { 1, 16, 1, 2, 1 }, { 1, 16, 2, 1, 1 }, { 1, 1, 1, 3, 1 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::i8, format::bfzyx, data_types::i8, format::oiyx, data_types::f32, format::bfzyx + +#define CASE_BIN_CONV1 { 1, 16, 4, 5 }, { 1, 16, 4, 5 }, { 1, 1, 3, 3 }, tensor{ 1 }, tensor{ { 0, 0, 1, 1, 0, 0 }, 0 }, tensor{ 1 }, 1, data_types::bin, format::b_fs_yx_32fp, data_types::bin, format::os_is_yx_osv32_isv32p, data_types::f32, format::bfyx +#define CASE_BIN_CONV2 { 1, 16, 4, 5 }, { 1, 30, 4, 5 }, { 1, 1, 1, 1 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::bin, format::b_fs_yx_32fp, data_types::bin, format::os_is_yx_osv32_isv32p, data_types::f32, format::bfyx +#define CASE_BIN_CONV3 { 1, 184, 12, 21 }, { 1, 224, 12, 21 }, { 1, 1, 1, 1 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::bin, format::b_fs_yx_32fp, data_types::bin, format::os_is_yx_osv32_isv32p, data_types::f32, format::bfyx + +#define CASE_FC_FP32_1 { 1, 1, 3, 1 }, { 1, 4, 1, 1 }, { 4, 1, 3, 1 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::f32, format::bfyx, data_types::f32, format::oiyx, data_types::f32, format::bfyx +#define CASE_FC_FP32_2 { 2, 1, 3, 1 }, { 2, 4, 1, 1 }, { 4, 1, 3, 1 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::f32, format::yxfb, data_types::f32, format::oiyx, data_types::f32, format::bfyx +#define CASE_FC_FP32_3 { 2, 32, 1, 1 }, { 2, 16, 1, 1 }, { 16, 32, 1, 1 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::f32, format::bfyx, data_types::i8, format::oiyx, 
data_types::f32, format::bfyx +#define CASE_FC_FP32_3D_1 { 5, 3, 1, 3 }, { 5, 3, 1, 5 }, { 5, 3, 1, 1 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::f32, format::bfyx, data_types::f32, format::os_iyx_osv16, data_types::f32, format::bfyx +#define CASE_FC_FP32_3D_2 { 2, 1, 1, 1 }, { 2, 1, 1, 32 }, { 32, 1, 1, 1 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::f32, format::bfyx, data_types::f32, format::os_iyx_osv16, data_types::f32, format::bfyx +#define CASE_FC_FP32_3D_3 { 2, 32, 1, 32 }, { 2, 32, 1, 16 }, { 16, 32, 1, 1 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::f32, format::bfyx, data_types::f32, format::os_iyx_osv16, data_types::f32, format::bfyx + +#define CASE_FC_U8S8_1 { 1, 1, 3, 1 }, { 1, 4, 1, 1 }, { 4, 1, 3, 1 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::u8, format::bfyx, data_types::i8, format::oiyx, data_types::f32, format::bfyx +#define CASE_FC_U8S8_2 { 2, 1, 3, 1 }, { 2, 4, 1, 1 }, { 4, 1, 3, 1 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::u8, format::b_fs_yx_fsv4, data_types::i8, format::oiyx, data_types::f32, format::bfyx +#define CASE_FC_U8S8_3 { 2, 32, 1, 1 }, { 2, 16, 1, 1 }, { 16, 32, 1, 1 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::u8, format::b_fs_yx_fsv4, data_types::i8, format::oiyx, data_types::f32, format::bfyx +#define CASE_FC_U8S8_3D_1 { 2, 32, 1, 3 }, { 2, 32, 1, 16 }, { 16, 3, 1, 1 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::u8, format::bfyx, data_types::i8, format::oiyx, data_types::f32, format::bfyx +#define CASE_FC_U8S8_3D_2 { 1, 1, 1, 3 }, { 1, 1, 1, 32 }, { 32, 3, 1, 1 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::u8, format::bfyx, data_types::i8, format::oiyx, data_types::f32, format::bfyx +#define CASE_FC_U8S8_3D_3 { 2, 3, 1, 1 }, { 2, 3, 1, 15 }, { 15, 1, 1, 1 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::u8, format::bfyx, data_types::i8, format::oiyx, data_types::f32, format::bfyx +#define CASE_FC_U8S8_3D_4 { 1, 512, 1, 1024 }, 
{ 1, 384, 1, 1024 }, { 1024, 1024, 1, 1 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::u8, format::bfyx, data_types::i8, format::oiyx, data_types::f32, format::bfyx + +#define CASE_NORMALIZE_I8_1 { 1, 2, 3, 3 }, data_types::u8, format::bfyx, data_types::f32, format::bfyx + +/* ----------------------------------------------------------------------------------------------------- */ +/* ---------------------------------------- FP32 convolution cases ------------------------------------- */ +/* ----------------------------------------------------------------------------------------------------- */ +/* ----------- NOTE: A part of tests is disabled until all FP kernels don't support fusings ------------ */ + +class conv_fp32_reorder_fsv16_to_bfyx : public ConvFusingTest {}; +TEST_P(conv_fp32_reorder_fsv16_to_bfyx, basic) { + auto p = GetParam(); + create_topologies( + input_layout("input", get_input_layout(p)), + data("weights", get_mem(get_weights_layout(p))), + reorder("reorder_fsv16", "input", format::b_fs_yx_fsv16, data_types::f32), + convolution("conv_prim", "reorder_fsv16", { "weights" }, p.groups, p.stride, p.pad, p.dilation), + reorder("reorder_bfyx", "conv_prim", format::bfyx, data_types::f32) + ); + + execute(p); +} + +INSTANTIATE_TEST_SUITE_P(fusings_gpu, conv_fp32_reorder_fsv16_to_bfyx, ::testing::ValuesIn(std::vector{ + convolution_test_params{ CASE_CONV_FP32_1, 2, 2 }, + convolution_test_params{ CASE_CONV_FP32_2, 2, 2 }, + convolution_test_params{ CASE_CONV_FP32_3, 2, 2 }, + convolution_test_params{ CASE_CONV_FP32_4, 2, 2 }, + convolution_test_params{ CASE_CONV_FP32_5, 2, 2 }, + convolution_test_params{ CASE_CONV_FP32_14, 2, 2 }, + + convolution_test_params{ CASE_CONV_FP16_1, 2, 2 }, + convolution_test_params{ CASE_CONV_FP16_2, 2, 2 }, + convolution_test_params{ CASE_CONV_FP16_3, 2, 2 }, + convolution_test_params{ CASE_CONV_FP16_4, 2, 2 }, + convolution_test_params{ CASE_CONV_FP16_5, 2, 2 }, + convolution_test_params{ CASE_CONV_FP16_13, 2, 2 } 
+})); + +class conv_fp32_reorder_fsv16_to_bfyx_conv : public ConvFusingTest {}; +TEST_P(conv_fp32_reorder_fsv16_to_bfyx_conv, basic) { + auto p = GetParam(); + + auto dw_tensor = cldnn::tensor(group(p.out_shape.feature[0]), batch(1), feature(1), spatial(3, 3)); + auto dw_weights_layout = layout{ p.default_type, format::goiyx, dw_tensor }; + auto dw_stride = tensor{ 0, 0, 1, 1 }; + + create_topologies( + input_layout("input", get_input_layout(p)), + data("weights", get_mem(get_weights_layout(p), -127, 127)), + data("weights_dw", get_mem(dw_weights_layout, -127, 127)), + reorder("reorder_fsv16", "input", format::b_fs_yx_fsv16, data_types::f32), + convolution("conv_prim", "reorder_fsv16", { "weights" }, p.groups, p.stride, p.pad, p.dilation), + reorder("reorder_bfyx", "conv_prim", format::bfyx, data_types::f32), + convolution("conv_output", "reorder_bfyx", { "weights_dw" }, p.out_shape.feature[0], dw_stride, p.pad, p.dilation), + activation("activation", "conv_output", activation_func::abs), + reorder("reorder_output", "activation", p.default_format, data_types::f32) + ); + + execute(p); +} + +INSTANTIATE_TEST_SUITE_P(fusings_gpu, conv_fp32_reorder_fsv16_to_bfyx_conv, ::testing::ValuesIn(std::vector{ + convolution_test_params{ CASE_CONV_FP32_1, 3, 4 }, + convolution_test_params{ CASE_CONV_FP32_2, 3, 4 }, + convolution_test_params{ CASE_CONV_FP32_3, 3, 4 }, + convolution_test_params{ CASE_CONV_FP32_4, 3, 4 }, + convolution_test_params{ CASE_CONV_FP32_5, 3, 4 }, + convolution_test_params{ CASE_CONV_FP32_14, 3, 4 }, + + convolution_test_params{ CASE_CONV_FP16_1, 3, 4 }, + convolution_test_params{ CASE_CONV_FP16_2, 3, 4 }, + convolution_test_params{ CASE_CONV_FP16_3, 3, 4 }, + convolution_test_params{ CASE_CONV_FP16_4, 3, 4 }, + convolution_test_params{ CASE_CONV_FP16_5, 3, 4 }, + convolution_test_params{ CASE_CONV_FP16_13, 3, 4 }, +})); + + +class conv_fp32_activation : public ConvFusingTest {}; +TEST_P(conv_fp32_activation, basic) { + auto p = GetParam(); + 
create_topologies( + input_layout("input", get_input_layout(p)), + data("weights", get_mem(get_weights_layout(p))), + data("bias", get_mem(get_bias_layout(p))), + convolution("conv_prim", "input", { "weights" }, { "bias" }, p.groups, p.stride, p.pad, p.dilation), + activation("activation", "conv_prim", activation_func::abs), + reorder("reorder_bfyx", "activation", p.default_format, data_types::f32) + ); + + execute(p); +} + +INSTANTIATE_TEST_SUITE_P(fusings_gpu, conv_fp32_activation, ::testing::ValuesIn(std::vector{ + convolution_test_params{ CASE_CONV_FP32_1, 2, 3 }, + convolution_test_params{ CASE_CONV_FP32_2, 2, 3 }, + convolution_test_params{ CASE_CONV_FP32_3, 2, 3 }, + convolution_test_params{ CASE_CONV_FP32_4, 2, 3 }, + + convolution_test_params{ CASE_CONV_FP16_4, 2, 3 }, + convolution_test_params{ CASE_CONV_FP16_4, 2, 3 }, + convolution_test_params{ CASE_CONV_FP16_4, 2, 3 }, + convolution_test_params{ CASE_CONV_FP16_4, 2, 3 }, +})); + + +class conv_fp32_scale : public ConvFusingTest {}; +TEST_P(conv_fp32_scale, basic) { + auto p = GetParam(); + create_topologies( + input_layout("input", get_input_layout(p)), + data("weights", get_mem(get_weights_layout(p))), + data("bias", get_mem(get_bias_layout(p))), + data("scale_data", get_mem(get_per_channel_layout(p), 1.0f/p.kernel.count())), + convolution("conv_prim", "input", { "weights" }, { "bias" }, p.groups, p.stride, p.pad, p.dilation), + scale("scale", "conv_prim", "scale_data"), + reorder("reorder_bfyx", "scale", p.default_format, data_types::f32) + ); + + tolerance = 1e-5f; + execute(p); +} + +INSTANTIATE_TEST_SUITE_P(fusings_gpu, conv_fp32_scale, ::testing::ValuesIn(std::vector{ + // convolution_test_params{ CASE_CONV_FP32_1, 2, 3 }, + convolution_test_params{ CASE_CONV_FP32_2, 2, 3 }, + convolution_test_params{ CASE_CONV_FP32_3, 2, 3 }, + convolution_test_params{ CASE_CONV_FP32_4, 2, 3 }, + convolution_test_params{ CASE_CONV_FP32_10, 2, 3 }, + + // convolution_test_params{ CASE_CONV_FP16_1, 2, 3 }, + 
convolution_test_params{ CASE_CONV_FP16_2, 2, 3 }, + convolution_test_params{ CASE_CONV_FP16_3, 2, 3 }, + convolution_test_params{ CASE_CONV_FP16_4, 2, 3 }, + convolution_test_params{ CASE_CONV_FP16_10, 2, 3 }, +})); + +class conv_fp32_bias : public ConvFusingTest {}; +TEST_P(conv_fp32_bias, basic) { + auto p = GetParam(); + create_topologies( + input_layout("input", get_input_layout(p)), + data("weights", get_mem(get_weights_layout(p))), + data("bias", get_mem(get_bias_layout(p))), + convolution("conv_prim", "input", { "weights" }, std::vector{}, p.groups, p.stride, p.pad, p.dilation), + eltwise("add_bias", { "conv_prim", "bias" }, eltwise_mode::sum), + reorder("reorder_bfyx", "add_bias", p.default_format, data_types::f32) + ); + + tolerance = 1e-5f; + execute(p); +} + +INSTANTIATE_TEST_SUITE_P(fusings_gpu, conv_fp32_bias, ::testing::ValuesIn(std::vector{ + convolution_test_params{ CASE_CONV_FP32_1, 2, 3 }, + convolution_test_params{ CASE_CONV_FP32_2, 2, 3 }, + convolution_test_params{ CASE_CONV_FP32_3, 2, 3 }, + convolution_test_params{ CASE_CONV_FP32_4, 2, 3 }, + convolution_test_params{ CASE_CONV_FP32_10, 2, 3 }, + + convolution_test_params{ CASE_CONV_FP16_1, 2, 3 }, + convolution_test_params{ CASE_CONV_FP16_2, 2, 3 }, + convolution_test_params{ CASE_CONV_FP16_3, 2, 3 }, + convolution_test_params{ CASE_CONV_FP16_4, 2, 3 }, + convolution_test_params{ CASE_CONV_FP16_10, 2, 3 }, +})); + +class conv_fp32_double_bias : public ConvFusingTest {}; +TEST_P(conv_fp32_double_bias, basic) { + auto p = GetParam(); + create_topologies( + input_layout("input", get_input_layout(p)), + data("weights", get_mem(get_weights_layout(p))), + data("bias1", get_mem(get_bias_layout(p))), + data("bias2", get_mem(get_bias_layout(p))), + convolution("conv_prim", "input", { "weights" }, std::vector{}, p.groups, p.stride, p.pad, p.dilation), + eltwise("add_bias1", { "conv_prim", "bias1" }, eltwise_mode::sum), + eltwise("add_bias2", { "add_bias1", "bias2" }, eltwise_mode::sum), + 
reorder("reorder_bfyx", "add_bias2", p.default_format, data_types::f32) + ); + + tolerance = 1e-5f; + execute(p); +} + +INSTANTIATE_TEST_SUITE_P(fusings_gpu, conv_fp32_double_bias, ::testing::ValuesIn(std::vector{ + convolution_test_params{ CASE_CONV_U8S8_1, 2, 4 }, + convolution_test_params{ CASE_CONV_S8S8_1, 2, 4 }, +})); + +class conv_fp32_prelu_eltwise : public ConvFusingTest {}; +TEST_P(conv_fp32_prelu_eltwise, basic_sum) { + auto p = GetParam(); + create_topologies( + input_layout("input", get_input_layout(p)), + data("weights", get_mem(get_weights_layout(p))), + data("bias", get_mem(get_bias_layout(p))), + data("slope_data", get_mem(get_per_channel_layout(p))), + data("eltwise_data", get_mem(get_output_layout(p))), + convolution("conv_prim", "input", { "weights" }, { "bias" }, p.groups, p.stride, p.pad, p.dilation), + activation("activation", "conv_prim", "slope_data", activation_func::relu_negative_slope), + eltwise("eltwise", "activation", "eltwise_data", eltwise_mode::sum), + reorder("reorder_bfyx", "eltwise", p.default_format, data_types::f32) + ); + + tolerance = 1e-5f; + execute(p); +} + +TEST_P(conv_fp32_prelu_eltwise, basic_prod) { + auto p = GetParam(); + create_topologies( + input_layout("input", get_input_layout(p)), + data("weights", get_mem(get_weights_layout(p))), + data("bias", get_mem(get_bias_layout(p))), + data("slope_data", get_mem(get_per_channel_layout(p))), + data("eltwise_data", get_mem(get_output_layout(p))), + convolution("conv_prim", "input", { "weights" }, { "bias" }, p.groups, p.stride, p.pad, p.dilation), + activation("activation", "conv_prim", "slope_data", activation_func::relu_negative_slope), + eltwise("eltwise", "activation", "eltwise_data", eltwise_mode::prod), + reorder("reorder_bfyx", "eltwise", p.default_format, data_types::f32) + ); + + tolerance = 1e-5f; + execute(p); +} + +TEST_P(conv_fp32_prelu_eltwise, eltw_broadcast_sum) { + auto p = GetParam(); + tensor eltw_shape = p.default_format.spatial_num() == 2 ? 
tensor{ 1, 1, 1, 1 } : tensor{ 1, 1, 1, 1, 1 }; + create_topologies( + input_layout("input", get_input_layout(p)), + data("weights", get_mem(get_weights_layout(p))), + data("bias", get_mem(get_bias_layout(p))), + data("slope_data", get_mem(get_per_channel_layout(p))), + data("eltwise_data", get_mem(layout{ p.data_type, p.input_format, eltw_shape })), + convolution("conv_prim", "input", { "weights" }, { "bias" }, p.groups, p.stride, p.pad, p.dilation), + activation("activation", "conv_prim", "slope_data", activation_func::relu_negative_slope), + eltwise("eltwise", "activation", "eltwise_data", eltwise_mode::sum), + reorder("reorder_bfyx", "eltwise", p.default_format, data_types::f32) + ); + + tolerance = 1e-5f; + execute(p); +} + +TEST_P(conv_fp32_prelu_eltwise, eltw_broadcast_prod) { + auto p = GetParam(); + tensor eltw_shape = p.default_format.spatial_num() == 2 ? tensor{ 1, 1, 1, 1 } : tensor{ 1, 1, 1, 1, 1 }; + create_topologies( + input_layout("input", get_input_layout(p)), + data("weights", get_mem(get_weights_layout(p))), + data("bias", get_mem(get_bias_layout(p))), + data("slope_data", get_mem(get_per_channel_layout(p))), + data("eltwise_data", get_mem(layout{ p.data_type, p.input_format, eltw_shape })), + convolution("conv_prim", "input", { "weights" }, { "bias" }, p.groups, p.stride, p.pad, p.dilation), + activation("activation", "conv_prim", "slope_data", activation_func::relu_negative_slope), + eltwise("eltwise", "activation", "eltwise_data", eltwise_mode::prod), + reorder("reorder_bfyx", "eltwise", p.default_format, data_types::f32) + ); + + tolerance = 1e-5f; + execute(p); +} + +TEST_P(conv_fp32_prelu_eltwise, vector_ops) { + auto p = GetParam(); + create_topologies( + input_layout("input", get_input_layout(p)), + data("weights", get_mem(get_weights_layout(p))), + data("bias", get_mem(get_bias_layout(p))), + data("slope_data", get_mem(get_per_channel_layout(p))), + data("eltwise_data", get_mem(get_output_layout(p))), + convolution("conv_prim", "input", 
{ "weights" }, { "bias" }, p.groups, p.stride, p.pad, p.dilation), + activation("activation", "conv_prim", "slope_data", activation_func::relu_negative_slope), + eltwise("eltwise", "activation", "eltwise_data", eltwise_mode::sum), + reorder("reorder_bfyx", "eltwise", p.default_format, data_types::f32) + ); + + implementation_desc conv_impl = { format::b_fs_yx_fsv16, "" }; + bo_fused.set_option(build_option::force_implementations({ { "conv_prim", conv_impl } })); + + tolerance = 1e-5f; + execute(p); +} + +TEST_P(conv_fp32_prelu_eltwise, vector_ops_mixed_types) { + auto p = GetParam(); + auto slope_type = p.default_type == data_types::f32 ? data_types::f16 : data_types::f32; + create_topologies( + input_layout("input", get_input_layout(p)), + data("weights", get_mem(get_weights_layout(p))), + data("bias", get_mem(get_bias_layout(p))), + data("slope_data", get_mem(layout{ slope_type, p.default_format, tensor{ 1, p.out_shape.feature[0], 1, 1 } })), + data("eltwise_data", get_mem(get_output_layout(p))), + convolution("conv_prim", "input", { "weights" }, { "bias" }, p.groups, p.stride, p.pad, p.dilation), + activation("activation", "conv_prim", "slope_data", activation_func::relu_negative_slope), + eltwise("eltwise", "activation", "eltwise_data", eltwise_mode::sum), + reorder("reorder_bfyx", "eltwise", p.default_format, data_types::f32) + ); + + implementation_desc conv_impl = { format::b_fs_yx_fsv16, "" }; + bo_fused.set_option(build_option::force_implementations({ { "conv_prim", conv_impl } })); + + tolerance = 1e-5f; + execute(p); +} + +INSTANTIATE_TEST_SUITE_P(fusings_gpu, conv_fp32_prelu_eltwise, ::testing::ValuesIn(std::vector{ + // convolution_test_params{ CASE_CONV_FP32_1, 2, 4 }, + convolution_test_params{ CASE_CONV_FP32_2, 2, 4 }, + convolution_test_params{ CASE_CONV_FP32_3, 2, 4 }, + convolution_test_params{ CASE_CONV_FP32_4, 2, 4 }, + + // convolution_test_params{ CASE_CONV_FP32_1, 2, 4 }, + convolution_test_params{ CASE_CONV_FP16_2, 2, 4 }, + 
convolution_test_params{ CASE_CONV_FP16_3, 2, 4 }, + convolution_test_params{ CASE_CONV_FP16_4, 2, 4 }, +})); + +class conv_fp32_multi_eltwise_2 : public ConvFusingTest {}; +TEST_P(conv_fp32_multi_eltwise_2, basic) { + if (engine.get_device_info().supports_immad) { + return; + } + + auto p = GetParam(); + create_topologies( + input_layout("input", get_input_layout(p)), + data("eltwise_data", get_mem(get_output_layout(p))), + data("bias", get_mem(get_bias_layout(p))), + data("weights", get_mem(get_weights_layout(p))), + convolution("conv_prim", "input", { "weights" }, { "bias" }, p.groups, p.stride, p.pad, p.dilation), + eltwise("eltwise1", "conv_prim", "eltwise_data", eltwise_mode::sum), + eltwise("eltwise2", "eltwise1", "conv_prim", eltwise_mode::prod), + reorder("reorder_bfyx", "eltwise2", p.default_format, data_types::f32) + ); + implementation_desc conv_impl = { format::b_fs_yx_fsv16, "" }; + bo_fused.set_option(build_option::force_implementations({ { "conv_prim", conv_impl } })); + + tolerance = 1e-5f; + execute(p); +} + +INSTANTIATE_TEST_SUITE_P(fusings_gpu, conv_fp32_multi_eltwise_2, ::testing::ValuesIn(std::vector{ + convolution_test_params{ CASE_CONV_FP32_2, 2, 4 }, + convolution_test_params{ CASE_CONV_FP32_3, 2, 4 }, + convolution_test_params{ CASE_CONV_FP32_4, 2, 4 }, + + convolution_test_params{ CASE_CONV_FP16_2, 2, 4 }, + convolution_test_params{ CASE_CONV_FP16_3, 2, 4 }, + convolution_test_params{ CASE_CONV_FP16_4, 2, 4 }, +})); + + +class conv_fp32_multi_eltwise_2_clamp : public ConvFusingTest {}; +TEST_P(conv_fp32_multi_eltwise_2_clamp, basic) { + if (engine.get_device_info().supports_immad) { + return; + } + + auto p = GetParam(); + create_topologies( + input_layout("input", get_input_layout(p)), + data("eltwise1_data", get_mem(get_output_layout(p))), + data("bias", get_mem(get_bias_layout(p))), + data("weights", get_mem(get_weights_layout(p))), + convolution("conv_prim", "input", { "weights" }, { "bias" }, p.groups, p.stride, p.pad, p.dilation), + 
eltwise("eltwise1", "conv_prim", "eltwise1_data", eltwise_mode::sum), + activation("activation", "eltwise1", activation_func::clamp, { 0.5f, 2.5f }), + eltwise("eltwise2", "activation", "conv_prim", eltwise_mode::prod), + reorder("reorder_bfyx", "eltwise2", p.default_format, data_types::f32) + ); + implementation_desc conv_impl = { format::b_fs_yx_fsv16, "" }; + bo_fused.set_option(build_option::force_implementations({ { "conv_prim", conv_impl } })); + + tolerance = 1e-5f; + execute(p); +} + +INSTANTIATE_TEST_SUITE_P(fusings_gpu, conv_fp32_multi_eltwise_2_clamp, ::testing::ValuesIn(std::vector{ + convolution_test_params{ CASE_CONV_FP32_2, 2, 5 }, + convolution_test_params{ CASE_CONV_FP32_3, 2, 5 }, + convolution_test_params{ CASE_CONV_FP32_4, 2, 5 }, + + convolution_test_params{ CASE_CONV_FP16_2, 2, 5 }, + convolution_test_params{ CASE_CONV_FP16_3, 2, 5 }, + convolution_test_params{ CASE_CONV_FP16_4, 2, 5 }, +})); + + +class conv_fp32_multi_eltwise_4_clamp : public ConvFusingTest {}; +TEST_P(conv_fp32_multi_eltwise_4_clamp, basic) { + if (engine.get_device_info().supports_immad) { + return; + } + + auto p = GetParam(); + create_topologies( + input_layout("input", get_input_layout(p)), + data("eltwise1_data", get_mem(get_output_layout(p))), + data("eltwise2_data", get_mem(get_output_layout(p))), + data("eltwise4_data", get_mem(get_output_layout(p))), + data("bias", get_mem(get_bias_layout(p))), + data("weights", get_mem(get_weights_layout(p))), + convolution("conv_prim", "input", { "weights" }, { "bias" }, p.groups, p.stride, p.pad, p.dilation), + eltwise("eltwise1_add", "conv_prim", "eltwise1_data", eltwise_mode::sum), + activation("activation", "eltwise1_add", activation_func::clamp, { 0.5f, 2.5f }), + eltwise("eltwise2_mul", "activation", "conv_prim", eltwise_mode::prod), + eltwise("eltwise3_div", "eltwise2_mul", "eltwise2_data", eltwise_mode::prod), + eltwise("eltwise4_add", "eltwise3_div", "eltwise4_data", eltwise_mode::sum), + reorder("reorder_bfyx", 
"eltwise4_add", p.default_format, data_types::f32) + ); + implementation_desc conv_impl = { format::b_fs_yx_fsv16, "" }; + bo_fused.set_option(build_option::force_implementations({ { "conv_prim", conv_impl } })); + + tolerance = 1e-5f; + execute(p); +} + +INSTANTIATE_TEST_SUITE_P(fusings_gpu, conv_fp32_multi_eltwise_4_clamp, ::testing::ValuesIn(std::vector{ + convolution_test_params{ CASE_CONV_FP32_2, 2, 7 }, + convolution_test_params{ CASE_CONV_FP32_3, 2, 7 }, + convolution_test_params{ CASE_CONV_FP32_4, 2, 7 }, + + convolution_test_params{ CASE_CONV_FP16_2, 2, 7 }, + convolution_test_params{ CASE_CONV_FP16_3, 2, 7 }, + convolution_test_params{ CASE_CONV_FP16_4, 2, 7 }, +})); + +class conv_fp32_eltwise_fusing_extend_ops : public ConvFusingTest {}; +TEST_P(conv_fp32_eltwise_fusing_extend_ops, pattern01_simple_sub) { + if (engine.get_device_info().supports_immad) { + return; + } + + auto p = GetParam(); + create_topologies( + input_layout("input", get_input_layout(p)), + data("eltwise_data1", get_mem(get_output_layout(p))), + data("eltwise_data2", get_mem(get_output_layout(p))), + data("eltwise_data4", get_mem(get_output_layout(p))), + data("bias", get_mem(get_bias_layout(p))), + data("weights", get_mem(get_weights_layout(p))), + convolution("conv_prim", "input", { "weights" }, { "bias" }, p.groups, p.stride, p.pad, p.dilation), + eltwise("eltwise1_sum", "conv_prim", "eltwise_data1", eltwise_mode::sum), + eltwise("eltwise2_sub", "conv_prim", "eltwise_data2", eltwise_mode::sub), + eltwise("eltwise3_prod", "eltwise1_sum", "eltwise2_sub", eltwise_mode::prod), + eltwise("eltwise4_sum", "eltwise3_prod", "eltwise_data4", eltwise_mode::sum), + concatenation("concat", { "eltwise4_sum", "eltwise4_sum" }, cldnn::concatenation::along_f), + reorder("reorder_bfyx", "concat", p.default_format, data_types::f32) + ); + implementation_desc conv_impl = { format::b_fs_yx_fsv16, "" }; + bo_fused.set_option(build_option::force_implementations({ { "conv_prim", conv_impl } })); + + 
tolerance = 1e-5f; + execute(p); +} + +TEST_P(conv_fp32_eltwise_fusing_extend_ops, pattern02_sub_scale) { + if (engine.get_device_info().supports_immad) { + return; + } + + auto p = GetParam(); + create_topologies( + input_layout("input", get_input_layout(p)), + data("eltwise_data1", get_mem(get_output_layout(p))), + data("eltwise_data2", get_mem(get_output_layout(p))), + data("scale_data", get_mem(get_per_channel_layout(p), 1.0f/p.kernel.count())), + data("bias", get_mem(get_bias_layout(p))), + data("weights", get_mem(get_weights_layout(p))), + convolution("conv_prim", "input", { "weights" }, { "bias" }, p.groups, p.stride, p.pad, p.dilation), + eltwise("eltwise1_sum", "conv_prim", "eltwise_data1", eltwise_mode::sum), + eltwise("eltwise2_sub", "conv_prim", "eltwise1_sum", eltwise_mode::sub), + eltwise("eltwise3_prod", "eltwise2_sub", "eltwise_data2", eltwise_mode::prod), + scale("scale", "eltwise3_prod", "scale_data"), + concatenation("concat", { "scale", "scale" }, cldnn::concatenation::along_f), + reorder("reorder_bfyx", "concat", p.default_format, data_types::f32) + ); + implementation_desc conv_impl = { format::b_fs_yx_fsv16, "" }; + bo_fused.set_option(build_option::force_implementations({ { "conv_prim", conv_impl } })); + + tolerance = 1e-5f; + execute(p); +} + +TEST_P(conv_fp32_eltwise_fusing_extend_ops, pattern03_sub_div) { + if (engine.get_device_info().supports_immad) { + return; + } + + auto p = GetParam(); + create_topologies( + input_layout("input", get_input_layout(p)), + data("eltwise_data1", get_mem(get_output_layout(p))), + data("eltwise_data2", get_mem(get_output_layout(p), 1.0f)), + data("eltwise_data3", get_mem(get_output_layout(p))), + data("eltwise_data4", get_mem(get_output_layout(p))), + data("bias", get_mem(get_bias_layout(p))), + data("weights", get_mem(get_weights_layout(p))), + convolution("conv_prim", "input", { "weights" }, { "bias" }, p.groups, p.stride, p.pad, p.dilation), + eltwise("eltwise1_sum", "conv_prim", "eltwise_data1", 
eltwise_mode::sum), + eltwise("eltwise2_div", "eltwise1_sum", "eltwise_data2", eltwise_mode::div), + eltwise("eltwise3_prod", "eltwise2_div", "eltwise_data3", eltwise_mode::prod), + eltwise("eltwise4_sum", "eltwise3_prod", "eltwise_data4", eltwise_mode::sum), + concatenation("concat", { "eltwise4_sum", "eltwise4_sum" }, cldnn::concatenation::along_f), + reorder("reorder_bfyx", "concat", p.default_format, data_types::f32) + ); + implementation_desc conv_impl = { format::b_fs_yx_fsv16, "" }; + bo_fused.set_option(build_option::force_implementations({ { "conv_prim", conv_impl } })); + + tolerance = 1e-5f; + execute(p); +} + +INSTANTIATE_TEST_SUITE_P(fusings_gpu, conv_fp32_eltwise_fusing_extend_ops, ::testing::ValuesIn(std::vector{ + convolution_test_params{ CASE_CONV_FP32_2, 3, 7 }, + convolution_test_params{ CASE_CONV_FP32_3, 3, 7 }, + convolution_test_params{ CASE_CONV_FP32_4, 3, 7 }, + + convolution_test_params{ CASE_CONV_FP16_2, 3, 7 }, + convolution_test_params{ CASE_CONV_FP16_3, 3, 7 }, + convolution_test_params{ CASE_CONV_FP16_4, 3, 7 }, +})); + +class conv_fp32_eltwise_fusing_2conv : public ConvFusingTest {}; +TEST_P(conv_fp32_eltwise_fusing_2conv, basic) { + if (engine.get_device_info().supports_immad) { + return; + } + + auto p = GetParam(); + create_topologies( + input_layout("input", get_input_layout(p)), + data("bias0", get_mem(get_bias_layout(p))), + data("weights0", get_mem(get_weights_layout(p))), + data("bias", get_mem(get_bias_layout(p))), + data("weights", get_mem(get_weights_layout(p))), + convolution("conv_prim0", "input", { "weights0" }, { "bias0" }, p.groups, p.stride, p.pad, p.dilation), + convolution("conv_prim", "input", { "weights" }, { "bias" }, p.groups, p.stride, p.pad, p.dilation), + eltwise("eltwise1", "conv_prim0", "conv_prim", eltwise_mode::sum), + eltwise("eltwise2", "conv_prim0", "conv_prim", eltwise_mode::sum), + eltwise("eltwise3", "eltwise1", "eltwise2", eltwise_mode::prod), + concatenation("concat", { "eltwise3", "eltwise3" }, 
cldnn::concatenation::along_f), + reorder("reorder_bfyx", "concat", p.default_format, data_types::f32) + ); + implementation_desc conv_impl = { format::b_fs_yx_fsv16, "" }; + bo_fused.set_option(build_option::force_implementations({ { "conv_prim0", conv_impl }, { "conv_prim", conv_impl } })); + + tolerance = 1e-5f; + execute(p); +} + +INSTANTIATE_TEST_SUITE_P(fusings_gpu, conv_fp32_eltwise_fusing_2conv, ::testing::ValuesIn(std::vector{ + convolution_test_params{ CASE_CONV_FP32_2, 4, 7 }, + convolution_test_params{ CASE_CONV_FP32_3, 4, 7 }, + convolution_test_params{ CASE_CONV_FP32_4, 4, 7 }, + + convolution_test_params{ CASE_CONV_FP16_2, 4, 7 }, + convolution_test_params{ CASE_CONV_FP16_3, 4, 7 }, + convolution_test_params{ CASE_CONV_FP16_4, 4, 7 }, +})); + + +class conv_fp32_multi_eltwise_3_fusing : public ConvFusingTest {}; +TEST_P(conv_fp32_multi_eltwise_3_fusing, basic) { + if (engine.get_device_info().supports_immad) { + return; + } + + auto p = GetParam(); + create_topologies( + input_layout("input", get_input_layout(p)), + data("eltwise_data1", get_mem(get_output_layout(p))), + data("eltwise_data2", get_mem(get_output_layout(p))), + data("bias", get_mem(get_bias_layout(p))), + data("weights", get_mem(get_weights_layout(p))), + convolution("conv_prim", "input", { "weights" }, { "bias" }, p.groups, p.stride, p.pad, p.dilation), + eltwise("eltwise1", "conv_prim", "eltwise_data1", eltwise_mode::sum), + eltwise("eltwise2", "conv_prim", "eltwise_data2", eltwise_mode::sum), + eltwise("eltwise3", "eltwise1", "eltwise2", eltwise_mode::prod), + reorder("reorder_bfyx", "eltwise3", p.default_format, data_types::f32) + ); + implementation_desc conv_impl = { format::b_fs_yx_fsv16, "" }; + bo_fused.set_option(build_option::force_implementations({ { "conv_prim", conv_impl } })); + + tolerance = 1e-5f; + execute(p); +} + +INSTANTIATE_TEST_SUITE_P(fusings_gpu, conv_fp32_multi_eltwise_3_fusing, ::testing::ValuesIn(std::vector{ + convolution_test_params{ CASE_CONV_FP32_2, 2, 5 
}, + convolution_test_params{ CASE_CONV_FP32_3, 2, 5 }, + convolution_test_params{ CASE_CONV_FP32_4, 2, 5 }, + + convolution_test_params{ CASE_CONV_FP16_2, 2, 5 }, + convolution_test_params{ CASE_CONV_FP16_3, 2, 5 }, + convolution_test_params{ CASE_CONV_FP16_4, 2, 5 }, +})); + + + +class conv_fp32_multi_eltwise_quantization : public ConvFusingTest {}; +TEST_P(conv_fp32_multi_eltwise_quantization, basic) { + auto p = GetParam(); + create_topologies( + input_layout("input", get_input_layout(p)), + data("weights", get_mem(get_weights_layout(p))), + data("bias", get_mem(get_bias_layout(p))), + data("in_lo", get_mem(get_per_channel_layout(p), min_random, 0)), + data("in_hi", get_mem(get_per_channel_layout(p), 1, max_random)), + data("out_lo", get_mem(get_single_element_layout(p), -127)), + data("out_hi", get_mem(get_single_element_layout(p), 127)), + data("eltwise_data1", get_mem(get_output_layout(p))), + convolution("conv_prim", "input", { "weights" }, { "bias" }, p.groups, p.stride, p.pad, p.dilation), + quantize("quantize", "conv_prim", "in_lo", "in_hi", "out_lo", "out_hi", 255, data_types::i8), + eltwise("eltwise1", "conv_prim", "eltwise_data1", eltwise_mode::sum), + eltwise("eltwise2", "eltwise1", "quantize", eltwise_mode::prod), + reorder("reorder_bfyx", "eltwise2", p.default_format, data_types::f32) + ); + + tolerance = 1.f; + execute(p); +} + +INSTANTIATE_TEST_SUITE_P(fusings_gpu, conv_fp32_multi_eltwise_quantization, ::testing::ValuesIn(std::vector{ +// convolution_test_params{ CASE_CONV_FP32_2, 4, 5 }, + convolution_test_params{ CASE_CONV_FP32_4, 4, 5 }, + + convolution_test_params{ CASE_CONV_FP16_2, 4, 5 }, + convolution_test_params{ CASE_CONV_FP16_3, 4, 5 }, + convolution_test_params{ CASE_CONV_FP16_4, 4, 5 }, +})); + + +class conv_fp32_multi_eltwise_concat : public ConvFusingTest {}; +TEST_P(conv_fp32_multi_eltwise_concat, basic) { + auto p = GetParam(); + create_topologies( + input_layout("input", get_input_layout(p)), + data("eltwise_data1", 
get_mem(get_output_layout(p))), + data("eltwise_data2", get_mem(get_output_layout(p))), + data("bias", get_mem(get_bias_layout(p))), + data("weights", get_mem(get_weights_layout(p))), + convolution("conv_prim", "input", { "weights" }, { "bias" }, p.groups, p.stride, p.pad, p.dilation), + eltwise("eltwise1", "conv_prim", "eltwise_data1", eltwise_mode::sum), + eltwise("eltwise2", "conv_prim", "eltwise_data2", eltwise_mode::sum), + concatenation("concat", + { "eltwise1", "eltwise2" }, + concatenation::concatenation_axis::along_f, + data_types::i8, + "", + padding{ { 0, 0, 0, 0 }, 0 }), + reorder("reorder_bfyx", "concat", p.default_format, data_types::f32) + ); + implementation_desc conv_impl = { format::b_fs_yx_fsv16, "" }; + bo_fused.set_option(build_option::force_implementations({ { "conv_prim", conv_impl } })); + + tolerance = 1e-5f; + execute(p); +} + +INSTANTIATE_TEST_SUITE_P(fusings_gpu, conv_fp32_multi_eltwise_concat, ::testing::ValuesIn(std::vector{ + convolution_test_params{ CASE_CONV_FP32_2, 5, 5 }, + convolution_test_params{ CASE_CONV_FP32_3, 5, 5 }, + convolution_test_params{ CASE_CONV_FP32_4, 5, 5 }, + + convolution_test_params{ CASE_CONV_FP16_2, 5, 5 }, + convolution_test_params{ CASE_CONV_FP16_3, 5, 5 }, + convolution_test_params{ CASE_CONV_FP16_4, 5, 5 }, +})); + +class conv_fp32_eltwise_b_fs_zyx_fsv16 : public ConvFusingTest {}; + +TEST_P(conv_fp32_eltwise_b_fs_zyx_fsv16, vector_ops) { + auto p = GetParam(); + create_topologies( + input_layout("input", get_input_layout(p)), + data("weights", get_mem(get_weights_layout(p))), + data("bias", get_mem(get_bias_layout(p))), + data("eltwise_data", get_mem(get_output_layout(p))), + convolution("conv_prim", "input", { "weights" }, { "bias" }, p.groups, p.stride, p.pad, p.dilation), + eltwise("eltwise", "conv_prim", "eltwise_data", eltwise_mode::sum), + reorder("reorder_bfyx", "eltwise", p.default_format, data_types::f32) + ); + + implementation_desc conv_impl = { format::b_fs_zyx_fsv16, "" }; + 
bo_fused.set_option(build_option::force_implementations({ { "conv_prim", conv_impl } })); + + tolerance = 1e-5f; + execute(p); +} + +class conv_fp32_swish : public ConvFusingTest {}; +TEST_P(conv_fp32_swish, basic) { + auto p = GetParam(); + create_topologies( + input_layout("input", get_input_layout(p)), + data("weights", get_mem(get_weights_layout(p))), + data("bias", get_mem(get_bias_layout(p))), + convolution("conv_prim", "input", { "weights" }, { "bias" }, p.groups, p.stride, p.pad, p.dilation), + activation("sigmoid", "conv_prim", activation_func::logistic), + eltwise("mul", { "conv_prim", "sigmoid" }, eltwise_mode::prod), + reorder("reorder_bfyx", "mul", p.default_format, data_types::f32) + ); + + tolerance = 1e-5f; + execute(p); +} + +INSTANTIATE_TEST_SUITE_P(fusings_gpu, conv_fp32_swish, ::testing::ValuesIn(std::vector{ + // convolution_test_params{ CASE_CONV_FP32_1, 2, 4 }, + convolution_test_params{ CASE_CONV_FP32_2, 2, 4 }, + convolution_test_params{ CASE_CONV_FP32_3, 2, 4 }, + convolution_test_params{ CASE_CONV_FP32_4, 2, 4 }, + + // convolution_test_params{ CASE_CONV_FP32_1, 2, 4 }, + convolution_test_params{ CASE_CONV_FP16_2, 2, 4 }, + convolution_test_params{ CASE_CONV_FP16_3, 2, 4 }, + convolution_test_params{ CASE_CONV_FP16_4, 2, 4 }, +})); + +TEST_P(conv_fp32_eltwise_b_fs_zyx_fsv16, splitted_vector_ops) { + auto p = GetParam(); + + std::vector weights_idx; + for (size_t w = 0; w < p.groups; w++) { + create_topologies(data("weights" + std::to_string(w), get_mem(get_weights_layout(p, p.groups)))); + weights_idx.push_back(("weights" + std::to_string(w))); + } + + create_topologies( + input_layout("input", get_input_layout(p)), + data("eltwise_data", get_mem(get_output_layout(p))), + convolution("conv_prim", "input", weights_idx, {}, 1, p.stride, p.pad, p.dilation), + eltwise("eltwise", "conv_prim", "eltwise_data", eltwise_mode::sum), + reorder("reorder_bfyx", "eltwise", p.default_format, data_types::f32) + ); + + implementation_desc conv_impl = { 
format::b_fs_zyx_fsv16, "" }; + bo_fused.set_option(build_option::force_implementations({ { "conv_prim", conv_impl } })); + + tolerance = 1e-5f; + // commented because split mode is disabled + // execute(p); +} + +INSTANTIATE_TEST_SUITE_P(fusings_gpu, conv_fp32_eltwise_b_fs_zyx_fsv16, ::testing::ValuesIn(std::vector{ + convolution_test_params{ CASE_CONV_FP32_6, 2, 3 }, + convolution_test_params{ CASE_CONV_FP32_7, 2, 3 }, + convolution_test_params{ CASE_CONV_FP32_8, 2, 3 }, + convolution_test_params{ CASE_CONV_FP32_9, 2, 3 }, + convolution_test_params{ CASE_CONV_FP32_11, 2, 3 }, + convolution_test_params{ CASE_CONV_FP32_12, 2, 3 }, + // convolution_test_params{ CASE_CONV_FP32_13, 2, 3 }, - leads to mvn_scale_activation_quantize_i8_eltwise_fp32_quantize_i8.basic/11 test failure + + convolution_test_params{ CASE_CONV_FP16_6, 2, 3 }, + convolution_test_params{ CASE_CONV_FP16_7, 2, 3 }, + convolution_test_params{ CASE_CONV_FP16_8, 2, 3 }, + convolution_test_params{ CASE_CONV_FP16_9, 2, 3 }, + convolution_test_params{ CASE_CONV_FP16_11, 2, 3 }, + convolution_test_params{ CASE_CONV_FP16_12, 2, 3 }, +})); + +class conv_fp32_quantize_u8_first_conv : public ConvFusingTest {}; +TEST_P(conv_fp32_quantize_u8_first_conv, basic) { + auto p = GetParam(); + create_topologies( + input_layout("input", get_input_layout(p)), + data("weights", get_mem(get_weights_layout(p))), + data("bias", get_mem(get_bias_layout(p))), + data("in_lo", get_mem(get_per_channel_layout(p), min_random, 0)), + data("in_hi", get_mem(get_per_channel_layout(p), 1, max_random)), + data("out_lo", get_mem(get_single_element_layout(p), 0)), + data("out_hi", get_mem(get_single_element_layout(p), 255)), + reorder("reordered_input", "input", format::b_fs_yx_fsv16, p.data_type), + convolution("conv_prim", "reordered_input", { "weights" }, { "bias" }, p.groups, p.stride, p.pad, p.dilation), + quantize("quantize", "conv_prim", "in_lo", "in_hi", "out_lo", "out_hi", 256, data_types::u8), + reorder("reorder_bfyx", 
"quantize", p.default_format, data_types::f32) + ); + + tolerance = 1.0f; + execute(p); +} + +INSTANTIATE_TEST_SUITE_P(fusings_gpu, conv_fp32_quantize_u8_first_conv, ::testing::ValuesIn(std::vector{ + convolution_test_params{ CASE_CONV_FP32_14, 2, 3 }, +})); + +class conv_fp32_quantize_u8 : public ConvFusingTest {}; +TEST_P(conv_fp32_quantize_u8, basic) { + auto p = GetParam(); + create_topologies( + input_layout("input", get_input_layout(p)), + data("weights", get_mem(get_weights_layout(p))), + data("bias", get_mem(get_bias_layout(p))), + data("in_lo", get_mem(get_per_channel_layout(p), min_random, 0)), + data("in_hi", get_mem(get_per_channel_layout(p), 1, max_random)), + data("out_lo", get_mem(get_single_element_layout(p), 0)), + data("out_hi", get_mem(get_single_element_layout(p), 255)), + convolution("conv_prim", "input", { "weights" }, { "bias" }, p.groups, p.stride, p.pad, p.dilation), + quantize("quantize", "conv_prim", "in_lo", "in_hi", "out_lo", "out_hi", 256, data_types::u8), + reorder("reorder_bfyx", "quantize", p.default_format, data_types::f32) + ); + + tolerance = 1.0f; + execute(p); +} + +INSTANTIATE_TEST_SUITE_P(fusings_gpu, conv_fp32_quantize_u8, ::testing::ValuesIn(std::vector{ + // For now only b_fs_yx_fsv16 supports this case + convolution_test_params{ CASE_CONV_FP32_2, 2, 3 }, + convolution_test_params{ CASE_CONV_FP32_3, 2, 3 }, + + convolution_test_params{ CASE_CONV_FP16_2, 2, 3 }, + convolution_test_params{ CASE_CONV_FP16_3, 2, 3 }, +})); + +class conv_fp32_scale_quantize_i8 : public ConvFusingTest {}; +TEST_P(conv_fp32_scale_quantize_i8, basic) { + auto p = GetParam(); + create_topologies( + input_layout("input", get_input_layout(p)), + data("weights", get_mem(get_weights_layout(p))), + data("bias", get_mem(get_bias_layout(p))), + data("in_lo", get_mem(get_per_channel_layout(p), min_random, 0)), + data("in_hi", get_mem(get_per_channel_layout(p), 1, max_random)), + data("out_lo", get_mem(get_single_element_layout(p), -127)), + data("out_hi", 
get_mem(get_single_element_layout(p), 127)), + data("scale_data", get_mem(get_per_channel_layout(p), 1.0f/p.kernel.count()/255)), + convolution("conv_prim", "input", { "weights" }, { "bias" }, p.groups, p.stride, p.pad, p.dilation), + scale("scale", "conv_prim", "scale_data"), + quantize("quantize", "scale", "in_lo", "in_hi", "out_lo", "out_hi", 255, data_types::i8), + reorder("reorder_bfyx", "quantize", p.default_format, data_types::f32) + ); + // Output elements are in range [-127, 127] + // 1.0f difference is allowed, since quantize can return different values in ref and scale_shift kernels + // due to big error of division (in ref kernel). + tolerance = 1.0f; + execute(p); +} + +INSTANTIATE_TEST_SUITE_P(fusings_gpu, conv_fp32_scale_quantize_i8, ::testing::ValuesIn(std::vector{ + // For now only b_fs_yx_fsv16 supports this case + convolution_test_params{ CASE_CONV_FP32_2, 2, 4 }, + convolution_test_params{ CASE_CONV_FP32_3, 2, 4 }, + + convolution_test_params{ CASE_CONV_FP16_2, 2, 4 }, + convolution_test_params{ CASE_CONV_FP16_3, 2, 4 }, +})); + +class conv_fp32_scale_activation_quantize_i8 : public ConvFusingTest {}; +TEST_P(conv_fp32_scale_activation_quantize_i8, basic) { + auto p = GetParam(); + create_topologies( + input_layout("input", get_input_layout(p)), + data("weights", get_mem(get_weights_layout(p))), + data("bias", get_mem(get_bias_layout(p))), + data("in_lo", get_mem(get_per_channel_layout(p), min_random, 0)), + data("in_hi", get_mem(get_per_channel_layout(p), 1, max_random)), + data("out_lo", get_mem(get_single_element_layout(p), -127)), + data("out_hi", get_mem(get_single_element_layout(p), 127)), + data("scale_data", get_mem(get_per_channel_layout(p), 1.0f/p.kernel.count()/255)), + convolution("conv_prim", "input", { "weights" }, { "bias" }, p.groups, p.stride, p.pad, p.dilation), + scale("scale", "conv_prim", "scale_data"), + activation("activation_scale", "scale", activation_func::exp), + quantize("quantize", "activation_scale", "in_lo", 
"in_hi", "out_lo", "out_hi", 255, data_types::i8), + reorder("reorder_bfyx", "quantize", p.default_format, data_types::f32) + ); + + tolerance = 1.0f; + execute(p); +} + +INSTANTIATE_TEST_SUITE_P(fusings_gpu, conv_fp32_scale_activation_quantize_i8, ::testing::ValuesIn(std::vector{ + // For now only b_fs_yx_fsv16 supports this case + convolution_test_params{ CASE_CONV_FP32_2, 2, 5 }, + convolution_test_params{ CASE_CONV_FP32_3, 2, 5 }, + + convolution_test_params{ CASE_CONV_FP16_2, 2, 5 }, + convolution_test_params{ CASE_CONV_FP16_3, 2, 5 }, +})); + +class conv_fp32_scale_activation_quantize_u8_eltwise_fp32 : public ConvFusingTest {}; +TEST_P(conv_fp32_scale_activation_quantize_u8_eltwise_fp32, basic) { + auto p = GetParam(); + create_topologies( + input_layout("input", get_input_layout(p)), + data("weights", get_mem(get_weights_layout(p))), + data("bias", get_mem(get_bias_layout(p))), + data("in_lo", get_mem(get_per_channel_layout(p), 0)), + data("in_hi", get_mem(get_per_channel_layout(p), 1, max_random)), + data("out_lo", get_mem(get_single_element_layout(p), 0)), + data("out_hi", get_mem(get_single_element_layout(p), 255)), + data("scale_data", get_mem(get_per_channel_layout(p), 1.0f/p.kernel.count()/255)), + data("eltwise_data", get_mem(get_output_layout(p))), + convolution("conv_prim", "input", { "weights" }, { "bias" }, p.groups, p.stride, p.pad, p.dilation), + scale("scale", "conv_prim", "scale_data"), + activation("activation_scale", "scale", activation_func::exp), + quantize("quantize", "activation_scale", "in_lo", "in_hi", "out_lo", "out_hi", 256, data_types::u8), + eltwise("sum", { "quantize", "eltwise_data" }, eltwise_mode::sum, p.default_type), + reorder("reorder_bfyx", "sum", p.default_format, data_types::f32) + ); + + tolerance = 1.0f; + execute(p); +} + +INSTANTIATE_TEST_SUITE_P(fusings_gpu, conv_fp32_scale_activation_quantize_u8_eltwise_fp32, ::testing::ValuesIn(std::vector{ + // For now only b_fs_yx_fsv16 supports this case + 
convolution_test_params{ CASE_CONV_FP32_2, 2, 6 }, + convolution_test_params{ CASE_CONV_FP32_3, 2, 6 }, + + convolution_test_params{ CASE_CONV_FP16_2, 2, 6 }, + convolution_test_params{ CASE_CONV_FP16_3, 2, 6 }, +})); + +class conv_fp32_scale_activation_quantize_i8_activation : public ConvFusingTest {}; +TEST_P(conv_fp32_scale_activation_quantize_i8_activation, basic) { + auto p = GetParam(); + create_topologies( + input_layout("input", get_input_layout(p)), + data("weights", get_mem(get_weights_layout(p))), + data("bias", get_mem(get_bias_layout(p))), + data("in_lo", get_mem(get_per_channel_layout(p), min_random, 0)), + data("in_hi", get_mem(get_per_channel_layout(p), 1, max_random)), + data("out_lo", get_mem(get_single_element_layout(p), -127)), + data("out_hi", get_mem(get_single_element_layout(p), 127)), + data("scale_data", get_mem(get_per_channel_layout(p), 1.0f/p.kernel.count()/255)), + data("slope_data", get_mem(get_per_channel_layout(p))), + convolution("conv_prim", "input", { "weights" }, { "bias" }, p.groups, p.stride, p.pad, p.dilation), + scale("scale", "conv_prim", "scale_data"), + activation("activation_scale", "scale", "slope_data", activation_func::relu_negative_slope), + quantize("quantize", "activation_scale", "in_lo", "in_hi", "out_lo", "out_hi", 255, data_types::i8), + activation("activation_quantize", "quantize", activation_func::relu), + reorder("reorder_bfyx", "activation_quantize", p.default_format, data_types::f32) + ); + + tolerance = 1.0f; + execute(p); +} + +INSTANTIATE_TEST_SUITE_P(fusings_gpu, conv_fp32_scale_activation_quantize_i8_activation, ::testing::ValuesIn(std::vector{ + convolution_test_params{ CASE_CONV_FP32_2, 2, 6 }, + convolution_test_params{ CASE_CONV_FP32_3, 2, 6 }, + + convolution_test_params{ CASE_CONV_FP16_2, 2, 6 }, + convolution_test_params{ CASE_CONV_FP16_3, 2, 6 }, +})); + + +class conv_fp32_scale_activation_quantize_i8_eltwise_fp32_quantize_i8 : public ConvFusingTest {}; 
+TEST_P(conv_fp32_scale_activation_quantize_i8_eltwise_fp32_quantize_i8, basic) { + auto p = GetParam(); + create_topologies( + input_layout("input", get_input_layout(p)), + data("weights", get_mem(get_weights_layout(p))), + data("bias", get_mem(get_bias_layout(p))), + data("in_lo", get_mem(get_per_channel_layout(p), min_random, 0)), + data("in_lo1", get_mem(get_per_channel_layout(p), min_random, 0)), + data("in_hi", get_mem(get_per_channel_layout(p), 1, max_random)), + data("in_hi1", get_mem(get_per_channel_layout(p), 1, max_random)), + data("out_lo", get_mem(get_single_element_layout(p), -127)), + data("out_lo1", get_mem(get_single_element_layout(p), -127)), + data("out_hi", get_mem(get_single_element_layout(p), 127)), + data("out_hi1", get_mem(get_single_element_layout(p), 127)), + data("scale_data", get_mem(get_per_channel_layout(p), 1.0f/p.kernel.count()/255)), + data("eltwise_data", get_mem(layout{ data_types::i8, p.input_format, p.out_shape })), + convolution("conv_prim", "input", { "weights" }, { "bias" }, p.groups, p.stride, p.pad, p.dilation), + scale("scale", "conv_prim", "scale_data"), + activation("activation_scale", "scale", activation_func::exp), + quantize("quantize", "activation_scale", "in_lo", "in_hi", "out_lo", "out_hi", 255, data_types::i8), + eltwise("sum", { "quantize", "eltwise_data" }, eltwise_mode::sum, data_types::f32), + quantize("quantize_1", "sum", "in_lo1", "in_hi1", "out_lo1", "out_hi1", 255, data_types::i8), + reorder("reorder_bfyx", "quantize_1", p.default_format, data_types::f32) + ); + + tolerance = 1.f; + execute(p); +} + +INSTANTIATE_TEST_SUITE_P(fusings_gpu, conv_fp32_scale_activation_quantize_i8_eltwise_fp32_quantize_i8, ::testing::ValuesIn(std::vector{ + convolution_test_params{ CASE_CONV_FP32_2, 2, 7 }, + convolution_test_params{ CASE_CONV_FP32_3, 2, 7 }, +})); + +class conv_fp32_activation_eltwise_in_u8_fp32 : public ConvFusingTest {}; +TEST_P(conv_fp32_activation_eltwise_in_u8_fp32, basic) { + auto p = GetParam(); + 
create_topologies( + input_layout("input", get_input_layout(p)), + data("weights", get_mem(get_weights_layout(p))), + data("bias", get_mem(get_bias_layout(p))), + data("eltwise_data", get_mem(layout{ data_types::i8, p.input_format, p.out_shape })), + convolution("conv_prim", "input", { "weights" }, { "bias" }, p.groups, p.stride, p.pad, p.dilation), + activation("activation", "conv_prim", activation_func::relu_negative_slope), + eltwise("sum", { "activation", "eltwise_data" }, eltwise_mode::sum, data_types::f32), + reorder("reorder_bfyx", "sum", p.default_format, data_types::f32) + ); + + tolerance = 1.f; + execute(p); +} + +INSTANTIATE_TEST_SUITE_P(fusings_gpu, conv_fp32_activation_eltwise_in_u8_fp32, ::testing::ValuesIn(std::vector{ + // convolution_test_params{ CASE_CONV_FP32_1, 2, 4 }, - eltwise fusing not supported + convolution_test_params{ CASE_CONV_FP32_2, 2, 4 }, + convolution_test_params{ CASE_CONV_FP32_3, 2, 4 }, + convolution_test_params{ CASE_CONV_FP32_4, 2, 4 }, + // convolution_test_params{ CASE_CONV_FP32_5, 2, 4 }, - eltwise fusing not supported + convolution_test_params{ CASE_CONV_FP32_6, 2, 4 }, + convolution_test_params{ CASE_CONV_FP32_7, 2, 4 }, + // convolution_test_params{ CASE_CONV_FP32_8, 2, 4 }, - unknown bug + convolution_test_params{ CASE_CONV_FP32_9, 2, 4 }, + convolution_test_params{ CASE_CONV_FP32_10, 2, 4 }, +})); + +class conv_fp32_activation_eltwise_diff_sizes : public ConvEltwTest {}; +TEST_P(conv_fp32_activation_eltwise_diff_sizes, basic) { + auto p = GetParam(); + create_topologies( + input_layout("input", get_input_layout(p)), + data("weights", get_mem(get_weights_layout(p))), + data("bias", get_mem(get_bias_layout(p))), + data("eltwise_data", get_mem(layout{ p.data_type, p.input_format, p.eltw_shape })), + convolution("conv_prim", "input", { "weights" }, { "bias" }, p.groups, p.stride, p.pad, p.dilation), + activation("activation", "conv_prim", activation_func::relu_negative_slope), + eltwise("sum", { "activation", 
"eltwise_data" }, eltwise_mode::sum, data_types::f32), + reorder("reorder_bfyx", "sum", p.default_format, data_types::f32) + ); + + tolerance = 1e-5f; + execute(p); +} + +INSTANTIATE_TEST_SUITE_P(fusings_gpu, conv_fp32_activation_eltwise_diff_sizes, ::testing::ValuesIn(std::vector{ + conv_eltw_test_params{ CASE_CONV_ELTW_FP32_1, 2, 4 }, + conv_eltw_test_params{ CASE_CONV_ELTW_FP32_2, 2, 4 }, + conv_eltw_test_params{ CASE_CONV_ELTW_FP32_3, 2, 4 }, + conv_eltw_test_params{ CASE_CONV_ELTW_FP32_4, 2, 4 }, + conv_eltw_test_params{ CASE_CONV_ELTW_FP32_5, 2, 4 }, + conv_eltw_test_params{ CASE_CONV_ELTW_FP32_6, 2, 4 }, + conv_eltw_test_params{ CASE_CONV_ELTW_FP32_7, 3, 4 }, + conv_eltw_test_params{ CASE_CONV_ELTW_FP32_8, 3, 4 }, +})); + +class conv_scale_activation_eltwise_fp32_quantize_i8 : public ConvEltwTest {}; +TEST_P(conv_scale_activation_eltwise_fp32_quantize_i8, basic) { + auto p = GetParam(); + create_topologies( + input_layout("input", get_input_layout(p)), + data("weights", get_mem(get_weights_layout(p))), + data("bias", get_mem(get_bias_layout(p))), + convolution("conv", "input", { "weights" }, { "bias" }, p.groups, p.stride, p.pad, p.dilation), + data("scale_data", get_mem(get_per_channel_layout(p))), + scale("scale", "conv", "scale_data"), + activation("activation", "scale", activation_func::hyperbolic_tan), + data("eltwise_data", get_mem(layout{ p.data_type, p.input_format, p.eltw_shape })), + eltwise("eltw", { "activation", "eltwise_data" }, eltwise_mode::sum, data_types::f32), + data("in_low", get_mem(get_per_channel_layout(p), min_random, 0)), + data("in_high", get_mem(get_per_channel_layout(p), 1, max_random)), + data("out_low", get_mem(get_single_element_layout(p), -127, 127)), + data("out_high", get_mem(get_single_element_layout(p), -127, 127)), + quantize("quant", "eltw", "in_low", "in_high", "out_low", "out_high", 255, data_types::i8), + reorder("reorder_bfyx", "quant", p.default_format, data_types::f32) + ); + + tolerance = 1.f; + execute(p); +} + 
+INSTANTIATE_TEST_SUITE_P(fusings_gpu, conv_scale_activation_eltwise_fp32_quantize_i8, ::testing::ValuesIn(std::vector{ + conv_eltw_test_params{ CASE_CONV_ELTW_FP32_1, 2, 6 }, + conv_eltw_test_params{ CASE_CONV_ELTW_FP32_2, 2, 6 }, + conv_eltw_test_params{ CASE_CONV_ELTW_FP32_3, 2, 6 }, + conv_eltw_test_params{ CASE_CONV_ELTW_FP32_4, 2, 6 }, + conv_eltw_test_params{ CASE_CONV_ELTW_FP32_5, 3, 6 }, + conv_eltw_test_params{ CASE_CONV_ELTW_FP32_6, 3, 6 }, + conv_eltw_test_params{ CASE_CONV_ELTW_FP32_7, 3, 6 }, + conv_eltw_test_params{ CASE_CONV_ELTW_FP32_8, 3, 6 }, +})); + +/* ----------------------------------------------------------------------------------------------------- */ +/* ---------------------------------------- INT8 convolution cases ------------------------------------- */ +/* ----------------------------------------------------------------------------------------------------- */ + +class conv_int8_scale : public ConvFusingTest {}; +TEST_P(conv_int8_scale, basic) { + auto p = GetParam(); + create_topologies( + input_layout("input", get_input_layout(p)), + data("weights", get_mem(get_weights_layout(p))), + data("bias", get_mem(get_bias_layout(p))), + data("scale_data", get_mem(get_per_channel_layout(p), 1.0f/p.kernel.count())), + convolution("conv_prim", "input", { "weights" }, { "bias" }, p.groups, p.stride, p.pad, p.dilation), + scale("scale", "conv_prim", "scale_data"), + reorder("reorder_bfyx", "scale", p.default_format, data_types::f32) + ); + + tolerance = 1e-5f; + execute(p); +} + +TEST_P(conv_int8_scale, fp16_scale_out) { + auto p = GetParam(); + create_topologies( + input_layout("input", get_input_layout(p)), + data("weights", get_mem(get_weights_layout(p))), + data("bias", get_mem(get_bias_layout(p))), + data("scale_data", get_mem(get_per_channel_layout(p), 1.0f/p.kernel.count())), + convolution("conv_prim", "input", { "weights" }, { "bias" }, p.groups, p.stride, p.pad, p.dilation), + scale("scale", "conv_prim", "scale_data", optional_data_type{ 
data_types::f16 }), + reorder("reorder_bfyx", "scale", p.default_format, data_types::f32) + ); + + tolerance = 1e-5f; + execute(p); +} + +INSTANTIATE_TEST_SUITE_P(fusings_gpu, conv_int8_scale, ::testing::ValuesIn(std::vector{ + convolution_test_params{ CASE_CONV_U8S8_1, 2, 3 }, + convolution_test_params{ CASE_CONV_U8S8_2, 2, 3 }, + convolution_test_params{ CASE_CONV_U8S8_3, 2, 3 }, + convolution_test_params{ CASE_CONV_U8S8_4, 2, 3 }, + convolution_test_params{ CASE_CONV_S8S8_1, 2, 3 }, + convolution_test_params{ CASE_CONV_S8S8_2, 2, 3 }, + convolution_test_params{ CASE_CONV_S8S8_3, 2, 3 }, + convolution_test_params{ CASE_CONV_S8S8_4, 2, 3 }, + + convolution_test_params{ CASE_CONV3D_U8S8_1, 2, 3 }, + convolution_test_params{ CASE_CONV3D_U8S8_2, 2, 3 }, + convolution_test_params{ CASE_CONV3D_U8S8_3, 2, 3 }, + convolution_test_params{ CASE_CONV3D_U8S8_4, 2, 3 }, + convolution_test_params{ CASE_CONV3D_U8S8_5, 2, 3 }, + convolution_test_params{ CASE_CONV3D_S8S8_1, 2, 3 }, + convolution_test_params{ CASE_CONV3D_S8S8_2, 2, 3 }, + convolution_test_params{ CASE_CONV3D_S8S8_3, 2, 3 }, + convolution_test_params{ CASE_CONV3D_S8S8_4, 2, 3 }, + convolution_test_params{ CASE_CONV3D_S8S8_5, 2, 3 }, +})); + +class conv_int8_eltwise : public ConvFusingTest {}; +TEST_P(conv_int8_eltwise, fp16_eltwise_out) { + auto p = GetParam(); + create_topologies( + input_layout("input", get_input_layout(p)), + data("weights", get_mem(get_weights_layout(p))), + data("bias", get_mem(get_bias_layout(p))), + data("scale_data", get_mem(get_per_channel_layout(p), 1.0f/p.kernel.count())), + convolution("conv_prim", "input", { "weights" }, { "bias" }, p.groups, p.stride, p.pad, p.dilation), + eltwise("scale", { "conv_prim", "scale_data" }, eltwise_mode::prod, data_types::f16), + reorder("reorder_bfyx", "scale", p.default_format, data_types::f32) + ); + + tolerance = 1e-5f; + execute(p); +} + +INSTANTIATE_TEST_SUITE_P(fusings_gpu, conv_int8_eltwise, ::testing::ValuesIn(std::vector{ + 
convolution_test_params{ CASE_CONV_U8S8_1, 2, 3 }, + convolution_test_params{ CASE_CONV_U8S8_2, 2, 3 }, + convolution_test_params{ CASE_CONV_U8S8_3, 2, 3 }, + convolution_test_params{ CASE_CONV_U8S8_4, 2, 3 }, + convolution_test_params{ CASE_CONV_S8S8_1, 2, 3 }, + convolution_test_params{ CASE_CONV_S8S8_2, 2, 3 }, + convolution_test_params{ CASE_CONV_S8S8_3, 2, 3 }, + convolution_test_params{ CASE_CONV_S8S8_4, 2, 3 }, + + convolution_test_params{ CASE_CONV3D_U8S8_1, 2, 3 }, + convolution_test_params{ CASE_CONV3D_U8S8_2, 2, 3 }, + convolution_test_params{ CASE_CONV3D_U8S8_3, 2, 3 }, + convolution_test_params{ CASE_CONV3D_U8S8_4, 2, 3 }, + convolution_test_params{ CASE_CONV3D_U8S8_5, 2, 3 }, + convolution_test_params{ CASE_CONV3D_S8S8_1, 2, 3 }, + convolution_test_params{ CASE_CONV3D_S8S8_2, 2, 3 }, + convolution_test_params{ CASE_CONV3D_S8S8_3, 2, 3 }, + convolution_test_params{ CASE_CONV3D_S8S8_4, 2, 3 }, + convolution_test_params{ CASE_CONV3D_S8S8_5, 2, 3 }, +})); + +class conv_int8_scale_shift_swish : public ConvFusingTest {}; +TEST_P(conv_int8_scale_shift_swish, basic) { + auto p = GetParam(); + create_topologies( + input_layout("input", get_input_layout(p)), + data("weights", get_mem(get_weights_layout(p))), + data("bias", get_mem(get_bias_layout(p))), + data("scale_data", get_mem(get_per_channel_layout(p), 1.0f/p.kernel.count())), + data("shift_data", get_mem(get_per_channel_layout(p), 1)), + convolution("conv_prim", "input", { "weights" }, { "bias" }, p.groups, p.stride, p.pad, p.dilation), + eltwise("scale0", { "conv_prim", "scale_data" }, eltwise_mode::prod), + eltwise("scale1", { "conv_prim", "scale_data" }, eltwise_mode::prod), + eltwise("shift0", { "scale0", "shift_data" }, eltwise_mode::sum), + eltwise("shift1", { "scale1", "shift_data" }, eltwise_mode::sum), + activation("sigmoid", "shift0", activation_func::logistic), + eltwise("mul", { "shift1", "sigmoid" }, eltwise_mode::prod), + reorder("reorder_bfyx", "mul", p.default_format, data_types::f32) + ); 
+ + tolerance = 1e-3f; + execute(p); +} + +INSTANTIATE_TEST_SUITE_P(fusings_gpu, conv_int8_scale_shift_swish, ::testing::ValuesIn(std::vector{ + convolution_test_params{ CASE_CONV_U8S8_1, 2, 8 }, + convolution_test_params{ CASE_CONV_U8S8_2, 2, 8 }, + convolution_test_params{ CASE_CONV_U8S8_3, 2, 8 }, + convolution_test_params{ CASE_CONV_U8S8_4, 2, 8 }, + convolution_test_params{ CASE_CONV_S8S8_1, 2, 8 }, + convolution_test_params{ CASE_CONV_S8S8_2, 2, 8 }, + convolution_test_params{ CASE_CONV_S8S8_3, 2, 8 }, + convolution_test_params{ CASE_CONV_S8S8_4, 2, 8 }, + + convolution_test_params{ CASE_CONV3D_U8S8_1, 2, 8 }, + convolution_test_params{ CASE_CONV3D_U8S8_2, 2, 8 }, + convolution_test_params{ CASE_CONV3D_U8S8_3, 2, 8 }, + convolution_test_params{ CASE_CONV3D_U8S8_4, 2, 8 }, + convolution_test_params{ CASE_CONV3D_U8S8_5, 2, 8 }, + convolution_test_params{ CASE_CONV3D_S8S8_1, 2, 8 }, + convolution_test_params{ CASE_CONV3D_S8S8_2, 2, 8 }, + convolution_test_params{ CASE_CONV3D_S8S8_3, 2, 8 }, + convolution_test_params{ CASE_CONV3D_S8S8_4, 2, 8 }, + convolution_test_params{ CASE_CONV3D_S8S8_5, 2, 8 }, +})); + +class conv_int8_prelu_eltwise : public ConvFusingTest {}; +TEST_P(conv_int8_prelu_eltwise, basic) { + auto p = GetParam(); + create_topologies( + input_layout("input", get_input_layout(p)), + data("weights", get_mem(get_weights_layout(p))), + data("bias", get_mem(get_bias_layout(p))), + data("slope_data", get_mem(get_per_channel_layout(p))), + data("eltwise_data", get_mem(get_output_layout(p))), + convolution("conv_prim", "input", { "weights" }, { "bias" }, p.groups, p.stride, p.pad, p.dilation), + activation("activation", "conv_prim", "slope_data", activation_func::relu_negative_slope), + eltwise("eltwise", "activation", "eltwise_data", eltwise_mode::sum), + reorder("reorder_bfyx", "eltwise", p.default_format, data_types::f32) + ); + + tolerance = 1e-5f; + execute(p); +} + +TEST_P(conv_int8_prelu_eltwise, fsv16) { + auto p = GetParam(); + create_topologies( 
+ input_layout("input", get_input_layout(p)), + data("weights", get_mem(get_weights_layout(p))), + data("bias", get_mem(get_bias_layout(p))), + data("slope_data", get_mem(get_per_channel_layout(p))), + data("eltwise_data", get_mem(get_output_layout(p))), + convolution("conv_prim", "input", { "weights" }, { "bias" }, p.groups, p.stride, p.pad, p.dilation), + activation("activation", "conv_prim", "slope_data", activation_func::relu_negative_slope), + eltwise("eltwise", "activation", "eltwise_data", eltwise_mode::sum), + reorder("reorder_bfyx", "eltwise", p.default_format, data_types::f32) + ); + + if (p.default_format.dimension() == 4) { + implementation_desc conv_impl = { format::b_fs_yx_fsv16, "" }; + bo_fused.set_option(build_option::force_implementations({ { "conv_prim", conv_impl } })); + } else { + // TODO Add 5D int8 optimized convolution implementations + return; + } + + tolerance = 1e-5f; + execute(p); +} + +INSTANTIATE_TEST_SUITE_P(fusings_gpu, conv_int8_prelu_eltwise, ::testing::ValuesIn(std::vector{ + convolution_test_params{ CASE_CONV_U8S8_1, 2, 4 }, + convolution_test_params{ CASE_CONV_U8S8_2, 2, 4 }, + convolution_test_params{ CASE_CONV_U8S8_3, 2, 4 }, + convolution_test_params{ CASE_CONV_U8S8_4, 2, 4 }, + convolution_test_params{ CASE_CONV_U8S8_7, 2, 4 }, + convolution_test_params{ CASE_CONV_U8S8_8, 2, 4 }, + convolution_test_params{ CASE_CONV_S8S8_1, 2, 4 }, + convolution_test_params{ CASE_CONV_S8S8_2, 2, 4 }, + convolution_test_params{ CASE_CONV_S8S8_3, 2, 4 }, + convolution_test_params{ CASE_CONV_S8S8_4, 2, 4 }, + convolution_test_params{ CASE_CONV_S8S8_7, 2, 4 }, + convolution_test_params{ CASE_CONV_S8S8_8, 2, 4 }, + + convolution_test_params{ CASE_CONV3D_U8S8_1, 2, 4 }, + convolution_test_params{ CASE_CONV3D_U8S8_2, 2, 4 }, + convolution_test_params{ CASE_CONV3D_U8S8_3, 2, 4 }, + convolution_test_params{ CASE_CONV3D_U8S8_4, 2, 4 }, + convolution_test_params{ CASE_CONV3D_U8S8_5, 2, 4 }, + convolution_test_params{ CASE_CONV3D_S8S8_1, 2, 4 }, + 
convolution_test_params{ CASE_CONV3D_S8S8_2, 2, 4 }, + convolution_test_params{ CASE_CONV3D_S8S8_3, 2, 4 }, + convolution_test_params{ CASE_CONV3D_S8S8_4, 2, 4 }, + convolution_test_params{ CASE_CONV3D_S8S8_5, 2, 4 }, +})); + +class conv_int8_activation_eltwise_quantize : public ConvFusingTest {}; +TEST_P(conv_int8_activation_eltwise_quantize, fsv16) { + auto p = GetParam(); + create_topologies( + input_layout("input", get_input_layout(p)), + data("weights", get_mem(get_weights_layout(p))), + data("bias", get_mem(get_bias_layout(p))), + data("eltwise_data", get_mem(get_output_layout(p))), + data("in_lo", get_mem(get_per_channel_layout(p), min_random, 0)), + data("in_hi", get_mem(get_per_channel_layout(p), 1, max_random)), + data("out_lo", get_mem(get_single_element_layout(p), -127)), + data("out_hi", get_mem(get_single_element_layout(p), 127)), + convolution("conv_prim", "input", { "weights" }, { "bias" }, p.groups, p.stride, p.pad, p.dilation), + activation("activation", "conv_prim", activation_func::negative), + eltwise("eltwise", "activation", "eltwise_data", eltwise_mode::sum), + quantize("quantize", "eltwise", "in_lo", "in_hi", "out_lo", "out_hi", 255, data_types::i8), + reorder("reorder_bfyx", "quantize", p.default_format, data_types::f32) + ); + + if (p.default_format.dimension() == 4) { + implementation_desc conv_impl = { format::b_fs_yx_fsv16, "" }; + bo_fused.set_option(build_option::force_implementations({ { "conv_prim", conv_impl } })); + } else { + // TODO Add 5D int8 optimized convolution implementations + return; + } + + tolerance = 1.f; + execute(p); +} + +TEST_P(conv_int8_activation_eltwise_quantize, fsv32) { + auto p = GetParam(); + create_topologies( + input_layout("input", get_input_layout(p)), + data("weights", get_mem(get_weights_layout(p))), + data("bias", get_mem(get_bias_layout(p))), + data("eltwise_data", get_mem(get_output_layout(p))), + data("in_lo", get_mem(get_per_channel_layout(p), min_random, 0)), + data("in_hi", 
get_mem(get_per_channel_layout(p), 1, max_random)), + data("out_lo", get_mem(get_single_element_layout(p), -127)), + data("out_hi", get_mem(get_single_element_layout(p), 127)), + convolution("conv_prim", "input", { "weights" }, { "bias" }, p.groups, p.stride, p.pad, p.dilation), + activation("activation", "conv_prim", activation_func::negative), + eltwise("eltwise", "activation", "eltwise_data", eltwise_mode::sum), + quantize("quantize", "eltwise", "in_lo", "in_hi", "out_lo", "out_hi", 255, data_types::i8), + reorder("reorder_bfyx", "quantize", p.default_format, data_types::f32) + ); + + if (p.default_format.dimension() == 4) { + implementation_desc conv_impl = { format::b_fs_yx_fsv32, "" }; + bo_fused.set_option(build_option::force_implementations({ { "conv_prim", conv_impl } })); + } else { + // TODO Add 5D int8 optimized convolution implementations + return; + } + + tolerance = 1.f; + execute(p); +} + +INSTANTIATE_TEST_SUITE_P(fusings_gpu, conv_int8_activation_eltwise_quantize, ::testing::ValuesIn(std::vector{ + convolution_test_params{ CASE_CONV_U8S8_1, 2, 5 }, + convolution_test_params{ CASE_CONV_U8S8_2, 2, 5 }, + convolution_test_params{ CASE_CONV_U8S8_3, 2, 5 }, + convolution_test_params{ CASE_CONV_U8S8_4, 2, 5 }, + convolution_test_params{ CASE_CONV_U8S8_7, 2, 5 }, + convolution_test_params{ CASE_CONV_U8S8_8, 2, 5 }, + convolution_test_params{ CASE_CONV_S8S8_1, 2, 5 }, + convolution_test_params{ CASE_CONV_S8S8_2, 2, 5 }, + convolution_test_params{ CASE_CONV_S8S8_3, 2, 5 }, + convolution_test_params{ CASE_CONV_S8S8_4, 2, 5 }, + convolution_test_params{ CASE_CONV_S8S8_7, 2, 5 }, + convolution_test_params{ CASE_CONV_S8S8_8, 2, 5 }, +})); + +class conv_int8_activation_eltwise : public ConvFusingTest {}; +TEST_P(conv_int8_activation_eltwise, fsv16) { + auto p = GetParam(); + create_topologies( + input_layout("input", get_input_layout(p)), + data("weights", get_mem(get_weights_layout(p))), + data("bias", get_mem(get_bias_layout(p))), + data("eltwise_data", 
get_mem(get_output_layout(p))), + convolution("conv_prim", "input", { "weights" }, { "bias" }, p.groups, p.stride, p.pad, p.dilation), + activation("activation", "conv_prim", activation_func::negative), + eltwise("eltwise", "activation", "eltwise_data", eltwise_mode::sum), + reorder("reorder_bfyx", "eltwise", p.default_format, data_types::f32) + ); + + if (p.default_format.dimension() == 4) { + implementation_desc conv_impl = { format::b_fs_yx_fsv16, "" }; + bo_fused.set_option(build_option::force_implementations({ { "conv_prim", conv_impl } })); + } else { + // TODO Add 5D int8 optimized convolution implementations + return; + } + + tolerance = 1e-5f; + execute(p); +} + +TEST_P(conv_int8_activation_eltwise, fsv32) { + auto p = GetParam(); + create_topologies( + input_layout("input", get_input_layout(p)), + data("weights", get_mem(get_weights_layout(p))), + data("bias", get_mem(get_bias_layout(p))), + data("eltwise_data", get_mem(get_output_layout(p))), + convolution("conv_prim", "input", { "weights" }, { "bias" }, p.groups, p.stride, p.pad, p.dilation), + activation("activation", "conv_prim", activation_func::negative), + eltwise("eltwise", "activation", "eltwise_data", eltwise_mode::sum), + reorder("reorder_bfyx", "eltwise", p.default_format, data_types::f32) + ); + + if (p.default_format.dimension() == 4) { + implementation_desc conv_impl = { format::b_fs_yx_fsv32, "" }; + bo_fused.set_option(build_option::force_implementations({ { "conv_prim", conv_impl } })); + } else { + // TODO Add 5D int8 optimized convolution implementations + return; + } + + tolerance = 1e-5f; + execute(p); +} + +INSTANTIATE_TEST_SUITE_P(fusings_gpu, conv_int8_activation_eltwise, ::testing::ValuesIn(std::vector{ + convolution_test_params{ CASE_CONV_U8S8_1, 2, 4 }, + convolution_test_params{ CASE_CONV_U8S8_2, 2, 4 }, + convolution_test_params{ CASE_CONV_U8S8_3, 2, 4 }, + convolution_test_params{ CASE_CONV_U8S8_4, 2, 4 }, + convolution_test_params{ CASE_CONV_U8S8_7, 2, 4 }, + 
convolution_test_params{ CASE_CONV_U8S8_8, 2, 4 }, + convolution_test_params{ CASE_CONV_S8S8_1, 2, 4 }, + convolution_test_params{ CASE_CONV_S8S8_2, 2, 4 }, + convolution_test_params{ CASE_CONV_S8S8_3, 2, 4 }, + convolution_test_params{ CASE_CONV_S8S8_4, 2, 4 }, + convolution_test_params{ CASE_CONV_S8S8_7, 2, 4 }, + convolution_test_params{ CASE_CONV_S8S8_8, 2, 4 }, +})); + +class conv_int8_quantize_u8 : public ConvFusingTest {}; +TEST_P(conv_int8_quantize_u8, per_channel) { + auto p = GetParam(); + create_topologies( + input_layout("input", get_input_layout(p)), + data("weights", get_mem(get_weights_layout(p))), + data("bias", get_mem(get_bias_layout(p))), + data("in_lo", get_mem(get_per_channel_layout(p), min_random, 0)), + data("in_hi", get_mem(get_per_channel_layout(p), 1, max_random)), + data("out_lo", get_mem(get_single_element_layout(p), 0)), + data("out_hi", get_mem(get_single_element_layout(p), 255)), + convolution("conv_prim", "input", { "weights" }, { "bias" }, p.groups, p.stride, p.pad, p.dilation), + quantize("quantize", "conv_prim", "in_lo", "in_hi", "out_lo", "out_hi", 256, data_types::u8), + reorder("reorder_bfyx", "quantize", p.default_format, data_types::f32) + ); + + tolerance = 1.0f; + execute(p); +} + +TEST_P(conv_int8_quantize_u8, per_tensor) { + auto p = GetParam(); + create_topologies( + input_layout("input", get_input_layout(p)), + data("weights", get_mem(get_weights_layout(p))), + data("bias", get_mem(get_bias_layout(p))), + data("in_lo", get_mem(get_single_element_layout(p), -10)), + data("in_hi", get_mem(get_single_element_layout(p), 10)), + data("out_lo", get_mem(get_single_element_layout(p), 0)), + data("out_hi", get_mem(get_single_element_layout(p), 255)), + convolution("conv_prim", "input", { "weights" }, { "bias" }, p.groups, p.stride, p.pad, p.dilation), + quantize("quantize", "conv_prim", "in_lo", "in_hi", "out_lo", "out_hi", 256, data_types::u8), + reorder("reorder_bfyx", "quantize", p.default_format, data_types::f32) + ); + + 
tolerance = 1.0f; + execute(p); +} + +INSTANTIATE_TEST_SUITE_P(fusings_gpu, conv_int8_quantize_u8, ::testing::ValuesIn(std::vector{ + convolution_test_params{ CASE_CONV_U8S8_1, 2, 3 }, + convolution_test_params{ CASE_CONV_U8S8_2, 2, 3 }, + convolution_test_params{ CASE_CONV_U8S8_3, 2, 3 }, + convolution_test_params{ CASE_CONV_U8S8_4, 2, 3 }, + convolution_test_params{ CASE_CONV_S8S8_1, 2, 3 }, + convolution_test_params{ CASE_CONV_S8S8_2, 2, 3 }, + convolution_test_params{ CASE_CONV_S8S8_3, 2, 3 }, + convolution_test_params{ CASE_CONV_S8S8_4, 2, 3 }, + convolution_test_params{ CASE_CONV_S8S8_8, 2, 3 }, + + convolution_test_params{ CASE_CONV3D_U8S8_1, 2, 3 }, + convolution_test_params{ CASE_CONV3D_U8S8_2, 2, 3 }, + convolution_test_params{ CASE_CONV3D_U8S8_3, 2, 3 }, + convolution_test_params{ CASE_CONV3D_U8S8_4, 2, 3 }, + convolution_test_params{ CASE_CONV3D_U8S8_5, 2, 3 }, + convolution_test_params{ CASE_CONV3D_S8S8_1, 2, 3 }, + convolution_test_params{ CASE_CONV3D_S8S8_2, 2, 3 }, + convolution_test_params{ CASE_CONV3D_S8S8_3, 2, 3 }, + convolution_test_params{ CASE_CONV3D_S8S8_4, 2, 3 }, + convolution_test_params{ CASE_CONV3D_S8S8_5, 2, 3 }, +})); + +class conv_int8_scale_quantize_i8 : public ConvFusingTest {}; +TEST_P(conv_int8_scale_quantize_i8, basic) { + auto p = GetParam(); + create_topologies( + input_layout("input", get_input_layout(p)), + data("weights", get_mem(get_weights_layout(p))), + data("bias", get_mem(get_bias_layout(p))), + data("in_lo", get_mem(get_per_channel_layout(p), min_random, 0)), + data("in_hi", get_mem(get_per_channel_layout(p), 1, max_random)), + data("out_lo", get_mem(get_single_element_layout(p), -127)), + data("out_hi", get_mem(get_single_element_layout(p), 127)), + data("scale_data", get_mem(get_per_channel_layout(p), 1.0f/p.kernel.count()/255)), + convolution("conv_prim", "input", { "weights" }, { "bias" }, p.groups, p.stride, p.pad, p.dilation), + scale("scale", "conv_prim", "scale_data"), + quantize("quantize", "scale", "in_lo", 
"in_hi", "out_lo", "out_hi", 255, data_types::i8), + reorder("reorder_bfyx", "quantize", p.default_format, data_types::f32) + ); + // Output elements are in range [-127, 127] + // 1.0f difference is allowed, since quantize can return different values in ref and scale_shift kernels + // due to big error of division (in ref kernel). + tolerance = 1.0f; + execute(p); +} + +INSTANTIATE_TEST_SUITE_P(fusings_gpu, conv_int8_scale_quantize_i8, ::testing::ValuesIn(std::vector{ + convolution_test_params{ CASE_CONV_U8S8_1, 2, 4 }, + convolution_test_params{ CASE_CONV_U8S8_2, 2, 4 }, + convolution_test_params{ CASE_CONV_U8S8_3, 2, 4 }, + convolution_test_params{ CASE_CONV_U8S8_4, 2, 4 }, + convolution_test_params{ CASE_CONV_U8S8_9, 2, 4 }, + convolution_test_params{ CASE_CONV_U8S8_10, 2, 4 }, + convolution_test_params{ CASE_CONV_S8S8_1, 2, 4 }, + convolution_test_params{ CASE_CONV_S8S8_2, 2, 4 }, + convolution_test_params{ CASE_CONV_S8S8_3, 2, 4 }, + convolution_test_params{ CASE_CONV_S8S8_4, 2, 4 }, + convolution_test_params{ CASE_CONV_S8S8_9, 2, 4 }, + convolution_test_params{ CASE_CONV_S8S8_10, 2, 4 }, + + convolution_test_params{ CASE_CONV3D_U8S8_1, 2, 4 }, + convolution_test_params{ CASE_CONV3D_U8S8_2, 2, 4 }, + convolution_test_params{ CASE_CONV3D_U8S8_3, 2, 4 }, + convolution_test_params{ CASE_CONV3D_U8S8_4, 2, 4 }, + convolution_test_params{ CASE_CONV3D_U8S8_5, 2, 4 }, + convolution_test_params{ CASE_CONV3D_S8S8_1, 2, 4 }, + convolution_test_params{ CASE_CONV3D_S8S8_2, 2, 4 }, + convolution_test_params{ CASE_CONV3D_S8S8_3, 2, 4 }, + convolution_test_params{ CASE_CONV3D_S8S8_4, 2, 4 }, + convolution_test_params{ CASE_CONV3D_S8S8_5, 2, 4 }, +})); + +class conv_int8_scale_quantize_i8_conv_b_fs_yx_fsv4_int8 : public ConvFusingTest {}; +TEST_P(conv_int8_scale_quantize_i8_conv_b_fs_yx_fsv4_int8, basic) { + auto p = GetParam(); + create_topologies( + input_layout("input", get_input_layout(p)), + data("weights", get_mem(get_weights_layout(p))), + data("bias", 
get_mem(get_bias_layout(p))), + data("in_lo", get_mem(get_per_channel_layout(p), min_random, 0)), + data("in_hi", get_mem(get_per_channel_layout(p), 1, max_random)), + data("out_lo", get_mem(get_single_element_layout(p), -127)), + data("out_hi", get_mem(get_single_element_layout(p), 127)), + data("scale_data", get_mem(get_per_channel_layout(p), 1.0f / p.kernel.count() / 255)), + convolution("conv_prim", "input", { "weights" }, { "bias" }, p.groups, p.stride, p.pad, p.dilation), + scale("scale", "conv_prim", "scale_data"), + quantize("quantize", "scale", "in_lo", "in_hi", "out_lo", "out_hi", 255, data_types::i8), + reorder("reorder_bfyx", "quantize", p.default_format, data_types::f32) + ); + implementation_desc conv_impl = { format::b_fs_yx_fsv4, "convolution_gpu_b_fs_yx_fsv4_int8" }; + bo_fused.set_option(build_option::force_implementations({ { "conv_prim", conv_impl } })); + + tolerance = 1.0f; + execute(p); +} + +INSTANTIATE_TEST_SUITE_P(fusings_gpu, conv_int8_scale_quantize_i8_conv_b_fs_yx_fsv4_int8, ::testing::ValuesIn(std::vector{ + convolution_test_params{ CASE_CONV_S8S8_11, 2, 4 }, +})); + +class conv_int8_relu_quantize : public ConvFusingTest {}; +TEST_P(conv_int8_relu_quantize, i8) { + auto p = GetParam(); + create_topologies( + input_layout("input", get_input_layout(p)), + data("weights", get_mem(get_weights_layout(p))), + data("bias", get_mem(get_bias_layout(p))), + data("in_lo", get_mem(get_per_channel_layout(p), min_random, 0)), + data("in_hi", get_mem(get_per_channel_layout(p), 1, max_random)), + data("out_lo", get_mem(get_single_element_layout(p), -127)), + data("out_hi", get_mem(get_single_element_layout(p), 127)), + convolution("conv_prim", "input", { "weights" }, { "bias" }, p.groups, p.stride, p.pad, p.dilation), + activation("relu", "conv_prim", activation_func::relu), + quantize("quantize", "relu", "in_lo", "in_hi", "out_lo", "out_hi", 256, data_types::i8), + reorder("reorder_bfyx", "quantize", p.default_format, data_types::f32) + ); + // 
Output elements are in range [-127, 127] + // 1.0f difference is allowed, since quantize can return different values in ref and scale_shift kernels + // due to big error of division (in ref kernel). + tolerance = 1.0f; + execute(p); +} + +TEST_P(conv_int8_relu_quantize, u8) { + auto p = GetParam(); + create_topologies( + input_layout("input", get_input_layout(p)), + data("weights", get_mem(get_weights_layout(p))), + data("bias", get_mem(get_bias_layout(p))), + data("in_lo", get_mem(get_per_channel_layout(p), 0)), + data("in_hi", get_mem(get_per_channel_layout(p), 1, max_random)), + data("out_lo", get_mem(get_single_element_layout(p), 0)), + data("out_hi", get_mem(get_single_element_layout(p), 255)), + convolution("conv_prim", "input", { "weights" }, { "bias" }, p.groups, p.stride, p.pad, p.dilation), + activation("relu", "conv_prim", activation_func::relu), + quantize("quantize", "relu", "in_lo", "in_hi", "out_lo", "out_hi", 256, data_types::u8), + reorder("reorder_bfyx", "quantize", p.default_format, data_types::f32) + ); + + tolerance = 1.0f; + execute(p); +} + +INSTANTIATE_TEST_SUITE_P(fusings_gpu, conv_int8_relu_quantize, ::testing::ValuesIn(std::vector{ + convolution_test_params{ CASE_CONV_U8S8_1, 2, 4 }, + convolution_test_params{ CASE_CONV_U8S8_2, 2, 4 }, + convolution_test_params{ CASE_CONV_U8S8_3, 2, 4 }, + convolution_test_params{ CASE_CONV_U8S8_4, 2, 4 }, + convolution_test_params{ CASE_CONV_S8S8_1, 2, 4 }, + convolution_test_params{ CASE_CONV_S8S8_2, 2, 4 }, + convolution_test_params{ CASE_CONV_S8S8_3, 2, 4 }, + convolution_test_params{ CASE_CONV_S8S8_4, 2, 4 }, + + convolution_test_params{ CASE_CONV3D_U8S8_1, 2, 4 }, + convolution_test_params{ CASE_CONV3D_U8S8_2, 2, 4 }, + convolution_test_params{ CASE_CONV3D_U8S8_3, 2, 4 }, + convolution_test_params{ CASE_CONV3D_U8S8_4, 2, 4 }, + convolution_test_params{ CASE_CONV3D_U8S8_5, 2, 4 }, + convolution_test_params{ CASE_CONV3D_S8S8_1, 2, 4 }, + convolution_test_params{ CASE_CONV3D_S8S8_2, 2, 4 }, + 
convolution_test_params{ CASE_CONV3D_S8S8_3, 2, 4 }, + convolution_test_params{ CASE_CONV3D_S8S8_4, 2, 4 }, + convolution_test_params{ CASE_CONV3D_S8S8_5, 2, 4 }, +})); + +class conv_int8_scale_activation_quantize_i8 : public ConvFusingTest {}; +TEST_P(conv_int8_scale_activation_quantize_i8, basic) { + auto p = GetParam(); + create_topologies( + input_layout("input", get_input_layout(p)), + data("weights", get_mem(get_weights_layout(p))), + data("bias", get_mem(get_bias_layout(p))), + data("in_lo", get_mem(get_per_channel_layout(p), min_random, 0)), + data("in_hi", get_mem(get_per_channel_layout(p), 1, max_random)), + data("out_lo", get_mem(get_single_element_layout(p), -127)), + data("out_hi", get_mem(get_single_element_layout(p), 127)), + data("scale_data", get_mem(get_per_channel_layout(p), 1.0f/p.kernel.count()/255)), + convolution("conv_prim", "input", { "weights" }, { "bias" }, p.groups, p.stride, p.pad, p.dilation), + scale("scale", "conv_prim", "scale_data"), + activation("activation_scale", "scale", activation_func::exp), + quantize("quantize", "activation_scale", "in_lo", "in_hi", "out_lo", "out_hi", 255, data_types::i8), + reorder("reorder_bfyx", "quantize", p.default_format, data_types::f32) + ); + + tolerance = 2.f; + execute(p); +} + +INSTANTIATE_TEST_SUITE_P(fusings_gpu, conv_int8_scale_activation_quantize_i8, ::testing::ValuesIn(std::vector{ + convolution_test_params{ CASE_CONV_U8S8_1, 2, 5 }, + convolution_test_params{ CASE_CONV_U8S8_2, 2, 5 }, + convolution_test_params{ CASE_CONV_U8S8_3, 2, 5 }, + convolution_test_params{ CASE_CONV_U8S8_4, 2, 5 }, + convolution_test_params{ CASE_CONV_S8S8_1, 2, 5 }, + convolution_test_params{ CASE_CONV_S8S8_2, 2, 5 }, + convolution_test_params{ CASE_CONV_S8S8_3, 2, 5 }, + convolution_test_params{ CASE_CONV_S8S8_4, 2, 5 }, + + convolution_test_params{ CASE_CONV3D_U8S8_1, 2, 5 }, + convolution_test_params{ CASE_CONV3D_U8S8_2, 2, 5 }, + convolution_test_params{ CASE_CONV3D_U8S8_3, 2, 5 }, + convolution_test_params{ 
CASE_CONV3D_U8S8_4, 2, 5 }, + convolution_test_params{ CASE_CONV3D_U8S8_5, 2, 5 }, + convolution_test_params{ CASE_CONV3D_S8S8_1, 2, 5 }, + convolution_test_params{ CASE_CONV3D_S8S8_2, 2, 5 }, + convolution_test_params{ CASE_CONV3D_S8S8_3, 2, 5 }, + convolution_test_params{ CASE_CONV3D_S8S8_4, 2, 5 }, + convolution_test_params{ CASE_CONV3D_S8S8_5, 2, 5 }, +})); + +class conv_int8_scale_activation_quantize_i8_eltwise_fp32 : public ConvFusingTest {}; +TEST_P(conv_int8_scale_activation_quantize_i8_eltwise_fp32, basic) { + auto p = GetParam(); + create_topologies( + input_layout("input", get_input_layout(p)), + data("weights", get_mem(get_weights_layout(p))), + data("bias", get_mem(get_bias_layout(p))), + data("in_lo", get_mem(get_per_channel_layout(p), min_random, 0)), + data("in_hi", get_mem(get_per_channel_layout(p), 1, max_random)), + data("out_lo", get_mem(get_single_element_layout(p), -127)), + data("out_hi", get_mem(get_single_element_layout(p), 127)), + data("scale_data", get_mem(get_per_channel_layout(p), 1.0f/p.kernel.count()/255)), + data("eltwise_data", get_mem(get_output_layout(p))), + convolution("conv_prim", "input", { "weights" }, { "bias" }, p.groups, p.stride, p.pad, p.dilation), + scale("scale", "conv_prim", "scale_data"), + activation("activation_scale", "scale", activation_func::exp), + quantize("quantize", "activation_scale", "in_lo", "in_hi", "out_lo", "out_hi", 255, data_types::i8), + eltwise("sum", { "quantize", "eltwise_data" }, eltwise_mode::sum, data_types::f32), + reorder("reorder_bfyx", "sum", p.default_format, data_types::f32) + ); + + tolerance = 1.f; + execute(p); +} + +INSTANTIATE_TEST_SUITE_P(fusings_gpu, conv_int8_scale_activation_quantize_i8_eltwise_fp32, ::testing::ValuesIn(std::vector{ + convolution_test_params{ CASE_CONV_U8S8_1, 2, 6 }, + convolution_test_params{ CASE_CONV_U8S8_2, 2, 6 }, + convolution_test_params{ CASE_CONV_U8S8_3, 2, 6 }, + convolution_test_params{ CASE_CONV_U8S8_4, 2, 6 }, + convolution_test_params{ 
CASE_CONV_S8S8_1, 2, 6 }, + convolution_test_params{ CASE_CONV_S8S8_2, 2, 6 }, + convolution_test_params{ CASE_CONV_S8S8_3, 2, 6 }, + convolution_test_params{ CASE_CONV_S8S8_4, 2, 6 }, + + convolution_test_params{ CASE_CONV3D_U8S8_1, 2, 6 }, + convolution_test_params{ CASE_CONV3D_U8S8_2, 2, 6 }, + convolution_test_params{ CASE_CONV3D_U8S8_3, 2, 6 }, + convolution_test_params{ CASE_CONV3D_U8S8_4, 2, 6 }, + convolution_test_params{ CASE_CONV3D_U8S8_5, 2, 6 }, + convolution_test_params{ CASE_CONV3D_S8S8_1, 2, 6 }, + convolution_test_params{ CASE_CONV3D_S8S8_2, 2, 6 }, + convolution_test_params{ CASE_CONV3D_S8S8_3, 2, 6 }, + convolution_test_params{ CASE_CONV3D_S8S8_4, 2, 6 }, + convolution_test_params{ CASE_CONV3D_S8S8_5, 2, 6 }, +})); + +class conv_int8_scale_activation_quantize_i8_activation : public ConvFusingTest {}; +TEST_P(conv_int8_scale_activation_quantize_i8_activation, basic) { + auto p = GetParam(); + create_topologies( + input_layout("input", get_input_layout(p)), + data("weights", get_mem(get_weights_layout(p))), + data("bias", get_mem(get_bias_layout(p))), + data("in_lo", get_mem(get_per_channel_layout(p), min_random, 0)), + data("in_hi", get_mem(get_per_channel_layout(p), 1, max_random)), + data("out_lo", get_mem(get_single_element_layout(p), -127)), + data("out_hi", get_mem(get_single_element_layout(p), 127)), + data("scale_data", get_mem(get_per_channel_layout(p), 1.0f/p.kernel.count()/255)), + data("slope_data", get_mem(get_per_channel_layout(p))), + convolution("conv_prim", "input", { "weights" }, { "bias" }, p.groups, p.stride, p.pad, p.dilation), + scale("scale", "conv_prim", "scale_data"), + activation("activation_scale", "scale", "slope_data", activation_func::relu_negative_slope), + quantize("quantize", "activation_scale", "in_lo", "in_hi", "out_lo", "out_hi", 255, data_types::i8), + activation("activation_quantize", "quantize", activation_func::relu), + reorder("reorder_bfyx", "activation_quantize", p.default_format, data_types::f32) + ); + + 
tolerance = 1.f; + execute(p); +} + +INSTANTIATE_TEST_SUITE_P(fusings_gpu, conv_int8_scale_activation_quantize_i8_activation, ::testing::ValuesIn(std::vector{ + convolution_test_params{ CASE_CONV_U8S8_1, 2, 6 }, + convolution_test_params{ CASE_CONV_U8S8_2, 2, 6 }, + convolution_test_params{ CASE_CONV_U8S8_3, 2, 6 }, + convolution_test_params{ CASE_CONV_U8S8_4, 2, 6 }, + convolution_test_params{ CASE_CONV_S8S8_1, 2, 6 }, + convolution_test_params{ CASE_CONV_S8S8_2, 2, 6 }, + convolution_test_params{ CASE_CONV_S8S8_3, 2, 6 }, + convolution_test_params{ CASE_CONV_S8S8_4, 2, 6 }, + + convolution_test_params{ CASE_CONV3D_U8S8_1, 2, 6 }, + convolution_test_params{ CASE_CONV3D_U8S8_2, 2, 6 }, + convolution_test_params{ CASE_CONV3D_U8S8_3, 2, 6 }, + convolution_test_params{ CASE_CONV3D_U8S8_4, 2, 6 }, + convolution_test_params{ CASE_CONV3D_U8S8_5, 2, 6 }, + convolution_test_params{ CASE_CONV3D_S8S8_1, 2, 6 }, + convolution_test_params{ CASE_CONV3D_S8S8_2, 2, 6 }, + convolution_test_params{ CASE_CONV3D_S8S8_3, 2, 6 }, + convolution_test_params{ CASE_CONV3D_S8S8_4, 2, 6 }, + convolution_test_params{ CASE_CONV3D_S8S8_5, 2, 6 }, +})); + + +class conv_int8_scale_activation_quantize_i8_eltwise_fp32_quantize_i8 : public ConvFusingTest {}; +// With some input values accuracy error might be = 2, so the test is disabled. 
+TEST_P(conv_int8_scale_activation_quantize_i8_eltwise_fp32_quantize_i8, DISABLED_basic) { + auto p = GetParam(); + create_topologies( + input_layout("input", get_input_layout(p)), + data("weights", get_mem(get_weights_layout(p))), + data("bias", get_mem(get_bias_layout(p))), + data("in_lo", get_mem(get_per_channel_layout(p), min_random, 0)), + data("in_lo1", get_mem(get_per_channel_layout(p), min_random, 0)), + data("in_hi", get_mem(get_per_channel_layout(p), 1, max_random)), + data("in_hi1", get_mem(get_per_channel_layout(p), 1, max_random)), + data("out_lo", get_mem(get_single_element_layout(p), -127)), + data("out_lo1", get_mem(get_single_element_layout(p), -127)), + data("out_hi", get_mem(get_single_element_layout(p), 127)), + data("out_hi1", get_mem(get_single_element_layout(p), 127)), + data("scale_data", get_mem(get_per_channel_layout(p), 1.0f/p.kernel.count()/255)), + data("eltwise_data", get_mem(layout{ data_types::i8, p.input_format, p.out_shape })), + convolution("conv_prim", "input", { "weights" }, { "bias" }, p.groups, p.stride, p.pad, p.dilation), + scale("scale", "conv_prim", "scale_data"), + activation("activation_scale", "scale", activation_func::exp), + quantize("quantize", "activation_scale", "in_lo", "in_hi", "out_lo", "out_hi", 255, data_types::i8), + eltwise("sum", { "quantize", "eltwise_data" }, eltwise_mode::sum, data_types::f32), + quantize("quantize_1", "sum", "in_lo1", "in_hi1", "out_lo1", "out_hi1", 255, data_types::i8), + reorder("reorder_bfyx", "quantize_1", p.default_format, data_types::f32) + ); + + tolerance = 1.f; + execute(p); +} + +INSTANTIATE_TEST_SUITE_P(fusings_gpu, conv_int8_scale_activation_quantize_i8_eltwise_fp32_quantize_i8, ::testing::ValuesIn(std::vector{ + convolution_test_params{ CASE_CONV_U8S8_1, 2, 7 }, + convolution_test_params{ CASE_CONV_U8S8_2, 2, 7 }, + convolution_test_params{ CASE_CONV_U8S8_3, 2, 7 }, + convolution_test_params{ CASE_CONV_U8S8_4, 2, 7 }, + convolution_test_params{ CASE_CONV_S8S8_1, 2, 7 }, + 
convolution_test_params{ CASE_CONV_S8S8_2, 2, 7 }, + convolution_test_params{ CASE_CONV_S8S8_3, 2, 7 }, + convolution_test_params{ CASE_CONV_S8S8_4, 2, 7 }, + + convolution_test_params{ CASE_CONV3D_U8S8_1, 2, 7 }, + convolution_test_params{ CASE_CONV3D_U8S8_2, 2, 7 }, + convolution_test_params{ CASE_CONV3D_U8S8_3, 2, 7 }, + convolution_test_params{ CASE_CONV3D_U8S8_4, 2, 7 }, + convolution_test_params{ CASE_CONV3D_U8S8_5, 2, 7 }, + convolution_test_params{ CASE_CONV3D_S8S8_1, 2, 7 }, + convolution_test_params{ CASE_CONV3D_S8S8_2, 2, 7 }, + convolution_test_params{ CASE_CONV3D_S8S8_3, 2, 7 }, + convolution_test_params{ CASE_CONV3D_S8S8_4, 2, 7 }, + convolution_test_params{ CASE_CONV3D_S8S8_5, 2, 7 }, +})); + +class conv_int8_scale_prelu_quantize_i8_eltwise_fp32_quantize_i8_vec : public ConvFusingTest {}; +TEST_P(conv_int8_scale_prelu_quantize_i8_eltwise_fp32_quantize_i8_vec, vector_ops) { + auto p = GetParam(); + create_topologies( + input_layout("input", get_input_layout(p)), + data("weights", get_mem(get_weights_layout(p))), + data("bias", get_mem(get_bias_layout(p))), + data("in_lo", get_mem(get_per_channel_layout(p), min_random, 0)), + data("in_lo1", get_mem(get_per_channel_layout(p), min_random, 0)), + data("in_hi", get_mem(get_per_channel_layout(p), 1, max_random)), + data("in_hi1", get_mem(get_per_channel_layout(p), 1, max_random)), + data("out_lo", get_mem(get_single_element_layout(p), -127)), + data("out_lo1", get_mem(get_single_element_layout(p), -127)), + data("out_hi", get_mem(get_single_element_layout(p), 127)), + data("out_hi1", get_mem(get_single_element_layout(p), 127)), + data("scale_data", get_mem(get_per_channel_layout(p), 1.0f/p.kernel.count()/255)), + data("slope_data", get_mem(get_per_channel_layout(p))), + data("eltwise_data", get_mem(layout{ data_types::i8, format::b_fs_yx_fsv4, p.out_shape })), + convolution("conv_prim", "input", { "weights" }, { "bias" }, p.groups, p.stride, p.pad, p.dilation), + scale("scale", "conv_prim", "scale_data"), + 
activation("activation_scale", "scale", "slope_data", activation_func::relu_negative_slope), + quantize("quantize", "activation_scale", "in_lo", "in_hi", "out_lo", "out_hi", 255, data_types::i8), + eltwise("sum", { "quantize", "eltwise_data" }, eltwise_mode::sum, data_types::f32), + quantize("quantize_1", "sum", "in_lo1", "in_hi1", "out_lo1", "out_hi1", 255, data_types::i8), + reorder("reorder_bfyx", "quantize_1", p.default_format, data_types::f32) + ); + + implementation_desc conv_impl = { format::b_fs_yx_fsv4, "convolution_gpu_b_fs_yx_fsv4_1x1" }; + bo_fused.set_option(build_option::force_implementations({ { "conv_prim", conv_impl } })); + + tolerance = 1.f; + execute(p); +} + +TEST_P(conv_int8_scale_prelu_quantize_i8_eltwise_fp32_quantize_i8_vec, vector_ops_mixed_types) { + auto p = GetParam(); + create_topologies( + input_layout("input", get_input_layout(p)), + data("weights", get_mem(get_weights_layout(p))), + data("bias", get_mem(get_bias_layout(p))), + data("in_lo", get_mem(get_per_channel_layout(p), min_random, 0)), + data("in_lo1", get_mem(get_per_channel_layout(p), min_random, 0)), + data("in_hi", get_mem(get_per_channel_layout(p), 1, max_random)), + data("in_hi1", get_mem(get_per_channel_layout(p), 1, max_random)), + data("out_lo", get_mem(get_single_element_layout(p), -127)), + data("out_lo1", get_mem(get_single_element_layout(p), -127)), + data("out_hi", get_mem(get_single_element_layout(p), 127)), + data("out_hi1", get_mem(get_single_element_layout(p), 127)), + data("scale_data", get_mem(get_per_channel_layout(p), 1.0f/p.kernel.count()/255)), + data("slope_data", get_mem(layout{ data_types::f16, p.default_format, tensor{ 1, p.out_shape.feature[0], 1, 1 } })), + data("eltwise_data", get_mem(layout{ data_types::u8, format::b_fs_yx_fsv4, p.out_shape })), + convolution("conv_prim", "input", { "weights" }, { "bias" }, p.groups, p.stride, p.pad, p.dilation), + scale("scale", "conv_prim", "scale_data"), + activation("activation_scale", "scale", "slope_data", 
activation_func::relu_negative_slope), + quantize("quantize", "activation_scale", "in_lo", "in_hi", "out_lo", "out_hi", 255, data_types::i8), + eltwise("sum", { "quantize", "eltwise_data" }, eltwise_mode::sum, data_types::f32), + quantize("quantize_1", "sum", "in_lo1", "in_hi1", "out_lo1", "out_hi1", 255, data_types::i8), + reorder("reorder_bfyx", "quantize_1", p.default_format, data_types::f32) + ); + + implementation_desc conv_impl = { format::b_fs_yx_fsv4, "convolution_gpu_b_fs_yx_fsv4_1x1" }; + bo_fused.set_option(build_option::force_implementations({ { "conv_prim", conv_impl } })); + + tolerance = 1.f; + execute(p); +} + +INSTANTIATE_TEST_SUITE_P(fusings_gpu, conv_int8_scale_prelu_quantize_i8_eltwise_fp32_quantize_i8_vec, ::testing::ValuesIn(std::vector{ + convolution_test_params{ CASE_CONV_U8S8_3, 2, 7 }, + convolution_test_params{ CASE_CONV_U8S8_5, 2, 7 }, + convolution_test_params{ CASE_CONV_S8S8_3, 2, 7 }, + convolution_test_params{ CASE_CONV_S8S8_5, 2, 7 }, +})); + +class conv_int8_asymmetric_weights : public ConvFusingTest {}; +TEST_P(conv_int8_asymmetric_weights, basic) { + auto p = GetParam(); + auto weights_format = (p.weights_format == format::goiyx) ? format::bfyx : format::bfzyx; + auto weights_layout = (p.groups > 1) ? 
get_weights_layout(p, 1, weights_format) : + get_weights_layout(p); + create_topologies( + input_layout("input", get_input_layout(p)), + data("weights", get_mem(weights_layout)), + data("bias", get_mem(get_bias_layout(p))), + data("w_zp", get_mem(get_weights_zp_layout(p), 1, 127)), + eltwise("w_sub", { "weights", "w_zp" }, eltwise_mode::sub, data_types::f32), + convolution("conv_prim", "input", { "w_sub" }, { "bias" }, p.groups, p.stride, p.pad, p.dilation, p.out_shape, data_types::f32, false), + reorder("reorder_bfyx", "conv_prim", p.default_format, data_types::f32) + ); + + tolerance = 1.f; + + auto input_prim = get_mem(get_input_layout(p)); + network network_not_fused(this->engine, this->topology_non_fused, bo_not_fused); + network network_fused(this->engine, this->topology_fused, bo_fused); + network_fused.set_input_data("input", input_prim); + network_not_fused.set_input_data("input", input_prim); + + ASSERT_FALSE(network_fused.get_primitives_info().empty()); + ASSERT_FALSE(network_not_fused.get_primitives_info().empty()); + + // Search for both conv_prim and reorder_bfyx, as in case of fused topology convolution will be merged with the last reorder + auto find_conv = [](primitive_info& p) -> bool { + if (p.original_id == "conv_prim" || p.original_id == "reorder_bfyx") + return true; + return false; + }; + + auto pi_fused = network_fused.get_primitives_info(); + auto pi_not_fused = network_not_fused.get_primitives_info(); + auto info_fused = std::find_if(pi_fused.begin(), pi_fused.end(), find_conv); + auto info_not_fused = std::find_if(pi_not_fused.begin(), pi_not_fused.end(), find_conv); + + ASSERT_TRUE(info_fused != pi_fused.end()); + ASSERT_TRUE(info_not_fused != pi_not_fused.end()); + + ASSERT_EQ(info_fused->c_dependencies.size(), 4lu); // input + weights + bias + w_zp + ASSERT_EQ(info_not_fused->c_dependencies.size(), 3lu); // input + weights + bias + + compare(network_not_fused, network_fused, p); +} + +INSTANTIATE_TEST_SUITE_P(fusings_gpu, 
conv_int8_asymmetric_weights, ::testing::ValuesIn(std::vector{ + convolution_test_params{ CASE_CONV_U8S8_1, 2, 2 }, + convolution_test_params{ CASE_CONV_U8S8_2, 2, 2 }, + convolution_test_params{ CASE_CONV_U8S8_3, 2, 2 }, + convolution_test_params{ CASE_CONV_U8S8_4, 2, 2 }, + convolution_test_params{ CASE_CONV_S8S8_1, 2, 2 }, + convolution_test_params{ CASE_CONV_S8S8_2, 2, 2 }, + convolution_test_params{ CASE_CONV_S8S8_3, 2, 2 }, + convolution_test_params{ CASE_CONV_S8S8_4, 2, 2 }, + + convolution_test_params{ CASE_CONV3D_U8S8_1, 2, 2 }, + convolution_test_params{ CASE_CONV3D_U8S8_2, 2, 2 }, + convolution_test_params{ CASE_CONV3D_U8S8_3, 2, 2 }, + convolution_test_params{ CASE_CONV3D_U8S8_4, 2, 2 }, + convolution_test_params{ CASE_CONV3D_S8S8_1, 2, 2 }, + convolution_test_params{ CASE_CONV3D_S8S8_2, 2, 2 }, + convolution_test_params{ CASE_CONV3D_S8S8_3, 2, 2 }, + convolution_test_params{ CASE_CONV3D_S8S8_4, 2, 2 }, +})); + +class conv_int8_asymmetric_data : public ConvFusingTest {}; +TEST_P(conv_int8_asymmetric_data, basic) { + auto p = GetParam(); + auto weights_format = (p.weights_format == format::goiyx) ? format::bfyx : format::bfzyx; + auto weights_layout = (p.groups > 1) ? 
get_weights_layout(p, 1, weights_format) : + get_weights_layout(p); + create_topologies( + input_layout("input", get_input_layout(p)), + data("weights", get_mem(weights_layout)), + data("bias", get_mem(get_bias_layout(p))), + data("a_zp", get_mem(get_activations_zp_layout(p), 1, 127)), + eltwise("a_sub", { "input", "a_zp" }, eltwise_mode::sub, data_types::f32), + convolution("conv_prim", "a_sub", { "weights" }, { "bias" }, p.groups, p.stride, p.pad, p.dilation, p.out_shape, data_types::f32, false), + reorder("reorder_bfyx", "conv_prim", p.default_format, data_types::f32) + ); + + tolerance = 1.f; + + auto input_prim = get_mem(get_input_layout(p)); + network network_not_fused(this->engine, this->topology_non_fused, bo_not_fused); + network network_fused(this->engine, this->topology_fused, bo_fused); + network_fused.set_input_data("input", input_prim); + network_not_fused.set_input_data("input", input_prim); + + ASSERT_FALSE(network_fused.get_primitives_info().empty()); + ASSERT_FALSE(network_not_fused.get_primitives_info().empty()); + + // Search for both conv_prim and reorder_bfyx, as in case of fused topology convolution will be merged with the last reorder + auto find_conv = [](primitive_info& p) -> bool { + if (p.original_id == "conv_prim" || p.original_id == "reorder_bfyx") + return true; + return false; + }; + + auto pi_fused = network_fused.get_primitives_info(); + auto pi_not_fused = network_not_fused.get_primitives_info(); + auto info_fused = std::find_if(pi_fused.begin(), pi_fused.end(), find_conv); + auto info_not_fused = std::find_if(pi_not_fused.begin(), pi_not_fused.end(), find_conv); + + ASSERT_TRUE(info_fused != pi_fused.end()); + ASSERT_TRUE(info_not_fused != pi_not_fused.end()); + + ASSERT_EQ(info_fused->c_dependencies.size(), 5lu); // input + weights + bias + a_zp + comp + ASSERT_EQ(info_not_fused->c_dependencies.size(), 3lu); // input + weights + bias + + compare(network_not_fused, network_fused, p); +} + +INSTANTIATE_TEST_SUITE_P(fusings_gpu, 
conv_int8_asymmetric_data, ::testing::ValuesIn(std::vector{ + convolution_test_params{ CASE_CONV_U8S8_1, 2, 3 }, + convolution_test_params{ CASE_CONV_U8S8_2, 2, 3 }, + convolution_test_params{ CASE_CONV_U8S8_3, 2, 3 }, + convolution_test_params{ CASE_CONV_U8S8_4, 2, 3 }, + convolution_test_params{ CASE_CONV_S8S8_1, 2, 3 }, + convolution_test_params{ CASE_CONV_S8S8_2, 2, 3 }, + convolution_test_params{ CASE_CONV_S8S8_3, 2, 3 }, + convolution_test_params{ CASE_CONV_S8S8_4, 2, 3 }, + + convolution_test_params{ CASE_CONV3D_U8S8_1, 2, 3 }, + convolution_test_params{ CASE_CONV3D_U8S8_2, 2, 3 }, + convolution_test_params{ CASE_CONV3D_U8S8_3, 2, 3 }, + convolution_test_params{ CASE_CONV3D_U8S8_4, 2, 3 }, + convolution_test_params{ CASE_CONV3D_U8S8_5, 2, 3 }, + convolution_test_params{ CASE_CONV3D_S8S8_1, 2, 3 }, + convolution_test_params{ CASE_CONV3D_S8S8_2, 2, 3 }, + convolution_test_params{ CASE_CONV3D_S8S8_3, 2, 3 }, + convolution_test_params{ CASE_CONV3D_S8S8_4, 2, 3 }, + convolution_test_params{ CASE_CONV3D_S8S8_5, 2, 3 }, +})); + +class conv_int8_asymmetric_data_and_weights : public ConvFusingTest {}; +TEST_P(conv_int8_asymmetric_data_and_weights, basic) { + auto p = GetParam(); + auto weights_format = (p.weights_format == format::goiyx) ? format::bfyx : format::bfzyx; + auto weights_layout = (p.groups > 1) ? 
get_weights_layout(p, 1, weights_format) : + get_weights_layout(p); + create_topologies( + input_layout("input", get_input_layout(p)), + data("weights", get_mem(weights_layout)), + data("bias", get_mem(get_bias_layout(p))), + data("a_zp", get_mem(get_activations_zp_layout(p), 1, 127)), + data("w_zp", get_mem(get_weights_zp_layout(p), 1, 127)), + eltwise("a_sub", { "input", "a_zp" }, eltwise_mode::sub, data_types::f32), + eltwise("w_sub", { "weights", "w_zp" }, eltwise_mode::sub, data_types::f32), + convolution("conv_prim", "a_sub", { "w_sub" }, { "bias" }, p.groups, p.stride, p.pad, p.dilation, p.out_shape, data_types::f32, false), + reorder("reorder_bfyx", "conv_prim", p.default_format, data_types::f32) + ); + + tolerance = 1.f; + + auto input_prim = get_mem(get_input_layout(p)); + network network_not_fused(this->engine, this->topology_non_fused, bo_not_fused); + network network_fused(this->engine, this->topology_fused, bo_fused); + network_fused.set_input_data("input", input_prim); + network_not_fused.set_input_data("input", input_prim); + + ASSERT_FALSE(network_fused.get_primitives_info().empty()); + ASSERT_FALSE(network_not_fused.get_primitives_info().empty()); + + // Search for both conv_prim and reorder_bfyx, as in case of fused topology convolution will be merged with the last reorder + auto find_conv = [](primitive_info& p) -> bool { + if (p.original_id == "conv_prim" || p.original_id == "reorder_bfyx") + return true; + return false; + }; + + auto pi_fused = network_fused.get_primitives_info(); + auto pi_not_fused = network_not_fused.get_primitives_info(); + auto info_fused = std::find_if(pi_fused.begin(), pi_fused.end(), find_conv); + auto info_not_fused = std::find_if(pi_not_fused.begin(), pi_not_fused.end(), find_conv); + + ASSERT_TRUE(info_fused != pi_fused.end()); + ASSERT_TRUE(info_not_fused != pi_not_fused.end()); + + ASSERT_EQ(info_fused->c_dependencies.size(), 6lu); // input + weights + bias + a_zp + w_zp + comp + 
ASSERT_EQ(info_not_fused->c_dependencies.size(), 3lu); // input + weights + bias + + compare(network_not_fused, network_fused, p); +} + +INSTANTIATE_TEST_SUITE_P(fusings_gpu, conv_int8_asymmetric_data_and_weights, ::testing::ValuesIn(std::vector{ + convolution_test_params{ CASE_CONV_U8S8_1, 2, 3 }, + convolution_test_params{ CASE_CONV_U8S8_2, 2, 3 }, + convolution_test_params{ CASE_CONV_U8S8_3, 2, 3 }, + convolution_test_params{ CASE_CONV_U8S8_4, 2, 3 }, + convolution_test_params{ CASE_CONV_S8S8_1, 2, 3 }, + convolution_test_params{ CASE_CONV_S8S8_2, 2, 3 }, + convolution_test_params{ CASE_CONV_S8S8_3, 2, 3 }, + convolution_test_params{ CASE_CONV_S8S8_4, 2, 3 }, + + convolution_test_params{ CASE_CONV3D_U8S8_1, 2, 3 }, + convolution_test_params{ CASE_CONV3D_U8S8_2, 2, 3 }, + convolution_test_params{ CASE_CONV3D_U8S8_3, 2, 3 }, + convolution_test_params{ CASE_CONV3D_U8S8_4, 2, 3 }, + convolution_test_params{ CASE_CONV3D_U8S8_5, 2, 3 }, + convolution_test_params{ CASE_CONV3D_S8S8_1, 2, 3 }, + convolution_test_params{ CASE_CONV3D_S8S8_2, 2, 3 }, + convolution_test_params{ CASE_CONV3D_S8S8_3, 2, 3 }, + convolution_test_params{ CASE_CONV3D_S8S8_4, 2, 3 }, + convolution_test_params{ CASE_CONV3D_S8S8_5, 2, 3 }, +})); + + +class conv_i8_activation_eltwise_diff_sizes : public ConvEltwTest {}; +TEST_P(conv_i8_activation_eltwise_diff_sizes, basic) { + auto p = GetParam(); + create_topologies( + input_layout("input", get_input_layout(p)), + data("weights", get_mem(get_weights_layout(p))), + data("bias", get_mem(get_bias_layout(p))), + data("eltwise_data", get_mem(layout{ p.data_type, p.input_format, p.eltw_shape })), + convolution("conv_prim", "input", { "weights" }, { "bias" }, p.groups, p.stride, p.pad, p.dilation), + activation("activation", "conv_prim", activation_func::abs), + eltwise("sum", { "activation", "eltwise_data" }, eltwise_mode::sum, data_types::f32), + reorder("reorder_bfyx", "sum", p.default_format, data_types::f32) + ); + + tolerance = 1e-5f; + execute(p); +} 
+ +INSTANTIATE_TEST_SUITE_P(fusings_gpu, conv_i8_activation_eltwise_diff_sizes, ::testing::ValuesIn(std::vector{ + conv_eltw_test_params{ CASE_CONV_ELTW_i8_1, 3, 4 }, + conv_eltw_test_params{ CASE_CONV_ELTW_i8_2, 2, 4 }, + conv_eltw_test_params{ CASE_CONV_ELTW_i8_3, 2, 4 }, + conv_eltw_test_params{ CASE_CONV_ELTW_i8_4, 2, 4 }, + conv_eltw_test_params{ CASE_CONV_ELTW_i8_5, 3, 4 }, +})); + +/* ----------------------------------------------------------------------------------------------------- */ +/* ----------------------------------- Force convolution kernel cases ---------------------------------- */ +/* ----------------------------------------------------------------------------------------------------- */ + +class conv_fp16_activation : public ConvFusingForceKernelTest {}; +TEST_P(conv_fp16_activation, basic) { + auto p = GetParam(); + create_topologies( + input_layout("input", get_input_layout(p)), + data("weights", get_mem(get_weights_layout(p))), + data("bias", get_mem(get_bias_layout(p))), + convolution("conv_prim", "input", { "weights" }, { "bias" }, p.groups, p.stride, p.pad, p.dilation), + activation("activation", "conv_prim", activation_func::abs), + reorder("reorder_bfyx", "activation", p.default_format, data_types::f32) + ); + + execute(p); +} + +INSTANTIATE_TEST_SUITE_P(fusings_gpu, conv_fp16_activation, ::testing::ValuesIn(std::vector{ + bc_force_kernel_params{ CASE_CONV_FP16_13, 2, 3, "convolution_gpu_fs_byx_fsv32" }, +})); + + +class conv_fp16_scale : public ConvFusingForceKernelTest {}; +TEST_P(conv_fp16_scale, basic) { + auto p = GetParam(); + create_topologies( + input_layout("input", get_input_layout(p)), + data("weights", get_mem(get_weights_layout(p))), + data("bias", get_mem(get_bias_layout(p))), + data("scale_data", get_mem(get_per_channel_layout(p), 1.0f/p.kernel.count())), + convolution("conv_prim", "input", { "weights" }, { "bias" }, p.groups, p.stride, p.pad, p.dilation), + scale("scale", "conv_prim", "scale_data"), + 
reorder("reorder_bfyx", "scale", p.default_format, data_types::f32) + ); + + tolerance = 1e-5f; + execute(p); +} + +INSTANTIATE_TEST_SUITE_P(fusings_gpu, conv_fp16_scale, ::testing::ValuesIn(std::vector{ + bc_force_kernel_params{ CASE_CONV_FP16_13, 2, 3, "convolution_gpu_fs_byx_fsv32" }, +})); + +// reorder(bfyx to fs_b_yx_fsv32) + conv +#define FSV32_CASE_CONV_FP32_1 { 1, 32, 4, 5 }, { 1, 32, 2, 3 }, { 1, 1, 3, 3 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::f32, format::bfyx, data_types::f32, format::oiyx, data_types::f32, format::bfyx + +class conv_fp32_reorder_bfyx_to_fsv32_conv_basic : public ConvFusingTest {}; +TEST_P(conv_fp32_reorder_bfyx_to_fsv32_conv_basic, basic) { + auto p = GetParam(); + + create_topologies( + input_layout("input", get_input_layout(p)), + data("weights", get_mem(get_weights_layout(p), -127, 127)), + reorder("reorder_fsv32", "input", format::fs_b_yx_fsv32, data_types::f32), + convolution("conv_output", "reorder_fsv32", { "weights" }, 1, tensor{ 0, 0, 1, 1 }, p.pad, p.dilation), + activation("activation", "conv_output", activation_func::abs) + ); + + implementation_desc conv_impl = { format::fs_b_yx_fsv32, "" }; + bo_fused.set_option(build_option::force_implementations({ { "conv_output", conv_impl } })); + + execute(p); +} +INSTANTIATE_TEST_SUITE_P(fusings_gpu, conv_fp32_reorder_bfyx_to_fsv32_conv_basic, ::testing::ValuesIn(std::vector{ + convolution_test_params{ FSV32_CASE_CONV_FP32_1, 3, 3 } +})); + + +class conv_fp32_reorder_bfyx_to_fsv32_conv_mean : public ConvFusingTest {}; +TEST_P(conv_fp32_reorder_bfyx_to_fsv32_conv_mean, have_mean) { + auto p = GetParam(); + memory::ptr mul = engine.allocate_memory({ data_types::f32, format::bfyx, tensor{ 1, 3, 1, 2 } }); + set_values(mul, { 0.5f, 2.5f, -5.0f, 4.3f, 1.2f, -3.5f }); + + create_topologies( + input_layout("input", get_input_layout(p)), + data("mul", mul), + data("weights", get_mem(get_weights_layout(p), -127, 127)), + reorder("reorder_fsv32", "input", 
format::fs_b_yx_fsv32, data_types::f32, "mul", reorder_mean_mode::mul), + convolution("conv_output", "reorder_fsv32", { "weights" }, 1, tensor{ 0, 0, 1, 1 }, p.pad, p.dilation), + activation("activation", "conv_output", activation_func::abs) + ); + + implementation_desc conv_impl = { format::fs_b_yx_fsv32, "" }; + bo_fused.set_option(build_option::force_implementations({ { "conv_output", conv_impl } })); + + execute(p); +} +INSTANTIATE_TEST_SUITE_P(fusings_gpu, conv_fp32_reorder_bfyx_to_fsv32_conv_mean, ::testing::ValuesIn(std::vector{ + convolution_test_params{ FSV32_CASE_CONV_FP32_1, 3, 3 } +})); + + +class conv_fp32_reorder_bfyx_to_fsv32_conv_subtract : public ConvFusingTest {}; +TEST_P(conv_fp32_reorder_bfyx_to_fsv32_conv_subtract, have_subtract_per_feature) { + auto p = GetParam(); + const std::vector& values_to_subtract = { + 0.1f, 0.2f, 0.1f, 0.1f, 0.1f, 0.2f, 0.1f, 0.1f, + 0.1f, 0.2f, 0.1f, 0.1f, 0.1f, 0.2f, 0.1f, 0.1f, + 0.1f, 0.2f, 0.1f, 0.1f, 0.1f, 0.2f, 0.1f, 0.1f, + 0.1f, 0.2f, 0.1f, 0.1f, 0.1f, 0.2f, 0.1f, 0.1f + }; + + auto dw_tensor = cldnn::tensor(group(p.out_shape.feature[0]), batch(1), feature(1), spatial(3, 3)); + auto dw_weights_layout = layout{ p.default_type, format::goiyx, dw_tensor }; + auto dw_stride = tensor{ 0, 0, 1, 1 }; + + create_topologies( + input_layout("input", get_input_layout(p)), + data("weights", get_mem(get_weights_layout(p), -127, 127)), + data("weights_dw", get_mem(dw_weights_layout, -127, 127)), + convolution("conv_prim", "input", { "weights" }, p.groups, p.stride, p.pad, p.dilation), + reorder("reorder_fsv32", "conv_prim", format::fs_b_yx_fsv32, data_types::f32, values_to_subtract), + convolution("conv_output", "reorder_fsv32", { "weights_dw" }, p.out_shape.feature[0], dw_stride, p.pad, p.dilation), + activation("activation", "conv_output", activation_func::abs) + ); + + implementation_desc conv_impl = { format::fs_b_yx_fsv32, "" }; + bo_fused.set_option(build_option::force_implementations({ { "conv_output", conv_impl } 
})); + + execute(p); +} +INSTANTIATE_TEST_SUITE_P(fusings_gpu, conv_fp32_reorder_bfyx_to_fsv32_conv_subtract, ::testing::ValuesIn(std::vector{ + convolution_test_params{ FSV32_CASE_CONV_FP32_1, 4, 4 } +})); + + +class conv_fp32_reorder_bfyx_to_fsv32_conv_fused_activation : public ConvFusingTest {}; +TEST_P(conv_fp32_reorder_bfyx_to_fsv32_conv_fused_activation, have_fused_activation) { + auto p = GetParam(); + + auto dw_tensor = cldnn::tensor(group(p.out_shape.feature[0]), batch(1), feature(1), spatial(3, 3)); + auto dw_weights_layout = layout{ p.default_type, format::goiyx, dw_tensor }; + auto dw_stride = tensor{ 0, 0, 1, 1 }; + + create_topologies( + input_layout("input", get_input_layout(p)), + data("weights", get_mem(get_weights_layout(p), -127, 127)), + data("weights_dw", get_mem(dw_weights_layout, -127, 127)), + convolution("conv_prim", "input", { "weights" }, p.groups, p.stride, p.pad, p.dilation), + reorder("reorder_fsv32", "conv_prim", format::fs_b_yx_fsv32, data_types::f32), + activation("activation_quantize", "reorder_fsv32", activation_func::relu), + convolution("conv_output", "activation_quantize", { "weights_dw" }, p.out_shape.feature[0], dw_stride, p.pad, p.dilation), + activation("activation", "conv_output", activation_func::abs) + ); + + implementation_desc conv_impl = { format::fs_b_yx_fsv32, "" }; + bo_fused.set_option(build_option::force_implementations({ { "conv_output", conv_impl } })); + + execute(p); +} +INSTANTIATE_TEST_SUITE_P(fusings_gpu, conv_fp32_reorder_bfyx_to_fsv32_conv_fused_activation, ::testing::ValuesIn(std::vector{ + convolution_test_params{ FSV32_CASE_CONV_FP32_1, 4, 5 } +})); + + +class conv_fp32_reorder_bfyx_to_fsv32_conv_data_padding : public ConvFusingTest {}; +TEST_P(conv_fp32_reorder_bfyx_to_fsv32_conv_data_padding, have_data_padding) { + auto p = GetParam(); + + auto dw_tensor = cldnn::tensor(group(p.out_shape.feature[0]), batch(1), feature(1), spatial(3, 3)); + auto dw_weights_layout = layout{ p.default_type, 
format::goiyx, dw_tensor }; + auto dw_stride = tensor{ 0, 0, 1, 1 }; + + create_topologies( + input_layout("input", get_input_layout(p)), + data("weights", get_mem(get_weights_layout(p), -127, 127)), + data("weights_dw", get_mem(dw_weights_layout, -127, 127)), + convolution("conv_prim", "input", { "weights" }, p.groups, p.stride, p.pad, p.dilation), + reorder("reorder_fsv32", "conv_prim", layout(data_types::f32, format::fs_b_yx_fsv32, dw_tensor, padding{ { 0, 0, 1, 1 }, 0 })), + convolution("conv_output", "reorder_fsv32", { "weights_dw" }, p.out_shape.feature[0], dw_stride, p.pad, p.dilation), + activation("activation", "conv_output", activation_func::abs), + activation("activation2", "conv_prim", activation_func::abs), + eltwise("add_bias", { "activation", "activation2" }, eltwise_mode::sum) + ); + + implementation_desc conv_impl = { format::fs_b_yx_fsv32, "" }; + bo_fused.set_option(build_option::force_implementations({ { "conv_output", conv_impl } })); + + execute(p); +} +INSTANTIATE_TEST_SUITE_P(fusings_gpu, conv_fp32_reorder_bfyx_to_fsv32_conv_data_padding, ::testing::ValuesIn(std::vector{ + convolution_test_params{ FSV32_CASE_CONV_FP32_1, 5, 6 } +})); + +#ifdef ENABLE_ONEDNN_FOR_GPU +class conv_int8_eltwise_onednn : public WeightsPrimitiveFusingTestOneDNN {}; +TEST_P(conv_int8_eltwise_onednn, u8_eltwise_sum_out) { + auto p = GetParam(); + + auto shift_layout = get_output_layout(p); + shift_layout.data_type = data_types::f32; + + create_topologies( + input_layout("input", get_input_layout(p)), + data("weights", get_mem(get_weights_layout(p), 0, 2)), + data("bias", get_mem(get_bias_layout(p))), + data("shift_data", get_mem(shift_layout)), + convolution("conv_prim", "input", { "weights" }, { "bias" }, p.groups, p.stride, p.pad, p.dilation), + eltwise("shift", { "conv_prim", "shift_data" }, eltwise_mode::sum, data_types::f32), + // Add 'not fusable' primitive to be able to test full size tensor sum + crop("crop", "shift", get_output_layout(p).size, { 0, 0, 0, 0 
}), + reorder("reorder_bfyx", "crop", p.default_format, data_types::f32) + ); + + tolerance = 1.f; + execute(p); +} + +TEST_P(conv_int8_eltwise_onednn, u8_eltwise_prod_out) { + auto p = GetParam(); + + create_topologies( + input_layout("input", get_input_layout(p)), + data("weights", get_mem(get_weights_layout(p), -2, 2)), + data("bias", get_mem(get_bias_layout(p))), + data("scale_data", get_mem(get_per_channel_layout(p), 1.0f/p.kernel.count()) ), + convolution("conv_prim", "input", { "weights" }, { "bias" }, p.groups, p.stride, p.pad, p.dilation), + eltwise("scale", { "conv_prim", "scale_data" }, eltwise_mode::prod, data_types::u8), + crop("crop", "scale", get_output_layout(p).size, { 0, 0, 0, 0 }), + reorder("reorder_bfyx", "crop", p.default_format, data_types::f32) + ); + + tolerance = 1.f; + execute(p); +} + +INSTANTIATE_TEST_SUITE_P(fusings_gpu, conv_int8_eltwise_onednn, ::testing::ValuesIn(std::vector{ + convolution_test_params{ CASE_CONV_U8S8_1, 2, 4 }, + convolution_test_params{ CASE_CONV_U8S8_2, 2, 4 }, + convolution_test_params{ CASE_CONV_U8S8_3, 2, 4 }, + convolution_test_params{ CASE_CONV_S8S8_1, 2, 4 }, + convolution_test_params{ CASE_CONV_S8S8_2, 2, 4 }, + convolution_test_params{ CASE_CONV_S8S8_3, 2, 4 }, + + convolution_test_params{ CASE_CONV_U8S8_11, 2, 4 }, + convolution_test_params{ CASE_CONV_U8S8_12, 2, 4 }, + convolution_test_params{ CASE_CONV_U8S8_13, 2, 4 }, + convolution_test_params{ CASE_CONV_S8S8_12, 2, 4 }, + convolution_test_params{ CASE_CONV_S8S8_13, 2, 4 }, + convolution_test_params{ CASE_CONV_S8S8_14, 2, 4 }, + + convolution_test_params{ CASE_CONV3D_U8S8_1, 3, 4 }, + convolution_test_params{ CASE_CONV3D_U8S8_2, 3, 4 }, + convolution_test_params{ CASE_CONV3D_U8S8_3, 3, 4 }, + convolution_test_params{ CASE_CONV3D_U8S8_5, 3, 4 }, + convolution_test_params{ CASE_CONV3D_S8S8_1, 3, 4 }, + convolution_test_params{ CASE_CONV3D_S8S8_2, 3, 4 }, + convolution_test_params{ CASE_CONV3D_S8S8_3, 3, 4 }, + convolution_test_params{ CASE_CONV3D_S8S8_5, 
3, 4 }, +})); + +class conv_fp32_activation_abs_onednn : public WeightsPrimitiveFusingTestOneDNN {}; +TEST_P(conv_fp32_activation_abs_onednn, basic) { + auto p = GetParam(); + create_topologies( + input_layout("input", get_input_layout(p)), + data("weights", get_mem(get_weights_layout(p))), + data("bias", get_mem(get_bias_layout(p))), + convolution("conv_prim", "input", { "weights" }, { "bias" }, p.groups, p.stride, p.pad, p.dilation), + activation("activation", "conv_prim", activation_func::abs), + reorder("reorder_bfyx", "activation", p.default_format, data_types::f32) + ); + + tolerance = 1e-2f; + execute(p); +} + +INSTANTIATE_TEST_SUITE_P(fusings_gpu, conv_fp32_activation_abs_onednn, ::testing::ValuesIn(std::vector{ + convolution_test_params{ CASE_CONV_FP16_1, 2, 3 }, + convolution_test_params{ CASE_CONV_FP16_2, 2, 3 }, + convolution_test_params{ CASE_CONV_FP16_3, 2, 3 }, + convolution_test_params{ CASE_CONV_FP16_4, 2, 3 }, +})); + +class conv_fp32_activation_mish_onednn : public WeightsPrimitiveFusingTestOneDNN {}; +TEST_P(conv_fp32_activation_mish_onednn, basic) { + auto p = GetParam(); + create_topologies( + input_layout("input", get_input_layout(p)), + data("weights", get_mem(get_weights_layout(p))), + data("bias", get_mem(get_bias_layout(p))), + convolution("conv_prim", "input", { "weights" }, { "bias" }, p.groups, p.stride, p.pad, p.dilation), + activation("activation", "conv_prim", activation_func::mish), + reorder("reorder_bfyx", "activation", p.default_format, data_types::f32) + ); + + tolerance = 1e-2f; + execute(p); +} + +INSTANTIATE_TEST_SUITE_P(fusings_gpu, conv_fp32_activation_mish_onednn, ::testing::ValuesIn(std::vector{ + convolution_test_params{ CASE_CONV_FP16_1, 2, 3 }, + convolution_test_params{ CASE_CONV_FP16_2, 2, 3 }, + convolution_test_params{ CASE_CONV_FP16_3, 2, 3 }, + convolution_test_params{ CASE_CONV_FP16_4, 2, 3 }, +})); + +class conv_fp32_activation_swish_onednn : public WeightsPrimitiveFusingTestOneDNN {}; 
+TEST_P(conv_fp32_activation_swish_onednn, basic) { + auto p = GetParam(); + create_topologies( + input_layout("input", get_input_layout(p)), + data("weights", get_mem(get_weights_layout(p))), + data("bias", get_mem(get_bias_layout(p))), + convolution("conv_prim", "input", { "weights" }, { "bias" }, p.groups, p.stride, p.pad, p.dilation), + activation("activation", "conv_prim", activation_func::swish), + reorder("reorder_bfyx", "activation", p.default_format, data_types::f32) + ); + + tolerance = 1e-2f; + execute(p); +} + +INSTANTIATE_TEST_SUITE_P(fusings_gpu, conv_fp32_activation_swish_onednn, ::testing::ValuesIn(std::vector{ + convolution_test_params{ CASE_CONV_FP16_1, 2, 3 }, + convolution_test_params{ CASE_CONV_FP16_2, 2, 3 }, + convolution_test_params{ CASE_CONV_FP16_3, 2, 3 }, + convolution_test_params{ CASE_CONV_FP16_4, 2, 3 }, +})); + +class conv_fp32_activation_hswish_onednn : public WeightsPrimitiveFusingTestOneDNN {}; +TEST_P(conv_fp32_activation_hswish_onednn, basic) { + auto p = GetParam(); + create_topologies( + input_layout("input", get_input_layout(p)), + data("weights", get_mem(get_weights_layout(p))), + data("bias", get_mem(get_bias_layout(p))), + convolution("conv_prim", "input", { "weights" }, { "bias" }, p.groups, p.stride, p.pad, p.dilation), + activation("activation", "conv_prim", activation_func::hswish), + reorder("reorder_bfyx", "activation", p.default_format, data_types::f32) + ); + + tolerance = 1e-2f; + execute(p); +} + +INSTANTIATE_TEST_SUITE_P(fusings_gpu, conv_fp32_activation_hswish_onednn, ::testing::ValuesIn(std::vector{ + convolution_test_params{ CASE_CONV_FP16_1, 2, 3 }, + convolution_test_params{ CASE_CONV_FP16_2, 2, 3 }, + convolution_test_params{ CASE_CONV_FP16_3, 2, 3 }, + convolution_test_params{ CASE_CONV_FP16_4, 2, 3 }, +})); + +class conv_fp32_activation_exp_onednn : public WeightsPrimitiveFusingTestOneDNN {}; +TEST_P(conv_fp32_activation_exp_onednn, basic) { + auto p = GetParam(); + create_topologies( + 
input_layout("input", get_input_layout(p)), + data("weights", get_mem(get_weights_layout(p))), + data("bias", get_mem(get_bias_layout(p))), + convolution("conv_prim", "input", { "weights" }, { "bias" }, p.groups, p.stride, p.pad, p.dilation), + activation("activation", "conv_prim", activation_func::exp), + reorder("reorder_bfyx", "activation", p.default_format, data_types::f32) + ); + + tolerance = 1e-2f; + execute(p); +} + +INSTANTIATE_TEST_SUITE_P(fusings_gpu, conv_fp32_activation_exp_onednn, ::testing::ValuesIn(std::vector{ + convolution_test_params{ CASE_CONV_FP16_1, 2, 3 }, + convolution_test_params{ CASE_CONV_FP16_2, 2, 3 }, + convolution_test_params{ CASE_CONV_FP16_3, 2, 3 }, + convolution_test_params{ CASE_CONV_FP16_4, 2, 3 }, +})); + +class conv_int8_quantize_u8_onednn : public WeightsPrimitiveFusingTestOneDNN {}; +TEST_P(conv_int8_quantize_u8_onednn, per_channel) { + auto p = GetParam(); + create_topologies( + input_layout("input", get_input_layout(p)), + data("weights", get_mem(get_weights_layout(p), -2, 2)), + data("bias", get_mem(get_bias_layout(p))), + data("in_lo", get_mem(get_per_channel_layout(p), -10, 0)), + data("in_hi", get_mem(get_per_channel_layout(p), 0, 10)), + data("out_lo", get_mem(get_single_element_layout(p), 0)), + data("out_hi", get_mem(get_single_element_layout(p), 255)), + convolution("conv_prim", "input", { "weights" }, { "bias" }, p.groups, p.stride, p.pad, p.dilation), + quantize("quantize", "conv_prim", "in_lo", "in_hi", "out_lo", "out_hi", 256, data_types::u8), + reorder("reorder_bfyx", "quantize", p.default_format, data_types::f32) + ); + + tolerance = 1.0f; + execute(p); +} + +TEST_P(conv_int8_quantize_u8_onednn, per_tensor) { + auto p = GetParam(); + create_topologies( + input_layout("input", get_input_layout(p)), + data("weights", get_mem(get_weights_layout(p), -2, 2)), + data("bias", get_mem(get_bias_layout(p), 0)), + data("in_lo", get_mem(get_single_element_layout(p), -10)), + data("in_hi", 
get_mem(get_single_element_layout(p), 10)), + data("out_lo", get_mem(get_single_element_layout(p), 0)), + data("out_hi", get_mem(get_single_element_layout(p), 255)), + convolution("conv_prim", "input", { "weights" }, { "bias" }, p.groups, p.stride, p.pad, p.dilation), + quantize("quantize", "conv_prim", "in_lo", "in_hi", "out_lo", "out_hi", 256, data_types::u8), + reorder("reorder_bfyx", "quantize", p.default_format, data_types::f32) + ); + + tolerance = 1.0f; + execute(p); +} + +INSTANTIATE_TEST_SUITE_P(fusings_gpu, conv_int8_quantize_u8_onednn, ::testing::ValuesIn(std::vector{ + convolution_test_params{ CASE_CONV_U8S8_1, 2, 3 }, + convolution_test_params{ CASE_CONV_U8S8_2, 2, 3 }, + convolution_test_params{ CASE_CONV_U8S8_3, 2, 3 }, + convolution_test_params{ CASE_CONV_S8S8_1, 2, 3 }, + convolution_test_params{ CASE_CONV_S8S8_2, 2, 3 }, + convolution_test_params{ CASE_CONV_S8S8_3, 2, 3 }, +})); + +class conv_int8_activation_eltwise_quantize_onednn : public WeightsPrimitiveFusingTestOneDNN {}; +TEST_P(conv_int8_activation_eltwise_quantize_onednn, bsv32_fsv32) { + auto p = GetParam(); + layout eltwise_layout = get_output_layout(p); + eltwise_layout.format = format::bs_fs_yx_bsv32_fsv32; + create_topologies( + input_layout("input", get_input_layout(p)), + data("weights", get_mem(get_weights_layout(p), -1, 1)), + data("bias", get_mem(get_bias_layout(p))), + data("eltwise_data", get_mem(eltwise_layout, -0.5, 0.5)), + data("in_lo", get_mem(get_per_channel_layout(p), min_random, 0)), + data("in_hi", get_mem(get_per_channel_layout(p), 1, max_random)), + data("out_lo", get_mem(get_single_element_layout(p), -127)), + data("out_hi", get_mem(get_single_element_layout(p), 127)), + convolution("conv_prim", "input", { "weights" }, { "bias" }, p.groups, p.stride, p.pad, p.dilation), + activation("activation", "conv_prim", activation_func::abs), + eltwise("eltwise", "activation", "eltwise_data", eltwise_mode::sum), + quantize("quantize", "eltwise", "in_lo", "in_hi", "out_lo", 
"out_hi", 255, data_types::i8), + reorder("reorder_bfyx", "quantize", p.default_format, data_types::f32) + ); + + implementation_desc conv_impl = { format::bs_fs_yx_bsv32_fsv32, "", impl_types::onednn }; + bo_fused.set_option(build_option::force_implementations({ { "conv_prim", conv_impl } })); + + tolerance = 1.f; + execute(p); +} + +INSTANTIATE_TEST_SUITE_P(fusings_gpu, conv_int8_activation_eltwise_quantize_onednn, ::testing::ValuesIn(std::vector{ + convolution_test_params{ CASE_CONV_U8S8_1, 2, 5 }, + convolution_test_params{ CASE_CONV_U8S8_2, 2, 5 }, + convolution_test_params{ CASE_CONV_U8S8_3, 2, 5 }, + convolution_test_params{ CASE_CONV_U8S8_4, 2, 5 }, + convolution_test_params{ CASE_CONV_U8S8_7, 2, 5 }, + convolution_test_params{ CASE_CONV_U8S8_8, 2, 5 }, + convolution_test_params{ CASE_CONV_U8S8_11, 2, 5 }, + convolution_test_params{ CASE_CONV_U8S8_12, 2, 5 }, + convolution_test_params{ CASE_CONV_U8S8_13, 2, 5 }, + convolution_test_params{ CASE_CONV_U8S8_14, 2, 5 }, + + convolution_test_params{ CASE_CONV_S8S8_1, 2, 5 }, + convolution_test_params{ CASE_CONV_S8S8_2, 2, 5 }, + convolution_test_params{ CASE_CONV_S8S8_3, 2, 5 }, + convolution_test_params{ CASE_CONV_S8S8_4, 2, 5 }, + convolution_test_params{ CASE_CONV_S8S8_7, 2, 5 }, + convolution_test_params{ CASE_CONV_S8S8_8, 2, 5 }, + convolution_test_params{ CASE_CONV_S8S8_12, 2, 5 }, + convolution_test_params{ CASE_CONV_S8S8_13, 2, 5 }, + convolution_test_params{ CASE_CONV_S8S8_14, 2, 5 }, + convolution_test_params{ CASE_CONV_S8S8_15, 2, 5 }, +})); + +class conv_int8_scale_shift_swish_onednn : public WeightsPrimitiveFusingTestOneDNN {}; +TEST_P(conv_int8_scale_shift_swish_onednn, bsv32_fsv32) { + auto p = GetParam(); + create_topologies( + input_layout("input", get_input_layout(p)), + data("weights", get_mem(get_weights_layout(p), -1, 1)), + data("bias", get_mem(get_bias_layout(p))), + data("scale_data", get_mem(get_per_channel_layout(p), 1.0f/p.kernel.count())), + data("shift_data", 
get_mem(get_per_channel_layout(p), 1)), + convolution("conv_prim", "input", { "weights" }, { "bias" }, p.groups, p.stride, p.pad, p.dilation), + eltwise("scale0", { "conv_prim", "scale_data" }, eltwise_mode::sum), + eltwise("shift0", { "scale0", "shift_data" }, eltwise_mode::sum), + activation("sigmoid", "shift0", activation_func::swish), + eltwise("scale1", { "sigmoid", "scale_data" }, eltwise_mode::sum), + eltwise("shift1", { "scale1", "shift_data" }, eltwise_mode::sum), + reorder("reorder_bfyx", "shift1", p.default_format, data_types::f32) + ); + + implementation_desc conv_impl = { format::bs_fs_yx_bsv32_fsv32, "", impl_types::onednn }; + bo_fused.set_option(build_option::force_implementations({ { "conv_prim", conv_impl } })); + + tolerance = 1.f; + execute(p); +} + +INSTANTIATE_TEST_SUITE_P(fusings_gpu, conv_int8_scale_shift_swish_onednn, ::testing::ValuesIn(std::vector{ + convolution_test_params{ CASE_CONV_U8S8_1, 2, 7 }, + convolution_test_params{ CASE_CONV_U8S8_2, 2, 7 }, + convolution_test_params{ CASE_CONV_S8S8_1, 2, 7 }, + convolution_test_params{ CASE_CONV_S8S8_2, 2, 7 }, + + convolution_test_params{ CASE_CONV_U8S8_11, 2, 7 }, + convolution_test_params{ CASE_CONV_U8S8_12, 2, 7 }, + convolution_test_params{ CASE_CONV_U8S8_14, 2, 7 }, + convolution_test_params{ CASE_CONV_S8S8_12, 2, 7 }, + convolution_test_params{ CASE_CONV_S8S8_13, 2, 7 }, + convolution_test_params{ CASE_CONV_S8S8_15, 2, 7 }, +})); + +class conv_int8_eltwise_scale_onednn : public WeightsPrimitiveFusingTestOneDNN {}; +TEST_P(conv_int8_eltwise_scale_onednn, u8_eltwise_prod_out_reuse) { + auto p = GetParam(); + + create_topologies( + input_layout("input", get_input_layout(p)), + data("weights", get_mem(get_weights_layout(p), -2, 2)), + data("bias", get_mem(get_bias_layout(p))), + data("sum_data", get_mem(get_per_channel_layout(p), 1.0f/p.kernel.count())), + data("scale_data", get_mem(get_per_channel_layout(p), 1.0f/p.kernel.count())), + convolution("conv_prim", "input", { "weights" }, { 
"bias" }, p.groups, p.stride, p.pad, p.dilation, p.out_shape, data_types::f32, false), + eltwise("sum", { "conv_prim", "sum_data" }, eltwise_mode::sum, data_types::f32), + eltwise("scale", { "sum", "scale_data" }, eltwise_mode::prod, data_types::f32), + crop("crop", "scale", get_output_layout(p).size, { 0, 0, 0, 0 }), + reorder("reorder_bfyx", "crop", p.default_format, data_types::f32) + ); + + tolerance = 1.f; + + auto input_prim = get_mem(get_input_layout(p)); + + auto forcing_format = p.input_format; + implementation_desc conv_impl = { forcing_format, "", impl_types::onednn }; + bo_fused.set_option(build_option::force_implementations({ { "conv_prim", conv_impl } })); + + network network_not_fused(this->engine, this->topology_non_fused, bo_not_fused); + network network_fused(this->engine, this->topology_fused, bo_fused); + network_fused.set_input_data("input", input_prim); + network_not_fused.set_input_data("input", input_prim); + + // First network.execute() call + compare(network_not_fused, network_fused, p); + // Second network.execute() call to make sure that scales have not been wrongly overwritten within first iteration + // and don't affect final result of second iteration + compare(network_not_fused, network_fused, p); +} + +INSTANTIATE_TEST_SUITE_P(fusings_gpu, conv_int8_eltwise_scale_onednn, ::testing::ValuesIn(std::vector{ + convolution_test_params{ CASE_CONV_U8S8_15, 2, 5 }, +})); + +/* ----------------------------------------------------------------------------------------------------- */ +/* ------------------------------ OneDNN post-ops cases with optimizations ----------------------------- */ +/* ----------------------------------------------------------------------------------------------------- */ + +// Before optimization: eltw_linear + eltw_linear +// After optimization: eltw_linear +// Limitations: no +// DNNL_VERBOSE log without optimization: attr-post-ops:eltwise_linear:12.75:127.5+eltwise_linear:1:-128 +// DNNL_VERBOSE log with 
optimization: attr-post-ops:eltwise_linear:12.75:-0.5 +class post_ops_optimizations_onednn_eltw_linear_eltw_linear : public WeightsPrimitiveFusingTestOneDNN {}; +TEST_P(post_ops_optimizations_onednn_eltw_linear_eltw_linear, basic) { + auto p = GetParam(); + create_topologies( + input_layout("input", get_input_layout(p)), + data("weights", get_mem(get_weights_layout(p))), + data("bias", get_mem(get_bias_layout(p))), + data("in_lo", get_mem(get_single_element_layout(p), -10)), + data("in_hi", get_mem(get_single_element_layout(p), 10)), + data("out_lo", get_mem(get_single_element_layout(p), -128)), + data("out_hi", get_mem(get_single_element_layout(p), 127)), + convolution("conv_prim", "input", { "weights" }, { "bias" }, p.groups, p.stride, p.pad, p.dilation), + quantize("quantize", "conv_prim", "in_lo", "in_hi", "out_lo", "out_hi", 256, data_types::i8), + reorder("reorder_bfyx", "quantize", p.default_format, data_types::f32) + ); + + tolerance = 1.f; + execute(p); +} + +INSTANTIATE_TEST_SUITE_P(fusings_gpu, post_ops_optimizations_onednn_eltw_linear_eltw_linear, ::testing::ValuesIn(std::vector{ + // cases with batch = 1 + convolution_test_params{ CASE_CONV_U8S8_1, 2, 3 }, + convolution_test_params{ CASE_CONV_U8S8_2, 2, 3 }, + convolution_test_params{ CASE_CONV_U8S8_3, 2, 3 }, + convolution_test_params{ CASE_CONV_S8S8_1, 2, 3 }, + convolution_test_params{ CASE_CONV_S8S8_2, 2, 3 }, + convolution_test_params{ CASE_CONV_S8S8_3, 2, 3 }, + + // cases with batch = 16 + convolution_test_params{ CASE_CONV_U8S8_9, 2, 3 }, + convolution_test_params{ CASE_CONV_U8S8_10, 2, 3 }, + convolution_test_params{ CASE_CONV_S8S8_9, 2, 3 }, + convolution_test_params{ CASE_CONV_S8S8_10, 2, 3 }, + + // cases with batch = 32 + convolution_test_params{ CASE_CONV_U8S8_11, 2, 3 }, + convolution_test_params{ CASE_CONV_U8S8_12, 2, 3 }, + convolution_test_params{ CASE_CONV_U8S8_13, 2, 3 }, + convolution_test_params{ CASE_CONV_U8S8_14, 2, 3 }, + convolution_test_params{ CASE_CONV_S8S8_12, 2, 3 }, + 
convolution_test_params{ CASE_CONV_S8S8_13, 2, 3 }, + convolution_test_params{ CASE_CONV_S8S8_14, 2, 3 }, + convolution_test_params{ CASE_CONV_S8S8_15, 2, 3 }, +})); + +// Before optimization: eltw_non_linear + eltw_linear +// After optimization: eltw_non_linear +// Limitations: beta = 0 in eltw_linear +// DNNL_VERBOSE log without optimization: attr-post-ops:eltwise_linear:12.75:127.5+eltwise_round+eltwise_linear:2.00784+eltwise_clip:0:512 +// DNNL_VERBOSE log with optimization: attr-post-ops:eltwise_linear:12.75:127.5+eltwise_round:0:0:2.00784+eltwise_clip:0:512 +class post_ops_optimizations_onednn_eltw_non_linear_eltw_linear : public WeightsPrimitiveFusingTestOneDNN {}; +TEST_P(post_ops_optimizations_onednn_eltw_non_linear_eltw_linear, basic) { + auto p = GetParam(); + create_topologies( + input_layout("input", get_input_layout(p)), + data("weights", get_mem(get_weights_layout(p))), + data("bias", get_mem(get_bias_layout(p))), + data("in_lo", get_mem(get_single_element_layout(p), -10)), + data("in_hi", get_mem(get_single_element_layout(p), 10)), + data("out_lo", get_mem(get_single_element_layout(p), 0)), + data("out_hi", get_mem(get_single_element_layout(p), 512)), + convolution("conv_prim", "input", { "weights" }, { "bias" }, p.groups, p.stride, p.pad, p.dilation), + quantize("quantize", "conv_prim", "in_lo", "in_hi", "out_lo", "out_hi", 256, data_types::f32), + reorder("reorder_bfyx", "quantize", p.default_format, data_types::f32) + ); + + tolerance = 1.f; + execute(p); +} + +INSTANTIATE_TEST_SUITE_P(fusings_gpu, post_ops_optimizations_onednn_eltw_non_linear_eltw_linear, ::testing::ValuesIn(std::vector{ + // cases with batch = 1 + convolution_test_params{ CASE_CONV_U8S8_1, 2, 3 }, + convolution_test_params{ CASE_CONV_U8S8_2, 2, 3 }, + convolution_test_params{ CASE_CONV_U8S8_3, 2, 3 }, + convolution_test_params{ CASE_CONV_S8S8_1, 2, 3 }, + convolution_test_params{ CASE_CONV_S8S8_2, 2, 3 }, + convolution_test_params{ CASE_CONV_S8S8_3, 2, 3 }, + + // cases with 
batch = 16 + convolution_test_params{ CASE_CONV_U8S8_9, 2, 3 }, + convolution_test_params{ CASE_CONV_U8S8_10, 2, 3 }, + convolution_test_params{ CASE_CONV_S8S8_9, 2, 3 }, + convolution_test_params{ CASE_CONV_S8S8_10, 2, 3 }, + + // cases with batch = 32 + convolution_test_params{ CASE_CONV_U8S8_11, 2, 3 }, + convolution_test_params{ CASE_CONV_U8S8_12, 2, 3 }, + convolution_test_params{ CASE_CONV_U8S8_13, 2, 3 }, + convolution_test_params{ CASE_CONV_U8S8_14, 2, 3 }, + convolution_test_params{ CASE_CONV_S8S8_12, 2, 3 }, + convolution_test_params{ CASE_CONV_S8S8_13, 2, 3 }, + convolution_test_params{ CASE_CONV_S8S8_14, 2, 3 }, + convolution_test_params{ CASE_CONV_S8S8_15, 2, 3 }, +})); + +// Before optimization: binary_add + eltw_linear +// After optimization: binary_add +// Limitations: alpha = 1 and scale = 1 in eltw_linear; binary_add is a constant compile-time buffer +// DNNL_VERBOSE log without optimization: attr-oscale:2 attr-post-ops:binary_add:f32:2+eltwise_linear:1:-127+eltwise_clip:-127:127 +// DNNL_VERBOSE log with optimization: attr-oscale:2 attr-post-ops:binary_add:f32:2+eltwise_clip:-127:127 +class post_ops_optimizations_onednn_binary_add_eltw_linear : public WeightsPrimitiveFusingTestOneDNN {}; +TEST_P(post_ops_optimizations_onednn_binary_add_eltw_linear, basic) { + auto p = GetParam(); + create_topologies( + input_layout("input", get_input_layout(p)), + data("weights", get_mem(get_weights_layout(p))), + data("bias", get_mem(get_bias_layout(p))), + data("in_lo", get_mem(get_per_channel_layout(p), min_random, 0)), + data("in_hi", get_mem(get_per_channel_layout(p), 1, max_random)), + data("out_lo", get_mem(get_single_element_layout(p), -127)), + data("out_hi", get_mem(get_single_element_layout(p), 127)), + convolution("conv_prim", "input", { "weights" }, { "bias" }, p.groups, p.stride, p.pad, p.dilation), + quantize("quantize", "conv_prim", "in_lo", "in_hi", "out_lo", "out_hi", 255, data_types::i8), + reorder("reorder_bfyx", "quantize", p.default_format, 
data_types::f32) + ); + + tolerance = 1.f; + execute(p); +} + +INSTANTIATE_TEST_SUITE_P(fusings_gpu, post_ops_optimizations_onednn_binary_add_eltw_linear, ::testing::ValuesIn(std::vector{ + // cases with batch = 1 + convolution_test_params{ CASE_CONV_U8S8_1, 2, 3 }, + convolution_test_params{ CASE_CONV_U8S8_2, 2, 3 }, + convolution_test_params{ CASE_CONV_U8S8_3, 2, 3 }, + convolution_test_params{ CASE_CONV_S8S8_1, 2, 3 }, + convolution_test_params{ CASE_CONV_S8S8_2, 2, 3 }, + convolution_test_params{ CASE_CONV_S8S8_3, 2, 3 }, + + // cases with batch = 16 + convolution_test_params{ CASE_CONV_U8S8_9, 2, 3 }, + convolution_test_params{ CASE_CONV_U8S8_10, 2, 3 }, + convolution_test_params{ CASE_CONV_S8S8_9, 2, 3 }, + convolution_test_params{ CASE_CONV_S8S8_10, 2, 3 }, + + // cases with batch = 32 + convolution_test_params{ CASE_CONV_U8S8_11, 2, 3 }, + convolution_test_params{ CASE_CONV_U8S8_12, 2, 3 }, + convolution_test_params{ CASE_CONV_U8S8_13, 2, 3 }, + convolution_test_params{ CASE_CONV_U8S8_14, 2, 3 }, + convolution_test_params{ CASE_CONV_S8S8_12, 2, 3 }, + convolution_test_params{ CASE_CONV_S8S8_13, 2, 3 }, + convolution_test_params{ CASE_CONV_S8S8_14, 2, 3 }, + convolution_test_params{ CASE_CONV_S8S8_15, 2, 3 }, +})); + +// Before optimization: binary_mul + eltw_linear +// After optimization: binary_mul +// Limitations: beta = 0 in eltw_linear; binary_mul is a constant compile-time buffer +// DNNL_VERBOSE log without optimization: attr-oscale:2 attr-post-ops:binary_mul:f32:2+eltwise_linear:2.01575+eltwise_clip:0:512 +// DNNL_VERBOSE log with optimization: attr-oscale:2 attr-post-ops:binary_mul:f32:2+eltwise_clip:0:512 +class post_ops_optimizations_onednn_binary_mul_eltw_linear : public WeightsPrimitiveFusingTestOneDNN {}; +TEST_P(post_ops_optimizations_onednn_binary_mul_eltw_linear, basic) { + auto p = GetParam(); + create_topologies( + input_layout("input", get_input_layout(p)), + data("weights", get_mem(get_weights_layout(p))), + data("bias", 
get_mem(get_bias_layout(p))), + data("eltwise_data", get_mem(get_per_channel_layout(p), -1, 1)), + data("in_lo", get_mem(get_per_channel_layout(p), 0)), + data("in_hi", get_mem(get_per_channel_layout(p), 1, max_random)), + data("out_lo", get_mem(get_single_element_layout(p), 0)), + data("out_hi", get_mem(get_single_element_layout(p), 512)), + convolution("conv_prim", "input", { "weights" }, { "bias" }, p.groups, p.stride, p.pad, p.dilation), + eltwise("eltwise", { "conv_prim", "eltwise_data" }, eltwise_mode::prod), + quantize("quantize", "eltwise", "in_lo", "in_hi", "out_lo", "out_hi", 255, data_types::i8), + reorder("reorder_bfyx", "quantize", p.default_format, data_types::f32) + ); + + tolerance = 1.f; + execute(p); +} + +INSTANTIATE_TEST_SUITE_P(fusings_gpu, post_ops_optimizations_onednn_binary_mul_eltw_linear, ::testing::ValuesIn(std::vector{ + // cases with batch = 1 + convolution_test_params{ CASE_CONV_U8S8_1, 2, 4 }, + convolution_test_params{ CASE_CONV_U8S8_2, 2, 4 }, + convolution_test_params{ CASE_CONV_U8S8_3, 2, 4 }, + convolution_test_params{ CASE_CONV_S8S8_1, 2, 4 }, + convolution_test_params{ CASE_CONV_S8S8_2, 2, 4 }, + convolution_test_params{ CASE_CONV_S8S8_3, 2, 4 }, + + // cases with batch = 16 + convolution_test_params{ CASE_CONV_U8S8_9, 2, 4 }, + convolution_test_params{ CASE_CONV_U8S8_10, 2, 4 }, + convolution_test_params{ CASE_CONV_S8S8_9, 2, 4 }, + convolution_test_params{ CASE_CONV_S8S8_10, 2, 4 }, + + // cases with batch = 32 + convolution_test_params{ CASE_CONV_U8S8_11, 2, 4 }, + convolution_test_params{ CASE_CONV_U8S8_12, 2, 4 }, + convolution_test_params{ CASE_CONV_U8S8_13, 2, 4 }, + convolution_test_params{ CASE_CONV_S8S8_12, 2, 4 }, + convolution_test_params{ CASE_CONV_S8S8_13, 2, 4 }, + convolution_test_params{ CASE_CONV_S8S8_14, 2, 4 }, +})); + +// Before optimization: o_scale + eltw_linear +// After optimization: o_scale +// Limitations: beta = 0 in eltw_linear +// DNNL_VERBOSE log without optimization: attr-oscale:2 
attr-post-ops:eltwise_linear:2.01575+eltwise_clip:0:512 +// DNNL_VERBOSE log with optimization: attr-oscale:2 attr-post-ops:eltwise_clip:0:512 +class post_ops_optimizations_onednn_oscale_eltw_linear : public WeightsPrimitiveFusingTestOneDNN {}; +TEST_P(post_ops_optimizations_onednn_oscale_eltw_linear, basic) { + auto p = GetParam(); + create_topologies( + input_layout("input", get_input_layout(p)), + data("weights", get_mem(get_weights_layout(p))), + data("bias", get_mem(get_bias_layout(p))), + data("in_lo", get_mem(get_per_channel_layout(p), 0)), + data("in_hi", get_mem(get_per_channel_layout(p), 1, max_random)), + data("out_lo", get_mem(get_single_element_layout(p), 0)), + data("out_hi", get_mem(get_single_element_layout(p), 512)), + convolution("conv_prim", "input", { "weights" }, { "bias" }, p.groups, p.stride, p.pad, p.dilation), + quantize("quantize", "conv_prim", "in_lo", "in_hi", "out_lo", "out_hi", 255, data_types::i8), + reorder("reorder_bfyx", "quantize", p.default_format, data_types::f32) + ); + + tolerance = 1.f; + execute(p); +} + +INSTANTIATE_TEST_SUITE_P(fusings_gpu, post_ops_optimizations_onednn_oscale_eltw_linear, ::testing::ValuesIn(std::vector{ + // cases with batch = 1 + convolution_test_params{ CASE_CONV_U8S8_1, 2, 3 }, + convolution_test_params{ CASE_CONV_U8S8_2, 2, 3 }, + convolution_test_params{ CASE_CONV_U8S8_3, 2, 3 }, + convolution_test_params{ CASE_CONV_S8S8_1, 2, 3 }, + convolution_test_params{ CASE_CONV_S8S8_2, 2, 3 }, + convolution_test_params{ CASE_CONV_S8S8_3, 2, 3 }, + + // cases with batch = 16 + convolution_test_params{ CASE_CONV_U8S8_9, 2, 3 }, + convolution_test_params{ CASE_CONV_U8S8_10, 2, 3 }, + convolution_test_params{ CASE_CONV_S8S8_9, 2, 3 }, + convolution_test_params{ CASE_CONV_S8S8_10, 2, 3 }, + + // cases with batch = 32 + convolution_test_params{ CASE_CONV_U8S8_11, 2, 3 }, + convolution_test_params{ CASE_CONV_U8S8_12, 2, 3 }, + convolution_test_params{ CASE_CONV_U8S8_13, 2, 3 }, + convolution_test_params{ 
CASE_CONV_S8S8_12, 2, 3 }, + convolution_test_params{ CASE_CONV_S8S8_13, 2, 3 }, + convolution_test_params{ CASE_CONV_S8S8_14, 2, 3 }, +})); + +// Before optimization: eltw_any + sum + eltw_linear +// After optimization: eltw_any + sum +// Limitations: beta = 0 in eltw_linear +// DNNL_VERBOSE log without optimization: attr-post-ops:eltwise_relu+sum:1:0:u8+eltwise_linear:12.7+eltwise_clip:0:127 +// DNNL_VERBOSE log with optimization: attr-post-ops:eltwise_relu:0:0:12.7+sum:12.7:0:u8+eltwise_clip:0:127 +class post_ops_optimizations_onednn_eltw_any_sum_eltw_linear : public WeightsPrimitiveFusingTestOneDNN {}; +TEST_P(post_ops_optimizations_onednn_eltw_any_sum_eltw_linear, basic) { + auto p = GetParam(); + create_topologies( + input_layout("input", get_input_layout(p)), + data("weights", get_mem(get_weights_layout(p))), + data("bias", get_mem(get_bias_layout(p))), + data("in_lo", get_mem(get_single_element_layout(p), 0)), + data("in_hi", get_mem(get_single_element_layout(p), 10)), + data("out_lo", get_mem(get_single_element_layout(p), 0)), + data("out_hi", get_mem(get_single_element_layout(p), 127)), + data("eltwise_data", get_mem(get_output_layout(p))), + convolution("conv_prim", "input", { "weights" }, { "bias" }, p.groups, p.stride, p.pad, p.dilation), + activation("activation", "conv_prim", activation_func::relu_negative_slope), + eltwise("sum", { "activation", "eltwise_data" }, eltwise_mode::sum), + quantize("quantize", "sum", "in_lo", "in_hi", "out_lo", "out_hi", 128, data_types::u8), + reorder("reorder_bfyx", "quantize", p.default_format, data_types::f32) + ); + + tolerance = 1.f; + execute(p); +} + +INSTANTIATE_TEST_SUITE_P(fusings_gpu, post_ops_optimizations_onednn_eltw_any_sum_eltw_linear, ::testing::ValuesIn(std::vector{ + // cases with batch = 1 + convolution_test_params{ CASE_CONV_U8S8_1, 2, 5 }, + convolution_test_params{ CASE_CONV_U8S8_2, 2, 5 }, + convolution_test_params{ CASE_CONV_U8S8_3, 2, 5 }, + convolution_test_params{ CASE_CONV_S8S8_1, 2, 5 }, + 
convolution_test_params{ CASE_CONV_S8S8_2, 2, 5 }, + convolution_test_params{ CASE_CONV_S8S8_3, 2, 5 }, + + // cases with batch = 16 + convolution_test_params{ CASE_CONV_U8S8_10, 2, 5 }, + convolution_test_params{ CASE_CONV_S8S8_10, 2, 5 }, + + // cases with batch = 32 + convolution_test_params{ CASE_CONV_U8S8_11, 2, 5 }, + convolution_test_params{ CASE_CONV_U8S8_12, 2, 5 }, + convolution_test_params{ CASE_CONV_U8S8_13, 2, 5 }, + convolution_test_params{ CASE_CONV_U8S8_14, 2, 5 }, + convolution_test_params{ CASE_CONV_S8S8_12, 2, 5 }, + convolution_test_params{ CASE_CONV_S8S8_13, 2, 5 }, + convolution_test_params{ CASE_CONV_S8S8_14, 2, 5 }, + convolution_test_params{ CASE_CONV_S8S8_15, 2, 5 }, +})); + +// Input range uses in 2 cases: not per-tensor output range or out_lo > out_hi +// Here's out_lo > out_hi and no optimizations +// DNNL_VERBOSE log: attr-post-ops:eltwise_linear:12.75:127.5+eltwise_round+eltwise_linear:-1:127 +class post_ops_optimizations_input_range : public WeightsPrimitiveFusingTestOneDNN {}; +TEST_P(post_ops_optimizations_input_range, basic) { + auto p = GetParam(); + create_topologies( + input_layout("input", get_input_layout(p)), + data("weights", get_mem(get_weights_layout(p))), + data("bias", get_mem(get_bias_layout(p))), + data("in_lo", get_mem(get_single_element_layout(p), -10)), + data("in_hi", get_mem(get_single_element_layout(p), 10)), + data("out_lo", get_mem(get_single_element_layout(p), 127)), + data("out_hi", get_mem(get_single_element_layout(p), -128)), + convolution("conv_prim", "input", { "weights" }, { "bias" }, p.groups, p.stride, p.pad, p.dilation), + quantize("quantize", "conv_prim", "in_lo", "in_hi", "out_lo", "out_hi", 256, data_types::i8), + reorder("reorder_bfyx", "quantize", p.default_format, data_types::f32) + ); + + tolerance = 1.f; + execute(p); +} + +INSTANTIATE_TEST_SUITE_P(fusings_gpu, post_ops_optimizations_input_range, ::testing::ValuesIn(std::vector{ + // cases with batch = 1 + convolution_test_params{ 
CASE_CONV_U8S8_1, 2, 3 }, + convolution_test_params{ CASE_CONV_U8S8_2, 2, 3 }, + convolution_test_params{ CASE_CONV_U8S8_3, 2, 3 }, + convolution_test_params{ CASE_CONV_S8S8_1, 2, 3 }, + convolution_test_params{ CASE_CONV_S8S8_2, 2, 3 }, + convolution_test_params{ CASE_CONV_S8S8_3, 2, 3 }, + + // cases with batch = 16 + convolution_test_params{ CASE_CONV_U8S8_9, 2, 3 }, + convolution_test_params{ CASE_CONV_U8S8_10, 2, 3 }, + convolution_test_params{ CASE_CONV_S8S8_9, 2, 3 }, + convolution_test_params{ CASE_CONV_S8S8_10, 2, 3 }, + + // cases with batch = 32 + convolution_test_params{ CASE_CONV_U8S8_11, 2, 3 }, + convolution_test_params{ CASE_CONV_U8S8_12, 2, 3 }, + convolution_test_params{ CASE_CONV_U8S8_13, 2, 3 }, + convolution_test_params{ CASE_CONV_U8S8_14, 2, 3 }, + convolution_test_params{ CASE_CONV_S8S8_12, 2, 3 }, + convolution_test_params{ CASE_CONV_S8S8_13, 2, 3 }, + convolution_test_params{ CASE_CONV_S8S8_14, 2, 3 }, + convolution_test_params{ CASE_CONV_S8S8_15, 2, 3 }, +})); + +#endif // ENABLE_ONEDNN_FOR_GPU diff --git a/src/plugins/intel_gpu/tests/fusions/deconvolution_fusion_test.cpp b/src/plugins/intel_gpu/tests/fusions/deconvolution_fusion_test.cpp new file mode 100644 index 00000000000..a4a3e50dd28 --- /dev/null +++ b/src/plugins/intel_gpu/tests/fusions/deconvolution_fusion_test.cpp @@ -0,0 +1,819 @@ +// Copyright (C) 2018-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "fusion_test_common.hpp" + +#include +#include +#include +#include +#include + +#include + +using namespace cldnn; +using namespace ::tests; + +namespace { + +struct deconv_test_params { + tensor in_shape; + tensor out_shape; + tensor kernel; + tensor stride; + tensor pad; + tensor dilation; + uint32_t groups; + data_types data_type; + format input_format; + data_types weights_type; + format weights_format; + data_types default_type; + format default_format; + size_t expected_fused_primitives; + size_t expected_not_fused_primitives; +}; + +struct 
deconv_eltw_test_params { + tensor in_shape; + tensor out_shape; + tensor eltw_shape; + tensor kernel; + tensor stride; + tensor pad; + tensor dilation; + uint32_t groups; + data_types data_type; + format input_format; + data_types weights_type; + format weights_format; + data_types default_type; + format default_format; + size_t expected_fused_primitives; + size_t expected_not_fused_primitives; +}; + +class DeconvolutionFusingTest : public ::BaseFusingTest { +public: + void execute(deconv_test_params& p) { + auto input_prim = get_mem(get_input_layout(p)); + network network_not_fused(this->engine, this->topology_non_fused, bo_not_fused); + network network_fused(this->engine, this->topology_fused, bo_fused); + network_fused.set_input_data("input", input_prim); + network_not_fused.set_input_data("input", input_prim); + + compare(network_not_fused, network_fused, p); + auto find_conv = [](primitive_info& p) -> bool { + if (p.original_id == "deconv") + return true; + return false; + }; + + auto pi_fused = network_fused.get_primitives_info(); + auto info_fused = std::find_if(pi_fused.begin(), pi_fused.end(), find_conv); + if (info_fused != pi_fused.end()) + std::cout << "kernel: " << info_fused->kernel_id << std::endl; + } + + layout get_input_layout(deconv_test_params& p) { + auto pad = p.pad; + std::vector pad_ = { 0, 0, pad.spatial[0], pad.spatial[1] }; + return layout{ p.data_type, p.input_format, p.in_shape, padding{ pad_ } }; + } + + layout get_per_channel_layout(deconv_test_params& p) { + return layout{ p.default_type, p.default_format, tensor{1, p.out_shape.feature[0], 1, 1} }; + } +}; + +class ConvEltwTest : public ::BaseFusingTest { +public: + + void execute(deconv_eltw_test_params& p) { + auto input_prim = get_mem(get_input_layout(p)); + network network_not_fused(this->engine, this->topology_non_fused, bo_not_fused); + network network_fused(this->engine, this->topology_fused, bo_fused); + network_fused.set_input_data("input", input_prim); + 
network_not_fused.set_input_data("input", input_prim); + + compare(network_not_fused, network_fused, p); + auto find_prim = [](primitive_info& p) -> bool { + // Add more ids when needed + if (p.original_id == "deconv_prim") + return true; + return false; + }; + + auto pi_fused = network_fused.get_primitives_info(); + auto info_fused = std::find_if(pi_fused.begin(), pi_fused.end(), find_prim); + if (info_fused != pi_fused.end()) + std::cout << "kernel: " << info_fused->kernel_id << std::endl; + } + + layout get_input_layout(deconv_eltw_test_params& p) { + auto pad = p.pad; + std::vector pad_ = { 0, 0, pad.spatial[0], pad.spatial[1] }; + return layout{ p.data_type, p.input_format, p.in_shape, padding{ pad_ } }; + } + + layout get_per_channel_layout(deconv_eltw_test_params& p) { + return layout{ p.default_type, p.default_format, tensor{ 1, p.out_shape.feature[0], 1, 1 } }; + } +}; +} // namespace + +/* ----------------------------------------------------------------------------------------------------- */ +/* --------------------------------------- Deconvolution cases ----------------------------------------- */ +/* ----------------------------------------------------------------------------------------------------- */ + +// in_shape; out_shape; kernel; stride; pad; dilation; groups; data_type; input_format; weights_type; weights_format; default_type; default_format; +#define CASE_DECONV_FP32_1 { 1, 15, 4, 5 }, { 1, 30, 6, 7 }, { 1, 1, 3, 3 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::f32, format::bfyx, data_types::f32, format::oiyx, data_types::f32, format::bfyx +#define CASE_DECONV_FP32_2 { 1, 16, 4, 5 }, { 1, 32, 6, 7 }, { 1, 1, 3, 3 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::f32, format::b_fs_yx_fsv16, data_types::f32, format::is_os_yx_isv16_osv16, data_types::f32, format::bfyx +#define CASE_DECONV_FP32_3 { 1, 16, 4, 5 }, { 1, 32, 4, 5 }, { 1, 1, 1, 1 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::f32, format::b_fs_yx_fsv16, 
data_types::f32, format::is_os_yx_isv16_osv16, data_types::f32, format::bfyx +#define CASE_DECONV_FP32_4 { 1, 32, 4, 5 }, { 1, 32, 4, 5 }, { 1, 1, 3, 3 }, tensor{ 1 }, tensor{ { 0, 0, 1, 1, 0, 0 }, 0 }, tensor{ 1 }, 32, data_types::f32, format::b_fs_yx_fsv16, data_types::f32, format::gs_oiyx_gsv16, data_types::f32, format::bfyx +#define CASE_DECONV_FP32_5 { 1, 15, 4, 5 }, { 1, 30, 9, 11 }, { 1, 1, 3, 3 }, tensor{ 1, 1, 2, 2 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::f32, format::bfyx, data_types::f32, format::oiyx, data_types::f32, format::bfyx +#define CASE_DECONV_FP32_6 { 1, 16, 4, 5 }, { 1, 32, 9, 11 }, { 1, 1, 3, 3 }, tensor{ 1, 1, 2, 2 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::f32, format::b_fs_yx_fsv16, data_types::f32, format::is_os_yx_isv16_osv16, data_types::f32, format::bfyx +#define CASE_DECONV_FP32_7 { 1, 16, 4, 5 }, { 1, 32, 7, 9 }, { 1, 1, 1, 1 }, tensor{ 1, 1, 2, 2 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::f32, format::b_fs_yx_fsv16, data_types::f32, format::is_os_yx_isv16_osv16, data_types::f32, format::bfyx +#define CASE_DECONV_FP32_8 { 1, 32, 4, 5 }, { 1, 32, 7, 9 }, { 1, 1, 3, 3 }, tensor{ 1, 1, 2, 2 }, tensor{ { 0, 0, 1, 1, 0, 0 }, 0 }, tensor{ 1 }, 32, data_types::f32, format::b_fs_yx_fsv16, data_types::f32, format::gs_oiyx_gsv16, data_types::f32, format::bfyx + +#define CASE_DECONV_FP16_1 { 1, 15, 4, 5 }, { 1, 30, 6, 7 }, { 1, 1, 3, 3 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::f16, format::bfyx, data_types::f16, format::oiyx, data_types::f16, format::bfyx +#define CASE_DECONV_FP16_2 { 1, 16, 4, 5 }, { 1, 32, 6, 7 }, { 1, 1, 3, 3 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::f16, format::b_fs_yx_fsv16, data_types::f16, format::is_os_yx_isv16_osv16, data_types::f16, format::bfyx +#define CASE_DECONV_FP16_3 { 1, 16, 4, 5 }, { 1, 32, 4, 5 }, { 1, 1, 1, 1 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::f16, format::b_fs_yx_fsv16, data_types::f16, format::is_os_yx_isv16_osv16, data_types::f16, format::bfyx 
+#define CASE_DECONV_FP16_4 { 1, 32, 4, 5 }, { 1, 32, 4, 5 }, { 1, 1, 3, 3 }, tensor{ 1 }, tensor{ { 0, 0, 1, 1, 0, 0 }, 0 }, tensor{ 1 }, 32, data_types::f16, format::b_fs_yx_fsv16, data_types::f16, format::gs_oiyx_gsv16, data_types::f16, format::bfyx +#define CASE_DECONV_FP16_5 { 1, 15, 4, 5 }, { 1, 30, 9, 11 }, { 1, 1, 3, 3 }, tensor{ 1, 1, 2, 2 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::f16, format::bfyx, data_types::f16, format::oiyx, data_types::f16, format::bfyx +#define CASE_DECONV_FP16_6 { 1, 16, 4, 5 }, { 1, 32, 9, 11 }, { 1, 1, 3, 3 }, tensor{ 1, 1, 2, 2 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::f16, format::b_fs_yx_fsv16, data_types::f16, format::is_os_yx_isv16_osv16, data_types::f16, format::bfyx +#define CASE_DECONV_FP16_7 { 1, 16, 4, 5 }, { 1, 32, 7, 9 }, { 1, 1, 1, 1 }, tensor{ 1, 1, 2, 2 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::f16, format::b_fs_yx_fsv16, data_types::f16, format::is_os_yx_isv16_osv16, data_types::f16, format::bfyx +#define CASE_DECONV_FP16_8 { 1, 32, 4, 5 }, { 1, 32, 7, 9 }, { 1, 1, 3, 3 }, tensor{ 1, 1, 2, 2 }, tensor{ { 0, 0, 1, 1, 0, 0 }, 0 }, tensor{ 1 }, 32, data_types::f16, format::b_fs_yx_fsv16, data_types::f16, format::gs_oiyx_gsv16, data_types::f16, format::bfyx + +#define CASE_DECONV_S8S8_1 { 1, 15, 4, 5 }, { 1, 30, 6, 7 }, { 1, 1, 3, 3 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::i8, format::bfyx, data_types::i8, format::oiyx, data_types::f32, format::bfyx +#define CASE_DECONV_S8S8_2 { 1, 16, 4, 5 }, { 1, 32, 6, 7 }, { 1, 1, 3, 3 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::i8, format::b_fs_yx_fsv16, data_types::i8, format::oiyx, data_types::f32, format::bfyx +#define CASE_DECONV_S8S8_3 { 1, 16, 4, 5 }, { 1, 32, 4, 5 }, { 1, 1, 1, 1 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::i8, format::b_fs_yx_fsv16, data_types::i8, format::oiyx, data_types::f32, format::bfyx +#define CASE_DECONV_S8S8_4 { 1, 32, 4, 5 }, { 1, 32, 4, 5 }, { 1, 1, 3, 3 }, tensor{ 1 }, tensor{ { 0, 0, 1, 1 }, 0 }, 
tensor{ 1 }, 32, data_types::i8, format::b_fs_yx_fsv16, data_types::i8, format::goiyx, data_types::f32, format::bfyx +#define CASE_DECONV_S8S8_5 { 1, 15, 4, 5 }, { 1, 30, 9, 11 }, { 1, 1, 3, 3 }, tensor{ 1, 1, 2, 2 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::i8, format::bfyx, data_types::i8, format::oiyx, data_types::f32, format::bfyx +#define CASE_DECONV_S8S8_6 { 1, 16, 4, 5 }, { 1, 32, 9, 11 }, { 1, 1, 3, 3 }, tensor{ 1, 1, 2, 2 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::i8, format::b_fs_yx_fsv16, data_types::i8, format::oiyx, data_types::f32, format::bfyx +#define CASE_DECONV_S8S8_7 { 1, 16, 4, 5 }, { 1, 32, 7, 9 }, { 1, 1, 1, 1 }, tensor{ 1, 1, 2, 2 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::i8, format::b_fs_yx_fsv16, data_types::i8, format::oiyx, data_types::f32, format::bfyx +#define CASE_DECONV_S8S8_8 { 1, 32, 4, 5 }, { 1, 32, 7, 9 }, { 1, 1, 3, 3 }, tensor{ 1, 1, 2, 2 }, tensor{ { 0, 0, 1, 1, 0, 0 }, 0 }, tensor{ 1 }, 32, data_types::i8, format::b_fs_yx_fsv16, data_types::i8, format::goiyx, data_types::f32, format::bfyx + +#define CASE_DECONV_U8S8_1 { 1, 15, 4, 5 }, { 1, 30, 6, 7 }, { 1, 1, 3, 3 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::u8, format::bfyx, data_types::i8, format::oiyx, data_types::f32, format::bfyx +#define CASE_DECONV_U8S8_2 { 1, 16, 4, 5 }, { 1, 32, 6, 7 }, { 1, 1, 3, 3 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::u8, format::b_fs_yx_fsv16, data_types::i8, format::oiyx, data_types::f32, format::bfyx +#define CASE_DECONV_U8S8_3 { 1, 16, 4, 5 }, { 1, 32, 4, 5 }, { 1, 1, 1, 1 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::u8, format::b_fs_yx_fsv16, data_types::i8, format::oiyx, data_types::f32, format::bfyx +#define CASE_DECONV_U8S8_4 { 1, 32, 4, 5 }, { 1, 32, 4, 5 }, { 1, 1, 3, 3 }, tensor{ 1 }, tensor{ { 0, 0, 1, 1 }, 0 }, tensor{ 1 }, 32, data_types::u8, format::b_fs_yx_fsv16, data_types::i8, format::goiyx, data_types::f32, format::bfyx +#define CASE_DECONV_U8S8_5 { 1, 15, 4, 5 }, { 1, 30, 9, 11 }, { 
1, 1, 3, 3 }, tensor{ 1, 1, 2, 2 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::u8, format::bfyx, data_types::i8, format::oiyx, data_types::f32, format::bfyx +#define CASE_DECONV_U8S8_6 { 1, 16, 4, 5 }, { 1, 32, 9, 11 }, { 1, 1, 3, 3 }, tensor{ 1, 1, 2, 2 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::u8, format::b_fs_yx_fsv16, data_types::i8, format::oiyx, data_types::f32, format::bfyx +#define CASE_DECONV_U8S8_7 { 1, 16, 4, 5 }, { 1, 32, 7, 9 }, { 1, 1, 1, 1 }, tensor{ 1, 1, 2, 2 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::u8, format::b_fs_yx_fsv16, data_types::i8, format::oiyx, data_types::f32, format::bfyx +#define CASE_DECONV_U8S8_8 { 1, 32, 4, 5 }, { 1, 32, 7, 9 }, { 1, 1, 3, 3 }, tensor{ 1, 1, 2, 2 }, tensor{ { 0, 0, 1, 1, 0, 0 }, 0 }, tensor{ 1 }, 32, data_types::u8, format::b_fs_yx_fsv16, data_types::i8, format::goiyx, data_types::f32, format::bfyx + + +// 3D +// in_shape; out_shape; kernel; stride; pad; dilation; groups; data_type; input_format; weights_type; weights_format; default_type; default_format; +#define CASE_DECONV_FP32_3D_1 { 1, 15, 4, 5, 3 }, { 1, 30, 6, 7, 5 }, { 1, 1, 3, 3, 3 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::f32, format::bfzyx, data_types::f32, format::oizyx, data_types::f32, format::bfzyx +#define CASE_DECONV_FP32_3D_2 { 1, 16, 4, 5, 3 }, { 1, 32, 6, 7, 5 }, { 1, 1, 3, 3, 3 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::f32, format::b_fs_zyx_fsv16, data_types::f32, format::is_os_zyx_isv16_osv16, data_types::f32, format::bfzyx +#define CASE_DECONV_FP32_3D_3 { 1, 16, 4, 5, 3 }, { 1, 32, 4, 5, 3 }, { 1, 1, 1, 1, 1 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::f32, format::b_fs_zyx_fsv16, data_types::f32, format::is_os_zyx_isv16_osv16, data_types::f32, format::bfzyx +#define CASE_DECONV_FP32_3D_4 { 1, 32, 4, 5, 3 }, { 1, 32, 4, 5, 3 }, { 1, 1, 3, 3, 3 }, tensor{ 1 }, tensor{ { 0, 0, 1, 1, 1 }, 0 }, tensor{ 1 }, 32, data_types::f32, format::b_fs_zyx_fsv16, data_types::f32, format::gs_oizyx_gsv16, 
data_types::f32, format::bfzyx +#define CASE_DECONV_FP32_3D_5 { 1, 15, 4, 5, 3 }, { 1, 30, 9, 11, 7 }, { 1, 1, 3, 3, 3 }, tensor{ 1, 1, 2, 2, 2 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::f32, format::bfzyx, data_types::f32, format::oizyx, data_types::f32, format::bfzyx +#define CASE_DECONV_FP32_3D_6 { 1, 16, 4, 5, 3 }, { 1, 32, 9, 11, 7 }, { 1, 1, 3, 3, 3 }, tensor{ 1, 1, 2, 2, 2 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::f32, format::b_fs_zyx_fsv16, data_types::f32, format::is_os_zyx_isv16_osv16, data_types::f32, format::bfzyx +#define CASE_DECONV_FP32_3D_7 { 1, 16, 4, 5, 3 }, { 1, 32, 7, 9, 5 }, { 1, 1, 1, 1, 1 }, tensor{ 1, 1, 2, 2, 2 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::f32, format::b_fs_zyx_fsv16, data_types::f32, format::is_os_zyx_isv16_osv16, data_types::f32, format::bfzyx +#define CASE_DECONV_FP32_3D_8 { 1, 32, 4, 5, 3 }, { 1, 32, 7, 9, 5 }, { 1, 1, 3, 3, 3 }, tensor{ 1, 1, 2, 2, 2 }, tensor{ { 0, 0, 1, 1, 1 }, 0 }, tensor{ 1 }, 32, data_types::f32, format::b_fs_zyx_fsv16, data_types::f32, format::gs_oizyx_gsv16, data_types::f32, format::bfzyx +#define CASE_DECONV_FP32_3D_9 { 16, 16, 4, 5, 3 }, { 16, 32, 7, 9, 5 }, { 1, 1, 1, 1, 1 }, tensor{ 1, 1, 2, 2, 2 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::f32, format::bs_fs_zyx_bsv16_fsv16, data_types::f32, format::is_os_zyx_isv16_osv16, data_types::f32, format::bfzyx + +#define CASE_DECONV_FP16_3D_1 { 1, 15, 4, 5, 3 }, { 1, 30, 6, 7, 5 }, { 1, 1, 3, 3, 3 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::f16, format::bfzyx, data_types::f16, format::oizyx, data_types::f16, format::bfzyx +#define CASE_DECONV_FP16_3D_2 { 1, 16, 4, 5, 3 }, { 1, 32, 6, 7, 5 }, { 1, 1, 3, 3, 3 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::f16, format::b_fs_zyx_fsv16, data_types::f16, format::is_os_zyx_isv16_osv16, data_types::f16, format::bfzyx +#define CASE_DECONV_FP16_3D_3 { 1, 16, 4, 5, 3 }, { 1, 32, 4, 5, 3 }, { 1, 1, 1, 1, 1 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::f16, 
format::b_fs_zyx_fsv16, data_types::f16, format::is_os_zyx_isv16_osv16, data_types::f16, format::bfzyx +#define CASE_DECONV_FP16_3D_4 { 1, 32, 4, 5, 3 }, { 1, 32, 4, 5, 3 }, { 1, 1, 3, 3, 3 }, tensor{ 1 }, tensor{ { 0, 0, 1, 1, 1 }, 0 }, tensor{ 1 }, 32, data_types::f16, format::b_fs_zyx_fsv16, data_types::f16, format::gs_oizyx_gsv16, data_types::f16, format::bfzyx +#define CASE_DECONV_FP16_3D_5 { 1, 15, 4, 5, 3 }, { 1, 30, 9, 11, 7 }, { 1, 1, 3, 3, 3 }, tensor{ 1, 1, 2, 2, 2 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::f16, format::bfzyx, data_types::f16, format::oizyx, data_types::f16, format::bfzyx +#define CASE_DECONV_FP16_3D_6 { 1, 16, 4, 5, 3 }, { 1, 32, 9, 11, 7 }, { 1, 1, 3, 3, 3 }, tensor{ 1, 1, 2, 2, 2 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::f16, format::b_fs_zyx_fsv16, data_types::f16, format::is_os_zyx_isv16_osv16, data_types::f16, format::bfzyx +#define CASE_DECONV_FP16_3D_7 { 1, 16, 4, 5, 3 }, { 1, 32, 7, 9, 5 }, { 1, 1, 1, 1, 1 }, tensor{ 1, 1, 2, 2, 2 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::f16, format::b_fs_zyx_fsv16, data_types::f16, format::is_os_zyx_isv16_osv16, data_types::f16, format::bfzyx +#define CASE_DECONV_FP16_3D_8 { 1, 32, 4, 5, 3 }, { 1, 32, 7, 9, 5 }, { 1, 1, 3, 3, 3 }, tensor{ 1, 1, 2, 2, 2 }, tensor{ { 0, 0, 1, 1, 1 }, 0 }, tensor{ 1 }, 32, data_types::f16, format::b_fs_zyx_fsv16, data_types::f16, format::gs_oizyx_gsv16, data_types::f16, format::bfzyx +#define CASE_DECONV_FP16_3D_9 { 16, 16, 4, 5, 3 }, { 16, 32, 7, 9, 5 }, { 1, 1, 1, 1, 1 }, tensor{ 1, 1, 2, 2, 2 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::f16, format::bs_fs_zyx_bsv16_fsv16, data_types::f16, format::is_os_zyx_isv16_osv16, data_types::f16, format::bfzyx + +#define CASE_DECONV_S8S8_3D_1 { 1, 15, 4, 5, 3 }, { 1, 30, 6, 7, 5 }, { 1, 1, 3, 3, 3 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::i8, format::bfzyx, data_types::i8, format::oizyx, data_types::f32, format::bfzyx +#define CASE_DECONV_S8S8_3D_2 { 1, 16, 4, 5, 3 }, { 1, 32, 6, 7, 5 }, { 1, 1, 3, 3, 
3 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::i8, format::b_fs_zyx_fsv16, data_types::i8, format::oizyx, data_types::f32, format::bfzyx +#define CASE_DECONV_S8S8_3D_3 { 1, 16, 4, 5, 3 }, { 1, 32, 4, 5, 3 }, { 1, 1, 1, 1, 1 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::i8, format::b_fs_zyx_fsv16, data_types::i8, format::oizyx, data_types::f32, format::bfzyx +#define CASE_DECONV_S8S8_3D_4 { 1, 32, 4, 5, 3 }, { 1, 32, 4, 5, 3 }, { 1, 1, 3, 3, 3 }, tensor{ 1 }, tensor{ { 0, 0, 1, 1, 1 }, 0 }, tensor{ 1 }, 32, data_types::i8, format::b_fs_zyx_fsv16, data_types::i8, format::goizyx, data_types::f32, format::bfzyx +#define CASE_DECONV_S8S8_3D_5 { 1, 15, 4, 5, 3 }, { 1, 30, 9, 11, 7 }, { 1, 1, 3, 3, 3 }, tensor{ 1, 1, 2, 2, 2 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::i8, format::bfzyx, data_types::i8, format::oizyx, data_types::f32, format::bfzyx +#define CASE_DECONV_S8S8_3D_6 { 1, 16, 4, 5, 3 }, { 1, 32, 9, 11, 7 }, { 1, 1, 3, 3, 3 }, tensor{ 1, 1, 2, 2, 2 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::i8, format::b_fs_zyx_fsv16, data_types::i8, format::oizyx, data_types::f32, format::bfzyx +#define CASE_DECONV_S8S8_3D_7 { 1, 16, 4, 5, 3 }, { 1, 32, 7, 9, 5 }, { 1, 1, 1, 1, 1 }, tensor{ 1, 1, 2, 2, 2 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::i8, format::b_fs_zyx_fsv16, data_types::i8, format::oizyx, data_types::f32, format::bfzyx +#define CASE_DECONV_S8S8_3D_8 { 1, 32, 4, 5, 3 }, { 1, 32, 7, 9, 5 }, { 1, 1, 3, 3, 3 }, tensor{ 1, 1, 2, 2, 2 }, tensor{ { 0, 0, 1, 1, 1 }, 0 }, tensor{ 1 }, 32, data_types::i8, format::b_fs_zyx_fsv16, data_types::i8, format::goizyx, data_types::f32, format::bfzyx + +#define CASE_DECONV_U8S8_3D_1 { 1, 15, 4, 5, 3 }, { 1, 30, 6, 7, 5 }, { 1, 1, 3, 3, 3 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::u8, format::bfzyx, data_types::i8, format::oizyx, data_types::f32, format::bfzyx +#define CASE_DECONV_U8S8_3D_2 { 1, 16, 4, 5, 3 }, { 1, 32, 6, 7, 5 }, { 1, 1, 3, 3, 3 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 1, 
data_types::u8, format::b_fs_zyx_fsv16, data_types::i8, format::oizyx, data_types::f32, format::bfzyx +#define CASE_DECONV_U8S8_3D_3 { 1, 16, 4, 5, 3 }, { 1, 32, 4, 5, 3 }, { 1, 1, 1, 1, 1 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::u8, format::b_fs_zyx_fsv16, data_types::i8, format::oizyx, data_types::f32, format::bfzyx +#define CASE_DECONV_U8S8_3D_4 { 1, 32, 4, 5, 3 }, { 1, 32, 4, 5, 3 }, { 1, 1, 3, 3, 3 }, tensor{ 1 }, tensor{ { 0, 0, 1, 1, 1 }, 0 }, tensor{ 1 }, 32, data_types::u8, format::b_fs_zyx_fsv16, data_types::i8, format::goizyx, data_types::f32, format::bfzyx +#define CASE_DECONV_U8S8_3D_5 { 1, 15, 4, 5, 3 }, { 1, 30, 9, 11, 7 }, { 1, 1, 3, 3, 3 }, tensor{ 1, 1, 2, 2, 2 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::u8, format::bfzyx, data_types::i8, format::oizyx, data_types::f32, format::bfzyx +#define CASE_DECONV_U8S8_3D_6 { 1, 16, 4, 5, 3 }, { 1, 32, 9, 11, 7 }, { 1, 1, 3, 3, 3 }, tensor{ 1, 1, 2, 2, 2 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::u8, format::b_fs_zyx_fsv16, data_types::i8, format::oizyx, data_types::f32, format::bfzyx +#define CASE_DECONV_U8S8_3D_7 { 1, 16, 4, 5, 3 }, { 1, 32, 7, 9, 5 }, { 1, 1, 1, 1, 1 }, tensor{ 1, 1, 2, 2, 2 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::u8, format::b_fs_zyx_fsv16, data_types::i8, format::oizyx, data_types::f32, format::bfzyx +#define CASE_DECONV_U8S8_3D_8 { 1, 32, 4, 5, 3 }, { 1, 32, 7, 9, 5 }, { 1, 1, 3, 3, 3 }, tensor{ 1, 1, 2, 2, 2 }, tensor{ { 0, 0, 1, 1, 1 }, 0 }, tensor{ 1 }, 32, data_types::u8, format::b_fs_zyx_fsv16, data_types::i8, format::goizyx, data_types::f32, format::bfzyx + +#define CASE_DECONV_ELTW_FP32_1 { 1, 16, 4, 5 }, { 1, 32, 6, 7 }, { 1, 32, 1, 1 }, { 1, 1, 3, 3 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::f32, format::b_fs_yx_fsv16, data_types::f32, format::oiyx, data_types::f32, format::bfyx +#define CASE_DECONV_ELTW_FP32_2 { 1, 16, 4, 5 }, { 1, 32, 6, 7 }, { 1, 1, 1, 1 }, { 1, 1, 3, 3 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::f32, 
format::b_fs_yx_fsv16, data_types::f32, format::os_is_yx_isv16_osv16, data_types::f32, format::bfyx +#define CASE_DECONV_ELTW_FP32_3 { 1, 16, 4, 5 }, { 1, 32, 4, 5 }, { 1, 1, 1, 1 }, { 1, 1, 1, 1 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::f32, format::b_fs_yx_fsv16, data_types::f32, format::is_os_yx_isv16_osv16, data_types::f32, format::bfyx +#define CASE_DECONV_ELTW_FP32_4 { 1, 15, 4, 5, 3 }, { 1, 30, 6, 7, 5 }, { 1, 1, 6, 7, 5 }, { 1, 1, 3, 3, 3 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::f32, format::bfzyx, data_types::f32, format::oizyx, data_types::f32, format::bfzyx +#define CASE_DECONV_ELTW_FP32_5 { 1, 15, 4, 5, 4 }, { 1, 30, 6, 7, 6 }, { 1, 30, 6, 1, 6 }, { 1, 1, 3, 3, 3 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::f32, format::bfzyx, data_types::f32, format::oizyx, data_types::f32, format::bfzyx +#define CASE_DECONV_ELTW_FP32_6 { 1, 32, 2, 2, 2 }, { 1, 16, 4, 4, 4 }, { 1, 16, 1, 4, 1 }, { 1, 1, 3, 3, 3 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::f32, format::b_fs_zyx_fsv16, data_types::f32, format::os_is_zyx_isv16_osv16, data_types::f32, format::bfzyx +#define CASE_DECONV_ELTW_FP32_7 { 1, 16, 3, 5 }, { 1, 32, 5, 7 }, { 1, 32, 1, 7 }, { 1, 1, 3, 3 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::f32, format::b_fs_yx_fsv16, data_types::f32, format::os_is_yx_isv16_osv16, data_types::f32, format::bfyx +#define CASE_DECONV_ELTW_FP32_8 { 1, 32, 4, 5 }, { 1, 32, 7, 9 }, { 1, 32, 1, 1 }, { 1, 1, 3, 3 }, tensor{ 1, 1, 2, 2 }, tensor{ { 0, 0, 1, 1, 0, 0 }, 0 }, tensor{ 1 }, 32, data_types::f32, format::b_fs_yx_fsv16, data_types::f32, format::gs_oiyx_gsv16, data_types::f32, format::bfyx + +#define CASE_DECONV_ELTW_i8_1 { 1, 16, 3, 5 }, { 1, 32, 5, 7 }, { 1, 32, 5, 1 }, { 1, 1, 3, 3 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::i8, format::b_fs_yx_fsv16, data_types::i8, format::os_is_yx_osv16_isv16, data_types::f32, format::bfyx +#define CASE_DECONV_ELTW_i8_2 { 1, 32, 4, 5, 3 }, { 1, 32, 6, 7, 5 }, 
{ 1, 32, 1, 1, 1 }, { 1, 1, 3, 3, 3 }, tensor{ 1, 1, 2, 2, 2 }, tensor{ { 0, 0, 1, 1, 1 }, 0 }, tensor{ 1 }, 32, data_types::u8, format::b_fs_zyx_fsv16, data_types::i8, format::goizyx, data_types::f32, format::bfzyx +#define CASE_DECONV_ELTW_i8_3 { 1, 5, 5, 5, 5 }, { 1, 5, 5, 5, 5 }, { 1, 1, 1, 1, 1 }, { 1, 1, 1, 1, 1 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::i8, format::bfzyx, data_types::i8, format::oiyx, data_types::f32, format::bfzyx +#define CASE_DECONV_ELTW_i8_4 { 1, 16, 1, 4 }, { 1, 16, 1, 6 }, { 1, 16, 1, 1 }, { 1, 1, 3, 3 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::i8, format::b_fs_yx_fsv16, data_types::i8, format::os_is_yx_osv16_isv16, data_types::f32, format::bfyx +#define CASE_DECONV_ELTW_i8_5 { 1, 16, 2, 4 }, { 1, 16, 4, 6 }, { 1, 16, 4, 1 }, { 1, 1, 3, 3 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::i8, format::b_fs_yx_fsv16, data_types::i8, format::os_is_yx_osv16_isv16, data_types::f32, format::bfyx + +class deconv_actv : public DeconvolutionFusingTest {}; +TEST_P(deconv_actv, basic) { + auto p = GetParam(); + create_topologies( + input_layout("input", get_input_layout(p)), + data("weights", get_mem(get_weights_layout(p))), + deconvolution("deconv", "input", { "weights" }, p.groups, p.stride, p.pad), + activation("act", "deconv", activation_func::relu), + reorder("out", "act", p.default_format, data_types::f32) + ); + // Need much higher tolerance because of deconvolution -> convolution optimization + tolerance = 1.f; + execute(p); +} + +INSTANTIATE_TEST_SUITE_P(fusings_gpu, deconv_actv, ::testing::ValuesIn(std::vector{ + deconv_test_params{ CASE_DECONV_FP32_1, 2, 3 }, + deconv_test_params{ CASE_DECONV_FP32_2, 2, 3 }, + deconv_test_params{ CASE_DECONV_FP32_3, 2, 3 }, + deconv_test_params{ CASE_DECONV_FP32_4, 2, 3 }, + deconv_test_params{ CASE_DECONV_FP32_5, 2, 3 }, + deconv_test_params{ CASE_DECONV_FP32_6, 2, 3 }, + deconv_test_params{ CASE_DECONV_FP32_7, 2, 3 }, + deconv_test_params{ CASE_DECONV_FP32_8, 2, 3 }, 
+ + deconv_test_params{ CASE_DECONV_FP16_1, 2, 3 }, + deconv_test_params{ CASE_DECONV_FP16_2, 2, 3 }, + deconv_test_params{ CASE_DECONV_FP16_3, 2, 3 }, + deconv_test_params{ CASE_DECONV_FP16_4, 2, 3 }, + deconv_test_params{ CASE_DECONV_FP16_5, 2, 3 }, + deconv_test_params{ CASE_DECONV_FP16_6, 2, 3 }, + deconv_test_params{ CASE_DECONV_FP16_7, 2, 3 }, + deconv_test_params{ CASE_DECONV_FP16_8, 2, 3 }, + + deconv_test_params{ CASE_DECONV_U8S8_1, 2, 3 }, + deconv_test_params{ CASE_DECONV_U8S8_2, 2, 3 }, + deconv_test_params{ CASE_DECONV_U8S8_3, 2, 3 }, + // Here and below this test case and CASE_DECONV_S8S8_4 are commented because they fail for z_pad=0 which is unexpected + // deconv_test_params{ CASE_DECONV_U8S8_4, 2, 3 }, + deconv_test_params{ CASE_DECONV_U8S8_5, 2, 3 }, + deconv_test_params{ CASE_DECONV_U8S8_6, 2, 3 }, + deconv_test_params{ CASE_DECONV_U8S8_7, 2, 3 }, + deconv_test_params{ CASE_DECONV_U8S8_8, 2, 3 }, + + deconv_test_params{ CASE_DECONV_S8S8_1, 2, 3 }, + deconv_test_params{ CASE_DECONV_S8S8_2, 2, 3 }, + deconv_test_params{ CASE_DECONV_S8S8_3, 2, 3 }, + // deconv_test_params{ CASE_DECONV_S8S8_4, 2, 3 }, + deconv_test_params{ CASE_DECONV_S8S8_5, 2, 3 }, + deconv_test_params{ CASE_DECONV_S8S8_6, 2, 3 }, + deconv_test_params{ CASE_DECONV_S8S8_7, 2, 3 }, + deconv_test_params{ CASE_DECONV_S8S8_8, 2, 3 }, + + deconv_test_params{ CASE_DECONV_FP32_3D_1, 2, 3 }, + deconv_test_params{ CASE_DECONV_FP32_3D_2, 2, 3 }, + deconv_test_params{ CASE_DECONV_FP32_3D_3, 2, 3 }, + deconv_test_params{ CASE_DECONV_FP32_3D_4, 2, 3 }, + deconv_test_params{ CASE_DECONV_FP32_3D_5, 2, 3 }, + deconv_test_params{ CASE_DECONV_FP32_3D_6, 2, 3 }, + deconv_test_params{ CASE_DECONV_FP32_3D_7, 2, 3 }, + deconv_test_params{ CASE_DECONV_FP32_3D_8, 2, 3 }, + deconv_test_params{ CASE_DECONV_FP32_3D_9, 2, 3 }, + + deconv_test_params{ CASE_DECONV_FP16_3D_1, 2, 3 }, + deconv_test_params{ CASE_DECONV_FP16_3D_2, 2, 3 }, + deconv_test_params{ CASE_DECONV_FP16_3D_3, 2, 3 }, + deconv_test_params{ 
CASE_DECONV_FP16_3D_4, 2, 3 }, + deconv_test_params{ CASE_DECONV_FP16_3D_5, 2, 3 }, + deconv_test_params{ CASE_DECONV_FP16_3D_6, 2, 3 }, + deconv_test_params{ CASE_DECONV_FP16_3D_7, 2, 3 }, + deconv_test_params{ CASE_DECONV_FP16_3D_8, 2, 3 }, + deconv_test_params{ CASE_DECONV_FP16_3D_9, 2, 3 }, + + deconv_test_params{ CASE_DECONV_U8S8_3D_1, 2, 3 }, + deconv_test_params{ CASE_DECONV_U8S8_3D_2, 2, 3 }, + deconv_test_params{ CASE_DECONV_U8S8_3D_3, 2, 3 }, + deconv_test_params{ CASE_DECONV_U8S8_3D_4, 2, 3 }, + deconv_test_params{ CASE_DECONV_U8S8_3D_5, 2, 3 }, + deconv_test_params{ CASE_DECONV_U8S8_3D_6, 2, 3 }, + deconv_test_params{ CASE_DECONV_U8S8_3D_7, 2, 3 }, + deconv_test_params{ CASE_DECONV_U8S8_3D_8, 2, 3 }, + + deconv_test_params{ CASE_DECONV_S8S8_3D_1, 2, 3 }, + deconv_test_params{ CASE_DECONV_S8S8_3D_2, 2, 3 }, + deconv_test_params{ CASE_DECONV_S8S8_3D_3, 2, 3 }, + deconv_test_params{ CASE_DECONV_S8S8_3D_4, 2, 3 }, + deconv_test_params{ CASE_DECONV_S8S8_3D_5, 2, 3 }, + deconv_test_params{ CASE_DECONV_S8S8_3D_6, 2, 3 }, + deconv_test_params{ CASE_DECONV_S8S8_3D_7, 2, 3 }, + deconv_test_params{ CASE_DECONV_S8S8_3D_8, 2, 3 }, +})); + + +class deconv_bias : public DeconvolutionFusingTest {}; +TEST_P(deconv_bias, basic) { + auto p = GetParam(); + create_topologies( + input_layout("input", get_input_layout(p)), + data("weights", get_mem(get_weights_layout(p))), + data("bias", get_mem(get_bias_layout(p))), + deconvolution("deconv", "input", { "weights" }, p.groups, p.stride, p.pad), + eltwise("bias_add", { "deconv", "bias" }, eltwise_mode::sum), + reorder("out", "bias_add", p.default_format, data_types::f32) + ); + + // Need much higher tolerance because of deconvolution -> convolution optimization + tolerance = 1.f; + execute(p); +} + +INSTANTIATE_TEST_SUITE_P(fusings_gpu, deconv_bias, ::testing::ValuesIn(std::vector{ + deconv_test_params{ CASE_DECONV_FP32_1, 2, 3 }, + deconv_test_params{ CASE_DECONV_FP32_2, 2, 3 }, + deconv_test_params{ CASE_DECONV_FP32_3, 2, 3 
}, + + deconv_test_params{ CASE_DECONV_FP16_1, 2, 3 }, + deconv_test_params{ CASE_DECONV_FP16_2, 2, 3 }, + deconv_test_params{ CASE_DECONV_FP16_3, 2, 3 }, + + deconv_test_params{ CASE_DECONV_U8S8_1, 2, 3 }, + deconv_test_params{ CASE_DECONV_U8S8_2, 2, 3 }, + deconv_test_params{ CASE_DECONV_U8S8_3, 2, 3 }, + + deconv_test_params{ CASE_DECONV_S8S8_1, 2, 3 }, + deconv_test_params{ CASE_DECONV_S8S8_2, 2, 3 }, + deconv_test_params{ CASE_DECONV_S8S8_3, 2, 3 }, + + deconv_test_params{ CASE_DECONV_FP32_3D_1, 2, 3 }, + deconv_test_params{ CASE_DECONV_FP32_3D_2, 2, 3 }, + deconv_test_params{ CASE_DECONV_FP32_3D_3, 2, 3 }, + + deconv_test_params{ CASE_DECONV_FP16_3D_1, 2, 3 }, + deconv_test_params{ CASE_DECONV_FP16_3D_2, 2, 3 }, + deconv_test_params{ CASE_DECONV_FP16_3D_3, 2, 3 }, + + deconv_test_params{ CASE_DECONV_U8S8_3D_1, 2, 3 }, + deconv_test_params{ CASE_DECONV_U8S8_3D_2, 2, 3 }, + deconv_test_params{ CASE_DECONV_U8S8_3D_3, 2, 3 }, + + deconv_test_params{ CASE_DECONV_S8S8_3D_1, 2, 3 }, + deconv_test_params{ CASE_DECONV_S8S8_3D_2, 2, 3 }, + deconv_test_params{ CASE_DECONV_S8S8_3D_3, 2, 3 }, +})); + +class deconv_scale : public DeconvolutionFusingTest {}; +TEST_P(deconv_scale, basic) { + auto p = GetParam(); + create_topologies( + input_layout("input", get_input_layout(p)), + data("weights", get_mem(get_weights_layout(p))), + data("scale_data", get_mem(get_per_channel_layout(p), 1.0f/p.kernel.count())), + deconvolution("deconv", "input", { "weights" }, p.groups, p.stride, p.pad), + scale("scale", "deconv", "scale_data"), + reorder("out", "scale", p.default_format, data_types::f32) + ); + + tolerance = 1e-5f; + execute(p); +} + +TEST_P(deconv_scale, fp16_scale_out) { + auto p = GetParam(); + create_topologies( + input_layout("input", get_input_layout(p)), + data("weights", get_mem(get_weights_layout(p))), + data("scale_data", get_mem(get_per_channel_layout(p), 1.0f/p.kernel.count())), + deconvolution("deconv", "input", { "weights" }, p.groups, p.stride, p.pad), + 
scale("scale", "deconv", "scale_data", optional_data_type{ data_types::f16 }), + reorder("out", "scale", p.default_format, data_types::f32) + ); + + tolerance = 1e-5f; + execute(p); +} + +INSTANTIATE_TEST_SUITE_P(fusings_gpu, deconv_scale, ::testing::ValuesIn(std::vector{ + deconv_test_params{ CASE_DECONV_U8S8_1, 2, 3 }, + deconv_test_params{ CASE_DECONV_U8S8_2, 2, 3 }, + deconv_test_params{ CASE_DECONV_U8S8_3, 2, 3 }, + // deconv_test_params{ CASE_DECONV_U8S8_4, 2, 3 }, + deconv_test_params{ CASE_DECONV_U8S8_5, 2, 3 }, + deconv_test_params{ CASE_DECONV_U8S8_6, 2, 3 }, + deconv_test_params{ CASE_DECONV_U8S8_7, 2, 3 }, + deconv_test_params{ CASE_DECONV_U8S8_8, 2, 3 }, + + deconv_test_params{ CASE_DECONV_S8S8_1, 2, 3 }, + deconv_test_params{ CASE_DECONV_S8S8_2, 2, 3 }, + deconv_test_params{ CASE_DECONV_S8S8_3, 2, 3 }, + // deconv_test_params{ CASE_DECONV_S8S8_4, 2, 3 }, + deconv_test_params{ CASE_DECONV_S8S8_5, 2, 3 }, + deconv_test_params{ CASE_DECONV_S8S8_6, 2, 3 }, + deconv_test_params{ CASE_DECONV_S8S8_7, 2, 3 }, + deconv_test_params{ CASE_DECONV_S8S8_8, 2, 3 }, + + deconv_test_params{ CASE_DECONV_U8S8_3D_1, 2, 3 }, + deconv_test_params{ CASE_DECONV_U8S8_3D_2, 2, 3 }, + deconv_test_params{ CASE_DECONV_U8S8_3D_3, 2, 3 }, + deconv_test_params{ CASE_DECONV_U8S8_3D_4, 2, 3 }, + deconv_test_params{ CASE_DECONV_U8S8_3D_5, 2, 3 }, + deconv_test_params{ CASE_DECONV_U8S8_3D_6, 2, 3 }, + deconv_test_params{ CASE_DECONV_U8S8_3D_7, 2, 3 }, + deconv_test_params{ CASE_DECONV_U8S8_3D_8, 2, 3 }, + + deconv_test_params{ CASE_DECONV_S8S8_3D_1, 2, 3 }, + deconv_test_params{ CASE_DECONV_S8S8_3D_2, 2, 3 }, + deconv_test_params{ CASE_DECONV_S8S8_3D_3, 2, 3 }, + deconv_test_params{ CASE_DECONV_S8S8_3D_4, 2, 3 }, + deconv_test_params{ CASE_DECONV_S8S8_3D_5, 2, 3 }, + deconv_test_params{ CASE_DECONV_S8S8_3D_6, 2, 3 }, + deconv_test_params{ CASE_DECONV_S8S8_3D_7, 2, 3 }, + deconv_test_params{ CASE_DECONV_S8S8_3D_8, 2, 3 }, +})); + +class deconv_actv_eltw_actv : public 
DeconvolutionFusingTest {}; +TEST_P(deconv_actv_eltw_actv, basic) { + auto p = GetParam(); + create_topologies( + input_layout("input", get_input_layout(p)), + data("weights", get_mem(get_weights_layout(p))), + data("eltw_data", get_mem(get_output_layout(p))), + deconvolution("deconv", "input", { "weights" }, p.groups, p.stride, p.pad), + activation("act1", "deconv", activation_func::relu), + eltwise("eltw", { "act1", "eltw_data" }, eltwise_mode::sum), + activation("act2", "eltw", activation_func::relu), + reorder("out", "act2", p.default_format, data_types::f32) + ); + + // Need much higher tolerance because of deconvolution -> convolution optimization + tolerance = 1.f; + execute(p); +} + +INSTANTIATE_TEST_SUITE_P(fusings_gpu, deconv_actv_eltw_actv, ::testing::ValuesIn(std::vector{ + // Some fusings disabled under deconvolution -> convolution optimization + deconv_test_params{ CASE_DECONV_FP32_1, 3, 5 }, + deconv_test_params{ CASE_DECONV_FP32_2, 2, 5 }, + deconv_test_params{ CASE_DECONV_FP32_3, 2, 5 }, + deconv_test_params{ CASE_DECONV_FP32_4, 2, 5 }, + deconv_test_params{ CASE_DECONV_FP32_5, 2, 5 }, + deconv_test_params{ CASE_DECONV_FP32_6, 2, 5 }, + deconv_test_params{ CASE_DECONV_FP32_7, 2, 5 }, + deconv_test_params{ CASE_DECONV_FP32_8, 2, 5 }, + + deconv_test_params{ CASE_DECONV_FP16_1, 3, 5 }, + deconv_test_params{ CASE_DECONV_FP16_2, 2, 5 }, + deconv_test_params{ CASE_DECONV_FP16_3, 2, 5 }, + deconv_test_params{ CASE_DECONV_FP16_4, 2, 5 }, + deconv_test_params{ CASE_DECONV_FP16_5, 2, 5 }, + deconv_test_params{ CASE_DECONV_FP16_6, 2, 5 }, + deconv_test_params{ CASE_DECONV_FP16_7, 2, 5 }, + deconv_test_params{ CASE_DECONV_FP16_8, 2, 5 }, + + deconv_test_params{ CASE_DECONV_U8S8_1, 2, 5 }, + deconv_test_params{ CASE_DECONV_U8S8_2, 2, 5 }, + deconv_test_params{ CASE_DECONV_U8S8_3, 2, 5 }, + // deconv_test_params{ CASE_DECONV_U8S8_4, 2, 5 }, + deconv_test_params{ CASE_DECONV_U8S8_5, 2, 5 }, + deconv_test_params{ CASE_DECONV_U8S8_6, 2, 5 }, + deconv_test_params{ 
CASE_DECONV_U8S8_7, 2, 5 }, + deconv_test_params{ CASE_DECONV_U8S8_8, 2, 5 }, + + deconv_test_params{ CASE_DECONV_S8S8_1, 2, 5 }, + deconv_test_params{ CASE_DECONV_S8S8_2, 2, 5 }, + deconv_test_params{ CASE_DECONV_S8S8_3, 2, 5 }, + // deconv_test_params{ CASE_DECONV_S8S8_4, 2, 5 }, + deconv_test_params{ CASE_DECONV_S8S8_5, 2, 5 }, + deconv_test_params{ CASE_DECONV_S8S8_6, 2, 5 }, + deconv_test_params{ CASE_DECONV_S8S8_7, 2, 5 }, + deconv_test_params{ CASE_DECONV_S8S8_8, 2, 5 }, + + deconv_test_params{ CASE_DECONV_FP32_3D_1, 2, 5 }, + deconv_test_params{ CASE_DECONV_FP32_3D_2, 2, 5 }, + deconv_test_params{ CASE_DECONV_FP32_3D_3, 2, 5 }, + deconv_test_params{ CASE_DECONV_FP32_3D_4, 2, 5 }, + deconv_test_params{ CASE_DECONV_FP32_3D_5, 2, 5 }, + deconv_test_params{ CASE_DECONV_FP32_3D_6, 2, 5 }, + deconv_test_params{ CASE_DECONV_FP32_3D_7, 2, 5 }, + deconv_test_params{ CASE_DECONV_FP32_3D_8, 2, 5 }, + deconv_test_params{ CASE_DECONV_FP32_3D_9, 2, 5 }, + + deconv_test_params{ CASE_DECONV_FP16_3D_1, 2, 5 }, + deconv_test_params{ CASE_DECONV_FP16_3D_2, 2, 5 }, + deconv_test_params{ CASE_DECONV_FP16_3D_3, 2, 5 }, + deconv_test_params{ CASE_DECONV_FP16_3D_4, 2, 5 }, + deconv_test_params{ CASE_DECONV_FP16_3D_5, 2, 5 }, + deconv_test_params{ CASE_DECONV_FP16_3D_6, 2, 5 }, + deconv_test_params{ CASE_DECONV_FP16_3D_7, 2, 5 }, + deconv_test_params{ CASE_DECONV_FP16_3D_8, 2, 5 }, + deconv_test_params{ CASE_DECONV_FP16_3D_9, 2, 5 }, + + deconv_test_params{ CASE_DECONV_U8S8_3D_1, 2, 5 }, + deconv_test_params{ CASE_DECONV_U8S8_3D_2, 2, 5 }, + deconv_test_params{ CASE_DECONV_U8S8_3D_3, 2, 5 }, + deconv_test_params{ CASE_DECONV_U8S8_3D_4, 2, 5 }, + deconv_test_params{ CASE_DECONV_U8S8_3D_5, 2, 5 }, + deconv_test_params{ CASE_DECONV_U8S8_3D_6, 2, 5 }, + deconv_test_params{ CASE_DECONV_U8S8_3D_7, 2, 5 }, + deconv_test_params{ CASE_DECONV_U8S8_3D_8, 2, 5 }, + + deconv_test_params{ CASE_DECONV_S8S8_3D_1, 2, 5 }, + deconv_test_params{ CASE_DECONV_S8S8_3D_2, 2, 5 }, + deconv_test_params{ 
CASE_DECONV_S8S8_3D_3, 2, 5 }, + deconv_test_params{ CASE_DECONV_S8S8_3D_4, 2, 5 }, + deconv_test_params{ CASE_DECONV_S8S8_3D_5, 2, 5 }, + deconv_test_params{ CASE_DECONV_S8S8_3D_6, 2, 5 }, + deconv_test_params{ CASE_DECONV_S8S8_3D_7, 2, 5 }, + deconv_test_params{ CASE_DECONV_S8S8_3D_8, 2, 5 }, +})); + +class deconv_scale_actv_quant_i8 : public DeconvolutionFusingTest {}; +TEST_P(deconv_scale_actv_quant_i8, basic) { + auto p = GetParam(); + create_topologies( + input_layout("input", get_input_layout(p)), + data("weights", get_mem(get_weights_layout(p))), + data("scale_data", get_mem(get_per_channel_layout(p), 1.f/p.kernel.count())), + data("in_lo", get_mem(get_per_channel_layout(p), min_random, 0)), + data("in_hi", get_mem(get_per_channel_layout(p), 1, max_random)), + data("out_lo", get_mem(get_single_element_layout(p), -127)), + data("out_hi", get_mem(get_single_element_layout(p), 127)), + deconvolution("deconv", "input", { "weights" }, p.groups, p.stride, p.pad), + scale("scale", "deconv", "scale_data"), + activation("actv", "scale", activation_func::softsign), + quantize("quant", "actv", "in_lo", "in_hi", "out_lo", "out_hi", 255, data_types::i8), + reorder("out", "quant", p.default_format, data_types::f32) + ); + + tolerance = 1.f; + execute(p); +} + +INSTANTIATE_TEST_SUITE_P(fusings_gpu, deconv_scale_actv_quant_i8, ::testing::ValuesIn(std::vector{ + deconv_test_params{ CASE_DECONV_FP32_1, 2, 5 }, + deconv_test_params{ CASE_DECONV_FP32_2, 2, 5 }, + deconv_test_params{ CASE_DECONV_FP32_3, 2, 5 }, + deconv_test_params{ CASE_DECONV_FP32_4, 2, 5 }, + deconv_test_params{ CASE_DECONV_FP32_5, 2, 5 }, + deconv_test_params{ CASE_DECONV_FP32_6, 2, 5 }, + deconv_test_params{ CASE_DECONV_FP32_7, 2, 5 }, + deconv_test_params{ CASE_DECONV_FP32_8, 2, 5 }, + + deconv_test_params{ CASE_DECONV_FP16_1, 2, 5 }, + deconv_test_params{ CASE_DECONV_FP16_2, 2, 5 }, + deconv_test_params{ CASE_DECONV_FP16_3, 2, 5 }, + deconv_test_params{ CASE_DECONV_FP16_4, 2, 5 }, + deconv_test_params{ 
CASE_DECONV_FP16_5, 2, 5 }, + deconv_test_params{ CASE_DECONV_FP16_6, 2, 5 }, + deconv_test_params{ CASE_DECONV_FP16_7, 2, 5 }, + deconv_test_params{ CASE_DECONV_FP16_8, 2, 5 }, + + deconv_test_params{ CASE_DECONV_U8S8_1, 2, 5 }, + deconv_test_params{ CASE_DECONV_U8S8_2, 2, 5 }, + deconv_test_params{ CASE_DECONV_U8S8_3, 2, 5 }, + // deconv_test_params{ CASE_DECONV_U8S8_4, 2, 5 }, + deconv_test_params{ CASE_DECONV_U8S8_5, 2, 5 }, + deconv_test_params{ CASE_DECONV_U8S8_6, 2, 5 }, + deconv_test_params{ CASE_DECONV_U8S8_7, 2, 5 }, + deconv_test_params{ CASE_DECONV_U8S8_8, 2, 5 }, + + deconv_test_params{ CASE_DECONV_S8S8_1, 2, 5 }, + deconv_test_params{ CASE_DECONV_S8S8_2, 2, 5 }, + deconv_test_params{ CASE_DECONV_S8S8_3, 2, 5 }, + // deconv_test_params{ CASE_DECONV_S8S8_4, 2, 5 }, + deconv_test_params{ CASE_DECONV_S8S8_5, 2, 5 }, + deconv_test_params{ CASE_DECONV_S8S8_6, 2, 5 }, + deconv_test_params{ CASE_DECONV_S8S8_7, 2, 5 }, + deconv_test_params{ CASE_DECONV_S8S8_8, 2, 5 }, + + deconv_test_params{ CASE_DECONV_FP32_3D_1, 2, 5 }, + deconv_test_params{ CASE_DECONV_FP32_3D_2, 2, 5 }, + deconv_test_params{ CASE_DECONV_FP32_3D_3, 2, 5 }, + deconv_test_params{ CASE_DECONV_FP32_3D_4, 2, 5 }, + deconv_test_params{ CASE_DECONV_FP32_3D_5, 2, 5 }, + deconv_test_params{ CASE_DECONV_FP32_3D_6, 2, 5 }, + deconv_test_params{ CASE_DECONV_FP32_3D_7, 2, 5 }, + deconv_test_params{ CASE_DECONV_FP32_3D_8, 2, 5 }, + // FIXME no quantize implementation for bs_fs_yx_bsv16_fsv16 format AND add_required_reorders pass completely ruins data types + // add_required_reorders pass tries to reorder everything to output type if no format exists, this ruins fp32 -> int8 quantize + //deconv_test_params{ CASE_DECONV_FP32_3D_9, 3, 5 }, + + deconv_test_params{ CASE_DECONV_FP16_3D_1, 2, 5 }, + deconv_test_params{ CASE_DECONV_FP16_3D_2, 2, 5 }, + deconv_test_params{ CASE_DECONV_FP16_3D_3, 2, 5 }, + deconv_test_params{ CASE_DECONV_FP16_3D_4, 2, 5 }, + deconv_test_params{ CASE_DECONV_FP16_3D_5, 2, 5 }, + 
deconv_test_params{ CASE_DECONV_FP16_3D_6, 2, 5 }, + deconv_test_params{ CASE_DECONV_FP16_3D_7, 2, 5 }, + deconv_test_params{ CASE_DECONV_FP16_3D_8, 2, 5 }, + //deconv_test_params{ CASE_DECONV_FP16_3D_9, 3, 5 }, + + deconv_test_params{ CASE_DECONV_U8S8_3D_1, 2, 5 }, + deconv_test_params{ CASE_DECONV_U8S8_3D_2, 2, 5 }, + deconv_test_params{ CASE_DECONV_U8S8_3D_3, 2, 5 }, + deconv_test_params{ CASE_DECONV_U8S8_3D_4, 2, 5 }, + deconv_test_params{ CASE_DECONV_U8S8_3D_5, 2, 5 }, + deconv_test_params{ CASE_DECONV_U8S8_3D_6, 2, 5 }, + deconv_test_params{ CASE_DECONV_U8S8_3D_7, 2, 5 }, + deconv_test_params{ CASE_DECONV_U8S8_3D_8, 2, 5 }, + + deconv_test_params{ CASE_DECONV_S8S8_3D_1, 2, 5 }, + deconv_test_params{ CASE_DECONV_S8S8_3D_2, 2, 5 }, + deconv_test_params{ CASE_DECONV_S8S8_3D_3, 2, 5 }, + deconv_test_params{ CASE_DECONV_S8S8_3D_4, 2, 5 }, + deconv_test_params{ CASE_DECONV_S8S8_3D_5, 2, 5 }, + deconv_test_params{ CASE_DECONV_S8S8_3D_6, 2, 5 }, + deconv_test_params{ CASE_DECONV_S8S8_3D_7, 2, 5 }, + deconv_test_params{ CASE_DECONV_S8S8_3D_8, 2, 5 }, +})); + +class deconv_scale_actv_quant_u8_eltw_scale_actv_quant_i8 : public DeconvolutionFusingTest {}; +TEST_P(deconv_scale_actv_quant_u8_eltw_scale_actv_quant_i8, basic) { + auto p = GetParam(); + create_topologies( + input_layout("input", get_input_layout(p)), + data("weights", get_mem(get_weights_layout(p))), + data("scale1_data", get_mem(get_per_channel_layout(p), 1.f / p.kernel.count())), + data("in1_lo", get_mem(get_per_channel_layout(p), 0)), + data("in1_hi", get_mem(get_per_channel_layout(p), 1, max_random)), + data("out1_lo", get_mem(get_single_element_layout(p), 0)), + data("out1_hi", get_mem(get_single_element_layout(p), 255)), + data("eltw_data", get_mem(layout(p.default_type, p.input_format, p.out_shape))), + data("scale2_data", get_mem(get_per_channel_layout(p), 1.f / p.kernel.count())), + data("in2_lo", get_mem(get_per_channel_layout(p), min_random, 0)), + data("in2_hi", get_mem(get_per_channel_layout(p), 
1, max_random)), + data("out2_lo", get_mem(get_single_element_layout(p), -127)), + data("out2_hi", get_mem(get_single_element_layout(p), 127)), + deconvolution("deconv", "input", { "weights" }, p.groups, p.stride, p.pad), + scale("scale1", "deconv", "scale1_data"), + activation("actv1", "scale1", activation_func::relu), + quantize("quant1", "actv1", "in1_lo", "in1_hi", "out1_lo", "out1_hi", 256, data_types::u8), + eltwise("eltw", { "quant1", "eltw_data" }, eltwise_mode::sum, p.default_type), + scale("scale2", "eltw", "scale2_data"), + activation("actv2", "scale2", activation_func::relu), + quantize("quant2", "actv2", "in2_lo", "in2_hi", "out2_lo", "out2_hi", 255, data_types::i8), + reorder("out", "quant2", p.default_format, data_types::f32) + ); + + tolerance = 2.1f; + execute(p); +} + +INSTANTIATE_TEST_SUITE_P(fusings_gpu, deconv_scale_actv_quant_u8_eltw_scale_actv_quant_i8, ::testing::ValuesIn(std::vector{ + deconv_test_params{ CASE_DECONV_FP32_1, 2, 9 }, + deconv_test_params{ CASE_DECONV_FP32_2, 2, 9 }, + // deconv_test_params{ CASE_DECONV_FP32_3, 2, 9 }, + deconv_test_params{ CASE_DECONV_FP32_4, 2, 9 }, + deconv_test_params{ CASE_DECONV_FP32_5, 2, 9 }, + deconv_test_params{ CASE_DECONV_FP32_6, 2, 9 }, + deconv_test_params{ CASE_DECONV_FP32_7, 2, 9 }, + deconv_test_params{ CASE_DECONV_FP32_8, 2, 9 }, + + deconv_test_params{ CASE_DECONV_FP16_1, 2, 9 }, + deconv_test_params{ CASE_DECONV_FP16_2, 2, 9 }, + deconv_test_params{ CASE_DECONV_FP16_3, 2, 9 }, + deconv_test_params{ CASE_DECONV_FP16_4, 2, 9 }, + deconv_test_params{ CASE_DECONV_FP16_5, 2, 9 }, + deconv_test_params{ CASE_DECONV_FP16_6, 2, 9 }, + deconv_test_params{ CASE_DECONV_FP16_7, 2, 9 }, + deconv_test_params{ CASE_DECONV_FP16_8, 2, 9 }, + + deconv_test_params{ CASE_DECONV_U8S8_1, 2, 9 }, + deconv_test_params{ CASE_DECONV_U8S8_2, 2, 9 }, + deconv_test_params{ CASE_DECONV_U8S8_3, 2, 9 }, + // deconv_test_params{ CASE_DECONV_U8S8_4, 2, 9 }, + deconv_test_params{ CASE_DECONV_U8S8_5, 2, 9 }, + 
deconv_test_params{ CASE_DECONV_U8S8_6, 2, 9 }, + deconv_test_params{ CASE_DECONV_U8S8_7, 2, 9 }, + deconv_test_params{ CASE_DECONV_U8S8_8, 2, 9 }, + + deconv_test_params{ CASE_DECONV_S8S8_1, 2, 9 }, + deconv_test_params{ CASE_DECONV_S8S8_2, 2, 9 }, + deconv_test_params{ CASE_DECONV_S8S8_3, 2, 9 }, + // deconv_test_params{ CASE_DECONV_S8S8_4, 2, 9 }, + deconv_test_params{ CASE_DECONV_S8S8_5, 2, 9 }, + deconv_test_params{ CASE_DECONV_S8S8_6, 2, 9 }, + deconv_test_params{ CASE_DECONV_S8S8_7, 2, 9 }, + deconv_test_params{ CASE_DECONV_S8S8_8, 2, 9 }, + + deconv_test_params{ CASE_DECONV_FP32_3D_1, 2, 9 }, + // deconv_test_params{ CASE_DECONV_FP32_3D_2, 2, 9 }, + deconv_test_params{ CASE_DECONV_FP32_3D_3, 2, 9 }, + deconv_test_params{ CASE_DECONV_FP32_3D_4, 2, 9 }, + deconv_test_params{ CASE_DECONV_FP32_3D_5, 2, 9 }, + deconv_test_params{ CASE_DECONV_FP32_3D_6, 2, 9 }, + // deconv_test_params{ CASE_DECONV_FP32_3D_7, 2, 9 }, + deconv_test_params{ CASE_DECONV_FP32_3D_8, 2, 9 }, + // deconv_test_params{ CASE_DECONV_FP32_3D_9, 6, 9 }, + + deconv_test_params{ CASE_DECONV_FP16_3D_1, 2, 9 }, + deconv_test_params{ CASE_DECONV_FP16_3D_2, 2, 9 }, + deconv_test_params{ CASE_DECONV_FP16_3D_3, 2, 9 }, + deconv_test_params{ CASE_DECONV_FP16_3D_4, 2, 9 }, + deconv_test_params{ CASE_DECONV_FP16_3D_5, 2, 9 }, + deconv_test_params{ CASE_DECONV_FP16_3D_6, 2, 9 }, + deconv_test_params{ CASE_DECONV_FP16_3D_7, 2, 9 }, + deconv_test_params{ CASE_DECONV_FP16_3D_8, 2, 9 }, + // deconv_test_params{ CASE_DECONV_FP16_3D_9, 6, 9 }, + + deconv_test_params{ CASE_DECONV_U8S8_3D_1, 2, 9 }, + deconv_test_params{ CASE_DECONV_U8S8_3D_2, 2, 9 }, + deconv_test_params{ CASE_DECONV_U8S8_3D_3, 2, 9 }, + deconv_test_params{ CASE_DECONV_U8S8_3D_4, 2, 9 }, + deconv_test_params{ CASE_DECONV_U8S8_3D_5, 2, 9 }, + deconv_test_params{ CASE_DECONV_U8S8_3D_6, 2, 9 }, + deconv_test_params{ CASE_DECONV_U8S8_3D_7, 2, 9 }, + deconv_test_params{ CASE_DECONV_U8S8_3D_8, 2, 9 }, + + deconv_test_params{ CASE_DECONV_S8S8_3D_1, 2, 
9 }, + deconv_test_params{ CASE_DECONV_S8S8_3D_2, 2, 9 }, + deconv_test_params{ CASE_DECONV_S8S8_3D_3, 2, 9 }, + deconv_test_params{ CASE_DECONV_S8S8_3D_4, 2, 9 }, + deconv_test_params{ CASE_DECONV_S8S8_3D_5, 2, 9 }, + deconv_test_params{ CASE_DECONV_S8S8_3D_6, 2, 9 }, + deconv_test_params{ CASE_DECONV_S8S8_3D_7, 2, 9 }, + deconv_test_params{ CASE_DECONV_S8S8_3D_8, 2, 9 }, +})); + +class deconv_scale_activation_quantize_i8_eltwise_quantize_u8 : public ConvEltwTest {}; +TEST_P(deconv_scale_activation_quantize_i8_eltwise_quantize_u8, basic) { + auto p = GetParam(); + create_topologies( + input_layout("input", get_input_layout(p)), + data("weights", get_mem(get_weights_layout(p))), + deconvolution("deconv_prim", "input", { "weights" }, p.groups, p.stride, p.pad), + data("scale_data", get_mem(get_per_channel_layout(p), 1.f / p.kernel.count())), + scale("scale", "deconv_prim", "scale_data"), + activation("activation", "scale", activation_func::relu), + data("in_low", get_mem(get_per_channel_layout(p), min_random, 0)), + data("in_high", get_mem(get_per_channel_layout(p), 1, max_random)), + data("out_low", get_mem(get_single_element_layout(p), -127)), + data("out_high", get_mem(get_single_element_layout(p), 127)), + quantize("quant", "activation", "in_low", "in_high", "out_low", "out_high", 255, data_types::i8), + data("eltwise_data", get_mem(layout{ p.data_type, p.input_format, p.eltw_shape })), + eltwise("eltw", { "quant", "eltwise_data" }, eltwise_mode::sum, p.default_type), + data("in_low2", get_mem(get_per_channel_layout(p), min_random, 0)), + data("in_high2", get_mem(get_per_channel_layout(p), 1, max_random)), + data("out_low2", get_mem(get_single_element_layout(p), 0)), + data("out_high2", get_mem(get_single_element_layout(p), 255)), + quantize("quant2", "eltw", "in_low2", "in_high2", "out_low2", "out_high2", 256, data_types::u8), + reorder("reorder_bfyx", "quant2", p.default_format, data_types::f32) + ); + + tolerance = 1.f; + execute(p); +} + 
+INSTANTIATE_TEST_SUITE_P(fusings_gpu, deconv_scale_activation_quantize_i8_eltwise_quantize_u8, ::testing::ValuesIn(std::vector{ + deconv_eltw_test_params{ CASE_DECONV_ELTW_FP32_1, 2, 7 }, + deconv_eltw_test_params{ CASE_DECONV_ELTW_FP32_2, 2, 7 }, + deconv_eltw_test_params{ CASE_DECONV_ELTW_FP32_3, 2, 7 }, + deconv_eltw_test_params{ CASE_DECONV_ELTW_FP32_4, 2, 7 }, + deconv_eltw_test_params{ CASE_DECONV_ELTW_FP32_5, 2, 7 }, + deconv_eltw_test_params{ CASE_DECONV_ELTW_FP32_6, 2, 7 }, + deconv_eltw_test_params{ CASE_DECONV_ELTW_FP32_7, 2, 7 }, + deconv_eltw_test_params{ CASE_DECONV_ELTW_FP32_8, 2, 7 }, + + deconv_eltw_test_params{ CASE_DECONV_ELTW_i8_1, 2, 7 }, + deconv_eltw_test_params{ CASE_DECONV_ELTW_i8_2, 2, 7 }, + deconv_eltw_test_params{ CASE_DECONV_ELTW_i8_3, 2, 7 }, + deconv_eltw_test_params{ CASE_DECONV_ELTW_i8_4, 2, 7 }, + deconv_eltw_test_params{ CASE_DECONV_ELTW_i8_5, 2, 7 }, + +})); + +class deconv_activation_eltwise_diff_sizes : public ConvEltwTest {}; +TEST_P(deconv_activation_eltwise_diff_sizes, basic) { + auto p = GetParam(); + create_topologies( + input_layout("input", get_input_layout(p)), + data("weights", get_mem(get_weights_layout(p))), + data("eltwise_data", get_mem(layout{ p.data_type, p.input_format, p.eltw_shape })), + deconvolution("deconv_prim", "input", { "weights" }, p.groups, p.stride, p.pad), + activation("activation", "deconv_prim", activation_func::relu), + eltwise("sum", { "activation", "eltwise_data" }, eltwise_mode::sum, p.default_type), + reorder("reorder_bfyx", "sum", p.default_format, data_types::f32) + ); + + tolerance = 1e-5f; + execute(p); +} + +INSTANTIATE_TEST_SUITE_P(fusings_gpu, deconv_activation_eltwise_diff_sizes, ::testing::ValuesIn(std::vector{ + deconv_eltw_test_params{ CASE_DECONV_ELTW_FP32_1, 2, 4 }, + deconv_eltw_test_params{ CASE_DECONV_ELTW_FP32_2, 2, 4 }, + deconv_eltw_test_params{ CASE_DECONV_ELTW_FP32_3, 2, 4 }, + deconv_eltw_test_params{ CASE_DECONV_ELTW_FP32_4, 2, 4 }, + deconv_eltw_test_params{ 
CASE_DECONV_ELTW_FP32_5, 2, 4 }, + deconv_eltw_test_params{ CASE_DECONV_ELTW_FP32_6, 2, 4 }, + deconv_eltw_test_params{ CASE_DECONV_ELTW_FP32_7, 2, 4 }, + deconv_eltw_test_params{ CASE_DECONV_ELTW_FP32_8, 2, 4 }, + + deconv_eltw_test_params{ CASE_DECONV_ELTW_i8_1, 2, 4 }, + deconv_eltw_test_params{ CASE_DECONV_ELTW_i8_2, 2, 4 }, + deconv_eltw_test_params{ CASE_DECONV_ELTW_i8_3, 2, 4 }, + deconv_eltw_test_params{ CASE_DECONV_ELTW_i8_4, 2, 4 }, + deconv_eltw_test_params{ CASE_DECONV_ELTW_i8_5, 2, 4 }, +})); diff --git a/src/plugins/intel_gpu/tests/fusions/depth_to_space_fusion_test.cpp b/src/plugins/intel_gpu/tests/fusions/depth_to_space_fusion_test.cpp new file mode 100644 index 00000000000..73ea2d05e22 --- /dev/null +++ b/src/plugins/intel_gpu/tests/fusions/depth_to_space_fusion_test.cpp @@ -0,0 +1,162 @@ +// Copyright (C) 2018-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "test_utils.h" +#include "fusion_test_common.hpp" + +#include +#include +#include +#include +#include + +#include + +using namespace cldnn; +using namespace ::tests; + +namespace { +struct depth_to_space_test_params { + tensor input_size; + tensor output_size; + depth_to_space_mode mode; + data_types input_type; + format input_format; + size_t block_size; + data_types default_type; + format default_format; + size_t expected_fused_primitives; + size_t expected_not_fused_primitives; +}; + +class DepthToSpaceFusingsTest : public ::BaseFusingTest { +public: + void execute(depth_to_space_test_params& p) { + auto input_prim = get_mem(get_input_layout(p)); + + network network_not_fused(this->engine, this->topology_non_fused, bo_not_fused); + network network_fused(this->engine, this->topology_fused, bo_fused); + + network_fused.set_input_data("input", input_prim); + network_not_fused.set_input_data("input", input_prim); + + compare(network_not_fused, network_fused, p); + } + + layout get_input_layout(depth_to_space_test_params& p) { + return layout{ p.input_type, 
p.input_format, p.input_size }; + } + + layout get_per_channel_layout(depth_to_space_test_params& p) { + return layout{ p.default_type, p.default_format, tensor{ 1, p.output_size.feature[0], 1, 1 } }; + } + + format get_input_format(depth_to_space_test_params &p) { + return p.input_format; + } +}; + +} // namespace + +/* ----------------------------------------------------------------------------------------------------- */ +/* -------------------------------- DepthToSpace cases ------------------------------------------------- */ +/* ----------------------------------------------------------------------------------------------------- */ + +#define CASE_DEPTH_TO_SPACE_F32_1 { 1, 16, 8, 10 }, { 1, 4, 16, 20 }, depth_to_space_mode::blocks_first, data_types::f32, format::bfyx, 2, data_types::f32, format::bfyx +#define CASE_DEPTH_TO_SPACE_F32_2 { 1, 32, 8, 8 }, { 1, 2, 32, 32 }, depth_to_space_mode::blocks_first, data_types::f32, format::b_fs_yx_fsv16, 4, data_types::f32, format::bfyx +#define CASE_DEPTH_TO_SPACE_F16_1 { 1, 12, 8, 8 }, { 1, 3, 16, 16 }, depth_to_space_mode::blocks_first, data_types::f16, format::bfyx, 2, data_types::f32, format::bfyx +#define CASE_DEPTH_TO_SPACE_F16_2 { 1, 16, 9, 8 }, { 1, 1, 36, 32 }, depth_to_space_mode::blocks_first, data_types::f16, format::b_fs_yx_fsv16, 4, data_types::f32, format::bfyx +#define CASE_DEPTH_TO_SPACE_U8_1 { 1, 128, 8, 8 }, { 1, 2, 64, 64 }, depth_to_space_mode::blocks_first, data_types::u8, format::bfyx, 8, data_types::f32, format::bfyx +#define CASE_DEPTH_TO_SPACE_U8_2 { 1, 128, 4, 8 }, { 1, 8, 16, 32 }, depth_to_space_mode::blocks_first, data_types::u8, format::b_fs_yx_fsv16, 4, data_types::f32, format::bfyx +#define CASE_DEPTH_TO_SPACE_I8_1 { 1, 16, 8, 8 }, { 1, 4, 16, 16 }, depth_to_space_mode::blocks_first, data_types::i8, format::bfyx, 2, data_types::f32, format::bfyx +#define CASE_DEPTH_TO_SPACE_I8_2 { 1, 256, 8, 8 }, { 1, 4, 64, 64 }, depth_to_space_mode::blocks_first, data_types::i8, format::b_fs_yx_fsv16, 
8, data_types::f32, format::bfyx + +class depth_to_space_quantize_i8 : public DepthToSpaceFusingsTest {}; +TEST_P(depth_to_space_quantize_i8, basic) { + auto p = GetParam(); + create_topologies( + input_layout("input", get_input_layout(p)), + depth_to_space("depth_to_space", "input", p.block_size, p.mode), + data("in_low", get_mem(get_per_channel_layout(p), min_random, 0)), + data("in_high", get_mem(get_per_channel_layout(p), 1, max_random)), + data("out_low", get_mem(get_single_element_layout(p), -128)), + data("out_high", get_mem(get_single_element_layout(p), 127)), + quantize("quant", "depth_to_space", "in_low", "in_high", "out_low", "out_high", 256, data_types::i8), + reorder("reorder_bfyx", "quant", format::bfyx, data_types::f32) + ); + + tolerance = 1.f; + execute(p); +} + +INSTANTIATE_TEST_SUITE_P(fusings_gpu, depth_to_space_quantize_i8, ::testing::ValuesIn(std::vector{ + depth_to_space_test_params{ CASE_DEPTH_TO_SPACE_F32_1, 2, 3 }, + depth_to_space_test_params{ CASE_DEPTH_TO_SPACE_F32_2, 2, 3 }, + depth_to_space_test_params{ CASE_DEPTH_TO_SPACE_F16_1, 2, 3 }, + depth_to_space_test_params{ CASE_DEPTH_TO_SPACE_F16_2, 2, 3 }, +})); + +class depth_to_space_scale_act_eltwise_quantize_u8 : public DepthToSpaceFusingsTest {}; +TEST_P(depth_to_space_scale_act_eltwise_quantize_u8, basic) { + auto p = GetParam(); + create_topologies( + input_layout("input", get_input_layout(p)), + depth_to_space("depth_to_space", "input", p.block_size, p.mode), + data("scale1_data", get_mem(get_per_channel_layout(p), -0.125f)), + scale("scale1", "depth_to_space", "scale1_data"), + activation("actv1", "scale1", activation_func::relu), + data("eltw_data", get_mem(layout(p.default_type, p.input_format, p.output_size))), + eltwise("eltw", { "actv1", "eltw_data" }, eltwise_mode::sum, p.default_type), + data("in_low", get_mem(get_per_channel_layout(p), min_random, 0)), + data("in_high", get_mem(get_per_channel_layout(p), 1, max_random)), + data("out_low", 
get_mem(get_single_element_layout(p), 0)), + data("out_high", get_mem(get_single_element_layout(p), 255)), + quantize("quant", "eltw", "in_low", "in_high", "out_low", "out_high", 256, data_types::u8), + reorder("reorder_bfyx", "quant", format::bfyx, data_types::f32) + ); + + tolerance = 1.f; + execute(p); +} + +INSTANTIATE_TEST_SUITE_P(fusings_gpu, depth_to_space_scale_act_eltwise_quantize_u8, ::testing::ValuesIn(std::vector{ + depth_to_space_test_params{ CASE_DEPTH_TO_SPACE_F32_1, 2, 6 }, + depth_to_space_test_params{ CASE_DEPTH_TO_SPACE_F32_2, 2, 6 }, + depth_to_space_test_params{ CASE_DEPTH_TO_SPACE_F16_1, 2, 6 }, + depth_to_space_test_params{ CASE_DEPTH_TO_SPACE_F16_2, 2, 6 }, + depth_to_space_test_params{ CASE_DEPTH_TO_SPACE_U8_1, 2, 6 }, + depth_to_space_test_params{ CASE_DEPTH_TO_SPACE_U8_2, 2, 6 }, + depth_to_space_test_params{ CASE_DEPTH_TO_SPACE_I8_1, 2, 6 }, + depth_to_space_test_params{ CASE_DEPTH_TO_SPACE_I8_2, 2, 6 }, +})); + + +class depth_to_space_scale_act_eltw : public DepthToSpaceFusingsTest {}; +TEST_P(depth_to_space_scale_act_eltw, basic) { + auto p = GetParam(); + create_topologies( + input_layout("input", get_input_layout(p)), + depth_to_space("depth_to_space", "input", p.block_size, p.mode), + data("scale1_data", get_mem(get_per_channel_layout(p), -0.125f)), + scale("scale1", "depth_to_space", "scale1_data"), + activation("actv1", "scale1", activation_func::relu), + data("eltw_data", get_mem(layout(p.default_type, p.input_format, p.output_size))), + eltwise("eltw", { "actv1", "eltw_data" }, eltwise_mode::sum, p.default_type), + reorder("reorder_bfyx", "eltw", format::bfyx, data_types::f32) + ); + + tolerance = 1e-5f; + execute(p); +} + +INSTANTIATE_TEST_SUITE_P(fusings_gpu, depth_to_space_scale_act_eltw, ::testing::ValuesIn(std::vector{ + depth_to_space_test_params{ CASE_DEPTH_TO_SPACE_F32_1, 2, 5 }, + depth_to_space_test_params{ CASE_DEPTH_TO_SPACE_F32_2, 2, 5 }, + depth_to_space_test_params{ CASE_DEPTH_TO_SPACE_F16_1, 2, 5 }, + 
depth_to_space_test_params{ CASE_DEPTH_TO_SPACE_F16_2, 2, 5 }, + depth_to_space_test_params{ CASE_DEPTH_TO_SPACE_U8_1, 2, 5 }, + depth_to_space_test_params{ CASE_DEPTH_TO_SPACE_U8_2, 2, 5 }, + depth_to_space_test_params{ CASE_DEPTH_TO_SPACE_I8_1, 2, 5 }, + depth_to_space_test_params{ CASE_DEPTH_TO_SPACE_I8_2, 2, 5 }, +})); diff --git a/src/plugins/intel_gpu/tests/fusions/eltwise_fusion_test.cpp b/src/plugins/intel_gpu/tests/fusions/eltwise_fusion_test.cpp new file mode 100644 index 00000000000..8fc104f3f0e --- /dev/null +++ b/src/plugins/intel_gpu/tests/fusions/eltwise_fusion_test.cpp @@ -0,0 +1,534 @@ +// Copyright (C) 2018-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "test_utils.h" +#include "fusion_test_common.hpp" + +#include +#include +#include +#include + +#include + +using namespace cldnn; +using namespace ::tests; + +namespace { +struct eltwise_test_params { + tensor input_size; + data_types input_type; + data_types input_type2; + format input_format; + data_types default_type; + format default_format; + eltwise_mode mode; + size_t expected_fused_primitives; + size_t expected_not_fused_primitives; +}; + +class EltwiseFusingTest : public ::BaseFusingTest { +public: + void execute(eltwise_test_params& p) { + auto input_prim = get_mem(get_input_layout(p)); + auto input_prim2 = get_mem(get_input_layout2(p)); + + network network_not_fused(this->engine, this->topology_non_fused, bo_not_fused); + network network_fused(this->engine, this->topology_fused, bo_fused); + + auto inputs = network_fused.get_input_ids(); + network_fused.set_input_data("input", input_prim); + network_not_fused.set_input_data("input", input_prim); + if (std::find(inputs.begin(), inputs.end(), "input2") != inputs.end()) { + network_fused.set_input_data("input2", input_prim2); + network_not_fused.set_input_data("input2", input_prim2); + } + + compare(network_not_fused, network_fused, p); + } + + layout get_input_layout(eltwise_test_params& p) { + return 
layout{ p.input_type, p.input_format, p.input_size }; + } + + layout get_input_layout2(eltwise_test_params& p) { + return layout{ p.input_type2, p.input_format, p.input_size }; + } + + layout get_per_channel_layout(eltwise_test_params& p) { + return layout{ p.default_type, p.default_format, tensor{ 1, p.input_size.feature[0], 1, 1 } }; + } +}; +} // namespace + +/* ----------------------------------------------------------------------------------------------------- */ +/* ---------------------------------------- Eltwise cases ---------------------------------------------- */ +/* ----------------------------------------------------------------------------------------------------- */ +#define CASE_ELTWISE_FP32_1 { 2, 16, 4, 4 }, data_types::f32, data_types::f32, format::bfyx, data_types::f32, format::bfyx, eltwise_mode::sum +#define CASE_ELTWISE_FP32_2 { 2, 16, 4, 4 }, data_types::f32, data_types::f32, format::bfzyx, data_types::f32, format::bfzyx, eltwise_mode::sum +#define CASE_ELTWISE_FP32_3 { 2, 32, 4, 8 }, data_types::f32, data_types::f32, format::b_fs_yx_fsv16, data_types::f32, format::b_fs_yx_fsv16, eltwise_mode::sum +#define CASE_ELTWISE_FP32_4 { 2, 16, 4, 4 }, data_types::f32, data_types::f32, format::bfwzyx, data_types::f32, format::bfwzyx, eltwise_mode::sum +#define CASE_ELTWISE_FP16_1 { 2, 16, 4, 4 }, data_types::f16, data_types::f16, format::bfyx, data_types::f16, format::bfyx, eltwise_mode::sum +#define CASE_ELTWISE_FP16_2 { 2, 16, 4, 4 }, data_types::f16, data_types::f16, format::bfzyx, data_types::f16, format::bfzyx, eltwise_mode::sum +#define CASE_ELTWISE_FP16_3 { 2, 32, 4, 8 }, data_types::f16, data_types::f16, format::b_fs_yx_fsv16, data_types::f16, format::b_fs_yx_fsv16, eltwise_mode::sum +#define CASE_ELTWISE_FP16_4 { 3, 32, 4, 4 }, data_types::f16, data_types::f16, format::fs_b_yx_fsv32, data_types::f16, format::fs_b_yx_fsv32, eltwise_mode::sum +#define CASE_ELTWISE_I8_1 { 2, 16, 4, 4 }, data_types::i8, data_types::i8, format::bfyx, 
data_types::f32, format::bfyx, eltwise_mode::sum +#define CASE_ELTWISE_I8_2 { 2, 16, 4, 4 }, data_types::i8, data_types::i8, format::bfzyx, data_types::f32, format::bfzyx, eltwise_mode::sum +#define CASE_ELTWISE_I8_3 { 2, 16, 4, 4 }, data_types::i8, data_types::i8, format::b_fs_yx_fsv16, data_types::f32, format::b_fs_yx_fsv16, eltwise_mode::sum +#define CASE_ELTWISE_U8_1 { 2, 16, 4, 4 }, data_types::u8, data_types::u8, format::bfyx, data_types::f32, format::bfyx, eltwise_mode::sum +#define CASE_ELTWISE_U8_2 { 2, 16, 4, 4 }, data_types::u8, data_types::u8, format::bfzyx, data_types::f32, format::bfzyx, eltwise_mode::sum +#define CASE_ELTWISE_U8_3 { 2, 16, 4, 4 }, data_types::u8, data_types::u8, format::b_fs_yx_fsv16, data_types::f32, format::b_fs_yx_fsv16, eltwise_mode::sum +#define CASE_ELTWISE_FP32_FP16_1 { 2, 16, 4, 4 }, data_types::f32, data_types::f16, format::bfyx, data_types::f32, format::bfyx, eltwise_mode::sum +#define CASE_ELTWISE_FP32_FP16_2 { 2, 16, 4, 4 }, data_types::f32, data_types::f16, format::bfzyx, data_types::f32, format::bfzyx, eltwise_mode::sum +#define CASE_ELTWISE_FP32_FP16_3 { 2, 32, 4, 4 }, data_types::f32, data_types::f16, format::b_fs_yx_fsv16, data_types::f32, format::b_fs_yx_fsv16, eltwise_mode::sum +#define CASE_ELTWISE_FP16_FP32_1 { 2, 16, 4, 4 }, data_types::f16, data_types::f32, format::bfyx, data_types::f16, format::bfyx, eltwise_mode::sum +#define CASE_ELTWISE_FP16_FP32_2 { 2, 16, 4, 4 }, data_types::f16, data_types::f32, format::bfzyx, data_types::f16, format::bfzyx, eltwise_mode::sum +#define CASE_ELTWISE_FP16_FP32_3 { 2, 32, 4, 4 }, data_types::f16, data_types::f32, format::b_fs_yx_fsv16, data_types::f16, format::b_fs_yx_fsv16, eltwise_mode::sum +#define CASE_ELTWISE_I8_FP16_1 { 2, 16, 4, 4 }, data_types::i8, data_types::f16, format::bfyx, data_types::f32, format::bfyx, eltwise_mode::sum +#define CASE_ELTWISE_I8_FP16_2 { 2, 16, 4, 4 }, data_types::i8, data_types::f16, format::bfzyx, data_types::f32, format::bfzyx, 
eltwise_mode::sum +#define CASE_ELTWISE_I8_FP16_3 { 2, 32, 4, 4 }, data_types::i8, data_types::f16, format::b_fs_yx_fsv16, data_types::f32, format::b_fs_yx_fsv16, eltwise_mode::sum +#define CASE_ELTWISE_I8_FP32_1 { 2, 16, 4, 4 }, data_types::i8, data_types::f32, format::bfyx, data_types::f16, format::bfyx, eltwise_mode::sum +#define CASE_ELTWISE_I8_FP32_2 { 2, 16, 4, 4 }, data_types::i8, data_types::f32, format::bfzyx, data_types::f16, format::bfzyx, eltwise_mode::sum +#define CASE_ELTWISE_I8_FP32_3 { 2, 32, 4, 4 }, data_types::i8, data_types::f32, format::b_fs_yx_fsv16, data_types::f16, format::b_fs_yx_fsv16, eltwise_mode::sum +#define CASE_ELTWISE_U8_FP16_1 { 2, 16, 4, 4 }, data_types::u8, data_types::f16, format::bfyx, data_types::f32, format::bfyx, eltwise_mode::sum +#define CASE_ELTWISE_U8_FP16_2 { 2, 16, 4, 4 }, data_types::u8, data_types::f16, format::bfzyx, data_types::f32, format::bfzyx, eltwise_mode::sum +#define CASE_ELTWISE_U8_FP16_3 { 2, 32, 4, 4 }, data_types::u8, data_types::f16, format::b_fs_yx_fsv16, data_types::f32, format::b_fs_yx_fsv16, eltwise_mode::sum +#define CASE_ELTWISE_U8_FP32_1 { 2, 16, 4, 4 }, data_types::u8, data_types::f32, format::bfyx, data_types::f16, format::bfyx, eltwise_mode::sum +#define CASE_ELTWISE_U8_FP32_2 { 2, 16, 4, 4 }, data_types::u8, data_types::f32, format::bfzyx, data_types::f16, format::bfzyx, eltwise_mode::sum +#define CASE_ELTWISE_U8_FP32_3 { 2, 32, 4, 4 }, data_types::u8, data_types::f32, format::b_fs_yx_fsv16, data_types::f16, format::b_fs_yx_fsv16, eltwise_mode::sum + +#define CASE_ELTWISE_FP32_5 { 1, 5, 4, 4 }, data_types::f32, data_types::f32, format::b_fs_yx_fsv4, data_types::f32, format::b_fs_yx_fsv4, eltwise_mode::sum +#define CASE_ELTWISE_FP32_6 { 2, 32, 4, 8 }, data_types::f32, data_types::f32, format::b_fs_yx_fsv4, data_types::f32, format::b_fs_yx_fsv4, eltwise_mode::sum +#define CASE_ELTWISE_FP16_5 { 2, 32, 4, 8 }, data_types::f16, data_types::f16, format::b_fs_yx_fsv4, data_types::f16, 
format::b_fs_yx_fsv4, eltwise_mode::sum +#define CASE_ELTWISE_FP16_6 { 1, 32, 4, 8 }, data_types::f16, data_types::f16, format::byxf, data_types::f16, format::byxf, eltwise_mode::sum +#define CASE_ELTWISE_I8_4 { 2, 16, 4, 4 }, data_types::i8, data_types::i8, format::b_fs_yx_fsv4, data_types::f32, format::b_fs_yx_fsv4, eltwise_mode::sum +#define CASE_ELTWISE_U8_4 { 2, 16, 4, 4 }, data_types::u8, data_types::u8, format::b_fs_yx_fsv4, data_types::f32, format::b_fs_yx_fsv4, eltwise_mode::sum + +class eltwise_quantize : public EltwiseFusingTest {}; +TEST_P(eltwise_quantize, u8) { + auto p = GetParam(); + create_topologies( + input_layout("input", get_input_layout(p)), + input_layout("input2", get_input_layout2(p)), + eltwise("eltwise", { "input", "input2" }, p.mode, p.default_type), + data("in_lo", get_mem(get_single_element_layout(p), min_random, 0)), + data("in_hi", get_mem(get_single_element_layout(p), 1, max_random)), + data("out_lo", get_mem(get_single_element_layout(p), 0)), + data("out_hi", get_mem(get_single_element_layout(p), 255)), + quantize("quantize", "eltwise", "in_lo", "in_hi", "out_lo", "out_hi", 256, data_types::u8), + reorder("out", "quantize", p.default_format, data_types::f32) + ); + + tolerance = 1.f; + execute(p); +} + +TEST_P(eltwise_quantize, i8_per_channel) { + auto p = GetParam(); + create_topologies( + input_layout("input", get_input_layout(p)), + input_layout("input2", get_input_layout2(p)), + eltwise("eltwise", { "input", "input2" }, p.mode, p.default_type), + data("in_lo", get_mem(get_per_channel_layout(p), min_random, 0)), + data("in_hi", get_mem(get_per_channel_layout(p), 1, max_random)), + data("out_lo", get_mem(get_single_element_layout(p), -128)), + data("out_hi", get_mem(get_single_element_layout(p), 127)), + quantize("quantize", "eltwise", "in_lo", "in_hi", "out_lo", "out_hi", 256, data_types::i8), + reorder("out", "quantize", p.default_format, data_types::f32) + ); + + tolerance = 1.f; + execute(p); +} + 
+INSTANTIATE_TEST_SUITE_P(fusings_gpu, eltwise_quantize, ::testing::ValuesIn(std::vector{ + eltwise_test_params{ CASE_ELTWISE_FP16_1, 3, 4 }, + eltwise_test_params{ CASE_ELTWISE_FP16_2, 3, 4 }, + eltwise_test_params{ CASE_ELTWISE_FP16_3, 3, 4 }, + eltwise_test_params{ CASE_ELTWISE_FP32_1, 3, 4 }, + eltwise_test_params{ CASE_ELTWISE_FP32_2, 3, 4 }, + eltwise_test_params{ CASE_ELTWISE_FP32_3, 3, 4 }, + eltwise_test_params{ CASE_ELTWISE_FP32_FP16_1, 3, 4 }, + eltwise_test_params{ CASE_ELTWISE_FP32_FP16_2, 3, 4 }, + eltwise_test_params{ CASE_ELTWISE_FP32_FP16_3, 3, 4 }, + eltwise_test_params{ CASE_ELTWISE_FP16_FP32_1, 3, 4 }, + eltwise_test_params{ CASE_ELTWISE_FP16_FP32_2, 3, 4 }, + eltwise_test_params{ CASE_ELTWISE_FP16_FP32_3, 3, 4 }, + eltwise_test_params{ CASE_ELTWISE_I8_FP32_1, 3, 4 }, + eltwise_test_params{ CASE_ELTWISE_I8_FP32_2, 3, 4 }, + eltwise_test_params{ CASE_ELTWISE_I8_FP32_3, 3, 4 }, + eltwise_test_params{ CASE_ELTWISE_U8_FP32_1, 3, 4 }, + eltwise_test_params{ CASE_ELTWISE_U8_FP32_2, 3, 4 }, + eltwise_test_params{ CASE_ELTWISE_U8_FP32_3, 3, 4 }, + eltwise_test_params{ CASE_ELTWISE_I8_FP16_1, 3, 4 }, + eltwise_test_params{ CASE_ELTWISE_I8_FP16_2, 3, 4 }, + eltwise_test_params{ CASE_ELTWISE_I8_FP16_3, 3, 4 }, + eltwise_test_params{ CASE_ELTWISE_U8_FP16_1, 3, 4 }, + eltwise_test_params{ CASE_ELTWISE_U8_FP16_2, 3, 4 }, + eltwise_test_params{ CASE_ELTWISE_U8_FP16_3, 3, 4 }, + // fsv4 + eltwise_test_params{ CASE_ELTWISE_FP16_5, 3, 4 }, + eltwise_test_params{ CASE_ELTWISE_FP32_5, 3, 4 }, + eltwise_test_params{ CASE_ELTWISE_FP32_6, 3, 4 }, + eltwise_test_params{ CASE_ELTWISE_I8_4, 3, 4 }, + eltwise_test_params{ CASE_ELTWISE_U8_4, 3, 4 }, +})); + +class eltwise_const_path : public EltwiseFusingTest {}; +TEST_P(eltwise_const_path, not_fuse_to_const_eltwise) { + auto p = GetParam(); + create_topologies( + data("const1", get_mem(get_input_layout2(p), -10, 10)), + data("const2", get_mem(get_input_layout2(p), -10, 10)), + input_layout("input", get_input_layout2(p)), 
+ eltwise("eltwise", { "const1", "const2" }, p.mode, p.default_type), + eltwise("add", { "eltwise", "input" }, eltwise_mode::sum), + activation("activation", "add", activation_func::negative), + reorder("out", "activation", p.default_format, data_types::f32) + ); + + tolerance = 1e-5f; + execute(p); +} + +INSTANTIATE_TEST_SUITE_P(fusings_gpu, eltwise_const_path, ::testing::ValuesIn(std::vector{ + eltwise_test_params{ CASE_ELTWISE_FP16_3, 2, 3 }, + eltwise_test_params{ CASE_ELTWISE_FP32_3, 2, 3 }, + eltwise_test_params{ CASE_ELTWISE_FP32_5, 2, 3 }, + eltwise_test_params{ CASE_ELTWISE_FP32_6, 2, 3 }, + eltwise_test_params{ CASE_ELTWISE_I8_4, 2, 3 }, + eltwise_test_params{ CASE_ELTWISE_U8_4, 2, 3 }, +})); + +class eltwise_fp32_fsv16 : public EltwiseFusingTest {}; +TEST_P(eltwise_fp32_fsv16, add) { + auto p = GetParam(); + create_topologies( + input_layout("input", get_input_layout(p)), + input_layout("input2", get_input_layout2(p)), + data("add_data", get_mem(get_per_channel_layout(p), -10, 10)), + eltwise("eltwise", { "input", "input2" }, p.mode, p.default_type), + eltwise("add", { "eltwise", "add_data" }, eltwise_mode::sum), + activation("activation", "add", activation_func::negative), + reorder("out", "activation", p.default_format, data_types::f32) + ); + + implementation_desc eltw_impl = { format::b_fs_yx_fsv16, "eltwise_b_fs_yx_fsv16" }; + bo_fused.set_option(build_option::force_implementations({ { "eltwise", eltw_impl } })); + + tolerance = 1e-5f; + execute(p); +} + +TEST_P(eltwise_fp32_fsv16, add_per_element) { + auto p = GetParam(); + create_topologies( + input_layout("input", get_input_layout(p)), + input_layout("input2", get_input_layout2(p)), + data("add_data", get_mem(get_input_layout(p), -10, 10)), + eltwise("eltwise", { "input", "input2" }, p.mode, p.default_type), + eltwise("add", { "eltwise", "add_data" }, eltwise_mode::sum), + activation("activation", "add", activation_func::negative), + reorder("out", "activation", p.default_format, data_types::f32) 
+ ); + + implementation_desc eltw_impl = { format::b_fs_yx_fsv16, "eltwise_b_fs_yx_fsv16" }; + bo_fused.set_option(build_option::force_implementations({ { "eltwise", eltw_impl } })); + + tolerance = 1e-5f; + execute(p); +} + +INSTANTIATE_TEST_SUITE_P(fusings_gpu, eltwise_fp32_fsv16, ::testing::ValuesIn(std::vector{ + eltwise_test_params{ CASE_ELTWISE_FP16_3, 3, 5 }, + eltwise_test_params{ CASE_ELTWISE_FP32_3, 3, 5 }, +})); + +class eltwise_fp32_fsv32 : public EltwiseFusingTest {}; +TEST_P(eltwise_fp32_fsv32, add) { + auto p = GetParam(); + create_topologies( + input_layout("input", get_input_layout(p)), + input_layout("input2", get_input_layout2(p)), + data("add_data", get_mem(get_per_channel_layout(p), -10, 10)), + eltwise("eltwise", { "input", "input2" }, p.mode, p.default_type), + eltwise("add", { "eltwise", "add_data" }, eltwise_mode::sum), + activation("activation", "add", activation_func::negative), + reorder("out", "activation", p.default_format, data_types::f32) + ); + + implementation_desc eltw_impl = { format::fs_b_yx_fsv32, "eltwise_fs_b_yx_fsv32" }; + bo_fused.set_option(build_option::force_implementations({ { "eltwise", eltw_impl } })); + + tolerance = 1e-5f; + execute(p); +} + +TEST_P(eltwise_fp32_fsv32, add_per_element) { + auto p = GetParam(); + create_topologies( + input_layout("input", get_input_layout(p)), + input_layout("input2", get_input_layout2(p)), + data("add_data", get_mem(get_input_layout(p), -10, 10)), + eltwise("eltwise", { "input", "input2" }, p.mode, p.default_type), + eltwise("add", { "eltwise", "add_data" }, eltwise_mode::sum), + activation("activation", "add", activation_func::negative), + reorder("out", "activation", p.default_format, data_types::f32) + ); + + implementation_desc eltw_impl = { format::fs_b_yx_fsv32, "eltwise_fs_b_yx_fsv32" }; + bo_fused.set_option(build_option::force_implementations({ { "eltwise", eltw_impl } })); + + tolerance = 1e-5f; + execute(p); +} + +INSTANTIATE_TEST_SUITE_P(fusings_gpu, eltwise_fp32_fsv32, 
::testing::ValuesIn(std::vector{ + // There's no optimized eltwise kernel yet for fsv32 layout that supports fused_ops + // So only activation is fused via legacy mechanism + eltwise_test_params{ CASE_ELTWISE_FP16_4, 4, 5 }, +})); + +class eltwise_fp32_fsv4 : public EltwiseFusingTest {}; +TEST_P(eltwise_fp32_fsv4, add) { + auto p = GetParam(); + create_topologies( + input_layout("input", get_input_layout(p)), + input_layout("input2", get_input_layout2(p)), + data("add_data", get_mem(get_per_channel_layout(p), -10, 10)), + eltwise("eltwise", { "input", "input2" }, p.mode, p.default_type), + eltwise("add", { "eltwise", "add_data" }, eltwise_mode::sum), + activation("activation", "add", activation_func::negative), + reorder("out", "activation", p.default_format, data_types::f32) + ); + + implementation_desc eltw_impl = { format::b_fs_yx_fsv4, "eltwise_b_fs_yx_fsv4" }; + bo_fused.set_option(build_option::force_implementations({ { "eltwise", eltw_impl } })); + + tolerance = 1e-5f; + execute(p); +} + +TEST_P(eltwise_fp32_fsv4, add_per_element) { + auto p = GetParam(); + create_topologies( + input_layout("input", get_input_layout(p)), + input_layout("input2", get_input_layout2(p)), + data("add_data", get_mem(get_input_layout(p), -10, 10)), + eltwise("eltwise", { "input", "input2" }, p.mode, p.default_type), + eltwise("add", { "eltwise", "add_data" }, eltwise_mode::sum), + activation("activation", "add", activation_func::negative), + reorder("out", "activation", p.default_format, data_types::f32) + ); + + implementation_desc eltw_impl = { format::b_fs_yx_fsv4, "eltwise_b_fs_yx_fsv4" }; + bo_fused.set_option(build_option::force_implementations({ { "eltwise", eltw_impl } })); + + tolerance = 1e-5f; + execute(p); +} + +INSTANTIATE_TEST_SUITE_P(fusings_gpu, eltwise_fp32_fsv4, ::testing::ValuesIn(std::vector{ + eltwise_test_params{ CASE_ELTWISE_FP32_5, 3, 5 }, + eltwise_test_params{ CASE_ELTWISE_FP32_6, 3, 5 }, + eltwise_test_params{ CASE_ELTWISE_I8_4, 3, 5 }, + 
eltwise_test_params{ CASE_ELTWISE_U8_4, 3, 5 }, +})); + +class eltwise_fp32_fused_prims : public EltwiseFusingTest {}; +TEST_P(eltwise_fp32_fused_prims, scale_activation) { + auto p = GetParam(); + create_topologies( + input_layout("input", get_input_layout(p)), + input_layout("input2", get_input_layout2(p)), + data("scale_data", get_mem(get_per_channel_layout(p), -10, 10)), + eltwise("eltwise", { "input", "input2" }, p.mode, p.default_type), + scale("scale", "eltwise", "scale_data"), + activation("activation", "scale", activation_func::abs), + reorder("out", "activation", p.default_format, data_types::f32) + ); + + tolerance = 1e-5f; + execute(p); +} + +TEST_P(eltwise_fp32_fused_prims, eltwise_activation) { + auto p = GetParam(); + create_topologies( + input_layout("input", get_input_layout(p)), + input_layout("input2", get_input_layout2(p)), + data("eltwise_data", get_mem(get_input_layout2(p), -10, 10)), + eltwise("eltwise1", { "input", "input2" }, p.mode, data_types::f32), + eltwise("eltwise2", { "eltwise1", "eltwise_data" }, eltwise_mode::prod, p.default_type), + activation("activation", "eltwise2", activation_func::abs), + reorder("out", "activation", p.default_format, data_types::f32) + ); + + tolerance = 1e-5f; + execute(p); +} + +TEST_P(eltwise_fp32_fused_prims, eltwise_activation_with_broadcast) { + auto p = GetParam(); + create_topologies( + input_layout("input", get_input_layout(p)), + input_layout("input2", get_input_layout2(p)), + data("eltwise_data", get_mem(get_per_channel_layout(p), -10, 10)), + eltwise("eltwise1", { "input", "input2" }, p.mode, p.default_type), + eltwise("eltwise2", { "eltwise1", "eltwise_data" }, eltwise_mode::prod, p.default_type), + activation("activation", "eltwise2", activation_func::abs), + reorder("out", "activation", p.default_format, data_types::f32) + ); + + tolerance = 1e-5f; + execute(p); +} + +INSTANTIATE_TEST_SUITE_P(fusings_gpu, eltwise_fp32_fused_prims, ::testing::ValuesIn(std::vector{ + eltwise_test_params{ 
CASE_ELTWISE_FP16_1, 3, 5 }, + eltwise_test_params{ CASE_ELTWISE_FP16_2, 3, 5 }, + eltwise_test_params{ CASE_ELTWISE_FP16_3, 3, 5 }, + eltwise_test_params{ CASE_ELTWISE_FP32_1, 3, 5 }, + eltwise_test_params{ CASE_ELTWISE_FP32_2, 3, 5 }, + eltwise_test_params{ CASE_ELTWISE_FP32_3, 3, 5 }, + eltwise_test_params{ CASE_ELTWISE_FP32_FP16_1, 3, 5 }, + eltwise_test_params{ CASE_ELTWISE_FP32_FP16_2, 3, 5 }, + eltwise_test_params{ CASE_ELTWISE_FP32_FP16_3, 3, 5 }, + eltwise_test_params{ CASE_ELTWISE_FP16_FP32_1, 3, 5 }, + eltwise_test_params{ CASE_ELTWISE_FP16_FP32_2, 3, 5 }, + eltwise_test_params{ CASE_ELTWISE_FP16_FP32_3, 3, 5 }, + eltwise_test_params{ CASE_ELTWISE_I8_FP32_1, 3, 5 }, + eltwise_test_params{ CASE_ELTWISE_I8_FP32_2, 3, 5 }, + eltwise_test_params{ CASE_ELTWISE_I8_FP32_3, 3, 5 }, + eltwise_test_params{ CASE_ELTWISE_U8_FP32_1, 3, 5 }, + eltwise_test_params{ CASE_ELTWISE_U8_FP32_2, 3, 5 }, + eltwise_test_params{ CASE_ELTWISE_U8_FP32_3, 3, 5 }, + eltwise_test_params{ CASE_ELTWISE_I8_FP16_1, 3, 5 }, + eltwise_test_params{ CASE_ELTWISE_I8_FP16_2, 3, 5 }, + eltwise_test_params{ CASE_ELTWISE_I8_FP16_3, 3, 5 }, + eltwise_test_params{ CASE_ELTWISE_U8_FP16_1, 3, 5 }, + eltwise_test_params{ CASE_ELTWISE_U8_FP16_2, 3, 5 }, + eltwise_test_params{ CASE_ELTWISE_U8_FP16_3, 3, 5 }, + // fsv4 + eltwise_test_params{ CASE_ELTWISE_FP32_5, 3, 5 }, + eltwise_test_params{ CASE_ELTWISE_FP32_6, 3, 5 }, + eltwise_test_params{ CASE_ELTWISE_I8_4, 3, 5 }, + eltwise_test_params{ CASE_ELTWISE_U8_4, 3, 5 }, +})); + +class eltwise_fp32_scale : public EltwiseFusingTest {}; +TEST_P(eltwise_fp32_scale, 6d) { + auto p = GetParam(); + create_topologies( + input_layout("input", get_input_layout(p)), + input_layout("input2", get_input_layout2(p)), + data("scale_data", get_mem(get_per_channel_layout(p), -10, 10)), + eltwise("eltwise", { "input", "input2" }, p.mode, p.default_type), + scale("scale", "eltwise", "scale_data"), + reorder("out", "scale", p.default_format, data_types::f32) + ); + + 
tolerance = 1e-5f; + execute(p); +} + +INSTANTIATE_TEST_SUITE_P(fusings_gpu, eltwise_fp32_scale, ::testing::ValuesIn(std::vector{ + eltwise_test_params{ CASE_ELTWISE_FP32_4, 3, 4 }, +})); + +class eltwise_fp16_byxf : public EltwiseFusingTest {}; +TEST_P(eltwise_fp16_byxf, add) { + auto p = GetParam(); + create_topologies( + input_layout("input", get_input_layout(p)), + input_layout("input2", get_input_layout2(p)), + data("add_data", get_mem(get_per_channel_layout(p), -10, 10)), + eltwise("eltwise", { "input", "input2" }, p.mode, p.default_type), + eltwise("add", { "eltwise", "add_data" }, eltwise_mode::sum), + activation("activation", "add", activation_func::negative), + reorder("out", "activation", p.default_format, data_types::f32) + ); + + implementation_desc eltw_impl = { format::byxf, "generic_eltwise_ref" }; + bo_fused.set_option(build_option::force_implementations({ { "eltwise", eltw_impl } })); + + tolerance = 1e-5f; + execute(p); +} + +INSTANTIATE_TEST_SUITE_P(fusings_gpu, eltwise_fp16_byxf, ::testing::ValuesIn(std::vector{ + eltwise_test_params{ CASE_ELTWISE_FP16_6, 3, 5 } +})); + +class eltwise_no_pitches_same_dims_quantize : public EltwiseFusingTest {}; +TEST_P(eltwise_no_pitches_same_dims_quantize, quantize_f32_output) { + auto p = GetParam(); + create_topologies( + input_layout("input", get_input_layout(p)), + input_layout("input2", get_input_layout2(p)), + eltwise("eltwise", { "input", "input2" }, p.mode, p.default_type), + data("in_lo", get_mem(get_per_channel_layout(p), min_random, 0)), + data("in_hi", get_mem(get_per_channel_layout(p), 1, max_random)), + data("out_lo", get_mem(get_single_element_layout(p), -128)), + data("out_hi", get_mem(get_single_element_layout(p), 127)), + quantize("quantize", "eltwise", "in_lo", "in_hi", "out_lo", "out_hi", 256, p.input_type), + reorder("out", "quantize", p.default_format, data_types::f32) + ); + + tolerance = 1.f; + execute(p); +} + +INSTANTIATE_TEST_SUITE_P(fusings_gpu, 
eltwise_no_pitches_same_dims_quantize, ::testing::ValuesIn(std::vector{ + eltwise_test_params{ CASE_ELTWISE_FP16_1, 3, 4 }, + eltwise_test_params{ CASE_ELTWISE_FP16_2, 3, 4 }, + eltwise_test_params{ CASE_ELTWISE_FP16_3, 3, 4 }, + eltwise_test_params{ CASE_ELTWISE_FP32_1, 3, 4 }, + eltwise_test_params{ CASE_ELTWISE_FP32_2, 3, 4 }, + eltwise_test_params{ CASE_ELTWISE_FP32_3, 3, 4 }, +})); + +class eltwise_activation : public EltwiseFusingTest {}; +TEST_P(eltwise_activation, basic) { + auto p = GetParam(); + create_topologies( + input_layout("input", get_input_layout(p)), + input_layout("input2", get_input_layout2(p)), + eltwise("eltwise", { "input", "input2" }, p.mode, p.default_type), + activation("activation", "eltwise", activation_func::relu, { 6.0f, 0.0f }), + reorder("out", "activation", p.default_format, data_types::f32) + ); + + tolerance = 1e-5f; + execute(p); +} + +TEST_P(eltwise_activation, fp16_out) { + auto p = GetParam(); + create_topologies( + input_layout("input", get_input_layout(p)), + input_layout("input2", get_input_layout2(p)), + eltwise("eltwise", { "input", "input2" }, p.mode, data_types::f16), + activation("activation", "eltwise", activation_func::relu, { 6.0f, 0.0f }), + reorder("out", "activation", p.default_format, data_types::f32) + ); + + tolerance = 1e-5f; + execute(p); +} + +INSTANTIATE_TEST_SUITE_P(fusings_gpu, eltwise_activation, ::testing::ValuesIn(std::vector{ + eltwise_test_params{ CASE_ELTWISE_FP16_1, 3, 4 }, + eltwise_test_params{ CASE_ELTWISE_FP16_2, 3, 4 }, + eltwise_test_params{ CASE_ELTWISE_FP16_3, 3, 4 }, + eltwise_test_params{ CASE_ELTWISE_FP32_1, 3, 4 }, + eltwise_test_params{ CASE_ELTWISE_FP32_2, 3, 4 }, + eltwise_test_params{ CASE_ELTWISE_FP32_3, 3, 4 }, + eltwise_test_params{ CASE_ELTWISE_FP32_FP16_1, 3, 4 }, + eltwise_test_params{ CASE_ELTWISE_FP32_FP16_2, 3, 4 }, + eltwise_test_params{ CASE_ELTWISE_FP32_FP16_3, 3, 4 }, + eltwise_test_params{ CASE_ELTWISE_FP16_FP32_1, 3, 4 }, + eltwise_test_params{ 
CASE_ELTWISE_FP16_FP32_2, 3, 4 }, + eltwise_test_params{ CASE_ELTWISE_FP16_FP32_3, 3, 4 } +})); diff --git a/src/plugins/intel_gpu/tests/fusions/fully_connected_fusion_test.cpp b/src/plugins/intel_gpu/tests/fusions/fully_connected_fusion_test.cpp new file mode 100644 index 00000000000..5d10df7c2c0 --- /dev/null +++ b/src/plugins/intel_gpu/tests/fusions/fully_connected_fusion_test.cpp @@ -0,0 +1,393 @@ +// Copyright (C) 2018-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "test_utils.h" +#include "fusion_test_common.hpp" + +#include +#include +#include +#include +#include +#include + +#include + +using namespace cldnn; +using namespace ::tests; + +namespace { +struct fully_connected_test_params { + tensor in_shape; + tensor out_shape; + tensor kernel; + tensor stride; + tensor pad; + tensor dilation; + uint32_t groups; + data_types data_type; + format input_format; + data_types weights_type; + format weights_format; + data_types default_type; + format default_format; + size_t expected_fused_primitives; + size_t expected_not_fused_primitives; +}; + +class FullyConnectedFusingTest : public ::BaseFusingTest { +public: + + void execute(fully_connected_test_params& p) { + auto input_prim = this->get_mem(get_input_layout(p)); + network network_not_fused(this->engine, this->topology_non_fused, this->bo_not_fused); + network network_fused(this->engine, this->topology_fused, this->bo_fused); + network_fused.set_input_data("input", input_prim); + network_not_fused.set_input_data("input", input_prim); + + this->compare(network_not_fused, network_fused, p); + } + + layout get_input_layout(fully_connected_test_params& p) { + auto pad = p.pad; + std::vector pad_ = { 0, 0, pad.spatial[0], pad.spatial[1] }; + return layout{ p.data_type, p.input_format, p.in_shape, padding{ pad_ } }; + } + + layout get_per_channel_layout(fully_connected_test_params& p) { + return layout{ p.default_type, p.default_format, tensor{ 1, p.out_shape.feature[0], 1, 1 } }; + } 
+ + size_t get_output_dim_size(fully_connected_test_params& p) { + size_t size = 2; + for (auto i : p.out_shape.spatial) { + if (i > 1) + size++; + } + return size; + } + + layout get_weights_layout(fully_connected_test_params& p) { + cldnn::tensor weights_tensor; + if (p.out_shape.spatial[1] > 1) { + // 3d case + weights_tensor = cldnn::tensor(p.kernel.batch[0], p.kernel.feature[0], 1, 1); + } + else { + weights_tensor = cldnn::tensor(batch(p.out_shape.feature[0]), feature(p.in_shape.feature[0]), + spatial(p.kernel.spatial[0], p.kernel.spatial[1], p.kernel.spatial[2])); + } + return layout{ p.weights_type, p.weights_format, weights_tensor }; + } + + layout get_bias_layout(fully_connected_test_params& p) { + if (p.out_shape.spatial[1] > 1) { + // 3d case + return layout{ p.default_type, format::bfyx, tensor{ 1, p.out_shape.spatial[1], 1, 1 } }; + } + else { + return layout{ p.default_type, format::bfyx, tensor{ 1, p.out_shape.feature[0], 1, 1 } }; + } + } +}; + + +#ifdef ENABLE_ONEDNN_FOR_GPU +class FullyConnectedFusingTestOneDNN : public BaseFusingTest { +public: + void execute(fully_connected_test_params& p) { + // Onednn post operation has issue in a machine that does not support imad. + if (!engine.get_device_info().supports_imad) + return; + + auto input_prim = p.data_type == data_types::u8 ? 
get_mem(get_input_layout(p), 0, 10) : get_mem(get_input_layout(p)); + + auto impl_forcing_bo = bo_fused.get(); + const auto& impl_forcing = impl_forcing_bo->forcing; + + auto forcing_format = p.input_format; + for (auto& forcing : impl_forcing) { + if (forcing.first == "conv_prim") { + forcing_format = forcing.second.output_format; + } + } + + implementation_desc conv_impl = { forcing_format, "", impl_types::onednn }; + bo_fused.set_option(build_option::force_implementations({ { "conv_prim", conv_impl } })); + + network network_not_fused(this->engine, this->topology_non_fused, bo_not_fused); + network network_fused(this->engine, this->topology_fused, bo_fused); + network_fused.set_input_data("input", input_prim); + network_not_fused.set_input_data("input", input_prim); + + compare(network_not_fused, network_fused, p); + auto find_conv = [](primitive_info& p) -> bool { + if (p.original_id == "conv_prim") + return true; + return false; + }; + + auto pi_fused = network_fused.get_primitives_info(); + auto info_fused = std::find_if(pi_fused.begin(), pi_fused.end(), find_conv); + if (info_fused != pi_fused.end()) + std::cout << "kernel: " << info_fused->kernel_id << std::endl; + } + + layout get_input_layout(fully_connected_test_params& p) { + auto pad = p.pad; + std::vector pad_ = { 0, 0, pad.spatial[0], pad.spatial[1] }; + return layout{ p.data_type, p.input_format, p.in_shape, padding{ pad_ } }; + } + + layout get_per_channel_layout(fully_connected_test_params& p) { + return layout{ p.default_type, p.default_format, tensor{1, p.out_shape.feature[0], 1, 1} }; + } + + size_t get_output_dim_size(fully_connected_test_params& p) { + size_t size = 2; + for (auto i : p.out_shape.spatial) { + if (i > 1) + size++; + } + return size; + } +}; +#endif // ENABLE_ONEDNN_FOR_GPU + +} // namespace + +// in_shape; out_shape; kernel; stride; pad; dilation; groups; data_type; input_format; weights_type; weights_format; default_type; default_format; +#define CASE_FC_FP32_1 { 1, 1, 3, 1 
}, { 1, 4, 1, 1 }, { 4, 1, 3, 1 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::f32, format::bfyx, data_types::f32, format::oiyx, data_types::f32, format::bfyx +#define CASE_FC_FP32_2 { 2, 1, 3, 1 }, { 2, 4, 1, 1 }, { 4, 1, 3, 1 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::f32, format::yxfb, data_types::f32, format::oiyx, data_types::f32, format::bfyx +#define CASE_FC_FP32_3 { 2, 32, 1, 1 }, { 2, 16, 1, 1 }, { 16, 32, 1, 1 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::f32, format::bfyx, data_types::i8, format::oiyx, data_types::f32, format::bfyx +#define CASE_FC_FP32_3D_1 { 5, 3, 1, 3 }, { 5, 3, 1, 5 }, { 5, 3, 1, 1 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::f32, format::bfyx, data_types::f32, format::os_iyx_osv16, data_types::f32, format::bfyx +#define CASE_FC_FP32_3D_2 { 2, 1, 1, 1 }, { 2, 1, 1, 32 }, { 32, 1, 1, 1 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::f32, format::bfyx, data_types::f32, format::os_iyx_osv16, data_types::f32, format::bfyx +#define CASE_FC_FP32_3D_3 { 2, 32, 1, 32 }, { 2, 32, 1, 16 }, { 16, 32, 1, 1 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::f32, format::bfyx, data_types::f32, format::os_iyx_osv16, data_types::f32, format::bfyx + +#define CASE_FC_U8S8_1 { 1, 1, 3, 1 }, { 1, 4, 1, 1 }, { 4, 1, 3, 1 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::u8, format::bfyx, data_types::i8, format::oiyx, data_types::f32, format::bfyx +#define CASE_FC_U8S8_2 { 2, 1, 3, 1 }, { 2, 4, 1, 1 }, { 4, 1, 3, 1 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::u8, format::b_fs_yx_fsv4, data_types::i8, format::oiyx, data_types::f32, format::bfyx +#define CASE_FC_U8S8_3 { 2, 32, 1, 1 }, { 2, 16, 1, 1 }, { 16, 32, 1, 1 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::u8, format::b_fs_yx_fsv4, data_types::i8, format::oiyx, data_types::f32, format::bfyx +#define CASE_FC_U8S8_3D_1 { 2, 32, 1, 3 }, { 2, 32, 1, 16 }, { 16, 3, 1, 1 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 1, 
data_types::u8, format::bfyx, data_types::i8, format::oiyx, data_types::f32, format::bfyx +#define CASE_FC_U8S8_3D_2 { 1, 1, 1, 3 }, { 1, 1, 1, 32 }, { 32, 3, 1, 1 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::u8, format::bfyx, data_types::i8, format::oiyx, data_types::f32, format::bfyx +#define CASE_FC_U8S8_3D_3 { 2, 3, 1, 1 }, { 2, 3, 1, 15 }, { 15, 1, 1, 1 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::u8, format::bfyx, data_types::i8, format::oiyx, data_types::f32, format::bfyx +#define CASE_FC_U8S8_3D_4 { 1, 512, 1, 1024 }, { 1, 384, 1, 1024 }, { 1024, 1024, 1, 1 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::u8, format::bfyx, data_types::i8, format::oiyx, data_types::f32, format::bfyx + +/* ----------------------------------------------------------------------------------------------------- */ +/* ---------------------------------------- FC cases --------------------------------------------------- */ +/* ----------------------------------------------------------------------------------------------------- */ +class fc_fp32_activation : public FullyConnectedFusingTest {}; +TEST_P(fc_fp32_activation, basic) { + auto p = GetParam(); + create_topologies( + input_layout("input", get_input_layout(p)), + data("weights", get_mem(get_weights_layout(p))), + data("bias", get_mem(get_bias_layout(p))), + fully_connected("fc_prim", "input", "weights", "bias", "", padding(), get_output_dim_size(p)), + activation("activation", "fc_prim", activation_func::abs), + reorder("reorder_bfyx", "activation", p.default_format, data_types::f32) + ); + + tolerance = 1e-5f; + execute(p); +} +INSTANTIATE_TEST_SUITE_P(fusings_gpu, fc_fp32_activation, ::testing::ValuesIn(std::vector{ + fully_connected_test_params{ CASE_FC_FP32_1, 2, 3 }, + fully_connected_test_params{ CASE_FC_FP32_2, 2, 3 }, + fully_connected_test_params{ CASE_FC_FP32_3, 2, 3 }, + fully_connected_test_params{ CASE_FC_FP32_3D_1, 2, 3 }, + fully_connected_test_params{ CASE_FC_FP32_3D_2, 2, 3 }, + 
fully_connected_test_params{ CASE_FC_FP32_3D_3, 2, 3 }, +})); + +class fc_fp32_bias : public FullyConnectedFusingTest {}; +TEST_P(fc_fp32_bias, basic) { + auto p = GetParam(); + create_topologies( + input_layout("input", get_input_layout(p)), + data("weights", get_mem(get_weights_layout(p))), + data("bias", get_mem(get_bias_layout(p))), + fully_connected("fc_prim", "input", "weights", ""), + eltwise("bias_add", { "fc_prim", "bias" }, eltwise_mode::sum), + reorder("reorder_bfyx", "bias_add", p.default_format, data_types::f32) + ); + + tolerance = 1e-5f; + execute(p); +} + +INSTANTIATE_TEST_SUITE_P(fusings_gpu, fc_fp32_bias, ::testing::ValuesIn(std::vector{ + fully_connected_test_params{ CASE_FC_FP32_1, 2, 3 }, + fully_connected_test_params{ CASE_FC_FP32_2, 2, 3 }, + fully_connected_test_params{ CASE_FC_FP32_3, 2, 3 }, +})); + +class fc_int8_scale : public FullyConnectedFusingTest {}; +TEST_P(fc_int8_scale, basic) { + auto p = GetParam(); + create_topologies( + input_layout("input", get_input_layout(p)), + data("weights", get_mem(get_weights_layout(p))), + data("bias", get_mem(get_bias_layout(p))), + data("scale_data", get_mem(get_per_channel_layout(p), 1.0f / p.kernel.count())), + fully_connected("fc_prim", "input", "weights", "bias", data_types::f32, "", padding(), get_output_dim_size(p)), + scale("scale", "fc_prim", "scale_data"), + reorder("reorder_bfyx", "scale", p.default_format, data_types::f32) + ); + + tolerance = 1e-5f; + execute(p); +} + +TEST_P(fc_int8_scale, fp16_scale_out) { + auto p = GetParam(); + create_topologies( + input_layout("input", get_input_layout(p)), + data("weights", get_mem(get_weights_layout(p))), + data("bias", get_mem(get_bias_layout(p))), + data("scale_data", get_mem(get_per_channel_layout(p), 1.0f / p.kernel.count())), + fully_connected("fc_prim", "input", "weights", "bias", data_types::f32, "", padding(), get_output_dim_size(p)), + scale("scale", "fc_prim", "scale_data", optional_data_type{ data_types::f16 }), + 
reorder("reorder_bfyx", "scale", p.default_format, data_types::f32) + ); + + tolerance = 1e-5f; + execute(p); +} + +INSTANTIATE_TEST_SUITE_P(fusings_gpu, fc_int8_scale, ::testing::ValuesIn(std::vector{ + fully_connected_test_params{ CASE_FC_U8S8_1, 2, 3 }, + fully_connected_test_params{ CASE_FC_U8S8_2, 2, 3 }, + fully_connected_test_params{ CASE_FC_U8S8_3, 2, 3 }, + fully_connected_test_params{ CASE_FC_U8S8_3D_1, 2, 3 }, + fully_connected_test_params{ CASE_FC_U8S8_3D_2, 2, 3 }, + fully_connected_test_params{ CASE_FC_U8S8_3D_3, 2, 3 }, +})); + +class fc_int8_quantize_u8 : public FullyConnectedFusingTest {}; +TEST_P(fc_int8_quantize_u8, basic) { + auto p = GetParam(); + create_topologies( + input_layout("input", get_input_layout(p)), + data("weights", get_mem(get_weights_layout(p))), + data("bias", get_mem(get_bias_layout(p))), + data("in_lo", get_mem(get_per_channel_layout(p), min_random, 0)), + data("in_hi", get_mem(get_per_channel_layout(p), 1, max_random)), + data("out_lo", get_mem(get_single_element_layout(p), 0)), + data("out_hi", get_mem(get_single_element_layout(p), 255)), + fully_connected("fc_prim", "input", "weights", "bias", data_types::f32, "", padding(), get_output_dim_size(p)), + quantize("quantize", "fc_prim", "in_lo", "in_hi", "out_lo", "out_hi", 256, data_types::u8), + reorder("reorder_bfyx", "quantize", p.default_format, data_types::f32) + ); + + tolerance = 1.f; + execute(p); +} + +INSTANTIATE_TEST_SUITE_P(fusings_gpu_fc, fc_int8_quantize_u8, ::testing::ValuesIn(std::vector{ + fully_connected_test_params{ CASE_FC_U8S8_1, 2, 3 }, + fully_connected_test_params{ CASE_FC_U8S8_2, 2, 3 }, + fully_connected_test_params{ CASE_FC_U8S8_3, 2, 3 }, + fully_connected_test_params{ CASE_FC_U8S8_3D_1, 2, 3 }, + fully_connected_test_params{ CASE_FC_U8S8_3D_2, 2, 3 }, + fully_connected_test_params{ CASE_FC_U8S8_3D_3, 2, 3 }, +})); + +class fc_int8_scale_quantize_i8 : public FullyConnectedFusingTest {}; +TEST_P(fc_int8_scale_quantize_i8, basic) { + auto p = 
GetParam(); + create_topologies( + input_layout("input", get_input_layout(p)), + data("weights", get_mem(get_weights_layout(p))), + data("bias", get_mem(get_bias_layout(p))), + data("in_lo", get_mem(get_per_channel_layout(p), min_random, 0)), + data("in_hi", get_mem(get_per_channel_layout(p), 1, max_random)), + data("out_lo", get_mem(get_single_element_layout(p), -127)), + data("out_hi", get_mem(get_single_element_layout(p), 127)), + data("scale_data", get_mem(get_per_channel_layout(p), 1.0f / p.kernel.count() / 255)), + fully_connected("fc_prim", "input", "weights", "bias", data_types::f32, "", padding(), get_output_dim_size(p)), + scale("scale", "fc_prim", "scale_data"), + quantize("quantize", "scale", "in_lo", "in_hi", "out_lo", "out_hi", 255, data_types::i8), + reorder("reorder_bfyx", "quantize", p.default_format, data_types::f32) + ); + + tolerance = 1e-5f; + execute(p); +} + +INSTANTIATE_TEST_SUITE_P(fusings_gpu, fc_int8_scale_quantize_i8, ::testing::ValuesIn(std::vector{ + fully_connected_test_params{ CASE_FC_U8S8_1, 2, 4 }, + fully_connected_test_params{ CASE_FC_U8S8_2, 2, 4 }, + fully_connected_test_params{ CASE_FC_U8S8_3, 2, 4 }, + fully_connected_test_params{ CASE_FC_U8S8_3D_1, 2, 4 }, + fully_connected_test_params{ CASE_FC_U8S8_3D_2, 2, 4 }, + fully_connected_test_params{ CASE_FC_U8S8_3D_3, 2, 4 }, +})); + +class fc_int8_scale_activation_quantize_i8 : public FullyConnectedFusingTest {}; +TEST_P(fc_int8_scale_activation_quantize_i8, basic) { + auto p = GetParam(); + create_topologies( + input_layout("input", get_input_layout(p)), + data("weights", get_mem(get_weights_layout(p))), + data("bias", get_mem(get_bias_layout(p))), + data("in_lo", get_mem(get_per_channel_layout(p), min_random, 0)), + data("in_hi", get_mem(get_per_channel_layout(p), 1, max_random)), + data("out_lo", get_mem(get_single_element_layout(p), -127)), + data("out_hi", get_mem(get_single_element_layout(p), 127)), + data("scale_data", get_mem(get_per_channel_layout(p), 1.0f / 
p.kernel.count() / 255)), + fully_connected("fc_prim", "input", "weights", "bias", data_types::f32, "", padding(), get_output_dim_size(p)), + scale("scale", "fc_prim", "scale_data"), + activation("activation_scale", "scale", activation_func::exp), + quantize("quantize", "activation_scale", "in_lo", "in_hi", "out_lo", "out_hi", 255, data_types::i8), + reorder("reorder_bfyx", "quantize", p.default_format, data_types::f32) + ); + + tolerance = 1e-5f; + execute(p); +} + +INSTANTIATE_TEST_SUITE_P(fusings_gpu, fc_int8_scale_activation_quantize_i8, ::testing::ValuesIn(std::vector{ + fully_connected_test_params{ CASE_FC_U8S8_1, 2, 5 }, + fully_connected_test_params{ CASE_FC_U8S8_2, 2, 5 }, + fully_connected_test_params{ CASE_FC_U8S8_3, 2, 5 }, + + fully_connected_test_params{ CASE_FC_U8S8_3D_1, 2, 5 }, + fully_connected_test_params{ CASE_FC_U8S8_3D_2, 2, 5 }, + fully_connected_test_params{ CASE_FC_U8S8_3D_3, 2, 5 }, + + fully_connected_test_params{ CASE_FC_FP32_3D_1, 3, 5 }, + fully_connected_test_params{ CASE_FC_FP32_3D_2, 3, 5 }, + fully_connected_test_params{ CASE_FC_FP32_3D_3, 3, 5 }, +})); + +#ifdef ENABLE_ONEDNN_FOR_GPU + +// FC onednn sum case +class fc_int8_inputs_fused_fp32_sum : public FullyConnectedFusingTestOneDNN {}; +TEST_P(fc_int8_inputs_fused_fp32_sum, basic) { + auto p = GetParam(); + auto shift_layout = layout{ p.default_type, p.default_format, tensor{ 1, 1, 1, p.kernel.batch[0] } }; + + create_topologies( + input_layout("input", get_input_layout(p)), + data("weights", get_mem(get_weights_layout(p))), + data("bias", get_mem(get_bias_layout(p))), + data("shift_data", get_mem(shift_layout, 1)), + fully_connected("fc_prim", "input", "weights", "bias", cldnn::data_types::f32, "", padding(), get_output_dim_size(p)), + eltwise("shift", { "fc_prim", "shift_data" }, eltwise_mode::sum, cldnn::data_types::f32), + crop("crop", "shift", get_output_layout(p).size, { 0, 0, 0, 0 }), + reorder("reorder_bfyx", "crop", p.default_format, data_types::f32) + ); + + tolerance 
= 1e-5f; + execute(p); +} + +INSTANTIATE_TEST_SUITE_P(fusings_gpu, fc_int8_inputs_fused_fp32_sum, ::testing::ValuesIn(std::vector{ + // OneDNN has issue with small shapes - ticket 7064 + // fully_connected_test_params{ CASE_FC_U8S8_3D_1, 2, 4 }, + // fully_connected_test_params{ CASE_FC_U8S8_3D_2, 2, 4 }, + fully_connected_test_params{ CASE_FC_U8S8_3D_4, 2, 4 }, +})); +#endif diff --git a/src/plugins/intel_gpu/tests/fusions/fusion_test_common.hpp b/src/plugins/intel_gpu/tests/fusions/fusion_test_common.hpp new file mode 100644 index 00000000000..32751c8350b --- /dev/null +++ b/src/plugins/intel_gpu/tests/fusions/fusion_test_common.hpp @@ -0,0 +1,230 @@ +// Copyright (C) 2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "test_utils.h" + +#include +#include +#include + +#include + +using namespace cldnn; +using namespace ::tests; + +template +class BaseFusingTest : public ::testing::TestWithParam { +public: +#ifdef ENABLE_ONEDNN_FOR_GPU + cldnn::engine& engine = get_onednn_test_engine(); +#else + cldnn::engine& engine = get_test_engine(); +#endif + cldnn::topology topology_fused; + cldnn::topology topology_non_fused; + cldnn::build_options bo_fused; + cldnn::build_options bo_not_fused; + + float tolerance = 0.0f; + + static const int min_random = -200; + static const int max_random = 200; + + void SetUp() override { + bo_fused.set_option(build_option::optimize_data(true)); + bo_not_fused.set_option(build_option::optimize_data(false)); + bo_not_fused.set_option(build_option::allow_static_input_reorder(true)); + } + + void compare(network& not_fused, network& fused, T& p, bool count_reorder = false) { + auto outputs_ref = not_fused.execute(); + auto outputs_fused = fused.execute(); + auto get_reorders_count = [](network& net) -> size_t { + size_t count = 0; + for (auto& pi : net.get_primitives_info()) { + if (pi.type_id == "reorder") { + auto exec_prims = net.get_executed_primitives(); + auto it = 
std::find_if(exec_prims.begin(), exec_prims.end(), [&](const std::pair& e) -> bool { + return e.first == pi.original_id; + }); + // We count executed reorders only + if (it != exec_prims.end()) + count++; + } + } + return count; + }; + + size_t reorders_count_fused = get_reorders_count(fused); + size_t reorders_count_not_fused = get_reorders_count(not_fused); + + std::stringstream description; + description << std::endl << "not fused: " << std::endl; + for (auto i : not_fused.get_primitives_info()) { + description << " " << i.original_id << " " << i.kernel_id << std::endl; + } + description << "fused: " << std::endl; + for (auto i : fused.get_primitives_info()) { + description << " " << i.original_id << " " << i.kernel_id << std::endl; + } + SCOPED_TRACE(description.str()); + // Subtract reorders count to handle execution in different layouts when input/output reorders can be added in the graph + ASSERT_EQ(fused.get_executed_primitives().size() - (count_reorder ? 0 : reorders_count_fused), p.expected_fused_primitives); + ASSERT_EQ(not_fused.get_executed_primitives().size() - (count_reorder ? 
0 : reorders_count_not_fused), p.expected_not_fused_primitives); + ASSERT_EQ(outputs_ref.size(), outputs_fused.size()); + ASSERT_EQ(outputs_ref.size(), size_t(1)); + + auto output_not_fused_prim = outputs_ref.begin()->second.get_memory(); + auto output_fused_prim = outputs_fused.begin()->second.get_memory(); + if (output_not_fused_prim->get_layout().data_type == data_types::f32) { + cldnn::mem_lock ref(output_not_fused_prim, get_test_stream()); + cldnn::mem_lock output_ptr(output_fused_prim, get_test_stream()); + for (size_t i = 0; i < output_fused_prim->get_layout().count(); i++) { + ASSERT_NEAR(ref[i], output_ptr[i], tolerance) << "i = " << i; + } + } else { + cldnn::mem_lock ref(output_not_fused_prim, get_test_stream()); + cldnn::mem_lock output_ptr(output_fused_prim, get_test_stream()); + for (size_t i = 0; i < output_fused_prim->get_layout().count(); i++) { + ASSERT_NEAR(float16_to_float32(ref[i]), float16_to_float32(output_ptr[i]), tolerance) << "i = " << i; + } + } + } + + cldnn::memory::ptr get_mem(cldnn::layout l) { + auto prim = engine.allocate_memory(l); + tensor s = l.size; + if (l.data_type == data_types::bin) { + VF rnd_vec = generate_random_1d(s.count() / 32, min_random, max_random); + set_values(prim, rnd_vec); + } else if (l.data_type == data_types::i8 || l.data_type == data_types::u8) { + VF rnd_vec = generate_random_1d(s.count(), min_random, max_random); + set_values(prim, rnd_vec); + } else if (l.data_type == data_types::f16) { + VF rnd_vec = generate_random_1d(s.count(), -1, 1); + set_values(prim, rnd_vec); + } else { + VF rnd_vec = generate_random_1d(s.count(), -1, 1); + set_values(prim, rnd_vec); + } + + return prim; + } + + cldnn::memory::ptr get_mem(cldnn::layout l, float fill_value) { + auto prim = engine.allocate_memory(l); + tensor s = l.size; + if (l.data_type == data_types::bin) { + VF rnd_vec(s.count() / 32, static_cast(fill_value)); + set_values(prim, rnd_vec); + } else if (l.data_type == data_types::f16) { + VF rnd_vec(s.count(), 
float32_to_float16(fill_value)); + set_values(prim, rnd_vec); + } else if (l.data_type == data_types::f32) { + VF rnd_vec(s.count(), fill_value); + set_values(prim, rnd_vec); + } else if (l.data_type == data_types::u8) { + VF rnd_vec(s.count(), static_cast(fill_value)); + set_values(prim, rnd_vec); + } else if (l.data_type == data_types::i8) { + VF rnd_vec(s.count(), static_cast(fill_value)); + set_values(prim, rnd_vec); + } else { + throw std::runtime_error("get_mem: Unsupported precision"); + } + + return prim; + } + + cldnn::memory::ptr get_repeatless_mem(cldnn::layout l, int min, int max) { + auto prim = engine.allocate_memory(l); + tensor s = l.size; + if (l.data_type == data_types::f32) { + VF rnd_vec = generate_random_norepetitions_1d(s.count(), min, max); + set_values(prim, rnd_vec); + } else if (l.data_type == data_types::f16) { + VF rnd_vec = generate_random_norepetitions_1d(s.count(), min, max); + set_values(prim, rnd_vec); + } else if (l.data_type == data_types::i8) { + VF rnd_vec = generate_random_norepetitions_1d(s.count(), min, max); + set_values(prim, rnd_vec); + } + else if (l.data_type == data_types::bin) { + VF rnd_vec = generate_random_norepetitions_1d(s.count(), min, max); + set_values(prim, rnd_vec); + } + + return prim; + } + + cldnn::memory::ptr get_mem(cldnn::layout l, int min, int max) { + auto prim = engine.allocate_memory(l); + tensor s = l.size; + if (l.data_type == data_types::f32) { + VF rnd_vec = generate_random_1d(s.count(), min, max); + set_values(prim, rnd_vec); + } else if (l.data_type == data_types::f16) { + VF rnd_vec = generate_random_1d(s.count(), min, max); + set_values(prim, rnd_vec); + } else if (l.data_type == data_types::i8) { + VF rnd_vec = generate_random_1d(s.count(), min, max); + set_values(prim, rnd_vec); + } else if (l.data_type == data_types::u8) { + VF rnd_vec = generate_random_1d(s.count(), min, max); + set_values(prim, rnd_vec); + } else if (l.data_type == data_types::bin) { + VF rnd_vec = 
generate_random_1d(s.count() / 32, min, max); + set_values(prim, rnd_vec); + } + + return prim; + } + + layout get_output_layout(T& p) { + return layout{ p.data_type, p.input_format, p.out_shape }; + } + + layout get_weights_layout(T& p, const int32_t /* split */ = 1) { + cldnn::tensor weights_tensor; + if (p.groups == 1) { + weights_tensor = cldnn::tensor(batch(p.out_shape.feature[0]), feature(p.in_shape.feature[0]), + spatial(p.kernel.spatial[0], p.kernel.spatial[1], p.kernel.spatial[2])); + } else { + weights_tensor = cldnn::tensor(group(p.groups), batch(p.out_shape.feature[0] / p.groups), feature(p.in_shape.feature[0] / p.groups), + spatial(p.kernel.spatial[0], p.kernel.spatial[1], p.kernel.spatial[2])); + } + return layout{p.weights_type, p.weights_format, weights_tensor}; + } + + layout get_weights_layout(T& p, const int32_t /* split */, cldnn::format f) { + cldnn::tensor weights_tensor; + weights_tensor = cldnn::tensor(batch(p.out_shape.feature[0]), feature(static_cast(p.in_shape.feature[0] / p.groups)), + spatial(p.kernel.spatial[0], p.kernel.spatial[1], p.kernel.spatial[2])); + return layout{p.weights_type, f, weights_tensor}; + } + + layout get_bias_layout(T& p) { + return layout{ p.default_type, format::bfyx, tensor{1, p.out_shape.feature[0], 1, 1} }; + } + + layout get_weights_zp_layout(T& p) { + return layout{ p.weights_type, p.default_format, tensor{p.out_shape.feature[0], 1, 1, 1} }; + } + + layout get_activations_zp_layout(T& p) { + return layout{ p.data_type, p.default_format, tensor{1, p.in_shape.feature[0], 1, 1} }; + } + + layout get_single_element_layout(T& p) { + return layout{ p.default_type, p.default_format, tensor{1, 1, 1, 1} }; + } + + template + void create_topologies(Args const&... 
args) { + topology_fused.add(args...); + topology_non_fused.add(args...); + } +}; diff --git a/src/plugins/intel_gpu/tests/fusions/gather_elements_fusion_test.cpp b/src/plugins/intel_gpu/tests/fusions/gather_elements_fusion_test.cpp new file mode 100644 index 00000000000..a6a6e5045a5 --- /dev/null +++ b/src/plugins/intel_gpu/tests/fusions/gather_elements_fusion_test.cpp @@ -0,0 +1,243 @@ +// Copyright (C) 2018-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "test_utils.h" +#include "fusion_test_common.hpp" + +#include +#include +#include +#include +#include + +#include + +using namespace cldnn; +using namespace ::tests; + +namespace { +struct gather_elements_test_params { + data_types data_type; + + format input_format; + tensor input_shape; + + format indices_format; + tensor indices_shape; + + format output_format; + tensor output_shape; + + cldnn::gather_elements::gather_elements_axis axis; + + data_types default_type; + format default_format; + + size_t expected_fused_primitives; + size_t expected_not_fused_primitives; +}; + +class GatherElementsPrimitiveFusingTest : public ::BaseFusingTest { +public: + void execute(gather_elements_test_params& p) { + auto input_prim = get_mem(get_input_layout(p)); + network network_not_fused(this->engine, this->topology_non_fused, bo_not_fused); + network network_fused(this->engine, this->topology_fused, bo_fused); + network_fused.set_input_data("input", input_prim); + network_not_fused.set_input_data("input", input_prim); + compare(network_not_fused, network_fused, p); + } + + size_t get_axis_dim(gather_elements_test_params& p) { + switch (p.axis) { + case cldnn::gather_elements::gather_elements_axis::along_x: + return p.input_shape.spatial[0]; + case cldnn::gather_elements::gather_elements_axis::along_y: + return p.input_shape.spatial[1]; + case cldnn::gather_elements::gather_elements_axis::along_z: + return p.input_shape.spatial[2]; + case cldnn::gather_elements::gather_elements_axis::along_w: 
+ return p.input_shape.spatial[3]; + case cldnn::gather_elements::gather_elements_axis::along_f: + return p.input_shape.feature[0]; + case cldnn::gather_elements::gather_elements_axis::along_b: + return p.input_shape.batch[0]; + default: + return 1; + } + } + + layout get_input_layout(gather_elements_test_params& p) { + return layout{ p.data_type, p.input_format, p.input_shape }; + } + + layout get_indices_layout(gather_elements_test_params& p) { + return layout{ p.data_type, p.indices_format, p.indices_shape }; + } + + layout get_output_layout(gather_elements_test_params& p) { + return layout{ p.data_type, p.output_format, p.output_shape }; + } + + layout get_per_channel_layout(gather_elements_test_params& p) { + return layout{ p.default_type, p.default_format, tensor{ 1, p.output_shape.feature[0], 1, 1 } }; + } +}; +} // namespace + +/* ----------------------------------------------------------------------------------------------------- */ +/* ------------------------------------------ GatherElements cases ------------------------------------- */ +/* ----------------------------------------------------------------------------------------------------- */ +#define CASE_GATHER_ELEMENTS_FP16_4D_1 data_types::f16, format::bfyx, { 3, 7, 9, 8 }, format::bfyx, { 3, 7, 9, 8 }, format::bfyx, { 3, 7, 9, 8 }, cldnn::gather_elements::gather_elements_axis::along_y, data_types::f16, format::bfyx +#define CASE_GATHER_ELEMENTS_FP16_4D_2 data_types::f16, format::bfyx, { 3, 2, 8, 3 }, format::bfyx, { 2, 2, 8, 3 }, format::bfyx, { 2, 2, 8, 3 }, cldnn::gather_elements::gather_elements_axis::along_b, data_types::f16, format::bfyx +#define CASE_GATHER_ELEMENTS_FP16_4D_3 data_types::f16, format::bfyx, { 1, 3, 2, 9 }, format::bfyx, { 1, 3, 5, 9 }, format::bfyx, { 1, 3, 5, 9 }, cldnn::gather_elements::gather_elements_axis::along_x, data_types::f16, format::bfyx + +#define CASE_GATHER_ELEMENTS_FP16_5D_1 data_types::f16, format::bfzyx, { 3, 2, 5, 2, 3 }, format::bfzyx, { 3, 2, 2, 2, 3 }, 
format::bfzyx, { 3, 2, 2, 2, 3 }, cldnn::gather_elements::gather_elements_axis::along_x, data_types::f16, format::bfzyx +#define CASE_GATHER_ELEMENTS_FP16_5D_2 data_types::f16, format::bfzyx, { 5, 4, 7, 4, 4 }, format::bfzyx, { 5, 4, 7, 4, 3 }, format::bfzyx, { 5, 4, 7, 4, 3 }, cldnn::gather_elements::gather_elements_axis::along_z, data_types::f16, format::bfzyx + +#define CASE_GATHER_ELEMENTS_FP16_6D_1 data_types::f16, format::bfwzyx, { 5, 4, 6, 7, 8, 2 }, format::bfwzyx, { 5, 2, 6, 7, 8, 2 }, format::bfwzyx, { 5, 2, 6, 7, 8, 2 }, cldnn::gather_elements::gather_elements_axis::along_f, data_types::f16, format::bfwzyx +#define CASE_GATHER_ELEMENTS_FP16_6D_2 data_types::f16, format::bfwzyx, { 2, 1, 2, 3, 2, 1 }, format::bfwzyx, { 2, 1, 2, 3, 2, 3 }, format::bfwzyx, { 2, 1, 2, 3, 2, 3 }, cldnn::gather_elements::gather_elements_axis::along_w, data_types::f16, format::bfwzyx +#define CASE_GATHER_ELEMENTS_FP16_6D_3 data_types::f16, format::bfwzyx, { 2, 2, 3, 4, 4, 2 }, format::bfwzyx, { 2, 2, 6, 4, 4, 2 }, format::bfwzyx, { 2, 2, 6, 4, 4, 2 }, cldnn::gather_elements::gather_elements_axis::along_x, data_types::f16, format::bfwzyx + + +#define CASE_GATHER_ELEMENTS_FP32_4D_1 data_types::f32, format::bfyx, { 3, 7, 9, 8 }, format::bfyx, { 3, 7, 9, 8 }, format::bfyx, { 3, 7, 9, 8 }, cldnn::gather_elements::gather_elements_axis::along_y, data_types::f32, format::bfyx +#define CASE_GATHER_ELEMENTS_FP32_4D_2 data_types::f32, format::bfyx, { 3, 2, 8, 3 }, format::bfyx, { 2, 2, 8, 3 }, format::bfyx, { 2, 2, 8, 3 }, cldnn::gather_elements::gather_elements_axis::along_b, data_types::f32, format::bfyx +#define CASE_GATHER_ELEMENTS_FP32_4D_3 data_types::f32, format::bfyx, { 1, 3, 2, 9 }, format::bfyx, { 1, 3, 5, 9 }, format::bfyx, { 1, 3, 5, 9 }, cldnn::gather_elements::gather_elements_axis::along_x, data_types::f32, format::bfyx + +#define CASE_GATHER_ELEMENTS_FP32_5D_1 data_types::f32, format::bfzyx, { 3, 2, 5, 2, 3 }, format::bfzyx, { 3, 2, 2, 2, 3 }, format::bfzyx, { 3, 2, 2, 2, 3 
}, cldnn::gather_elements::gather_elements_axis::along_x, data_types::f32, format::bfzyx +#define CASE_GATHER_ELEMENTS_FP32_5D_2 data_types::f32, format::bfzyx, { 5, 4, 7, 4, 4 }, format::bfzyx, { 5, 4, 7, 4, 3 }, format::bfzyx, { 5, 4, 7, 4, 3 }, cldnn::gather_elements::gather_elements_axis::along_z, data_types::f32, format::bfzyx + +#define CASE_GATHER_ELEMENTS_FP32_6D_1 data_types::f32, format::bfwzyx, { 5, 4, 6, 7, 8, 2 }, format::bfwzyx, { 5, 2, 6, 7, 8, 2 }, format::bfwzyx, { 5, 2, 6, 7, 8, 2 }, cldnn::gather_elements::gather_elements_axis::along_f, data_types::f32, format::bfwzyx +#define CASE_GATHER_ELEMENTS_FP32_6D_2 data_types::f32, format::bfwzyx, { 2, 1, 2, 3, 2, 1 }, format::bfwzyx, { 2, 1, 2, 3, 2, 3 }, format::bfwzyx, { 2, 1, 2, 3, 2, 3 }, cldnn::gather_elements::gather_elements_axis::along_w, data_types::f32, format::bfwzyx +#define CASE_GATHER_ELEMENTS_FP32_6D_3 data_types::f32, format::bfwzyx, { 2, 2, 3, 4, 4, 2 }, format::bfwzyx, { 2, 2, 6, 4, 4, 2 }, format::bfwzyx, { 2, 2, 6, 4, 4, 2 }, cldnn::gather_elements::gather_elements_axis::along_x, data_types::f32, format::bfwzyx + +class gather_elements_quantize : public GatherElementsPrimitiveFusingTest {}; +TEST_P(gather_elements_quantize, basic) { + auto p = GetParam(); + create_topologies( + input_layout("input", get_input_layout(p)), + data("gather_elements_indices", get_mem(get_indices_layout(p), 0, static_cast(get_axis_dim(p))-1)), + data("in_lo", get_mem(get_per_channel_layout(p), min_random, 0)), + data("in_hi", get_mem(get_per_channel_layout(p), 1, max_random)), + data("out_lo", get_mem(get_single_element_layout(p), -127)), + data("out_hi", get_mem(get_single_element_layout(p), 127)), + gather_elements("gather_elements_prim", "input", "gather_elements_indices", p.output_format, p.output_shape, p.axis), + quantize("quantize", "gather_elements_prim", "in_lo", "in_hi", "out_lo", "out_hi", 255, data_types::i8), + reorder("reorder_bfyx", "quantize", p.default_format, data_types::f32) + ); + + 
tolerance = 1.f; + execute(p); +} + +INSTANTIATE_TEST_SUITE_P(fusings_gpu, gather_elements_quantize, ::testing::ValuesIn(std::vector{ + gather_elements_test_params{ CASE_GATHER_ELEMENTS_FP16_4D_1, 2, 3 }, + gather_elements_test_params{ CASE_GATHER_ELEMENTS_FP16_4D_2, 2, 3 }, + gather_elements_test_params{ CASE_GATHER_ELEMENTS_FP16_4D_3, 2, 3 }, + + gather_elements_test_params{ CASE_GATHER_ELEMENTS_FP16_5D_1, 2, 3 }, + gather_elements_test_params{ CASE_GATHER_ELEMENTS_FP16_5D_2, 2, 3 }, + + gather_elements_test_params{ CASE_GATHER_ELEMENTS_FP16_6D_1, 2, 3 }, + gather_elements_test_params{ CASE_GATHER_ELEMENTS_FP16_6D_2, 2, 3 }, + gather_elements_test_params{ CASE_GATHER_ELEMENTS_FP16_6D_3, 2, 3 }, + + gather_elements_test_params{ CASE_GATHER_ELEMENTS_FP32_4D_1, 2, 3 }, + gather_elements_test_params{ CASE_GATHER_ELEMENTS_FP32_4D_2, 2, 3 }, + gather_elements_test_params{ CASE_GATHER_ELEMENTS_FP32_4D_3, 2, 3 }, + + gather_elements_test_params{ CASE_GATHER_ELEMENTS_FP32_5D_1, 2, 3 }, + gather_elements_test_params{ CASE_GATHER_ELEMENTS_FP32_5D_2, 2, 3 }, + + gather_elements_test_params{ CASE_GATHER_ELEMENTS_FP32_6D_1, 2, 3 }, + gather_elements_test_params{ CASE_GATHER_ELEMENTS_FP32_6D_2, 2, 3 }, + gather_elements_test_params{ CASE_GATHER_ELEMENTS_FP32_6D_3, 2, 3 }, +})); + + +class gather_elements_scale_activation : public GatherElementsPrimitiveFusingTest {}; +TEST_P(gather_elements_scale_activation, basic) { + auto p = GetParam(); + create_topologies( + input_layout("input", get_input_layout(p)), + data("gather_elements_indices", get_mem(get_indices_layout(p), 0, static_cast(get_axis_dim(p))-1)), + data("scale_data", get_mem(get_per_channel_layout(p), -10, 10)), + gather_elements("gather_elements_prim", "input", "gather_elements_indices", p.output_format, p.output_shape, p.axis), + activation("activation", "gather_elements_prim", activation_func::abs), + scale("scale", "activation", "scale_data"), + reorder("reorder_bfyx", "scale", p.default_format, data_types::f32) + 
); + + tolerance = 1e-5f; + execute(p); +} + +INSTANTIATE_TEST_SUITE_P(fusings_gpu, gather_elements_scale_activation, ::testing::ValuesIn(std::vector{ + gather_elements_test_params{ CASE_GATHER_ELEMENTS_FP16_4D_1, 2, 4 }, + gather_elements_test_params{ CASE_GATHER_ELEMENTS_FP16_4D_2, 2, 4 }, + gather_elements_test_params{ CASE_GATHER_ELEMENTS_FP16_4D_3, 2, 4 }, + + gather_elements_test_params{ CASE_GATHER_ELEMENTS_FP16_5D_1, 2, 4 }, + gather_elements_test_params{ CASE_GATHER_ELEMENTS_FP16_5D_2, 2, 4 }, + + gather_elements_test_params{ CASE_GATHER_ELEMENTS_FP16_6D_1, 2, 4 }, + gather_elements_test_params{ CASE_GATHER_ELEMENTS_FP16_6D_2, 2, 4 }, + gather_elements_test_params{ CASE_GATHER_ELEMENTS_FP16_6D_3, 2, 4 }, + + gather_elements_test_params{ CASE_GATHER_ELEMENTS_FP32_4D_1, 2, 4 }, + gather_elements_test_params{ CASE_GATHER_ELEMENTS_FP32_4D_2, 2, 4 }, + gather_elements_test_params{ CASE_GATHER_ELEMENTS_FP32_4D_3, 2, 4 }, + + gather_elements_test_params{ CASE_GATHER_ELEMENTS_FP32_5D_1, 2, 4 }, + gather_elements_test_params{ CASE_GATHER_ELEMENTS_FP32_5D_2, 2, 4 }, + + gather_elements_test_params{ CASE_GATHER_ELEMENTS_FP32_6D_1, 2, 4 }, + gather_elements_test_params{ CASE_GATHER_ELEMENTS_FP32_6D_2, 2, 4 }, + gather_elements_test_params{ CASE_GATHER_ELEMENTS_FP32_6D_3, 2, 4 }, +})); + + +class gather_elements_activation_scale_eltwise : public GatherElementsPrimitiveFusingTest {}; +TEST_P(gather_elements_activation_scale_eltwise, basic) { + auto p = GetParam(); + + create_topologies( + input_layout("input", get_input_layout(p)), + data("gather_elements_indices", get_mem(get_indices_layout(p), 0, static_cast(get_axis_dim(p))-1)), + data("scale_data", get_mem(get_per_channel_layout(p), 1.0f / 255)), + data("eltwise_data", get_mem(get_output_layout(p))), + gather_elements("gather_elements_prim", "input", "gather_elements_indices", p.output_format, p.output_shape, p.axis), + activation("activation", "gather_elements_prim", activation_func::abs), + scale("scale", 
"activation", "scale_data"), + eltwise("eltwise", { "scale", "eltwise_data" }, eltwise_mode::sum, p.data_type), + reorder("reorder_bfyx", "eltwise", p.default_format, data_types::f32) + ); + + tolerance = 1e-5f; + execute(p); +} + +INSTANTIATE_TEST_SUITE_P(fusings_gpu, gather_elements_activation_scale_eltwise, ::testing::ValuesIn(std::vector{ + gather_elements_test_params{ CASE_GATHER_ELEMENTS_FP16_4D_1, 2, 5 }, + gather_elements_test_params{ CASE_GATHER_ELEMENTS_FP16_4D_2, 2, 5 }, + gather_elements_test_params{ CASE_GATHER_ELEMENTS_FP16_4D_3, 2, 5 }, + + gather_elements_test_params{ CASE_GATHER_ELEMENTS_FP16_5D_1, 2, 5 }, + gather_elements_test_params{ CASE_GATHER_ELEMENTS_FP16_5D_2, 2, 5 }, + + gather_elements_test_params{ CASE_GATHER_ELEMENTS_FP16_6D_1, 2, 5 }, + gather_elements_test_params{ CASE_GATHER_ELEMENTS_FP16_6D_2, 2, 5 }, + gather_elements_test_params{ CASE_GATHER_ELEMENTS_FP16_6D_3, 2, 5 }, + + gather_elements_test_params{ CASE_GATHER_ELEMENTS_FP32_4D_1, 2, 5 }, + gather_elements_test_params{ CASE_GATHER_ELEMENTS_FP32_4D_2, 2, 5 }, + gather_elements_test_params{ CASE_GATHER_ELEMENTS_FP32_4D_3, 2, 5 }, + + gather_elements_test_params{ CASE_GATHER_ELEMENTS_FP32_5D_1, 2, 5 }, + gather_elements_test_params{ CASE_GATHER_ELEMENTS_FP32_5D_2, 2, 5 }, + + gather_elements_test_params{ CASE_GATHER_ELEMENTS_FP32_6D_1, 2, 5 }, + gather_elements_test_params{ CASE_GATHER_ELEMENTS_FP32_6D_2, 2, 5 }, + gather_elements_test_params{ CASE_GATHER_ELEMENTS_FP32_6D_3, 2, 5 }, +})); diff --git a/src/plugins/intel_gpu/tests/fusions/gather_fusion_test.cpp b/src/plugins/intel_gpu/tests/fusions/gather_fusion_test.cpp new file mode 100644 index 00000000000..44f52fa476b --- /dev/null +++ b/src/plugins/intel_gpu/tests/fusions/gather_fusion_test.cpp @@ -0,0 +1,193 @@ +// Copyright (C) 2018-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "test_utils.h" +#include "fusion_test_common.hpp" + +#include +#include +#include +#include +#include + +#include + 
+using namespace cldnn; +using namespace ::tests; + +namespace { +struct gather_test_params { + tensor dictionary_shape; + tensor indices_shape; + tensor out_shape; + format out_format; + cldnn::gather::gather_axis axis; + data_types data_type; + format input_format; + data_types default_type; + format default_format; + size_t expected_fused_primitives; + size_t expected_not_fused_primitives; +}; + +class GatherPrimitiveFusingTest : public ::BaseFusingTest { +public: + void execute(gather_test_params& p) { + auto input_prim = get_mem(get_input_layout(p)); + network network_not_fused(this->engine, this->topology_non_fused, bo_not_fused); + network network_fused(this->engine, this->topology_fused, bo_fused); + network_fused.set_input_data("input", input_prim); + network_not_fused.set_input_data("input", input_prim); + + compare(network_not_fused, network_fused, p); + } + + layout get_input_layout(gather_test_params& p) { + return layout{ p.data_type, p.input_format, p.dictionary_shape }; + } + + layout get_indices_layout(gather_test_params& p) { + return layout{ p.data_type, format::bfyx, p.indices_shape }; + } + + size_t get_axis_dim(gather_test_params& p) { + switch (p.axis) { + case cldnn::gather::gather_axis::along_x: + return p.dictionary_shape.spatial[0]; + case cldnn::gather::gather_axis::along_y: + return p.dictionary_shape.spatial[1]; + case cldnn::gather::gather_axis::along_z: + return p.dictionary_shape.spatial[2]; + case cldnn::gather::gather_axis::along_w: + return p.dictionary_shape.spatial[3]; + case cldnn::gather::gather_axis::along_f: + return p.dictionary_shape.feature[0]; + case cldnn::gather::gather_axis::along_b: + return p.dictionary_shape.batch[0]; + default: + return 1; + } + } + + layout get_per_channel_layout(gather_test_params& p) { + return layout{ p.default_type, p.default_format, tensor{ 1, p.out_shape.feature[0], 1, 1 } }; + } +}; +} // namespace + +/* 
----------------------------------------------------------------------------------------------------- */ +/* ------------------------------------------ Gather cases --------------------------------------------- */ +/* ----------------------------------------------------------------------------------------------------- */ + +#define CASE_GATHER_FP32_1 { 2, 3, 1, 4 }, { 4, 1, 1, 1 }, { 4, 3, 1, 4 }, format::bfyx, cldnn::gather::gather_axis::along_b, data_types::f32, format::bfyx, data_types::f32, format::bfyx +#define CASE_GATHER_FP32_2 { 3, 2, 1, 2 }, { 2, 3, 1, 1 }, { 2, 3, 2, 2 }, format::bfyx, cldnn::gather::gather_axis::along_b, data_types::f32, format::bfyx, data_types::f32, format::bfyx +#define CASE_GATHER_FP32_3 { 3, 1, 1, 2 }, { 2, 1, 1, 1 }, { 3, 2, 1, 2 }, format::bfyx, cldnn::gather::gather_axis::along_f, data_types::f32, format::bfyx, data_types::f32, format::bfyx +#define CASE_GATHER_FP32_4 { 5, 3, 2, 2 }, { 3, 1, 1, 1 }, { 5, 2, 2, 3 }, format::bfyx, cldnn::gather::gather_axis::along_y, data_types::f32, format::bfyx, data_types::f32, format::bfyx +#define CASE_GATHER_FP32_5 { 2, 3, 1, 2 }, { 1, 3, 1, 1 }, { 2, 3, 3, 1 }, format::bfyx, cldnn::gather::gather_axis::along_y, data_types::f32, format::bfyx, data_types::f32, format::bfyx + +#define CASE_GATHER_FP16_1 { 2, 3, 1, 4 }, { 4, 1, 1, 1 }, { 4, 3, 1, 4 }, format::bfyx, cldnn::gather::gather_axis::along_b, data_types::f16, format::bfyx, data_types::f16, format::bfyx +#define CASE_GATHER_FP16_2 { 3, 2, 1, 2 }, { 2, 3, 1, 1 }, { 2, 3, 2, 2 }, format::bfyx, cldnn::gather::gather_axis::along_b, data_types::f16, format::bfyx, data_types::f16, format::bfyx +#define CASE_GATHER_FP16_3 { 3, 1, 1, 2 }, { 2, 1, 1, 1 }, { 3, 2, 1, 2 }, format::bfyx, cldnn::gather::gather_axis::along_f, data_types::f16, format::bfyx, data_types::f16, format::bfyx +#define CASE_GATHER_FP16_4 { 5, 3, 2, 2 }, { 3, 1, 1, 1 }, { 5, 2, 2, 3 }, format::bfyx, cldnn::gather::gather_axis::along_y, data_types::f16, format::bfyx, 
data_types::f16, format::bfyx +#define CASE_GATHER_FP16_5 { 2, 3, 1, 2 }, { 1, 3, 1, 1 }, { 2, 3, 3, 1 }, format::bfyx, cldnn::gather::gather_axis::along_y, data_types::f16, format::bfyx, data_types::f16, format::bfyx + +#define CASE_GATHER_5D_FP32_1 { 2, 3, 1, 4, 1 }, { 4, 1, 1, 1 }, { 4, 3, 1, 4, 1 }, format::bfzyx, cldnn::gather::gather_axis::along_b, data_types::f32, format::bfzyx, data_types::f32, format::bfzyx +#define CASE_GATHER_5D_FP32_2 { 2, 3, 2, 2, 2 }, { 2, 1, 1, 1 }, { 2, 2, 2, 2, 2 }, format::bfzyx, cldnn::gather::gather_axis::along_f, data_types::f32, format::bfzyx, data_types::f32, format::bfzyx +#define CASE_GATHER_5D_FP32_3 { 5, 3, 2, 2, 2 }, { 3, 1, 1, 1 }, { 5, 3, 2, 3, 2 }, format::bfzyx, cldnn::gather::gather_axis::along_y, data_types::f32, format::bfzyx, data_types::f32, format::bfzyx +#define CASE_GATHER_5D_FP32_4 { 2, 3, 1, 4, 4 }, { 2, 1, 1, 1 }, { 2, 3, 1, 4, 2 }, format::bfzyx, cldnn::gather::gather_axis::along_z, data_types::f32, format::bfzyx, data_types::f32, format::bfzyx +#define CASE_GATHER_5D_FP32_5 { 3, 1, 5, 2, 1 }, { 2, 1, 1, 1 }, { 3, 1, 2, 2, 1 }, format::bfzyx, cldnn::gather::gather_axis::along_x, data_types::f32, format::bfzyx, data_types::f32, format::bfzyx + +#define CASE_GATHER_5D_FP16_1 { 3, 2, 1, 2, 1 }, { 2, 1, 1, 1 }, { 2, 2, 2, 2, 1 }, format::bfzyx, cldnn::gather::gather_axis::along_b, data_types::f16, format::bfzyx, data_types::f16, format::bfzyx +#define CASE_GATHER_5D_FP16_2 { 1, 3, 1, 2, 1 }, { 2, 1, 1, 1 }, { 1, 2, 1, 2, 1 }, format::bfzyx, cldnn::gather::gather_axis::along_f, data_types::f16, format::bfzyx, data_types::f16, format::bfzyx +#define CASE_GATHER_5D_FP16_3 { 2, 3, 1, 3, 3 }, { 1, 2, 1, 1 }, { 2, 3, 1, 2, 3 }, format::bfzyx, cldnn::gather::gather_axis::along_y, data_types::f16, format::bfzyx, data_types::f16, format::bfzyx +#define CASE_GATHER_5D_FP16_4 { 3, 2, 2, 2, 2 }, { 2, 1, 1, 1 }, { 3, 2, 2, 2, 2 }, format::bfzyx, cldnn::gather::gather_axis::along_z, data_types::f16, format::bfzyx, 
data_types::f16, format::bfzyx +#define CASE_GATHER_5D_FP16_5 { 1, 1, 2, 1, 1 }, { 3, 1, 1, 1 }, { 1, 1, 3, 1, 1 }, format::bfzyx, cldnn::gather::gather_axis::along_x, data_types::f16, format::bfzyx, data_types::f16, format::bfzyx + +class gather_quantize : public GatherPrimitiveFusingTest {}; +TEST_P(gather_quantize, basic) { + auto p = GetParam(); + create_topologies( + input_layout("input", get_input_layout(p)), + data("gather_indices", get_mem(get_indices_layout(p), 0, static_cast(get_axis_dim(p)))), + data("in_lo", get_mem(get_per_channel_layout(p), min_random, 0)), + data("in_hi", get_mem(get_per_channel_layout(p), 1, max_random)), + data("out_lo", get_mem(get_single_element_layout(p), -127)), + data("out_hi", get_mem(get_single_element_layout(p), 127)), + gather("gather_prim", "input", "gather_indices", p.axis, p.out_format, p.out_shape), + quantize("quantize", "gather_prim", "in_lo", "in_hi", "out_lo", "out_hi", 255, data_types::i8), + reorder("reorder_bfyx", "quantize", p.default_format, data_types::f32) + ); + + tolerance = 1.f; + execute(p); +} + +INSTANTIATE_TEST_SUITE_P(fusings_gpu, gather_quantize, ::testing::ValuesIn(std::vector{ + gather_test_params{ CASE_GATHER_FP32_1, 2, 3 }, + gather_test_params{ CASE_GATHER_FP32_2, 2, 3 }, + gather_test_params{ CASE_GATHER_FP32_3, 2, 3 }, + gather_test_params{ CASE_GATHER_FP32_4, 2, 3 }, + gather_test_params{ CASE_GATHER_FP32_5, 2, 3 }, + + gather_test_params{ CASE_GATHER_FP16_1, 2, 3 }, + gather_test_params{ CASE_GATHER_FP16_2, 2, 3 }, + gather_test_params{ CASE_GATHER_FP16_3, 2, 3 }, + gather_test_params{ CASE_GATHER_FP16_4, 2, 3 }, + gather_test_params{ CASE_GATHER_FP16_5, 2, 3 }, + + gather_test_params{ CASE_GATHER_5D_FP32_1, 2, 3 }, + gather_test_params{ CASE_GATHER_5D_FP32_2, 2, 3 }, + gather_test_params{ CASE_GATHER_5D_FP32_3, 2, 3 }, + gather_test_params{ CASE_GATHER_5D_FP32_4, 2, 3 }, + gather_test_params{ CASE_GATHER_5D_FP32_5, 2, 3 }, + + gather_test_params{ CASE_GATHER_5D_FP16_1, 2, 3 }, + 
gather_test_params{ CASE_GATHER_5D_FP16_2, 2, 3 }, + gather_test_params{ CASE_GATHER_5D_FP16_3, 2, 3 }, + gather_test_params{ CASE_GATHER_5D_FP16_4, 2, 3 }, + gather_test_params{ CASE_GATHER_5D_FP16_5, 2, 3 }, +})); + +class gather_scale_activation : public GatherPrimitiveFusingTest {}; +TEST_P(gather_scale_activation, basic) { + auto p = GetParam(); + create_topologies( + input_layout("input", get_input_layout(p)), + data("gather_indices", get_mem(get_indices_layout(p), 0, static_cast(get_axis_dim(p)))), + data("scale_data", get_mem(get_per_channel_layout(p), -10, 10)), + gather("gather_prim", "input", "gather_indices", p.axis, p.out_format, p.out_shape), + activation("activation", "gather_prim", activation_func::abs), + scale("scale", "activation", "scale_data"), + reorder("reorder_bfyx", "scale", p.default_format, data_types::f32) + ); + + tolerance = 1e-5f; + execute(p); +} + +INSTANTIATE_TEST_SUITE_P(fusings_gpu, gather_scale_activation, ::testing::ValuesIn(std::vector{ + gather_test_params{ CASE_GATHER_FP32_1, 2, 4 }, + gather_test_params{ CASE_GATHER_FP32_2, 2, 4 }, + gather_test_params{ CASE_GATHER_FP32_3, 2, 4 }, + gather_test_params{ CASE_GATHER_FP32_4, 2, 4 }, + gather_test_params{ CASE_GATHER_FP32_5, 2, 4 }, + + gather_test_params{ CASE_GATHER_FP16_1, 2, 4 }, + gather_test_params{ CASE_GATHER_FP16_2, 2, 4 }, + gather_test_params{ CASE_GATHER_FP16_3, 2, 4 }, + gather_test_params{ CASE_GATHER_FP16_4, 2, 4 }, + gather_test_params{ CASE_GATHER_FP16_5, 2, 4 }, + + gather_test_params{ CASE_GATHER_5D_FP32_1, 2, 4 }, + gather_test_params{ CASE_GATHER_5D_FP32_2, 2, 4 }, + gather_test_params{ CASE_GATHER_5D_FP32_3, 2, 4 }, + gather_test_params{ CASE_GATHER_5D_FP32_4, 2, 4 }, + gather_test_params{ CASE_GATHER_5D_FP32_5, 2, 4 }, + + gather_test_params{ CASE_GATHER_5D_FP16_1, 2, 4 }, + gather_test_params{ CASE_GATHER_5D_FP16_2, 2, 4 }, + gather_test_params{ CASE_GATHER_5D_FP16_3, 2, 4 }, + gather_test_params{ CASE_GATHER_5D_FP16_4, 2, 4 }, + gather_test_params{ 
CASE_GATHER_5D_FP16_5, 2, 4 }, +})); diff --git a/src/plugins/intel_gpu/tests/fusions/gather_nd_fusion_test.cpp b/src/plugins/intel_gpu/tests/fusions/gather_nd_fusion_test.cpp new file mode 100644 index 00000000000..b97e16cabb4 --- /dev/null +++ b/src/plugins/intel_gpu/tests/fusions/gather_nd_fusion_test.cpp @@ -0,0 +1,231 @@ +// Copyright (C) 2018-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "test_utils.h" +#include "fusion_test_common.hpp" + +#include +#include +#include +#include +#include + +#include + +using namespace cldnn; +using namespace ::tests; + +namespace { +struct gather_nd_test_params { + data_types data_type; + + format input_format; + tensor input_shape; + + format indices_format; + tensor indices_shape; + + format output_format; + tensor output_shape; + + int max_number_in_indices; + int indices_rank; + int batch_dims; + + data_types default_type; + format default_format; + + size_t expected_fused_primitives; + size_t expected_not_fused_primitives; +}; + +class GatherNDPrimitiveFusingTest : public ::BaseFusingTest { +public: + void execute(gather_nd_test_params& p) { + auto input_prim = get_mem(get_input_layout(p)); + network network_not_fused(this->engine, this->topology_non_fused, bo_not_fused); + network network_fused(this->engine, this->topology_fused, bo_fused); + network_fused.set_input_data("input", input_prim); + network_not_fused.set_input_data("input", input_prim); + compare(network_not_fused, network_fused, p); + } + + layout get_input_layout(gather_nd_test_params& p) { + return layout{ p.data_type, p.input_format, p.input_shape }; + } + + layout get_indices_layout(gather_nd_test_params& p) { + return layout{ p.data_type, p.indices_format, p.indices_shape }; + } + + layout get_output_layout(gather_nd_test_params& p) { + return layout{ p.data_type, p.output_format, p.output_shape }; + } + + layout get_per_channel_layout(gather_nd_test_params& p) { + return layout{ p.default_type, p.default_format, 
tensor{ 1, p.output_shape.feature[0], 1, 1 } }; + } +}; +} // namespace + +/* ----------------------------------------------------------------------------------------------------- */ +/* ------------------------------------------ GatherND cases ------------------------------------------- */ +/* ----------------------------------------------------------------------------------------------------- */ +#define CASE_GATHER_ND_FP16_4D_1 data_types::f16, format::bfyx, { 6, 7, 9, 8 }, format::bfyx, { 3, 1, 1, 1 }, format::bfyx, { 3, 7, 9, 8 }, 6, 2, 0, data_types::f16, format::bfyx +#define CASE_GATHER_ND_FP16_4D_2 data_types::f16, format::bfyx, { 6, 7, 9, 8 }, format::bfyx, { 6, 1, 1, 1 }, format::bfyx, { 6, 8, 1, 9 }, 6, 2, 1, data_types::f16, format::bfyx +#define CASE_GATHER_ND_FP16_4D_3 data_types::f16, format::bfyx, { 5, 4, 7, 2 }, format::bfyx, { 5, 4, 1, 2 }, format::bfyx, { 40, 1, 1, 1 }, 6, 4, 3, data_types::f16, format::bfyx + +#define CASE_GATHER_ND_FP16_5D_1 data_types::f16, format::bfzyx, { 5, 6, 7, 8, 5 }, format::bfyx, { 5, 1, 1, 1 }, format::bfzyx, { 5, 6, 7, 8, 5 }, 5, 2, 0, data_types::f16, format::bfyx +#define CASE_GATHER_ND_FP16_5D_2 data_types::f16, format::bfzyx, { 5, 6, 7, 8, 5 }, format::bfyx, { 5, 1, 1, 1 }, format::bfyx, { 5, 5, 7, 8 }, 5, 2, 1, data_types::f16, format::bfyx +#define CASE_GATHER_ND_FP16_5D_3 data_types::f16, format::bfzyx, { 5, 4, 7, 8, 5 }, format::bfyx, { 5, 4, 1, 3 }, format::bfyx, { 20, 1, 1, 1 }, 4, 3, 2, data_types::f16, format::bfyx +#define CASE_GATHER_ND_FP16_5D_4 data_types::f16, format::bfzyx, { 5, 4, 7, 8, 3 }, format::bfyx, { 5, 4, 1, 3 }, format::bfyx, { 60, 7, 1, 1 }, 4, 4, 3, data_types::f16, format::bfyx +#define CASE_GATHER_ND_FP16_5D_5 data_types::f16, format::bfzyx, { 5, 4, 7, 2, 3 }, format::bfzyx, { 5, 4, 1, 2, 3 }, format::bfyx, { 120, 1, 1, 1 }, 4, 5, 4, data_types::f16, format::bfyx +#define CASE_GATHER_ND_FP16_5D_6 data_types::f16, format::bfzyx, { 5, 4, 7, 4, 4 }, format::bfzyx, { 5, 4, 1, 1, 3 }, 
format::bfzyx, { 20, 3, 7, 4, 1 }, 4, 5, 2, data_types::f16, format::bfyx + +#define CASE_GATHER_ND_FP16_6D_1 data_types::f16, format::bfwzyx, { 5, 4, 6, 7, 8, 5 }, format::bfyx, { 5, 4, 2, 2 }, format::bfyx, { 20, 2, 6, 7 }, 5, 4, 2, data_types::f16, format::bfyx +#define CASE_GATHER_ND_FP16_6D_2 data_types::f16, format::bfwzyx, { 5, 4, 6, 7, 8, 2 }, format::bfyx, { 5, 4, 2, 2 }, format::bfyx, { 40, 6, 1, 1 }, 5, 4, 3, data_types::f16, format::bfyx +#define CASE_GATHER_ND_FP16_6D_3 data_types::f16, format::bfwzyx, { 5, 4, 6, 7, 2, 2 }, format::bfzyx, { 5, 4, 1, 2, 2 }, format::bfyx, { 80, 6, 1, 1 }, 5, 5, 4, data_types::f16, format::bfyx +#define CASE_GATHER_ND_FP16_6D_4 data_types::f16, format::bfwzyx, { 5, 4, 6, 3, 2, 2 }, format::bfwzyx, { 5, 4, 1, 3, 2, 2 }, format::bfyx, { 240, 1, 1, 1 }, 5, 6, 5, data_types::f16, format::bfyx + +#define CASE_GATHER_ND_FP32_4D_1 data_types::f32, format::bfyx, { 6, 7, 9, 8 }, format::bfyx, { 3, 1, 1, 1 }, format::bfyx, { 3, 7, 9, 8 }, 6, 2, 0, data_types::f32, format::bfyx +#define CASE_GATHER_ND_FP32_4D_2 data_types::f32, format::bfyx, { 6, 7, 9, 8 }, format::bfyx, { 6, 1, 1, 1 }, format::bfyx, { 6, 8, 1, 9 }, 6, 2, 1, data_types::f32, format::bfyx +#define CASE_GATHER_ND_FP32_4D_3 data_types::f32, format::bfyx, { 5, 4, 7, 2 }, format::bfyx, { 5, 4, 1, 2 }, format::bfyx, { 40, 1, 1, 1 }, 6, 4, 3, data_types::f32, format::bfyx + +#define CASE_GATHER_ND_FP32_5D_1 data_types::f32, format::bfzyx, { 5, 6, 7, 8, 5 }, format::bfyx, { 5, 1, 1, 1 }, format::bfzyx, { 5, 6, 7, 8, 5 }, 5, 2, 0, data_types::f32, format::bfyx +#define CASE_GATHER_ND_FP32_5D_2 data_types::f32, format::bfzyx, { 5, 6, 7, 8, 5 }, format::bfyx, { 5, 1, 1, 1 }, format::bfyx, { 5, 5, 7, 8 }, 5, 2, 1, data_types::f32, format::bfyx +#define CASE_GATHER_ND_FP32_5D_3 data_types::f32, format::bfzyx, { 5, 4, 7, 8, 5 }, format::bfyx, { 5, 4, 1, 3 }, format::bfyx, { 20, 1, 1, 1 }, 4, 3, 2, data_types::f32, format::bfyx +#define CASE_GATHER_ND_FP32_5D_4 data_types::f32, 
format::bfzyx, { 5, 4, 7, 8, 3 }, format::bfyx, { 5, 4, 1, 3 }, format::bfyx, { 60, 7, 1, 1 }, 4, 4, 3, data_types::f32, format::bfyx +#define CASE_GATHER_ND_FP32_5D_5 data_types::f32, format::bfzyx, { 5, 4, 7, 2, 3 }, format::bfzyx, { 5, 4, 1, 2, 3 }, format::bfyx, { 120, 1, 1, 1 }, 4, 5, 4, data_types::f32, format::bfyx +#define CASE_GATHER_ND_FP32_5D_6 data_types::f32, format::bfzyx, { 5, 4, 7, 4, 4 }, format::bfzyx, { 5, 4, 1, 1, 3 }, format::bfzyx, { 20, 3, 7, 4, 1 }, 4, 5, 2, data_types::f32, format::bfyx + +#define CASE_GATHER_ND_FP32_6D_1 data_types::f32, format::bfwzyx, { 5, 4, 6, 7, 8, 5 }, format::bfyx, { 5, 4, 2, 2 }, format::bfyx, { 20, 2, 6, 7 }, 5, 4, 2, data_types::f32, format::bfyx +#define CASE_GATHER_ND_FP32_6D_2 data_types::f32, format::bfwzyx, { 5, 4, 6, 7, 8, 2 }, format::bfyx, { 5, 4, 2, 2 }, format::bfyx, { 40, 6, 1, 1 }, 5, 4, 3, data_types::f32, format::bfyx +#define CASE_GATHER_ND_FP32_6D_3 data_types::f32, format::bfwzyx, { 5, 4, 6, 7, 2, 2 }, format::bfzyx, { 5, 4, 1, 2, 2 }, format::bfyx, { 80, 6, 1, 1 }, 5, 5, 4, data_types::f32, format::bfyx +#define CASE_GATHER_ND_FP32_6D_4 data_types::f32, format::bfwzyx, { 5, 4, 6, 3, 2, 2 }, format::bfwzyx, { 5, 4, 1, 3, 2, 2 }, format::bfyx, { 240, 1, 1, 1 }, 5, 6, 5, data_types::f32, format::bfyx + +class gather_nd_quantize : public GatherNDPrimitiveFusingTest {}; +TEST_P(gather_nd_quantize, basic) { + auto p = GetParam(); + + auto input_rank = 0; + if (p.input_format == format::bfyx) { + input_rank = 4; + } else if (p.input_format == format::bfzyx) { + input_rank = 5; + } else if (p.input_format == format::bfwzyx) { + input_rank = 6; + } + + create_topologies( + input_layout("input", get_input_layout(p)), + data("gather_nd_indices", get_mem(get_indices_layout(p), 0, p.max_number_in_indices - 1)), + data("in_lo", get_mem(get_per_channel_layout(p), min_random, 0)), + data("in_hi", get_mem(get_per_channel_layout(p), 1, max_random)), + data("out_lo", get_mem(get_single_element_layout(p), -127)), + 
data("out_hi", get_mem(get_single_element_layout(p), 127)), + gather_nd("gather_nd_prim", "input", "gather_nd_indices", input_rank, p.indices_rank, p.batch_dims), + quantize("quantize", "gather_nd_prim", "in_lo", "in_hi", "out_lo", "out_hi", 255, data_types::i8), + reorder("reorder_bfyx", "quantize", p.default_format, data_types::f32) + ); + + tolerance = 1.f; + execute(p); +} + +INSTANTIATE_TEST_SUITE_P(fusings_gpu, gather_nd_quantize, ::testing::ValuesIn(std::vector{ + gather_nd_test_params{ CASE_GATHER_ND_FP16_4D_1, 2, 3 }, + gather_nd_test_params{ CASE_GATHER_ND_FP16_4D_2, 2, 3 }, + gather_nd_test_params{ CASE_GATHER_ND_FP16_4D_3, 2, 3 }, + + gather_nd_test_params{ CASE_GATHER_ND_FP16_5D_1, 2, 3 }, + gather_nd_test_params{ CASE_GATHER_ND_FP16_5D_2, 2, 3 }, + gather_nd_test_params{ CASE_GATHER_ND_FP16_5D_3, 2, 3 }, + gather_nd_test_params{ CASE_GATHER_ND_FP16_5D_4, 2, 3 }, + gather_nd_test_params{ CASE_GATHER_ND_FP16_5D_5, 2, 3 }, + gather_nd_test_params{ CASE_GATHER_ND_FP16_5D_6, 2, 3 }, + + gather_nd_test_params{ CASE_GATHER_ND_FP16_6D_1, 2, 3 }, + gather_nd_test_params{ CASE_GATHER_ND_FP16_6D_2, 2, 3 }, + gather_nd_test_params{ CASE_GATHER_ND_FP16_6D_3, 2, 3 }, + gather_nd_test_params{ CASE_GATHER_ND_FP16_6D_4, 2, 3 }, + + gather_nd_test_params{ CASE_GATHER_ND_FP32_4D_1, 2, 3 }, + gather_nd_test_params{ CASE_GATHER_ND_FP32_4D_2, 2, 3 }, + gather_nd_test_params{ CASE_GATHER_ND_FP32_4D_3, 2, 3 }, + + gather_nd_test_params{ CASE_GATHER_ND_FP32_5D_1, 2, 3 }, + gather_nd_test_params{ CASE_GATHER_ND_FP32_5D_2, 2, 3 }, + gather_nd_test_params{ CASE_GATHER_ND_FP32_5D_3, 2, 3 }, + gather_nd_test_params{ CASE_GATHER_ND_FP32_5D_4, 2, 3 }, + gather_nd_test_params{ CASE_GATHER_ND_FP32_5D_5, 2, 3 }, + gather_nd_test_params{ CASE_GATHER_ND_FP32_5D_6, 2, 3 }, + + gather_nd_test_params{ CASE_GATHER_ND_FP32_6D_1, 2, 3 }, + gather_nd_test_params{ CASE_GATHER_ND_FP32_6D_2, 2, 3 }, + gather_nd_test_params{ CASE_GATHER_ND_FP32_6D_3, 2, 3 }, + gather_nd_test_params{ 
CASE_GATHER_ND_FP32_6D_4, 2, 3 }, +})); + +class gather_nd_activation_scale_eltwise : public GatherNDPrimitiveFusingTest {}; +TEST_P(gather_nd_activation_scale_eltwise, basic) { + auto p = GetParam(); + + auto input_rank = 0; + if (p.input_format == format::bfyx) { + input_rank = 4; + } else if (p.input_format == format::bfzyx) { + input_rank = 5; + } else if (p.input_format == format::bfwzyx) { + input_rank = 6; + } + + create_topologies( + input_layout("input", get_input_layout(p)), + data("gather_nd_indices", get_mem(get_indices_layout(p), 0, p.max_number_in_indices - 1)), + data("scale_data", get_mem(get_per_channel_layout(p), 1.0f / 255)), + data("eltwise_data", get_mem(get_output_layout(p))), + gather_nd("gather_nd_prim", "input", "gather_nd_indices", input_rank, p.indices_rank, p.batch_dims), + activation("activation", "gather_nd_prim", activation_func::abs), + scale("scale", "activation", "scale_data"), + eltwise("eltwise", { "scale", "eltwise_data" }, eltwise_mode::sum, p.data_type), + reorder("reorder_bfyx", "eltwise", p.default_format, data_types::f32) + ); + + tolerance = 1e-5f; + execute(p); +} + +INSTANTIATE_TEST_SUITE_P(fusings_gpu, gather_nd_activation_scale_eltwise, ::testing::ValuesIn(std::vector{ + gather_nd_test_params{ CASE_GATHER_ND_FP16_4D_1, 2, 5 }, + gather_nd_test_params{ CASE_GATHER_ND_FP16_4D_2, 2, 5 }, + gather_nd_test_params{ CASE_GATHER_ND_FP16_4D_3, 2, 5 }, + + gather_nd_test_params{ CASE_GATHER_ND_FP16_5D_1, 2, 5 }, + gather_nd_test_params{ CASE_GATHER_ND_FP16_5D_2, 2, 5 }, + gather_nd_test_params{ CASE_GATHER_ND_FP16_5D_3, 2, 5 }, + gather_nd_test_params{ CASE_GATHER_ND_FP16_5D_4, 2, 5 }, + gather_nd_test_params{ CASE_GATHER_ND_FP16_5D_5, 2, 5 }, + gather_nd_test_params{ CASE_GATHER_ND_FP16_5D_6, 2, 5 }, + + gather_nd_test_params{ CASE_GATHER_ND_FP16_6D_1, 2, 5 }, + gather_nd_test_params{ CASE_GATHER_ND_FP16_6D_2, 2, 5 }, + gather_nd_test_params{ CASE_GATHER_ND_FP16_6D_3, 2, 5 }, + gather_nd_test_params{ CASE_GATHER_ND_FP16_6D_4, 
2, 5 }, + + gather_nd_test_params{ CASE_GATHER_ND_FP32_4D_1, 2, 5 }, + gather_nd_test_params{ CASE_GATHER_ND_FP32_4D_2, 2, 5 }, + gather_nd_test_params{ CASE_GATHER_ND_FP32_4D_3, 2, 5 }, + + gather_nd_test_params{ CASE_GATHER_ND_FP32_5D_1, 2, 5 }, + gather_nd_test_params{ CASE_GATHER_ND_FP32_5D_2, 2, 5 }, + gather_nd_test_params{ CASE_GATHER_ND_FP32_5D_3, 2, 5 }, + gather_nd_test_params{ CASE_GATHER_ND_FP32_5D_4, 2, 5 }, + gather_nd_test_params{ CASE_GATHER_ND_FP32_5D_5, 2, 5 }, + gather_nd_test_params{ CASE_GATHER_ND_FP32_5D_6, 2, 5 }, + + gather_nd_test_params{ CASE_GATHER_ND_FP32_6D_1, 2, 5 }, + gather_nd_test_params{ CASE_GATHER_ND_FP32_6D_2, 2, 5 }, + gather_nd_test_params{ CASE_GATHER_ND_FP32_6D_3, 2, 5 }, + gather_nd_test_params{ CASE_GATHER_ND_FP32_6D_4, 2, 5 }, +})); diff --git a/src/plugins/intel_gpu/tests/fusions/gemm_fusion_test.cpp b/src/plugins/intel_gpu/tests/fusions/gemm_fusion_test.cpp new file mode 100644 index 00000000000..1488eab1542 --- /dev/null +++ b/src/plugins/intel_gpu/tests/fusions/gemm_fusion_test.cpp @@ -0,0 +1,370 @@ +// Copyright (C) 2018-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "test_utils.h" +#include "fusion_test_common.hpp" + +#include +#include +#include +#include +#include + +#include + +using namespace cldnn; +using namespace ::tests; + +namespace { +struct gemm_test_params { + std::vector in_shapes; + tensor out_shape; + tensor kernel; + tensor pad; + data_types data_type_in0; + data_types data_type_in1; + data_types data_type_in2; + format input_format; + data_types default_type; + format default_format; + size_t expected_fused_primitives; + size_t expected_not_fused_primitives; +}; + +class GemmFusingTest : public ::BaseFusingTest { +public: + + void execute(gemm_test_params& p) { + auto input0_prim = get_mem(get_input_layout(p, 0)); + auto input1_prim = get_mem(get_input_layout(p, 1)); + + network network_not_fused(this->engine, this->topology_non_fused, bo_not_fused); + network 
network_fused(this->engine, this->topology_fused, bo_fused); + network_fused.set_input_data("input0", input0_prim); + network_not_fused.set_input_data("input0", input0_prim); + network_fused.set_input_data("input1", input1_prim); + network_not_fused.set_input_data("input1", input1_prim); + if (p.in_shapes.size() > 2) { + auto input2_prim = get_mem(get_input_layout(p, 2)); + network_fused.set_input_data("input2", input2_prim); + network_not_fused.set_input_data("input2", input2_prim); + } + + compare(network_not_fused, network_fused, p); + } + + layout get_input_layout(gemm_test_params& p, int in_no) { + auto pad = p.pad; + std::vector pad_ = { 0, 0, pad.spatial[0], pad.spatial[1] }; + if (in_no == 0) + return layout{ p.data_type_in0, p.input_format, p.in_shapes.at(0), padding{ pad_ } }; + else if (in_no == 1) + return layout{ p.data_type_in1, p.input_format, p.in_shapes.at(1), padding{ pad_ } }; + else + return layout{ p.data_type_in2, p.input_format, p.in_shapes.at(2), padding{ pad_ } }; + } + + layout get_per_channel_layout(gemm_test_params& p) { + return layout{ p.default_type, p.default_format, tensor{ 1, p.in_shapes.at(0).feature[0], 1, 1 } }; + } + + layout get_output_layout(gemm_test_params& p) { + return layout{ p.default_type, p.input_format, p.out_shape }; + } +}; + +} // namespace + +/* ----------------------------------------------------------------------------------------------------- */ +/* ---------------------------------------- Gemm cases ------------------------------------------------- */ +/* ----------------------------------------------------------------------------------------------------- */ + +#define CASE_GEMM_3IN_FP32_1 { { 1, 1, 2, 2 }, { 1, 1, 2, 2 }, { 1, 1, 2, 2 } }, { 1, 1, 2, 2 }, tensor{ 1 }, tensor{ 0 }, data_types::f32, data_types::f32, data_types::f32, format::bfyx, data_types::f32, format::bfyx +#define CASE_GEMM_3IN_FP32_2 { { 1, 1, 63, 63 }, { 1, 1, 63, 63 }, { 1, 1, 63, 63 } }, { 1, 1, 63, 63 }, tensor{ 1 }, tensor{ 0 }, 
data_types::f32, data_types::f32, data_types::f32, format::bfyx, data_types::f32, format::bfyx +#define CASE_GEMM_3IN_FP32_3 { { 1, 1, 128, 128 }, { 1, 1, 128, 128 }, { 1, 1, 128, 128 } }, { 1, 1, 128, 128 }, tensor{ 1 }, tensor{ 0 }, data_types::f32, data_types::f32, data_types::f32, format::bfyx, data_types::f32, format::bfyx +#define CASE_GEMM_3IN_FP32_4 { { 1, 2, 64, 128 }, { 1, 2, 256, 64 }, { 1, 2, 256, 128 } }, { 1, 2, 256, 128 }, tensor{ 1 }, tensor{ 0 }, data_types::f32, data_types::f32, data_types::f32, format::bfyx, data_types::f32, format::bfyx +#define CASE_GEMM_3IN_FP16_1 { { 1, 1, 2, 2 }, { 1, 1, 2, 2 }, { 1, 1, 2, 2 } }, { 1, 1, 2, 2 }, tensor{ 1 }, tensor{ 0 }, data_types::f16, data_types::f16, data_types::f16, format::bfyx, data_types::f16, format::bfyx +#define CASE_GEMM_3IN_FP16_2 { { 1, 1, 31, 31 }, { 1, 1, 31, 31 }, { 1, 1, 31, 31 } }, { 1, 1, 31, 31 }, tensor{ 1 }, tensor{ 0 }, data_types::f16, data_types::f16, data_types::f16, format::bfyx, data_types::f16, format::bfyx +#define CASE_GEMM_3IN_FP16_3 { { 1, 1, 64, 64 }, { 1, 1, 64, 64 }, { 1, 1, 64, 64 } }, { 1, 1, 64, 64 }, tensor{ 1 }, tensor{ 0 }, data_types::f16, data_types::f16, data_types::f16, format::bfyx, data_types::f16, format::bfyx +#define CASE_GEMM_3IN_FP16_4 { { 1, 2, 64, 128 }, { 1, 2, 256, 64 }, { 1, 2, 256, 128 } }, { 1, 2, 256, 128 }, tensor{ 1 }, tensor{ 0 }, data_types::f16, data_types::f16, data_types::f16, format::bfyx, data_types::f16, format::bfyx +#define CASE_GEMM_3IN_S8S8_1 { { 1, 1, 2, 2 }, { 1, 1, 2, 2 }, { 1, 1, 2, 2 } }, { 1, 1, 2, 2 }, tensor{ 1 }, tensor{ 0 }, data_types::i8, data_types::i8, data_types::i8, format::bfyx, data_types::f32, format::bfyx +#define CASE_GEMM_3IN_S8S8_2 { { 1, 2, 64, 128 }, { 1, 2, 256, 64 }, { 1, 2, 256, 128 } }, { 1, 2, 256, 128 }, tensor{ 1 }, tensor{ 0 }, data_types::i8, data_types::i8, data_types::i8, format::bfyx, data_types::f32, format::bfyx +#define CASE_GEMM_3IN_S8S8_3 { { 1, 1, 8, 16 }, { 1, 1, 32, 8 }, { 1, 1, 32, 16 } 
}, { 1, 1, 32, 16 }, tensor{ 1 }, tensor{ 0 }, data_types::i8, data_types::i8, data_types::i8, format::bfyx, data_types::f32, format::bfyx + +#define CASE_GEMM_2IN_FP32_1 { { 1, 1, 2, 2 }, { 1, 1, 2, 2 } }, { 1, 1, 2, 2 }, tensor{ 1 }, tensor{ 0 }, data_types::f32, data_types::f32, data_types::f32, format::bfyx, data_types::f32, format::bfyx +#define CASE_GEMM_2IN_FP32_2 { { 1, 1, 63, 63 }, { 1, 1, 63, 63 } }, { 1, 1, 63, 63 }, tensor{ 1 }, tensor{ 0 }, data_types::f32, data_types::f32, data_types::f32, format::bfyx, data_types::f32, format::bfyx +#define CASE_GEMM_2IN_FP32_3 { { 1, 1, 128, 128 }, { 1, 1, 128, 128 } }, { 1, 1, 128, 128 }, tensor{ 1 }, tensor{ 0 }, data_types::f32, data_types::f32, data_types::f32, format::bfyx, data_types::f32, format::bfyx +#define CASE_GEMM_2IN_FP32_4 { { 1, 2, 64, 128 }, { 1, 2, 256, 64 } }, { 1, 2, 256, 128 }, tensor{ 1 }, tensor{ 0 }, data_types::f32, data_types::f32, data_types::f32, format::bfyx, data_types::f32, format::bfyx +#define CASE_GEMM_2IN_FP16_1 { { 1, 1, 2, 2 }, { 1, 1, 2, 2 } }, { 1, 1, 2, 2 }, tensor{ 1 }, tensor{ 0 }, data_types::f16, data_types::f16, data_types::f16, format::bfyx, data_types::f16, format::bfyx +#define CASE_GEMM_2IN_FP16_2 { { 1, 1, 31, 31 }, { 1, 1, 31, 31 } }, { 1, 1, 31, 31 }, tensor{ 1 }, tensor{ 0 }, data_types::f16, data_types::f16, data_types::f16, format::bfyx, data_types::f16, format::bfyx +#define CASE_GEMM_2IN_FP16_3 { { 1, 1, 64, 64 }, { 1, 1, 64, 64 } }, { 1, 1, 64, 64 }, tensor{ 1 }, tensor{ 0 }, data_types::f16, data_types::f16, data_types::f16, format::bfyx, data_types::f16, format::bfyx +#define CASE_GEMM_2IN_FP16_4 { { 1, 2, 64, 128 }, { 1, 2, 256, 64 } }, { 1, 2, 256, 128 }, tensor{ 1 }, tensor{ 0 }, data_types::f16, data_types::f16, data_types::f16, format::bfyx, data_types::f16, format::bfyx +#define CASE_GEMM_2IN_U8U8_1 { { 1, 1, 2, 2 }, { 1, 1, 2, 2 } }, { 1, 1, 2, 2 }, tensor{ 1 }, tensor{ 0 }, data_types::u8, data_types::u8, data_types::u8, format::bfyx, 
data_types::f32, format::bfyx +#define CASE_GEMM_2IN_U8U8_2 { { 1, 2, 64, 128 }, { 1, 2, 256, 64 } }, { 1, 2, 256, 128 }, tensor{ 1 }, tensor{ 0 }, data_types::u8, data_types::u8, data_types::u8, format::bfyx, data_types::f32, format::bfyx +#define CASE_GEMM_2IN_U8U8_3 { { 1, 1, 16, 32 }, { 1, 1, 32, 16 } }, { 1, 1, 32, 32 }, tensor{ 1 }, tensor{ 0 }, data_types::u8, data_types::u8, data_types::u8, format::bfyx, data_types::f32, format::bfyx + +#define CASE_GEMM_2IN_U8S8_1 { { 1, 1, 4, 2 }, { 1, 1, 8, 4 } }, { 1, 1, 8, 4 }, tensor{ 1 }, tensor{ 0 }, data_types::u8, data_types::i8, data_types::u8, format::bfyx, data_types::f32, format::bfyx +#define CASE_GEMM_2IN_S8U8_1 { { 1, 2, 64, 128 }, { 1, 2, 256, 64 } }, { 1, 2, 256, 128 }, tensor{ 1 }, tensor{ 0 }, data_types::i8, data_types::u8, data_types::u8, format::bfyx, data_types::f32, format::bfyx + +#define CASE_GEMM_ELTWISE_2IN_FP32_1 { { 1, 1, 4, 4 }, { 1, 1, 4, 4 } }, { 1, 1, 4, 4 }, tensor{ 1 }, tensor{ 0 }, data_types::f32, data_types::f32, data_types::f32, format::bfyx, data_types::f32, format::bfyx +#define CASE_GEMM_ELTWISE_2IN_FP16_1 { { 1, 1, 32, 32 }, { 1, 1, 32, 32 } }, { 1, 1, 32, 32 }, tensor{ 1 }, tensor{ 0 }, data_types::f16, data_types::f16, data_types::f16, format::bfyx, data_types::f16, format::bfyx +#define CASE_GEMM_ELTWISE_2IN_U8S8_1 { { 1, 1, 4, 4 }, { 1, 1, 4, 4 } }, { 1, 1, 4, 4 }, tensor{ 1 }, tensor{ 0 }, data_types::u8, data_types::i8, data_types::u8, format::bfyx, data_types::f32, format::bfyx +#define CASE_GEMM_ELTWISE_2IN_S8U8_1 { { 1, 1, 32, 32 }, { 1, 1, 32, 32 } }, { 1, 1, 32, 32 }, tensor{ 1 }, tensor{ 0 }, data_types::i8, data_types::u8, data_types::u8, format::bfyx, data_types::f32, format::bfyx + +class gemm_3in_quantize_i8 : public GemmFusingTest {}; +TEST_P(gemm_3in_quantize_i8, basic) { + auto p = GetParam(); + create_topologies( + input_layout("input0", get_input_layout(p, 0)), + input_layout("input1", get_input_layout(p, 1)), + input_layout("input2", get_input_layout(p, 
2)), + data("in_lo", get_mem(get_per_channel_layout(p), min_random, 0)), + data("in_hi", get_mem(get_per_channel_layout(p), 1, max_random)), + data("out_lo", get_mem(get_single_element_layout(p), -127)), + data("out_hi", get_mem(get_single_element_layout(p), 127)), + gemm("gemm_prim", { "input0", "input1", "input2" }, data_types::f32), + quantize("quantize", "gemm_prim", "in_lo", "in_hi", "out_lo", "out_hi", 255, data_types::i8), + reorder("reorder_bfyx", "quantize", p.default_format, data_types::f32) + ); + + tolerance = 1.0f; + execute(p); +} + +INSTANTIATE_TEST_SUITE_P(fusings_gpu, gemm_3in_quantize_i8, ::testing::ValuesIn(std::vector{ + gemm_test_params{ CASE_GEMM_3IN_FP16_1, 4, 5 }, + gemm_test_params{ CASE_GEMM_3IN_FP16_2, 4, 5 }, + gemm_test_params{ CASE_GEMM_3IN_FP16_3, 4, 5 }, + gemm_test_params{ CASE_GEMM_3IN_FP16_4, 4, 5 }, + gemm_test_params{ CASE_GEMM_3IN_FP32_1, 4, 5 }, + gemm_test_params{ CASE_GEMM_3IN_FP32_2, 4, 5 }, + gemm_test_params{ CASE_GEMM_3IN_FP32_3, 4, 5 }, + gemm_test_params{ CASE_GEMM_3IN_FP32_4, 4, 5 }, + gemm_test_params{ CASE_GEMM_3IN_S8S8_1, 4, 5 }, + gemm_test_params{ CASE_GEMM_3IN_S8S8_2, 4, 5 }, + gemm_test_params{ CASE_GEMM_3IN_S8S8_3, 4, 5 }, +})); + +class gemm_2in_quantize_u8 : public GemmFusingTest {}; +TEST_P(gemm_2in_quantize_u8, basic) { + auto p = GetParam(); + create_topologies( + input_layout("input0", get_input_layout(p, 0)), + input_layout("input1", get_input_layout(p, 1)), + data("in_lo", get_mem(get_per_channel_layout(p), 0)), + data("in_hi", get_mem(get_per_channel_layout(p), 1, max_random)), + data("out_lo", get_mem(get_single_element_layout(p), 0)), + data("out_hi", get_mem(get_single_element_layout(p), 255)), + gemm("gemm_prim", { "input0", "input1" }, data_types::f32), + quantize("quantize", "gemm_prim", "in_lo", "in_hi", "out_lo", "out_hi", 256, data_types::u8), + reorder("reorder_bfyx", "quantize", p.default_format, data_types::f32) + ); + + tolerance = 1.0f; + execute(p); +} + 
+INSTANTIATE_TEST_SUITE_P(fusings_gpu, gemm_2in_quantize_u8, ::testing::ValuesIn(std::vector{ + gemm_test_params{ CASE_GEMM_2IN_FP16_1, 3, 4 }, + gemm_test_params{ CASE_GEMM_2IN_FP16_2, 3, 4 }, + gemm_test_params{ CASE_GEMM_2IN_FP16_3, 3, 4 }, + gemm_test_params{ CASE_GEMM_2IN_FP16_4, 3, 4 }, + gemm_test_params{ CASE_GEMM_2IN_FP32_1, 3, 4 }, + gemm_test_params{ CASE_GEMM_2IN_FP32_2, 3, 4 }, + gemm_test_params{ CASE_GEMM_2IN_FP32_3, 3, 4 }, + gemm_test_params{ CASE_GEMM_2IN_FP32_4, 3, 4 }, + gemm_test_params{ CASE_GEMM_2IN_U8U8_1, 3, 4 }, + gemm_test_params{ CASE_GEMM_2IN_U8U8_2, 3, 4 }, + gemm_test_params{ CASE_GEMM_2IN_U8U8_3, 3, 4 }, +})); + +class gemm_2in_quantize_float_in : public GemmFusingTest {}; +TEST_P(gemm_2in_quantize_float_in, basic) { + auto p = GetParam(); + create_topologies( + input_layout("input0", get_input_layout(p, 0)), + input_layout("input1", get_input_layout(p, 1)), + data("in_lo", get_mem(get_per_channel_layout(p), 0)), + data("in_hi", get_mem(get_per_channel_layout(p), 1, max_random)), + data("out_lo", get_mem(get_single_element_layout(p), 0)), + data("out_hi", get_mem(get_single_element_layout(p), 255)), + gemm("gemm_prim", { "input0", "input1" }, data_types::f32), + quantize("quantize", "gemm_prim", "in_lo", "in_hi", "out_lo", "out_hi", 256, data_types::u8), + reorder("reorder_bfyx", "quantize", p.default_format, data_types::f32) + ); + + implementation_desc gemm_impl = { format::bfyx, "gemm_tiled_opt" }; + bo_fused.set_option(build_option::force_implementations({ { "gemm_prim", gemm_impl } })); + + tolerance = 1.0f; + execute(p); +} + +INSTANTIATE_TEST_SUITE_P(fusings_gpu, gemm_2in_quantize_float_in, ::testing::ValuesIn(std::vector{ + gemm_test_params{ CASE_GEMM_2IN_FP16_1, 3, 4 }, + gemm_test_params{ CASE_GEMM_2IN_FP16_2, 3, 4 }, + gemm_test_params{ CASE_GEMM_2IN_FP16_3, 3, 4 }, + gemm_test_params{ CASE_GEMM_2IN_FP16_4, 3, 4 }, + gemm_test_params{ CASE_GEMM_2IN_FP32_1, 3, 4 }, + gemm_test_params{ CASE_GEMM_2IN_FP32_2, 3, 4 }, + 
gemm_test_params{ CASE_GEMM_2IN_FP32_3, 3, 4 }, + gemm_test_params{ CASE_GEMM_2IN_FP32_4, 3, 4 }, + gemm_test_params{ CASE_GEMM_ELTWISE_2IN_FP16_1, 3, 4 }, + gemm_test_params{ CASE_GEMM_ELTWISE_2IN_FP32_1, 3, 4 }, +})); + +class gemm_2in_scale : public GemmFusingTest {}; +TEST_P(gemm_2in_scale, basic) { + auto p = GetParam(); + create_topologies( + input_layout("input0", get_input_layout(p, 0)), + input_layout("input1", get_input_layout(p, 1)), + data("scale_data", get_mem(get_per_channel_layout(p), 1.0f/p.kernel.count())), + gemm("gemm_prim", { "input0", "input1" }, data_types::f32), + scale("scale", "gemm_prim", "scale_data"), + reorder("reorder_bfyx", "scale", p.default_format, data_types::f32) + ); + + tolerance = 1e-5f; + execute(p); +} + +TEST_P(gemm_2in_scale, fp16_scale_out) { + auto p = GetParam(); + create_topologies( + input_layout("input0", get_input_layout(p, 0)), + input_layout("input1", get_input_layout(p, 1)), + data("scale_data", get_mem(get_per_channel_layout(p), 1.0f/p.kernel.count())), + gemm("gemm_prim", { "input0", "input1" }, data_types::f32), + scale("scale", "gemm_prim", "scale_data", optional_data_type{ data_types::f16 }), + reorder("reorder_bfyx", "scale", p.default_format, data_types::f32) + ); + + tolerance = 1e-5f; + execute(p); +} + +INSTANTIATE_TEST_SUITE_P(fusings_gpu, gemm_2in_scale, ::testing::ValuesIn(std::vector{ + gemm_test_params{ CASE_GEMM_2IN_FP32_1, 3, 4 }, + gemm_test_params{ CASE_GEMM_2IN_FP32_2, 3, 4 }, + gemm_test_params{ CASE_GEMM_2IN_FP32_3, 3, 4 }, + gemm_test_params{ CASE_GEMM_2IN_FP32_4, 3, 4 }, + gemm_test_params{ CASE_GEMM_2IN_FP16_1, 3, 4 }, + gemm_test_params{ CASE_GEMM_2IN_FP16_2, 3, 4 }, + gemm_test_params{ CASE_GEMM_2IN_FP16_3, 3, 4 }, + gemm_test_params{ CASE_GEMM_2IN_FP16_4, 3, 4 }, + gemm_test_params{ CASE_GEMM_2IN_U8U8_1, 3, 4 }, + gemm_test_params{ CASE_GEMM_2IN_U8U8_2, 3, 4 }, + gemm_test_params{ CASE_GEMM_2IN_U8U8_3, 3, 4 }, +})); + +class gemm_2in_act_scale_quantize_i8 : public GemmFusingTest {}; 
+TEST_P(gemm_2in_act_scale_quantize_i8, basic) { + auto p = GetParam(); + create_topologies( + input_layout("input0", get_input_layout(p, 0)), + input_layout("input1", get_input_layout(p, 1)), + data("in_lo", get_mem(get_per_channel_layout(p), min_random, 0)), + data("in_hi", get_mem(get_per_channel_layout(p), 1, max_random)), + data("out_lo", get_mem(get_single_element_layout(p), -127)), + data("out_hi", get_mem(get_single_element_layout(p), 127)), + data("scale_data", get_mem(get_per_channel_layout(p), 1.0f / p.kernel.count() / 255)), + gemm("gemm_prim", { "input0", "input1" }, data_types::f32), + activation("activation", "gemm_prim", activation_func::exp), + scale("scale", "activation", "scale_data"), + quantize("quantize", "scale", "in_lo", "in_hi", "out_lo", "out_hi", 255, data_types::i8), + reorder("reorder_bfyx", "quantize", p.default_format, data_types::f32) + ); + + tolerance = 1.0f; + execute(p); +} + +INSTANTIATE_TEST_SUITE_P(fusings_gpu, gemm_2in_act_scale_quantize_i8, ::testing::ValuesIn(std::vector{ + gemm_test_params{ CASE_GEMM_2IN_FP32_1, 3, 6 }, + gemm_test_params{ CASE_GEMM_2IN_FP32_2, 3, 6 }, + gemm_test_params{ CASE_GEMM_2IN_FP32_3, 3, 6 }, + gemm_test_params{ CASE_GEMM_2IN_FP32_4, 3, 6 }, + gemm_test_params{ CASE_GEMM_2IN_FP16_1, 3, 6 }, + gemm_test_params{ CASE_GEMM_2IN_FP16_2, 3, 6 }, + gemm_test_params{ CASE_GEMM_2IN_FP16_3, 3, 6 }, + gemm_test_params{ CASE_GEMM_2IN_FP16_4, 3, 6 }, + gemm_test_params{ CASE_GEMM_2IN_U8S8_1, 3, 6 }, + gemm_test_params{ CASE_GEMM_2IN_S8U8_1, 3, 6 }, +})); + +class gemm_2in_act_scale_quantize_eltwise_i8 : public GemmFusingTest {}; +TEST_P(gemm_2in_act_scale_quantize_eltwise_i8, basic) { + auto p = GetParam(); + create_topologies( + input_layout("input0", get_input_layout(p, 0)), + input_layout("input1", get_input_layout(p, 1)), + data("in_lo", get_mem(get_per_channel_layout(p), min_random, 0)), + data("in_hi", get_mem(get_per_channel_layout(p), 1, max_random)), + data("out_lo", 
get_mem(get_single_element_layout(p), -127)), + data("out_hi", get_mem(get_single_element_layout(p), 127)), + data("scale_data", get_mem(get_per_channel_layout(p), 1.0f / p.kernel.count() / 255)), + data("eltwise_data", get_mem(get_output_layout(p))), + gemm("gemm_prim", { "input0", "input1" }, data_types::f32), + activation("activation", "gemm_prim", activation_func::exp), + scale("scale", "activation", "scale_data"), + quantize("quantize", "scale", "in_lo", "in_hi", "out_lo", "out_hi", 255, data_types::i8), + eltwise("sum", { "quantize", "eltwise_data" }, eltwise_mode::sum, data_types::f32), + reorder("reorder_bfyx", "sum", p.default_format, data_types::f32) + ); + + tolerance = 1.0f; + execute(p); +} + +INSTANTIATE_TEST_SUITE_P(fusings_gpu, gemm_2in_act_scale_quantize_eltwise_i8, ::testing::ValuesIn(std::vector{ + gemm_test_params{ CASE_GEMM_ELTWISE_2IN_FP32_1, 3, 7 }, + gemm_test_params{ CASE_GEMM_ELTWISE_2IN_FP16_1, 3, 7 }, + gemm_test_params{ CASE_GEMM_ELTWISE_2IN_U8S8_1, 3, 7 }, + gemm_test_params{ CASE_GEMM_ELTWISE_2IN_S8U8_1, 3, 7 }, +})); + +class gemm_2in_act_scale_eltwise : public GemmFusingTest {}; +TEST_P(gemm_2in_act_scale_eltwise, basic) { + auto p = GetParam(); + create_topologies( + input_layout("input0", get_input_layout(p, 0)), + input_layout("input1", get_input_layout(p, 1)), + data("scale_data", get_mem(get_per_channel_layout(p), 1.0f / p.kernel.count() / 255)), + data("eltwise_data", get_mem(get_output_layout(p))), + gemm("gemm_prim", { "input0", "input1" }, data_types::f32), + scale("scale", "gemm_prim", "scale_data"), + activation("activation", "scale", activation_func::negative), + eltwise("sum", { "activation", "eltwise_data" }, eltwise_mode::sum, data_types::f32), + reorder("reorder_bfyx", "sum", p.default_format, data_types::f32) + ); + + tolerance = 1e-4f; + execute(p); +} + +TEST_P(gemm_2in_act_scale_eltwise, broadcast_eltwise) { + auto p = GetParam(); + create_topologies( + input_layout("input0", get_input_layout(p, 0)), + 
input_layout("input1", get_input_layout(p, 1)), + data("scale_data", get_mem(get_per_channel_layout(p), 1.0f / p.kernel.count() / 255)), + data("eltwise_data", get_mem(get_single_element_layout(p))), + gemm("gemm_prim", { "input0", "input1" }, data_types::f32), + scale("scale", "gemm_prim", "scale_data"), + activation("activation", "scale", activation_func::negative), + eltwise("sum", { "activation", "eltwise_data" }, eltwise_mode::sum, data_types::f32), + reorder("reorder_bfyx", "sum", p.default_format, data_types::f32) + ); + + tolerance = 1e-4f; + execute(p); +} + +INSTANTIATE_TEST_SUITE_P(fusings_gpu, gemm_2in_act_scale_eltwise, ::testing::ValuesIn(std::vector{ + gemm_test_params{ CASE_GEMM_ELTWISE_2IN_FP32_1, 3, 6 }, + gemm_test_params{ CASE_GEMM_ELTWISE_2IN_FP16_1, 3, 6 }, + gemm_test_params{ CASE_GEMM_ELTWISE_2IN_U8S8_1, 3, 6 }, + gemm_test_params{ CASE_GEMM_ELTWISE_2IN_S8U8_1, 3, 6 }, +})); diff --git a/src/plugins/intel_gpu/tests/fusions/lrn_fusion_test.cpp b/src/plugins/intel_gpu/tests/fusions/lrn_fusion_test.cpp new file mode 100644 index 00000000000..210e2d68cf1 --- /dev/null +++ b/src/plugins/intel_gpu/tests/fusions/lrn_fusion_test.cpp @@ -0,0 +1,292 @@ +// Copyright (C) 2018-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "test_utils.h" +#include "fusion_test_common.hpp" + +#include +#include +#include +#include +#include + +#include + +using namespace cldnn; +using namespace ::tests; + +namespace { + +struct lrn_test_params { + tensor in_shape; + data_types data_type; + format input_format; + data_types default_type; + format default_format; + size_t expected_fused_primitives; + size_t expected_not_fused_primitives; + lrn_norm_region lrn_type; + std::string kernel_name; +}; + +class LrnFusingTest : public ::BaseFusingTest { +public: + void execute(lrn_test_params& p) { + auto input_prim = get_mem(get_input_layout(p)); + + build_options options; + implementation_desc lrn_impl = { p.input_format, p.kernel_name }; + 
options.set_option(build_option::optimize_data(true)); + options.set_option(build_option::force_implementations({ { "lrn_norm", lrn_impl } })); + network network_fused(this->engine, this->topology_fused, options); + network network_not_fused(this->engine, this->topology_non_fused, this->bo_not_fused); + + network_fused.set_input_data("input", input_prim); + network_not_fused.set_input_data("input", input_prim); + + ASSERT_FALSE(network_fused.get_primitives_info().empty()); + ASSERT_FALSE(network_not_fused.get_primitives_info().empty()); + + auto find_lrn = [&](primitive_info& p) -> bool { + if (p.original_id == "lrn_norm" || p.original_id == "reorder") + return true; + return false; + }; + + auto pi_fused = network_fused.get_primitives_info(); + auto pi_not_fused = network_not_fused.get_primitives_info(); + auto info_fused = std::find_if(pi_fused.begin(), pi_fused.end(), find_lrn); + auto info_not_fused = std::find_if(pi_not_fused.begin(), pi_not_fused.end(), find_lrn); + + ASSERT_TRUE(info_fused != pi_fused.end()); + ASSERT_TRUE(info_not_fused != pi_not_fused.end()); + + compare(network_not_fused, network_fused, p); + } + + layout get_input_layout(lrn_test_params& p) { + return layout{ p.data_type, p.input_format, p.in_shape }; + } + + layout get_per_channel_layout(lrn_test_params& p) { + return layout{ p.default_type, p.default_format, tensor{ 1, p.in_shape.feature[0], 1, 1 } }; + } +}; + +} // namespace + +/* ----------------------------------------------------------------------------------------------------- */ +/* ---------------------------------------- LRN cases -------------------------------------------------- */ +/* ----------------------------------------------------------------------------------------------------- */ + +#define CASE_LRN_FP32_1 { 2, 16, 4, 4 }, data_types::f32, format::bfyx, data_types::f32, format::bfyx +#define CASE_LRN_FP32_2 { 8, 16, 4, 4 }, data_types::f32, format::yxfb, data_types::f32, format::yxfb +#define CASE_LRN_FP32_3 { 2, 
16, 4, 4 }, data_types::f32, format::byxf, data_types::f32, format::byxf +#define CASE_LRN_FP32_4 { 2, 16, 4, 4 }, data_types::f32, format::b_fs_yx_fsv4, data_types::f32, format::bfyx +#define CASE_LRN_FP32_5 { 2, 16, 4, 4 }, data_types::f32, format::b_fs_yx_fsv16, data_types::f32, format::bfyx + +#define CASE_LRN_FP32_TO_FP16_1 { 2, 16, 5, 5 }, data_types::f32, format::bfyx, data_types::f16, format::bfyx +#define CASE_LRN_FP32_TO_FP16_2 { 2, 16, 5, 5 }, data_types::f32, format::byxf, data_types::f16, format::byxf +#define CASE_LRN_FP32_TO_FP16_3 { 8, 16, 4, 4 }, data_types::f32, format::yxfb, data_types::f16, format::byxf +#define CASE_LRN_FP32_TO_FP16_4 { 2, 16, 4, 4 }, data_types::f32, format::b_fs_yx_fsv4, data_types::f16, format::bfyx +#define CASE_LRN_FP32_TO_FP16_5 { 2, 16, 4, 4 }, data_types::f32, format::b_fs_yx_fsv16, data_types::f16, format::bfyx + +#define CASE_LRN_FP16_1 { 2, 16, 4, 4 }, data_types::f16, format::bfyx, data_types::f16, format::bfyx +#define CASE_LRN_FP16_2 { 8, 16, 4, 4 }, data_types::f16, format::yxfb, data_types::f16, format::yxfb +#define CASE_LRN_FP16_3 { 2, 16, 4, 4 }, data_types::f16, format::byxf, data_types::f16, format::byxf +#define CASE_LRN_FP16_4 { 2, 16, 4, 4 }, data_types::f16, format::b_fs_yx_fsv4, data_types::f16, format::bfyx +#define CASE_LRN_FP16_5 { 2, 16, 4, 4 }, data_types::f16, format::b_fs_yx_fsv16, data_types::f16, format::bfyx + +class lrn_fp32_quantize_u8_scale_activation : public LrnFusingTest {}; +TEST_P(lrn_fp32_quantize_u8_scale_activation, basic) { + auto p = GetParam(); + + uint32_t size = 5; + float k = 1.0f; + float alpha = (float)9.9e-05; + float beta = 0.75; + + create_topologies( + input_layout("input", get_input_layout(p)), + data("in_lo", get_mem(get_single_element_layout(p), min_random, 0)), + data("in_hi", get_mem(get_single_element_layout(p), 1, max_random)), + data("out_lo", get_mem(get_single_element_layout(p), 0)), + data("out_hi", get_mem(get_single_element_layout(p), 255)), + 
data("scale_data", get_mem(get_single_element_layout(p), 1.0f / 255)), + lrn("lrn_norm", "input", size, k, alpha, beta, p.lrn_type), + quantize("quantize", "lrn_norm", "in_lo", "in_hi", "out_lo", "out_hi", 256, data_types::u8), + scale("scale", "quantize", "scale_data"), + activation("activation", "scale", activation_func::exp), + reorder("reorder", "activation", p.default_format, data_types::f32) + ); + + tolerance = 1.0f; + execute(p); +} + +TEST_P(lrn_fp32_quantize_u8_scale_activation, per_channel) { + auto p = GetParam(); + + uint32_t size = 5; + float k = 1.0f; + float alpha = (float)9.9e-05; + float beta = 0.75; + + create_topologies( + input_layout("input", get_input_layout(p)), + data("in_lo", get_mem(get_per_channel_layout(p), min_random, 0)), + data("in_hi", get_mem(get_per_channel_layout(p), 1, max_random)), + data("out_lo", get_mem(get_single_element_layout(p), 0)), + data("out_hi", get_mem(get_single_element_layout(p), 255)), + data("scale_data", get_mem(get_per_channel_layout(p), 1.0f / 255)), + lrn("lrn_norm", "input", size, k, alpha, beta, p.lrn_type), + quantize("quantize", "lrn_norm", "in_lo", "in_hi", "out_lo", "out_hi", 256, data_types::u8), + scale("scale", "quantize", "scale_data"), + activation("activation", "scale", activation_func::exp), + reorder("reorder", "activation", p.default_format, data_types::f32) + ); + + tolerance = 1.0f; + execute(p); +} + +INSTANTIATE_TEST_SUITE_P(fusings_gpu, lrn_fp32_quantize_u8_scale_activation, ::testing::ValuesIn(std::vector{ + // InputDataType = FP32 OutputDataType = FP32 + lrn_test_params{ CASE_LRN_FP32_1, 2, 5, lrn_norm_region_across_channel, "lrn_ref" }, + lrn_test_params{ CASE_LRN_FP32_1, 2, 5, lrn_norm_region_within_channel, "lrn_gpu_within_channel_opt" }, + lrn_test_params{ CASE_LRN_FP32_1, 2, 5, lrn_norm_region_within_channel, "lrn_gpu_within_channel" }, + lrn_test_params{ CASE_LRN_FP32_1, 2, 5, lrn_norm_region_across_channel, "lrn_gpu_across_channel_ref" }, + lrn_test_params{ CASE_LRN_FP32_1, 2, 
5, lrn_norm_region_across_channel, "lrn_gpu_across_channel_multiple_features" }, + lrn_test_params{ CASE_LRN_FP32_2, 2, 5, lrn_norm_region_across_channel, "lrn_gpu_across_channel_yxfb_b8_opt" }, + lrn_test_params{ CASE_LRN_FP32_3, 2, 5, lrn_norm_region_within_channel, "lrn_within_channel_byxf_opt" }, + lrn_test_params{ CASE_LRN_FP32_4, 2, 5, lrn_norm_region_across_channel, "lrn_gpu_across_channel_multiple_features" }, + lrn_test_params{ CASE_LRN_FP32_5, 2, 5, lrn_norm_region_across_channel, "lrn_gpu_across_channel_multiple_features_fsv16" }, + + // InputDataType = FP32 OutputDataType = FP16 + lrn_test_params{ CASE_LRN_FP32_TO_FP16_1, 2, 5, lrn_norm_region_across_channel, "lrn_ref" }, + lrn_test_params{ CASE_LRN_FP32_TO_FP16_1, 2, 5, lrn_norm_region_across_channel, "lrn_gpu_across_channel_multiple_features" }, + lrn_test_params{ CASE_LRN_FP32_TO_FP16_1, 2, 5, lrn_norm_region_across_channel, "lrn_gpu_across_channel_ref" }, + lrn_test_params{ CASE_LRN_FP32_TO_FP16_1, 2, 5, lrn_norm_region_within_channel, "lrn_gpu_within_channel_opt" }, + lrn_test_params{ CASE_LRN_FP32_TO_FP16_1, 2, 5, lrn_norm_region_within_channel, "lrn_gpu_within_channel" }, + lrn_test_params{ CASE_LRN_FP32_TO_FP16_3, 2, 5, lrn_norm_region_across_channel, "lrn_gpu_across_channel_yxfb_b8_opt" }, + lrn_test_params{ CASE_LRN_FP32_TO_FP16_4, 2, 5, lrn_norm_region_across_channel, "lrn_gpu_across_channel_multiple_features" }, + lrn_test_params{ CASE_LRN_FP32_TO_FP16_5, 2, 5, lrn_norm_region_across_channel, "lrn_gpu_across_channel_multiple_features_fsv16" }, +})); + +class lrn_fp32_quantize_i8_scale_activation : public LrnFusingTest {}; +TEST_P(lrn_fp32_quantize_i8_scale_activation, basic) { + auto p = GetParam(); + + uint32_t size = 5; + float k = 1.0f; + float alpha = (float)9.9e-05; + float beta = 0.75; + + create_topologies( + input_layout("input", get_input_layout(p)), + data("in_lo", get_mem(get_single_element_layout(p), min_random, 0)), + data("in_hi", get_mem(get_single_element_layout(p), 1, 
max_random)), + data("out_lo", get_mem(get_single_element_layout(p), -127)), + data("out_hi", get_mem(get_single_element_layout(p), 127)), + data("scale_data", get_mem(get_single_element_layout(p), 1.0f / 255)), + lrn("lrn_norm", "input", size, k, alpha, beta, p.lrn_type), + scale("scale", "lrn_norm", "scale_data"), + activation("activation", "scale", activation_func::exp), + quantize("quantize", "activation", "in_lo", "in_hi", "out_lo", "out_hi", 256, data_types::i8), + reorder("reorder", "quantize", p.default_format, data_types::f32) + ); + + tolerance = 1.0f; + execute(p); +} + +INSTANTIATE_TEST_SUITE_P(fusings_gpu, lrn_fp32_quantize_i8_scale_activation, ::testing::ValuesIn(std::vector{ + // InputDataType = FP32 OutputDataType = INT8 + lrn_test_params{ CASE_LRN_FP32_1, 2, 5, lrn_norm_region_within_channel, "lrn_gpu_within_channel_opt" }, + lrn_test_params{ CASE_LRN_FP32_1, 2, 5, lrn_norm_region_within_channel, "lrn_gpu_within_channel" }, + lrn_test_params{ CASE_LRN_FP32_1, 2, 5, lrn_norm_region_across_channel, "lrn_ref" }, + lrn_test_params{ CASE_LRN_FP32_1, 2, 5, lrn_norm_region_across_channel, "lrn_gpu_across_channel_multiple_features" }, + lrn_test_params{ CASE_LRN_FP32_1, 2, 5, lrn_norm_region_across_channel, "lrn_gpu_across_channel_ref" }, + lrn_test_params{ CASE_LRN_FP32_2, 2, 5, lrn_norm_region_across_channel, "lrn_gpu_across_channel_yxfb_b8_opt" }, + lrn_test_params{ CASE_LRN_FP32_3, 2, 5, lrn_norm_region_within_channel, "lrn_within_channel_byxf_opt" }, + lrn_test_params{ CASE_LRN_FP32_4, 2, 5, lrn_norm_region_across_channel, "lrn_gpu_across_channel_multiple_features" }, + lrn_test_params{ CASE_LRN_FP32_5, 2, 5, lrn_norm_region_across_channel, "lrn_gpu_across_channel_multiple_features_fsv16" }, + + // InputDataType = FP16 OutputDataType = INT8/UINT8 can't be tested for now, because quantize + // primitive doesn't support input type FP16 while fusing (prepare_quantization.cpp :114 -> prepare_primitive_fusing.cpp :474) +})); + +class 
lrn_fp32_scale_activation_quantize_u8 : public LrnFusingTest {}; +TEST_P(lrn_fp32_scale_activation_quantize_u8, basic) { + auto p = GetParam(); + + uint32_t size = 5; + float k = 1.0f; + float alpha = (float)9.9e-05; + float beta = 0.75; + + create_topologies( + input_layout("input", get_input_layout(p)), + data("in_lo", get_mem(get_single_element_layout(p), min_random, 0)), + data("in_hi", get_mem(get_single_element_layout(p), 1, max_random)), + data("out_lo", get_mem(get_single_element_layout(p), 0)), + data("out_hi", get_mem(get_single_element_layout(p), 255)), + data("scale_data", get_mem(get_single_element_layout(p), 1.0f / 255)), + lrn("lrn_norm", "input", size, k, alpha, beta, p.lrn_type), + scale("scale", "lrn_norm", "scale_data"), + activation("activation", "scale", activation_func::exp), + quantize("quantize", "activation", "in_lo", "in_hi", "out_lo", "out_hi", 256, data_types::u8), + reorder("reorder", "quantize", p.default_format, data_types::f32) + ); + + tolerance = 1.0f; + execute(p); +} + +INSTANTIATE_TEST_SUITE_P(fusings_gpu, lrn_fp32_scale_activation_quantize_u8, ::testing::ValuesIn(std::vector{ + // InputDataType = FP32 OutputDataType = UINT8 + lrn_test_params{ CASE_LRN_FP32_1, 2, 5, lrn_norm_region_across_channel, "lrn_gpu_across_channel_ref" }, + lrn_test_params{ CASE_LRN_FP32_1, 2, 5, lrn_norm_region_within_channel, "lrn_gpu_within_channel_opt" }, + lrn_test_params{ CASE_LRN_FP32_1, 2, 5, lrn_norm_region_within_channel, "lrn_gpu_within_channel" }, + lrn_test_params{ CASE_LRN_FP32_1, 2, 5, lrn_norm_region_across_channel, "lrn_ref" }, + lrn_test_params{ CASE_LRN_FP32_1, 2, 5, lrn_norm_region_across_channel, "lrn_gpu_across_channel_multiple_features" }, + lrn_test_params{ CASE_LRN_FP32_2, 2, 5, lrn_norm_region_across_channel, "lrn_gpu_across_channel_yxfb_b8_opt" }, + lrn_test_params{ CASE_LRN_FP32_3, 2, 5, lrn_norm_region_within_channel, "lrn_within_channel_byxf_opt" }, + lrn_test_params{ CASE_LRN_FP32_4, 2, 5, lrn_norm_region_across_channel, 
"lrn_gpu_across_channel_multiple_features" }, + lrn_test_params{ CASE_LRN_FP32_5, 2, 5, lrn_norm_region_across_channel, "lrn_gpu_across_channel_multiple_features_fsv16" }, +})); + +class lrn_fp16_scale_activation : public LrnFusingTest {}; +TEST_P(lrn_fp16_scale_activation, basic) { + auto p = GetParam(); + + uint32_t size = 5; + float k = 1.0f; + float alpha = (float)9.9e-05; + float beta = 0.75; + + create_topologies( + input_layout("input", get_input_layout(p)), + data("scale_data", get_mem(get_single_element_layout(p), 1.0f / 255)), + lrn("lrn_norm", "input", size, k, alpha, beta, p.lrn_type), + scale("scale", "lrn_norm", "scale_data"), + activation("activation", "scale", activation_func::exp), + reorder("reorder", "activation", p.default_format, data_types::f32) + ); + + tolerance = 1e-05f; + execute(p); +} + +INSTANTIATE_TEST_SUITE_P(fusings_gpu, lrn_fp16_scale_activation, ::testing::ValuesIn(std::vector{ + // InputDataType = FP16 OutputDataType = FP16 + lrn_test_params{ CASE_LRN_FP16_1, 2, 4, lrn_norm_region_within_channel, "lrn_gpu_within_channel_opt" }, + lrn_test_params{ CASE_LRN_FP16_1, 2, 4, lrn_norm_region_within_channel, "lrn_gpu_within_channel" }, + lrn_test_params{ CASE_LRN_FP16_1, 2, 4, lrn_norm_region_across_channel, "lrn_ref" }, + lrn_test_params{ CASE_LRN_FP16_1, 2, 4, lrn_norm_region_across_channel, "lrn_gpu_across_channel_multiple_features" }, + lrn_test_params{ CASE_LRN_FP16_1, 2, 4, lrn_norm_region_across_channel, "lrn_gpu_across_channel_ref" }, + lrn_test_params{ CASE_LRN_FP16_3, 2, 4, lrn_norm_region_within_channel, "lrn_within_channel_byxf_opt" }, + lrn_test_params{ CASE_LRN_FP16_4, 2, 4, lrn_norm_region_across_channel, "lrn_gpu_across_channel_multiple_features" }, + lrn_test_params{ CASE_LRN_FP16_5, 2, 4, lrn_norm_region_across_channel, "lrn_gpu_across_channel_multiple_features_fsv16" }, +})); diff --git a/src/plugins/intel_gpu/tests/fusions/mvn_fusion_test.cpp b/src/plugins/intel_gpu/tests/fusions/mvn_fusion_test.cpp new file mode 
100644 index 00000000000..d0d1d5047e7 --- /dev/null +++ b/src/plugins/intel_gpu/tests/fusions/mvn_fusion_test.cpp @@ -0,0 +1,291 @@ +// Copyright (C) 2018-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "test_utils.h" +#include "fusion_test_common.hpp" + +#include +#include +#include +#include +#include +#include + +#include + +using namespace cldnn; +using namespace ::tests; + +namespace { +struct mvn_test_params { + tensor input_size; + tensor elwise_size; + data_types input_type; + format input_format; + bool across_channels; + bool normalize_variance; + data_types default_type; + format default_format; + size_t expected_fused_primitives; + size_t expected_not_fused_primitives; +}; + +class MVNFusingTest : public ::BaseFusingTest { +public: + void execute(mvn_test_params& p) { + auto input_prim = get_mem(get_input_layout(p)); + + network network_not_fused(this->engine, this->topology_non_fused, bo_not_fused); + network network_fused(this->engine, this->topology_fused, bo_fused); + + network_fused.set_input_data("input", input_prim); + network_not_fused.set_input_data("input", input_prim); + + compare(network_not_fused, network_fused, p); + } + + layout get_input_layout(mvn_test_params& p) { + return layout{ p.input_type, p.input_format, p.input_size }; + } + + layout get_per_channel_layout(mvn_test_params& p) { + return layout{ p.default_type, p.default_format, tensor{ 1, p.input_size.feature[0], 1, 1 } }; + } +}; +} // namespace + + +/* ----------------------------------------------------------------------------------------------------- */ +/* --------------------------------------- MVN cases --------------------------------------------------- */ +/* ----------------------------------------------------------------------------------------------------- */ + +#define CASE_MVN_F32_1 { 1, 16, 8, 8 }, { 1, 16, 8, 8 }, data_types::f32, format::bfyx, false, true, data_types::f32, format::bfyx +#define CASE_MVN_F32_2 { 2, 16, 8, 8 }, { 2, 
16, 8, 8 }, data_types::f32, format::bfyx, true, true, data_types::f32, format::bfyx +#define CASE_MVN_3D_F32_1 { 1, 16, 8, 8, 8 }, { 1, 16, 8, 8, 8 }, data_types::f32, format::bfzyx, false, true, data_types::f32, format::bfzyx +#define CASE_MVN_3D_F32_2 { 2, 16, 8, 8, 8 }, { 2, 16, 8, 8, 8 }, data_types::f32, format::bfzyx, true, true, data_types::f32, format::bfzyx +#define CASE_MVN_3D_F32_3 { 2, 8, 4, 4, 4 }, { 2, 8, 1, 1, 1 }, data_types::f32, format::bfzyx, true, true, data_types::f32, format::bfzyx +#define CASE_MVN_F16_1 { 1, 16, 8, 8 }, { 1, 16, 8, 8 }, data_types::f16, format::bfyx, false, true, data_types::f16, format::bfyx +#define CASE_MVN_F16_2 { 2, 16, 8, 8 }, { 2, 16, 8, 8 }, data_types::f16, format::bfyx, true, true, data_types::f16, format::bfyx +#define CASE_MVN_3D_F16_1 { 1, 16, 8, 8, 8 }, { 1, 16, 8, 8, 8 }, data_types::f16, format::bfzyx, false, true, data_types::f16, format::bfzyx +#define CASE_MVN_3D_F16_2 { 2, 16, 8, 8, 8 }, { 2, 16, 8, 8, 8 }, data_types::f16, format::bfzyx, true, true, data_types::f16, format::bfzyx +#define CASE_MVN_I8_1 { 1, 16, 8, 8 }, { 1, 16, 8, 8 }, data_types::i8, format::bfyx, false, true, data_types::f32, format::bfyx +#define CASE_MVN_I8_2 { 2, 16, 8, 8 }, { 2, 16, 8, 8 }, data_types::i8, format::bfyx, true, true, data_types::f32, format::bfyx +#define CASE_MVN_I8_3 { 1, 16, 8, 8 }, { 1, 16, 8, 8 }, data_types::i8, format::b_fs_yx_fsv16, false, true, data_types::f32, format::bfyx +#define CASE_MVN_I8_4 { 2, 16, 8, 8 }, { 2, 16, 8, 8 }, data_types::i8, format::b_fs_yx_fsv16, true, true, data_types::f32, format::bfyx +#define CASE_MVN_I8_5 { 2, 16, 8, 8 }, { 1, 1, 1, 8 }, data_types::i8, format::b_fs_yx_fsv16, false, true, data_types::f32, format::bfyx +#define CASE_MVN_I8_6 { 2, 16, 8, 8 }, { 1, 1, 1, 1 }, data_types::i8, format::b_fs_yx_fsv16, true, true, data_types::f32, format::bfyx +#define CASE_MVN_I8_7 { 2, 16, 1, 8 }, { 1, 1, 8, 1 }, data_types::i8, format::b_fs_yx_fsv16, true, true, data_types::f32, 
format::bfyx +#define CASE_MVN_3D_I8_1 { 1, 16, 8, 8, 8 }, { 1, 16, 8, 8, 8 }, data_types::i8, format::bfzyx, false, true, data_types::f32, format::bfzyx +#define CASE_MVN_3D_I8_2 { 2, 16, 8, 8, 8 }, { 2, 16, 8, 8, 8 }, data_types::i8, format::bfzyx, true, true, data_types::f32, format::bfzyx +#define CASE_MVN_3D_I8_3 { 2, 16, 8, 8, 8 }, { 2, 1, 8, 8, 1 }, data_types::i8, format::bfzyx, true, true, data_types::f32, format::bfzyx +#define CASE_MVN_3D_I8_4 { 2, 16, 8, 8, 8 }, { 2, 16, 8, 1, 8 }, data_types::i8, format::bfzyx, false, true, data_types::f32, format::bfzyx +#define CASE_MVN_3D_I8_5 { 2, 2, 1, 2, 1 }, { 2, 2, 2, 2, 2 }, data_types::i8, format::bfzyx, false, true, data_types::f32, format::bfzyx +#define CASE_MVN_U8_1 { 1, 16, 8, 8 }, { 1, 16, 8, 8 }, data_types::u8, format::bfyx, false, true, data_types::f32, format::bfyx +#define CASE_MVN_U8_2 { 2, 16, 8, 8 }, { 2, 16, 8, 8 }, data_types::u8, format::bfyx, true, true, data_types::f32, format::bfyx +#define CASE_MVN_U8_3 { 1, 16, 8, 8 }, { 1, 16, 8, 8 }, data_types::u8, format::b_fs_yx_fsv16, false, true, data_types::f32, format::bfyx +#define CASE_MVN_U8_4 { 2, 16, 8, 8 }, { 2, 16, 8, 8 }, data_types::u8, format::b_fs_yx_fsv16, true, true, data_types::f32, format::bfyx +#define CASE_MVN_U8_5 { 2, 16, 8, 8 }, { 2, 1, 8, 8 }, data_types::u8, format::b_fs_yx_fsv16, false, true, data_types::f32, format::bfyx +#define CASE_MVN_U8_6 { 2, 16, 8, 8 }, { 1, 1, 1, 8 }, data_types::u8, format::b_fs_yx_fsv16, true, true, data_types::f32, format::bfyx +#define CASE_MVN_U8_7 { 1, 16, 16, 1 }, { 1, 16, 1, 16 }, data_types::u8, format::b_fs_yx_fsv16, true, true, data_types::f32, format::bfyx +#define CASE_MVN_3D_U8_1 { 1, 16, 8, 8, 8 }, { 1, 16, 8, 8, 8 }, data_types::u8, format::bfzyx, false, true, data_types::f32, format::bfzyx +#define CASE_MVN_3D_U8_2 { 2, 16, 8, 8, 8 }, { 2, 16, 8, 8, 8 }, data_types::u8, format::bfzyx, true, true, data_types::f32, format::bfzyx +#define CASE_MVN_3D_U8_3 { 2, 16, 8, 8, 8 }, { 2, 1, 
1, 1, 1 }, data_types::u8, format::bfzyx, true, true, data_types::f32, format::bfzyx +#define CASE_MVN_3D_U8_4 { 2, 16, 8, 8, 8 }, { 1, 1, 1, 1, 1 }, data_types::u8, format::bfzyx, false, true, data_types::f32, format::bfzyx +#define CASE_MVN_3D_U8_5 { 2, 16, 1, 8, 8 }, { 1, 1, 8, 1, 1 }, data_types::u8, format::bfzyx, false, true, data_types::f32, format::bfzyx + +class mvn_activation : public MVNFusingTest {}; +TEST_P(mvn_activation, basic) { + auto p = GetParam(); + create_topologies( + input_layout("input", get_input_layout(p)), + mvn("mvn", "input", p.normalize_variance, 1e-10f, false, false), + activation("act", "mvn", activation_func::hyperbolic_tan), + reorder("reorder_bfyx", "act", format::bfyx, data_types::f32) + ); + + tolerance = 1e-5f; + execute(p); +} + +INSTANTIATE_TEST_SUITE_P(fusings_gpu, mvn_activation, ::testing::ValuesIn(std::vector{ + mvn_test_params{ CASE_MVN_F32_1, 2, 3 }, + mvn_test_params{ CASE_MVN_F32_2, 2, 3 }, + mvn_test_params{ CASE_MVN_3D_F32_1, 2, 3 }, + mvn_test_params{ CASE_MVN_3D_F32_2, 2, 3 }, + mvn_test_params{ CASE_MVN_F16_1, 2, 3 }, + mvn_test_params{ CASE_MVN_F16_2, 2, 3 }, + mvn_test_params{ CASE_MVN_3D_F16_1, 2, 3 }, + mvn_test_params{ CASE_MVN_3D_F16_2, 2, 3 }, + mvn_test_params{ CASE_MVN_I8_1, 2, 3 }, + mvn_test_params{ CASE_MVN_I8_2, 2, 3 }, + mvn_test_params{ CASE_MVN_I8_3, 2, 3 }, + mvn_test_params{ CASE_MVN_I8_4, 2, 3 }, + mvn_test_params{ CASE_MVN_3D_I8_1, 2, 3 }, + mvn_test_params{ CASE_MVN_3D_I8_2, 2, 3 }, + mvn_test_params{ CASE_MVN_U8_1, 2, 3 }, + mvn_test_params{ CASE_MVN_U8_2, 2, 3 }, + mvn_test_params{ CASE_MVN_U8_3, 2, 3 }, + mvn_test_params{ CASE_MVN_U8_4, 2, 3 }, + mvn_test_params{ CASE_MVN_3D_U8_1, 2, 3 }, + mvn_test_params{ CASE_MVN_3D_U8_2, 2, 3 }, +})); + +class mvn_scale_quantize_i8 : public MVNFusingTest {}; +TEST_P(mvn_scale_quantize_i8, basic) { + auto p = GetParam(); + create_topologies( + input_layout("input", get_input_layout(p)), + mvn("mvn", "input", p.normalize_variance, 1e-10f, false, false), 
+ data("scale_data", get_mem(get_per_channel_layout(p))), + scale("scale", "mvn", "scale_data"), + data("in_low", get_mem(get_per_channel_layout(p), min_random, 0)), + data("in_high", get_mem(get_per_channel_layout(p), 1, max_random)), + data("out_low", get_mem(get_single_element_layout(p), -127, 127)), + data("out_high", get_mem(get_single_element_layout(p), -127, 127)), + quantize("quant", "scale", "in_low", "in_high", "out_low", "out_high", 255, data_types::i8), + reorder("reorder_bfyx", "quant", format::bfyx, data_types::f32) + ); + + tolerance = 1.f; + execute(p); +} + +INSTANTIATE_TEST_SUITE_P(fusings_gpu, mvn_scale_quantize_i8, ::testing::ValuesIn(std::vector{ + // Full fusing for fp input not supported yet, it may lead to output padding and non-optimal kernel + // mvn_test_params{ CASE_MVN_F32_1, 2, 4 }, + // mvn_test_params{ CASE_MVN_F32_2, 2, 4 }, + // mvn_test_params{ CASE_MVN_3D_F32_1, 2, 4 }, + // mvn_test_params{ CASE_MVN_3D_F32_2, 2, 4 }, + // mvn_test_params{ CASE_MVN_F16_1, 2, 4 }, + // mvn_test_params{ CASE_MVN_F16_2, 2, 4 }, + // mvn_test_params{ CASE_MVN_3D_F16_1, 2, 4 }, + // mvn_test_params{ CASE_MVN_3D_F16_2, 2, 4 }, + mvn_test_params{ CASE_MVN_I8_1, 2, 4 }, + mvn_test_params{ CASE_MVN_I8_2, 2, 4 }, + mvn_test_params{ CASE_MVN_I8_3, 2, 4 }, + mvn_test_params{ CASE_MVN_I8_4, 2, 4 }, + mvn_test_params{ CASE_MVN_3D_I8_1, 2, 4 }, + mvn_test_params{ CASE_MVN_3D_I8_2, 2, 4 }, + mvn_test_params{ CASE_MVN_U8_1, 2, 4 }, + mvn_test_params{ CASE_MVN_U8_2, 2, 4 }, + mvn_test_params{ CASE_MVN_U8_3, 2, 4 }, + mvn_test_params{ CASE_MVN_U8_4, 2, 4 }, + mvn_test_params{ CASE_MVN_3D_U8_1, 2, 4 }, + mvn_test_params{ CASE_MVN_3D_U8_2, 2, 4 }, +})); + +class mvn_scale_activation_eltwise_fp32_quantize_i8 : public MVNFusingTest {}; +TEST_P(mvn_scale_activation_eltwise_fp32_quantize_i8, basic) { + auto p = GetParam(); + create_topologies( + input_layout("input", get_input_layout(p)), + mvn("mvn", "input", p.normalize_variance, 1e-10f, false, false), + 
data("scale_data", get_mem(get_per_channel_layout(p))), + scale("scale", "mvn", "scale_data"), + activation("act", "scale", activation_func::hyperbolic_tan), + data("eltw_data", get_mem(layout{ p.input_type, p.default_format, p.elwise_size })), + eltwise("eltw", { "act", "eltw_data" }, eltwise_mode::sum, data_types::f32), + data("in_low", get_mem(get_per_channel_layout(p), min_random, 0)), + data("in_high", get_mem(get_per_channel_layout(p), 1, max_random)), + data("out_low", get_mem(get_single_element_layout(p), -128)), + data("out_high", get_mem(get_single_element_layout(p), 127)), + quantize("quant", "eltw", "in_low", "in_high", "out_low", "out_high", 256, data_types::i8), + reorder("reorder_bfyx", "quant", format::bfyx, data_types::f32) + ); + + tolerance = 1.f; + execute(p); +} + +INSTANTIATE_TEST_SUITE_P(fusings_gpu, mvn_scale_activation_eltwise_fp32_quantize_i8, ::testing::ValuesIn(std::vector{ + // Full fusing for fp input not supported yet, it may lead to output padding and non-optimal kernel + // mvn_test_params{ CASE_MVN_F32_1, 2, 7 }, + // mvn_test_params{ CASE_MVN_F32_2, 2, 7 }, + // mvn_test_params{ CASE_MVN_3D_F32_1, 2, 7 }, + // mvn_test_params{ CASE_MVN_3D_F32_2, 2, 7 }, + // mvn_test_params{ CASE_MVN_F16_1, 2, 7 }, + // mvn_test_params{ CASE_MVN_F16_2, 2, 7 }, + // mvn_test_params{ CASE_MVN_3D_F16_1, 2, 7 }, + // mvn_test_params{ CASE_MVN_3D_F16_2, 2, 7 }, + mvn_test_params{ CASE_MVN_I8_1, 2, 6 }, + mvn_test_params{ CASE_MVN_I8_2, 2, 6 }, + mvn_test_params{ CASE_MVN_I8_3, 2, 6 }, + mvn_test_params{ CASE_MVN_I8_4, 2, 6 }, + mvn_test_params{ CASE_MVN_I8_5, 2, 6 }, + mvn_test_params{ CASE_MVN_I8_6, 2, 6 }, + mvn_test_params{ CASE_MVN_I8_7, 3, 6 }, + mvn_test_params{ CASE_MVN_3D_I8_1, 2, 6 }, + mvn_test_params{ CASE_MVN_3D_I8_2, 2, 6 }, + mvn_test_params{ CASE_MVN_3D_I8_3, 2, 6 }, + mvn_test_params{ CASE_MVN_3D_I8_4, 2, 6 }, + mvn_test_params{ CASE_MVN_3D_I8_5, 3, 6 }, + mvn_test_params{ CASE_MVN_U8_1, 2, 6 }, + mvn_test_params{ CASE_MVN_U8_2, 2, 6 },
+ mvn_test_params{ CASE_MVN_U8_3, 2, 6 }, + mvn_test_params{ CASE_MVN_U8_4, 2, 6 }, + mvn_test_params{ CASE_MVN_U8_5, 2, 6 }, + mvn_test_params{ CASE_MVN_U8_6, 2, 6 }, + mvn_test_params{ CASE_MVN_U8_7, 3, 6 }, + mvn_test_params{ CASE_MVN_3D_U8_1, 2, 6 }, + mvn_test_params{ CASE_MVN_3D_U8_2, 2, 6 }, + mvn_test_params{ CASE_MVN_3D_U8_3, 2, 6 }, + mvn_test_params{ CASE_MVN_3D_U8_4, 2, 6 }, + mvn_test_params{ CASE_MVN_3D_U8_5, 3, 6 }, +})); + +class mvn_eltwise : public MVNFusingTest {}; +TEST_P(mvn_eltwise, basic) { + auto p = GetParam(); + create_topologies( + input_layout("input", layout{ p.input_type, p.input_format, p.input_size }), + mvn("mvn", "input", p.normalize_variance, 1e-10f, false, false), + data("eltw_data", get_mem(layout{ p.input_type, p.default_format, p.elwise_size })), + eltwise("eltw", { "mvn", "eltw_data" }, eltwise_mode::sum, data_types::f32), + reorder("reorder_bfyx", "eltw", p.default_format, data_types::f32) + ); + + tolerance = 1e-5f; + execute(p); +} + +INSTANTIATE_TEST_SUITE_P(fusings_gpu, mvn_eltwise, ::testing::ValuesIn(std::vector{ + mvn_test_params{ CASE_MVN_I8_5, 2, 3 }, + mvn_test_params{ CASE_MVN_I8_6, 2, 3 }, + mvn_test_params{ CASE_MVN_I8_7, 3, 3 }, + mvn_test_params{ CASE_MVN_3D_I8_3, 2, 3 }, + mvn_test_params{ CASE_MVN_3D_I8_4, 2, 3 }, + mvn_test_params{ CASE_MVN_3D_I8_5, 3, 3 }, + mvn_test_params{ CASE_MVN_U8_1, 2, 3 }, + mvn_test_params{ CASE_MVN_U8_2, 2, 3 }, + mvn_test_params{ CASE_MVN_U8_3, 2, 3 }, + mvn_test_params{ CASE_MVN_U8_4, 2, 3 }, + mvn_test_params{ CASE_MVN_U8_5, 2, 3 }, + mvn_test_params{ CASE_MVN_U8_6, 2, 3 }, + mvn_test_params{ CASE_MVN_U8_7, 3, 3 }, + mvn_test_params{ CASE_MVN_3D_U8_1, 2, 3 }, + mvn_test_params{ CASE_MVN_3D_U8_2, 2, 3 }, + mvn_test_params{ CASE_MVN_3D_U8_3, 2, 3 }, + mvn_test_params{ CASE_MVN_3D_U8_4, 2, 3 }, + mvn_test_params{ CASE_MVN_3D_U8_5, 3, 3 }, +})); + +class mvn_eltwise_f16 : public MVNFusingTest {}; +TEST_P(mvn_eltwise_f16, basic) { + auto p = GetParam(); + create_topologies( + 
input_layout("input", layout{ p.input_type, p.input_format, p.input_size }), + mvn("mvn", "input", p.normalize_variance, 1e-10f, false, false), + data("eltw_data", get_mem(layout{ p.input_type, p.default_format, p.elwise_size })), + eltwise("eltw", { "mvn", "eltw_data" }, eltwise_mode::sum, data_types::f16), + reorder("reorder_bfyx", "eltw", p.default_format, data_types::f32) + ); + + tolerance = 0.1f; + execute(p); +} + +INSTANTIATE_TEST_SUITE_P(fusings_gpu, mvn_eltwise_f16, ::testing::ValuesIn(std::vector{ + mvn_test_params{ CASE_MVN_I8_6, 2, 3 }, + mvn_test_params{ CASE_MVN_U8_2, 2, 3 }, +})); diff --git a/src/plugins/intel_gpu/tests/fusions/normalize_fusion_test.cpp b/src/plugins/intel_gpu/tests/fusions/normalize_fusion_test.cpp new file mode 100644 index 00000000000..0f94ebca4aa --- /dev/null +++ b/src/plugins/intel_gpu/tests/fusions/normalize_fusion_test.cpp @@ -0,0 +1,120 @@ +// Copyright (C) 2018-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "test_utils.h" +#include "fusion_test_common.hpp" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +using namespace cldnn; +using namespace ::tests; + +namespace { + +struct normalize_test_params { + tensor in_shape; + data_types data_type; + format input_format; + data_types default_type; + format default_format; + bool across_spatial; + size_t expected_fused_primitives; + size_t expected_not_fused_primitives; +}; + + +class NormalizeFusingTest : public ::BaseFusingTest { +public: + void execute(normalize_test_params& p) { + auto input_prim = get_mem(get_input_layout(p)); + network network_not_fused(this->engine, this->topology_non_fused, bo_not_fused); + network network_fused(this->engine, this->topology_fused, bo_fused); + network_fused.set_input_data("input", input_prim); + network_not_fused.set_input_data("input", input_prim); + + 
compare(network_not_fused, network_fused, p); + } + + layout get_input_layout(normalize_test_params& p) { + return layout{ p.data_type, p.input_format, p.in_shape }; + } + + layout get_per_channel_layout(normalize_test_params& p) { + return layout{ p.default_type, p.default_format, tensor{ 1, p.in_shape.feature[0], 1, 1 } }; + } + + layout get_weights_layout(normalize_test_params& p) { + return layout { p.default_type, p.default_format, tensor{ 1, p.in_shape.feature[0], 1, 1 } }; + } +}; + +} // namespace + +// in_shape; out_shape; kernel; stride; pad; dilation; groups; data_type; input_format; weights_type; weights_format; default_type; default_format; +#define CASE_NORMALIZE_I8_1 { 1, 2, 3, 3 }, data_types::u8, format::bfyx, data_types::f32, format::bfyx + +class normalize_i8_quantize : public NormalizeFusingTest {}; +TEST_P(normalize_i8_quantize, basic) { + auto p = GetParam(); + create_topologies( + input_layout("input", get_input_layout(p)), + data("weights", get_mem(get_weights_layout(p))), + data("in_lo", get_mem(get_single_element_layout(p), min_random, 0)), + data("in_hi", get_mem(get_single_element_layout(p), 1, max_random)), + data("out_lo", get_mem(get_single_element_layout(p), 0)), + data("out_hi", get_mem(get_single_element_layout(p), 255)), + normalize("normalizel2", "input", "weights", p.across_spatial), + quantize("quantize", "normalizel2", "in_lo", "in_hi", "out_lo", "out_hi", 255, data_types::u8), + reorder("output_reorder", "quantize", p.default_format, data_types::f32) + ); + + tolerance = 1; + execute(p); +} + +INSTANTIATE_TEST_SUITE_P(fusings_gpu, normalize_i8_quantize, ::testing::ValuesIn(std::vector{ + normalize_test_params{ CASE_NORMALIZE_I8_1, false, 2, 3 }, + normalize_test_params{ CASE_NORMALIZE_I8_1, true, 2, 3 }, +})); + +class normalize_i8_float : public NormalizeFusingTest {}; +TEST_P(normalize_i8_float, basic) { + auto p = GetParam(); + create_topologies( + input_layout("input", get_input_layout(p)), + data("weights", 
get_mem(get_weights_layout(p))), + data("scale_data", get_mem(get_per_channel_layout(p), 1.0f/255)), + normalize("normalizel2", "input", "weights", p.across_spatial), + scale("scale", "normalizel2", "scale_data"), + activation("activation", "scale", activation_func::abs), + reorder("output_reorder", "activation", p.default_format, data_types::f32) + ); + + tolerance = 1e-05f; + execute(p); +} + +INSTANTIATE_TEST_SUITE_P(fusings_gpu, normalize_i8_float, ::testing::ValuesIn(std::vector{ + normalize_test_params{ CASE_NORMALIZE_I8_1, false, 2, 4 }, + normalize_test_params{ CASE_NORMALIZE_I8_1, true, 2, 4 }, +})); diff --git a/src/plugins/intel_gpu/tests/fusions/permute_fusion_test.cpp b/src/plugins/intel_gpu/tests/fusions/permute_fusion_test.cpp new file mode 100644 index 00000000000..74f165f133d --- /dev/null +++ b/src/plugins/intel_gpu/tests/fusions/permute_fusion_test.cpp @@ -0,0 +1,590 @@ +// Copyright (C) 2018-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "test_utils.h" +#include "fusion_test_common.hpp" + +#include +#include +#include +#include +#include + +#include + +using namespace cldnn; +using namespace ::tests; + +namespace { +struct permute_params { + tensor in_shape; + tensor out_shape; + std::vector permute_order; + tensor eltw_in_shape; + data_types data_type; + format input_format; + data_types default_type; + format default_format; + size_t expected_fused_primitives; + size_t expected_not_fused_primitives; +}; + +struct permute_reorder_params { + tensor in_shape; + std::vector permute_order1; + std::vector permute_order2; + data_types permute_type; + data_types output_type; + format permute_format; + format output_format; + size_t expected_fused_primitives; + size_t expected_not_fused_primitives; +}; + +class PermuteFusingTest : public ::BaseFusingTest { +public: + + void execute(permute_params& p) { + auto input_prim = get_mem(get_input_layout(p)); + network network_not_fused(this->engine, this->topology_non_fused, 
bo_not_fused); + network network_fused(this->engine, this->topology_fused, bo_fused); + network_fused.set_input_data("input", input_prim); + network_not_fused.set_input_data("input", input_prim); + + compare(network_not_fused, network_fused, p); + } + + layout get_input_layout(permute_params& p) { + return layout{ p.data_type, p.input_format, p.in_shape, padding{} }; + } + + layout get_per_channel_layout(permute_params& p) { + return layout{ p.default_type, p.default_format, tensor{ 1, p.out_shape.feature[0], 1, 1 } }; + } +}; + +class PermuteReorderFusingTest : public ::BaseFusingTest { +public: + + void execute(permute_reorder_params& p) { + auto input_prim = get_mem(get_input_layout(p)); + network network_not_fused(this->engine, this->topology_non_fused, bo_not_fused); + network network_fused(this->engine, this->topology_fused, bo_fused); + network_fused.set_input_data("input", input_prim); + network_not_fused.set_input_data("input", input_prim); + compare(network_not_fused, network_fused, p, true); + } + + layout get_input_layout(permute_reorder_params& p) { + return layout{ p.permute_type, p.permute_format, p.in_shape, padding{} }; + } +}; +} // namespace + +/* ------------------------------------------------------------------------------------------------------------ */ +/* ---------------------------------------- PERMUTE FUSE cases ------------------------------------------------ */ +/* ------------------------------------------------------------------------------------------------------------ */ +#define CASE_PERMUTE_F32_0 { 1, 16, 2, 2 }, { 1, 16, 2, 2 }, { 0, 1, 2, 3 }, tensor{ 0 }, data_types::f32, format::bfyx, data_types::f32, format::bfyx +#define CASE_PERMUTE_F32_1 { 1, 15, 16, 16 }, { 1, 15, 16, 16 }, { 0, 1, 2, 3 }, tensor{ 0 }, data_types::f32, format::bfyx, data_types::f32, format::bfyx +#define CASE_PERMUTE_F32_2 { 1, 8, 16, 16 }, { 16, 16, 8, 1 }, { 3, 2, 1, 0 }, tensor{ 0 }, data_types::f32, format::bfyx, data_types::f32, format::bfyx +#define 
CASE_PERMUTE_F32_3 { 1, 1, 3, 4 }, { 1, 3, 4, 1 }, { 1, 2, 3, 0 }, tensor{ 0 }, data_types::f32, format::bfyx, data_types::f32, format::bfyx +#define CASE_PERMUTE_F32_4 { 2, 16, 16, 16 }, { 2, 16, 16, 16 }, { 0, 1, 2, 3 }, tensor{ 0 }, data_types::f32, format::b_fs_yx_fsv16, data_types::f32, format::bfyx +#define CASE_PERMUTE_F32_5 { 1, 32, 4, 5 }, { 32, 4, 5, 1 }, { 1, 2, 3, 0 }, tensor{ 0 }, data_types::f32, format::b_fs_yx_fsv16, data_types::f32, format::bfyx +#define CASE_PERMUTE_F32_6 { 1, 16, 4, 5 }, { 5, 16, 4, 1 }, { 3, 1, 2, 0 }, tensor{ 0 }, data_types::f32, format::b_fs_yx_fsv16, data_types::f32, format::bfyx +#define CASE_PERMUTE_F32_7 { 1, 16, 1, 1 }, { 1, 1, 1, 16 }, { 2, 3, 0, 1 }, tensor{ 0 }, data_types::f32, format::b_fs_yx_fsv16, data_types::f32, format::bfyx + +#define CASE_PERMUTE_F16_0 { 1, 16, 4, 5 }, { 1, 16, 4, 5 }, { 0, 1, 2, 3 }, tensor{ 0 }, data_types::f16, format::b_fs_yx_fsv16, data_types::f32, format::bfyx +#define CASE_PERMUTE_F16_1 { 2, 16, 4, 5 }, { 16, 4, 5, 2 }, { 1, 2, 3, 0 }, tensor{ 0 }, data_types::f16, format::b_fs_yx_fsv16, data_types::f32, format::bfyx +#define CASE_PERMUTE_F16_2 { 1, 32, 2, 3 }, { 2, 3, 32, 1 }, { 2, 3, 1, 0 }, tensor{ 0 }, data_types::f16, format::b_fs_yx_fsv16, data_types::f32, format::bfyx +#define CASE_PERMUTE_F16_3 { 3, 16, 1, 1 }, { 1, 1, 16, 3 }, { 3, 2, 1, 0 }, tensor{ 0 }, data_types::f16, format::b_fs_yx_fsv16, data_types::f32, format::bfyx +#define CASE_PERMUTE_F16_4 { 2, 15, 4, 5 }, { 4, 2, 5, 15 }, { 2, 0, 3, 1 }, tensor{ 0 }, data_types::f16, format::bfyx, data_types::f32, format::bfyx +#define CASE_PERMUTE_F16_5 { 1, 15, 1, 2 }, { 15, 2, 1, 1 }, { 1, 3, 2, 0 }, tensor{ 0 }, data_types::f16, format::bfyx, data_types::f32, format::bfyx +#define CASE_PERMUTE_F16_6 { 1, 15, 4, 4 }, { 4, 4, 1, 15 }, { 2, 3, 0, 1 }, tensor{ 0 }, data_types::f16, format::bfyx, data_types::f32, format::bfyx + +#define CASE_PERMUTE_S8_0 { 1, 15, 4, 5 }, { 1, 15, 4, 5 }, { 0, 1, 2, 3 }, tensor{ 0 }, data_types::i8, 
format::bfyx, data_types::f32, format::bfyx +#define CASE_PERMUTE_S8_1 { 1, 15, 4, 5 }, { 5, 4, 15, 1 }, { 3, 2, 1, 0 }, tensor{ 0 }, data_types::i8, format::bfyx, data_types::f32, format::bfyx +#define CASE_PERMUTE_S8_2 { 1, 16, 1, 2 }, { 1, 1, 16, 2 }, { 2, 0, 1, 3 }, tensor{ 0 }, data_types::i8, format::b_fs_yx_fsv16, data_types::f32, format::bfyx +#define CASE_PERMUTE_S8_3 { 1, 16, 2, 2 }, { 2, 2, 16, 1 }, { 2, 3, 1, 0 }, tensor{ 0 }, data_types::i8, format::b_fs_yx_fsv16, data_types::f32, format::bfyx +#define CASE_PERMUTE_U8_0 { 1, 15, 4, 5 }, { 15, 5, 1, 4 }, { 1, 3, 0, 2 }, tensor{ 0 }, data_types::u8, format::bfyx, data_types::f32, format::bfyx +#define CASE_PERMUTE_U8_1 { 1, 15, 16, 16 }, { 15, 16, 1, 16 }, { 1, 2, 0, 3 }, tensor{ 0 }, data_types::u8, format::bfyx, data_types::f32, format::bfyx +#define CASE_PERMUTE_U8_2 { 1, 32, 5, 4 }, { 1, 32, 5, 4 }, { 0, 1, 2, 3 }, tensor{ 0 }, data_types::u8, format::b_fs_yx_fsv16, data_types::f32, format::bfyx +#define CASE_PERMUTE_U8_3 { 1, 16, 4, 5 }, { 5, 4, 16, 1 }, { 3, 2, 1, 0 }, tensor{ 0 }, data_types::u8, format::b_fs_yx_fsv16, data_types::f32, format::bfyx + +// 3d +#define CASE_PERMUTE_F32_3D_0 { 1, 15, 4, 4, 5 }, { 1, 15, 4, 4, 5 }, { 0, 1, 2, 3, 4 }, tensor{ 0 }, data_types::f32, format::bfzyx, data_types::f32, format::bfzyx +#define CASE_PERMUTE_F32_3D_1 { 2, 15, 2, 3, 4 }, { 15, 2, 3, 4, 2 }, { 1, 2, 3, 4, 0 }, tensor{ 0 }, data_types::f32, format::bfzyx, data_types::f32, format::bfzyx +#define CASE_PERMUTE_F32_3D_2 { 2, 16, 4, 4, 5 }, { 4, 2, 4, 5, 16 }, { 3, 0, 2, 4, 1 }, tensor{ 0 }, data_types::f32, format::bfzyx, data_types::f32, format::bfzyx +#define CASE_PERMUTE_F32_3D_3 { 1, 32, 4, 2, 2 }, { 2, 2, 32, 1, 4 }, { 4, 3, 1, 0, 2 }, tensor{ 0 }, data_types::f32, format::bfzyx, data_types::f32, format::bfzyx +#define CASE_PERMUTE_F32_3D_4 { 1, 16, 1, 1, 1 }, { 1, 1, 1, 16, 1 }, { 2, 4, 0, 1, 3 }, tensor{ 0 }, data_types::f32, format::bfzyx, data_types::f32, format::bfzyx + +#define 
CASE_PERMUTE_F16_3D_0 { 1, 15, 4, 4, 5 }, { 1, 15, 4, 4, 5 }, { 0, 1, 2, 3, 4 }, tensor{ 0 }, data_types::f16, format::bfzyx, data_types::f32, format::bfzyx +#define CASE_PERMUTE_F16_3D_1 { 2, 15, 4, 3, 4 }, { 4, 4, 2, 15, 3 }, { 2, 4, 0, 1, 3 }, tensor{ 0 }, data_types::f16, format::bfzyx, data_types::f32, format::bfzyx +#define CASE_PERMUTE_F16_3D_2 { 2, 16, 4, 4, 3 }, { 2, 4, 3, 16, 4 }, { 0, 3, 4, 1, 2 }, tensor{ 0 }, data_types::f16, format::bfzyx, data_types::f32, format::bfzyx +#define CASE_PERMUTE_F16_3D_3 { 1, 32, 4, 2, 1 }, { 2, 32, 4, 1, 1 }, { 3, 1, 2, 4, 0 }, tensor{ 0 }, data_types::f16, format::bfzyx, data_types::f32, format::bfzyx +#define CASE_PERMUTE_F16_3D_4 { 16, 16, 1, 1, 1 },{ 1, 16, 1, 1, 16 },{ 4, 0, 3, 2, 1 }, tensor{ 0 }, data_types::f16, format::bfzyx, data_types::f32, format::bfzyx + +#define CASE_PERMUTE_S8_3D_0 { 1, 15, 4, 4, 5 }, { 1, 15, 4, 4, 5 }, { 0, 1, 2, 3, 4 }, tensor{ 0 }, data_types::i8, format::bfzyx, data_types::f32, format::bfzyx +#define CASE_PERMUTE_S8_3D_1 { 2, 15, 4, 3, 4 }, { 4, 4, 15, 2, 3 }, { 4, 2, 1, 0, 3 }, tensor{ 0 }, data_types::i8, format::bfzyx, data_types::f32, format::bfzyx +#define CASE_PERMUTE_S8_3D_2 { 2, 16, 4, 4, 3 }, { 2, 4, 3, 16, 4 }, { 0, 3, 4, 1, 2 }, tensor{ 0 }, data_types::i8, format::bfzyx, data_types::f32, format::bfzyx +#define CASE_PERMUTE_S8_3D_3 { 1, 32, 4, 2, 1 }, { 2, 32, 4, 1, 1 }, { 3, 1, 2, 4, 0 }, tensor{ 0 }, data_types::i8, format::bfzyx, data_types::f32, format::bfzyx +#define CASE_PERMUTE_U8_3D_0 { 16, 16, 1, 1, 1 }, { 1, 1, 16, 16, 1 }, { 2, 4, 0, 1, 3 }, tensor{ 0 }, data_types::u8, format::bfzyx, data_types::f32, format::bfzyx +#define CASE_PERMUTE_U8_3D_1 { 16, 16, 1, 1, 1 }, { 1, 1, 1, 16, 16 }, { 4, 3, 2, 1, 0 }, tensor{ 0 }, data_types::u8, format::bfzyx, data_types::f32, format::bfzyx +#define CASE_PERMUTE_U8_3D_2 { 2, 16, 4, 4, 3 }, { 4, 2, 4, 3, 16 }, { 3, 0, 2, 4, 1 }, tensor{ 0 }, data_types::u8, format::bfzyx, data_types::f32, format::bfzyx +#define 
CASE_PERMUTE_U8_3D_3 { 1, 32, 4, 2, 1 }, { 1, 2, 32, 1, 4 }, { 4, 3, 1, 0, 2 }, tensor{ 0 }, data_types::u8, format::bfzyx, data_types::f32, format::bfzyx + +// permute_tile_8x8_4x4 +#define CASE_PERMUTE_TILE_8x8_4x4_4D_0 { 1, 8, 8, 2 }, { 1, 2, 8, 8 }, { 0, 3, 1, 2 }, tensor{ 0 }, data_types::f32, format::bfyx, data_types::f32, format::bfyx +#define CASE_PERMUTE_TILE_8x8_4x4_4D_1 { 1, 5, 8, 2 }, { 1, 2, 5, 8 }, { 0, 3, 1, 2 }, tensor{ 0 }, data_types::f32, format::bfyx, data_types::f32, format::bfyx +#define CASE_PERMUTE_TILE_8x8_4x4_4D_2 { 1, 8, 5, 2 }, { 1, 2, 8, 5 }, { 0, 3, 1, 2 }, tensor{ 0 }, data_types::f32, format::bfyx, data_types::f32, format::bfyx +#define CASE_PERMUTE_TILE_8x8_4x4_4D_3 { 1, 5, 5, 2 }, { 1, 2, 5, 5 }, { 0, 3, 1, 2 }, tensor{ 0 }, data_types::f32, format::bfyx, data_types::f32, format::bfyx +#define CASE_PERMUTE_TILE_8x8_4x4_5D_0 { 1, 8, 8, 2, 2 }, { 1, 2, 8, 8, 2 }, { 0, 4, 1, 2, 3 }, tensor{ 0 }, data_types::f32, format::bfzyx, data_types::f32, format::bfzyx +#define CASE_PERMUTE_TILE_8x8_4x4_5D_1 { 1, 5, 8, 2, 2 }, { 1, 2, 5, 8, 2 }, { 0, 4, 1, 2, 3 }, tensor{ 0 }, data_types::f32, format::bfzyx, data_types::f32, format::bfzyx +#define CASE_PERMUTE_TILE_8x8_4x4_5D_2 { 1, 8, 5, 2, 2 }, { 1, 2, 8, 5, 2 }, { 0, 4, 1, 2, 3 }, tensor{ 0 }, data_types::f32, format::bfzyx, data_types::f32, format::bfzyx +#define CASE_PERMUTE_TILE_8x8_4x4_5D_3 { 1, 5, 5, 2, 2 }, { 1, 2, 5, 5, 2 }, { 0, 4, 1, 2, 3 }, tensor{ 0 }, data_types::f32, format::bfzyx, data_types::f32, format::bfzyx +#define CASE_PERMUTE_TILE_8x8_4x4_6D_0 { 1, 8, 8, 2, 2, 2 }, { 1, 2, 8, 8, 2, 2 }, { 0, 5, 1, 2, 3, 4 }, tensor{ 0 }, data_types::f32, format::bfwzyx, data_types::f32, format::bfwzyx +#define CASE_PERMUTE_TILE_8x8_4x4_6D_1 { 1, 5, 8, 2, 2, 2 }, { 1, 2, 5, 8, 2, 2 }, { 0, 5, 1, 2, 3, 4 }, tensor{ 0 }, data_types::f32, format::bfwzyx, data_types::f32, format::bfwzyx +#define CASE_PERMUTE_TILE_8x8_4x4_6D_2 { 1, 8, 5, 2, 2, 2 }, { 1, 2, 8, 5, 2, 2 }, { 0, 5, 1, 2, 3, 4 }, 
tensor{ 0 }, data_types::f32, format::bfwzyx, data_types::f32, format::bfwzyx +#define CASE_PERMUTE_TILE_8x8_4x4_6D_3 { 1, 5, 5, 2, 2, 2 }, { 1, 2, 5, 5, 2, 2 }, { 0, 5, 1, 2, 3, 4 }, tensor{ 0 }, data_types::f32, format::bfwzyx, data_types::f32, format::bfwzyx + +// permute_tile_8x8_4x4_fsv16 +#define CASE_PERMUTE_TILE_8x8_4x4_FSV16_4D_0 { 1, 16, 16, 2 }, { 1, 2, 16, 16 }, { 0, 3, 1, 2 }, tensor{ 0 }, data_types::f32, format::b_fs_yx_fsv16, data_types::f32, format::b_fs_yx_fsv16 +#define CASE_PERMUTE_TILE_8x8_4x4_FSV16_4D_1 { 1, 15, 16, 2 }, { 1, 2, 15, 16 }, { 0, 3, 1, 2 }, tensor{ 0 }, data_types::f32, format::b_fs_yx_fsv16, data_types::f32, format::b_fs_yx_fsv16 +#define CASE_PERMUTE_TILE_8x8_4x4_FSV16_4D_2 { 1, 16, 3, 2 }, { 1, 2, 16, 3 }, { 0, 3, 1, 2 }, tensor{ 0 }, data_types::f32, format::b_fs_yx_fsv16, data_types::f32, format::b_fs_yx_fsv16 +#define CASE_PERMUTE_TILE_8x8_4x4_FSV16_4D_3 { 1, 5, 7, 2 }, { 1, 2, 5, 7 }, { 0, 3, 1, 2 }, tensor{ 0 }, data_types::f32, format::b_fs_yx_fsv16, data_types::f32, format::b_fs_yx_fsv16 +#define CASE_PERMUTE_TILE_8x8_4x4_FSV16_5D_0 { 1, 16, 16, 2, 2 }, { 1, 2, 16, 16, 2 }, { 0, 4, 1, 2, 3 }, tensor{ 0 }, data_types::f32, format::b_fs_zyx_fsv16, data_types::f32, format::b_fs_zyx_fsv16 +#define CASE_PERMUTE_TILE_8x8_4x4_FSV16_5D_1 { 1, 15, 16, 2, 2 }, { 1, 2, 15, 16, 2 }, { 0, 4, 1, 2, 3 }, tensor{ 0 }, data_types::f32, format::b_fs_zyx_fsv16, data_types::f32, format::b_fs_zyx_fsv16 +#define CASE_PERMUTE_TILE_8x8_4x4_FSV16_5D_2 { 1, 16, 3, 2, 2 }, { 1, 2, 16, 3, 2 }, { 0, 4, 1, 2, 3 }, tensor{ 0 }, data_types::f32, format::b_fs_zyx_fsv16, data_types::f32, format::b_fs_zyx_fsv16 +#define CASE_PERMUTE_TILE_8x8_4x4_FSV16_5D_3 { 1, 5, 7, 2, 2 }, { 1, 2, 5, 7, 2 }, { 0, 4, 1, 2, 3 }, tensor{ 0 }, data_types::f32, format::b_fs_zyx_fsv16, data_types::f32, format::b_fs_zyx_fsv16 + +class permute_activation_scale_eltwise: public PermuteFusingTest {}; +TEST_P(permute_activation_scale_eltwise, basic) { + auto p = GetParam(); + + 
create_topologies( + input_layout("input", get_input_layout(p)), + data("eltwise_data", get_mem(layout{ p.data_type, p.input_format, p.out_shape })), + data("scale_data", get_mem(get_per_channel_layout(p), 5e-1f)), + permute("permute", "input", p.permute_order), + scale("scale", "permute", "scale_data"), + activation("actv", "scale", activation_func::relu), + eltwise("eltwise", { "actv", "eltwise_data" }, eltwise_mode::sum, p.data_type), + reorder("reorder_bfyx", "eltwise", p.default_format, p.default_type) + ); + + tolerance = 1e-5f; + execute(p); +} + +INSTANTIATE_TEST_SUITE_P(fusings_gpu, permute_activation_scale_eltwise, ::testing::ValuesIn(std::vector{ + permute_params{ CASE_PERMUTE_F32_0, 2, 5 }, + permute_params{ CASE_PERMUTE_F32_1, 2, 5 }, + permute_params{ CASE_PERMUTE_F32_2, 2, 5 }, + permute_params{ CASE_PERMUTE_F32_3, 2, 5 }, + permute_params{ CASE_PERMUTE_F32_4, 2, 5 }, + permute_params{ CASE_PERMUTE_F32_5, 2, 5 }, + permute_params{ CASE_PERMUTE_F32_6, 2, 5 }, + permute_params{ CASE_PERMUTE_F32_7, 2, 5 }, + + permute_params{ CASE_PERMUTE_F16_0, 2, 5 }, + permute_params{ CASE_PERMUTE_F16_1, 2, 5 }, + permute_params{ CASE_PERMUTE_F16_2, 2, 5 }, + permute_params{ CASE_PERMUTE_F16_3, 2, 5 }, + permute_params{ CASE_PERMUTE_F16_4, 2, 5 }, + permute_params{ CASE_PERMUTE_F16_5, 2, 5 }, + permute_params{ CASE_PERMUTE_F16_6, 2, 5 }, + + permute_params{ CASE_PERMUTE_S8_0, 2, 5 }, + permute_params{ CASE_PERMUTE_S8_1, 2, 5 }, + permute_params{ CASE_PERMUTE_S8_2, 2, 5 }, + permute_params{ CASE_PERMUTE_S8_3, 2, 5 }, + + permute_params{ CASE_PERMUTE_U8_0, 2, 5 }, + permute_params{ CASE_PERMUTE_U8_1, 2, 5 }, + permute_params{ CASE_PERMUTE_U8_2, 2, 5 }, + permute_params{ CASE_PERMUTE_U8_3, 2, 5 }, + + permute_params{ CASE_PERMUTE_F32_3D_0, 2, 5 }, + permute_params{ CASE_PERMUTE_F32_3D_1, 2, 5 }, + permute_params{ CASE_PERMUTE_F32_3D_2, 2, 5 }, + permute_params{ CASE_PERMUTE_F32_3D_3, 2, 5 }, + permute_params{ CASE_PERMUTE_F32_3D_4, 2, 5 }, + + permute_params{ 
CASE_PERMUTE_F16_3D_0, 2, 5 }, + permute_params{ CASE_PERMUTE_F16_3D_1, 2, 5 }, + permute_params{ CASE_PERMUTE_F16_3D_2, 2, 5 }, + permute_params{ CASE_PERMUTE_F16_3D_3, 2, 5 }, + permute_params{ CASE_PERMUTE_F16_3D_4, 2, 5 }, + + permute_params{ CASE_PERMUTE_S8_3D_0, 2, 5 }, + permute_params{ CASE_PERMUTE_S8_3D_1, 2, 5 }, + permute_params{ CASE_PERMUTE_S8_3D_2, 2, 5 }, + permute_params{ CASE_PERMUTE_S8_3D_3, 2, 5 }, + + permute_params{ CASE_PERMUTE_U8_3D_0, 2, 5 }, + permute_params{ CASE_PERMUTE_U8_3D_1, 2, 5 }, + permute_params{ CASE_PERMUTE_U8_3D_2, 2, 5 }, + permute_params{ CASE_PERMUTE_U8_3D_3, 2, 5 }, + + // Fusing tests for permute_tile_8x8_4x4 + permute_params{ CASE_PERMUTE_TILE_8x8_4x4_4D_0, 2, 5 }, + permute_params{ CASE_PERMUTE_TILE_8x8_4x4_4D_1, 2, 5 }, + permute_params{ CASE_PERMUTE_TILE_8x8_4x4_4D_2, 2, 5 }, + permute_params{ CASE_PERMUTE_TILE_8x8_4x4_4D_3, 2, 5 }, + permute_params{ CASE_PERMUTE_TILE_8x8_4x4_5D_0, 2, 5 }, + permute_params{ CASE_PERMUTE_TILE_8x8_4x4_5D_1, 2, 5 }, + permute_params{ CASE_PERMUTE_TILE_8x8_4x4_5D_2, 2, 5 }, + permute_params{ CASE_PERMUTE_TILE_8x8_4x4_5D_3, 2, 5 }, + permute_params{ CASE_PERMUTE_TILE_8x8_4x4_6D_0, 2, 5 }, + permute_params{ CASE_PERMUTE_TILE_8x8_4x4_6D_1, 2, 5 }, + permute_params{ CASE_PERMUTE_TILE_8x8_4x4_6D_2, 2, 5 }, + permute_params{ CASE_PERMUTE_TILE_8x8_4x4_6D_3, 2, 5 }, + + // Fusing tests for permute_tile_8x8_4x4_fsv16 + permute_params{ CASE_PERMUTE_TILE_8x8_4x4_FSV16_4D_0, 2, 5 }, + permute_params{ CASE_PERMUTE_TILE_8x8_4x4_FSV16_4D_1, 2, 5 }, + permute_params{ CASE_PERMUTE_TILE_8x8_4x4_FSV16_4D_2, 2, 5 }, + permute_params{ CASE_PERMUTE_TILE_8x8_4x4_FSV16_4D_3, 2, 5 }, + permute_params{ CASE_PERMUTE_TILE_8x8_4x4_FSV16_5D_0, 2, 5 }, + permute_params{ CASE_PERMUTE_TILE_8x8_4x4_FSV16_5D_1, 2, 5 }, + permute_params{ CASE_PERMUTE_TILE_8x8_4x4_FSV16_5D_2, 2, 5 }, + permute_params{ CASE_PERMUTE_TILE_8x8_4x4_FSV16_5D_3, 2, 5 }, +})); + +class permute_quant_u8: public PermuteFusingTest {}; 
+TEST_P(permute_quant_u8, basic) { + auto p = GetParam(); + create_topologies( + input_layout("input", get_input_layout(p)), + data("in_lo", get_mem(get_single_element_layout(p), min_random, 0)), + data("in_hi", get_mem(get_single_element_layout(p), 1, max_random)), + data("out_lo", get_mem(get_single_element_layout(p), 0)), + data("out_hi", get_mem(get_single_element_layout(p), 255)), + permute("permute", "input", p.permute_order), + quantize("quant", "permute", "in_lo", "in_hi", "out_lo", "out_hi", 256, data_types::u8), + reorder("reorder_bfyx", "quant", p.default_format, p.default_type) + ); + + tolerance = 1.f; + execute(p); +} + +INSTANTIATE_TEST_SUITE_P(fusings_gpu, permute_quant_u8, ::testing::ValuesIn(std::vector{ + permute_params{ CASE_PERMUTE_F32_0, 2, 3 }, + permute_params{ CASE_PERMUTE_F32_1, 2, 3 }, + + permute_params{ CASE_PERMUTE_F16_0, 2, 3 }, + permute_params{ CASE_PERMUTE_F16_1, 2, 3 }, +})); + +class permute_scale_actv_eltw_scale_actv_quant_i8: public PermuteFusingTest {}; +TEST_P(permute_scale_actv_eltw_scale_actv_quant_i8, basic) { + auto p = GetParam(); + create_topologies( + input_layout("input", get_input_layout(p)), + data("scale1_data", get_mem(get_per_channel_layout(p), 1e-1f)), + data("in_lo", get_mem(get_per_channel_layout(p), min_random, 0)), + data("in_hi", get_mem(get_per_channel_layout(p), 1, max_random)), + data("out_lo", get_mem(get_single_element_layout(p), -127)), + data("out_hi", get_mem(get_single_element_layout(p), 127)), + data("eltw_data", get_mem(layout(p.data_type, p.input_format, p.out_shape))), + data("scale2_data", get_mem(get_per_channel_layout(p), 1e-1f)), + permute("permute", "input", p.permute_order), + scale("scale1", "permute", "scale1_data"), + activation("actv1", "scale1", activation_func::relu), + eltwise("eltw", { "actv1", "eltw_data" }, eltwise_mode::sum, p.data_type), + scale("scale2", "eltw", "scale2_data"), + activation("actv2", "scale2", activation_func::relu), + quantize("quant", "actv2", "in_lo", 
"in_hi", "out_lo", "out_hi", 255, data_types::i8), + reorder("out", "quant", p.default_format, p.default_type) + ); + + tolerance = 1.f; + execute(p); +} + +INSTANTIATE_TEST_SUITE_P(fusings_gpu, permute_scale_actv_eltw_scale_actv_quant_i8, ::testing::ValuesIn(std::vector{ + permute_params{ CASE_PERMUTE_F32_0, 2, 8 }, + permute_params{ CASE_PERMUTE_F32_1, 2, 8 }, + permute_params{ CASE_PERMUTE_F32_2, 2, 8 }, + permute_params{ CASE_PERMUTE_F32_3, 2, 8 }, + permute_params{ CASE_PERMUTE_F32_4, 2, 8 }, + permute_params{ CASE_PERMUTE_F32_5, 2, 8 }, + permute_params{ CASE_PERMUTE_F32_6, 2, 8 }, + permute_params{ CASE_PERMUTE_F32_7, 2, 8 }, + + permute_params{ CASE_PERMUTE_F16_0, 2, 8 }, + permute_params{ CASE_PERMUTE_F16_1, 2, 8 }, + permute_params{ CASE_PERMUTE_F16_2, 2, 8 }, + permute_params{ CASE_PERMUTE_F16_3, 2, 8 }, + permute_params{ CASE_PERMUTE_F16_4, 2, 8 }, + permute_params{ CASE_PERMUTE_F16_5, 2, 8 }, + permute_params{ CASE_PERMUTE_F16_6, 2, 8 }, + + permute_params{ CASE_PERMUTE_S8_0, 2, 8 }, + permute_params{ CASE_PERMUTE_S8_1, 2, 8 }, + permute_params{ CASE_PERMUTE_S8_2, 2, 8 }, + permute_params{ CASE_PERMUTE_S8_3, 2, 8 }, + + permute_params{ CASE_PERMUTE_U8_0, 2, 8 }, + permute_params{ CASE_PERMUTE_U8_1, 2, 8 }, + permute_params{ CASE_PERMUTE_U8_2, 2, 8 }, + permute_params{ CASE_PERMUTE_U8_3, 2, 8 }, + + permute_params{ CASE_PERMUTE_F32_3D_0, 2, 8 }, + permute_params{ CASE_PERMUTE_F32_3D_1, 2, 8 }, + permute_params{ CASE_PERMUTE_F32_3D_2, 2, 8 }, + permute_params{ CASE_PERMUTE_F32_3D_3, 2, 8 }, + permute_params{ CASE_PERMUTE_F32_3D_4, 2, 8 }, + + permute_params{ CASE_PERMUTE_F16_3D_0, 2, 8 }, + permute_params{ CASE_PERMUTE_F16_3D_1, 2, 8 }, + permute_params{ CASE_PERMUTE_F16_3D_2, 2, 8 }, + permute_params{ CASE_PERMUTE_F16_3D_3, 2, 8 }, + permute_params{ CASE_PERMUTE_F16_3D_4, 2, 8 }, + + permute_params{ CASE_PERMUTE_S8_3D_0, 2, 8 }, + permute_params{ CASE_PERMUTE_S8_3D_1, 2, 8 }, + permute_params{ CASE_PERMUTE_S8_3D_2, 2, 8 }, + permute_params{ 
CASE_PERMUTE_S8_3D_3, 2, 8 }, + + permute_params{ CASE_PERMUTE_U8_3D_0, 2, 8 }, + permute_params{ CASE_PERMUTE_U8_3D_1, 2, 8 }, + permute_params{ CASE_PERMUTE_U8_3D_2, 2, 8 }, + permute_params{ CASE_PERMUTE_U8_3D_3, 2, 8 }, +})); + +class permute_scale_eltwise_actv_scale_actv: public PermuteFusingTest {}; +TEST_P(permute_scale_eltwise_actv_scale_actv, basic) { + auto p = GetParam(); + + create_topologies( + input_layout("input", get_input_layout(p)), + data("eltwise_data", get_mem(layout{ p.data_type, p.input_format, p.out_shape })), + data("scale_data1", get_mem(get_per_channel_layout(p), 1e-1f)), + data("scale_data2", get_mem(get_per_channel_layout(p), 1e-1f)), + permute("permute", "input", p.permute_order), + scale("scale1", "permute", "scale_data1"), + activation("actv1", "scale1", activation_func::relu), + eltwise("eltwise", { "actv1", "eltwise_data" }, eltwise_mode::sum, p.default_type), + scale("scale2", "eltwise", "scale_data2"), + activation("actv2", "scale2", activation_func::relu), + reorder("reorder_bfyx", "actv2", p.default_format, p.default_type) + ); + + tolerance = 1e-5f; + execute(p); +} + +INSTANTIATE_TEST_SUITE_P(fusings_gpu, permute_scale_eltwise_actv_scale_actv, ::testing::ValuesIn(std::vector{ + permute_params{ CASE_PERMUTE_F32_0, 2, 7 }, + permute_params{ CASE_PERMUTE_F32_1, 2, 7 }, + permute_params{ CASE_PERMUTE_F32_2, 2, 7 }, + permute_params{ CASE_PERMUTE_F32_3, 2, 7 }, + permute_params{ CASE_PERMUTE_F32_4, 2, 7 }, + permute_params{ CASE_PERMUTE_F32_5, 2, 7 }, + permute_params{ CASE_PERMUTE_F32_6, 2, 7 }, + permute_params{ CASE_PERMUTE_F32_7, 2, 7 }, + + permute_params{ CASE_PERMUTE_F16_0, 2, 7 }, + permute_params{ CASE_PERMUTE_F16_1, 2, 7 }, + permute_params{ CASE_PERMUTE_F16_2, 2, 7 }, + permute_params{ CASE_PERMUTE_F16_3, 2, 7 }, + permute_params{ CASE_PERMUTE_F16_4, 2, 7 }, + permute_params{ CASE_PERMUTE_F16_5, 2, 7 }, + permute_params{ CASE_PERMUTE_F16_6, 2, 7 }, + + permute_params{ CASE_PERMUTE_S8_0, 2, 7 }, + permute_params{ 
CASE_PERMUTE_S8_1, 2, 7 }, + permute_params{ CASE_PERMUTE_S8_2, 2, 7 }, + permute_params{ CASE_PERMUTE_S8_3, 2, 7 }, + + permute_params{ CASE_PERMUTE_U8_0, 2, 7 }, + permute_params{ CASE_PERMUTE_U8_1, 2, 7 }, + permute_params{ CASE_PERMUTE_U8_2, 2, 7 }, + permute_params{ CASE_PERMUTE_U8_3, 2, 7 }, + + permute_params{ CASE_PERMUTE_F32_3D_0, 2, 7 }, + permute_params{ CASE_PERMUTE_F32_3D_1, 2, 7 }, + permute_params{ CASE_PERMUTE_F32_3D_2, 2, 7 }, + permute_params{ CASE_PERMUTE_F32_3D_3, 2, 7 }, + permute_params{ CASE_PERMUTE_F32_3D_4, 2, 7 }, + + permute_params{ CASE_PERMUTE_F16_3D_0, 2, 7 }, + permute_params{ CASE_PERMUTE_F16_3D_1, 2, 7 }, + permute_params{ CASE_PERMUTE_F16_3D_2, 2, 7 }, + permute_params{ CASE_PERMUTE_F16_3D_3, 2, 7 }, + permute_params{ CASE_PERMUTE_F16_3D_4, 2, 7 }, + + permute_params{ CASE_PERMUTE_S8_3D_0, 2, 7 }, + permute_params{ CASE_PERMUTE_S8_3D_1, 2, 7 }, + permute_params{ CASE_PERMUTE_S8_3D_2, 2, 7 }, + permute_params{ CASE_PERMUTE_S8_3D_3, 2, 7 }, + + permute_params{ CASE_PERMUTE_U8_3D_0, 2, 7 }, + permute_params{ CASE_PERMUTE_U8_3D_1, 2, 7 }, + permute_params{ CASE_PERMUTE_U8_3D_2, 2, 7 }, + permute_params{ CASE_PERMUTE_U8_3D_3, 2, 7 }, + + // Fusing tests for permute_tile_8x8_4x4 + permute_params{ CASE_PERMUTE_TILE_8x8_4x4_4D_0, 2, 7 }, + permute_params{ CASE_PERMUTE_TILE_8x8_4x4_4D_1, 2, 7 }, + permute_params{ CASE_PERMUTE_TILE_8x8_4x4_4D_2, 2, 7 }, + permute_params{ CASE_PERMUTE_TILE_8x8_4x4_4D_3, 2, 7 }, + permute_params{ CASE_PERMUTE_TILE_8x8_4x4_5D_0, 2, 7 }, + permute_params{ CASE_PERMUTE_TILE_8x8_4x4_5D_1, 2, 7 }, + permute_params{ CASE_PERMUTE_TILE_8x8_4x4_5D_2, 2, 7 }, + permute_params{ CASE_PERMUTE_TILE_8x8_4x4_5D_3, 2, 7 }, + permute_params{ CASE_PERMUTE_TILE_8x8_4x4_6D_0, 2, 7 }, + permute_params{ CASE_PERMUTE_TILE_8x8_4x4_6D_1, 2, 7 }, + permute_params{ CASE_PERMUTE_TILE_8x8_4x4_6D_2, 2, 7 }, + permute_params{ CASE_PERMUTE_TILE_8x8_4x4_6D_3, 2, 7 }, + + // Fusing tests for permute_tile_8x8_4x4_fsv16 + permute_params{ 
CASE_PERMUTE_TILE_8x8_4x4_FSV16_4D_0, 2, 7 }, + permute_params{ CASE_PERMUTE_TILE_8x8_4x4_FSV16_4D_1, 2, 7 }, + permute_params{ CASE_PERMUTE_TILE_8x8_4x4_FSV16_4D_2, 2, 7 }, + permute_params{ CASE_PERMUTE_TILE_8x8_4x4_FSV16_4D_3, 2, 7 }, + permute_params{ CASE_PERMUTE_TILE_8x8_4x4_FSV16_5D_0, 2, 7 }, + permute_params{ CASE_PERMUTE_TILE_8x8_4x4_FSV16_5D_1, 2, 7 }, + permute_params{ CASE_PERMUTE_TILE_8x8_4x4_FSV16_5D_2, 2, 7 }, + permute_params{ CASE_PERMUTE_TILE_8x8_4x4_FSV16_5D_3, 2, 7 }, +})); + +/* ------------------------------------------------------------------------------------------------------------ */ +/* ---------------------------- PERMUTE FUSE REDUNDANT REORDER cases ------------------------------------------ */ +/* ------------------------------------------------------------------------------------------------------------ */ + +#define CASE_PERMUTE_REORDER_F32_0 { 1, 16, 32, 2 }, { 0, 3, 2, 1 }, { 0, 3, 2, 1 }, data_types::f32, data_types::f32, format::b_fs_yx_fsv16, format::bfyx +#define CASE_PERMUTE_REORDER_F32_1 { 2, 7, 9, 27 }, { 0, 3, 2, 1 }, { 0, 3, 2, 1 }, data_types::f32, data_types::f32, format::b_fs_yx_fsv4, format::bfyx +#define CASE_PERMUTE_REORDER_F32_2 { 1, 16, 4, 5, 16 }, { 0, 3, 2, 1, 4 }, { 0, 3, 2, 1, 4 }, data_types::f32, data_types::f32, format::b_fs_zyx_fsv16, format::bfzyx +#define CASE_PERMUTE_REORDER_F16_0 { 1, 16, 2, 4 }, { 0, 3, 2, 1 }, { 0, 3, 2, 1 }, data_types::f16, data_types::f16, format::b_fs_yx_fsv16, format::bfyx +#define CASE_PERMUTE_REORDER_F16_1 { 1, 16, 4, 5, 16 }, { 0, 4, 2, 3, 1 }, { 0, 1, 2, 3, 4 }, data_types::f16, data_types::f16, format::b_fs_zyx_fsv16, format::bfzyx +#define CASE_PERMUTE_REORDER_F16_2 { 1, 5, 1, 2, 14 }, { 0, 3, 2, 1, 4 }, { 0, 3, 2, 1, 4 }, data_types::f16, data_types::f16, format::b_fs_zyx_fsv16, format::bfzyx + +// type change +#define CASE_PERMUTE_REORDER_S8_TO_F32_0 { 1, 15, 4, 5 }, { 0, 3, 2, 1 }, { 0, 3, 2, 1 }, data_types::i8, data_types::f32, format::b_fs_yx_fsv4, format::bfyx 
+#define CASE_PERMUTE_REORDER_S8_TO_F32_1 { 1, 2, 15, 4, 5 }, { 0, 3, 2, 1, 4 }, { 0, 3, 2, 1, 4 }, data_types::i8, data_types::f32, format::b_fs_zyx_fsv16, format::bfzyx +#define CASE_PERMUTE_REORDER_F32_TO_F16_0 { 1, 5, 1, 2, 14 }, { 0, 4, 2, 3, 1 }, { 0, 1, 2, 3, 4 }, data_types::f32, data_types::f16, format::b_fs_zyx_fsv16, format::bfzyx +#define CASE_PERMUTE_REORDER_U8_TO_F16_0 { 1, 17, 1, 2, 7 }, { 0, 4, 2, 3, 1 }, { 0, 1, 2, 3, 4 }, data_types::u8, data_types::f16, format::b_fs_zyx_fsv16, format::bfzyx + +// dim change +#define CASE_PERMUTE_REORDER_4D_TO_5D_F32_0 { 1, 16, 8, 16 }, { 1, 2, 0, 3 }, { 0, 3, 1, 4, 2 }, data_types::f32, data_types::f32, format::bfyx, format::bfzyx +#define CASE_PERMUTE_REORDER_4D_TO_6D_F32_1 { 1, 16, 8, 16 }, { 0, 2, 3, 1 }, { 0, 3, 5, 4, 1, 2 }, data_types::f32, data_types::f32, format::bfyx, format::bfwzyx +#define CASE_PERMUTE_REORDER_5D_TO_4D_F32_0 { 1, 16, 4, 5, 18 },{ 0, 4, 2, 3, 1 }, { 0, 3, 1, 2 }, data_types::f32, data_types::f32, format::b_fs_zyx_fsv16, format::bfyx +#define CASE_PERMUTE_REORDER_5D_TO_4D_F32_1 { 1, 16, 4, 5, 16 },{ 0, 2, 3, 4, 1 }, { 0, 3, 1, 2 }, data_types::f32, data_types::f32, format::bfzyx, format::bfyx +#define CASE_PERMUTE_REORDER_5D_TO_6D_F32_2 { 1, 16, 8, 4, 16 }, { 0, 4, 2, 3, 1 }, { 0, 3, 5, 4, 1, 2 }, data_types::f32, data_types::f32, format::b_fs_zyx_fsv16, format::bfwzyx +#define CASE_PERMUTE_REORDER_6D_TO_4D_F32_0 { 1, 16, 4, 5, 4, 16 }, { 0, 2, 5, 3, 4, 1 }, { 0, 3, 1, 2 }, data_types::f32, data_types::f32, format::bfwzyx, format::bfyx +#define CASE_PERMUTE_REORDER_6D_TO_5D_F32_1 { 1, 16, 4, 5, 4, 16 }, { 0, 2, 5, 3, 4, 1 }, { 0, 3, 4, 1, 2 }, data_types::f32, data_types::f32, format::bfwzyx, format::bfzyx + +// permute_opt for blocked format +#define CASE_PERMUTE_REORDER_TILED_F32_0 { 1, 256, 2, 64 }, { 0, 3, 1, 2 }, { 0, 2, 3, 1 }, data_types::f32, data_types::f32, format::b_fs_yx_fsv16, format::bfyx +#define CASE_PERMUTE_REORDER_TILED_F32_1 { 1, 78, 2, 259 }, { 0, 3, 1, 2 }, { 0, 2, 
3, 1 }, data_types::f32, data_types::f32, format::b_fs_yx_fsv16, format::bfyx +#define CASE_PERMUTE_REORDER_TILED_F32_2 { 1, 48, 1, 3, 259 }, { 0, 4, 1, 2, 3 }, { 0, 2, 3, 4, 1 }, data_types::f32, data_types::f32, format::b_fs_zyx_fsv16, format::bfzyx + +// permute_opt for blocked format => reorder to differnt dim +#define CASE_PERMUTE_REORDER_TILED_F32_3 { 1, 45, 1, 3, 259 }, { 0, 4, 1, 2, 3 }, { 0, 2, 3, 1 }, data_types::f32, data_types::f32, format::b_fs_zyx_fsv16, format::bfyx +#define CASE_PERMUTE_REORDER_TILED_F32_4 { 2, 273, 19, 19 }, { 0, 3, 1, 2 }, { 0, 2, 3, 1 }, data_types::f32, data_types::f32, format::b_fs_yx_fsv16, format::bfyx +#define CASE_PERMUTE_REORDER_TILED_F32_5 { 2, 546, 2, 2 }, { 0, 3, 1, 2 }, { 0, 2, 3, 1 }, data_types::f32, data_types::f32, format::b_fs_yx_fsv16, format::bfyx + +// permute opt for blocked format => reorder to different dim/type +#define CASE_PERMUTE_REORDER_TILED_I8_4 { 1, 45, 1, 3, 259 }, { 0, 4, 1, 2, 3 }, { 0, 2, 3, 1 }, data_types::i8, data_types::f32, format::b_fs_zyx_fsv16, format::bfyx +#define CASE_PERMUTE_REORDER_TILED_F16_5 { 1, 48, 3, 256 }, { 0, 3, 1, 2 }, { 0, 2, 4, 3, 1 }, data_types::f16, data_types::f32, format::b_fs_yx_fsv16, format::bfzyx +#define CASE_PERMUTE_REORDER_TILED_F16_6 { 1, 48, 2, 3, 256 }, { 0, 4, 1, 2, 3 }, { 0, 2, 5, 4, 3, 1 }, data_types::f16, data_types::f32, format::b_fs_zyx_fsv16, format::bfwzyx + +// permute opt for non_blocked format => reorder to differnt dim/type +#define CASE_PERMUTE_REORDER_TILED_F16_7 { 1, 48, 2, 3, 256 }, { 0, 4, 1, 2, 3 }, { 0, 2, 3, 1 }, data_types::f16, data_types::f32, format::bfzyx, format::bfyx +#define CASE_PERMUTE_REORDER_TILED_F16_8 { 1, 28, 2, 2, 3, 256 }, { 0, 5, 1, 2, 3, 4 }, { 0, 2, 3, 1 }, data_types::f16, data_types::f32, format::bfwzyx, format::bfyx +#define CASE_PERMUTE_REORDER_TILED_F16_9 { 1, 24, 2, 3, 256 }, { 0, 4, 1, 2, 3 }, { 0, 2, 3, 1 }, data_types::f16, data_types::f32, format::bfzyx, format::bfyx +#define 
CASE_PERMUTE_REORDER_TILED_F16_10 { 1, 35, 3, 253 }, { 0, 3, 1, 2 }, { 0, 2, 4, 3, 1 }, data_types::f16, data_types::f32, format::bfyx, format::bfzyx +#define CASE_PERMUTE_REORDER_TILED_F16_11 { 1, 32, 3, 253 }, { 0, 3, 1, 2 }, { 0, 2, 4, 5, 3, 1 }, data_types::f16, data_types::f32, format::bfyx, format::bfwzyx + +class permute_redundant_reorder : public PermuteReorderFusingTest {}; +TEST_P(permute_redundant_reorder, basic) { + auto p = GetParam(); + create_topologies( + input_layout("input", get_input_layout(p)), + permute("permute1", "input", p.permute_order1), + reorder("reorder1", "permute1", p.output_format, p.output_type), // to be fused + permute("permute2", "reorder1", p.permute_order2) // dummy last op to make reorder fused + ); + + tolerance = 1e-5f; + execute(p); +} + +INSTANTIATE_TEST_SUITE_P(fusings_gpu, permute_redundant_reorder, ::testing::ValuesIn(std::vector{ + permute_reorder_params{ CASE_PERMUTE_REORDER_F32_0, 3, 4 }, + permute_reorder_params{ CASE_PERMUTE_REORDER_F32_1, 3, 4 }, + permute_reorder_params{ CASE_PERMUTE_REORDER_F32_2, 3, 4 }, + permute_reorder_params{ CASE_PERMUTE_REORDER_F16_0, 3, 4 }, + permute_reorder_params{ CASE_PERMUTE_REORDER_F16_1, 3, 4 }, + permute_reorder_params{ CASE_PERMUTE_REORDER_F16_2, 3, 4 }, + permute_reorder_params{ CASE_PERMUTE_REORDER_S8_TO_F32_0, 3, 4 }, + permute_reorder_params{ CASE_PERMUTE_REORDER_S8_TO_F32_1, 3, 4 }, + permute_reorder_params{ CASE_PERMUTE_REORDER_F32_TO_F16_0, 3, 4 }, + permute_reorder_params{ CASE_PERMUTE_REORDER_U8_TO_F16_0, 3, 4 }, + permute_reorder_params{ CASE_PERMUTE_REORDER_4D_TO_5D_F32_0, 3, 3 }, + permute_reorder_params{ CASE_PERMUTE_REORDER_4D_TO_6D_F32_1, 3, 3 }, + permute_reorder_params{ CASE_PERMUTE_REORDER_5D_TO_4D_F32_0, 3, 4 }, + permute_reorder_params{ CASE_PERMUTE_REORDER_5D_TO_4D_F32_1, 3, 3 }, + permute_reorder_params{ CASE_PERMUTE_REORDER_5D_TO_6D_F32_2, 3, 4 }, + permute_reorder_params{ CASE_PERMUTE_REORDER_6D_TO_4D_F32_0, 3, 3 }, + permute_reorder_params{ 
CASE_PERMUTE_REORDER_6D_TO_5D_F32_1, 3, 3 }, + permute_reorder_params{ CASE_PERMUTE_REORDER_TILED_F32_0, 3, 4 }, + permute_reorder_params{ CASE_PERMUTE_REORDER_TILED_F32_1, 3, 4 }, + permute_reorder_params{ CASE_PERMUTE_REORDER_TILED_F32_2, 3, 4 }, + permute_reorder_params{ CASE_PERMUTE_REORDER_TILED_F32_3, 3, 4 }, + permute_reorder_params{ CASE_PERMUTE_REORDER_TILED_F32_4, 3, 4 }, + permute_reorder_params{ CASE_PERMUTE_REORDER_TILED_I8_4, 3, 4 }, + permute_reorder_params{ CASE_PERMUTE_REORDER_TILED_F16_5, 3, 4 }, + permute_reorder_params{ CASE_PERMUTE_REORDER_TILED_F16_6, 3, 4 }, + permute_reorder_params{ CASE_PERMUTE_REORDER_TILED_F16_7, 3, 4 }, + permute_reorder_params{ CASE_PERMUTE_REORDER_TILED_F16_8, 3, 4 }, + permute_reorder_params{ CASE_PERMUTE_REORDER_TILED_F16_9, 3, 4 }, + permute_reorder_params{ CASE_PERMUTE_REORDER_TILED_F16_10, 3, 4 }, + permute_reorder_params{ CASE_PERMUTE_REORDER_TILED_F16_11, 3, 4 }, +})); + +class permute_act_reorder : public PermuteReorderFusingTest {}; + +TEST_P(permute_act_reorder, basic) { + auto p = GetParam(); + create_topologies( + input_layout("input", get_input_layout(p)), + permute("permute1", "input", p.permute_order1), + activation("activation", "permute1", activation_func::abs), + reorder("reorder1", "activation", p.output_format, p.output_type), // to be fused + permute("permute2", "reorder1", p.permute_order2) // dummy last op to make reorder fused + ); + + tolerance = 1e-5f; + execute(p); +} + +INSTANTIATE_TEST_SUITE_P(fusings_gpu, permute_act_reorder, ::testing::ValuesIn(std::vector{ + permute_reorder_params{ CASE_PERMUTE_REORDER_F32_0, 3, 5 }, + permute_reorder_params{ CASE_PERMUTE_REORDER_F32_1, 3, 5 }, + permute_reorder_params{ CASE_PERMUTE_REORDER_F32_2, 3, 5 }, + permute_reorder_params{ CASE_PERMUTE_REORDER_F16_0, 3, 5 }, + permute_reorder_params{ CASE_PERMUTE_REORDER_F16_1, 3, 5 }, + permute_reorder_params{ CASE_PERMUTE_REORDER_F16_2, 3, 5 }, + permute_reorder_params{ CASE_PERMUTE_REORDER_4D_TO_5D_F32_0, 3, 4 
}, + permute_reorder_params{ CASE_PERMUTE_REORDER_4D_TO_6D_F32_1, 3, 4 }, + permute_reorder_params{ CASE_PERMUTE_REORDER_5D_TO_4D_F32_0, 3, 5 }, + permute_reorder_params{ CASE_PERMUTE_REORDER_5D_TO_4D_F32_1, 3, 4 }, + permute_reorder_params{ CASE_PERMUTE_REORDER_5D_TO_6D_F32_2, 3, 5 }, + permute_reorder_params{ CASE_PERMUTE_REORDER_6D_TO_4D_F32_0, 3, 4 }, + permute_reorder_params{ CASE_PERMUTE_REORDER_6D_TO_5D_F32_1, 3, 4 }, + permute_reorder_params{ CASE_PERMUTE_REORDER_TILED_F32_0, 3, 5 }, + permute_reorder_params{ CASE_PERMUTE_REORDER_TILED_F32_1, 3, 5 }, + permute_reorder_params{ CASE_PERMUTE_REORDER_TILED_F32_2, 3, 5 }, + permute_reorder_params{ CASE_PERMUTE_REORDER_TILED_F32_3, 3, 5 }, + permute_reorder_params{ CASE_PERMUTE_REORDER_TILED_F16_5, 3, 5 }, + permute_reorder_params{ CASE_PERMUTE_REORDER_TILED_F16_6, 3, 5 }, + permute_reorder_params{ CASE_PERMUTE_REORDER_TILED_F16_7, 3, 5 }, + permute_reorder_params{ CASE_PERMUTE_REORDER_TILED_F16_8, 3, 5 }, + permute_reorder_params{ CASE_PERMUTE_REORDER_TILED_F16_9, 3, 5 }, + permute_reorder_params{ CASE_PERMUTE_REORDER_TILED_F16_10, 3, 5 }, + permute_reorder_params{ CASE_PERMUTE_REORDER_TILED_F16_11, 3, 5 }, +})); diff --git a/src/plugins/intel_gpu/tests/fusions/pooling_fusion_test.cpp b/src/plugins/intel_gpu/tests/fusions/pooling_fusion_test.cpp new file mode 100644 index 00000000000..7b24bc26f27 --- /dev/null +++ b/src/plugins/intel_gpu/tests/fusions/pooling_fusion_test.cpp @@ -0,0 +1,571 @@ +// Copyright (C) 2018-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "test_utils.h" +#include "fusion_test_common.hpp" + +#include +#include +#include +#include +#include + +#include + +using namespace cldnn; +using namespace ::tests; + +namespace { +struct pooling_test_params { + tensor in_shape; + data_types data_type; + format input_format; + data_types default_type; + format default_format; + size_t expected_fused_primitives; + size_t expected_not_fused_primitives; + pooling_mode 
pool_mode; + std::string kernel_name; +}; + +class PoolingFusingTest : public ::BaseFusingTest { +public: + void execute(pooling_test_params& p) { + auto input_prim = get_mem(get_input_layout(p)); + build_options options; + options.set_option(build_option::optimize_data(true)); + if (!p.kernel_name.empty()) { + implementation_desc impl = { p.input_format, p.kernel_name }; + options.set_option(build_option::force_implementations({ { "pooling", impl } })); + } + network network_not_fused(this->engine, this->topology_non_fused, bo_not_fused); + network network_fused(this->engine, this->topology_fused, options); + + network_fused.set_input_data("input", input_prim); + network_not_fused.set_input_data("input", input_prim); + + ASSERT_FALSE(network_fused.get_primitives_info().empty()); + ASSERT_FALSE(network_not_fused.get_primitives_info().empty()); + + auto find_and_check = [&](primitive_info& p) -> bool { + if (p.original_id == "pooling" || p.original_id == "output_reorder") + return true; + return false; + }; + + auto pi_fused = network_fused.get_primitives_info(); + auto pi_not_fused = network_not_fused.get_primitives_info(); + auto info_fused = std::find_if(pi_fused.begin(), pi_fused.end(), find_and_check); + auto info_not_fused = std::find_if(pi_not_fused.begin(), pi_not_fused.end(), find_and_check); + + ASSERT_TRUE(info_fused != pi_fused.end()); + ASSERT_TRUE(info_not_fused != pi_not_fused.end()); + + compare(network_not_fused, network_fused, p); + } + + layout get_input_layout(pooling_test_params& p) { + return layout{ p.data_type, p.input_format, p.in_shape }; + } + + layout get_per_channel_layout(pooling_test_params& p) { + return layout{ p.default_type, p.default_format, tensor{ 1, p.in_shape.feature[0], 1, 1 } }; + } +}; + +} // namespace + +/* ----------------------------------------------------------------------------------------------------- */ +/* --------------------------------------- Pooling cases ----------------------------------------------- */ +/* 
----------------------------------------------------------------------------------------------------- */ + +#define CASE_POOLING_F32_1 { 1, 16, 8, 8 }, data_types::f32, format::bfyx, data_types::f32, format::bfyx +#define CASE_POOLING_F32_2 { 2, 16, 8, 8 }, data_types::f32, format::bfyx, data_types::f32, format::bfyx +#define CASE_POOLING_F32_3 { 1, 32, 10, 10 }, data_types::f32, format::bfyx, data_types::f32, format::bfyx +#define CASE_POOLING_F32_4 { 1, 32, 10, 10 }, data_types::f32, format::fs_b_yx_fsv32, data_types::f32, format::bfyx +#define CASE_POOLING_F32_5 { 1, 32, 10, 10 }, data_types::f32, format::byxf, data_types::f32, format::bfyx +#define CASE_POOLING_F32_6 { 1, 32, 40, 40 }, data_types::f32, format::byxf, data_types::f32, format::bfyx +#define CASE_POOLING_F32_7 { 16, 32, 10, 10 }, data_types::f32, format::bs_fs_yx_bsv16_fsv16, data_types::f32, format::bfyx +#define CASE_POOLING_F32_8 { 16, 32, 10, 10 }, data_types::f32, format::b_fs_yx_fsv16, data_types::f32, format::bfyx +#define CASE_POOLING_F32_9 { 16, 32, 10, 10 }, data_types::f32, format::b_fs_zyx_fsv16, data_types::f32, format::bfyx +#define CASE_POOLING_F32_10 { 16, 32, 10, 10, 10 }, data_types::f32, format::bs_fs_zyx_bsv16_fsv16, data_types::f32, format::bfyx +#define CASE_POOLING_F32_11 { 1, 1, 3, 3 }, data_types::f32, format::bfyx, data_types::f32, format::bfyx + +#define CASE_POOLING_F32_F16_1 { 1, 16, 8, 8 }, data_types::f32, format::bfyx, data_types::f16, format::bfyx +#define CASE_POOLING_F32_F16_2 { 2, 16, 8, 8 }, data_types::f32, format::bfyx, data_types::f16, format::bfyx +#define CASE_POOLING_F32_F16_3 { 1, 32, 10, 10 }, data_types::f32, format::bfyx, data_types::f16, format::bfyx +#define CASE_POOLING_F32_F16_4 { 1, 32, 10, 10 }, data_types::f32, format::fs_b_yx_fsv32, data_types::f16, format::bfyx +#define CASE_POOLING_F32_F16_5 { 1, 32, 10, 10 }, data_types::f32, format::byxf, data_types::f16, format::bfyx +#define CASE_POOLING_F32_F16_6 { 1, 32, 40, 40 }, data_types::f32, 
format::byxf, data_types::f16, format::bfyx +#define CASE_POOLING_F32_F16_7 { 16, 32, 10, 10 }, data_types::f32, format::bs_fs_yx_bsv16_fsv16, data_types::f16, format::bfyx +#define CASE_POOLING_F32_F16_8 { 16, 32, 10, 10 }, data_types::f32, format::b_fs_yx_fsv16, data_types::f16, format::bfyx +#define CASE_POOLING_F32_F16_9 { 16, 32, 10, 10 }, data_types::f32, format::b_fs_zyx_fsv16, data_types::f16, format::bfyx +#define CASE_POOLING_F32_F16_10 { 16, 32, 10, 10, 10 }, data_types::f32, format::bs_fs_zyx_bsv16_fsv16, data_types::f16, format::bfyx + +#define CASE_POOLING_F16_1 { 1, 16, 8, 8 }, data_types::f16, format::bfyx, data_types::f32, format::bfyx +#define CASE_POOLING_F16_3 { 1, 32, 10, 10 }, data_types::f16, format::bfyx, data_types::f32, format::bfyx +#define CASE_POOLING_F16_4 { 1, 32, 10, 10 }, data_types::f16, format::fs_b_yx_fsv32, data_types::f32, format::bfyx +#define CASE_POOLING_F16_5 { 1, 32, 10, 10 }, data_types::f16, format::byxf, data_types::f32, format::bfyx +#define CASE_POOLING_F16_6 { 1, 32, 40, 40 }, data_types::f16, format::byxf, data_types::f32, format::bfyx +#define CASE_POOLING_F16_7 { 16, 32, 10, 10 }, data_types::f16, format::bs_fs_yx_bsv16_fsv16, data_types::f32, format::bfyx +#define CASE_POOLING_F16_8 { 16, 32, 10, 10 }, data_types::f16, format::b_fs_yx_fsv16, data_types::f32, format::bfyx +#define CASE_POOLING_F16_9 { 16, 32, 10, 10, 10 }, data_types::f32, format::b_fs_zyx_fsv16, data_types::f32, format::bfyx +#define CASE_POOLING_F16_10 { 16, 32, 10, 10, 10 }, data_types::f32, format::bs_fs_zyx_bsv16_fsv16, data_types::f32, format::bfyx + +#define CASE_POOLING_F16_FP16_1 { 1, 32, 10, 10 }, data_types::f16, format::bfyx, data_types::f16, format::bfyx +#define CASE_POOLING_F16_FP16_2 { 1, 32, 10, 10 }, data_types::f16, format::fs_b_yx_fsv32, data_types::f16, format::bfyx +#define CASE_POOLING_F16_FP16_3 { 1, 32, 10, 10 }, data_types::f16, format::byxf, data_types::f16, format::bfyx +#define CASE_POOLING_F16_FP16_4 { 1, 32, 40, 40 
}, data_types::f16, format::byxf, data_types::f16, format::bfyx +#define CASE_POOLING_F16_FP16_5 { 16, 32, 10, 10 }, data_types::f16, format::bs_fs_yx_bsv16_fsv16, data_types::f16, format::bfyx +#define CASE_POOLING_F16_FP16_6 { 16, 32, 10, 10 }, data_types::f16, format::b_fs_yx_fsv16, data_types::f16, format::bfyx +#define CASE_POOLING_F16_FP16_7 { 16, 32, 10, 10, 10 }, data_types::f16, format::b_fs_zyx_fsv16, data_types::f16, format::bfyx +#define CASE_POOLING_F16_FP16_8 { 16, 32, 10, 10, 10 }, data_types::f16, format::bs_fs_zyx_bsv16_fsv16, data_types::f16, format::bfyx + +#define CASE_POOLING_U8_1 { 1, 16, 8, 8 }, data_types::u8, format::bfyx, data_types::f32, format::bfyx +#define CASE_POOLING_U8_2 { 2, 16, 8, 8 }, data_types::u8, format::b_fs_yx_fsv16, data_types::f32, format::bfyx +#define CASE_POOLING_U8_3 { 1, 32, 10, 10 }, data_types::u8, format::b_fs_yx_fsv4, data_types::f32, format::b_fs_yx_fsv4 +#define CASE_POOLING_U8_5 { 16, 32, 10, 10, 10 }, data_types::u8, format::b_fs_zyx_fsv32, data_types::f32, format::bfyx +#define CASE_POOLING_U8_6 { 16, 32, 10, 10, 10 }, data_types::u8, format::b_fs_zyx_fsv32, data_types::f32, format::bfyx + +#define CASE_POOLING_U8_FP16_3 { 1, 32, 10, 10 }, data_types::u8, format::b_fs_yx_fsv4, data_types::f16, format::b_fs_yx_fsv4 +#define CASE_POOLING_U8_FP16_5 { 16, 32, 10, 10, 10 }, data_types::u8, format::b_fs_zyx_fsv32, data_types::f16, format::bfyx +#define CASE_POOLING_U8_FP16_6 { 16, 32, 10, 10, 10 }, data_types::u8, format::b_fs_zyx_fsv32, data_types::f16, format::bfyx + +#define CASE_POOLING_I8_1 { 1, 16, 8, 8 }, data_types::i8, format::bfyx, data_types::f32, format::bfyx +#define CASE_POOLING_I8_2 { 2, 16, 8, 8 }, data_types::i8, format::b_fs_yx_fsv16, data_types::f32, format::bfyx +#define CASE_POOLING_I8_5 { 1, 32, 10, 10 }, data_types::i8, format::b_fs_yx_fsv4, data_types::f32, format::b_fs_yx_fsv4 +#define CASE_POOLING_I8_6 { 16, 32, 10, 10, 10 }, data_types::i8, format::b_fs_zyx_fsv32, data_types::f32, 
format::bfyx + +#define CASE_POOLING_I8_FP16_5 { 1, 32, 10, 10 }, data_types::i8, format::b_fs_yx_fsv4, data_types::f16, format::b_fs_yx_fsv4 +#define CASE_POOLING_I8_FP16_6 { 16, 32, 10, 10, 10 }, data_types::i8, format::b_fs_zyx_fsv32, data_types::f16, format::bfyx + +class pooling_f32_activation : public PoolingFusingTest {}; +TEST_P(pooling_f32_activation, basic) { + auto p = GetParam(); + create_topologies( + input_layout("input", get_input_layout(p)), + pooling("pooling", "input", p.pool_mode, tensor{ 1, 1, 3, 3 }, tensor{ 1 }, tensor{ { 0, 0, 1, 1, 0, 0 }, 0 }), + activation("act", "pooling", activation_func::relu), + reorder("output_reorder", "act", format::bfyx, data_types::f32) + ); + + tolerance = 1e-05f; + execute(p); +} + +INSTANTIATE_TEST_SUITE_P(fusings_gpu, pooling_f32_activation, ::testing::ValuesIn(std::vector{ + pooling_test_params{ CASE_POOLING_F32_1, 2, 3, pooling_mode::max, "" }, + pooling_test_params{ CASE_POOLING_F32_1, 2, 3, pooling_mode::average, "" }, + pooling_test_params{ CASE_POOLING_F16_1, 2, 3, pooling_mode::max, "" }, + pooling_test_params{ CASE_POOLING_F16_1, 2, 3, pooling_mode::average, "" }, + pooling_test_params{ CASE_POOLING_I8_1, 2, 3, pooling_mode::max, "" }, + pooling_test_params{ CASE_POOLING_I8_1, 2, 3, pooling_mode::average, "" }, + pooling_test_params{ CASE_POOLING_U8_1, 2, 3, pooling_mode::max, "" }, + pooling_test_params{ CASE_POOLING_U8_1, 2, 3, pooling_mode::average, "" }, + pooling_test_params{ CASE_POOLING_U8_2, 2, 3, pooling_mode::max, "" }, + pooling_test_params{ CASE_POOLING_U8_2, 2, 3, pooling_mode::average, "" }, + pooling_test_params{ CASE_POOLING_I8_1, 2, 3, pooling_mode::max, "" }, + pooling_test_params{ CASE_POOLING_I8_1, 2, 3, pooling_mode::average, "" }, + pooling_test_params{ CASE_POOLING_I8_2, 2, 3, pooling_mode::max, "" }, + pooling_test_params{ CASE_POOLING_I8_2, 2, 3, pooling_mode::average, "" }, +})); + +class pooling_f32_scale : public PoolingFusingTest {}; +TEST_P(pooling_f32_scale, basic) { + 
auto p = GetParam(); + create_topologies( + input_layout("input", get_input_layout(p)), + data("scale_data", get_mem(get_per_channel_layout(p), 1.0f / tensor{ 1, 1, 3, 3 }.count())), + pooling("pooling", "input", p.pool_mode, tensor{ 1, 1, 3, 3 }, tensor{ 1 }, tensor{ { 0, 0, 1, 1, 0, 0 }, 0 }), + scale("scale", "pooling", "scale_data"), + reorder("output_reorder", "scale", format::bfyx, data_types::f32) + ); + + tolerance = 1e-05f; + execute(p); +} + +TEST_P(pooling_f32_scale, fp16_scale_out) { + auto p = GetParam(); + create_topologies( + input_layout("input", get_input_layout(p)), + data("scale_data", get_mem(get_per_channel_layout(p), 1.0f / tensor{ 1, 1, 3, 3 }.count())), + pooling("pooling", "input", p.pool_mode, tensor{ 1, 1, 3, 3 }, tensor{ 1 }, tensor{ { 0, 0, 1, 1, 0, 0 }, 0 }), + scale("scale", "pooling", "scale_data", optional_data_type{ data_types::f16 }), + reorder("output_reorder", "scale", format::bfyx, data_types::f32) + ); + + tolerance = 1e-5f; + execute(p); +} + +INSTANTIATE_TEST_SUITE_P(fusings_gpu, pooling_f32_scale, ::testing::ValuesIn(std::vector{ + pooling_test_params{ CASE_POOLING_F32_1, 2, 3, pooling_mode::max, "" }, + pooling_test_params{ CASE_POOLING_F32_1, 2, 3, pooling_mode::average, "" }, + pooling_test_params{ CASE_POOLING_F16_1, 2, 3, pooling_mode::max, "" }, + pooling_test_params{ CASE_POOLING_F16_1, 2, 3, pooling_mode::average, "" }, + pooling_test_params{ CASE_POOLING_U8_1, 2, 3, pooling_mode::max, "" }, + pooling_test_params{ CASE_POOLING_U8_1, 2, 3, pooling_mode::average, "" }, + pooling_test_params{ CASE_POOLING_U8_2, 2, 3, pooling_mode::max, "" }, + pooling_test_params{ CASE_POOLING_U8_2, 2, 3, pooling_mode::average, "" }, + pooling_test_params{ CASE_POOLING_I8_1, 2, 3, pooling_mode::max, "" }, + pooling_test_params{ CASE_POOLING_I8_1, 2, 3, pooling_mode::average, "" }, + pooling_test_params{ CASE_POOLING_I8_2, 2, 3, pooling_mode::max, "" }, + pooling_test_params{ CASE_POOLING_I8_2, 2, 3, pooling_mode::average, "" }, +})); + 
+class pooling_scale_activation_quantize : public PoolingFusingTest {}; +TEST_P(pooling_scale_activation_quantize, basic) { + auto p = GetParam(); + + create_topologies( + input_layout("input", get_input_layout(p)), + data("in_lo", get_mem(get_single_element_layout(p), min_random, 0)), + data("in_hi", get_mem(get_single_element_layout(p), 1, max_random)), + data("out_lo", get_mem(get_single_element_layout(p), 0)), + data("out_hi", get_mem(get_single_element_layout(p), 255)), + data("scale_data", get_mem(get_per_channel_layout(p), 1.0f / tensor{ 1, 1, 4, 4 }.count())), + pooling("pooling", "input", "", p.pool_mode, tensor(1, 1, 4, 4), tensor(1, 1, 2, 2)), + scale("scale", "pooling", "scale_data"), + activation("activation", "scale", activation_func::relu), + quantize("quantize", "activation", "in_lo", "in_hi", "out_lo", "out_hi", 255, data_types::u8), + reorder("output_reorder", "quantize", p.default_format, data_types::f32) + ); + + tolerance = 1.0f; + execute(p); +} + +TEST_P(pooling_scale_activation_quantize, i8_output_data_type) { + auto p = GetParam(); + + create_topologies( + input_layout("input", get_input_layout(p)), + data("in_lo", get_mem(get_per_channel_layout(p), min_random, 0)), + data("in_hi", get_mem(get_per_channel_layout(p), 1, max_random)), + data("out_lo", get_mem(get_single_element_layout(p), -127, 127)), + data("out_hi", get_mem(get_single_element_layout(p), -127, 127)), + data("scale_data", get_mem(get_per_channel_layout(p), 1.0f / tensor{ 1, 1, 4, 4 }.count())), + pooling("pooling", "input", "", p.pool_mode, tensor(1, 1, 4, 4), tensor(1, 1, 2, 2)), + scale("scale", "pooling", "scale_data"), + activation("activation", "scale", activation_func::relu), + quantize("quantize", "activation", "in_lo", "in_hi", "out_lo", "out_hi", 255, data_types::i8), + reorder("output_reorder", "quantize", p.default_format, data_types::f32) + ); + + tolerance = 1.0f; + execute(p); +} + +TEST_P(pooling_scale_activation_quantize, per_channel) { + auto p = GetParam(); 
+ + create_topologies( + input_layout("input", get_input_layout(p)), + data("in_lo", get_mem(get_per_channel_layout(p), min_random, 0)), + data("in_hi", get_mem(get_per_channel_layout(p), 1, max_random)), + data("out_lo", get_mem(get_single_element_layout(p), 0)), + data("out_hi", get_mem(get_single_element_layout(p), 255)), + data("scale_data", get_mem(get_per_channel_layout(p), 1.0f / tensor{ 1, 1, 4, 4 }.count())), + pooling("pooling", "input", "", p.pool_mode, tensor(1, 1, 4, 4), tensor(1, 1, 2, 2)), + scale("scale", "pooling", "scale_data"), + activation("activation", "scale", activation_func::atan), + quantize("quantize", "activation", "in_lo", "in_hi", "out_lo", "out_hi", 255, data_types::u8), + reorder("output_reorder", "quantize", p.default_format, data_types::f32) + ); + + tolerance = 1.0f; + execute(p); +} + +INSTANTIATE_TEST_SUITE_P(fusings_gpu, pooling_scale_activation_quantize, ::testing::ValuesIn(std::vector{ + // Input type: FP32 + pooling_test_params{ CASE_POOLING_F32_3, 2, 5, pooling_mode::average, "pooling_gpu_bfyx_block_opt" }, + pooling_test_params{ CASE_POOLING_F32_3, 2, 5, pooling_mode::max, "pooling_gpu_bfyx_block_opt" }, + pooling_test_params{ CASE_POOLING_F32_3, 2, 5, pooling_mode::average, "pooling_gpu_ref" }, + pooling_test_params{ CASE_POOLING_F32_3, 2, 5, pooling_mode::max, "pooling_gpu_ref" }, + pooling_test_params{ CASE_POOLING_F32_4, 2, 5, pooling_mode::average, "pooling_gpu_fs_b_yx_fsv32" }, + pooling_test_params{ CASE_POOLING_F32_4, 2, 5, pooling_mode::max, "pooling_gpu_fs_b_yx_fsv32" }, + pooling_test_params{ CASE_POOLING_F32_5, 2, 5, pooling_mode::average, "pooling_gpu_byxf_padding_opt" }, + pooling_test_params{ CASE_POOLING_F32_5, 2, 5, pooling_mode::max, "pooling_gpu_byxf_padding_opt" }, + pooling_test_params{ CASE_POOLING_F32_6, 2, 5, pooling_mode::average, "pooling_gpu_byxf_opt" }, + pooling_test_params{ CASE_POOLING_F32_6, 2, 5, pooling_mode::max, "pooling_gpu_byxf_opt" }, + pooling_test_params{ CASE_POOLING_F32_7, 2, 5, 
pooling_mode::average, "pooling_gpu_bsv16_fsv16" }, + pooling_test_params{ CASE_POOLING_F32_7, 2, 5, pooling_mode::max, "pooling_gpu_bsv16_fsv16" }, + pooling_test_params{ CASE_POOLING_F32_8, 2, 5, pooling_mode::average, "pooling_gpu_blocked" }, + pooling_test_params{ CASE_POOLING_F32_8, 2, 5, pooling_mode::max, "pooling_gpu_blocked" }, + pooling_test_params{ CASE_POOLING_F32_9, 2, 5, pooling_mode::average, "pooling_gpu_ref" }, + pooling_test_params{ CASE_POOLING_F32_9, 2, 5, pooling_mode::max, "pooling_gpu_ref" }, + pooling_test_params{ CASE_POOLING_F32_10, 2, 5, pooling_mode::average, "pooling_gpu_bsv16_fsv16" }, + pooling_test_params{ CASE_POOLING_F32_10, 2, 5, pooling_mode::max, "pooling_gpu_bsv16_fsv16" }, + + // Input type: INT8 + pooling_test_params{ CASE_POOLING_I8_5, 2, 5, pooling_mode::average, "pooling_gpu_b_fs_yx_fsv4" }, + pooling_test_params{ CASE_POOLING_I8_5, 2, 5, pooling_mode::max, "pooling_gpu_b_fs_yx_fsv4" }, + pooling_test_params{ CASE_POOLING_I8_6, 2, 5, pooling_mode::average, "pooling_gpu_int8_ref" }, + pooling_test_params{ CASE_POOLING_I8_6, 2, 5, pooling_mode::max, "pooling_gpu_int8_ref" }, + + // Input type: UINT8 + pooling_test_params{ CASE_POOLING_U8_3, 2, 5, pooling_mode::average, "pooling_gpu_int8_ref" }, + pooling_test_params{ CASE_POOLING_U8_3, 2, 5, pooling_mode::max, "pooling_gpu_int8_ref" }, + pooling_test_params{ CASE_POOLING_U8_3, 2, 5, pooling_mode::average, "pooling_gpu_b_fs_yx_fsv4" }, + pooling_test_params{ CASE_POOLING_U8_3, 2, 5, pooling_mode::max, "pooling_gpu_b_fs_yx_fsv4" }, + pooling_test_params{ CASE_POOLING_U8_5, 2, 5, pooling_mode::average, "pooling_gpu_int8_ref" }, + pooling_test_params{ CASE_POOLING_U8_5, 2, 5, pooling_mode::max, "pooling_gpu_int8_ref" }, + pooling_test_params{ CASE_POOLING_U8_6, 2, 5, pooling_mode::average, "pooling_gpu_int8_ref" }, + pooling_test_params{ CASE_POOLING_U8_6, 2, 5, pooling_mode::max, "pooling_gpu_int8_ref" }, +})); + +INSTANTIATE_TEST_SUITE_P(DISABLED_fusings_gpu, 
pooling_scale_activation_quantize, ::testing::ValuesIn(std::vector{ + pooling_test_params{ CASE_POOLING_F32_3, 2, 5, pooling_mode::average, "pooling_gpu_average_opt" }, //currently not enabled, fusing not upported +})); + +class pooling_scale_activation : public PoolingFusingTest {}; +TEST_P(pooling_scale_activation, basic) { + auto p = GetParam(); + + create_topologies( + input_layout("input", get_input_layout(p)), + data("scale_data", get_mem(get_per_channel_layout(p), 1.0f / tensor{ 1, 1, 4, 4 }.count())), + pooling("pooling", "input", "", p.pool_mode, tensor(1, 1, 4, 4), tensor(1, 1, 2, 2)), + scale("scale", "pooling", "scale_data"), + activation("activation", "scale", activation_func::relu), + reorder("output_reorder", "activation", p.default_format, data_types::f32) + ); + + tolerance = 1e-05f; + execute(p); +} + +TEST_P(pooling_scale_activation, eltwise_mul) { + auto p = GetParam(); + + create_topologies( + input_layout("input", get_input_layout(p)), + data("scale_data", get_mem(get_per_channel_layout(p), 1.0f / tensor{ 1, 1, 4, 4 }.count())), + pooling("pooling", "input", "", p.pool_mode, tensor(1, 1, 4, 4), tensor(1, 1, 2, 2)), + eltwise("scale", { "pooling", "scale_data" }, eltwise_mode::prod, p.default_type), + activation("activation", "scale", activation_func::relu), + reorder("output_reorder", "activation", p.default_format, data_types::f32) + ); + + tolerance = 1e-05f; + execute(p); +} + +INSTANTIATE_TEST_SUITE_P(fusings_gpu, pooling_scale_activation, ::testing::ValuesIn(std::vector{ + // Input type: F32 + pooling_test_params{ CASE_POOLING_F32_3, 2, 4, pooling_mode::average, "pooling_gpu_bfyx_block_opt" }, + pooling_test_params{ CASE_POOLING_F32_3, 2, 4, pooling_mode::max, "pooling_gpu_bfyx_block_opt" }, + pooling_test_params{ CASE_POOLING_F32_3, 2, 4, pooling_mode::average, "pooling_gpu_ref" }, + pooling_test_params{ CASE_POOLING_F32_3, 2, 4, pooling_mode::max, "pooling_gpu_ref" }, + pooling_test_params{ CASE_POOLING_F32_4, 2, 4, 
pooling_mode::average, "pooling_gpu_fs_b_yx_fsv32" }, + pooling_test_params{ CASE_POOLING_F32_4, 2, 4, pooling_mode::max, "pooling_gpu_fs_b_yx_fsv32" }, + pooling_test_params{ CASE_POOLING_F32_5, 2, 4, pooling_mode::average, "pooling_gpu_byxf_padding_opt" }, + pooling_test_params{ CASE_POOLING_F32_5, 2, 4, pooling_mode::max, "pooling_gpu_byxf_padding_opt" }, + pooling_test_params{ CASE_POOLING_F32_6, 2, 4, pooling_mode::average, "pooling_gpu_byxf_opt" }, + pooling_test_params{ CASE_POOLING_F32_6, 2, 4, pooling_mode::max, "pooling_gpu_byxf_opt" }, + pooling_test_params{ CASE_POOLING_F32_7, 2, 4, pooling_mode::average, "pooling_gpu_bsv16_fsv16" }, + pooling_test_params{ CASE_POOLING_F32_7, 2, 4, pooling_mode::max, "pooling_gpu_bsv16_fsv16" }, + pooling_test_params{ CASE_POOLING_F32_8, 2, 4, pooling_mode::average, "pooling_gpu_blocked" }, + pooling_test_params{ CASE_POOLING_F32_8, 2, 4, pooling_mode::max, "pooling_gpu_blocked" }, + pooling_test_params{ CASE_POOLING_F32_9, 2, 4, pooling_mode::average, "pooling_gpu_ref" }, + pooling_test_params{ CASE_POOLING_F32_9, 2, 4, pooling_mode::max, "pooling_gpu_ref" }, + pooling_test_params{ CASE_POOLING_F32_10, 2, 4, pooling_mode::average, "pooling_gpu_bsv16_fsv16" }, + pooling_test_params{ CASE_POOLING_F32_10, 2, 4, pooling_mode::max, "pooling_gpu_bsv16_fsv16" }, + + // Input type: INT8 + pooling_test_params{ CASE_POOLING_I8_5, 2, 4, pooling_mode::average, "pooling_gpu_b_fs_yx_fsv4" }, + pooling_test_params{ CASE_POOLING_I8_5, 2, 4, pooling_mode::max, "pooling_gpu_b_fs_yx_fsv4" }, + pooling_test_params{ CASE_POOLING_I8_6, 2, 4, pooling_mode::average, "pooling_gpu_int8_ref" }, + pooling_test_params{ CASE_POOLING_I8_6, 2, 4, pooling_mode::max, "pooling_gpu_int8_ref" }, + + // Input type: UINT8 + pooling_test_params{ CASE_POOLING_U8_3, 2, 4, pooling_mode::average, "pooling_gpu_int8_ref" }, + pooling_test_params{ CASE_POOLING_U8_3, 2, 4, pooling_mode::max, "pooling_gpu_int8_ref" }, + pooling_test_params{ CASE_POOLING_U8_3, 2, 4, 
pooling_mode::average, "pooling_gpu_b_fs_yx_fsv4" }, + pooling_test_params{ CASE_POOLING_U8_3, 2, 4, pooling_mode::max, "pooling_gpu_b_fs_yx_fsv4" }, + pooling_test_params{ CASE_POOLING_U8_5, 2, 4, pooling_mode::average, "pooling_gpu_int8_ref" }, + pooling_test_params{ CASE_POOLING_U8_5, 2, 4, pooling_mode::max, "pooling_gpu_int8_ref" }, + pooling_test_params{ CASE_POOLING_U8_6, 2, 4, pooling_mode::average, "pooling_gpu_int8_ref" }, + pooling_test_params{ CASE_POOLING_U8_6, 2, 4, pooling_mode::max, "pooling_gpu_int8_ref" }, + + // Input type: FP16 Output type: F32 + pooling_test_params{ CASE_POOLING_F16_3, 2, 4, pooling_mode::average, "pooling_gpu_bfyx_block_opt" }, + pooling_test_params{ CASE_POOLING_F16_3, 2, 4, pooling_mode::max, "pooling_gpu_bfyx_block_opt" }, + pooling_test_params{ CASE_POOLING_F16_3, 2, 4, pooling_mode::average, "pooling_gpu_ref" }, + pooling_test_params{ CASE_POOLING_F16_3, 2, 4, pooling_mode::max, "pooling_gpu_ref" }, + pooling_test_params{ CASE_POOLING_F16_4, 2, 4, pooling_mode::average, "pooling_gpu_fs_b_yx_fsv32" }, + pooling_test_params{ CASE_POOLING_F16_4, 2, 4, pooling_mode::max, "pooling_gpu_fs_b_yx_fsv32" }, + pooling_test_params{ CASE_POOLING_F16_5, 2, 4, pooling_mode::average, "pooling_gpu_byxf_padding_opt" }, + pooling_test_params{ CASE_POOLING_F16_5, 2, 4, pooling_mode::max, "pooling_gpu_byxf_padding_opt" }, + pooling_test_params{ CASE_POOLING_F16_6, 2, 4, pooling_mode::average, "pooling_gpu_byxf_opt" }, + pooling_test_params{ CASE_POOLING_F16_6, 2, 4, pooling_mode::max, "pooling_gpu_byxf_opt" }, + pooling_test_params{ CASE_POOLING_F16_7, 2, 4, pooling_mode::average, "pooling_gpu_bsv16_fsv16" }, + pooling_test_params{ CASE_POOLING_F16_7, 2, 4, pooling_mode::max, "pooling_gpu_bsv16_fsv16" }, + pooling_test_params{ CASE_POOLING_F16_8, 2, 4, pooling_mode::average, "pooling_gpu_blocked" }, + pooling_test_params{ CASE_POOLING_F16_8, 2, 4, pooling_mode::max, "pooling_gpu_blocked" }, + pooling_test_params{ CASE_POOLING_F16_9, 2, 4, 
pooling_mode::average, "pooling_gpu_ref" }, + pooling_test_params{ CASE_POOLING_F16_9, 2, 4, pooling_mode::max, "pooling_gpu_ref" }, + pooling_test_params{ CASE_POOLING_F16_10, 2, 4, pooling_mode::average, "pooling_gpu_bsv16_fsv16" }, + pooling_test_params{ CASE_POOLING_F16_10, 2, 4, pooling_mode::max, "pooling_gpu_bsv16_fsv16" }, + + // Input type: FP16 + pooling_test_params{ CASE_POOLING_F16_FP16_1, 2, 4, pooling_mode::average, "pooling_gpu_bfyx_block_opt" }, + pooling_test_params{ CASE_POOLING_F16_FP16_1, 2, 4, pooling_mode::max, "pooling_gpu_bfyx_block_opt" }, + pooling_test_params{ CASE_POOLING_F16_FP16_1, 2, 4, pooling_mode::average, "pooling_gpu_ref" }, + pooling_test_params{ CASE_POOLING_F16_FP16_1, 2, 4, pooling_mode::max, "pooling_gpu_ref" }, + pooling_test_params{ CASE_POOLING_F16_FP16_2, 2, 4, pooling_mode::average, "pooling_gpu_fs_b_yx_fsv32" }, + pooling_test_params{ CASE_POOLING_F16_FP16_2, 2, 4, pooling_mode::max, "pooling_gpu_fs_b_yx_fsv32" }, + pooling_test_params{ CASE_POOLING_F16_FP16_3, 2, 4, pooling_mode::average, "pooling_gpu_byxf_padding_opt" }, + pooling_test_params{ CASE_POOLING_F16_FP16_3, 2, 4, pooling_mode::max, "pooling_gpu_byxf_padding_opt" }, + pooling_test_params{ CASE_POOLING_F16_FP16_4, 2, 4, pooling_mode::average, "pooling_gpu_byxf_opt" }, + pooling_test_params{ CASE_POOLING_F16_FP16_4, 2, 4, pooling_mode::max, "pooling_gpu_byxf_opt" }, + pooling_test_params{ CASE_POOLING_F16_FP16_5, 2, 4, pooling_mode::average, "pooling_gpu_bsv16_fsv16" }, + pooling_test_params{ CASE_POOLING_F16_FP16_5, 2, 4, pooling_mode::max, "pooling_gpu_bsv16_fsv16" }, + pooling_test_params{ CASE_POOLING_F16_FP16_6, 2, 4, pooling_mode::average, "pooling_gpu_blocked" }, + pooling_test_params{ CASE_POOLING_F16_FP16_6, 2, 4, pooling_mode::max, "pooling_gpu_blocked" }, + pooling_test_params{ CASE_POOLING_F16_FP16_7, 2, 4, pooling_mode::average, "pooling_gpu_ref" }, + pooling_test_params{ CASE_POOLING_F16_FP16_7, 2, 4, pooling_mode::max, "pooling_gpu_ref" }, + 
pooling_test_params{ CASE_POOLING_F16_FP16_8, 2, 4, pooling_mode::average, "pooling_gpu_bsv16_fsv16" }, + pooling_test_params{ CASE_POOLING_F16_FP16_8, 2, 4, pooling_mode::max, "pooling_gpu_bsv16_fsv16" }, + + // Input type: FP32 + pooling_test_params{ CASE_POOLING_F32_F16_3, 2, 4, pooling_mode::average, "pooling_gpu_bfyx_block_opt" }, + pooling_test_params{ CASE_POOLING_F32_F16_3, 2, 4, pooling_mode::max, "pooling_gpu_bfyx_block_opt" }, + pooling_test_params{ CASE_POOLING_F32_F16_3, 2, 4, pooling_mode::average, "pooling_gpu_ref" }, + pooling_test_params{ CASE_POOLING_F32_F16_3, 2, 4, pooling_mode::max, "pooling_gpu_ref" }, + pooling_test_params{ CASE_POOLING_F32_F16_4, 2, 4, pooling_mode::average, "pooling_gpu_fs_b_yx_fsv32" }, + pooling_test_params{ CASE_POOLING_F32_F16_4, 2, 4, pooling_mode::max, "pooling_gpu_fs_b_yx_fsv32" }, + pooling_test_params{ CASE_POOLING_F32_F16_5, 2, 4, pooling_mode::average, "pooling_gpu_byxf_padding_opt" }, + pooling_test_params{ CASE_POOLING_F32_F16_5, 2, 4, pooling_mode::max, "pooling_gpu_byxf_padding_opt" }, + pooling_test_params{ CASE_POOLING_F32_F16_6, 2, 4, pooling_mode::average, "pooling_gpu_byxf_opt" }, + pooling_test_params{ CASE_POOLING_F32_F16_6, 2, 4, pooling_mode::max, "pooling_gpu_byxf_opt" }, + pooling_test_params{ CASE_POOLING_F32_F16_7, 2, 4, pooling_mode::average, "pooling_gpu_bsv16_fsv16" }, + pooling_test_params{ CASE_POOLING_F32_F16_7, 2, 4, pooling_mode::max, "pooling_gpu_bsv16_fsv16" }, + pooling_test_params{ CASE_POOLING_F32_F16_8, 2, 4, pooling_mode::average, "pooling_gpu_blocked" }, + pooling_test_params{ CASE_POOLING_F32_F16_8, 2, 4, pooling_mode::max, "pooling_gpu_blocked" }, + pooling_test_params{ CASE_POOLING_F32_F16_9, 2, 4, pooling_mode::average, "pooling_gpu_ref" }, + pooling_test_params{ CASE_POOLING_F32_F16_9, 2, 4, pooling_mode::max, "pooling_gpu_ref" }, + pooling_test_params{ CASE_POOLING_F32_F16_10, 2, 4, pooling_mode::average, "pooling_gpu_bsv16_fsv16" }, + pooling_test_params{ 
CASE_POOLING_F32_F16_10, 2, 4, pooling_mode::max, "pooling_gpu_bsv16_fsv16" }, + + // Input type: INT8 + pooling_test_params{ CASE_POOLING_I8_FP16_5, 2, 4, pooling_mode::average, "pooling_gpu_b_fs_yx_fsv4" }, + pooling_test_params{ CASE_POOLING_I8_FP16_5, 2, 4, pooling_mode::max, "pooling_gpu_b_fs_yx_fsv4" }, + pooling_test_params{ CASE_POOLING_I8_FP16_6, 2, 4, pooling_mode::average, "pooling_gpu_int8_ref" }, + pooling_test_params{ CASE_POOLING_I8_FP16_6, 2, 4, pooling_mode::max, "pooling_gpu_int8_ref" }, + + // Input type: UINT8 + pooling_test_params{ CASE_POOLING_U8_FP16_3, 2, 4, pooling_mode::max, "pooling_gpu_int8_ref" }, + pooling_test_params{ CASE_POOLING_U8_FP16_3, 2, 4, pooling_mode::average, "pooling_gpu_int8_ref" }, + pooling_test_params{ CASE_POOLING_U8_FP16_3, 2, 4, pooling_mode::average, "pooling_gpu_b_fs_yx_fsv4" }, + pooling_test_params{ CASE_POOLING_U8_FP16_3, 2, 4, pooling_mode::max, "pooling_gpu_b_fs_yx_fsv4" }, + pooling_test_params{ CASE_POOLING_U8_FP16_5, 2, 4, pooling_mode::average, "pooling_gpu_int8_ref" }, + pooling_test_params{ CASE_POOLING_U8_FP16_5, 2, 4, pooling_mode::max, "pooling_gpu_int8_ref" }, + pooling_test_params{ CASE_POOLING_U8_FP16_6, 2, 4, pooling_mode::average, "pooling_gpu_int8_ref" }, + pooling_test_params{ CASE_POOLING_U8_FP16_6, 2, 4, pooling_mode::max, "pooling_gpu_int8_ref" }, +})); + +#ifdef ENABLE_ONEDNN_FOR_GPU +class PoolingOneDNNFusingTest : public ::BaseFusingTest { +public: + void execute(pooling_test_params& p) { + // Onednn post operation has issue in a machine that does not support imad. 
+ if (!engine.get_device_info().supports_imad) + return; + + auto input_prim = get_mem(get_input_layout(p)); + + build_options onednn_options; + build_options cldnn_options; + + onednn_options.set_option(build_option::optimize_data(true)); + cldnn_options.set_option(build_option::optimize_data(true)); + + implementation_desc onednn_impl = { p.input_format, "", impl_types::onednn }; + implementation_desc cldnn_impl = { p.input_format, "", impl_types::ocl }; + onednn_options.set_option(build_option::force_implementations({ { "pooling", onednn_impl } })); + cldnn_options.set_option(build_option::force_implementations({ { "pooling", cldnn_impl } })); + + // for onednn fusing test, topology_non_fused means cldnn, topology_fused is onednn + network network_fused_cldnn(this->engine, this->topology_non_fused, cldnn_options); + network network_fused_onednn(this->engine, this->topology_fused, onednn_options); + + network_fused_cldnn.set_input_data("input", input_prim); + network_fused_onednn.set_input_data("input", input_prim); + + ASSERT_FALSE(network_fused_cldnn.get_primitives_info().empty()); + ASSERT_FALSE(network_fused_onednn.get_primitives_info().empty()); + + auto find_and_check = [&](primitive_info& p) -> bool { + if (p.original_id == "pooling" || p.original_id == "output_reorder") + return true; + return false; + }; + + auto pi_fused_onednn = network_fused_onednn.get_primitives_info(); + auto pi_fused_cldnn = network_fused_cldnn.get_primitives_info(); + auto info_fused_onednn = std::find_if(pi_fused_onednn.begin(), pi_fused_onednn.end(), find_and_check); + auto info_fused_cldnn = std::find_if(pi_fused_cldnn.begin(), pi_fused_cldnn.end(), find_and_check); + + ASSERT_TRUE(info_fused_onednn != pi_fused_onednn.end()); + ASSERT_TRUE(info_fused_cldnn != pi_fused_cldnn.end()); + + compare(network_fused_cldnn, network_fused_onednn, p); + } + + layout get_input_layout(pooling_test_params& p) { + return layout{ p.data_type, p.input_format, p.in_shape }; + } + + layout 
get_per_channel_layout(pooling_test_params& p) { + return layout{ p.default_type, p.default_format, tensor{ 1, p.in_shape.feature[0], 1, 1 } }; + } +}; + +class pooling_onednn_activation1 : public PoolingOneDNNFusingTest {}; +TEST_P(pooling_onednn_activation1, basic) { + auto p = GetParam(); + create_topologies( + input_layout("input", get_input_layout(p)), + pooling("pooling", "input", p.pool_mode, tensor{ 1, 1, 3, 3 }, tensor{ 1 }, tensor{ { 0, 0, 1, 1, 0, 0 }, 0 }), + activation("act", "pooling", activation_func::relu), + reorder("output_reorder", "act", format::bfyx, data_types::f32) + ); + + tolerance = 1e-05f; + execute(p); +} + +class pooling_onednn_activation2 : public PoolingOneDNNFusingTest {}; +TEST_P(pooling_onednn_activation2, basic) { + auto p = GetParam(); + create_topologies( + input_layout("input", get_input_layout(p)), + pooling("pooling", "input", p.pool_mode, { 1, 1, 3, 3 }, { 1, 1, 1, 1 }), + activation("act", "pooling", activation_func::relu), + reorder("output_reorder", "act", format::bfyx, data_types::f32) + ); + + tolerance = 1e-05f; + execute(p); +} + +INSTANTIATE_TEST_SUITE_P(fusings_gpu, pooling_onednn_activation1, ::testing::ValuesIn(std::vector{ + // pooling_test_params{ CASE_POOLING_F32_1, 2, 2, pooling_mode::max, "" }, + pooling_test_params{ CASE_POOLING_F16_1, 2, 2, pooling_mode::max, "" }, + pooling_test_params{ CASE_POOLING_I8_1, 2, 2, pooling_mode::max, "" }, + pooling_test_params{ CASE_POOLING_U8_1, 2, 2, pooling_mode::max, "" }, + pooling_test_params{ CASE_POOLING_U8_2, 2, 2, pooling_mode::max, "" }, + pooling_test_params{ CASE_POOLING_I8_1, 2, 2, pooling_mode::max, "" }, + pooling_test_params{ CASE_POOLING_I8_2, 2, 2, pooling_mode::max, "" }, +})); + +INSTANTIATE_TEST_SUITE_P(fusings_gpu, pooling_onednn_activation2, ::testing::ValuesIn(std::vector{ + pooling_test_params{ CASE_POOLING_F32_11, 2, 2, pooling_mode::max, "" }, + pooling_test_params{ CASE_POOLING_F32_11, 2, 2, pooling_mode::average, "" }, + pooling_test_params{ 
CASE_POOLING_F32_11, 2, 2, pooling_mode::average_no_padding, "" }, +})); +#endif diff --git a/src/plugins/intel_gpu/tests/fusions/reduce_fusion_test.cpp b/src/plugins/intel_gpu/tests/fusions/reduce_fusion_test.cpp new file mode 100644 index 00000000000..1073868921f --- /dev/null +++ b/src/plugins/intel_gpu/tests/fusions/reduce_fusion_test.cpp @@ -0,0 +1,287 @@ +// Copyright (C) 2018-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "test_utils.h" +#include "fusion_test_common.hpp" + +#include +#include +#include +#include +#include + +#include + +using namespace cldnn; +using namespace ::tests; + +namespace { +struct reduce_test_params { + cldnn::tensor in_shape; + cldnn::tensor out_shape; + cldnn::data_types data_type; + cldnn::format input_format; + data_types default_type; + cldnn::format default_format; + size_t expected_fused_primitives; + size_t expected_not_fused_primitives; + cldnn::reduce_mode reduce_mode; + std::vector reduce_axes; + bool keep_dims; + std::string kernel_name; +}; + +class ReduceFusingTest : public ::BaseFusingTest { +public: + void execute(reduce_test_params& p) { + auto input_prim = get_mem(get_input_layout(p)); + + network network_not_fused(this->engine, this->topology_non_fused, bo_not_fused); + network network_fused(this->engine, this->topology_fused, bo_fused); + + network_fused.set_input_data("input", input_prim); + network_not_fused.set_input_data("input", input_prim); + + compare(network_not_fused, network_fused, p); + } + + void update_out_shape(reduce_test_params& p) { + for (auto& axis : p.reduce_axes) { + switch (axis) { + case 0: // batch + p.out_shape.batch[0] = 1; + break; + case 1: // feature + p.out_shape.feature[0] = 1; + break; + case 2: // x + p.out_shape.spatial[0] = 1; + break; + case 3: // y + p.out_shape.spatial[1] = 1; + break; + case 4: // z + p.out_shape.spatial[2] = 1; + break; + case 5: // w + p.out_shape.spatial[3] = 1; + break; + } + } + } + + layout 
get_input_layout(reduce_test_params& p) { + return layout{ p.data_type, p.input_format, p.in_shape }; + } + + layout get_per_channel_layout(reduce_test_params& p) { + return layout{ p.default_type, p.default_format, tensor{ 1, p.in_shape.feature[0], 1, 1 } }; + } +}; +} // namespace + +/* ----------------------------------------------------------------------------------------------------- */ +/* ---------------------------------------- Reduce cases ----------------------------------------------- */ +/* ----------------------------------------------------------------------------------------------------- */ +#define CASE_REDUCE_F32_0 { 3, 7, 5, 7 }, { 3, 7, 5, 7 }, data_types::f32, format::b_fs_yx_fsv16, data_types::f32, format::bfyx +#define CASE_REDUCE_F32_1 { 3, 7, 5, 7 }, { 3, 7, 5, 7 }, data_types::f32, format::bfyx, data_types::f32, format::bfyx +#define CASE_REDUCE_F32_2 { 2, 4, 8, 4, 4 }, { 2, 4, 8, 4, 4 }, data_types::f32, format::bfzyx, data_types::f32, format::bfyx +#define CASE_REDUCE_F32_3 { 16, 16, 16, 8, 8, 8 }, { 16, 16, 16, 8, 8, 8 }, data_types::f32, format::bfwzyx, data_types::f32, format::bfyx +#define CASE_REDUCE_F32_4 { 2, 8, 4, 4 }, { 2, 8, 4, 4 }, data_types::f32, format::b_fs_yx_fsv16, data_types::f32, format::bfyx + +#define CASE_REDUCE_F16_0 { 3, 7, 5, 7 }, { 3, 7, 5, 7 }, data_types::f16, format::b_fs_yx_fsv16, data_types::f32, format::bfyx +#define CASE_REDUCE_F16_1 { 2, 8, 4, 4 }, { 2, 8, 4, 4 }, data_types::f16, format::bfyx, data_types::f32, format::bfyx +#define CASE_REDUCE_F16_2 { 2, 4, 8, 4, 4 }, { 2, 4, 8, 4, 4 }, data_types::f16, format::bfzyx, data_types::f32, format::bfyx +#define CASE_REDUCE_F16_3 { 3, 5, 3, 5, 7, 7 }, { 3, 5, 3, 5, 7, 7 }, data_types::f16, format::bfwzyx, data_types::f32, format::bfyx +#define CASE_REDUCE_F16_4 { 2, 8, 4, 4 }, { 2, 8, 4, 4 }, data_types::f16, format::b_fs_yx_fsv16, data_types::f32, format::bfyx + +#define CASE_REDUCE_I32_0 { 3, 7, 5, 7 }, { 3, 7, 5, 7 }, data_types::i32, format::b_fs_yx_fsv16, 
data_types::f32, format::bfyx +#define CASE_REDUCE_I32_1 { 2, 8, 4, 4 }, { 2, 8, 4, 4 }, data_types::i32, format::bfyx, data_types::f32, format::bfyx +#define CASE_REDUCE_I32_2 { 2, 4, 8, 4, 4 }, { 2, 4, 8, 4, 4 }, data_types::i32, format::bfzyx, data_types::f32, format::bfyx +#define CASE_REDUCE_I32_3 { 3, 5, 3, 5, 7, 7 }, { 3, 5, 3, 5, 7, 7 }, data_types::i32, format::bfwzyx, data_types::f32, format::bfyx +#define CASE_REDUCE_I32_4 { 3, 5, 3, 5, 7, 7 }, { 3, 5, 3, 5, 7, 7 }, data_types::i32, format::bfwzyx, data_types::f32, format::bfyx + +#define CASE_REDUCE_I8_0 { 3, 7, 5, 7 }, { 3, 7, 5, 7 }, data_types::i8, format::b_fs_yx_fsv16, data_types::f32, format::bfyx +#define CASE_REDUCE_I8_1 { 2, 8, 4, 4 }, { 2, 8, 4, 4 }, data_types::i8, format::bfyx, data_types::f32, format::bfyx +#define CASE_REDUCE_I8_2 { 2, 4, 8, 4, 4 }, { 2, 4, 8, 4, 4 }, data_types::i8, format::bfzyx, data_types::f32, format::bfyx +#define CASE_REDUCE_I8_3 { 3, 5, 3, 5, 7, 7 }, { 3, 5, 3, 5, 7, 7 }, data_types::i8, format::bfwzyx, data_types::f32, format::bfyx +#define CASE_REDUCE_I8_4 { 2, 8, 4, 4 }, { 2, 8, 4, 4 }, data_types::i8, format::b_fs_yx_fsv16, data_types::f32, format::bfyx + +#define CASE_REDUCE_U8_0 { 3, 7, 5, 7 }, { 3, 7, 5, 7 },data_types::u8, format::b_fs_yx_fsv16, data_types::f32, format::bfyx +#define CASE_REDUCE_U8_1 { 2, 8, 4, 4 }, { 2, 8, 4, 4 }, data_types::u8, format::bfyx, data_types::f32, format::bfyx +#define CASE_REDUCE_U8_2 { 2, 4, 8, 4, 4 }, { 2, 4, 8, 4, 4 }, data_types::u8, format::bfzyx, data_types::f32, format::bfyx +#define CASE_REDUCE_U8_3 { 3, 5, 3, 5, 7, 7 }, { 3, 5, 3, 5, 7, 7 }, data_types::u8, format::bfwzyx, data_types::f32, format::bfyx +#define CASE_REDUCE_U8_4 { 2, 8, 4, 4 }, { 2, 8, 4, 4 }, data_types::u8, format::b_fs_yx_fsv16, data_types::f32, format::bfyx + +class reduce_eltwise_activation_quantize : public ReduceFusingTest {}; +TEST_P(reduce_eltwise_activation_quantize, basic) { + auto p = GetParam(); + update_out_shape(p); + create_topologies( 
+ input_layout("input", get_input_layout(p)), + data("in_lo", get_mem(get_single_element_layout(p), min_random, 0)), + data("in_hi", get_mem(get_single_element_layout(p), 1, max_random)), + data("out_lo", get_mem(get_single_element_layout(p), -128)), + data("out_hi", get_mem(get_single_element_layout(p), 127)), + data("eltwise_data", get_mem(get_output_layout(p))), + reduce("reduce", "input", p.reduce_mode, p.reduce_axes, p.keep_dims), + eltwise("eltwise", { "reduce", "eltwise_data" }, eltwise_mode::sum, p.default_type), + activation("activation", "eltwise", activation_func::relu), + quantize("quantize", "activation", "in_lo", "in_hi", "out_lo", "out_hi", 256, data_types::i8), + reorder("output_reorder", "quantize", p.default_format, data_types::f32) + ); + + tolerance = 1.f; + execute(p); +} + +TEST_P(reduce_eltwise_activation_quantize, per_channel) { + auto p = GetParam(); + update_out_shape(p); + create_topologies( + input_layout("input", get_input_layout(p)), + data("in_lo", get_mem(get_per_channel_layout(p), min_random, 0)), + data("in_hi", get_mem(get_per_channel_layout(p), 1, max_random)), + data("out_lo", get_mem(get_single_element_layout(p), -128)), + data("out_hi", get_mem(get_single_element_layout(p), 127)), + data("eltwise_data", get_mem(get_output_layout(p))), + reduce("reduce", "input", p.reduce_mode, p.reduce_axes, p.keep_dims), + eltwise("eltwise", { "reduce", "eltwise_data" }, eltwise_mode::sum, p.default_type), + activation("activation", "eltwise", activation_func::relu), + quantize("quantize", "activation", "in_lo", "in_hi", "out_lo", "out_hi", 256, data_types::i8), + reorder("output_reorder", "quantize", p.default_format, data_types::f32) + ); + + tolerance = 1.f; + execute(p); +} + +INSTANTIATE_TEST_SUITE_P(fusings_gpu, reduce_eltwise_activation_quantize, ::testing::ValuesIn(std::vector{ + reduce_test_params{ CASE_REDUCE_F32_0, 2, 5, reduce_mode::mean, { reduce::along_x, reduce::along_f, reduce::along_y, reduce::along_b }, true, 
"reduce_gpu_b_fs_yx_fsv16" }, + reduce_test_params{ CASE_REDUCE_F32_4, 2, 5, reduce_mode::sum, { reduce::along_x, reduce::along_f, reduce::along_b }, true, "reduce_gpu_b_fs_yx_fsv16" }, + reduce_test_params{ CASE_REDUCE_F32_0, 2, 5, reduce_mode::max, { reduce::along_y, reduce::along_f, reduce::along_b }, true, "reduce_gpu_b_fs_yx_fsv16" }, + reduce_test_params{ CASE_REDUCE_F32_4, 2, 5, reduce_mode::sum, { reduce::along_x, reduce::along_y, reduce::along_b }, true, "reduce_gpu_b_fs_yx_fsv16" }, + reduce_test_params{ CASE_REDUCE_F32_0, 2, 5, reduce_mode::min, { reduce::along_x, reduce::along_y, reduce::along_f }, true, "reduce_gpu_b_fs_yx_fsv16" }, + reduce_test_params{ CASE_REDUCE_F32_1, 2, 5, reduce_mode::sum, { reduce::along_f, reduce::along_b }, true, "reduce_ref" }, + reduce_test_params{ CASE_REDUCE_F32_2, 2, 5, reduce_mode::mean, { reduce::along_f, reduce::along_x }, true, "reduce_ref" }, + reduce_test_params{ CASE_REDUCE_F32_1, 2, 5, reduce_mode::max, { reduce::along_y, reduce::along_b }, true, "reduce_ref" }, + reduce_test_params{ CASE_REDUCE_F32_2, 2, 5, reduce_mode::sum, { reduce::along_x, reduce::along_b }, true, "reduce_ref" }, + reduce_test_params{ CASE_REDUCE_F32_4, 2, 5, reduce_mode::sum, { reduce::along_b }, true, "reduce_gpu_b_fs_yx_fsv16" }, + reduce_test_params{ CASE_REDUCE_F32_0, 2, 5, reduce_mode::max, { reduce::along_f }, true, "reduce_gpu_b_fs_yx_fsv16" }, + reduce_test_params{ CASE_REDUCE_F32_4, 2, 5, reduce_mode::sum, { reduce::along_y }, true, "reduce_gpu_b_fs_yx_fsv16" }, + reduce_test_params{ CASE_REDUCE_F32_0, 2, 5, reduce_mode::min, { reduce::along_x }, true, "reduce_gpu_b_fs_yx_fsv16" }, + reduce_test_params{ CASE_REDUCE_F32_1, 2, 5, reduce_mode::sum, { reduce::along_b }, true, "reduce_ref" }, + reduce_test_params{ CASE_REDUCE_F32_2, 2, 5, reduce_mode::max, { reduce::along_f }, true, "reduce_ref" }, + reduce_test_params{ CASE_REDUCE_F32_1, 2, 5, reduce_mode::mean, { reduce::along_x }, true, "reduce_ref" }, + + reduce_test_params{ 
CASE_REDUCE_F16_1, 2, 5, reduce_mode::mean, { reduce::along_x, reduce::along_f, reduce::along_y, reduce::along_b }, true, "reduce_ref" }, + reduce_test_params{ CASE_REDUCE_F16_2, 2, 5, reduce_mode::sum, { reduce::along_x, reduce::along_f, reduce::along_b }, true, "reduce_ref" }, + reduce_test_params{ CASE_REDUCE_F16_1, 2, 5, reduce_mode::max, { reduce::along_y, reduce::along_f, reduce::along_b }, true, "reduce_ref" }, + reduce_test_params{ CASE_REDUCE_F16_2, 2, 5, reduce_mode::sum, { reduce::along_x, reduce::along_y, reduce::along_b }, true, "reduce_ref" }, + reduce_test_params{ CASE_REDUCE_F16_1, 2, 5, reduce_mode::min, { reduce::along_x, reduce::along_y, reduce::along_f }, true, "reduce_ref" }, + reduce_test_params{ CASE_REDUCE_F16_0, 2, 5, reduce_mode::sum, { reduce::along_f, reduce::along_b }, true, "reduce_gpu_b_fs_yx_fsv16" }, + reduce_test_params{ CASE_REDUCE_F16_4, 2, 5, reduce_mode::mean, { reduce::along_f, reduce::along_x }, true, "reduce_gpu_b_fs_yx_fsv16" }, + reduce_test_params{ CASE_REDUCE_F16_0, 2, 5, reduce_mode::max, { reduce::along_y, reduce::along_b }, true, "reduce_gpu_b_fs_yx_fsv16" }, + reduce_test_params{ CASE_REDUCE_F16_4, 2, 5, reduce_mode::sum, { reduce::along_x, reduce::along_b }, true, "reduce_gpu_b_fs_yx_fsv16" }, + reduce_test_params{ CASE_REDUCE_F16_1, 2, 5, reduce_mode::sum, { reduce::along_b }, true, "reduce_ref" }, + reduce_test_params{ CASE_REDUCE_F16_2, 2, 5, reduce_mode::max, { reduce::along_f }, true, "reduce_ref" }, + reduce_test_params{ CASE_REDUCE_F16_1, 2, 5, reduce_mode::sum, { reduce::along_y }, true, "reduce_ref" }, + reduce_test_params{ CASE_REDUCE_F16_2, 2, 5, reduce_mode::min, { reduce::along_x }, true, "reduce_ref" }, + reduce_test_params{ CASE_REDUCE_F16_4, 2, 5, reduce_mode::sum, { reduce::along_b }, true, "reduce_gpu_b_fs_yx_fsv16" }, + reduce_test_params{ CASE_REDUCE_F16_0, 2, 5, reduce_mode::max, { reduce::along_f }, true, "reduce_gpu_b_fs_yx_fsv16" }, + reduce_test_params{ CASE_REDUCE_F16_4, 2, 5, 
reduce_mode::mean, { reduce::along_x }, true, "reduce_gpu_b_fs_yx_fsv16" }, + + reduce_test_params{ CASE_REDUCE_I8_0, 2, 5, reduce_mode::mean, { reduce::along_x, reduce::along_f, reduce::along_y, reduce::along_b }, true, "reduce_gpu_b_fs_yx_fsv16" }, + reduce_test_params{ CASE_REDUCE_I8_4, 2, 5, reduce_mode::sum, { reduce::along_x, reduce::along_f, reduce::along_b }, true, "reduce_gpu_b_fs_yx_fsv16" }, + reduce_test_params{ CASE_REDUCE_I8_0, 2, 5, reduce_mode::max, { reduce::along_y, reduce::along_f, reduce::along_b }, true, "reduce_gpu_b_fs_yx_fsv16" }, + reduce_test_params{ CASE_REDUCE_I8_4, 2, 5, reduce_mode::sum, { reduce::along_x, reduce::along_y, reduce::along_b }, true, "reduce_gpu_b_fs_yx_fsv16" }, + reduce_test_params{ CASE_REDUCE_I8_0, 2, 5, reduce_mode::min, { reduce::along_x, reduce::along_y, reduce::along_f }, true, "reduce_gpu_b_fs_yx_fsv16" }, + reduce_test_params{ CASE_REDUCE_I8_1, 2, 5, reduce_mode::sum, { reduce::along_f, reduce::along_b }, true, "reduce_ref" }, + reduce_test_params{ CASE_REDUCE_I8_2, 2, 5, reduce_mode::mean, { reduce::along_f, reduce::along_x }, true, "reduce_ref" }, + reduce_test_params{ CASE_REDUCE_I8_1, 2, 5, reduce_mode::max, { reduce::along_y, reduce::along_b }, true, "reduce_ref" }, + reduce_test_params{ CASE_REDUCE_I8_2, 2, 5, reduce_mode::sum, { reduce::along_x, reduce::along_b }, true, "reduce_ref" }, + reduce_test_params{ CASE_REDUCE_I8_4, 2, 5, reduce_mode::sum, { reduce::along_b }, true, "reduce_gpu_b_fs_yx_fsv16" }, + reduce_test_params{ CASE_REDUCE_I8_0, 2, 5, reduce_mode::max, { reduce::along_f }, true, "reduce_gpu_b_fs_yx_fsv16" }, + reduce_test_params{ CASE_REDUCE_I8_4, 2, 5, reduce_mode::sum, { reduce::along_y }, true, "reduce_gpu_b_fs_yx_fsv16" }, + reduce_test_params{ CASE_REDUCE_I8_0, 2, 5, reduce_mode::min, { reduce::along_x }, true, "reduce_gpu_b_fs_yx_fsv16" }, + reduce_test_params{ CASE_REDUCE_I8_1, 2, 5, reduce_mode::sum, { reduce::along_b }, true, "reduce_ref" }, + reduce_test_params{ CASE_REDUCE_I8_2, 
2, 5, reduce_mode::max, { reduce::along_f }, true, "reduce_ref" }, + reduce_test_params{ CASE_REDUCE_I8_1, 2, 5, reduce_mode::mean, { reduce::along_x }, true, "reduce_ref" }, + + reduce_test_params{ CASE_REDUCE_U8_1, 2, 5, reduce_mode::mean, { reduce::along_x, reduce::along_f, reduce::along_y, reduce::along_b }, true, "reduce_ref" }, + reduce_test_params{ CASE_REDUCE_U8_2, 2, 5, reduce_mode::sum, { reduce::along_x, reduce::along_f, reduce::along_b }, true, "reduce_ref" }, + reduce_test_params{ CASE_REDUCE_U8_1, 2, 5, reduce_mode::max, { reduce::along_y, reduce::along_f, reduce::along_b }, true, "reduce_ref" }, + reduce_test_params{ CASE_REDUCE_U8_2, 2, 5, reduce_mode::sum, { reduce::along_x, reduce::along_y, reduce::along_b }, true, "reduce_ref" }, + reduce_test_params{ CASE_REDUCE_U8_1, 2, 5, reduce_mode::min, { reduce::along_x, reduce::along_y, reduce::along_f }, true, "reduce_ref" }, + reduce_test_params{ CASE_REDUCE_U8_0, 2, 5, reduce_mode::sum, { reduce::along_f, reduce::along_b }, true, "reduce_gpu_b_fs_yx_fsv16" }, + reduce_test_params{ CASE_REDUCE_U8_4, 2, 5, reduce_mode::mean, { reduce::along_f, reduce::along_x }, true, "reduce_gpu_b_fs_yx_fsv16" }, + reduce_test_params{ CASE_REDUCE_U8_0, 2, 5, reduce_mode::max, { reduce::along_y, reduce::along_b }, true, "reduce_gpu_b_fs_yx_fsv16" }, + reduce_test_params{ CASE_REDUCE_U8_4, 2, 5, reduce_mode::sum, { reduce::along_x, reduce::along_b }, true, "reduce_gpu_b_fs_yx_fsv16" }, + reduce_test_params{ CASE_REDUCE_U8_1, 2, 5, reduce_mode::sum, { reduce::along_b }, true, "reduce_ref" }, + reduce_test_params{ CASE_REDUCE_U8_2, 2, 5, reduce_mode::max, { reduce::along_f }, true, "reduce_ref" }, + reduce_test_params{ CASE_REDUCE_U8_1, 2, 5, reduce_mode::sum, { reduce::along_y }, true, "reduce_ref" }, + reduce_test_params{ CASE_REDUCE_U8_2, 2, 5, reduce_mode::min, { reduce::along_x }, true, "reduce_ref" }, + reduce_test_params{ CASE_REDUCE_U8_4, 2, 5, reduce_mode::sum, { reduce::along_b }, true, "reduce_gpu_b_fs_yx_fsv16" 
}, + reduce_test_params{ CASE_REDUCE_U8_0, 2, 5, reduce_mode::max, { reduce::along_f }, true, "reduce_gpu_b_fs_yx_fsv16" }, + reduce_test_params{ CASE_REDUCE_U8_4, 2, 5, reduce_mode::mean, { reduce::along_x }, true, "reduce_gpu_b_fs_yx_fsv16" } +})); + +class reduce_scale_activation : public ReduceFusingTest {}; +TEST_P(reduce_scale_activation, basic) { + auto p = GetParam(); + create_topologies( + input_layout("input", get_input_layout(p)), + data("scale_data", get_mem(get_single_element_layout(p), -0.125f)), + reduce("reduce", "input", p.reduce_mode, p.reduce_axes, p.keep_dims), + scale("scale", "reduce", "scale_data"), + activation("activation", "scale", activation_func::cos), + reorder("output_reorder", "activation", p.default_format, data_types::f32) + ); + + tolerance = 1e-02f; + execute(p); +} + +TEST_P(reduce_scale_activation, per_channel) { + auto p = GetParam(); + create_topologies( + input_layout("input", get_input_layout(p)), + data("scale_data", get_mem(get_per_channel_layout(p), -0.125f)), + reduce("reduce", "input", p.reduce_mode, p.reduce_axes, p.keep_dims), + scale("scale", "reduce", "scale_data"), + activation("activation", "scale", activation_func::cos), + reorder("output_reorder", "activation", p.default_format, data_types::f32) + ); + + tolerance = 1e-02f; + execute(p); +} + +INSTANTIATE_TEST_SUITE_P(fusings_gpu, reduce_scale_activation, ::testing::ValuesIn(std::vector{ + reduce_test_params{ CASE_REDUCE_F32_0, 2, 4, reduce_mode::max, { reduce::along_x, reduce::along_y, reduce::along_b }, true, "reduce_gpu_b_fs_yx_fsv16" }, + reduce_test_params{ CASE_REDUCE_F32_1, 2, 4, reduce_mode::sum, { reduce::along_x, reduce::along_y, reduce::along_b }, true, "reduce_ref" }, + reduce_test_params{ CASE_REDUCE_F32_0, 2, 4, reduce_mode::min, { reduce::along_x, reduce::along_y }, true, "reduce_gpu_b_fs_yx_fsv16" }, + reduce_test_params{ CASE_REDUCE_F32_2, 2, 4, reduce_mode::mean, { reduce::along_x, reduce::along_y }, true, "reduce_ref" }, + reduce_test_params{ 
CASE_REDUCE_F32_0, 2, 4, reduce_mode::l1, { reduce::along_x }, true, "reduce_gpu_b_fs_yx_fsv16" }, + reduce_test_params{ CASE_REDUCE_F32_0, 2, 4, reduce_mode::l1, { reduce::along_x }, true, "reduce_gpu_b_fs_yx_fsv16" }, + reduce_test_params{ CASE_REDUCE_F32_0, 2, 4, reduce_mode::min, { reduce::along_y }, true, "reduce_gpu_b_fs_yx_fsv16" }, + reduce_test_params{ CASE_REDUCE_F32_0, 2, 4, reduce_mode::sum, { reduce::along_y }, true, "reduce_gpu_b_fs_yx_fsv16" }, + + reduce_test_params{ CASE_REDUCE_F16_0, 2, 4, reduce_mode::max, { reduce::along_x, reduce::along_y, reduce::along_b }, true, "reduce_gpu_b_fs_yx_fsv16" }, + reduce_test_params{ CASE_REDUCE_F16_1, 2, 4, reduce_mode::sum, { reduce::along_x, reduce::along_y, reduce::along_b }, true, "reduce_ref" }, + reduce_test_params{ CASE_REDUCE_F16_0, 2, 4, reduce_mode::min, { reduce::along_x, reduce::along_y }, true, "reduce_gpu_b_fs_yx_fsv16" }, + reduce_test_params{ CASE_REDUCE_F16_2, 2, 4, reduce_mode::mean, { reduce::along_x, reduce::along_y }, true, "reduce_ref" }, + reduce_test_params{ CASE_REDUCE_F16_0, 2, 4, reduce_mode::min, { reduce::along_x }, true, "reduce_gpu_b_fs_yx_fsv16" }, + reduce_test_params{ CASE_REDUCE_F16_0, 2, 4, reduce_mode::sum, { reduce::along_x }, true, "reduce_gpu_b_fs_yx_fsv16" }, +})); + +INSTANTIATE_TEST_SUITE_P(DISABLED_fusings_gpu, reduce_eltwise_activation_quantize, ::testing::ValuesIn(std::vector{ + // No layout format available for quantize/scale + reduce_test_params{ CASE_REDUCE_F32_3, 2, 4, reduce_mode::l1, { reduce::along_x }, true, "reduce_ref" }, + reduce_test_params{ CASE_REDUCE_F16_3, 2, 4, reduce_mode::min, { reduce::along_x }, true, "reduce_ref" }, + reduce_test_params{ CASE_REDUCE_I32_2, 2, 4, reduce_mode::max, { reduce::along_x, reduce::along_y }, true, "reduce_ref" }, + reduce_test_params{ CASE_REDUCE_I32_3, 2, 4, reduce_mode::sum, { reduce::along_x }, true, "reduce_ref" }, + reduce_test_params{ CASE_REDUCE_I8_3, 2, 4, reduce_mode::mean, { reduce::along_x }, true, 
"reduce_ref" }, + reduce_test_params{ CASE_REDUCE_U8_3, 2, 4, reduce_mode::l2, { reduce::along_x }, true, "reduce_ref" } +})); diff --git a/src/plugins/intel_gpu/tests/fusions/resample_fusion_test.cpp b/src/plugins/intel_gpu/tests/fusions/resample_fusion_test.cpp new file mode 100644 index 00000000000..859c942567c --- /dev/null +++ b/src/plugins/intel_gpu/tests/fusions/resample_fusion_test.cpp @@ -0,0 +1,310 @@ +// Copyright (C) 2018-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "test_utils.h" +#include "fusion_test_common.hpp" + +#include +#include +#include +#include +#include + +#include + +using namespace cldnn; +using namespace ::tests; + +namespace { +struct resample_test_params { + tensor in_shape; + tensor out_shape; + data_types data_type; + format input_format; + resample_type type; + data_types default_type; + format default_format; + size_t expected_fused_primitives; + size_t expected_not_fused_primitives; +}; + +class ResamplePrimitiveFusingTest : public ::BaseFusingTest { +public: + + void execute(resample_test_params& p) { + auto input_prim = get_mem(get_input_layout(p)); + network network_not_fused(this->engine, this->topology_non_fused, bo_not_fused); + network network_fused(this->engine, this->topology_fused, bo_fused); + network_fused.set_input_data("input", input_prim); + network_not_fused.set_input_data("input", input_prim); + + compare(network_not_fused, network_fused, p); + } + + layout get_input_layout(resample_test_params& p) { + return layout{ p.data_type, p.input_format, p.in_shape, padding{} }; + } + + layout get_per_channel_layout(resample_test_params& p) { + return layout{ p.default_type, p.default_format, tensor{ 1, p.out_shape.feature[0], 1, 1 } }; + } +}; + +} // namespace + +/* ----------------------------------------------------------------------------------------------------- */ +/* ---------------------------------------- Resample cases --------------------------------------------- */ +/* 
----------------------------------------------------------------------------------------------------- */ + +#define CASE_RESAMPLE_FP32_1 { 1, 15, 4, 5 }, { 1, 15, 2, 3 }, data_types::f32, format::bfyx, resample_type::nearest, data_types::f32, format::bfyx +#define CASE_RESAMPLE_FP32_2 { 1, 15, 4, 5 }, { 1, 15, 2, 3 }, data_types::f32, format::bfyx, resample_type::bilinear, data_types::f32, format::bfyx +#define CASE_RESAMPLE_FP32_3 { 1, 15, 4, 5 }, { 1, 15, 2, 3 }, data_types::f32, format::bfyx, resample_type::caffe_bilinear, data_types::f32, format::bfyx +#define CASE_RESAMPLE_FP32_4 { 1, 16, 4, 5 }, { 1, 16, 7, 8 }, data_types::f32, format::bfyx, resample_type::nearest, data_types::f32, format::bfyx +#define CASE_RESAMPLE_FP32_5 { 1, 16, 4, 5 }, { 1, 16, 7, 8 }, data_types::f32, format::bfyx, resample_type::bilinear, data_types::f32, format::bfyx +#define CASE_RESAMPLE_FP32_6 { 1, 16, 4, 5 }, { 1, 16, 7, 8 }, data_types::f32, format::bfyx, resample_type::caffe_bilinear, data_types::f32, format::bfyx +#define CASE_RESAMPLE_FP32_7 { 1, 16, 4, 5, 4 }, { 1, 16, 2, 3, 2 }, data_types::f32, format::bfzyx, resample_type::nearest, data_types::f32, format::bfzyx +#define CASE_RESAMPLE_FP32_8 { 1, 16, 4, 5, 4 }, { 1, 16, 2, 3, 2 }, data_types::f32, format::bfzyx, resample_type::caffe_bilinear, data_types::f32, format::bfzyx +#define CASE_RESAMPLE_FP32_9 { 1, 16, 4, 5 }, { 1, 16, 7, 8 }, data_types::f32, format::b_fs_yx_fsv16, resample_type::bilinear, data_types::f32, format::bfyx +#define CASE_RESAMPLE_FP32_10 { 1, 16, 4, 5 }, { 1, 16, 7, 8 }, data_types::f32, format::b_fs_yx_fsv16, resample_type::caffe_bilinear, data_types::f32, format::bfyx + +#define CASE_RESAMPLE_FP16_1 { 1, 15, 4, 5 }, { 1, 15, 2, 3 }, data_types::f16, format::bfyx, resample_type::nearest, data_types::f16, format::bfyx +#define CASE_RESAMPLE_FP16_2 { 1, 15, 4, 5 }, { 1, 15, 2, 3 }, data_types::f16, format::bfyx, resample_type::bilinear, data_types::f16, format::bfyx +#define CASE_RESAMPLE_FP16_3 { 1, 
15, 4, 5 }, { 1, 15, 2, 3 }, data_types::f16, format::bfyx, resample_type::caffe_bilinear, data_types::f16, format::bfyx +#define CASE_RESAMPLE_FP16_4 { 1, 16, 4, 5 }, { 1, 16, 7, 8 }, data_types::f16, format::bfyx, resample_type::nearest, data_types::f16, format::bfyx +#define CASE_RESAMPLE_FP16_5 { 1, 16, 4, 5 }, { 1, 16, 7, 8 }, data_types::f16, format::bfyx, resample_type::bilinear, data_types::f16, format::bfyx +#define CASE_RESAMPLE_FP16_6 { 1, 16, 4, 5 }, { 1, 16, 7, 8 }, data_types::f16, format::bfyx, resample_type::caffe_bilinear, data_types::f16, format::bfyx +#define CASE_RESAMPLE_FP16_7 { 1, 16, 4, 5, 4 }, { 1, 16, 2, 3, 2 }, data_types::f16, format::bfzyx, resample_type::nearest, data_types::f16, format::bfzyx +#define CASE_RESAMPLE_FP16_8 { 1, 16, 4, 5, 4 }, { 1, 16, 2, 3, 2 }, data_types::f16, format::bfzyx, resample_type::caffe_bilinear, data_types::f16, format::bfzyx +#define CASE_RESAMPLE_FP16_9 { 1, 16, 4, 5 }, { 1, 16, 7, 8 }, data_types::f16, format::b_fs_yx_fsv16, resample_type::bilinear, data_types::f16, format::bfyx +#define CASE_RESAMPLE_FP16_10 { 2, 32, 4, 5 }, { 2, 32, 7, 8 }, data_types::f16, format::fs_b_yx_fsv32, resample_type::bilinear, data_types::f16, format::bfyx +#define CASE_RESAMPLE_FP16_11 { 1, 16, 4, 5 }, { 1, 16, 7, 8 }, data_types::f16, format::b_fs_yx_fsv16, resample_type::caffe_bilinear, data_types::f16, format::bfyx +#define CASE_RESAMPLE_FP16_12 { 2, 32, 4, 5 }, { 2, 32, 7, 8 }, data_types::f16, format::fs_b_yx_fsv32, resample_type::caffe_bilinear, data_types::f16, format::bfyx +#define CASE_RESAMPLE_FP16_13 { 1, 16, 4, 5 }, { 1, 16, 7, 8 }, data_types::f16, format::b_fs_yx_fsv16, resample_type::caffe_bilinear, data_types::f16, format::bfyx +#define CASE_RESAMPLE_FP16_14 { 1, 32, 4, 5 }, { 1, 32, 2, 3 }, data_types::f16, format::fs_b_yx_fsv32, resample_type::caffe_bilinear, data_types::f16, format::bfyx + +#define CASE_RESAMPLE_I8_1 { 1, 16, 4, 5 }, { 1, 16, 2, 3 }, data_types::i8, format::b_fs_yx_fsv16, 
resample_type::nearest, data_types::f32, format::bfyx +#define CASE_RESAMPLE_I8_2 { 2, 32, 4, 5 }, { 2, 32, 2, 3 }, data_types::i8, format::b_fs_yx_fsv16, resample_type::nearest, data_types::f32, format::bfyx +#define CASE_RESAMPLE_I8_3 { 1, 16, 4, 5 }, { 1, 16, 2, 3 }, data_types::i8, format::b_fs_yx_fsv16, resample_type::bilinear, data_types::f32, format::bfyx +#define CASE_RESAMPLE_I8_4 { 2, 32, 4, 5 }, { 2, 32, 2, 3 }, data_types::i8, format::b_fs_yx_fsv16, resample_type::bilinear, data_types::f32, format::bfyx + +#define CASE_RESAMPLE_U8_1 { 1, 16, 4, 5 }, { 1, 16, 2, 3 }, data_types::u8, format::b_fs_yx_fsv16, resample_type::nearest, data_types::f32, format::bfyx +#define CASE_RESAMPLE_U8_2 { 2, 32, 4, 5 }, { 2, 32, 2, 3 }, data_types::u8, format::b_fs_yx_fsv16, resample_type::nearest, data_types::f32, format::bfyx +#define CASE_RESAMPLE_U8_3 { 1, 16, 4, 5 }, { 1, 16, 2, 3 }, data_types::u8, format::b_fs_yx_fsv16, resample_type::bilinear, data_types::f32, format::bfyx +#define CASE_RESAMPLE_U8_4 { 2, 32, 4, 5 }, { 2, 32, 2, 3 }, data_types::u8, format::b_fs_yx_fsv16, resample_type::bilinear, data_types::f32, format::bfyx + +class resample_quantize : public ResamplePrimitiveFusingTest {}; +TEST_P(resample_quantize, basic) { + auto p = GetParam(); + create_topologies( + input_layout("input", get_input_layout(p)), + data("in_lo", get_mem(get_per_channel_layout(p), min_random, 0)), + data("in_hi", get_mem(get_per_channel_layout(p), 1, max_random)), + data("out_lo", get_mem(get_single_element_layout(p), -127)), + data("out_hi", get_mem(get_single_element_layout(p), 127)), + resample("resample_prim", "input", p.out_shape, p.in_shape.feature[0], p.type), + quantize("quantize", "resample_prim", "in_lo", "in_hi", "out_lo", "out_hi", 255, data_types::i8), + reorder("reorder_bfyx", "quantize", p.default_format, data_types::f32) + ); + + tolerance = 1.f; + execute(p); +} + +INSTANTIATE_TEST_SUITE_P(fusings_gpu, resample_quantize, ::testing::ValuesIn(std::vector{ + 
resample_test_params{ CASE_RESAMPLE_FP32_1, 2, 3 }, + resample_test_params{ CASE_RESAMPLE_FP32_2, 2, 3 }, + resample_test_params{ CASE_RESAMPLE_FP32_3, 2, 3 }, + resample_test_params{ CASE_RESAMPLE_FP32_4, 2, 3 }, + resample_test_params{ CASE_RESAMPLE_FP32_5, 2, 3 }, + resample_test_params{ CASE_RESAMPLE_FP32_6, 2, 3 }, + resample_test_params{ CASE_RESAMPLE_FP32_7, 2, 3 }, + resample_test_params{ CASE_RESAMPLE_FP32_8, 2, 3 }, + resample_test_params{ CASE_RESAMPLE_FP32_9, 2, 3 }, + resample_test_params{ CASE_RESAMPLE_FP32_10, 2, 3 }, + + // FQ can't be fused to FP16 primitive for now + // resample_test_params{ CASE_RESAMPLE_FP16_1, 2, 3 }, + // resample_test_params{ CASE_RESAMPLE_FP16_2, 2, 3 }, + // resample_test_params{ CASE_RESAMPLE_FP16_3, 2, 3 }, + // resample_test_params{ CASE_RESAMPLE_FP16_4, 2, 3 }, + // resample_test_params{ CASE_RESAMPLE_FP16_5, 2, 3 }, + // resample_test_params{ CASE_RESAMPLE_FP16_6, 2, 3 }, + // resample_test_params{ CASE_RESAMPLE_FP16_7, 2, 3 }, + // resample_test_params{ CASE_RESAMPLE_FP16_8, 2, 3 }, + // resample_test_params{ CASE_RESAMPLE_FP16_9, 2, 3 }, +})); + +class resample_scale_activation_eltwise : public ResamplePrimitiveFusingTest {}; +TEST_P(resample_scale_activation_eltwise, basic) { + auto p = GetParam(); + create_topologies( + input_layout("input", get_input_layout(p)), + data("scale_data", get_mem(get_per_channel_layout(p), -10, 10)), + data("eltwise_data", get_mem(get_output_layout(p), -10, 10)), + resample("resample_prim", "input", p.out_shape, p.in_shape.feature[0], p.type), + scale("scale", "resample_prim", "scale_data"), + activation("activation", "scale", activation_func::abs), + eltwise("eltwise", { "activation", "eltwise_data" }, eltwise_mode::sum), + reorder("reorder_bfyx", "eltwise", p.default_format, data_types::f32) + ); + + tolerance = 1e-5f; + execute(p); +} + +INSTANTIATE_TEST_SUITE_P(fusings_gpu, resample_scale_activation_eltwise, ::testing::ValuesIn(std::vector{ + resample_test_params{ 
CASE_RESAMPLE_FP32_1, 2, 5 }, + resample_test_params{ CASE_RESAMPLE_FP32_2, 2, 5 }, + resample_test_params{ CASE_RESAMPLE_FP32_3, 2, 5 }, + resample_test_params{ CASE_RESAMPLE_FP32_4, 2, 5 }, + resample_test_params{ CASE_RESAMPLE_FP32_5, 2, 5 }, + resample_test_params{ CASE_RESAMPLE_FP32_6, 2, 5 }, + resample_test_params{ CASE_RESAMPLE_FP32_7, 2, 5 }, + resample_test_params{ CASE_RESAMPLE_FP32_8, 2, 5 }, + resample_test_params{ CASE_RESAMPLE_FP32_9, 2, 5 }, + resample_test_params{ CASE_RESAMPLE_FP32_10, 2, 5 }, + + resample_test_params{ CASE_RESAMPLE_FP16_1, 2, 5 }, + resample_test_params{ CASE_RESAMPLE_FP16_2, 2, 5 }, + resample_test_params{ CASE_RESAMPLE_FP16_3, 2, 5 }, + resample_test_params{ CASE_RESAMPLE_FP16_4, 2, 5 }, + resample_test_params{ CASE_RESAMPLE_FP16_5, 2, 5 }, + resample_test_params{ CASE_RESAMPLE_FP16_6, 2, 5 }, + resample_test_params{ CASE_RESAMPLE_FP16_7, 2, 5 }, + resample_test_params{ CASE_RESAMPLE_FP16_8, 2, 5 }, + resample_test_params{ CASE_RESAMPLE_FP16_9, 2, 5 }, + resample_test_params{ CASE_RESAMPLE_FP16_10, 2, 5 }, + resample_test_params{ CASE_RESAMPLE_FP16_11, 2, 5 }, + resample_test_params{ CASE_RESAMPLE_FP16_12, 2, 5 }, + resample_test_params{ CASE_RESAMPLE_FP16_13, 2, 5 }, + resample_test_params{ CASE_RESAMPLE_FP16_14, 2, 5 }, + + resample_test_params{ CASE_RESAMPLE_I8_1, 2, 5 }, + resample_test_params{ CASE_RESAMPLE_I8_2, 2, 5 }, + resample_test_params{ CASE_RESAMPLE_I8_3, 2, 5 }, + resample_test_params{ CASE_RESAMPLE_I8_4, 2, 5 }, + + resample_test_params{ CASE_RESAMPLE_U8_1, 2, 5 }, + resample_test_params{ CASE_RESAMPLE_U8_2, 2, 5 }, + resample_test_params{ CASE_RESAMPLE_U8_3, 2, 5 }, + resample_test_params{ CASE_RESAMPLE_U8_4, 2, 5 }, +})); + +class resample_quantize_concat : public ResamplePrimitiveFusingTest {}; +TEST_P(resample_quantize_concat, along_f) { + auto p = GetParam(); + create_topologies( + input_layout("input", get_input_layout(p)), + resample("resample1", "input", p.out_shape, p.in_shape.feature[0], p.type), + 
data("in_lo_1", get_mem(get_per_channel_layout(p), min_random, 0)), + data("in_hi_1", get_mem(get_per_channel_layout(p), 1, max_random)), + data("out_lo_1", get_mem(get_single_element_layout(p), -128)), + data("out_hi_1", get_mem(get_single_element_layout(p), 127)), + quantize("quant1", "resample1", "in_lo_1", "in_hi_1", "out_lo_1", "out_hi_1", 256, data_types::i8), + resample("resample2", "input", p.out_shape, p.in_shape.feature[0], p.type), + data("in_lo_2", get_mem(get_per_channel_layout(p), min_random, 0)), + data("in_hi_2", get_mem(get_per_channel_layout(p), 1, max_random)), + data("out_lo_2", get_mem(get_single_element_layout(p), -127)), + data("out_hi_2", get_mem(get_single_element_layout(p), 127)), + quantize("quant2", "resample2", "in_lo_2", "in_hi_2", "out_lo_2", "out_hi_2", 255, data_types::i8), + concatenation("concat", { "quant1", "quant2" }, cldnn::concatenation::along_f), + reorder("reorder_bfyx", "concat", cldnn::format::bfyx, p.default_type) + ); + + tolerance = 1.f; + execute(p); +} + +INSTANTIATE_TEST_SUITE_P(fusings_gpu, resample_quantize_concat, ::testing::ValuesIn(std::vector{ + resample_test_params{ CASE_RESAMPLE_FP32_1, 3, 6 }, + resample_test_params{ CASE_RESAMPLE_FP32_2, 3, 6 }, + resample_test_params{ CASE_RESAMPLE_FP32_3, 3, 6 }, + resample_test_params{ CASE_RESAMPLE_FP32_4, 3, 6 }, + resample_test_params{ CASE_RESAMPLE_FP32_5, 3, 6 }, + resample_test_params{ CASE_RESAMPLE_FP32_6, 3, 6 }, + resample_test_params{ CASE_RESAMPLE_FP32_7, 3, 6 }, + resample_test_params{ CASE_RESAMPLE_FP32_8, 3, 6 }, + resample_test_params{ CASE_RESAMPLE_FP32_9, 3, 6 }, + resample_test_params{ CASE_RESAMPLE_FP32_10, 3, 6 }, + + resample_test_params{ CASE_RESAMPLE_FP16_1, 3, 6 }, + resample_test_params{ CASE_RESAMPLE_FP16_2, 3, 6 }, + resample_test_params{ CASE_RESAMPLE_FP16_3, 3, 6 }, + resample_test_params{ CASE_RESAMPLE_FP16_4, 3, 6 }, + resample_test_params{ CASE_RESAMPLE_FP16_5, 3, 6 }, + resample_test_params{ CASE_RESAMPLE_FP16_6, 3, 6 }, + 
resample_test_params{ CASE_RESAMPLE_FP16_7, 3, 6 }, + resample_test_params{ CASE_RESAMPLE_FP16_8, 3, 6 }, + resample_test_params{ CASE_RESAMPLE_FP16_9, 3, 6 }, + resample_test_params{ CASE_RESAMPLE_FP16_10, 3, 6 }, + resample_test_params{ CASE_RESAMPLE_FP16_11, 3, 6 }, + resample_test_params{ CASE_RESAMPLE_FP16_12, 3, 6 }, + resample_test_params{ CASE_RESAMPLE_FP16_13, 3, 6 }, + resample_test_params{ CASE_RESAMPLE_FP16_14, 3, 6 }, + + resample_test_params{ CASE_RESAMPLE_I8_3, 3, 6 }, + resample_test_params{ CASE_RESAMPLE_I8_4, 3, 6 }, + + resample_test_params{ CASE_RESAMPLE_U8_3, 3, 6 }, + resample_test_params{ CASE_RESAMPLE_U8_4, 3, 6 }, +})); + +class resample_scale_concat : public ResamplePrimitiveFusingTest {}; +TEST_P(resample_scale_concat, along_f) { + auto p = GetParam(); + create_topologies( + input_layout("input", get_input_layout(p)), + resample("resample1", "input", p.out_shape, p.in_shape.feature[0], p.type), + data("scale1_scale", get_mem(get_per_channel_layout(p), -10, 10)), + data("scale1_shift", get_mem(get_per_channel_layout(p), -10, 10)), + scale("scale1", "resample1", "scale1_scale", "scale1_shift"), + resample("resample2", "input", p.out_shape, p.in_shape.feature[0], p.type), + data("scale2_scale", get_mem(get_per_channel_layout(p), -10, 10)), + data("scale2_shift", get_mem(get_per_channel_layout(p), -10, 10)), + scale("scale2", "resample2", "scale2_scale", "scale2_shift"), + concatenation("concat", { "scale1", "scale2" }, cldnn::concatenation::along_f), + reorder("reorder_bfyx", "concat", cldnn::format::bfyx, p.default_type) + ); + + tolerance = 1e-5f; + execute(p); +} + +INSTANTIATE_TEST_SUITE_P(fusings_gpu, resample_scale_concat, ::testing::ValuesIn(std::vector{ + resample_test_params{ CASE_RESAMPLE_FP32_1, 3, 6 }, + resample_test_params{ CASE_RESAMPLE_FP32_2, 3, 6 }, + resample_test_params{ CASE_RESAMPLE_FP32_3, 3, 6 }, + resample_test_params{ CASE_RESAMPLE_FP32_4, 3, 6 }, + resample_test_params{ CASE_RESAMPLE_FP32_5, 3, 6 }, + 
resample_test_params{ CASE_RESAMPLE_FP32_6, 3, 6 }, + resample_test_params{ CASE_RESAMPLE_FP32_7, 3, 6 }, + resample_test_params{ CASE_RESAMPLE_FP32_8, 3, 6 }, + resample_test_params{ CASE_RESAMPLE_FP32_9, 3, 6 }, + resample_test_params{ CASE_RESAMPLE_FP32_10, 3, 6 }, + + resample_test_params{ CASE_RESAMPLE_FP16_1, 3, 6 }, + resample_test_params{ CASE_RESAMPLE_FP16_2, 3, 6 }, + resample_test_params{ CASE_RESAMPLE_FP16_3, 3, 6 }, + resample_test_params{ CASE_RESAMPLE_FP16_4, 3, 6 }, + resample_test_params{ CASE_RESAMPLE_FP16_5, 3, 6 }, + resample_test_params{ CASE_RESAMPLE_FP16_6, 3, 6 }, + resample_test_params{ CASE_RESAMPLE_FP16_7, 3, 6 }, + resample_test_params{ CASE_RESAMPLE_FP16_8, 3, 6 }, + resample_test_params{ CASE_RESAMPLE_FP16_9, 3, 6 }, + resample_test_params{ CASE_RESAMPLE_FP16_10, 3, 6 }, + resample_test_params{ CASE_RESAMPLE_FP16_11, 3, 6 }, + resample_test_params{ CASE_RESAMPLE_FP16_12, 3, 6 }, + resample_test_params{ CASE_RESAMPLE_FP16_13, 3, 6 }, + resample_test_params{ CASE_RESAMPLE_FP16_14, 3, 6 }, + + resample_test_params{ CASE_RESAMPLE_I8_1, 3, 6 }, + resample_test_params{ CASE_RESAMPLE_I8_2, 3, 6 }, + resample_test_params{ CASE_RESAMPLE_I8_3, 3, 6 }, + resample_test_params{ CASE_RESAMPLE_I8_4, 3, 6 }, + + resample_test_params{ CASE_RESAMPLE_U8_1, 3, 6 }, + resample_test_params{ CASE_RESAMPLE_U8_2, 3, 6 }, + resample_test_params{ CASE_RESAMPLE_U8_3, 3, 6 }, + resample_test_params{ CASE_RESAMPLE_U8_4, 3, 6 }, +})); diff --git a/src/plugins/intel_gpu/tests/fusions/scatter_elements_update_fusion_test.cpp b/src/plugins/intel_gpu/tests/fusions/scatter_elements_update_fusion_test.cpp new file mode 100644 index 00000000000..03a1bbc7b35 --- /dev/null +++ b/src/plugins/intel_gpu/tests/fusions/scatter_elements_update_fusion_test.cpp @@ -0,0 +1,174 @@ +// Copyright (C) 2018-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "test_utils.h" +#include "fusion_test_common.hpp" + +#include +#include +#include +#include +#include + 
+#include + +using namespace cldnn; +using namespace ::tests; + +namespace { +struct scatter_elements_update_test_params { + tensor input_shape; + tensor indices_shape; + cldnn::scatter_elements_update::scatter_elements_update_axis axis; + data_types data_type; + format input_format; + data_types default_type; + format default_format; + size_t expected_fused_primitives; + size_t expected_not_fused_primitives; +}; + +class ScatterElementsUpdatePrimitiveFusingTest : public ::BaseFusingTest{ +public: + void execute(scatter_elements_update_test_params& p) { + + auto input_prim = get_mem(get_input_layout(p), -5, 5); + network network_not_fused(this->engine, this->topology_non_fused, bo_not_fused); + network network_fused(this->engine, this->topology_fused, bo_fused); + network_fused.set_input_data("input", input_prim); + network_not_fused.set_input_data("input", input_prim); + compare(network_not_fused, network_fused, p); + } + + layout get_input_layout(scatter_elements_update_test_params& p) { + return layout{ p.data_type, p.input_format, p.input_shape }; + } + + layout get_indices_layout(scatter_elements_update_test_params& p) { + return layout{ p.data_type, p.input_format, p.indices_shape }; + } + + layout get_updates_layout(scatter_elements_update_test_params& p) { + return layout{ p.data_type, p.input_format, p.indices_shape }; + } + + size_t get_axis_dim(scatter_elements_update_test_params& p) { + switch (p.axis) { + case cldnn::scatter_elements_update::scatter_elements_update_axis::along_x: + return p.input_shape.spatial[0]; + case cldnn::scatter_elements_update::scatter_elements_update_axis::along_y: + return p.input_shape.spatial[1]; + case cldnn::scatter_elements_update::scatter_elements_update_axis::along_z: + return p.input_shape.spatial[2]; + case cldnn::scatter_elements_update::scatter_elements_update_axis::along_w: + return p.input_shape.spatial[3]; + case cldnn::scatter_elements_update::scatter_elements_update_axis::along_f: + return 
p.input_shape.feature[0]; + case cldnn::scatter_elements_update::scatter_elements_update_axis::along_b: + return p.input_shape.batch[0]; + default: + return 1; + } + } + + layout get_per_channel_layout(scatter_elements_update_test_params& p) { + return layout{ p.default_type, p.default_format, tensor{ 1, p.input_shape.feature[0], 1, 1 } }; + } +}; +} // namespace + +/* ----------------------------------------------------------------------------------------------------- */ +/* ------------------------------------------ ScatterElementsUpdate cases ------------------------------ */ +/* ----------------------------------------------------------------------------------------------------- */ + +// input shape along the update axis should be larger than the total number of elements in the update tensor. +// This is not a limitation of operation itself, but a limitation of test implementation. +#define CASE_SCATTER_ELEMENTS_UPDATE_FP32_1 { 8, 4, 1, 1 }, { 2, 4, 1, 1 }, cldnn::scatter_elements_update::scatter_elements_update_axis::along_b, data_types::f32, format::bfyx, data_types::f32, format::bfyx +#define CASE_SCATTER_ELEMENTS_UPDATE_FP32_2 { 2, 8, 1, 2 }, { 2, 2, 1, 2 }, cldnn::scatter_elements_update::scatter_elements_update_axis::along_f, data_types::f32, format::bfyx, data_types::f32, format::bfyx +#define CASE_SCATTER_ELEMENTS_UPDATE_FP32_3 { 2, 3, 10, 10 }, { 2, 2, 1, 2 }, cldnn::scatter_elements_update::scatter_elements_update_axis::along_y, data_types::f32, format::bfyx, data_types::f32, format::bfyx + +#define CASE_SCATTER_ELEMENTS_UPDATE_FP16_1 { 2, 2, 14, 12 }, { 2, 2, 3, 1 }, cldnn::scatter_elements_update::scatter_elements_update_axis::along_x, data_types::f16, format::bfyx, data_types::f16, format::bfyx + +#define CASE_SCATTER_ELEMENTS_UPDATE_5D_FP32_1 { 24, 3, 1, 4, 1 }, { 4, 3, 1, 2, 1 }, cldnn::scatter_elements_update::scatter_elements_update_axis::along_b, data_types::f32, format::bfzyx, data_types::f32, format::bfzyx +#define 
CASE_SCATTER_ELEMENTS_UPDATE_5D_FP32_2 { 2, 17, 2, 2, 2 }, { 1, 2, 2, 2, 2 }, cldnn::scatter_elements_update::scatter_elements_update_axis::along_f, data_types::f32, format::bfzyx, data_types::f32, format::bfzyx +#define CASE_SCATTER_ELEMENTS_UPDATE_5D_FP32_3 { 5, 3, 2, 20, 22 }, { 5, 1, 1, 2, 2 }, cldnn::scatter_elements_update::scatter_elements_update_axis::along_y, data_types::f32, format::bfzyx, data_types::f32, format::bfzyx + +#define CASE_SCATTER_ELEMENTS_UPDATE_5D_FP16_1 { 13, 2, 1, 2, 1 }, { 2, 2, 1, 2, 1 }, cldnn::scatter_elements_update::scatter_elements_update_axis::along_b, data_types::f16, format::bfzyx, data_types::f16, format::bfzyx +#define CASE_SCATTER_ELEMENTS_UPDATE_5D_FP16_2 { 1, 13, 1, 2, 1 }, { 1, 2, 1, 2, 1 }, cldnn::scatter_elements_update::scatter_elements_update_axis::along_f, data_types::f16, format::bfzyx, data_types::f16, format::bfzyx +#define CASE_SCATTER_ELEMENTS_UPDATE_5D_FP16_3 { 2, 3, 1, 13, 13 }, { 2, 3, 1, 2, 1 }, cldnn::scatter_elements_update::scatter_elements_update_axis::along_y, data_types::f16, format::bfzyx, data_types::f16, format::bfzyx + +class scatter_elements_update_quantize : public ScatterElementsUpdatePrimitiveFusingTest {}; +TEST_P(scatter_elements_update_quantize, basic) { + auto p = GetParam(); + const auto &seu = scatter_elements_update("scatter_elements_update_prim", "input", "scatter_elements_update_indices", "scatter_elements_update_updates", p.axis); + const auto &q = quantize("quantize", "scatter_elements_update_prim", "in_lo", "in_hi", "out_lo", "out_hi", 255, data_types::i8); + const auto &r = reorder("reorder_bfyx", "quantize", p.default_format, data_types::f32); + create_topologies( + input_layout("input", get_input_layout(p)), + data("scatter_elements_update_indices", get_repeatless_mem(get_indices_layout(p), 0, static_cast(get_axis_dim(p)) - 1)), + data("scatter_elements_update_updates", get_mem(get_updates_layout(p), 0, 100)), + data("in_lo", get_mem(get_per_channel_layout(p), min_random, 0)), + 
data("in_hi", get_mem(get_per_channel_layout(p), 1, max_random)), + data("out_lo", get_mem(get_single_element_layout(p), -127)), + data("out_hi", get_mem(get_single_element_layout(p), 127)), + seu, + q, + r + ); + + tolerance = 1.f; + execute(p); +} + +INSTANTIATE_TEST_SUITE_P(fusings_gpu, scatter_elements_update_quantize, ::testing::ValuesIn(std::vector{ + scatter_elements_update_test_params{ CASE_SCATTER_ELEMENTS_UPDATE_FP32_1, 2, 3 }, + scatter_elements_update_test_params{ CASE_SCATTER_ELEMENTS_UPDATE_FP32_2, 2, 3 }, + scatter_elements_update_test_params{ CASE_SCATTER_ELEMENTS_UPDATE_FP32_3, 2, 3 }, + + scatter_elements_update_test_params{ CASE_SCATTER_ELEMENTS_UPDATE_FP16_1, 2, 3 }, + + scatter_elements_update_test_params{ CASE_SCATTER_ELEMENTS_UPDATE_5D_FP32_1, 2, 3 }, + scatter_elements_update_test_params{ CASE_SCATTER_ELEMENTS_UPDATE_5D_FP32_2, 2, 3 }, + scatter_elements_update_test_params{ CASE_SCATTER_ELEMENTS_UPDATE_5D_FP32_3, 2, 3 }, + + scatter_elements_update_test_params{ CASE_SCATTER_ELEMENTS_UPDATE_5D_FP16_1, 2, 3 }, + scatter_elements_update_test_params{ CASE_SCATTER_ELEMENTS_UPDATE_5D_FP16_2, 2, 3 }, + scatter_elements_update_test_params{ CASE_SCATTER_ELEMENTS_UPDATE_5D_FP16_3, 2, 3 }, +})); + +class scatter_elements_update_scale_activation_eltwise : public ScatterElementsUpdatePrimitiveFusingTest {}; +TEST_P(scatter_elements_update_scale_activation_eltwise, basic) { + auto p = GetParam(); + create_topologies( + input_layout("input", get_input_layout(p)), + data("scatter_elements_update_indices", get_repeatless_mem(get_indices_layout(p), 0, static_cast(get_axis_dim(p)) - 1)), + data("scatter_elements_update_updates", get_mem(get_updates_layout(p), 0, 5)), + data("scale_data", get_mem(get_per_channel_layout(p), -1, 1)), + data("eltwise_data", get_mem(layout{ p.data_type, p.input_format, p.input_shape })), + scatter_elements_update("scatter_elements_update_prim", "input", "scatter_elements_update_indices", "scatter_elements_update_updates", p.axis), 
+ activation("activation", "scatter_elements_update_prim", activation_func::abs), + scale("scale", "activation", "scale_data"), + eltwise("eltwise", { "scale", "eltwise_data" }, eltwise_mode::sum, p.data_type), + reorder("reorder_bfyx", "eltwise", p.default_format, data_types::f32) + ); + + tolerance = 1e-2f; + execute(p); +} + +INSTANTIATE_TEST_SUITE_P(fusings_gpu, scatter_elements_update_scale_activation_eltwise, ::testing::ValuesIn(std::vector{ + scatter_elements_update_test_params{ CASE_SCATTER_ELEMENTS_UPDATE_FP32_1, 2, 5 }, + scatter_elements_update_test_params{ CASE_SCATTER_ELEMENTS_UPDATE_FP32_2, 2, 5 }, + scatter_elements_update_test_params{ CASE_SCATTER_ELEMENTS_UPDATE_FP32_3, 2, 5 }, + + scatter_elements_update_test_params{ CASE_SCATTER_ELEMENTS_UPDATE_FP16_1, 2, 5 }, + + scatter_elements_update_test_params{ CASE_SCATTER_ELEMENTS_UPDATE_5D_FP32_1, 2, 5 }, + scatter_elements_update_test_params{ CASE_SCATTER_ELEMENTS_UPDATE_5D_FP32_2, 2, 5 }, + scatter_elements_update_test_params{ CASE_SCATTER_ELEMENTS_UPDATE_5D_FP32_3, 2, 5 }, + + scatter_elements_update_test_params{ CASE_SCATTER_ELEMENTS_UPDATE_5D_FP16_1, 2, 5 }, + scatter_elements_update_test_params{ CASE_SCATTER_ELEMENTS_UPDATE_5D_FP16_2, 2, 5 }, + scatter_elements_update_test_params{ CASE_SCATTER_ELEMENTS_UPDATE_5D_FP16_3, 2, 5 }, +})); diff --git a/src/plugins/intel_gpu/tests/fusions/scatter_nd_update_fusion_test.cpp b/src/plugins/intel_gpu/tests/fusions/scatter_nd_update_fusion_test.cpp new file mode 100644 index 00000000000..5307a9d734a --- /dev/null +++ b/src/plugins/intel_gpu/tests/fusions/scatter_nd_update_fusion_test.cpp @@ -0,0 +1,301 @@ +// Copyright (C) 2018-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "test_utils.h" +#include "fusion_test_common.hpp" + +#include +#include +#include +#include +#include + +#include + +using namespace cldnn; +using namespace ::tests; + +namespace { +struct scatter_nd_update_test_params { + tensor input_shape; + tensor 
indices_shape; + tensor updates_shape; + int indices_rank; + data_types data_type; + format input_format; + data_types default_type; + format default_format; + size_t expected_fused_primitives; + size_t expected_not_fused_primitives; +}; + +class ScatterNDUpdatePrimitiveFusingTest : public ::BaseFusingTest { +public: + void execute(scatter_nd_update_test_params& p) { + auto input_prim = get_mem(get_input_layout(p)); + network network_not_fused(this->engine, this->topology_non_fused, bo_not_fused); + network network_fused(this->engine, this->topology_fused, bo_fused); + network_fused.set_input_data("input", input_prim); + network_not_fused.set_input_data("input", input_prim); + compare(network_not_fused, network_fused, p); + } + + layout get_input_layout(scatter_nd_update_test_params& p) { + return layout{ p.data_type, p.input_format, p.input_shape }; + } + + layout get_indices_layout(scatter_nd_update_test_params& p) { + return layout{ p.data_type, get_default_format(p.indices_rank), p.indices_shape }; + } + + layout get_updates_layout(scatter_nd_update_test_params& p) { + return layout{ p.data_type, p.input_format, p.updates_shape }; + } + + layout get_per_channel_layout(scatter_nd_update_test_params& p) { + return layout{ p.default_type, p.default_format, tensor{ 1, p.input_shape.feature[0], 1, 1 } }; + } + + format get_default_format(int rank = 4) { + if (rank <= 4) + return cldnn::format::bfyx; + else if (rank == 5) + return cldnn::format::bfzyx; + else + return cldnn::format::bfwzyx; + } + + template + T generate_random_val(int min, int max, int k = 8) { + static std::default_random_engine generator(random_seed); + // 1/k is the resolution of the floating point numbers + std::uniform_int_distribution distribution(k * min, k * max); + T val = (T)distribution(generator); + val /= k; + + return val; + } + + template + std::vector generate_unique_indices(scatter_nd_update_test_params& p) { + std::set> unique_indices; + std::vector result; + auto indices_shape = 
p.indices_shape.sizes(get_default_format(p.indices_rank)); + auto last_indices_dim = indices_shape.back(); + + auto count = 1; + for (size_t i = 0; i < indices_shape.size() - 1; i++) + count *= indices_shape[i]; + + while (unique_indices.size() != count) { + std::vector indices; + for (size_t i = 0; i < last_indices_dim; i++) + indices.push_back(generate_random_val(0, indices_shape[i])); + + unique_indices.insert(indices); + } + + std::for_each(unique_indices.begin(), + unique_indices.end(), + [&](const std::vector& indices) { + result.insert(result.end(), indices.begin(), indices.end()); + }); + + return result; + } + + cldnn::memory::ptr get_indices_mem(scatter_nd_update_test_params& p) { + auto indices_layout = get_indices_layout(p); + auto prim = engine.allocate_memory(indices_layout); + if (indices_layout.data_type == data_types::f32) { + VF rnd_vec = generate_unique_indices(p); + set_values(prim, rnd_vec); + } else if (indices_layout.data_type == data_types::f16) { + VF rnd_vec = generate_unique_indices(p); + set_values(prim, rnd_vec); + } else if (indices_layout.data_type == data_types::i8) { + VF rnd_vec = generate_unique_indices(p); + set_values(prim, rnd_vec); + } else { + throw std::runtime_error("Unsupported data type for indicies of scatter_nd_update primitive"); + } + + return prim; + } +}; +} // namespace + +/* ----------------------------------------------------------------------------------------------- */ +/* ------------------------------------ ScatterNDUpdate cases ------------------------------------ */ +/* ----------------------------------------------------------------------------------------------- */ +#define CASE_SCATTER_ND_UPDATE_FP16_4D_1 { 6, 1, 1, 1 }, { 3, 1, 1, 1 }, { 3, 1, 1, 1 }, 1, data_types::f16, format::bfyx, data_types::f16, format::bfyx +#define CASE_SCATTER_ND_UPDATE_FP16_4D_2 { 6, 6, 1, 1 }, { 3, 2, 1, 1 }, { 3, 1, 1, 1 }, 2, data_types::f16, format::bfyx, data_types::f16, format::bfyx +#define 
CASE_SCATTER_ND_UPDATE_FP16_4D_3 { 6, 7, 8, 9 }, { 5, 1, 1, 1 }, { 5, 7, 8, 9 }, 2, data_types::f16, format::bfyx, data_types::f16, format::bfyx +#define CASE_SCATTER_ND_UPDATE_FP16_4D_4 { 6, 7, 8, 9 }, { 5, 1, 1, 1 }, { 5, 7, 8, 9 }, 2, data_types::f16, format::bfyx, data_types::f16, format::bfyx +#define CASE_SCATTER_ND_UPDATE_FP16_4D_5 { 6, 7, 8, 9 }, { 6, 2, 1, 1 }, { 6, 9, 1, 8 }, 2, data_types::f16, format::bfyx, data_types::f16, format::bfyx +#define CASE_SCATTER_ND_UPDATE_FP16_4D_6 { 6, 7, 8, 9 }, { 6, 3, 1, 1 }, { 6, 8, 1, 1 }, 2, data_types::f16, format::bfyx, data_types::f16, format::bfyx + +#define CASE_SCATTER_ND_UPDATE_FP16_5D_1 { 6, 7, 8, 9, 10 }, { 5, 1, 1, 1 }, { 5, 7, 8, 9, 10 }, 1, data_types::f16, format::bfzyx, data_types::f16, format::bfyx +#define CASE_SCATTER_ND_UPDATE_FP16_5D_2 { 6, 7, 8, 9, 10 }, { 5, 2, 1, 1 }, { 5, 10, 1, 8, 9 }, 2, data_types::f16, format::bfzyx, data_types::f16, format::bfyx +#define CASE_SCATTER_ND_UPDATE_FP16_5D_3 { 6, 7, 8, 9, 10 }, { 5, 3, 1, 1 }, { 5, 9, 1, 1, 8 }, 2, data_types::f16, format::bfzyx, data_types::f16, format::bfyx +#define CASE_SCATTER_ND_UPDATE_FP16_5D_4 { 6, 7, 8, 9, 10 }, { 5, 4, 1, 1 }, { 5, 8, 1, 1, 1 }, 2, data_types::f16, format::bfzyx, data_types::f16, format::bfyx +#define CASE_SCATTER_ND_UPDATE_FP16_5D_5 { 6, 7, 8, 9, 10 }, { 5, 5, 1, 1 }, { 5, 1, 1, 1, 1 }, 2, data_types::f16, format::bfzyx, data_types::f16, format::bfyx +#define CASE_SCATTER_ND_UPDATE_FP16_5D_6 { 6, 7, 8, 9, 10 }, { 5, 2, 1, 2 }, { 5, 2, 8, 9, 10 }, 3, data_types::f16, format::bfzyx, data_types::f16, format::bfyx +#define CASE_SCATTER_ND_UPDATE_FP16_5D_7 { 6, 7, 8, 9, 10 }, { 5, 2, 1, 3 }, { 5, 2, 1, 8, 9 }, 3, data_types::f16, format::bfzyx, data_types::f16, format::bfyx +#define CASE_SCATTER_ND_UPDATE_FP16_5D_8 { 6, 7, 8, 9, 10 }, { 5, 2, 4, 3 }, { 5, 2, 1, 8, 3 }, 4, data_types::f16, format::bfzyx, data_types::f16, format::bfyx +#define CASE_SCATTER_ND_UPDATE_FP16_5D_9 { 6, 7, 8, 9, 10 }, { 5, 2, 3, 3 }, { 5, 2, 8, 9, 
3 }, 4, data_types::f16, format::bfzyx, data_types::f16, format::bfyx + +#define CASE_SCATTER_ND_UPDATE_FP16_6D_1 { 6, 7, 8, 9, 10, 11 }, { 5, 1, 1, 1 }, { 5, 7, 8, 9, 10, 11 }, 1, data_types::f16, format::bfwzyx, data_types::f16, format::bfyx +#define CASE_SCATTER_ND_UPDATE_FP16_6D_2 { 6, 7, 8, 9, 10, 11 }, { 5, 2, 1, 1 }, { 5, 11, 1, 8, 9, 10 }, 2, data_types::f16, format::bfwzyx, data_types::f16, format::bfyx +#define CASE_SCATTER_ND_UPDATE_FP16_6D_3 { 6, 7, 8, 9, 10, 11 }, { 5, 3, 1, 1 }, { 5, 10, 1, 1, 8, 9 }, 2, data_types::f16, format::bfwzyx, data_types::f16, format::bfyx +#define CASE_SCATTER_ND_UPDATE_FP16_6D_4 { 6, 7, 8, 9, 10, 11 }, { 5, 4, 1, 1 }, { 5, 9, 1, 1, 1, 8 }, 2, data_types::f16, format::bfwzyx, data_types::f16, format::bfyx +#define CASE_SCATTER_ND_UPDATE_FP16_6D_5 { 6, 7, 8, 9, 2, 2 }, { 5, 5, 1, 1 }, { 5, 8, 1, 1, 1, 1 }, 2, data_types::f16, format::bfwzyx, data_types::f16, format::bfyx +#define CASE_SCATTER_ND_UPDATE_FP16_6D_6 { 6, 7, 8, 9, 2, 2 }, { 5, 6, 1, 1 }, { 5, 1, 1, 1, 1, 1 }, 2, data_types::f16, format::bfwzyx, data_types::f16, format::bfyx + +#define CASE_SCATTER_ND_UPDATE_FP32_4D_1 { 6, 1, 1, 1 }, { 3, 1, 1, 1 }, { 3, 1, 1, 1 }, 1, data_types::f32, format::bfyx, data_types::f32, format::bfyx +#define CASE_SCATTER_ND_UPDATE_FP32_4D_2 { 6, 6, 1, 1 }, { 3, 2, 1, 1 }, { 3, 1, 1, 1 }, 2, data_types::f32, format::bfyx, data_types::f32, format::bfyx +#define CASE_SCATTER_ND_UPDATE_FP32_4D_3 { 6, 7, 8, 1 }, { 5, 1, 1, 1 }, { 5, 7, 8, 1 }, 2, data_types::f32, format::bfyx, data_types::f32, format::bfyx +#define CASE_SCATTER_ND_UPDATE_FP32_4D_4 { 6, 7, 8, 9 }, { 5, 1, 1, 1 }, { 5, 7, 8, 9 }, 2, data_types::f32, format::bfyx, data_types::f32, format::bfyx +#define CASE_SCATTER_ND_UPDATE_FP32_4D_5 { 6, 7, 8, 9 }, { 6, 2, 1, 1 }, { 6, 9, 1, 8 }, 2, data_types::f32, format::bfyx, data_types::f32, format::bfyx +#define CASE_SCATTER_ND_UPDATE_FP32_4D_6 { 6, 7, 8, 9 }, { 6, 3, 1, 1 }, { 6, 8, 1, 1 }, 2, data_types::f32, format::bfyx, 
data_types::f32, format::bfyx + +#define CASE_SCATTER_ND_UPDATE_FP32_5D_1 { 6, 7, 8, 9, 10 }, { 5, 1, 1, 1 }, { 5, 7, 8, 9, 10 }, 1, data_types::f32, format::bfzyx, data_types::f32, format::bfyx +#define CASE_SCATTER_ND_UPDATE_FP32_5D_2 { 6, 7, 8, 9, 10 }, { 5, 2, 1, 1 }, { 5, 10, 1, 8, 9 }, 2, data_types::f32, format::bfzyx, data_types::f32, format::bfyx +#define CASE_SCATTER_ND_UPDATE_FP32_5D_3 { 6, 7, 8, 9, 10 }, { 5, 3, 1, 1 }, { 5, 9, 1, 1, 8 }, 2, data_types::f32, format::bfzyx, data_types::f32, format::bfyx +#define CASE_SCATTER_ND_UPDATE_FP32_5D_4 { 6, 7, 8, 9, 10 }, { 5, 4, 1, 1 }, { 5, 8, 1, 1, 1 }, 2, data_types::f32, format::bfzyx, data_types::f32, format::bfyx +#define CASE_SCATTER_ND_UPDATE_FP32_5D_5 { 6, 7, 8, 9, 10 }, { 5, 5, 1, 1 }, { 5, 1, 1, 1, 1 }, 2, data_types::f32, format::bfzyx, data_types::f32, format::bfyx + +#define CASE_SCATTER_ND_UPDATE_FP32_6D_1 { 6, 7, 8, 9, 10, 11 }, { 5, 1, 1, 1 }, { 5, 7, 8, 9, 10, 11 }, 1, data_types::f32, format::bfwzyx, data_types::f32, format::bfyx +#define CASE_SCATTER_ND_UPDATE_FP32_6D_2 { 6, 7, 8, 9, 10, 11 }, { 5, 2, 1, 1 }, { 5, 11, 1, 8, 9, 10 }, 2, data_types::f32, format::bfwzyx, data_types::f32, format::bfyx +#define CASE_SCATTER_ND_UPDATE_FP32_6D_3 { 6, 7, 8, 9, 10, 11 }, { 5, 3, 1, 1 }, { 5, 10, 1, 1, 8, 9 }, 2, data_types::f32, format::bfwzyx, data_types::f32, format::bfyx +#define CASE_SCATTER_ND_UPDATE_FP32_6D_4 { 6, 7, 8, 9, 10, 11 }, { 5, 4, 1, 1 }, { 5, 9, 1, 1, 1, 8 }, 2, data_types::f32, format::bfwzyx, data_types::f32, format::bfyx +#define CASE_SCATTER_ND_UPDATE_FP32_6D_5 { 6, 7, 8, 9, 2, 2 }, { 5, 5, 1, 1 }, { 5, 8, 1, 1, 1, 1 }, 2, data_types::f32, format::bfwzyx, data_types::f32, format::bfyx +#define CASE_SCATTER_ND_UPDATE_FP32_6D_6 { 6, 7, 8, 9, 2, 2 }, { 5, 6, 1, 1 }, { 5, 1, 1, 1, 1, 1 }, 2, data_types::f32, format::bfwzyx, data_types::f32, format::bfyx + +class scatter_nd_update_quantize : public ScatterNDUpdatePrimitiveFusingTest {}; +TEST_P(scatter_nd_update_quantize, basic) { + 
auto p = GetParam(); + create_topologies( + input_layout("input", get_input_layout(p)), + data("scatter_nd_update_indices", get_indices_mem(p)), + data("scatter_nd_update_updates", get_mem(get_updates_layout(p), 0, 100)), + data("in_lo", get_mem(get_per_channel_layout(p), min_random, 0)), + data("in_hi", get_mem(get_per_channel_layout(p), 1, max_random)), + data("out_lo", get_mem(get_single_element_layout(p), -127)), + data("out_hi", get_mem(get_single_element_layout(p), 127)), + scatter_nd_update("scatter_nd_update_prim", "input", "scatter_nd_update_indices", "scatter_nd_update_updates", p.indices_rank), + quantize("quantize", "scatter_nd_update_prim", "in_lo", "in_hi", "out_lo", "out_hi", 255, data_types::i8), + reorder("reorder_bfyx", "quantize", p.input_format, data_types::f32) + ); + + tolerance = 1.f; + execute(p); +} + +INSTANTIATE_TEST_SUITE_P(fusings_gpu, scatter_nd_update_quantize, ::testing::ValuesIn(std::vector{ + scatter_nd_update_test_params{ CASE_SCATTER_ND_UPDATE_FP16_4D_1, 2, 3 }, + scatter_nd_update_test_params{ CASE_SCATTER_ND_UPDATE_FP16_4D_2, 2, 3 }, + scatter_nd_update_test_params{ CASE_SCATTER_ND_UPDATE_FP16_4D_3, 2, 3 }, + scatter_nd_update_test_params{ CASE_SCATTER_ND_UPDATE_FP16_4D_4, 2, 3 }, + scatter_nd_update_test_params{ CASE_SCATTER_ND_UPDATE_FP16_4D_5, 2, 3 }, + scatter_nd_update_test_params{ CASE_SCATTER_ND_UPDATE_FP16_4D_6, 2, 3 }, + + scatter_nd_update_test_params{ CASE_SCATTER_ND_UPDATE_FP16_5D_1, 2, 3 }, + scatter_nd_update_test_params{ CASE_SCATTER_ND_UPDATE_FP16_5D_2, 2, 3 }, + scatter_nd_update_test_params{ CASE_SCATTER_ND_UPDATE_FP16_5D_3, 2, 3 }, + scatter_nd_update_test_params{ CASE_SCATTER_ND_UPDATE_FP16_5D_4, 2, 3 }, + scatter_nd_update_test_params{ CASE_SCATTER_ND_UPDATE_FP16_5D_5, 2, 3 }, + scatter_nd_update_test_params{ CASE_SCATTER_ND_UPDATE_FP16_5D_7, 2, 3 }, + scatter_nd_update_test_params{ CASE_SCATTER_ND_UPDATE_FP16_5D_9, 2, 3 }, + + scatter_nd_update_test_params{ CASE_SCATTER_ND_UPDATE_FP16_6D_1, 2, 3 }, + 
scatter_nd_update_test_params{ CASE_SCATTER_ND_UPDATE_FP16_6D_2, 2, 3 }, + scatter_nd_update_test_params{ CASE_SCATTER_ND_UPDATE_FP16_6D_3, 2, 3 }, + scatter_nd_update_test_params{ CASE_SCATTER_ND_UPDATE_FP16_6D_4, 2, 3 }, + scatter_nd_update_test_params{ CASE_SCATTER_ND_UPDATE_FP16_6D_5, 2, 3 }, + scatter_nd_update_test_params{ CASE_SCATTER_ND_UPDATE_FP16_6D_6, 2, 3 }, + + scatter_nd_update_test_params{ CASE_SCATTER_ND_UPDATE_FP32_4D_1, 2, 3 }, + scatter_nd_update_test_params{ CASE_SCATTER_ND_UPDATE_FP32_4D_2, 2, 3 }, + scatter_nd_update_test_params{ CASE_SCATTER_ND_UPDATE_FP32_4D_3, 2, 3 }, + scatter_nd_update_test_params{ CASE_SCATTER_ND_UPDATE_FP32_4D_4, 2, 3 }, + scatter_nd_update_test_params{ CASE_SCATTER_ND_UPDATE_FP32_4D_5, 2, 3 }, + scatter_nd_update_test_params{ CASE_SCATTER_ND_UPDATE_FP32_4D_6, 2, 3 }, + + scatter_nd_update_test_params{ CASE_SCATTER_ND_UPDATE_FP32_5D_1, 2, 3 }, + scatter_nd_update_test_params{ CASE_SCATTER_ND_UPDATE_FP32_5D_2, 2, 3 }, + scatter_nd_update_test_params{ CASE_SCATTER_ND_UPDATE_FP32_5D_3, 2, 3 }, + scatter_nd_update_test_params{ CASE_SCATTER_ND_UPDATE_FP32_5D_4, 2, 3 }, + scatter_nd_update_test_params{ CASE_SCATTER_ND_UPDATE_FP32_5D_5, 2, 3 }, + + scatter_nd_update_test_params{ CASE_SCATTER_ND_UPDATE_FP32_6D_1, 2, 3 }, + scatter_nd_update_test_params{ CASE_SCATTER_ND_UPDATE_FP32_6D_2, 2, 3 }, + scatter_nd_update_test_params{ CASE_SCATTER_ND_UPDATE_FP32_6D_3, 2, 3 }, + scatter_nd_update_test_params{ CASE_SCATTER_ND_UPDATE_FP32_6D_4, 2, 3 }, + scatter_nd_update_test_params{ CASE_SCATTER_ND_UPDATE_FP32_6D_5, 2, 3 }, + scatter_nd_update_test_params{ CASE_SCATTER_ND_UPDATE_FP32_6D_6, 2, 3 }, +})); + +class scatter_nd_update_scale_activation_eltwise : public ScatterNDUpdatePrimitiveFusingTest {}; +TEST_P(scatter_nd_update_scale_activation_eltwise, basic) { + auto p = GetParam(); + create_topologies( + input_layout("input", get_input_layout(p)), + data("scatter_nd_update_indices", get_indices_mem(p)), + 
data("scatter_nd_update_updates", get_mem(get_updates_layout(p), 0, 100)), + data("scale_data", get_mem(get_per_channel_layout(p), -1, 1)), + data("eltwise_data", get_mem(layout{ p.data_type, p.input_format, p.input_shape })), + scatter_nd_update("scatter_nd_update_prim", "input", "scatter_nd_update_indices", "scatter_nd_update_updates", p.indices_rank), + activation("activation", "scatter_nd_update_prim", activation_func::abs), + scale("scale", "activation", "scale_data"), + eltwise("eltwise", { "scale", "eltwise_data" }, eltwise_mode::sum, p.data_type), + reorder("reorder_bfyx", "eltwise", p.input_format, data_types::f32) + ); + + tolerance = 1.f; + execute(p); +} + +INSTANTIATE_TEST_SUITE_P(fusings_gpu, scatter_nd_update_scale_activation_eltwise, ::testing::ValuesIn(std::vector{ + scatter_nd_update_test_params{ CASE_SCATTER_ND_UPDATE_FP16_4D_1, 2, 5 }, + scatter_nd_update_test_params{ CASE_SCATTER_ND_UPDATE_FP16_4D_2, 2, 5 }, + scatter_nd_update_test_params{ CASE_SCATTER_ND_UPDATE_FP16_4D_3, 2, 5 }, + scatter_nd_update_test_params{ CASE_SCATTER_ND_UPDATE_FP16_4D_4, 2, 5 }, + scatter_nd_update_test_params{ CASE_SCATTER_ND_UPDATE_FP16_4D_5, 2, 5 }, + scatter_nd_update_test_params{ CASE_SCATTER_ND_UPDATE_FP16_4D_6, 2, 5 }, + + scatter_nd_update_test_params{ CASE_SCATTER_ND_UPDATE_FP16_5D_1, 2, 5 }, + scatter_nd_update_test_params{ CASE_SCATTER_ND_UPDATE_FP16_5D_2, 2, 5 }, + scatter_nd_update_test_params{ CASE_SCATTER_ND_UPDATE_FP16_5D_3, 2, 5 }, + scatter_nd_update_test_params{ CASE_SCATTER_ND_UPDATE_FP16_5D_4, 2, 5 }, + scatter_nd_update_test_params{ CASE_SCATTER_ND_UPDATE_FP16_5D_5, 2, 5 }, + scatter_nd_update_test_params{ CASE_SCATTER_ND_UPDATE_FP16_5D_6, 2, 5 }, + + scatter_nd_update_test_params{ CASE_SCATTER_ND_UPDATE_FP16_6D_1, 2, 5 }, + scatter_nd_update_test_params{ CASE_SCATTER_ND_UPDATE_FP16_6D_2, 2, 5 }, + scatter_nd_update_test_params{ CASE_SCATTER_ND_UPDATE_FP16_6D_3, 2, 5 }, + scatter_nd_update_test_params{ CASE_SCATTER_ND_UPDATE_FP16_6D_4, 2, 5 }, + 
scatter_nd_update_test_params{ CASE_SCATTER_ND_UPDATE_FP16_6D_5, 2, 5 }, + scatter_nd_update_test_params{ CASE_SCATTER_ND_UPDATE_FP16_6D_6, 2, 5 }, + + scatter_nd_update_test_params{ CASE_SCATTER_ND_UPDATE_FP32_4D_1, 2, 5 }, + scatter_nd_update_test_params{ CASE_SCATTER_ND_UPDATE_FP32_4D_2, 2, 5 }, + scatter_nd_update_test_params{ CASE_SCATTER_ND_UPDATE_FP32_4D_3, 2, 5 }, + scatter_nd_update_test_params{ CASE_SCATTER_ND_UPDATE_FP32_4D_4, 2, 5 }, + scatter_nd_update_test_params{ CASE_SCATTER_ND_UPDATE_FP32_4D_5, 2, 5 }, + scatter_nd_update_test_params{ CASE_SCATTER_ND_UPDATE_FP32_4D_6, 2, 5 }, + + scatter_nd_update_test_params{ CASE_SCATTER_ND_UPDATE_FP32_5D_1, 2, 5 }, + scatter_nd_update_test_params{ CASE_SCATTER_ND_UPDATE_FP32_5D_2, 2, 5 }, + scatter_nd_update_test_params{ CASE_SCATTER_ND_UPDATE_FP32_5D_3, 2, 5 }, + scatter_nd_update_test_params{ CASE_SCATTER_ND_UPDATE_FP32_5D_4, 2, 5 }, + scatter_nd_update_test_params{ CASE_SCATTER_ND_UPDATE_FP32_5D_5, 2, 5 }, + + scatter_nd_update_test_params{ CASE_SCATTER_ND_UPDATE_FP32_6D_1, 2, 5 }, + scatter_nd_update_test_params{ CASE_SCATTER_ND_UPDATE_FP32_6D_2, 2, 5 }, + scatter_nd_update_test_params{ CASE_SCATTER_ND_UPDATE_FP32_6D_3, 2, 5 }, + scatter_nd_update_test_params{ CASE_SCATTER_ND_UPDATE_FP32_6D_4, 2, 5 }, + scatter_nd_update_test_params{ CASE_SCATTER_ND_UPDATE_FP32_6D_5, 2, 5 }, + scatter_nd_update_test_params{ CASE_SCATTER_ND_UPDATE_FP32_6D_6, 2, 5 }, +})); diff --git a/src/plugins/intel_gpu/tests/fusions/scatter_update_fusion_test.cpp b/src/plugins/intel_gpu/tests/fusions/scatter_update_fusion_test.cpp new file mode 100644 index 00000000000..f215510be29 --- /dev/null +++ b/src/plugins/intel_gpu/tests/fusions/scatter_update_fusion_test.cpp @@ -0,0 +1,242 @@ +// Copyright (C) 2018-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "test_utils.h" +#include "fusion_test_common.hpp" + +#include +#include +#include +#include +#include + +#include + +using namespace cldnn; +using namespace 
::tests; + +namespace { +struct scatter_update_test_params { + tensor dictionary_shape; + tensor indices_shape; + tensor updates_shape; + cldnn::scatter_update::scatter_update_axis axis; + data_types data_type; + format input_format; + data_types default_type; + format default_format; + size_t expected_fused_primitives; + size_t expected_not_fused_primitives; +}; + +class ScatterUpdatePrimitiveFusingTest : public ::BaseFusingTest { +public: + void execute(scatter_update_test_params& p) { + auto input_prim = get_mem(get_input_layout(p)); + network network_not_fused(this->engine, this->topology_non_fused, bo_not_fused); + network network_fused(this->engine, this->topology_fused, bo_fused); + network_fused.set_input_data("input", input_prim); + network_not_fused.set_input_data("input", input_prim); + compare(network_not_fused, network_fused, p); + } + + layout get_input_layout(scatter_update_test_params& p) { + return layout{ p.data_type, p.input_format, p.dictionary_shape }; + } + + layout get_indices_layout(scatter_update_test_params& p) { + return layout{ p.data_type, format::bfyx, p.indices_shape }; + } + + layout get_updates_layout(scatter_update_test_params& p) { + return layout{ p.data_type, p.input_format, p.updates_shape }; + } + + size_t get_axis_dim(scatter_update_test_params& p) { + switch (p.axis) { + case cldnn::scatter_update::scatter_update_axis::along_x: + return p.dictionary_shape.spatial[0]; + case cldnn::scatter_update::scatter_update_axis::along_y: + return p.dictionary_shape.spatial[1]; + case cldnn::scatter_update::scatter_update_axis::along_z: + return p.dictionary_shape.spatial[2]; + case cldnn::scatter_update::scatter_update_axis::along_w: + return p.dictionary_shape.spatial[3]; + case cldnn::scatter_update::scatter_update_axis::along_f: + return p.dictionary_shape.feature[0]; + case cldnn::scatter_update::scatter_update_axis::along_b: + return p.dictionary_shape.batch[0]; + default: + return 1; + } + } + + layout 
get_per_channel_layout(scatter_update_test_params& p) { + return layout{ p.default_type, p.default_format, tensor{ 1, p.dictionary_shape.feature[0], 1, 1 } }; + } +}; +} // namespace + +/* ----------------------------------------------------------------------------------------------------- */ +/* ------------------------------------------ ScatterUpdate cases -------------------------------------- */ +/* ----------------------------------------------------------------------------------------------------- */ + +#define CASE_SCATTER_UPDATE_FP32_1 { 2, 4, 1, 1 }, { 2, 1, 1, 1 }, { 2, 4, 1, 1 }, cldnn::scatter_update::scatter_update_axis::along_b, data_types::f32, format::bfyx, data_types::f32, format::bfyx +#define CASE_SCATTER_UPDATE_FP32_2 { 8, 1, 1, 1 }, { 4, 1, 1, 1 }, { 4, 1, 1, 1 }, cldnn::scatter_update::scatter_update_axis::along_b, data_types::f32, format::bfyx, data_types::f32, format::bfyx +#define CASE_SCATTER_UPDATE_FP32_3 { 4, 3, 1, 1 }, { 2, 2, 1, 1 }, { 2, 2, 1, 3 }, cldnn::scatter_update::scatter_update_axis::along_b, data_types::f32, format::bfyx, data_types::f32, format::bfyx +#define CASE_SCATTER_UPDATE_FP32_4 { 2, 5, 1, 2 }, { 2, 2, 1, 1 }, { 2, 2, 2, 2 }, cldnn::scatter_update::scatter_update_axis::along_f, data_types::f32, format::bfyx, data_types::f32, format::bfyx +#define CASE_SCATTER_UPDATE_FP32_5 { 2, 2, 1, 4 }, { 2, 2, 1, 1 }, { 2, 2, 2, 2 }, cldnn::scatter_update::scatter_update_axis::along_y, data_types::f32, format::bfyx, data_types::f32, format::bfyx + +#define CASE_SCATTER_UPDATE_FP16_1 { 2, 4, 1, 1 }, { 1, 1, 1, 2 }, { 2, 1, 2, 1 }, cldnn::scatter_update::scatter_update_axis::along_f, data_types::f16, format::bfyx, data_types::f16, format::bfyx +#define CASE_SCATTER_UPDATE_FP16_2 { 8, 2, 1, 20 }, { 2, 3, 1, 1 }, { 2, 3, 20, 2 }, cldnn::scatter_update::scatter_update_axis::along_b, data_types::f16, format::bfyx, data_types::f16, format::bfyx +#define CASE_SCATTER_UPDATE_FP16_3 { 2, 2, 4, 1 }, { 3, 1, 1, 1 }, { 2, 2, 3, 1 }, 
cldnn::scatter_update::scatter_update_axis::along_x, data_types::f16, format::bfyx, data_types::f16, format::bfyx +#define CASE_SCATTER_UPDATE_FP16_4 { 6, 2, 1, 1 }, { 1, 2, 1, 2 }, { 1, 2, 2, 2 }, cldnn::scatter_update::scatter_update_axis::along_b, data_types::f16, format::bfyx, data_types::f16, format::bfyx +#define CASE_SCATTER_UPDATE_FP16_5 { 3, 1, 1, 5 }, { 2, 2, 1, 1 }, { 3, 1, 2, 2 }, cldnn::scatter_update::scatter_update_axis::along_y, data_types::f16, format::bfyx, data_types::f16, format::bfyx + +#define CASE_SCATTER_UPDATE_5D_FP32_1 { 4, 3, 1, 4, 1 }, { 4, 1, 1, 1 }, { 4, 3, 1, 4, 1 }, cldnn::scatter_update::scatter_update_axis::along_b, data_types::f32, format::bfzyx, data_types::f32, format::bfzyx +#define CASE_SCATTER_UPDATE_5D_FP32_2 { 2, 3, 2, 2, 2 }, { 2, 1, 1, 1 }, { 2, 2, 2, 2, 2 }, cldnn::scatter_update::scatter_update_axis::along_f, data_types::f32, format::bfzyx, data_types::f32, format::bfzyx +#define CASE_SCATTER_UPDATE_5D_FP32_3 { 5, 3, 2, 4, 2 }, { 3, 1, 1, 1 }, { 5, 3, 2, 3, 2 }, cldnn::scatter_update::scatter_update_axis::along_y, data_types::f32, format::bfzyx, data_types::f32, format::bfzyx +#define CASE_SCATTER_UPDATE_5D_FP32_4 { 2, 3, 1, 4, 4 }, { 2, 1, 1, 1 }, { 2, 3, 1, 4, 2 }, cldnn::scatter_update::scatter_update_axis::along_z, data_types::f32, format::bfzyx, data_types::f32, format::bfzyx +#define CASE_SCATTER_UPDATE_5D_FP32_5 { 3, 1, 5, 2, 1 }, { 2, 1, 1, 1 }, { 3, 1, 2, 2, 1 }, cldnn::scatter_update::scatter_update_axis::along_x, data_types::f32, format::bfzyx, data_types::f32, format::bfzyx + +#define CASE_SCATTER_UPDATE_5D_FP16_1 { 3, 2, 1, 2, 1 }, { 2, 1, 1, 1 }, { 2, 2, 2, 2, 1 }, cldnn::scatter_update::scatter_update_axis::along_b, data_types::f16, format::bfzyx, data_types::f16, format::bfzyx +#define CASE_SCATTER_UPDATE_5D_FP16_2 { 1, 3, 1, 2, 1 }, { 2, 1, 1, 1 }, { 1, 2, 1, 2, 1 }, cldnn::scatter_update::scatter_update_axis::along_f, data_types::f16, format::bfzyx, data_types::f16, format::bfzyx +#define 
CASE_SCATTER_UPDATE_5D_FP16_3 { 2, 3, 1, 3, 3 }, { 1, 2, 1, 1 }, { 2, 3, 1, 2, 3 }, cldnn::scatter_update::scatter_update_axis::along_y, data_types::f16, format::bfzyx, data_types::f16, format::bfzyx +#define CASE_SCATTER_UPDATE_5D_FP16_4 { 3, 2, 2, 2, 2 }, { 2, 1, 1, 1 }, { 3, 2, 2, 2, 2 }, cldnn::scatter_update::scatter_update_axis::along_z, data_types::f16, format::bfzyx, data_types::f16, format::bfzyx +#define CASE_SCATTER_UPDATE_5D_FP16_5 { 1, 1, 4, 1, 1 }, { 3, 1, 1, 1 }, { 1, 1, 3, 1, 1 }, cldnn::scatter_update::scatter_update_axis::along_x, data_types::f16, format::bfzyx, data_types::f16, format::bfzyx + +class scatter_update_quantize : public ScatterUpdatePrimitiveFusingTest {}; +TEST_P(scatter_update_quantize, basic) { + auto p = GetParam(); + create_topologies( + input_layout("input", get_input_layout(p)), + data("scatter_update_indices", get_repeatless_mem(get_indices_layout(p), 0, static_cast(get_axis_dim(p)) - 1)), + data("scatter_update_updates", get_mem(get_updates_layout(p), 0, 1000)), + data("in_lo", get_mem(get_per_channel_layout(p), min_random, 0)), + data("in_hi", get_mem(get_per_channel_layout(p), 1, max_random)), + data("out_lo", get_mem(get_single_element_layout(p), -127)), + data("out_hi", get_mem(get_single_element_layout(p), 127)), + scatter_update("scatter_update_prim", "input", "scatter_update_indices", "scatter_update_updates", p.axis), + quantize("quantize", "scatter_update_prim", "in_lo", "in_hi", "out_lo", "out_hi", 255, data_types::i8), + reorder("reorder_bfyx", "quantize", p.default_format, data_types::f32) + ); + + tolerance = 1.f; + execute(p); +} + +INSTANTIATE_TEST_SUITE_P(fusings_gpu, scatter_update_quantize, ::testing::ValuesIn(std::vector{ + scatter_update_test_params{ CASE_SCATTER_UPDATE_FP32_1, 2, 3 }, + scatter_update_test_params{ CASE_SCATTER_UPDATE_FP32_2, 2, 3 }, + scatter_update_test_params{ CASE_SCATTER_UPDATE_FP32_3, 2, 3 }, + scatter_update_test_params{ CASE_SCATTER_UPDATE_FP32_4, 2, 3 }, + 
scatter_update_test_params{ CASE_SCATTER_UPDATE_FP32_5, 2, 3 }, + + scatter_update_test_params{ CASE_SCATTER_UPDATE_FP16_1, 2, 3 }, + scatter_update_test_params{ CASE_SCATTER_UPDATE_FP16_2, 2, 3 }, + scatter_update_test_params{ CASE_SCATTER_UPDATE_FP16_3, 2, 3 }, + scatter_update_test_params{ CASE_SCATTER_UPDATE_FP16_4, 2, 3 }, + scatter_update_test_params{ CASE_SCATTER_UPDATE_FP16_5, 2, 3 }, + + scatter_update_test_params{ CASE_SCATTER_UPDATE_5D_FP32_1, 2, 3 }, + scatter_update_test_params{ CASE_SCATTER_UPDATE_5D_FP32_2, 2, 3 }, + scatter_update_test_params{ CASE_SCATTER_UPDATE_5D_FP32_3, 2, 3 }, + scatter_update_test_params{ CASE_SCATTER_UPDATE_5D_FP32_4, 2, 3 }, + scatter_update_test_params{ CASE_SCATTER_UPDATE_5D_FP32_5, 2, 3 }, + + scatter_update_test_params{ CASE_SCATTER_UPDATE_5D_FP16_1, 2, 3 }, + scatter_update_test_params{ CASE_SCATTER_UPDATE_5D_FP16_2, 2, 3 }, + scatter_update_test_params{ CASE_SCATTER_UPDATE_5D_FP16_3, 2, 3 }, + scatter_update_test_params{ CASE_SCATTER_UPDATE_5D_FP16_4, 2, 3 }, + scatter_update_test_params{ CASE_SCATTER_UPDATE_5D_FP16_5, 2, 3 }, +})); + +class scatter_update_scale_activation : public ScatterUpdatePrimitiveFusingTest {}; +TEST_P(scatter_update_scale_activation, basic) { + auto p = GetParam(); + create_topologies( + input_layout("input", get_input_layout(p)), + data("scatter_update_indices", get_repeatless_mem(get_indices_layout(p), 0, static_cast(get_axis_dim(p)) - 1)), + data("scatter_update_updates", get_mem(get_updates_layout(p), 0, 1000)), + data("scale_data", get_mem(get_per_channel_layout(p), -10, 10)), + scatter_update("scatter_update_prim", "input", "scatter_update_indices", "scatter_update_updates", p.axis), + activation("activation", "scatter_update_prim", activation_func::abs), + scale("scale", "activation", "scale_data"), + reorder("reorder_bfyx", "scale", p.default_format, data_types::f32) + ); + + tolerance = 1e-5f; + execute(p); +} + +INSTANTIATE_TEST_SUITE_P(fusings_gpu, scatter_update_scale_activation, 
::testing::ValuesIn(std::vector{ + scatter_update_test_params{ CASE_SCATTER_UPDATE_FP32_1, 2, 4 }, + scatter_update_test_params{ CASE_SCATTER_UPDATE_FP32_2, 2, 4 }, + scatter_update_test_params{ CASE_SCATTER_UPDATE_FP32_3, 2, 4 }, + scatter_update_test_params{ CASE_SCATTER_UPDATE_FP32_4, 2, 4 }, + scatter_update_test_params{ CASE_SCATTER_UPDATE_FP32_5, 2, 4 }, + + scatter_update_test_params{ CASE_SCATTER_UPDATE_FP16_1, 2, 4 }, + scatter_update_test_params{ CASE_SCATTER_UPDATE_FP16_2, 2, 4 }, + scatter_update_test_params{ CASE_SCATTER_UPDATE_FP16_3, 2, 4 }, + scatter_update_test_params{ CASE_SCATTER_UPDATE_FP16_4, 2, 4 }, + scatter_update_test_params{ CASE_SCATTER_UPDATE_FP16_5, 2, 4 }, + + scatter_update_test_params{ CASE_SCATTER_UPDATE_5D_FP32_1, 2, 4 }, + scatter_update_test_params{ CASE_SCATTER_UPDATE_5D_FP32_2, 2, 4 }, + scatter_update_test_params{ CASE_SCATTER_UPDATE_5D_FP32_3, 2, 4 }, + scatter_update_test_params{ CASE_SCATTER_UPDATE_5D_FP32_4, 2, 4 }, + scatter_update_test_params{ CASE_SCATTER_UPDATE_5D_FP32_5, 2, 4 }, + + scatter_update_test_params{ CASE_SCATTER_UPDATE_5D_FP16_1, 2, 4 }, + scatter_update_test_params{ CASE_SCATTER_UPDATE_5D_FP16_2, 2, 4 }, + scatter_update_test_params{ CASE_SCATTER_UPDATE_5D_FP16_3, 2, 4 }, + scatter_update_test_params{ CASE_SCATTER_UPDATE_5D_FP16_4, 2, 4 }, + scatter_update_test_params{ CASE_SCATTER_UPDATE_5D_FP16_5, 2, 4 }, +})); + +class scatter_update_scale_activation_eltwise : public ScatterUpdatePrimitiveFusingTest {}; +TEST_P(scatter_update_scale_activation_eltwise, basic) { + auto p = GetParam(); + create_topologies( + input_layout("input", get_input_layout(p)), + data("scatter_update_indices", get_repeatless_mem(get_indices_layout(p), 0, static_cast(get_axis_dim(p)) - 1)), + data("scatter_update_updates", get_mem(get_updates_layout(p), 0, 1000)), + data("scale_data", get_mem(get_per_channel_layout(p), -10, 10)), + data("eltw_data", get_mem(layout(p.default_type, p.default_format, p.dictionary_shape))), + 
scatter_update("scatter_update_prim", "input", "scatter_update_indices", "scatter_update_updates", p.axis), + activation("activation", "scatter_update_prim", activation_func::abs), + eltwise("eltw", { "activation", "eltw_data" }, eltwise_mode::sum, p.default_type), + scale("scale", "eltw", "scale_data"), + reorder("reorder_bfyx", "scale", p.default_format, data_types::f32) + ); + tolerance = 1e-5f; + execute(p); +} + +INSTANTIATE_TEST_SUITE_P(fusings_gpu, scatter_update_scale_activation_eltwise, ::testing::ValuesIn(std::vector{ + scatter_update_test_params{ CASE_SCATTER_UPDATE_FP32_1, 3, 5 }, + scatter_update_test_params{ CASE_SCATTER_UPDATE_FP32_2, 3, 5 }, + scatter_update_test_params{ CASE_SCATTER_UPDATE_FP32_3, 3, 5 }, + scatter_update_test_params{ CASE_SCATTER_UPDATE_FP32_4, 3, 5 }, + scatter_update_test_params{ CASE_SCATTER_UPDATE_FP32_5, 3, 5 }, + + scatter_update_test_params{ CASE_SCATTER_UPDATE_FP16_1, 3, 5 }, + scatter_update_test_params{ CASE_SCATTER_UPDATE_FP16_2, 3, 5 }, + scatter_update_test_params{ CASE_SCATTER_UPDATE_FP16_3, 3, 5 }, + scatter_update_test_params{ CASE_SCATTER_UPDATE_FP16_4, 3, 5 }, + scatter_update_test_params{ CASE_SCATTER_UPDATE_FP16_5, 3, 5 }, + + scatter_update_test_params{ CASE_SCATTER_UPDATE_5D_FP32_1, 3, 5 }, + scatter_update_test_params{ CASE_SCATTER_UPDATE_5D_FP32_2, 3, 5 }, + scatter_update_test_params{ CASE_SCATTER_UPDATE_5D_FP32_3, 3, 5 }, + scatter_update_test_params{ CASE_SCATTER_UPDATE_5D_FP32_4, 3, 5 }, + scatter_update_test_params{ CASE_SCATTER_UPDATE_5D_FP32_5, 3, 5 }, + + scatter_update_test_params{ CASE_SCATTER_UPDATE_5D_FP16_1, 3, 5 }, + scatter_update_test_params{ CASE_SCATTER_UPDATE_5D_FP16_2, 3, 5 }, + scatter_update_test_params{ CASE_SCATTER_UPDATE_5D_FP16_3, 3, 5 }, + scatter_update_test_params{ CASE_SCATTER_UPDATE_5D_FP16_4, 3, 5 }, + scatter_update_test_params{ CASE_SCATTER_UPDATE_5D_FP16_5, 3, 5 }, +})); diff --git a/src/plugins/intel_gpu/tests/fusions/space_to_batch_fusion_test.cpp 
b/src/plugins/intel_gpu/tests/fusions/space_to_batch_fusion_test.cpp new file mode 100644 index 00000000000..e904f121b3b --- /dev/null +++ b/src/plugins/intel_gpu/tests/fusions/space_to_batch_fusion_test.cpp @@ -0,0 +1,157 @@ +// Copyright (C) 2018-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "test_utils.h" +#include "fusion_test_common.hpp" + +#include +#include +#include +#include +#include + +#include + +using namespace cldnn; +using namespace ::tests; + +namespace { +struct space_to_batch_test_params { + tensor input_size; + tensor output_size; + data_types input_type; + format input_format; + tensor block_shape; + tensor pads_begin; + tensor pads_end; + data_types default_type; + format default_format; + size_t expected_fused_primitives; + size_t expected_not_fused_primitives; +}; + +class SpaceToBatchFusingsTest : public ::BaseFusingTest { +public: + void execute(space_to_batch_test_params& p) { + auto input_prim = get_mem(get_input_layout(p)); + + network network_not_fused(this->engine, this->topology_non_fused, bo_not_fused); + network network_fused(this->engine, this->topology_fused, bo_fused); + + network_fused.set_input_data("input", input_prim); + network_not_fused.set_input_data("input", input_prim); + + compare(network_not_fused, network_fused, p); + } + + layout get_input_layout(space_to_batch_test_params& p) { + return layout{ p.input_type, p.input_format, p.input_size }; + } + + layout get_per_channel_layout(space_to_batch_test_params& p) { + return layout{ p.default_type, p.default_format, tensor{ 1, p.output_size.feature[0], 1, 1 } }; + } +}; +} // namespace + +/* ----------------------------------------------------------------------------------------------------- */ +/* ---------------------------------------- SpaceToBatch cases ----------------------------------------- */ +/* ----------------------------------------------------------------------------------------------------- */ +#define 
CASE_SPACE_TO_BATCH_F32_1 { 1, 4, 8, 8 }, { 16, 2, 3, 8 }, data_types::f32, format::bfyx, { 1, 2, 4, 1 }, { 0, 0, 4, 0 }, { 0, 0, 0, 0 }, data_types::f32, format::bfyx +#define CASE_SPACE_TO_BATCH_F32_2 { 2, 16, 4, 6 }, { 24, 4, 4, 3 }, data_types::f32, format::b_fs_yx_fsv16, { 1, 4, 1, 3 }, { 0, 0, 0, 0 }, { 0, 0, 0, 3 }, data_types::f32, format::bfyx +#define CASE_SPACE_TO_BATCH_F16_1 { 1, 1, 6, 8 }, { 48, 1, 1, 1 }, data_types::f16, format::bfyx, { 1, 1, 6, 8 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, data_types::f32, format::bfyx +#define CASE_SPACE_TO_BATCH_F16_2 { 1, 32, 1, 5 }, { 20, 4, 1, 4 }, data_types::f16, format::b_fs_yx_fsv16, { 1, 10, 1, 2 }, { 0, 8, 0, 0 }, { 0, 0, 0, 3 }, data_types::f32, format::bfyx +#define CASE_SPACE_TO_BATCH_U8_1 { 3, 12, 4, 8 }, { 48, 6, 2, 3 }, data_types::u8, format::bfyx, { 1, 2, 2, 4 }, { 0, 0, 0, 4 }, { 0, 0, 0, 0 }, data_types::f32, format::bfyx +#define CASE_SPACE_TO_BATCH_U8_2 { 2, 16, 3, 6 }, { 30, 4, 1, 6 }, data_types::u8, format::b_fs_yx_fsv16, { 1, 5, 3, 1 }, { 0, 4, 0, 0 }, { 0, 0, 0, 0 }, data_types::f32, format::bfyx +#define CASE_SPACE_TO_BATCH_I8_1 { 1, 2, 8, 1 }, { 4, 2, 2, 1 }, data_types::i8, format::bfyx, { 1, 1, 4, 1 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, data_types::f32, format::bfyx +#define CASE_SPACE_TO_BATCH_I8_2 { 1, 32, 4, 8 }, { 48, 2, 6, 3 }, data_types::i8, format::b_fs_yx_fsv16, { 1, 16, 1, 3 }, { 0, 0, 2, 0 }, { 0, 0, 0, 1 }, data_types::f32, format::bfyx + +class space_to_batch_quantize_i8 : public SpaceToBatchFusingsTest {}; +TEST_P(space_to_batch_quantize_i8, basic) { + auto p = GetParam(); + create_topologies( + input_layout("input", get_input_layout(p)), + space_to_batch("space_to_batch", "input", p.block_shape, p.pads_begin, p.pads_end, p.output_size), + data("in_low", get_mem(get_per_channel_layout(p), min_random, 0)), + data("in_high", get_mem(get_per_channel_layout(p), 1, max_random)), + data("out_low", get_mem(get_single_element_layout(p), -128)), + data("out_high", 
get_mem(get_single_element_layout(p), 127)), + quantize("quant", "space_to_batch", "in_low", "in_high", "out_low", "out_high", 256, data_types::i8), + reorder("reorder_bfyx", "quant", p.default_format, data_types::f32) + ); + + tolerance = 1.f; + execute(p); +} + +INSTANTIATE_TEST_SUITE_P(fusings_gpu, space_to_batch_quantize_i8, ::testing::ValuesIn(std::vector{ + space_to_batch_test_params{ CASE_SPACE_TO_BATCH_F32_1, 2, 3 }, + space_to_batch_test_params{ CASE_SPACE_TO_BATCH_F32_2, 2, 3 }, + space_to_batch_test_params{ CASE_SPACE_TO_BATCH_F16_1, 2, 3 }, + space_to_batch_test_params{ CASE_SPACE_TO_BATCH_F16_2, 2, 3 }, +})); + +class space_to_batch_scale_act_eltwise_quantize_u8 : public SpaceToBatchFusingsTest {}; +TEST_P(space_to_batch_scale_act_eltwise_quantize_u8, basic) { + auto p = GetParam(); + create_topologies( + input_layout("input", get_input_layout(p)), + space_to_batch("space_to_batch", "input", p.block_shape, p.pads_begin, p.pads_end, p.output_size), + data("scale1_data", get_mem(get_per_channel_layout(p), -0.125f)), + scale("scale1", "space_to_batch", "scale1_data"), + activation("actv1", "scale1", activation_func::relu), + data("eltw_data", get_mem(layout(p.default_type, p.input_format, p.output_size))), + eltwise("eltw", { "actv1", "eltw_data" }, eltwise_mode::sum, p.default_type), + data("in_low", get_mem(get_per_channel_layout(p), min_random, 0)), + data("in_high", get_mem(get_per_channel_layout(p), 1, max_random)), + data("out_low", get_mem(get_single_element_layout(p), 0)), + data("out_high", get_mem(get_single_element_layout(p), 255)), + quantize("quant", "eltw", "in_low", "in_high", "out_low", "out_high", 256, data_types::u8), + reorder("reorder_bfyx", "quant", p.default_format, data_types::f32) + ); + + tolerance = 1.f; + execute(p); +} + +INSTANTIATE_TEST_SUITE_P(fusings_gpu, space_to_batch_scale_act_eltwise_quantize_u8, ::testing::ValuesIn(std::vector{ + space_to_batch_test_params{ CASE_SPACE_TO_BATCH_F32_1, 2, 6 }, + 
space_to_batch_test_params{ CASE_SPACE_TO_BATCH_F32_2, 2, 6 }, + space_to_batch_test_params{ CASE_SPACE_TO_BATCH_F16_1, 2, 6 }, + space_to_batch_test_params{ CASE_SPACE_TO_BATCH_F16_2, 2, 6 }, + space_to_batch_test_params{ CASE_SPACE_TO_BATCH_U8_1, 2, 6 }, + space_to_batch_test_params{ CASE_SPACE_TO_BATCH_U8_2, 2, 6 }, + space_to_batch_test_params{ CASE_SPACE_TO_BATCH_I8_1, 2, 6 }, + space_to_batch_test_params{ CASE_SPACE_TO_BATCH_I8_2, 2, 6 }, +})); + + +class space_to_batch_scale_act_eltw : public SpaceToBatchFusingsTest {}; +TEST_P(space_to_batch_scale_act_eltw, basic) { + auto p = GetParam(); + create_topologies( + input_layout("input", get_input_layout(p)), + space_to_batch("space_to_batch", "input", p.block_shape, p.pads_begin, p.pads_end, p.output_size), + data("scale1_data", get_mem(get_per_channel_layout(p), -0.125f)), + scale("scale1", "space_to_batch", "scale1_data"), + activation("actv1", "scale1", activation_func::relu), + data("eltw_data", get_mem(layout(p.default_type, p.input_format, p.output_size))), + eltwise("eltw", { "actv1", "eltw_data" }, eltwise_mode::sum, p.default_type), + reorder("reorder_bfyx", "eltw", p.default_format, data_types::f32) + ); + + tolerance = 1e-5f; + execute(p); +} + +INSTANTIATE_TEST_SUITE_P(fusings_gpu, space_to_batch_scale_act_eltw, ::testing::ValuesIn(std::vector{ + space_to_batch_test_params{ CASE_SPACE_TO_BATCH_F32_1, 2, 5 }, + space_to_batch_test_params{ CASE_SPACE_TO_BATCH_F32_2, 2, 5 }, + space_to_batch_test_params{ CASE_SPACE_TO_BATCH_F16_1, 2, 5 }, + space_to_batch_test_params{ CASE_SPACE_TO_BATCH_F16_2, 2, 5 }, + space_to_batch_test_params{ CASE_SPACE_TO_BATCH_U8_1, 2, 5 }, + space_to_batch_test_params{ CASE_SPACE_TO_BATCH_U8_2, 2, 5 }, + space_to_batch_test_params{ CASE_SPACE_TO_BATCH_I8_1, 2, 5 }, + space_to_batch_test_params{ CASE_SPACE_TO_BATCH_I8_2, 2, 5 }, +})); diff --git a/src/plugins/intel_gpu/tests/fusions/space_to_depth_fusion_test.cpp 
b/src/plugins/intel_gpu/tests/fusions/space_to_depth_fusion_test.cpp new file mode 100644 index 00000000000..b7c92c7e6e8 --- /dev/null +++ b/src/plugins/intel_gpu/tests/fusions/space_to_depth_fusion_test.cpp @@ -0,0 +1,162 @@ +// Copyright (C) 2018-2021 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "test_utils.h" +#include "fusion_test_common.hpp" + +#include +#include +#include +#include +#include + +#include + +using namespace cldnn; +using namespace ::tests; + +namespace { +struct space_to_depth_params { + tensor input_size; + tensor output_size; + space_to_depth::depth_mode mode; + data_types input_type; + format input_format; + size_t block_size; + data_types default_type; + format default_format; + size_t expected_fused_primitives; + size_t expected_not_fused_primitives; +}; + +class SpaceToDepthFusingsTest : public ::BaseFusingTest { +public: + void execute(space_to_depth_params& p) { + auto input_prim = get_mem(get_input_layout(p)); + + network network_not_fused(this->engine, this->topology_non_fused, bo_not_fused); + network network_fused(this->engine, this->topology_fused, bo_fused); + + network_fused.set_input_data("input", input_prim); + network_not_fused.set_input_data("input", input_prim); + + compare(network_not_fused, network_fused, p); + } + + layout get_input_layout(space_to_depth_params& p) { + return layout{ p.input_type, p.input_format, p.input_size }; + } + + layout get_per_channel_layout(space_to_depth_params& p) { + return layout{ p.default_type, p.default_format, tensor{ 1, p.output_size.feature[0], 1, 1 } }; + } + + format get_input_format(space_to_depth_params &p) { + return p.input_format; + } +}; + +} // namespace + +/* ----------------------------------------------------------------------------------------------------- */ +/* -------------------------------- SpaceToDepth cases ------------------------------------------------- */ +/* 
----------------------------------------------------------------------------------------------------- */ + +#define CASE_SPACE_TO_DEPTH_F32_1 { 2, 2, 8, 10 }, { 2, 8, 4, 5 }, space_to_depth::depth_mode::blocks_first, data_types::f32, format::bfyx, 2, data_types::f32, format::bfyx +#define CASE_SPACE_TO_DEPTH_F32_2 { 1, 2, 6, 6, 6 }, { 1, 54, 2, 2, 2 }, space_to_depth::depth_mode::depth_first, data_types::f32, format::bfzyx, 3, data_types::f32, format::bfyx +#define CASE_SPACE_TO_DEPTH_F16_1 { 1, 3, 6, 6 }, { 1, 12, 3, 3 }, space_to_depth::depth_mode::blocks_first, data_types::f16, format::bfyx, 2, data_types::f32, format::bfyx +#define CASE_SPACE_TO_DEPTH_F16_2 { 2, 1, 3, 3 }, { 2, 9, 1, 1 }, space_to_depth::depth_mode::blocks_first, data_types::f16, format::b_fs_yx_fsv16, 3, data_types::f32, format::bfyx +#define CASE_SPACE_TO_DEPTH_U8_1 { 2, 2, 8, 10 }, { 2, 8, 4, 5 }, space_to_depth::depth_mode::blocks_first, data_types::u8, format::bfyx, 2, data_types::f32, format::bfyx +#define CASE_SPACE_TO_DEPTH_U8_2 { 1, 2, 6, 6, 6 }, { 1, 54, 2, 2, 2 }, space_to_depth::depth_mode::depth_first, data_types::u8, format::bfzyx, 3, data_types::f32, format::bfyx +#define CASE_SPACE_TO_DEPTH_I8_1 { 1, 3, 6, 6 }, { 1, 12, 3, 3 }, space_to_depth::depth_mode::blocks_first, data_types::i8, format::bfyx, 2, data_types::f32, format::bfyx +#define CASE_SPACE_TO_DEPTH_I8_2 { 2, 1, 3, 3 }, { 2, 9, 1, 1 }, space_to_depth::depth_mode::blocks_first, data_types::i8, format::b_fs_yx_fsv16, 3, data_types::f32, format::bfyx + +class space_to_depth_quantize_i8 : public SpaceToDepthFusingsTest {}; +TEST_P(space_to_depth_quantize_i8, basic) { + auto p = GetParam(); + create_topologies( + input_layout("input", get_input_layout(p)), + space_to_depth("space_to_depth", "input", p.mode, p.block_size), + data("in_low", get_mem(get_per_channel_layout(p), min_random, 0)), + data("in_high", get_mem(get_per_channel_layout(p), 1, max_random)), + data("out_low", get_mem(get_single_element_layout(p), -128)), + 
data("out_high", get_mem(get_single_element_layout(p), 127)), + quantize("quant", "space_to_depth", "in_low", "in_high", "out_low", "out_high", 256, data_types::i8), + reorder("reorder_bfyx", "quant", format::bfyx, data_types::f32) + ); + + tolerance = 1.f; + execute(p); +} + +INSTANTIATE_TEST_SUITE_P(fusings_gpu, space_to_depth_quantize_i8, ::testing::ValuesIn(std::vector{ + space_to_depth_params{ CASE_SPACE_TO_DEPTH_F32_1, 2, 3 }, + space_to_depth_params{ CASE_SPACE_TO_DEPTH_F32_2, 2, 3 }, + space_to_depth_params{ CASE_SPACE_TO_DEPTH_F16_1, 2, 3 }, + space_to_depth_params{ CASE_SPACE_TO_DEPTH_F16_2, 2, 3 }, +})); + +class space_to_depth_scale_act_eltwise_quantize_u8 : public SpaceToDepthFusingsTest {}; +TEST_P(space_to_depth_scale_act_eltwise_quantize_u8, basic) { + auto p = GetParam(); + create_topologies( + input_layout("input", get_input_layout(p)), + space_to_depth("space_to_depth", "input", p.mode, p.block_size), + data("scale1_data", get_mem(get_per_channel_layout(p), -0.125f)), + scale("scale1", "space_to_depth", "scale1_data"), + activation("actv1", "scale1", activation_func::relu), + data("eltw_data", get_mem(layout(p.default_type, p.input_format, p.output_size))), + eltwise("eltw", { "actv1", "eltw_data" }, eltwise_mode::sum, p.default_type), + data("in_low", get_mem(get_per_channel_layout(p), min_random, 0)), + data("in_high", get_mem(get_per_channel_layout(p), 1, max_random)), + data("out_low", get_mem(get_single_element_layout(p), 0)), + data("out_high", get_mem(get_single_element_layout(p), 255)), + quantize("quant", "eltw", "in_low", "in_high", "out_low", "out_high", 256, data_types::u8), + reorder("reorder_bfyx", "quant", format::bfyx, data_types::f32) + ); + + tolerance = 1.f; + execute(p); +} + +INSTANTIATE_TEST_SUITE_P(fusings_gpu, space_to_depth_scale_act_eltwise_quantize_u8, ::testing::ValuesIn(std::vector{ + space_to_depth_params{ CASE_SPACE_TO_DEPTH_F32_1, 2, 6 }, + space_to_depth_params{ CASE_SPACE_TO_DEPTH_F32_2, 2, 6 }, + 
space_to_depth_params{ CASE_SPACE_TO_DEPTH_F16_1, 2, 6 }, + space_to_depth_params{ CASE_SPACE_TO_DEPTH_F16_2, 2, 6 }, + space_to_depth_params{ CASE_SPACE_TO_DEPTH_U8_1, 2, 6 }, + space_to_depth_params{ CASE_SPACE_TO_DEPTH_U8_2, 2, 6 }, + space_to_depth_params{ CASE_SPACE_TO_DEPTH_I8_1, 2, 6 }, + space_to_depth_params{ CASE_SPACE_TO_DEPTH_I8_2, 2, 6 }, +})); + + +class space_to_depth_scale_act_eltw : public SpaceToDepthFusingsTest {}; +TEST_P(space_to_depth_scale_act_eltw, basic) { + auto p = GetParam(); + create_topologies( + input_layout("input", get_input_layout(p)), + space_to_depth("space_to_depth", "input", p.mode, p.block_size), + data("scale1_data", get_mem(get_per_channel_layout(p), -0.125f)), + scale("scale1", "space_to_depth", "scale1_data"), + activation("actv1", "scale1", activation_func::relu), + data("eltw_data", get_mem(layout(p.default_type, p.input_format, p.output_size))), + eltwise("eltw", { "actv1", "eltw_data" }, eltwise_mode::sum, p.default_type), + reorder("reorder_bfyx", "eltw", format::bfyx, data_types::f32) + ); + + tolerance = 1e-5f; + execute(p); +} + +INSTANTIATE_TEST_SUITE_P(fusings_gpu, space_to_depth_scale_act_eltw, ::testing::ValuesIn(std::vector{ + space_to_depth_params{ CASE_SPACE_TO_DEPTH_F32_1, 2, 5 }, + space_to_depth_params{ CASE_SPACE_TO_DEPTH_F32_2, 2, 5 }, + space_to_depth_params{ CASE_SPACE_TO_DEPTH_F16_1, 2, 5 }, + space_to_depth_params{ CASE_SPACE_TO_DEPTH_F16_2, 2, 5 }, + space_to_depth_params{ CASE_SPACE_TO_DEPTH_U8_1, 2, 5 }, + space_to_depth_params{ CASE_SPACE_TO_DEPTH_U8_2, 2, 5 }, + space_to_depth_params{ CASE_SPACE_TO_DEPTH_I8_1, 2, 5 }, + space_to_depth_params{ CASE_SPACE_TO_DEPTH_I8_2, 2, 5 }, +})); diff --git a/src/plugins/intel_gpu/tests/test_cases/fusings_gpu_test.cpp b/src/plugins/intel_gpu/tests/test_cases/fusings_gpu_test.cpp deleted file mode 100644 index bc1f60587f5..00000000000 --- a/src/plugins/intel_gpu/tests/test_cases/fusings_gpu_test.cpp +++ /dev/null @@ -1,10336 +0,0 @@ -// Copyright (C) 2018-2021 
Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// - -#include "test_utils.h" - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -using namespace cldnn; -using namespace ::tests; - -struct resample_test_params { - tensor in_shape; - tensor out_shape; - data_types data_type; - format input_format; - resample_type type; - data_types default_type; - format default_format; - size_t expected_fused_primitives; - size_t expected_not_fused_primitives; -}; - -struct bc_test_params { - tensor in_shape; - tensor out_shape; - tensor kernel; - tensor stride; - tensor pad; - tensor dilation; - uint32_t groups; - data_types data_type; - format input_format; - data_types weights_type; - format weights_format; - data_types default_type; - format default_format; - size_t expected_fused_primitives; - size_t expected_not_fused_primitives; -}; - -struct bc_force_kernel_params { - tensor in_shape; - tensor out_shape; - tensor kernel; - tensor stride; - tensor pad; - tensor dilation; - uint32_t groups; - data_types data_type; - format input_format; - data_types weights_type; - format weights_format; - data_types default_type; - format default_format; - size_t expected_fused_primitives; - size_t expected_not_fused_primitives; - std::string kernel_name; -}; - -struct conv_eltw_test_params { - tensor in_shape; - tensor out_shape; - tensor eltw_shape; - tensor kernel; - tensor stride; - tensor pad; - tensor dilation; - uint32_t groups; - data_types data_type; - format input_format; - data_types weights_type; - format weights_format; - data_types default_type; - format default_format; - size_t expected_fused_primitives; - size_t expected_not_fused_primitives; -}; - -struct gemm_test_params { - std::vector in_shapes; - tensor out_shape; - tensor kernel; - tensor pad; - data_types 
data_type_in0; - data_types data_type_in1; - data_types data_type_in2; - format input_format; - data_types default_type; - format default_format; - size_t expected_fused_primitives; - size_t expected_not_fused_primitives; -}; - -struct normalize_test_params { - tensor in_shape; - data_types data_type; - format input_format; - data_types default_type; - format default_format; - bool across_spatial; - size_t expected_fused_primitives; - size_t expected_not_fused_primitives; -}; - -template -class BaseFusingTest : public ::testing::TestWithParam { -public: -#ifdef ENABLE_ONEDNN_FOR_GPU - cldnn::engine& engine = get_onednn_test_engine(); -#else - cldnn::engine& engine = get_test_engine(); -#endif - cldnn::topology topology_fused; - cldnn::topology topology_non_fused; - cldnn::build_options bo_fused; - cldnn::build_options bo_not_fused; - - float tolerance = 0.0f; - - static const int min_random = -200; - static const int max_random = 200; - - void SetUp() override { - bo_fused.set_option(build_option::optimize_data(true)); - bo_not_fused.set_option(build_option::optimize_data(false)); - bo_not_fused.set_option(build_option::allow_static_input_reorder(true)); - } - - void compare(network& not_fused, network& fused, T& p, bool count_reorder = false) { - auto outputs_ref = not_fused.execute(); - auto outputs_fused = fused.execute(); - auto get_reorders_count = [](network& net) -> size_t { - size_t count = 0; - for (auto& pi : net.get_primitives_info()) { - if (pi.type_id == "reorder") { - auto exec_prims = net.get_executed_primitives(); - auto it = std::find_if(exec_prims.begin(), exec_prims.end(), [&](const std::pair& e) -> bool { - return e.first == pi.original_id; - }); - // We count executed reorders only - if (it != exec_prims.end()) - count++; - } - } - return count; - }; - - size_t reorders_count_fused = get_reorders_count(fused); - size_t reorders_count_not_fused = get_reorders_count(not_fused); - - std::stringstream description; - description << std::endl << "not 
fused: " << std::endl; - for (auto i : not_fused.get_primitives_info()) { - description << " " << i.original_id << " " << i.kernel_id << std::endl; - } - description << "fused: " << std::endl; - for (auto i : fused.get_primitives_info()) { - description << " " << i.original_id << " " << i.kernel_id << std::endl; - } - SCOPED_TRACE(description.str()); - // Subtract reorders count to handle execution in different layouts when input/output reorders can be added in the graph - ASSERT_EQ(fused.get_executed_primitives().size() - (count_reorder ? 0 : reorders_count_fused), p.expected_fused_primitives); - ASSERT_EQ(not_fused.get_executed_primitives().size() - (count_reorder ? 0 : reorders_count_not_fused), p.expected_not_fused_primitives); - ASSERT_EQ(outputs_ref.size(), outputs_fused.size()); - ASSERT_EQ(outputs_ref.size(), size_t(1)); - - auto output_not_fused_prim = outputs_ref.begin()->second.get_memory(); - auto output_fused_prim = outputs_fused.begin()->second.get_memory(); - if (output_not_fused_prim->get_layout().data_type == data_types::f32) { - cldnn::mem_lock ref(output_not_fused_prim, get_test_stream()); - cldnn::mem_lock output_ptr(output_fused_prim, get_test_stream()); - for (size_t i = 0; i < output_fused_prim->get_layout().count(); i++) { - ASSERT_NEAR(ref[i], output_ptr[i], tolerance) << "i = " << i; - } - } else { - cldnn::mem_lock ref(output_not_fused_prim, get_test_stream()); - cldnn::mem_lock output_ptr(output_fused_prim, get_test_stream()); - for (size_t i = 0; i < output_fused_prim->get_layout().count(); i++) { - ASSERT_NEAR(float16_to_float32(ref[i]), float16_to_float32(output_ptr[i]), tolerance) << "i = " << i; - } - } - } - - cldnn::memory::ptr get_mem(cldnn::layout l) { - auto prim = engine.allocate_memory(l); - tensor s = l.size; - if (l.data_type == data_types::bin) { - VF rnd_vec = generate_random_1d(s.count() / 32, min_random, max_random); - set_values(prim, rnd_vec); - } else if (l.data_type == data_types::i8 || l.data_type == 
data_types::u8) { - VF rnd_vec = generate_random_1d(s.count(), min_random, max_random); - set_values(prim, rnd_vec); - } else if (l.data_type == data_types::f16) { - VF rnd_vec = generate_random_1d(s.count(), -1, 1); - set_values(prim, rnd_vec); - } else { - VF rnd_vec = generate_random_1d(s.count(), -1, 1); - set_values(prim, rnd_vec); - } - - return prim; - } - - cldnn::memory::ptr get_mem(cldnn::layout l, float fill_value) { - auto prim = engine.allocate_memory(l); - tensor s = l.size; - if (l.data_type == data_types::bin) { - VF rnd_vec(s.count() / 32, static_cast(fill_value)); - set_values(prim, rnd_vec); - } else if (l.data_type == data_types::f16) { - VF rnd_vec(s.count(), float32_to_float16(fill_value)); - set_values(prim, rnd_vec); - } else if (l.data_type == data_types::f32) { - VF rnd_vec(s.count(), fill_value); - set_values(prim, rnd_vec); - } else if (l.data_type == data_types::u8) { - VF rnd_vec(s.count(), static_cast(fill_value)); - set_values(prim, rnd_vec); - } else if (l.data_type == data_types::i8) { - VF rnd_vec(s.count(), static_cast(fill_value)); - set_values(prim, rnd_vec); - } else { - throw std::runtime_error("get_mem: Unsupported precision"); - } - - return prim; - } - - cldnn::memory::ptr get_repeatless_mem(cldnn::layout l, int min, int max) { - auto prim = engine.allocate_memory(l); - tensor s = l.size; - if (l.data_type == data_types::f32) { - VF rnd_vec = generate_random_norepetitions_1d(s.count(), min, max); - set_values(prim, rnd_vec); - } else if (l.data_type == data_types::f16) { - VF rnd_vec = generate_random_norepetitions_1d(s.count(), min, max); - set_values(prim, rnd_vec); - } else if (l.data_type == data_types::i8) { - VF rnd_vec = generate_random_norepetitions_1d(s.count(), min, max); - set_values(prim, rnd_vec); - } - else if (l.data_type == data_types::bin) { - VF rnd_vec = generate_random_norepetitions_1d(s.count(), min, max); - set_values(prim, rnd_vec); - } - - return prim; - } - - cldnn::memory::ptr 
get_mem(cldnn::layout l, int min, int max) { - auto prim = engine.allocate_memory(l); - tensor s = l.size; - if (l.data_type == data_types::f32) { - VF rnd_vec = generate_random_1d(s.count(), min, max); - set_values(prim, rnd_vec); - } else if (l.data_type == data_types::f16) { - VF rnd_vec = generate_random_1d(s.count(), min, max); - set_values(prim, rnd_vec); - } else if (l.data_type == data_types::i8) { - VF rnd_vec = generate_random_1d(s.count(), min, max); - set_values(prim, rnd_vec); - } else if (l.data_type == data_types::u8) { - VF rnd_vec = generate_random_1d(s.count(), min, max); - set_values(prim, rnd_vec); - } else if (l.data_type == data_types::bin) { - VF rnd_vec = generate_random_1d(s.count() / 32, min, max); - set_values(prim, rnd_vec); - } - - return prim; - } - - layout get_output_layout(T& p) { - return layout{ p.data_type, p.input_format, p.out_shape }; - } - - layout get_weights_layout(T& p, const int32_t /* split */ = 1) { - cldnn::tensor weights_tensor; - if (p.groups == 1) { - weights_tensor = cldnn::tensor(batch(p.out_shape.feature[0]), feature(p.in_shape.feature[0]), - spatial(p.kernel.spatial[0], p.kernel.spatial[1], p.kernel.spatial[2])); - } else { - weights_tensor = cldnn::tensor(group(p.groups), batch(p.out_shape.feature[0] / p.groups), feature(p.in_shape.feature[0] / p.groups), - spatial(p.kernel.spatial[0], p.kernel.spatial[1], p.kernel.spatial[2])); - } - return layout{ p.weights_type, p.weights_format, weights_tensor }; - } - - layout get_weights_layout(T& p, const int32_t /* split */, cldnn::format f) { - cldnn::tensor weights_tensor; - weights_tensor = cldnn::tensor(batch(p.out_shape.feature[0]), feature(static_cast(p.in_shape.feature[0] / p.groups)), - spatial(p.kernel.spatial[0], p.kernel.spatial[1], p.kernel.spatial[2])); - return layout{ p.weights_type, f, weights_tensor }; - } - - layout get_bias_layout(T& p) { - return layout{ p.default_type, format::bfyx, tensor{ 1, p.out_shape.feature[0], 1, 1 } }; - } - - layout 
get_weights_zp_layout(T& p) { - return layout{ p.weights_type, p.default_format, tensor{ p.out_shape.feature[0], 1, 1, 1 } }; - } - - layout get_activations_zp_layout(T& p) { - return layout{ p.data_type, p.default_format, tensor{ 1, p.in_shape.feature[0], 1, 1 } }; - } - - layout get_single_element_layout(T& p) { - return layout{ p.default_type, p.default_format, tensor{ 1, 1, 1, 1 } }; - } - - template - void create_topologies(Args const&... args) { - topology_fused.add(args...); - topology_non_fused.add(args...); - } -}; - -template -class WeightsPrimitiveFusingTest : public ::BaseFusingTest { -public: - - void execute(T& p) { - auto input_prim = this->get_mem(get_input_layout(p)); - network network_not_fused(this->engine, this->topology_non_fused, this->bo_not_fused); - network network_fused(this->engine, this->topology_fused, this->bo_fused); - network_fused.set_input_data("input", input_prim); - network_not_fused.set_input_data("input", input_prim); - - this->compare(network_not_fused, network_fused, p); - } - - layout get_input_layout(T& p) { - auto pad = p.pad; - std::vector pad_ = { 0, 0, pad.spatial[0], pad.spatial[1] }; - return layout{ p.data_type, p.input_format, p.in_shape, padding{ pad_ } }; - } - - layout get_per_channel_layout(T& p) { - return layout{ p.default_type, p.default_format, tensor{ 1, p.out_shape.feature[0], 1, 1 } }; - } - - size_t get_fc_output_dim_size(bc_test_params& p) { - size_t size = 2; - for (auto i : p.out_shape.spatial) { - if (i > 1) - size++; - } - return size; - } - - layout get_fc_weights_layout(T& p) { - cldnn::tensor weights_tensor; - if (p.out_shape.spatial[1] > 1) { - // 3d case - weights_tensor = cldnn::tensor(p.kernel.batch[0], p.kernel.feature[0], 1, 1); - } - else { - weights_tensor = cldnn::tensor(batch(p.out_shape.feature[0]), feature(p.in_shape.feature[0]), - spatial(p.kernel.spatial[0], p.kernel.spatial[1], p.kernel.spatial[2])); - } - return layout{ p.weights_type, p.weights_format, weights_tensor }; - } - - 
layout get_fc_bias_layout(T& p) { - if (p.out_shape.spatial[1] > 1) { - // 3d case - return layout{ p.default_type, format::bfyx, tensor{ 1, p.out_shape.spatial[1], 1, 1 } }; - } - else { - return layout{ p.default_type, format::bfyx, tensor{ 1, p.out_shape.feature[0], 1, 1 } }; - } - } -}; - -class ResamplePrimitiveFusingTest : public ::BaseFusingTest { -public: - - void execute(resample_test_params& p) { - auto input_prim = get_mem(get_input_layout(p)); - network network_not_fused(this->engine, this->topology_non_fused, bo_not_fused); - network network_fused(this->engine, this->topology_fused, bo_fused); - network_fused.set_input_data("input", input_prim); - network_not_fused.set_input_data("input", input_prim); - - compare(network_not_fused, network_fused, p); - } - - layout get_input_layout(resample_test_params& p) { - return layout{ p.data_type, p.input_format, p.in_shape, padding{} }; - } - - layout get_per_channel_layout(resample_test_params& p) { - return layout{ p.default_type, p.default_format, tensor{ 1, p.out_shape.feature[0], 1, 1 } }; - } -}; - -class GemmFusingTest : public ::BaseFusingTest { -public: - - void execute(gemm_test_params& p) { - auto input0_prim = get_mem(get_input_layout(p, 0)); - auto input1_prim = get_mem(get_input_layout(p, 1)); - - network network_not_fused(this->engine, this->topology_non_fused, bo_not_fused); - network network_fused(this->engine, this->topology_fused, bo_fused); - network_fused.set_input_data("input0", input0_prim); - network_not_fused.set_input_data("input0", input0_prim); - network_fused.set_input_data("input1", input1_prim); - network_not_fused.set_input_data("input1", input1_prim); - if (p.in_shapes.size() > 2) { - auto input2_prim = get_mem(get_input_layout(p, 2)); - network_fused.set_input_data("input2", input2_prim); - network_not_fused.set_input_data("input2", input2_prim); - } - - compare(network_not_fused, network_fused, p); - } - - layout get_input_layout(gemm_test_params& p, int in_no) { - auto pad = 
p.pad; - std::vector pad_ = { 0, 0, pad.spatial[0], pad.spatial[1] }; - if (in_no == 0) - return layout{ p.data_type_in0, p.input_format, p.in_shapes.at(0), padding{ pad_ } }; - else if (in_no == 1) - return layout{ p.data_type_in1, p.input_format, p.in_shapes.at(1), padding{ pad_ } }; - else - return layout{ p.data_type_in2, p.input_format, p.in_shapes.at(2), padding{ pad_ } }; - } - - layout get_per_channel_layout(gemm_test_params& p) { - return layout{ p.default_type, p.default_format, tensor{ 1, p.in_shapes.at(0).feature[0], 1, 1 } }; - } - - layout get_output_layout(gemm_test_params& p) { - return layout{ p.default_type, p.input_format, p.out_shape }; - } -}; - -class ConvEltwTest : public ::BaseFusingTest { -public: - - void execute(conv_eltw_test_params& p) { - auto input_prim = get_mem(get_input_layout(p)); - network network_not_fused(this->engine, this->topology_non_fused, bo_not_fused); - network network_fused(this->engine, this->topology_fused, bo_fused); - network_fused.set_input_data("input", input_prim); - network_not_fused.set_input_data("input", input_prim); - - compare(network_not_fused, network_fused, p); - auto find_prim = [](primitive_info& p) -> bool { - // Add more ids when needed - if (p.original_id == "deconv_prim") - return true; - return false; - }; - - auto pi_fused = network_fused.get_primitives_info(); - auto info_fused = std::find_if(pi_fused.begin(), pi_fused.end(), find_prim); - if (info_fused != pi_fused.end()) - std::cout << "kernel: " << info_fused->kernel_id << std::endl; - } - - layout get_input_layout(conv_eltw_test_params& p) { - auto pad = p.pad; - std::vector pad_ = { 0, 0, pad.spatial[0], pad.spatial[1] }; - return layout{ p.data_type, p.input_format, p.in_shape, padding{ pad_ } }; - } - - layout get_per_channel_layout(conv_eltw_test_params& p) { - return layout{ p.default_type, p.default_format, tensor{ 1, p.out_shape.feature[0], 1, 1 } }; - } -}; - -// in_shape; out_shape; kernel; stride; pad; dilation; groups; data_type; 
input_format; weights_type; weights_format; default_type; default_format; -#define CASE_CONV_FP32_1 { 1, 15, 4, 5 }, { 1, 30, 2, 3 }, { 1, 1, 3, 3 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::f32, format::bfyx, data_types::f32, format::oiyx, data_types::f32, format::bfyx -#define CASE_CONV_FP32_2 { 1, 16, 4, 5 }, { 1, 32, 2, 3 }, { 1, 1, 3, 3 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::f32, format::b_fs_yx_fsv16, data_types::f32, format::os_is_yx_isv16_osv16, data_types::f32, format::bfyx -#define CASE_CONV_FP32_3 { 1, 16, 4, 5 }, { 1, 32, 4, 5 }, { 1, 1, 1, 1 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::f32, format::b_fs_yx_fsv16, data_types::f32, format::os_is_yx_isv16_osv16, data_types::f32, format::bfyx -#define CASE_CONV_FP32_4 { 1, 32, 4, 5 }, { 1, 32, 4, 5 }, { 1, 1, 3, 3 }, tensor{ 1 }, tensor{ { 0, 0, 1, 1, 0, 0 }, 0 }, tensor{ 1 }, 32, data_types::f32, format::b_fs_yx_fsv16, data_types::f32, format::gs_oiyx_gsv16, data_types::f32, format::bfyx -#define CASE_CONV_FP32_5 { 1, 15, 4, 5 }, { 1, 30, 2, 3 }, { 1, 1, 3, 3 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::f32, format::bfyx, data_types::i8, format::bfyx, data_types::f32, format::bfyx -#define CASE_CONV_FP32_6 { 1, 16, 4, 5, 4 }, { 1, 16, 2, 3, 2 }, { 1, 1, 3, 3, 3 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::f32, format::b_fs_zyx_fsv16, data_types::f32, format::os_is_zyx_isv16_osv16, data_types::f32, format::bfzyx -#define CASE_CONV_FP32_7 { 1, 16, 4, 5, 4 }, { 1, 32, 2, 3, 2 }, { 1, 1, 3, 3, 3 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::f32, format::b_fs_zyx_fsv16, data_types::f32, format::os_is_zyx_isv16_osv16, data_types::f32, format::bfzyx -#define CASE_CONV_FP32_8 { 1, 32, 4, 5, 4 }, { 1, 16, 2, 3, 2 }, { 1, 1, 3, 3, 3 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 2, data_types::f32, format::b_fs_zyx_fsv16, data_types::f32, format::g_os_is_zyx_isv16_osv16, data_types::f32, format::bfzyx -#define CASE_CONV_FP32_9 { 1, 32, 4, 5, 4 
}, { 1, 32, 2, 3, 2 }, { 1, 1, 3, 3, 3 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 2, data_types::f32, format::b_fs_zyx_fsv16, data_types::f32, format::g_os_is_zyx_isv16_osv16, data_types::f32, format::bfzyx -#define CASE_CONV_FP32_10 { 32, 16, 4, 5, 4 }, { 32, 32, 4, 5, 4 }, { 1, 1, 1, 1, 1 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::f32, format::bs_fs_zyx_bsv16_fsv16, data_types::f32, format::bfzyx, data_types::f32, format::bfzyx -#define CASE_CONV_FP32_11 { 1, 32, 4, 5, 4 }, { 1, 16, 2, 3, 2 }, { 1, 1, 3, 3, 3 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 2, data_types::f32, format::b_fs_zyx_fsv16, data_types::f32, format::g_os_is_zyx_isv16_osv16, data_types::f32, format::bfzyx -#define CASE_CONV_FP32_12 { 1, 16, 4, 5, 4 }, { 1, 16, 2, 3, 2 }, { 1, 1, 3, 3, 3 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 2, data_types::f32, format::b_fs_zyx_fsv16, data_types::f32, format::g_os_is_zyx_isv16_osv16, data_types::f32, format::bfzyx -#define CASE_CONV_FP32_13 { 1, 16, 18, 5, 4 }, { 1, 16, 16, 3, 2 }, { 1, 1, 3, 3, 3 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 2, data_types::f32, format::b_fs_zyx_fsv16, data_types::f32, format::g_os_is_zyx_isv16_osv16, data_types::f32, format::bfzyx -#define CASE_CONV_FP32_14 { 1, 3, 4, 5 }, { 1, 30, 2, 3 }, { 1, 1, 3, 3 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::f32, format::bfyx, data_types::f32, format::bfyx, data_types::f32, format::bfyx - - -#define CASE_CONV_FP16_1 { 1, 15, 4, 5 }, { 1, 30, 2, 3 }, { 1, 1, 3, 3 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::f16, format::bfyx, data_types::f16, format::bfyx, data_types::f16, format::bfyx -#define CASE_CONV_FP16_2 { 1, 16, 4, 5 }, { 1, 32, 2, 3 }, { 1, 1, 3, 3 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::f16, format::b_fs_yx_fsv16, data_types::f16, format::os_is_yx_isv16_osv16, data_types::f16, format::bfyx -#define CASE_CONV_FP16_3 { 1, 16, 4, 5 }, { 1, 32, 4, 5 }, { 1, 1, 1, 1 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::f16, 
format::b_fs_yx_fsv16, data_types::f16, format::os_is_yx_isv16_osv16, data_types::f16, format::bfyx -#define CASE_CONV_FP16_4 { 1, 32, 4, 5 }, { 1, 32, 4, 5 }, { 1, 1, 3, 3 }, tensor{ 1 }, tensor{ { 0, 0, 1, 1, 0, 0 }, 0 }, tensor{ 1 }, 32, data_types::f16, format::b_fs_yx_fsv16, data_types::f16, format::gs_oiyx_gsv16, data_types::f16, format::bfyx -#define CASE_CONV_FP16_5 { 1, 15, 4, 5 }, { 1, 30, 2, 3 }, { 1, 1, 3, 3 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::f16, format::bfyx, data_types::i8, format::bfyx, data_types::f16, format::bfyx -#define CASE_CONV_FP16_6 { 1, 16, 4, 5, 4 }, { 1, 16, 2, 3, 2 }, { 1, 1, 3, 3, 3 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::f16, format::b_fs_zyx_fsv16, data_types::f16, format::os_is_zyx_isv16_osv16, data_types::f16, format::bfzyx -#define CASE_CONV_FP16_7 { 1, 16, 4, 5, 4 }, { 1, 32, 2, 3, 2 }, { 1, 1, 3, 3, 3 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::f16, format::b_fs_zyx_fsv16, data_types::f16, format::os_is_zyx_isv16_osv16, data_types::f16, format::bfzyx -#define CASE_CONV_FP16_8 { 1, 32, 4, 5, 4 }, { 1, 16, 2, 3, 2 }, { 1, 1, 3, 3, 3 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 2, data_types::f16, format::b_fs_zyx_fsv16, data_types::f16, format::g_os_is_zyx_isv16_osv16, data_types::f16, format::bfzyx -#define CASE_CONV_FP16_9 { 1, 32, 4, 5, 4 }, { 1, 32, 2, 3, 2 }, { 1, 1, 3, 3, 3 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 2, data_types::f16, format::b_fs_zyx_fsv16, data_types::f16, format::g_os_is_zyx_isv16_osv16, data_types::f16, format::bfzyx -#define CASE_CONV_FP16_10 { 32, 16, 4, 5, 4 }, { 32, 32, 2, 3, 2 }, { 1, 1, 3, 3, 3 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::f16, format::bs_fs_zyx_bsv16_fsv16, data_types::f16, format::bfzyx, data_types::f16, format::bfzyx -#define CASE_CONV_FP16_11 { 1, 32, 4, 5, 4 }, { 1, 16, 2, 3, 2 }, { 1, 1, 3, 3, 3 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 2, data_types::f16, format::b_fs_zyx_fsv16, data_types::f16, 
format::g_os_is_zyx_isv16_osv16, data_types::f16, format::bfzyx -#define CASE_CONV_FP16_12 { 1, 16, 4, 5, 4 }, { 1, 16, 2, 3, 2 }, { 1, 1, 3, 3, 3 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 2, data_types::f16, format::b_fs_zyx_fsv16, data_types::f16, format::g_os_is_zyx_isv16_osv16, data_types::f16, format::bfzyx -#define CASE_CONV_FP16_13 { 16, 32, 4, 5 }, { 16, 64, 2, 3 }, { 1, 1, 3, 3 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::f16, format::fs_b_yx_fsv32, data_types::f16, format::bfyx, data_types::f16, format::bfyx - -#define CASE_CONV_U8S8_1 { 1, 15, 4, 5 }, { 1, 30, 2, 3 }, { 1, 1, 3, 3 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::u8, format::bfyx, data_types::i8, format::bfyx, data_types::f32, format::bfyx -#define CASE_CONV_U8S8_2 { 1, 15, 5, 5 }, { 1, 30, 3, 3 }, { 1, 1, 3, 3 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::u8, format::bfyx, data_types::i8, format::bfyx, data_types::f32, format::bfyx -#define CASE_CONV_U8S8_3 { 1, 16, 4, 5 }, { 1, 32, 4, 5 }, { 1, 1, 1, 1 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::u8, format::bfyx, data_types::i8, format::bfyx, data_types::f32, format::bfyx -#define CASE_CONV_U8S8_4 { 1, 17, 4, 5 }, { 1, 17, 4, 5 }, { 1, 1, 3, 3 }, tensor{ 1 }, tensor{ { 0, 0, 1, 1, 0, 0 }, 0 }, tensor{ 1 }, 17, data_types::u8, format::bfyx, data_types::i8, format::goiyx, data_types::f32, format::bfyx -#define CASE_CONV_U8S8_5 { 1, 16, 5, 5 }, { 1, 32, 5, 5 }, { 1, 1, 1, 1 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::u8, format::bfyx, data_types::i8, format::bfyx, data_types::f32, format::bfyx -#define CASE_CONV_U8S8_6 { 1, 17, 4, 5 }, { 1, 17, 4, 5 }, { 1, 1, 1, 1 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 17, data_types::u8, format::bfyx, data_types::i8, format::goiyx, data_types::f32, format::bfyx -#define CASE_CONV_U8S8_7 { 1, 64, 7, 7 }, { 1, 32, 7, 7 }, { 1, 1, 3, 3 }, tensor{ 1 }, tensor{ { 0, 0, 1, 1, 0, 0 }, 0 }, tensor{ 1 }, 1, data_types::u8, format::bfyx, data_types::i8, 
format::bfyx, data_types::f32, format::bfyx -#define CASE_CONV_U8S8_8 { 1, 3, 4, 5 }, { 1, 32, 4, 5 }, { 1, 1, 3, 3 }, tensor{ 1 }, tensor{ { 0, 0, 1, 1, 0, 0 }, 0 }, tensor{ 1 }, 1, data_types::u8, format::bfyx, data_types::i8, format::bfyx, data_types::f32, format::bfyx -#define CASE_CONV_U8S8_9 { 16, 32, 5, 5 }, { 16, 32, 3, 3 }, { 1, 1, 1, 1 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::u8, format::bs_fs_yx_bsv16_fsv16, data_types::i8, format::os_is_yx_osv16_isv16, data_types::f32, format::bfyx -#define CASE_CONV_U8S8_10 { 16, 32, 5, 5 }, { 16, 32, 3, 3 }, { 1, 1, 3, 3 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::u8, format::bs_fs_yx_bsv16_fsv16, data_types::i8, format::os_is_yx_osv16_isv16, data_types::f32, format::bfyx -#define CASE_CONV_U8S8_11 { 32, 15, 4, 5 }, { 32, 30, 2, 3 }, { 1, 1, 3, 3 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::u8, format::bfyx, data_types::i8, format::bfyx, data_types::f32, format::bfyx -#define CASE_CONV_U8S8_12 { 32, 15, 5, 5 }, { 32, 30, 3, 3 }, { 1, 1, 3, 3 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::u8, format::bfyx, data_types::i8, format::bfyx, data_types::f32, format::bfyx -#define CASE_CONV_U8S8_13 { 32, 16, 4, 5 }, { 32, 32, 4, 5 }, { 1, 1, 1, 1 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::u8, format::bfyx, data_types::i8, format::bfyx, data_types::f32, format::bfyx -#define CASE_CONV_U8S8_14 { 32, 17, 4, 5 }, { 32, 17, 4, 5 }, { 1, 1, 3, 3 }, tensor{ 1 }, tensor{ { 0, 0, 1, 1, 0, 0 }, 0 }, tensor{ 1 }, 17, data_types::u8, format::bfyx, data_types::i8, format::goiyx, data_types::f32, format::bfyx -#define CASE_CONV_U8S8_15 { 1, 15, 2, 2 }, { 1, 30, 1, 1 }, { 1, 1, 3, 3 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::u8, format::bfyx, data_types::i8, format::bfyx, data_types::f32, format::bfyx - -#define CASE_CONV_S8S8_1 { 1, 15, 4, 5 }, { 1, 30, 2, 3 }, { 1, 1, 3, 3 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::i8, format::bfyx, 
data_types::i8, format::bfyx, data_types::f32, format::bfyx -#define CASE_CONV_S8S8_2 { 1, 15, 5, 5 }, { 1, 30, 3, 3 }, { 1, 1, 3, 3 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::i8, format::bfyx, data_types::i8, format::bfyx, data_types::f32, format::bfyx -#define CASE_CONV_S8S8_3 { 1, 16, 4, 5 }, { 1, 32, 4, 5 }, { 1, 1, 1, 1 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::i8, format::bfyx, data_types::i8, format::bfyx, data_types::f32, format::bfyx -#define CASE_CONV_S8S8_4 { 1, 17, 4, 5 }, { 1, 17, 4, 5 }, { 1, 1, 3, 3 }, tensor{ 1 }, tensor{ { 0, 0, 1, 1, 0, 0 }, 0 }, tensor{ 1 }, 17, data_types::i8, format::bfyx, data_types::i8, format::goiyx, data_types::f32, format::bfyx -#define CASE_CONV_S8S8_5 { 1, 16, 5, 5 }, { 1, 32, 5, 5 }, { 1, 1, 1, 1 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::i8, format::bfyx, data_types::i8, format::bfyx, data_types::f32, format::bfyx -#define CASE_CONV_S8S8_6 { 1, 17, 4, 5 }, { 1, 17, 4, 5 }, { 1, 1, 1, 1 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 17, data_types::i8, format::bfyx, data_types::i8, format::goiyx, data_types::f32, format::bfyx -#define CASE_CONV_S8S8_7 { 1, 64, 7, 7 }, { 1, 32, 7, 7 }, { 1, 1, 3, 3 }, tensor{ 1 }, tensor{ { 0, 0, 1, 1, 0, 0 }, 0 }, tensor{ 1 }, 1, data_types::i8, format::bfyx, data_types::i8, format::bfyx, data_types::f32, format::bfyx -#define CASE_CONV_S8S8_8 { 1, 3, 4, 5 }, { 1, 32, 4, 5 }, { 1, 1, 3, 3 }, tensor{ 1 }, tensor{ { 0, 0, 1, 1, 0, 0 }, 0 }, tensor{ 1 }, 1, data_types::i8, format::bfyx, data_types::i8, format::bfyx, data_types::f32, format::bfyx -#define CASE_CONV_S8S8_9 { 16, 32, 5, 5 }, { 16, 32, 3, 3 }, { 1, 1, 1, 1 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::i8, format::bs_fs_yx_bsv16_fsv16, data_types::i8, format::os_is_yx_osv16_isv16, data_types::f32, format::bfyx -#define CASE_CONV_S8S8_10 { 16, 32, 5, 5 }, { 16, 32, 3, 3 }, { 1, 1, 3, 3 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::i8, format::bs_fs_yx_bsv16_fsv16, 
data_types::i8, format::os_is_yx_osv16_isv16, data_types::f32, format::bfyx -#define CASE_CONV_S8S8_11 { 1, 4, 1280, 720 }, { 1, 4, 1280, 720 }, { 1, 1, 5, 5 }, tensor{ 1 }, tensor{ { 0, 0, 2, 2 }, 0 }, tensor{ 1 }, 1, data_types::i8, format::b_fs_yx_fsv4, data_types::i8, format::os_is_yx_osv16_isv4, data_types::f32, format::bfyx -#define CASE_CONV_S8S8_12 { 32, 15, 4, 5 }, { 32, 30, 2, 3 }, { 1, 1, 3, 3 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::i8, format::bfyx, data_types::i8, format::bfyx, data_types::f32, format::bfyx -#define CASE_CONV_S8S8_13 { 32, 15, 5, 5 }, { 32, 30, 3, 3 }, { 1, 1, 3, 3 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::i8, format::bfyx, data_types::i8, format::bfyx, data_types::f32, format::bfyx -#define CASE_CONV_S8S8_14 { 32, 16, 4, 5 }, { 32, 32, 4, 5 }, { 1, 1, 1, 1 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::i8, format::bfyx, data_types::i8, format::bfyx, data_types::f32, format::bfyx -#define CASE_CONV_S8S8_15 { 32, 17, 4, 5 }, { 32, 17, 4, 5 }, { 1, 1, 3, 3 }, tensor{ 1 }, tensor{ { 0, 0, 1, 1, 0, 0 }, 0 }, tensor{ 1 }, 17, data_types::i8, format::bfyx, data_types::i8, format::goiyx, data_types::f32, format::bfyx - -#define CASE_CONV3D_U8S8_1 { 1, 15, 5, 4, 5 }, { 1, 30, 3, 2, 3 }, { 1, 1, 3, 3, 3 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::u8, format::bfzyx, data_types::i8, format::bfzyx, data_types::f32, format::bfzyx -#define CASE_CONV3D_U8S8_2 { 1, 15, 5, 5, 5 }, { 1, 30, 3, 3, 3 }, { 1, 1, 3, 3, 3 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::u8, format::bfzyx, data_types::i8, format::bfzyx, data_types::f32, format::bfzyx -#define CASE_CONV3D_U8S8_3 { 1, 16, 5, 4, 5 }, { 1, 32, 5, 4, 5 }, { 1, 1, 1, 1, 1 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::u8, format::bfzyx, data_types::i8, format::bfzyx, data_types::f32, format::bfzyx -#define CASE_CONV3D_U8S8_4 { 1, 17, 5, 4, 5 }, { 1, 17, 5, 4, 5 }, { 1, 1, 3, 3, 3 }, tensor{ 1 }, tensor{ { 0, 0, 1, 1, 1 }, 0 }, 
tensor{ 1 }, 17, data_types::u8, format::bfzyx, data_types::i8, format::goizyx, data_types::f32, format::bfzyx -#define CASE_CONV3D_U8S8_5 { 1, 3, 5, 4, 5 }, { 1, 32, 5, 4, 5 }, { 1, 1, 3, 3, 3 }, tensor{ 1 }, tensor{ { 0, 0, 1, 1, 1 }, 0 }, tensor{ 1 }, 1, data_types::u8, format::bfzyx, data_types::i8, format::bfzyx, data_types::f32, format::bfzyx - -#define CASE_CONV3D_S8S8_1 { 1, 15, 5, 4, 5 }, { 1, 30, 3, 2, 3 }, { 1, 1, 3, 3, 3 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::i8, format::bfzyx, data_types::i8, format::bfzyx, data_types::f32, format::bfzyx -#define CASE_CONV3D_S8S8_2 { 1, 15, 5, 5, 5 }, { 1, 30, 3, 3, 3 }, { 1, 1, 3, 3, 3 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::i8, format::bfzyx, data_types::i8, format::bfzyx, data_types::f32, format::bfzyx -#define CASE_CONV3D_S8S8_3 { 1, 16, 5, 4, 5 }, { 1, 32, 5, 4, 5 }, { 1, 1, 1, 1, 1 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::i8, format::bfzyx, data_types::i8, format::bfzyx, data_types::f32, format::bfzyx -#define CASE_CONV3D_S8S8_4 { 1, 17, 5, 4, 5 }, { 1, 17, 5, 4, 5 }, { 1, 1, 3, 3, 3 }, tensor{ 1 }, tensor{ { 0, 0, 1, 1, 1 }, 0 }, tensor{ 1 }, 17, data_types::i8, format::bfzyx, data_types::i8, format::goizyx, data_types::f32, format::bfzyx -#define CASE_CONV3D_S8S8_5 { 1, 3, 5, 4, 5 }, { 1, 18, 5, 4, 5 }, { 1, 1, 3, 3, 3 }, tensor{ 1 }, tensor{ { 0, 0, 1, 1, 1 }, 0 }, tensor{ 1 }, 1, data_types::i8, format::bfzyx, data_types::i8, format::bfzyx, data_types::f32, format::bfzyx - -// in_shape; out_shape; eltw_shape; kernel; stride; pad; dilation; groups; data_type; input_format; weights_type; weights_format; default_type; default_format; -#define CASE_CONV_ELTW_FP32_1 { 1, 16, 4, 5 }, { 1, 32, 2, 3 }, { 1, 32, 1, 1 }, { 1, 1, 3, 3 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::f32, format::b_fs_yx_fsv16, data_types::f32, format::oiyx, data_types::f32, format::bfyx -#define CASE_CONV_ELTW_FP32_2 { 1, 16, 4, 5 }, { 1, 32, 2, 3 }, { 1, 1, 1, 1 }, { 1, 1, 3, 3 
}, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::f32, format::b_fs_yx_fsv16, data_types::f32, format::os_is_yx_isv16_osv16, data_types::f32, format::bfyx -#define CASE_CONV_ELTW_FP32_3 { 1, 16, 4, 5 }, { 1, 32, 4, 5 }, { 1, 32, 4, 5 }, { 1, 1, 1, 1 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::f32, format::b_fs_yx_fsv16, data_types::f32, format::os_is_yx_isv16_osv16, data_types::f32, format::bfyx -#define CASE_CONV_ELTW_FP32_4 { 1, 32, 4, 5 }, { 1, 32, 4, 5 }, { 1, 32, 1, 1 }, { 1, 1, 3, 3 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 32, data_types::f32, format::b_fs_yx_fsv16, data_types::f32, format::gs_oiyx_gsv16, data_types::f32, format::bfyx -#define CASE_CONV_ELTW_FP32_5 { 1, 32, 4, 5, 4 }, { 1, 32, 2, 3, 2 }, { 1, 32, 2, 1, 1 }, { 1, 1, 3, 3, 3 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 2, data_types::f32, format::b_fs_zyx_fsv16, data_types::f32, format::g_os_is_zyx_isv16_osv16, data_types::f32, format::bfzyx -#define CASE_CONV_ELTW_FP32_6 { 1, 32, 4, 5, 4 }, { 1, 16, 2, 3, 2 }, { 1, 16, 2, 1, 1 }, { 1, 1, 3, 3, 3 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 2, data_types::f32, format::b_fs_zyx_fsv16, data_types::f32, format::g_os_is_zyx_isv16_osv16, data_types::f32, format::bfzyx -#define CASE_CONV_ELTW_FP32_7 { 1, 16, 3, 5 }, { 1, 32, 1, 3 }, { 1, 32, 3, 1 }, { 1, 1, 3, 3 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::f32, format::b_fs_yx_fsv16, data_types::f32, format::os_is_yx_isv16_osv16, data_types::f32, format::bfyx -#define CASE_CONV_ELTW_FP32_8 { 1, 32, 3, 5, 4 }, { 1, 16, 1, 3, 2 }, { 1, 1, 2, 1, 1 }, { 1, 1, 3, 3, 3 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 2, data_types::f32, format::b_fs_zyx_fsv16, data_types::f32, format::g_os_is_zyx_isv16_osv16, data_types::f32, format::bfzyx - -#define CASE_CONV_ELTW_i8_1 { 1, 16, 3, 5 }, { 1, 32, 1, 3 }, { 1, 32, 3, 1 }, { 1, 1, 3, 3 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::i8, format::b_fs_yx_fsv16, data_types::i8, format::os_is_yx_osv16_isv16, data_types::f32, 
format::bfyx -#define CASE_CONV_ELTW_i8_2 { 1, 16, 3, 5, 3 }, { 1, 32, 2, 4, 2 }, { 1, 1, 2, 4, 2 }, { 1, 1, 2, 2, 2 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::i8, format::bfzyx, data_types::i8, format::oiyx, data_types::f32, format::bfzyx -#define CASE_CONV_ELTW_i8_3 { 1, 1, 1, 1, 1 }, { 1, 1, 1, 1, 1 }, { 1, 1, 1, 1, 1 }, { 1, 1, 1, 1, 1 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::i8, format::bfzyx, data_types::i8, format::oiyx, data_types::f32, format::bfzyx -#define CASE_CONV_ELTW_i8_4 { 1, 16, 1, 4 }, { 1, 16, 1, 2 }, { 1, 16, 1, 1 }, { 1, 1, 1, 3 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::i8, format::b_fs_yx_fsv16, data_types::i8, format::os_is_yx_osv16_isv16, data_types::f32, format::bfyx -#define CASE_CONV_ELTW_i8_5 { 1, 16, 1, 4, 1 }, { 1, 16, 1, 2, 1 }, { 1, 16, 2, 1, 1 }, { 1, 1, 1, 3, 1 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::i8, format::bfzyx, data_types::i8, format::oiyx, data_types::f32, format::bfzyx - -#define CASE_BIN_CONV1 { 1, 16, 4, 5 }, { 1, 16, 4, 5 }, { 1, 1, 3, 3 }, tensor{ 1 }, tensor{ { 0, 0, 1, 1, 0, 0 }, 0 }, tensor{ 1 }, 1, data_types::bin, format::b_fs_yx_32fp, data_types::bin, format::os_is_yx_osv32_isv32p, data_types::f32, format::bfyx -#define CASE_BIN_CONV2 { 1, 16, 4, 5 }, { 1, 30, 4, 5 }, { 1, 1, 1, 1 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::bin, format::b_fs_yx_32fp, data_types::bin, format::os_is_yx_osv32_isv32p, data_types::f32, format::bfyx -#define CASE_BIN_CONV3 { 1, 184, 12, 21 }, { 1, 224, 12, 21 }, { 1, 1, 1, 1 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::bin, format::b_fs_yx_32fp, data_types::bin, format::os_is_yx_osv32_isv32p, data_types::f32, format::bfyx - -#define CASE_FC_FP32_1 { 1, 1, 3, 1 }, { 1, 4, 1, 1 }, { 4, 1, 3, 1 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::f32, format::bfyx, data_types::f32, format::oiyx, data_types::f32, format::bfyx -#define CASE_FC_FP32_2 { 2, 1, 3, 1 }, { 2, 4, 1, 1 }, { 4, 1, 3, 1 }, 
tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::f32, format::yxfb, data_types::f32, format::oiyx, data_types::f32, format::bfyx -#define CASE_FC_FP32_3 { 2, 32, 1, 1 }, { 2, 16, 1, 1 }, { 16, 32, 1, 1 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::f32, format::bfyx, data_types::i8, format::oiyx, data_types::f32, format::bfyx -#define CASE_FC_FP32_3D_1 { 5, 3, 1, 3 }, { 5, 3, 1, 5 }, { 5, 3, 1, 1 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::f32, format::bfyx, data_types::f32, format::os_iyx_osv16, data_types::f32, format::bfyx -#define CASE_FC_FP32_3D_2 { 2, 1, 1, 1 }, { 2, 1, 1, 32 }, { 32, 1, 1, 1 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::f32, format::bfyx, data_types::f32, format::os_iyx_osv16, data_types::f32, format::bfyx -#define CASE_FC_FP32_3D_3 { 2, 32, 1, 32 }, { 2, 32, 1, 16 }, { 16, 32, 1, 1 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::f32, format::bfyx, data_types::f32, format::os_iyx_osv16, data_types::f32, format::bfyx - -#define CASE_FC_U8S8_1 { 1, 1, 3, 1 }, { 1, 4, 1, 1 }, { 4, 1, 3, 1 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::u8, format::bfyx, data_types::i8, format::oiyx, data_types::f32, format::bfyx -#define CASE_FC_U8S8_2 { 2, 1, 3, 1 }, { 2, 4, 1, 1 }, { 4, 1, 3, 1 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::u8, format::b_fs_yx_fsv4, data_types::i8, format::oiyx, data_types::f32, format::bfyx -#define CASE_FC_U8S8_3 { 2, 32, 1, 1 }, { 2, 16, 1, 1 }, { 16, 32, 1, 1 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::u8, format::b_fs_yx_fsv4, data_types::i8, format::oiyx, data_types::f32, format::bfyx -#define CASE_FC_U8S8_3D_1 { 2, 32, 1, 3 }, { 2, 32, 1, 16 }, { 16, 3, 1, 1 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::u8, format::bfyx, data_types::i8, format::oiyx, data_types::f32, format::bfyx -#define CASE_FC_U8S8_3D_2 { 1, 1, 1, 3 }, { 1, 1, 1, 32 }, { 32, 3, 1, 1 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::u8, format::bfyx, 
data_types::i8, format::oiyx, data_types::f32, format::bfyx -#define CASE_FC_U8S8_3D_3 { 2, 3, 1, 1 }, { 2, 3, 1, 15 }, { 15, 1, 1, 1 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::u8, format::bfyx, data_types::i8, format::oiyx, data_types::f32, format::bfyx -#define CASE_FC_U8S8_3D_4 { 1, 512, 1, 1024 }, { 1, 384, 1, 1024 }, { 1024, 1024, 1, 1 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::u8, format::bfyx, data_types::i8, format::oiyx, data_types::f32, format::bfyx - -#define CASE_NORMALIZE_I8_1 { 1, 2, 3, 3 }, data_types::u8, format::bfyx, data_types::f32, format::bfyx - -/* ----------------------------------------------------------------------------------------------------- */ -/* ---------------------------------------- FP32 convolution cases ------------------------------------- */ -/* ----------------------------------------------------------------------------------------------------- */ -/* ----------- NOTE: A part of tests is disabled until all FP kernels don't support fusings ------------ */ - -class ConvFusingTest : public WeightsPrimitiveFusingTest { -public: - void execute(bc_test_params& p) { - auto input_prim = get_mem(get_input_layout(p)); - network network_not_fused(this->engine, this->topology_non_fused, bo_not_fused); - network network_fused(this->engine, this->topology_fused, bo_fused); - network_fused.set_input_data("input", input_prim); - network_not_fused.set_input_data("input", input_prim); - - compare(network_not_fused, network_fused, p); - auto find_conv = [](primitive_info& p) -> bool { - if (p.original_id == "conv_prim") - return true; - return false; - }; - - auto pi_fused = network_fused.get_primitives_info(); - auto info_fused = std::find_if(pi_fused.begin(), pi_fused.end(), find_conv); - if (info_fused != pi_fused.end()) - std::cout << "kernel: " << info_fused->kernel_id << std::endl; - } -}; - -class conv_fp32_reorder_fsv16_to_bfyx : public ConvFusingTest {}; -TEST_P(conv_fp32_reorder_fsv16_to_bfyx, basic) { - 
auto p = GetParam(); - create_topologies( - input_layout("input", get_input_layout(p)), - data("weights", get_mem(get_weights_layout(p))), - reorder("reorder_fsv16", "input", format::b_fs_yx_fsv16, data_types::f32), - convolution("conv_prim", "reorder_fsv16", { "weights" }, p.groups, p.stride, p.pad, p.dilation), - reorder("reorder_bfyx", "conv_prim", format::bfyx, data_types::f32) - ); - - execute(p); -} - -INSTANTIATE_TEST_SUITE_P(fusings_gpu, conv_fp32_reorder_fsv16_to_bfyx, ::testing::ValuesIn(std::vector{ - bc_test_params{ CASE_CONV_FP32_1, 2, 2 }, - bc_test_params{ CASE_CONV_FP32_2, 2, 2 }, - bc_test_params{ CASE_CONV_FP32_3, 2, 2 }, - bc_test_params{ CASE_CONV_FP32_4, 2, 2 }, - bc_test_params{ CASE_CONV_FP32_5, 2, 2 }, - bc_test_params{ CASE_CONV_FP32_14, 2, 2 }, - - bc_test_params{ CASE_CONV_FP16_1, 2, 2 }, - bc_test_params{ CASE_CONV_FP16_2, 2, 2 }, - bc_test_params{ CASE_CONV_FP16_3, 2, 2 }, - bc_test_params{ CASE_CONV_FP16_4, 2, 2 }, - bc_test_params{ CASE_CONV_FP16_5, 2, 2 }, - bc_test_params{ CASE_CONV_FP16_13, 2, 2 } -})); - -class conv_fp32_reorder_fsv16_to_bfyx_conv : public ConvFusingTest {}; -TEST_P(conv_fp32_reorder_fsv16_to_bfyx_conv, basic) { - auto p = GetParam(); - - auto dw_tensor = cldnn::tensor(group(p.out_shape.feature[0]), batch(1), feature(1), spatial(3, 3)); - auto dw_weights_layout = layout{ p.default_type, format::goiyx, dw_tensor }; - auto dw_stride = tensor{ 0, 0, 1, 1 }; - - create_topologies( - input_layout("input", get_input_layout(p)), - data("weights", get_mem(get_weights_layout(p), -127, 127)), - data("weights_dw", get_mem(dw_weights_layout, -127, 127)), - reorder("reorder_fsv16", "input", format::b_fs_yx_fsv16, data_types::f32), - convolution("conv_prim", "reorder_fsv16", { "weights" }, p.groups, p.stride, p.pad, p.dilation), - reorder("reorder_bfyx", "conv_prim", format::bfyx, data_types::f32), - convolution("conv_output", "reorder_bfyx", { "weights_dw" }, p.out_shape.feature[0], dw_stride, p.pad, p.dilation), - 
activation("activation", "conv_output", activation_func::abs), - reorder("reorder_output", "activation", p.default_format, data_types::f32) - ); - - execute(p); -} - -INSTANTIATE_TEST_SUITE_P(fusings_gpu, conv_fp32_reorder_fsv16_to_bfyx_conv, ::testing::ValuesIn(std::vector{ - bc_test_params{ CASE_CONV_FP32_1, 3, 4 }, - bc_test_params{ CASE_CONV_FP32_2, 3, 4 }, - bc_test_params{ CASE_CONV_FP32_3, 3, 4 }, - bc_test_params{ CASE_CONV_FP32_4, 3, 4 }, - bc_test_params{ CASE_CONV_FP32_5, 3, 4 }, - bc_test_params{ CASE_CONV_FP32_14, 3, 4 }, - - bc_test_params{ CASE_CONV_FP16_1, 3, 4 }, - bc_test_params{ CASE_CONV_FP16_2, 3, 4 }, - bc_test_params{ CASE_CONV_FP16_3, 3, 4 }, - bc_test_params{ CASE_CONV_FP16_4, 3, 4 }, - bc_test_params{ CASE_CONV_FP16_5, 3, 4 }, - bc_test_params{ CASE_CONV_FP16_13, 3, 4 }, -})); - - -class conv_fp32_activation : public ConvFusingTest {}; -TEST_P(conv_fp32_activation, basic) { - auto p = GetParam(); - create_topologies( - input_layout("input", get_input_layout(p)), - data("weights", get_mem(get_weights_layout(p))), - data("bias", get_mem(get_bias_layout(p))), - convolution("conv_prim", "input", { "weights" }, { "bias" }, p.groups, p.stride, p.pad, p.dilation), - activation("activation", "conv_prim", activation_func::abs), - reorder("reorder_bfyx", "activation", p.default_format, data_types::f32) - ); - - execute(p); -} - -INSTANTIATE_TEST_SUITE_P(fusings_gpu, conv_fp32_activation, ::testing::ValuesIn(std::vector{ - bc_test_params{ CASE_CONV_FP32_1, 2, 3 }, - bc_test_params{ CASE_CONV_FP32_2, 2, 3 }, - bc_test_params{ CASE_CONV_FP32_3, 2, 3 }, - bc_test_params{ CASE_CONV_FP32_4, 2, 3 }, - - bc_test_params{ CASE_CONV_FP16_4, 2, 3 }, - bc_test_params{ CASE_CONV_FP16_4, 2, 3 }, - bc_test_params{ CASE_CONV_FP16_4, 2, 3 }, - bc_test_params{ CASE_CONV_FP16_4, 2, 3 }, -})); - - -class conv_fp32_scale : public ConvFusingTest {}; -TEST_P(conv_fp32_scale, basic) { - auto p = GetParam(); - create_topologies( - input_layout("input", get_input_layout(p)), 
- data("weights", get_mem(get_weights_layout(p))), - data("bias", get_mem(get_bias_layout(p))), - data("scale_data", get_mem(get_per_channel_layout(p), 1.0f/p.kernel.count())), - convolution("conv_prim", "input", { "weights" }, { "bias" }, p.groups, p.stride, p.pad, p.dilation), - scale("scale", "conv_prim", "scale_data"), - reorder("reorder_bfyx", "scale", p.default_format, data_types::f32) - ); - - tolerance = 1e-5f; - execute(p); -} - -INSTANTIATE_TEST_SUITE_P(fusings_gpu, conv_fp32_scale, ::testing::ValuesIn(std::vector{ - // bc_test_params{ CASE_CONV_FP32_1, 2, 3 }, - bc_test_params{ CASE_CONV_FP32_2, 2, 3 }, - bc_test_params{ CASE_CONV_FP32_3, 2, 3 }, - bc_test_params{ CASE_CONV_FP32_4, 2, 3 }, - bc_test_params{ CASE_CONV_FP32_10, 2, 3 }, - - // bc_test_params{ CASE_CONV_FP16_1, 2, 3 }, - bc_test_params{ CASE_CONV_FP16_2, 2, 3 }, - bc_test_params{ CASE_CONV_FP16_3, 2, 3 }, - bc_test_params{ CASE_CONV_FP16_4, 2, 3 }, - bc_test_params{ CASE_CONV_FP16_10, 2, 3 }, -})); - -class conv_fp32_bias : public ConvFusingTest {}; -TEST_P(conv_fp32_bias, basic) { - auto p = GetParam(); - create_topologies( - input_layout("input", get_input_layout(p)), - data("weights", get_mem(get_weights_layout(p))), - data("bias", get_mem(get_bias_layout(p))), - convolution("conv_prim", "input", { "weights" }, std::vector{}, p.groups, p.stride, p.pad, p.dilation), - eltwise("add_bias", { "conv_prim", "bias" }, eltwise_mode::sum), - reorder("reorder_bfyx", "add_bias", p.default_format, data_types::f32) - ); - - tolerance = 1e-5f; - execute(p); -} - -INSTANTIATE_TEST_SUITE_P(fusings_gpu, conv_fp32_bias, ::testing::ValuesIn(std::vector{ - bc_test_params{ CASE_CONV_FP32_1, 2, 3 }, - bc_test_params{ CASE_CONV_FP32_2, 2, 3 }, - bc_test_params{ CASE_CONV_FP32_3, 2, 3 }, - bc_test_params{ CASE_CONV_FP32_4, 2, 3 }, - bc_test_params{ CASE_CONV_FP32_10, 2, 3 }, - - bc_test_params{ CASE_CONV_FP16_1, 2, 3 }, - bc_test_params{ CASE_CONV_FP16_2, 2, 3 }, - bc_test_params{ CASE_CONV_FP16_3, 2, 3 }, - 
bc_test_params{ CASE_CONV_FP16_4, 2, 3 }, - bc_test_params{ CASE_CONV_FP16_10, 2, 3 }, -})); - -class conv_fp32_double_bias : public ConvFusingTest {}; -TEST_P(conv_fp32_double_bias, basic) { - auto p = GetParam(); - create_topologies( - input_layout("input", get_input_layout(p)), - data("weights", get_mem(get_weights_layout(p))), - data("bias1", get_mem(get_bias_layout(p))), - data("bias2", get_mem(get_bias_layout(p))), - convolution("conv_prim", "input", { "weights" }, std::vector{}, p.groups, p.stride, p.pad, p.dilation), - eltwise("add_bias1", { "conv_prim", "bias1" }, eltwise_mode::sum), - eltwise("add_bias2", { "add_bias1", "bias2" }, eltwise_mode::sum), - reorder("reorder_bfyx", "add_bias2", p.default_format, data_types::f32) - ); - - tolerance = 1e-5f; - execute(p); -} - -INSTANTIATE_TEST_SUITE_P(fusings_gpu, conv_fp32_double_bias, ::testing::ValuesIn(std::vector{ - bc_test_params{ CASE_CONV_U8S8_1, 2, 4 }, - bc_test_params{ CASE_CONV_S8S8_1, 2, 4 }, -})); - -class conv_fp32_prelu_eltwise : public ConvFusingTest {}; -TEST_P(conv_fp32_prelu_eltwise, basic_sum) { - auto p = GetParam(); - create_topologies( - input_layout("input", get_input_layout(p)), - data("weights", get_mem(get_weights_layout(p))), - data("bias", get_mem(get_bias_layout(p))), - data("slope_data", get_mem(get_per_channel_layout(p))), - data("eltwise_data", get_mem(get_output_layout(p))), - convolution("conv_prim", "input", { "weights" }, { "bias" }, p.groups, p.stride, p.pad, p.dilation), - activation("activation", "conv_prim", "slope_data", activation_func::relu_negative_slope), - eltwise("eltwise", "activation", "eltwise_data", eltwise_mode::sum), - reorder("reorder_bfyx", "eltwise", p.default_format, data_types::f32) - ); - - tolerance = 1e-5f; - execute(p); -} - -TEST_P(conv_fp32_prelu_eltwise, basic_prod) { - auto p = GetParam(); - create_topologies( - input_layout("input", get_input_layout(p)), - data("weights", get_mem(get_weights_layout(p))), - data("bias", 
get_mem(get_bias_layout(p))), - data("slope_data", get_mem(get_per_channel_layout(p))), - data("eltwise_data", get_mem(get_output_layout(p))), - convolution("conv_prim", "input", { "weights" }, { "bias" }, p.groups, p.stride, p.pad, p.dilation), - activation("activation", "conv_prim", "slope_data", activation_func::relu_negative_slope), - eltwise("eltwise", "activation", "eltwise_data", eltwise_mode::prod), - reorder("reorder_bfyx", "eltwise", p.default_format, data_types::f32) - ); - - tolerance = 1e-5f; - execute(p); -} - -TEST_P(conv_fp32_prelu_eltwise, eltw_broadcast_sum) { - auto p = GetParam(); - tensor eltw_shape = p.default_format.spatial_num() == 2 ? tensor{ 1, 1, 1, 1 } : tensor{ 1, 1, 1, 1, 1 }; - create_topologies( - input_layout("input", get_input_layout(p)), - data("weights", get_mem(get_weights_layout(p))), - data("bias", get_mem(get_bias_layout(p))), - data("slope_data", get_mem(get_per_channel_layout(p))), - data("eltwise_data", get_mem(layout{ p.data_type, p.input_format, eltw_shape })), - convolution("conv_prim", "input", { "weights" }, { "bias" }, p.groups, p.stride, p.pad, p.dilation), - activation("activation", "conv_prim", "slope_data", activation_func::relu_negative_slope), - eltwise("eltwise", "activation", "eltwise_data", eltwise_mode::sum), - reorder("reorder_bfyx", "eltwise", p.default_format, data_types::f32) - ); - - tolerance = 1e-5f; - execute(p); -} - -TEST_P(conv_fp32_prelu_eltwise, eltw_broadcast_prod) { - auto p = GetParam(); - tensor eltw_shape = p.default_format.spatial_num() == 2 ? 
tensor{ 1, 1, 1, 1 } : tensor{ 1, 1, 1, 1, 1 }; - create_topologies( - input_layout("input", get_input_layout(p)), - data("weights", get_mem(get_weights_layout(p))), - data("bias", get_mem(get_bias_layout(p))), - data("slope_data", get_mem(get_per_channel_layout(p))), - data("eltwise_data", get_mem(layout{ p.data_type, p.input_format, eltw_shape })), - convolution("conv_prim", "input", { "weights" }, { "bias" }, p.groups, p.stride, p.pad, p.dilation), - activation("activation", "conv_prim", "slope_data", activation_func::relu_negative_slope), - eltwise("eltwise", "activation", "eltwise_data", eltwise_mode::prod), - reorder("reorder_bfyx", "eltwise", p.default_format, data_types::f32) - ); - - tolerance = 1e-5f; - execute(p); -} - -TEST_P(conv_fp32_prelu_eltwise, vector_ops) { - auto p = GetParam(); - create_topologies( - input_layout("input", get_input_layout(p)), - data("weights", get_mem(get_weights_layout(p))), - data("bias", get_mem(get_bias_layout(p))), - data("slope_data", get_mem(get_per_channel_layout(p))), - data("eltwise_data", get_mem(get_output_layout(p))), - convolution("conv_prim", "input", { "weights" }, { "bias" }, p.groups, p.stride, p.pad, p.dilation), - activation("activation", "conv_prim", "slope_data", activation_func::relu_negative_slope), - eltwise("eltwise", "activation", "eltwise_data", eltwise_mode::sum), - reorder("reorder_bfyx", "eltwise", p.default_format, data_types::f32) - ); - - implementation_desc conv_impl = { format::b_fs_yx_fsv16, "" }; - bo_fused.set_option(build_option::force_implementations({ { "conv_prim", conv_impl } })); - - tolerance = 1e-5f; - execute(p); -} - -TEST_P(conv_fp32_prelu_eltwise, vector_ops_mixed_types) { - auto p = GetParam(); - auto slope_type = p.default_type == data_types::f32 ? 
data_types::f16 : data_types::f32; - create_topologies( - input_layout("input", get_input_layout(p)), - data("weights", get_mem(get_weights_layout(p))), - data("bias", get_mem(get_bias_layout(p))), - data("slope_data", get_mem(layout{ slope_type, p.default_format, tensor{ 1, p.out_shape.feature[0], 1, 1 } })), - data("eltwise_data", get_mem(get_output_layout(p))), - convolution("conv_prim", "input", { "weights" }, { "bias" }, p.groups, p.stride, p.pad, p.dilation), - activation("activation", "conv_prim", "slope_data", activation_func::relu_negative_slope), - eltwise("eltwise", "activation", "eltwise_data", eltwise_mode::sum), - reorder("reorder_bfyx", "eltwise", p.default_format, data_types::f32) - ); - - implementation_desc conv_impl = { format::b_fs_yx_fsv16, "" }; - bo_fused.set_option(build_option::force_implementations({ { "conv_prim", conv_impl } })); - - tolerance = 1e-5f; - execute(p); -} - -INSTANTIATE_TEST_SUITE_P(fusings_gpu, conv_fp32_prelu_eltwise, ::testing::ValuesIn(std::vector{ - // bc_test_params{ CASE_CONV_FP32_1, 2, 4 }, - bc_test_params{ CASE_CONV_FP32_2, 2, 4 }, - bc_test_params{ CASE_CONV_FP32_3, 2, 4 }, - bc_test_params{ CASE_CONV_FP32_4, 2, 4 }, - - // bc_test_params{ CASE_CONV_FP32_1, 2, 4 }, - bc_test_params{ CASE_CONV_FP16_2, 2, 4 }, - bc_test_params{ CASE_CONV_FP16_3, 2, 4 }, - bc_test_params{ CASE_CONV_FP16_4, 2, 4 }, -})); - -class conv_fp32_multi_eltwise_2 : public ConvFusingTest {}; -TEST_P(conv_fp32_multi_eltwise_2, basic) { - if (engine.get_device_info().supports_immad) { - return; - } - - auto p = GetParam(); - create_topologies( - input_layout("input", get_input_layout(p)), - data("eltwise_data", get_mem(get_output_layout(p))), - data("bias", get_mem(get_bias_layout(p))), - data("weights", get_mem(get_weights_layout(p))), - convolution("conv_prim", "input", { "weights" }, { "bias" }, p.groups, p.stride, p.pad, p.dilation), - eltwise("eltwise1", "conv_prim", "eltwise_data", eltwise_mode::sum), - eltwise("eltwise2", "eltwise1", 
"conv_prim", eltwise_mode::prod), - reorder("reorder_bfyx", "eltwise2", p.default_format, data_types::f32) - ); - implementation_desc conv_impl = { format::b_fs_yx_fsv16, "" }; - bo_fused.set_option(build_option::force_implementations({ { "conv_prim", conv_impl } })); - - tolerance = 1e-5f; - execute(p); -} - -INSTANTIATE_TEST_SUITE_P(fusings_gpu, conv_fp32_multi_eltwise_2, ::testing::ValuesIn(std::vector{ - bc_test_params{ CASE_CONV_FP32_2, 2, 4 }, - bc_test_params{ CASE_CONV_FP32_3, 2, 4 }, - bc_test_params{ CASE_CONV_FP32_4, 2, 4 }, - - bc_test_params{ CASE_CONV_FP16_2, 2, 4 }, - bc_test_params{ CASE_CONV_FP16_3, 2, 4 }, - bc_test_params{ CASE_CONV_FP16_4, 2, 4 }, -})); - - -class conv_fp32_multi_eltwise_2_clamp : public ConvFusingTest {}; -TEST_P(conv_fp32_multi_eltwise_2_clamp, basic) { - if (engine.get_device_info().supports_immad) { - return; - } - - auto p = GetParam(); - create_topologies( - input_layout("input", get_input_layout(p)), - data("eltwise1_data", get_mem(get_output_layout(p))), - data("bias", get_mem(get_bias_layout(p))), - data("weights", get_mem(get_weights_layout(p))), - convolution("conv_prim", "input", { "weights" }, { "bias" }, p.groups, p.stride, p.pad, p.dilation), - eltwise("eltwise1", "conv_prim", "eltwise1_data", eltwise_mode::sum), - activation("activation", "eltwise1", activation_func::clamp, { 0.5f, 2.5f }), - eltwise("eltwise2", "activation", "conv_prim", eltwise_mode::prod), - reorder("reorder_bfyx", "eltwise2", p.default_format, data_types::f32) - ); - implementation_desc conv_impl = { format::b_fs_yx_fsv16, "" }; - bo_fused.set_option(build_option::force_implementations({ { "conv_prim", conv_impl } })); - - tolerance = 1e-5f; - execute(p); -} - -INSTANTIATE_TEST_SUITE_P(fusings_gpu, conv_fp32_multi_eltwise_2_clamp, ::testing::ValuesIn(std::vector{ - bc_test_params{ CASE_CONV_FP32_2, 2, 5 }, - bc_test_params{ CASE_CONV_FP32_3, 2, 5 }, - bc_test_params{ CASE_CONV_FP32_4, 2, 5 }, - - bc_test_params{ CASE_CONV_FP16_2, 2, 5 }, - 
bc_test_params{ CASE_CONV_FP16_3, 2, 5 }, - bc_test_params{ CASE_CONV_FP16_4, 2, 5 }, -})); - - -class conv_fp32_multi_eltwise_4_clamp : public ConvFusingTest {}; -TEST_P(conv_fp32_multi_eltwise_4_clamp, basic) { - if (engine.get_device_info().supports_immad) { - return; - } - - auto p = GetParam(); - create_topologies( - input_layout("input", get_input_layout(p)), - data("eltwise1_data", get_mem(get_output_layout(p))), - data("eltwise2_data", get_mem(get_output_layout(p))), - data("eltwise4_data", get_mem(get_output_layout(p))), - data("bias", get_mem(get_bias_layout(p))), - data("weights", get_mem(get_weights_layout(p))), - convolution("conv_prim", "input", { "weights" }, { "bias" }, p.groups, p.stride, p.pad, p.dilation), - eltwise("eltwise1_add", "conv_prim", "eltwise1_data", eltwise_mode::sum), - activation("activation", "eltwise1_add", activation_func::clamp, { 0.5f, 2.5f }), - eltwise("eltwise2_mul", "activation", "conv_prim", eltwise_mode::prod), - eltwise("eltwise3_div", "eltwise2_mul", "eltwise2_data", eltwise_mode::prod), - eltwise("eltwise4_add", "eltwise3_div", "eltwise4_data", eltwise_mode::sum), - reorder("reorder_bfyx", "eltwise4_add", p.default_format, data_types::f32) - ); - implementation_desc conv_impl = { format::b_fs_yx_fsv16, "" }; - bo_fused.set_option(build_option::force_implementations({ { "conv_prim", conv_impl } })); - - tolerance = 1e-5f; - execute(p); -} - -INSTANTIATE_TEST_SUITE_P(fusings_gpu, conv_fp32_multi_eltwise_4_clamp, ::testing::ValuesIn(std::vector{ - bc_test_params{ CASE_CONV_FP32_2, 2, 7 }, - bc_test_params{ CASE_CONV_FP32_3, 2, 7 }, - bc_test_params{ CASE_CONV_FP32_4, 2, 7 }, - - bc_test_params{ CASE_CONV_FP16_2, 2, 7 }, - bc_test_params{ CASE_CONV_FP16_3, 2, 7 }, - bc_test_params{ CASE_CONV_FP16_4, 2, 7 }, -})); - -class conv_fp32_eltwise_fusing_extend_ops : public ConvFusingTest {}; -TEST_P(conv_fp32_eltwise_fusing_extend_ops, pattern01_simple_sub) { - if (engine.get_device_info().supports_immad) { - return; - } - - auto 
p = GetParam(); - create_topologies( - input_layout("input", get_input_layout(p)), - data("eltwise_data1", get_mem(get_output_layout(p))), - data("eltwise_data2", get_mem(get_output_layout(p))), - data("eltwise_data4", get_mem(get_output_layout(p))), - data("bias", get_mem(get_bias_layout(p))), - data("weights", get_mem(get_weights_layout(p))), - convolution("conv_prim", "input", { "weights" }, { "bias" }, p.groups, p.stride, p.pad, p.dilation), - eltwise("eltwise1_sum", "conv_prim", "eltwise_data1", eltwise_mode::sum), - eltwise("eltwise2_sub", "conv_prim", "eltwise_data2", eltwise_mode::sub), - eltwise("eltwise3_prod", "eltwise1_sum", "eltwise2_sub", eltwise_mode::prod), - eltwise("eltwise4_sum", "eltwise3_prod", "eltwise_data4", eltwise_mode::sum), - concatenation("concat", { "eltwise4_sum", "eltwise4_sum" }, cldnn::concatenation::along_f), - reorder("reorder_bfyx", "concat", p.default_format, data_types::f32) - ); - implementation_desc conv_impl = { format::b_fs_yx_fsv16, "" }; - bo_fused.set_option(build_option::force_implementations({ { "conv_prim", conv_impl } })); - - tolerance = 1e-5f; - execute(p); -} - -TEST_P(conv_fp32_eltwise_fusing_extend_ops, pattern02_sub_scale) { - if (engine.get_device_info().supports_immad) { - return; - } - - auto p = GetParam(); - create_topologies( - input_layout("input", get_input_layout(p)), - data("eltwise_data1", get_mem(get_output_layout(p))), - data("eltwise_data2", get_mem(get_output_layout(p))), - data("scale_data", get_mem(get_per_channel_layout(p), 1.0f/p.kernel.count())), - data("bias", get_mem(get_bias_layout(p))), - data("weights", get_mem(get_weights_layout(p))), - convolution("conv_prim", "input", { "weights" }, { "bias" }, p.groups, p.stride, p.pad, p.dilation), - eltwise("eltwise1_sum", "conv_prim", "eltwise_data1", eltwise_mode::sum), - eltwise("eltwise2_sub", "conv_prim", "eltwise1_sum", eltwise_mode::sub), - eltwise("eltwise3_prod", "eltwise2_sub", "eltwise_data2", eltwise_mode::prod), - scale("scale", 
"eltwise3_prod", "scale_data"), - concatenation("concat", { "scale", "scale" }, cldnn::concatenation::along_f), - reorder("reorder_bfyx", "concat", p.default_format, data_types::f32) - ); - implementation_desc conv_impl = { format::b_fs_yx_fsv16, "" }; - bo_fused.set_option(build_option::force_implementations({ { "conv_prim", conv_impl } })); - - tolerance = 1e-5f; - execute(p); -} - -TEST_P(conv_fp32_eltwise_fusing_extend_ops, pattern03_sub_div) { - if (engine.get_device_info().supports_immad) { - return; - } - - auto p = GetParam(); - create_topologies( - input_layout("input", get_input_layout(p)), - data("eltwise_data1", get_mem(get_output_layout(p))), - data("eltwise_data2", get_mem(get_output_layout(p), 1.0f)), - data("eltwise_data3", get_mem(get_output_layout(p))), - data("eltwise_data4", get_mem(get_output_layout(p))), - data("bias", get_mem(get_bias_layout(p))), - data("weights", get_mem(get_weights_layout(p))), - convolution("conv_prim", "input", { "weights" }, { "bias" }, p.groups, p.stride, p.pad, p.dilation), - eltwise("eltwise1_sum", "conv_prim", "eltwise_data1", eltwise_mode::sum), - eltwise("eltwise2_div", "eltwise1_sum", "eltwise_data2", eltwise_mode::div), - eltwise("eltwise3_prod", "eltwise2_div", "eltwise_data3", eltwise_mode::prod), - eltwise("eltwise4_sum", "eltwise3_prod", "eltwise_data4", eltwise_mode::sum), - concatenation("concat", { "eltwise4_sum", "eltwise4_sum" }, cldnn::concatenation::along_f), - reorder("reorder_bfyx", "concat", p.default_format, data_types::f32) - ); - implementation_desc conv_impl = { format::b_fs_yx_fsv16, "" }; - bo_fused.set_option(build_option::force_implementations({ { "conv_prim", conv_impl } })); - - tolerance = 1e-5f; - execute(p); -} - -INSTANTIATE_TEST_SUITE_P(fusings_gpu, conv_fp32_eltwise_fusing_extend_ops, ::testing::ValuesIn(std::vector{ - bc_test_params{ CASE_CONV_FP32_2, 3, 7 }, - bc_test_params{ CASE_CONV_FP32_3, 3, 7 }, - bc_test_params{ CASE_CONV_FP32_4, 3, 7 }, - - bc_test_params{ 
CASE_CONV_FP16_2, 3, 7 }, - bc_test_params{ CASE_CONV_FP16_3, 3, 7 }, - bc_test_params{ CASE_CONV_FP16_4, 3, 7 }, -})); - -class conv_fp32_eltwise_fusing_2conv : public ConvFusingTest {}; -TEST_P(conv_fp32_eltwise_fusing_2conv, basic) { - if (engine.get_device_info().supports_immad) { - return; - } - - auto p = GetParam(); - create_topologies( - input_layout("input", get_input_layout(p)), - data("bias0", get_mem(get_bias_layout(p))), - data("weights0", get_mem(get_weights_layout(p))), - data("bias", get_mem(get_bias_layout(p))), - data("weights", get_mem(get_weights_layout(p))), - convolution("conv_prim0", "input", { "weights0" }, { "bias0" }, p.groups, p.stride, p.pad, p.dilation), - convolution("conv_prim", "input", { "weights" }, { "bias" }, p.groups, p.stride, p.pad, p.dilation), - eltwise("eltwise1", "conv_prim0", "conv_prim", eltwise_mode::sum), - eltwise("eltwise2", "conv_prim0", "conv_prim", eltwise_mode::sum), - eltwise("eltwise3", "eltwise1", "eltwise2", eltwise_mode::prod), - concatenation("concat", { "eltwise3", "eltwise3" }, cldnn::concatenation::along_f), - reorder("reorder_bfyx", "concat", p.default_format, data_types::f32) - ); - implementation_desc conv_impl = { format::b_fs_yx_fsv16, "" }; - bo_fused.set_option(build_option::force_implementations({ { "conv_prim0", conv_impl }, { "conv_prim", conv_impl } })); - - tolerance = 1e-5f; - execute(p); -} - -INSTANTIATE_TEST_SUITE_P(fusings_gpu, conv_fp32_eltwise_fusing_2conv, ::testing::ValuesIn(std::vector{ - bc_test_params{ CASE_CONV_FP32_2, 4, 7 }, - bc_test_params{ CASE_CONV_FP32_3, 4, 7 }, - bc_test_params{ CASE_CONV_FP32_4, 4, 7 }, - - bc_test_params{ CASE_CONV_FP16_2, 4, 7 }, - bc_test_params{ CASE_CONV_FP16_3, 4, 7 }, - bc_test_params{ CASE_CONV_FP16_4, 4, 7 }, -})); - - -class conv_fp32_multi_eltwise_3_fusing : public ConvFusingTest {}; -TEST_P(conv_fp32_multi_eltwise_3_fusing, basic) { - if (engine.get_device_info().supports_immad) { - return; - } - - auto p = GetParam(); - create_topologies( - 
input_layout("input", get_input_layout(p)), - data("eltwise_data1", get_mem(get_output_layout(p))), - data("eltwise_data2", get_mem(get_output_layout(p))), - data("bias", get_mem(get_bias_layout(p))), - data("weights", get_mem(get_weights_layout(p))), - convolution("conv_prim", "input", { "weights" }, { "bias" }, p.groups, p.stride, p.pad, p.dilation), - eltwise("eltwise1", "conv_prim", "eltwise_data1", eltwise_mode::sum), - eltwise("eltwise2", "conv_prim", "eltwise_data2", eltwise_mode::sum), - eltwise("eltwise3", "eltwise1", "eltwise2", eltwise_mode::prod), - reorder("reorder_bfyx", "eltwise3", p.default_format, data_types::f32) - ); - implementation_desc conv_impl = { format::b_fs_yx_fsv16, "" }; - bo_fused.set_option(build_option::force_implementations({ { "conv_prim", conv_impl } })); - - tolerance = 1e-5f; - execute(p); -} - -INSTANTIATE_TEST_SUITE_P(fusings_gpu, conv_fp32_multi_eltwise_3_fusing, ::testing::ValuesIn(std::vector{ - bc_test_params{ CASE_CONV_FP32_2, 2, 5 }, - bc_test_params{ CASE_CONV_FP32_3, 2, 5 }, - bc_test_params{ CASE_CONV_FP32_4, 2, 5 }, - - bc_test_params{ CASE_CONV_FP16_2, 2, 5 }, - bc_test_params{ CASE_CONV_FP16_3, 2, 5 }, - bc_test_params{ CASE_CONV_FP16_4, 2, 5 }, -})); - - - -class conv_fp32_multi_eltwise_quantization : public ConvFusingTest {}; -TEST_P(conv_fp32_multi_eltwise_quantization, basic) { - auto p = GetParam(); - create_topologies( - input_layout("input", get_input_layout(p)), - data("weights", get_mem(get_weights_layout(p))), - data("bias", get_mem(get_bias_layout(p))), - data("in_lo", get_mem(get_per_channel_layout(p), min_random, 0)), - data("in_hi", get_mem(get_per_channel_layout(p), 1, max_random)), - data("out_lo", get_mem(get_single_element_layout(p), -127)), - data("out_hi", get_mem(get_single_element_layout(p), 127)), - data("eltwise_data1", get_mem(get_output_layout(p))), - convolution("conv_prim", "input", { "weights" }, { "bias" }, p.groups, p.stride, p.pad, p.dilation), - quantize("quantize", "conv_prim", 
"in_lo", "in_hi", "out_lo", "out_hi", 255, data_types::i8), - eltwise("eltwise1", "conv_prim", "eltwise_data1", eltwise_mode::sum), - eltwise("eltwise2", "eltwise1", "quantize", eltwise_mode::prod), - reorder("reorder_bfyx", "eltwise2", p.default_format, data_types::f32) - ); - - tolerance = 1.f; - execute(p); -} - -INSTANTIATE_TEST_SUITE_P(fusings_gpu, conv_fp32_multi_eltwise_quantization, ::testing::ValuesIn(std::vector{ -// bc_test_params{ CASE_CONV_FP32_2, 4, 5 }, - bc_test_params{ CASE_CONV_FP32_4, 4, 5 }, - - bc_test_params{ CASE_CONV_FP16_2, 4, 5 }, - bc_test_params{ CASE_CONV_FP16_3, 4, 5 }, - bc_test_params{ CASE_CONV_FP16_4, 4, 5 }, -})); - - -class conv_fp32_multi_eltwise_concat : public ConvFusingTest {}; -TEST_P(conv_fp32_multi_eltwise_concat, basic) { - auto p = GetParam(); - create_topologies( - input_layout("input", get_input_layout(p)), - data("eltwise_data1", get_mem(get_output_layout(p))), - data("eltwise_data2", get_mem(get_output_layout(p))), - data("bias", get_mem(get_bias_layout(p))), - data("weights", get_mem(get_weights_layout(p))), - convolution("conv_prim", "input", { "weights" }, { "bias" }, p.groups, p.stride, p.pad, p.dilation), - eltwise("eltwise1", "conv_prim", "eltwise_data1", eltwise_mode::sum), - eltwise("eltwise2", "conv_prim", "eltwise_data2", eltwise_mode::sum), - concatenation("concat", - { "eltwise1", "eltwise2" }, - concatenation::concatenation_axis::along_f, - data_types::i8, - "", - padding{ { 0, 0, 0, 0 }, 0 }), - reorder("reorder_bfyx", "concat", p.default_format, data_types::f32) - ); - implementation_desc conv_impl = { format::b_fs_yx_fsv16, "" }; - bo_fused.set_option(build_option::force_implementations({ { "conv_prim", conv_impl } })); - - tolerance = 1e-5f; - execute(p); -} - -INSTANTIATE_TEST_SUITE_P(fusings_gpu, conv_fp32_multi_eltwise_concat, ::testing::ValuesIn(std::vector{ - bc_test_params{ CASE_CONV_FP32_2, 5, 5 }, - bc_test_params{ CASE_CONV_FP32_3, 5, 5 }, - bc_test_params{ CASE_CONV_FP32_4, 5, 5 }, - - 
bc_test_params{ CASE_CONV_FP16_2, 5, 5 }, - bc_test_params{ CASE_CONV_FP16_3, 5, 5 }, - bc_test_params{ CASE_CONV_FP16_4, 5, 5 }, -})); - -class conv_fp32_eltwise_b_fs_zyx_fsv16 : public ConvFusingTest {}; - -TEST_P(conv_fp32_eltwise_b_fs_zyx_fsv16, vector_ops) { - auto p = GetParam(); - create_topologies( - input_layout("input", get_input_layout(p)), - data("weights", get_mem(get_weights_layout(p))), - data("bias", get_mem(get_bias_layout(p))), - data("eltwise_data", get_mem(get_output_layout(p))), - convolution("conv_prim", "input", { "weights" }, { "bias" }, p.groups, p.stride, p.pad, p.dilation), - eltwise("eltwise", "conv_prim", "eltwise_data", eltwise_mode::sum), - reorder("reorder_bfyx", "eltwise", p.default_format, data_types::f32) - ); - - implementation_desc conv_impl = { format::b_fs_zyx_fsv16, "" }; - bo_fused.set_option(build_option::force_implementations({ { "conv_prim", conv_impl } })); - - tolerance = 1e-5f; - execute(p); -} - -class conv_fp32_swish : public ConvFusingTest {}; -TEST_P(conv_fp32_swish, basic) { - auto p = GetParam(); - create_topologies( - input_layout("input", get_input_layout(p)), - data("weights", get_mem(get_weights_layout(p))), - data("bias", get_mem(get_bias_layout(p))), - convolution("conv_prim", "input", { "weights" }, { "bias" }, p.groups, p.stride, p.pad, p.dilation), - activation("sigmoid", "conv_prim", activation_func::logistic), - eltwise("mul", { "conv_prim", "sigmoid" }, eltwise_mode::prod), - reorder("reorder_bfyx", "mul", p.default_format, data_types::f32) - ); - - tolerance = 1e-5f; - execute(p); -} - -INSTANTIATE_TEST_SUITE_P(fusings_gpu, conv_fp32_swish, ::testing::ValuesIn(std::vector{ - // bc_test_params{ CASE_CONV_FP32_1, 2, 4 }, - bc_test_params{ CASE_CONV_FP32_2, 2, 4 }, - bc_test_params{ CASE_CONV_FP32_3, 2, 4 }, - bc_test_params{ CASE_CONV_FP32_4, 2, 4 }, - - // bc_test_params{ CASE_CONV_FP32_1, 2, 4 }, - bc_test_params{ CASE_CONV_FP16_2, 2, 4 }, - bc_test_params{ CASE_CONV_FP16_3, 2, 4 }, - bc_test_params{ 
CASE_CONV_FP16_4, 2, 4 }, -})); - -TEST_P(conv_fp32_eltwise_b_fs_zyx_fsv16, splitted_vector_ops) { - auto p = GetParam(); - - std::vector weights_idx; - for (size_t w = 0; w < p.groups; w++) { - create_topologies(data("weights" + std::to_string(w), get_mem(get_weights_layout(p, p.groups)))); - weights_idx.push_back(("weights" + std::to_string(w))); - } - - create_topologies( - input_layout("input", get_input_layout(p)), - data("eltwise_data", get_mem(get_output_layout(p))), - convolution("conv_prim", "input", weights_idx, {}, 1, p.stride, p.pad, p.dilation), - eltwise("eltwise", "conv_prim", "eltwise_data", eltwise_mode::sum), - reorder("reorder_bfyx", "eltwise", p.default_format, data_types::f32) - ); - - implementation_desc conv_impl = { format::b_fs_zyx_fsv16, "" }; - bo_fused.set_option(build_option::force_implementations({ { "conv_prim", conv_impl } })); - - tolerance = 1e-5f; - // commented because split mode is disabled - // execute(p); -} - -INSTANTIATE_TEST_SUITE_P(fusings_gpu, conv_fp32_eltwise_b_fs_zyx_fsv16, ::testing::ValuesIn(std::vector{ - bc_test_params{ CASE_CONV_FP32_6, 2, 3 }, - bc_test_params{ CASE_CONV_FP32_7, 2, 3 }, - bc_test_params{ CASE_CONV_FP32_8, 2, 3 }, - bc_test_params{ CASE_CONV_FP32_9, 2, 3 }, - bc_test_params{ CASE_CONV_FP32_11, 2, 3 }, - bc_test_params{ CASE_CONV_FP32_12, 2, 3 }, - // bc_test_params{ CASE_CONV_FP32_13, 2, 3 }, - leads to mvn_scale_activation_quantize_i8_eltwise_fp32_quantize_i8.basic/11 test failure - - bc_test_params{ CASE_CONV_FP16_6, 2, 3 }, - bc_test_params{ CASE_CONV_FP16_7, 2, 3 }, - bc_test_params{ CASE_CONV_FP16_8, 2, 3 }, - bc_test_params{ CASE_CONV_FP16_9, 2, 3 }, - bc_test_params{ CASE_CONV_FP16_11, 2, 3 }, - bc_test_params{ CASE_CONV_FP16_12, 2, 3 }, -})); - -class conv_fp32_quantize_u8_first_conv : public ConvFusingTest {}; -TEST_P(conv_fp32_quantize_u8_first_conv, basic) { - auto p = GetParam(); - create_topologies( - input_layout("input", get_input_layout(p)), - data("weights", 
get_mem(get_weights_layout(p))), - data("bias", get_mem(get_bias_layout(p))), - data("in_lo", get_mem(get_per_channel_layout(p), min_random, 0)), - data("in_hi", get_mem(get_per_channel_layout(p), 1, max_random)), - data("out_lo", get_mem(get_single_element_layout(p), 0)), - data("out_hi", get_mem(get_single_element_layout(p), 255)), - reorder("reordered_input", "input", format::b_fs_yx_fsv16, p.data_type), - convolution("conv_prim", "reordered_input", { "weights" }, { "bias" }, p.groups, p.stride, p.pad, p.dilation), - quantize("quantize", "conv_prim", "in_lo", "in_hi", "out_lo", "out_hi", 256, data_types::u8), - reorder("reorder_bfyx", "quantize", p.default_format, data_types::f32) - ); - - tolerance = 1.0f; - execute(p); -} - -INSTANTIATE_TEST_SUITE_P(fusings_gpu, conv_fp32_quantize_u8_first_conv, ::testing::ValuesIn(std::vector{ - bc_test_params{ CASE_CONV_FP32_14, 2, 3 }, -})); - -class conv_fp32_quantize_u8 : public ConvFusingTest {}; -TEST_P(conv_fp32_quantize_u8, basic) { - auto p = GetParam(); - create_topologies( - input_layout("input", get_input_layout(p)), - data("weights", get_mem(get_weights_layout(p))), - data("bias", get_mem(get_bias_layout(p))), - data("in_lo", get_mem(get_per_channel_layout(p), min_random, 0)), - data("in_hi", get_mem(get_per_channel_layout(p), 1, max_random)), - data("out_lo", get_mem(get_single_element_layout(p), 0)), - data("out_hi", get_mem(get_single_element_layout(p), 255)), - convolution("conv_prim", "input", { "weights" }, { "bias" }, p.groups, p.stride, p.pad, p.dilation), - quantize("quantize", "conv_prim", "in_lo", "in_hi", "out_lo", "out_hi", 256, data_types::u8), - reorder("reorder_bfyx", "quantize", p.default_format, data_types::f32) - ); - - tolerance = 1.0f; - execute(p); -} - -INSTANTIATE_TEST_SUITE_P(fusings_gpu, conv_fp32_quantize_u8, ::testing::ValuesIn(std::vector{ - // For now only b_fs_yx_fsv16 supports this case - bc_test_params{ CASE_CONV_FP32_2, 2, 3 }, - bc_test_params{ CASE_CONV_FP32_3, 2, 3 }, - - 
bc_test_params{ CASE_CONV_FP16_2, 2, 3 }, - bc_test_params{ CASE_CONV_FP16_3, 2, 3 }, -})); - -class conv_fp32_scale_quantize_i8 : public ConvFusingTest {}; -TEST_P(conv_fp32_scale_quantize_i8, basic) { - auto p = GetParam(); - create_topologies( - input_layout("input", get_input_layout(p)), - data("weights", get_mem(get_weights_layout(p))), - data("bias", get_mem(get_bias_layout(p))), - data("in_lo", get_mem(get_per_channel_layout(p), min_random, 0)), - data("in_hi", get_mem(get_per_channel_layout(p), 1, max_random)), - data("out_lo", get_mem(get_single_element_layout(p), -127)), - data("out_hi", get_mem(get_single_element_layout(p), 127)), - data("scale_data", get_mem(get_per_channel_layout(p), 1.0f/p.kernel.count()/255)), - convolution("conv_prim", "input", { "weights" }, { "bias" }, p.groups, p.stride, p.pad, p.dilation), - scale("scale", "conv_prim", "scale_data"), - quantize("quantize", "scale", "in_lo", "in_hi", "out_lo", "out_hi", 255, data_types::i8), - reorder("reorder_bfyx", "quantize", p.default_format, data_types::f32) - ); - // Output elements are in range [-127, 127] - // 1.0f difference is allowed, since quantize can return different values in ref and scale_shift kernels - // due to big error of division (in ref kernel). 
- tolerance = 1.0f; - execute(p); -} - -INSTANTIATE_TEST_SUITE_P(fusings_gpu, conv_fp32_scale_quantize_i8, ::testing::ValuesIn(std::vector{ - // For now only b_fs_yx_fsv16 supports this case - bc_test_params{ CASE_CONV_FP32_2, 2, 4 }, - bc_test_params{ CASE_CONV_FP32_3, 2, 4 }, - - bc_test_params{ CASE_CONV_FP16_2, 2, 4 }, - bc_test_params{ CASE_CONV_FP16_3, 2, 4 }, -})); - -class conv_fp32_scale_activation_quantize_i8 : public ConvFusingTest {}; -TEST_P(conv_fp32_scale_activation_quantize_i8, basic) { - auto p = GetParam(); - create_topologies( - input_layout("input", get_input_layout(p)), - data("weights", get_mem(get_weights_layout(p))), - data("bias", get_mem(get_bias_layout(p))), - data("in_lo", get_mem(get_per_channel_layout(p), min_random, 0)), - data("in_hi", get_mem(get_per_channel_layout(p), 1, max_random)), - data("out_lo", get_mem(get_single_element_layout(p), -127)), - data("out_hi", get_mem(get_single_element_layout(p), 127)), - data("scale_data", get_mem(get_per_channel_layout(p), 1.0f/p.kernel.count()/255)), - convolution("conv_prim", "input", { "weights" }, { "bias" }, p.groups, p.stride, p.pad, p.dilation), - scale("scale", "conv_prim", "scale_data"), - activation("activation_scale", "scale", activation_func::exp), - quantize("quantize", "activation_scale", "in_lo", "in_hi", "out_lo", "out_hi", 255, data_types::i8), - reorder("reorder_bfyx", "quantize", p.default_format, data_types::f32) - ); - - tolerance = 1.0f; - execute(p); -} - -INSTANTIATE_TEST_SUITE_P(fusings_gpu, conv_fp32_scale_activation_quantize_i8, ::testing::ValuesIn(std::vector{ - // For now only b_fs_yx_fsv16 supports this case - bc_test_params{ CASE_CONV_FP32_2, 2, 5 }, - bc_test_params{ CASE_CONV_FP32_3, 2, 5 }, - - bc_test_params{ CASE_CONV_FP16_2, 2, 5 }, - bc_test_params{ CASE_CONV_FP16_3, 2, 5 }, -})); - -class conv_fp32_scale_activation_quantize_u8_eltwise_fp32 : public ConvFusingTest {}; -TEST_P(conv_fp32_scale_activation_quantize_u8_eltwise_fp32, basic) { - auto p = 
GetParam(); - create_topologies( - input_layout("input", get_input_layout(p)), - data("weights", get_mem(get_weights_layout(p))), - data("bias", get_mem(get_bias_layout(p))), - data("in_lo", get_mem(get_per_channel_layout(p), 0)), - data("in_hi", get_mem(get_per_channel_layout(p), 1, max_random)), - data("out_lo", get_mem(get_single_element_layout(p), 0)), - data("out_hi", get_mem(get_single_element_layout(p), 255)), - data("scale_data", get_mem(get_per_channel_layout(p), 1.0f/p.kernel.count()/255)), - data("eltwise_data", get_mem(get_output_layout(p))), - convolution("conv_prim", "input", { "weights" }, { "bias" }, p.groups, p.stride, p.pad, p.dilation), - scale("scale", "conv_prim", "scale_data"), - activation("activation_scale", "scale", activation_func::exp), - quantize("quantize", "activation_scale", "in_lo", "in_hi", "out_lo", "out_hi", 256, data_types::u8), - eltwise("sum", { "quantize", "eltwise_data" }, eltwise_mode::sum, p.default_type), - reorder("reorder_bfyx", "sum", p.default_format, data_types::f32) - ); - - tolerance = 1.0f; - execute(p); -} - -INSTANTIATE_TEST_SUITE_P(fusings_gpu, conv_fp32_scale_activation_quantize_u8_eltwise_fp32, ::testing::ValuesIn(std::vector{ - // For now only b_fs_yx_fsv16 supports this case - bc_test_params{ CASE_CONV_FP32_2, 2, 6 }, - bc_test_params{ CASE_CONV_FP32_3, 2, 6 }, - - bc_test_params{ CASE_CONV_FP16_2, 2, 6 }, - bc_test_params{ CASE_CONV_FP16_3, 2, 6 }, -})); - -class conv_fp32_scale_activation_quantize_i8_activation : public ConvFusingTest {}; -TEST_P(conv_fp32_scale_activation_quantize_i8_activation, basic) { - auto p = GetParam(); - create_topologies( - input_layout("input", get_input_layout(p)), - data("weights", get_mem(get_weights_layout(p))), - data("bias", get_mem(get_bias_layout(p))), - data("in_lo", get_mem(get_per_channel_layout(p), min_random, 0)), - data("in_hi", get_mem(get_per_channel_layout(p), 1, max_random)), - data("out_lo", get_mem(get_single_element_layout(p), -127)), - data("out_hi", 
get_mem(get_single_element_layout(p), 127)), - data("scale_data", get_mem(get_per_channel_layout(p), 1.0f/p.kernel.count()/255)), - data("slope_data", get_mem(get_per_channel_layout(p))), - convolution("conv_prim", "input", { "weights" }, { "bias" }, p.groups, p.stride, p.pad, p.dilation), - scale("scale", "conv_prim", "scale_data"), - activation("activation_scale", "scale", "slope_data", activation_func::relu_negative_slope), - quantize("quantize", "activation_scale", "in_lo", "in_hi", "out_lo", "out_hi", 255, data_types::i8), - activation("activation_quantize", "quantize", activation_func::relu), - reorder("reorder_bfyx", "activation_quantize", p.default_format, data_types::f32) - ); - - tolerance = 1.0f; - execute(p); -} - -INSTANTIATE_TEST_SUITE_P(fusings_gpu, conv_fp32_scale_activation_quantize_i8_activation, ::testing::ValuesIn(std::vector{ - bc_test_params{ CASE_CONV_FP32_2, 2, 6 }, - bc_test_params{ CASE_CONV_FP32_3, 2, 6 }, - - bc_test_params{ CASE_CONV_FP16_2, 2, 6 }, - bc_test_params{ CASE_CONV_FP16_3, 2, 6 }, -})); - - -class conv_fp32_scale_activation_quantize_i8_eltwise_fp32_quantize_i8 : public ConvFusingTest {}; -TEST_P(conv_fp32_scale_activation_quantize_i8_eltwise_fp32_quantize_i8, basic) { - auto p = GetParam(); - create_topologies( - input_layout("input", get_input_layout(p)), - data("weights", get_mem(get_weights_layout(p))), - data("bias", get_mem(get_bias_layout(p))), - data("in_lo", get_mem(get_per_channel_layout(p), min_random, 0)), - data("in_lo1", get_mem(get_per_channel_layout(p), min_random, 0)), - data("in_hi", get_mem(get_per_channel_layout(p), 1, max_random)), - data("in_hi1", get_mem(get_per_channel_layout(p), 1, max_random)), - data("out_lo", get_mem(get_single_element_layout(p), -127)), - data("out_lo1", get_mem(get_single_element_layout(p), -127)), - data("out_hi", get_mem(get_single_element_layout(p), 127)), - data("out_hi1", get_mem(get_single_element_layout(p), 127)), - data("scale_data", get_mem(get_per_channel_layout(p), 
1.0f/p.kernel.count()/255)), - data("eltwise_data", get_mem(layout{ data_types::i8, p.input_format, p.out_shape })), - convolution("conv_prim", "input", { "weights" }, { "bias" }, p.groups, p.stride, p.pad, p.dilation), - scale("scale", "conv_prim", "scale_data"), - activation("activation_scale", "scale", activation_func::exp), - quantize("quantize", "activation_scale", "in_lo", "in_hi", "out_lo", "out_hi", 255, data_types::i8), - eltwise("sum", { "quantize", "eltwise_data" }, eltwise_mode::sum, data_types::f32), - quantize("quantize_1", "sum", "in_lo1", "in_hi1", "out_lo1", "out_hi1", 255, data_types::i8), - reorder("reorder_bfyx", "quantize_1", p.default_format, data_types::f32) - ); - - tolerance = 1.f; - execute(p); -} - -INSTANTIATE_TEST_SUITE_P(fusings_gpu, conv_fp32_scale_activation_quantize_i8_eltwise_fp32_quantize_i8, ::testing::ValuesIn(std::vector{ - bc_test_params{ CASE_CONV_FP32_2, 2, 7 }, - bc_test_params{ CASE_CONV_FP32_3, 2, 7 }, -})); - -class conv_fp32_activation_eltwise_in_u8_fp32 : public WeightsPrimitiveFusingTest {}; -TEST_P(conv_fp32_activation_eltwise_in_u8_fp32, basic) { - auto p = GetParam(); - create_topologies( - input_layout("input", get_input_layout(p)), - data("weights", get_mem(get_weights_layout(p))), - data("bias", get_mem(get_bias_layout(p))), - data("eltwise_data", get_mem(layout{ data_types::i8, p.input_format, p.out_shape })), - convolution("conv_prim", "input", { "weights" }, { "bias" }, p.groups, p.stride, p.pad, p.dilation), - activation("activation", "conv_prim", activation_func::relu_negative_slope), - eltwise("sum", { "activation", "eltwise_data" }, eltwise_mode::sum, data_types::f32), - reorder("reorder_bfyx", "sum", p.default_format, data_types::f32) - ); - - tolerance = 1.f; - execute(p); -} - -INSTANTIATE_TEST_SUITE_P(fusings_gpu, conv_fp32_activation_eltwise_in_u8_fp32, ::testing::ValuesIn(std::vector{ - // bc_test_params{ CASE_CONV_FP32_1, 2, 4 }, - eltwise fusing not supported - bc_test_params{ CASE_CONV_FP32_2, 2, 
4 }, - bc_test_params{ CASE_CONV_FP32_3, 2, 4 }, - bc_test_params{ CASE_CONV_FP32_4, 2, 4 }, - // bc_test_params{ CASE_CONV_FP32_5, 2, 4 }, - eltwise fusing not supported - bc_test_params{ CASE_CONV_FP32_6, 2, 4 }, - bc_test_params{ CASE_CONV_FP32_7, 2, 4 }, - // bc_test_params{ CASE_CONV_FP32_8, 2, 4 }, - unknown bug - bc_test_params{ CASE_CONV_FP32_9, 2, 4 }, - bc_test_params{ CASE_CONV_FP32_10, 2, 4 }, -})); - -class conv_fp32_activation_eltwise_diff_sizes : public ConvEltwTest {}; -TEST_P(conv_fp32_activation_eltwise_diff_sizes, basic) { - auto p = GetParam(); - create_topologies( - input_layout("input", get_input_layout(p)), - data("weights", get_mem(get_weights_layout(p))), - data("bias", get_mem(get_bias_layout(p))), - data("eltwise_data", get_mem(layout{ p.data_type, p.input_format, p.eltw_shape })), - convolution("conv_prim", "input", { "weights" }, { "bias" }, p.groups, p.stride, p.pad, p.dilation), - activation("activation", "conv_prim", activation_func::relu_negative_slope), - eltwise("sum", { "activation", "eltwise_data" }, eltwise_mode::sum, data_types::f32), - reorder("reorder_bfyx", "sum", p.default_format, data_types::f32) - ); - - tolerance = 1e-5f; - execute(p); -} - -INSTANTIATE_TEST_SUITE_P(fusings_gpu, conv_fp32_activation_eltwise_diff_sizes, ::testing::ValuesIn(std::vector{ - conv_eltw_test_params{ CASE_CONV_ELTW_FP32_1, 2, 4 }, - conv_eltw_test_params{ CASE_CONV_ELTW_FP32_2, 2, 4 }, - conv_eltw_test_params{ CASE_CONV_ELTW_FP32_3, 2, 4 }, - conv_eltw_test_params{ CASE_CONV_ELTW_FP32_4, 2, 4 }, - conv_eltw_test_params{ CASE_CONV_ELTW_FP32_5, 2, 4 }, - conv_eltw_test_params{ CASE_CONV_ELTW_FP32_6, 2, 4 }, - conv_eltw_test_params{ CASE_CONV_ELTW_FP32_7, 3, 4 }, - conv_eltw_test_params{ CASE_CONV_ELTW_FP32_8, 3, 4 }, -})); - -class conv_scale_activation_eltwise_fp32_quantize_i8 : public ConvEltwTest {}; -TEST_P(conv_scale_activation_eltwise_fp32_quantize_i8, basic) { - auto p = GetParam(); - create_topologies( - input_layout("input", 
get_input_layout(p)), - data("weights", get_mem(get_weights_layout(p))), - data("bias", get_mem(get_bias_layout(p))), - convolution("conv", "input", { "weights" }, { "bias" }, p.groups, p.stride, p.pad, p.dilation), - data("scale_data", get_mem(get_per_channel_layout(p))), - scale("scale", "conv", "scale_data"), - activation("activation", "scale", activation_func::hyperbolic_tan), - data("eltwise_data", get_mem(layout{ p.data_type, p.input_format, p.eltw_shape })), - eltwise("eltw", { "activation", "eltwise_data" }, eltwise_mode::sum, data_types::f32), - data("in_low", get_mem(get_per_channel_layout(p), min_random, 0)), - data("in_high", get_mem(get_per_channel_layout(p), 1, max_random)), - data("out_low", get_mem(get_single_element_layout(p), -127, 127)), - data("out_high", get_mem(get_single_element_layout(p), -127, 127)), - quantize("quant", "eltw", "in_low", "in_high", "out_low", "out_high", 255, data_types::i8), - reorder("reorder_bfyx", "quant", p.default_format, data_types::f32) - ); - - tolerance = 1.f; - execute(p); -} - -INSTANTIATE_TEST_SUITE_P(fusings_gpu, conv_scale_activation_eltwise_fp32_quantize_i8, ::testing::ValuesIn(std::vector{ - conv_eltw_test_params{ CASE_CONV_ELTW_FP32_1, 2, 6 }, - conv_eltw_test_params{ CASE_CONV_ELTW_FP32_2, 2, 6 }, - conv_eltw_test_params{ CASE_CONV_ELTW_FP32_3, 2, 6 }, - conv_eltw_test_params{ CASE_CONV_ELTW_FP32_4, 2, 6 }, - conv_eltw_test_params{ CASE_CONV_ELTW_FP32_5, 3, 6 }, - conv_eltw_test_params{ CASE_CONV_ELTW_FP32_6, 3, 6 }, - conv_eltw_test_params{ CASE_CONV_ELTW_FP32_7, 3, 6 }, - conv_eltw_test_params{ CASE_CONV_ELTW_FP32_8, 3, 6 }, -})); - -/* ----------------------------------------------------------------------------------------------------- */ -/* -------------------------------------- binary convolution cases ------------------------------------- */ -/* ----------------------------------------------------------------------------------------------------- */ - -class conv_bin_activation : public 
ConvFusingTest {}; -TEST_P(conv_bin_activation, basic) { - auto p = GetParam(); - create_topologies( - input_layout("input", get_input_layout(p)), - data("weights", get_mem(get_weights_layout(p), -127, 127)), - binary_convolution("bin_conv_prim", "input", { "weights" }, p.stride, p.pad, p.dilation, p.out_shape, p.groups), - activation("activation", "bin_conv_prim", activation_func::relu), - reorder("reorder_bfyx", "activation", p.default_format, data_types::f32) - ); - - tolerance = 1e-5f; - execute(p); -} - -INSTANTIATE_TEST_SUITE_P(fusings_gpu, conv_bin_activation, ::testing::ValuesIn(std::vector{ - bc_test_params{ CASE_BIN_CONV1, 2, 3 }, -})); - -class conv_bin_scale_activation : public ConvFusingTest {}; -TEST_P(conv_bin_scale_activation, basic) { - auto p = GetParam(); - create_topologies( - input_layout("input", get_input_layout(p)), - data("weights", get_mem(get_weights_layout(p), -127, 127)), - data("scale_data", get_mem(get_per_channel_layout(p), 1.0f/p.kernel.count())), - binary_convolution("bin_conv_prim", "input", { "weights" }, p.stride, p.pad, p.dilation, p.out_shape, p.groups), - scale("scale", "bin_conv_prim", "scale_data"), - activation("activation", "scale", activation_func::relu), - reorder("reorder_bfyx", "activation", p.default_format, data_types::f32) - ); - - tolerance = 1e-5f; - execute(p); -} - -INSTANTIATE_TEST_SUITE_P(fusings_gpu, conv_bin_scale_activation, ::testing::ValuesIn(std::vector{ - bc_test_params{ CASE_BIN_CONV1, 2, 4 }, - bc_test_params{ CASE_BIN_CONV2, 2, 4 }, -})); - -class conv_bin_quantize_bin : public ConvFusingTest {}; -TEST_P(conv_bin_quantize_bin, channel_wise_quantize) { - auto p = GetParam(); - auto in_thresh = get_mem(get_per_channel_layout(p), min_random, max_random); - create_topologies( - input_layout("input", get_input_layout(p)), - data("weights", get_mem(get_weights_layout(p), -127, 127)), - data("in_lo", in_thresh), - data("in_hi", in_thresh), - data("out_lo", get_mem(get_per_channel_layout(p), -1)), - 
data("out_hi", get_mem(get_per_channel_layout(p), 1)), - binary_convolution("bin_conv_prim", "input", { "weights" }, p.stride, p.pad, p.dilation, p.out_shape, p.groups), - quantize("quantize_data", "bin_conv_prim", "in_lo", "in_hi", "out_lo", "out_hi", 2, data_types::bin), - reorder("reorder_bfyx", "quantize_data", p.default_format, data_types::f32) - ); - - tolerance = 1e-5f; - execute(p); -} - -TEST_P(conv_bin_quantize_bin, blob_wise_quantize) { - auto p = GetParam(); - auto in_thresh = get_mem(get_single_element_layout(p), min_random, max_random); - create_topologies( - input_layout("input", get_input_layout(p)), - data("weights", get_mem(get_weights_layout(p), -127, 127)), - data("in_lo", in_thresh), - data("in_hi", in_thresh), - data("out_lo", get_mem(get_single_element_layout(p), -1)), - data("out_hi", get_mem(get_single_element_layout(p), 1)), - binary_convolution("bin_conv_prim", "input", { "weights" }, p.stride, p.pad, p.dilation, p.out_shape, p.groups), - quantize("quantize_data", "bin_conv_prim", "in_lo", "in_hi", "out_lo", "out_hi", 2, data_types::bin), - reorder("reorder_bfyx", "quantize_data", p.default_format, data_types::f32) - ); - - tolerance = 1e-5f; - execute(p); -} - -INSTANTIATE_TEST_SUITE_P(fusings_gpu, conv_bin_quantize_bin, ::testing::ValuesIn(std::vector{ - bc_test_params{ CASE_BIN_CONV1, 2, 3 }, - bc_test_params{ CASE_BIN_CONV2, 2, 3 }, -})); - -class conv_bin_scale_conv_dw : public ConvFusingTest {}; -TEST_P(conv_bin_scale_conv_dw, dw_kernel_3x3_stride2) { - auto p = GetParam(); - auto dw_tensor = cldnn::tensor(group(p.out_shape.feature[0]), batch(1), feature(1), spatial(3, 3)); - auto dw_weights_layout = layout{ p.default_type, format::goiyx, dw_tensor }; - - auto dw_stride = tensor{ 1, 1, 2, 2 }; - create_topologies( - input_layout("input", get_input_layout(p)), - data("weights", get_mem(get_weights_layout(p), -127, 127)), - data("weights_dw", get_mem(dw_weights_layout, -127, 127)), - data("scale_data", 
get_mem(get_per_channel_layout(p), 1e-1f)), - binary_convolution("bin_conv_prim", "input", { "weights" }, p.stride, p.pad, p.dilation, p.out_shape, p.groups), - scale("scale", "bin_conv_prim", "scale_data"), - convolution("conv_dw", "scale", { "weights_dw" }, p.out_shape.feature[0], dw_stride, p.pad, p.dilation), - reorder("reorder_bfyx", "conv_dw", p.default_format, data_types::f32) - ); - - tolerance = 1e-5f; - execute(p); -} - -TEST_P(conv_bin_scale_conv_dw, dw_kernel_3x3_stride1) { - auto p = GetParam(); - auto dw_tensor = cldnn::tensor(group(p.out_shape.feature[0]), batch(1), feature(1), spatial(3, 3)); - auto dw_weights_layout = layout{ p.default_type, format::goiyx, dw_tensor }; - - auto dw_stride = tensor{ 1, 1, 1, 1 }; - create_topologies( - input_layout("input", get_input_layout(p)), - data("weights", get_mem(get_weights_layout(p), -127, 127)), - data("weights_dw", get_mem(dw_weights_layout, -127, 127)), - data("scale_data", get_mem(get_per_channel_layout(p), 1e-1f)), - binary_convolution("bin_conv_prim", "input", { "weights" }, p.stride, p.pad, p.dilation, p.out_shape, p.groups), - scale("scale", "bin_conv_prim", "scale_data"), - convolution("conv_dw", "scale", { "weights_dw" }, p.out_shape.feature[0], dw_stride, p.pad, p.dilation), - reorder("reorder_bfyx", "conv_dw", p.default_format, data_types::f32) - ); - - tolerance = 1e-5f; - execute(p); -} - -INSTANTIATE_TEST_SUITE_P(fusings_gpu, conv_bin_scale_conv_dw, ::testing::ValuesIn(std::vector{ - bc_test_params{ CASE_BIN_CONV2, 3, 4 }, - bc_test_params{ CASE_BIN_CONV3, 3, 4 }, -})); - -class conv_bin_scale_conv_dw_prelu : public ConvFusingTest {}; -TEST_P(conv_bin_scale_conv_dw_prelu, dw_kernel_3x3_stride2) { - auto p = GetParam(); - auto dw_tensor = cldnn::tensor(group(p.out_shape.feature[0]), batch(1), feature(1), spatial(3, 3)); - auto dw_weights_layout = layout{ p.default_type, format::goiyx, dw_tensor }; - - auto dw_stride = tensor{ 1, 1, 2, 2 }; - auto in_thresh = get_mem(get_per_channel_layout(p), 
min_random, max_random); - create_topologies( - input_layout("input", get_input_layout(p)), - data("weights", get_mem(get_weights_layout(p), -127, 127)), - data("weights_dw", get_mem(dw_weights_layout, -127, 127)), - data("scale_data", get_mem(get_per_channel_layout(p), 1e-1f)), - binary_convolution("bin_conv_prim", "input", { "weights" }, p.stride, p.pad, p.dilation, p.out_shape, p.groups), - scale("scale", "bin_conv_prim", "scale_data"), - convolution("conv_dw", "scale", { "weights_dw" }, p.out_shape.feature[0], dw_stride, p.pad, p.dilation), - data("slope_data", get_mem(get_per_channel_layout(p))), - activation("activation", "conv_dw", "slope_data", activation_func::relu_negative_slope), - reorder("reorder_bfyx", "activation", p.default_format, data_types::f32) - ); - - tolerance = 1e-5f; - execute(p); -} - -TEST_P(conv_bin_scale_conv_dw_prelu, dw_kernel_3x3_stride1) { - auto p = GetParam(); - auto dw_tensor = cldnn::tensor(group(p.out_shape.feature[0]), batch(1), feature(1), spatial(3, 3)); - auto dw_weights_layout = layout{ p.default_type, format::goiyx, dw_tensor }; - - auto dw_stride = tensor{ 1, 1, 1, 1 }; - auto in_thresh = get_mem(get_per_channel_layout(p), min_random, max_random); - create_topologies( - input_layout("input", get_input_layout(p)), - data("weights", get_mem(get_weights_layout(p), -127, 127)), - data("weights_dw", get_mem(dw_weights_layout, -127, 127)), - data("scale_data", get_mem(get_per_channel_layout(p), 1e-1f)), - binary_convolution("bin_conv_prim", "input", { "weights" }, p.stride, p.pad, p.dilation, p.out_shape, p.groups), - scale("scale", "bin_conv_prim", "scale_data"), - convolution("conv_dw", "scale", { "weights_dw" }, p.out_shape.feature[0], dw_stride, p.pad, p.dilation), - data("slope_data", get_mem(get_per_channel_layout(p))), - activation("activation", "conv_dw", "slope_data", activation_func::relu_negative_slope), - reorder("reorder_bfyx", "activation", p.default_format, data_types::f32) - ); - - tolerance = 1e-5f; - 
execute(p); -} - -INSTANTIATE_TEST_SUITE_P(fusings_gpu, conv_bin_scale_conv_dw_prelu, ::testing::ValuesIn(std::vector{ - bc_test_params{ CASE_BIN_CONV2, 3, 5 }, - bc_test_params{ CASE_BIN_CONV3, 3, 5 }, -})); - -/* ----------------------------------------------------------------------------------------------------- */ -/* ---------------------------------------- INT8 convolution cases ------------------------------------- */ -/* ----------------------------------------------------------------------------------------------------- */ - -class conv_int8_scale : public ConvFusingTest {}; -TEST_P(conv_int8_scale, basic) { - auto p = GetParam(); - create_topologies( - input_layout("input", get_input_layout(p)), - data("weights", get_mem(get_weights_layout(p))), - data("bias", get_mem(get_bias_layout(p))), - data("scale_data", get_mem(get_per_channel_layout(p), 1.0f/p.kernel.count())), - convolution("conv_prim", "input", { "weights" }, { "bias" }, p.groups, p.stride, p.pad, p.dilation), - scale("scale", "conv_prim", "scale_data"), - reorder("reorder_bfyx", "scale", p.default_format, data_types::f32) - ); - - tolerance = 1e-5f; - execute(p); -} - -TEST_P(conv_int8_scale, fp16_scale_out) { - auto p = GetParam(); - create_topologies( - input_layout("input", get_input_layout(p)), - data("weights", get_mem(get_weights_layout(p))), - data("bias", get_mem(get_bias_layout(p))), - data("scale_data", get_mem(get_per_channel_layout(p), 1.0f/p.kernel.count())), - convolution("conv_prim", "input", { "weights" }, { "bias" }, p.groups, p.stride, p.pad, p.dilation), - scale("scale", "conv_prim", "scale_data", optional_data_type{ data_types::f16 }), - reorder("reorder_bfyx", "scale", p.default_format, data_types::f32) - ); - - tolerance = 1e-5f; - execute(p); -} - -INSTANTIATE_TEST_SUITE_P(fusings_gpu, conv_int8_scale, ::testing::ValuesIn(std::vector{ - bc_test_params{ CASE_CONV_U8S8_1, 2, 3 }, - bc_test_params{ CASE_CONV_U8S8_2, 2, 3 }, - bc_test_params{ CASE_CONV_U8S8_3, 2, 3 }, - 
bc_test_params{ CASE_CONV_U8S8_4, 2, 3 }, - bc_test_params{ CASE_CONV_S8S8_1, 2, 3 }, - bc_test_params{ CASE_CONV_S8S8_2, 2, 3 }, - bc_test_params{ CASE_CONV_S8S8_3, 2, 3 }, - bc_test_params{ CASE_CONV_S8S8_4, 2, 3 }, - - bc_test_params{ CASE_CONV3D_U8S8_1, 2, 3 }, - bc_test_params{ CASE_CONV3D_U8S8_2, 2, 3 }, - bc_test_params{ CASE_CONV3D_U8S8_3, 2, 3 }, - bc_test_params{ CASE_CONV3D_U8S8_4, 2, 3 }, - bc_test_params{ CASE_CONV3D_U8S8_5, 2, 3 }, - bc_test_params{ CASE_CONV3D_S8S8_1, 2, 3 }, - bc_test_params{ CASE_CONV3D_S8S8_2, 2, 3 }, - bc_test_params{ CASE_CONV3D_S8S8_3, 2, 3 }, - bc_test_params{ CASE_CONV3D_S8S8_4, 2, 3 }, - bc_test_params{ CASE_CONV3D_S8S8_5, 2, 3 }, -})); - -class conv_int8_eltwise : public ConvFusingTest {}; -TEST_P(conv_int8_eltwise, fp16_eltwise_out) { - auto p = GetParam(); - create_topologies( - input_layout("input", get_input_layout(p)), - data("weights", get_mem(get_weights_layout(p))), - data("bias", get_mem(get_bias_layout(p))), - data("scale_data", get_mem(get_per_channel_layout(p), 1.0f/p.kernel.count())), - convolution("conv_prim", "input", { "weights" }, { "bias" }, p.groups, p.stride, p.pad, p.dilation), - eltwise("scale", { "conv_prim", "scale_data" }, eltwise_mode::prod, data_types::f16), - reorder("reorder_bfyx", "scale", p.default_format, data_types::f32) - ); - - tolerance = 1e-5f; - execute(p); -} - -INSTANTIATE_TEST_SUITE_P(fusings_gpu, conv_int8_eltwise, ::testing::ValuesIn(std::vector{ - bc_test_params{ CASE_CONV_U8S8_1, 2, 3 }, - bc_test_params{ CASE_CONV_U8S8_2, 2, 3 }, - bc_test_params{ CASE_CONV_U8S8_3, 2, 3 }, - bc_test_params{ CASE_CONV_U8S8_4, 2, 3 }, - bc_test_params{ CASE_CONV_S8S8_1, 2, 3 }, - bc_test_params{ CASE_CONV_S8S8_2, 2, 3 }, - bc_test_params{ CASE_CONV_S8S8_3, 2, 3 }, - bc_test_params{ CASE_CONV_S8S8_4, 2, 3 }, - - bc_test_params{ CASE_CONV3D_U8S8_1, 2, 3 }, - bc_test_params{ CASE_CONV3D_U8S8_2, 2, 3 }, - bc_test_params{ CASE_CONV3D_U8S8_3, 2, 3 }, - bc_test_params{ CASE_CONV3D_U8S8_4, 2, 3 }, - 
bc_test_params{ CASE_CONV3D_U8S8_5, 2, 3 }, - bc_test_params{ CASE_CONV3D_S8S8_1, 2, 3 }, - bc_test_params{ CASE_CONV3D_S8S8_2, 2, 3 }, - bc_test_params{ CASE_CONV3D_S8S8_3, 2, 3 }, - bc_test_params{ CASE_CONV3D_S8S8_4, 2, 3 }, - bc_test_params{ CASE_CONV3D_S8S8_5, 2, 3 }, -})); - -class conv_int8_scale_shift_swish : public ConvFusingTest {}; -TEST_P(conv_int8_scale_shift_swish, basic) { - auto p = GetParam(); - create_topologies( - input_layout("input", get_input_layout(p)), - data("weights", get_mem(get_weights_layout(p))), - data("bias", get_mem(get_bias_layout(p))), - data("scale_data", get_mem(get_per_channel_layout(p), 1.0f/p.kernel.count())), - data("shift_data", get_mem(get_per_channel_layout(p), 1)), - convolution("conv_prim", "input", { "weights" }, { "bias" }, p.groups, p.stride, p.pad, p.dilation), - eltwise("scale0", { "conv_prim", "scale_data" }, eltwise_mode::prod), - eltwise("scale1", { "conv_prim", "scale_data" }, eltwise_mode::prod), - eltwise("shift0", { "scale0", "shift_data" }, eltwise_mode::sum), - eltwise("shift1", { "scale1", "shift_data" }, eltwise_mode::sum), - activation("sigmoid", "shift0", activation_func::logistic), - eltwise("mul", { "shift1", "sigmoid" }, eltwise_mode::prod), - reorder("reorder_bfyx", "mul", p.default_format, data_types::f32) - ); - - tolerance = 1e-3f; - execute(p); -} - -INSTANTIATE_TEST_SUITE_P(fusings_gpu, conv_int8_scale_shift_swish, ::testing::ValuesIn(std::vector{ - bc_test_params{ CASE_CONV_U8S8_1, 2, 8 }, - bc_test_params{ CASE_CONV_U8S8_2, 2, 8 }, - bc_test_params{ CASE_CONV_U8S8_3, 2, 8 }, - bc_test_params{ CASE_CONV_U8S8_4, 2, 8 }, - bc_test_params{ CASE_CONV_S8S8_1, 2, 8 }, - bc_test_params{ CASE_CONV_S8S8_2, 2, 8 }, - bc_test_params{ CASE_CONV_S8S8_3, 2, 8 }, - bc_test_params{ CASE_CONV_S8S8_4, 2, 8 }, - - bc_test_params{ CASE_CONV3D_U8S8_1, 2, 8 }, - bc_test_params{ CASE_CONV3D_U8S8_2, 2, 8 }, - bc_test_params{ CASE_CONV3D_U8S8_3, 2, 8 }, - bc_test_params{ CASE_CONV3D_U8S8_4, 2, 8 }, - bc_test_params{ 
CASE_CONV3D_U8S8_5, 2, 8 }, - bc_test_params{ CASE_CONV3D_S8S8_1, 2, 8 }, - bc_test_params{ CASE_CONV3D_S8S8_2, 2, 8 }, - bc_test_params{ CASE_CONV3D_S8S8_3, 2, 8 }, - bc_test_params{ CASE_CONV3D_S8S8_4, 2, 8 }, - bc_test_params{ CASE_CONV3D_S8S8_5, 2, 8 }, -})); - -class conv_int8_prelu_eltwise : public ConvFusingTest {}; -TEST_P(conv_int8_prelu_eltwise, basic) { - auto p = GetParam(); - create_topologies( - input_layout("input", get_input_layout(p)), - data("weights", get_mem(get_weights_layout(p))), - data("bias", get_mem(get_bias_layout(p))), - data("slope_data", get_mem(get_per_channel_layout(p))), - data("eltwise_data", get_mem(get_output_layout(p))), - convolution("conv_prim", "input", { "weights" }, { "bias" }, p.groups, p.stride, p.pad, p.dilation), - activation("activation", "conv_prim", "slope_data", activation_func::relu_negative_slope), - eltwise("eltwise", "activation", "eltwise_data", eltwise_mode::sum), - reorder("reorder_bfyx", "eltwise", p.default_format, data_types::f32) - ); - - tolerance = 1e-5f; - execute(p); -} - -TEST_P(conv_int8_prelu_eltwise, fsv16) { - auto p = GetParam(); - create_topologies( - input_layout("input", get_input_layout(p)), - data("weights", get_mem(get_weights_layout(p))), - data("bias", get_mem(get_bias_layout(p))), - data("slope_data", get_mem(get_per_channel_layout(p))), - data("eltwise_data", get_mem(get_output_layout(p))), - convolution("conv_prim", "input", { "weights" }, { "bias" }, p.groups, p.stride, p.pad, p.dilation), - activation("activation", "conv_prim", "slope_data", activation_func::relu_negative_slope), - eltwise("eltwise", "activation", "eltwise_data", eltwise_mode::sum), - reorder("reorder_bfyx", "eltwise", p.default_format, data_types::f32) - ); - - if (p.default_format.dimension() == 4) { - implementation_desc conv_impl = { format::b_fs_yx_fsv16, "" }; - bo_fused.set_option(build_option::force_implementations({ { "conv_prim", conv_impl } })); - } else { - // TODO Add 5D int8 optimized convolution 
implementations - return; - } - - tolerance = 1e-5f; - execute(p); -} - -INSTANTIATE_TEST_SUITE_P(fusings_gpu, conv_int8_prelu_eltwise, ::testing::ValuesIn(std::vector{ - bc_test_params{ CASE_CONV_U8S8_1, 2, 4 }, - bc_test_params{ CASE_CONV_U8S8_2, 2, 4 }, - bc_test_params{ CASE_CONV_U8S8_3, 2, 4 }, - bc_test_params{ CASE_CONV_U8S8_4, 2, 4 }, - bc_test_params{ CASE_CONV_U8S8_7, 2, 4 }, - bc_test_params{ CASE_CONV_U8S8_8, 2, 4 }, - bc_test_params{ CASE_CONV_S8S8_1, 2, 4 }, - bc_test_params{ CASE_CONV_S8S8_2, 2, 4 }, - bc_test_params{ CASE_CONV_S8S8_3, 2, 4 }, - bc_test_params{ CASE_CONV_S8S8_4, 2, 4 }, - bc_test_params{ CASE_CONV_S8S8_7, 2, 4 }, - bc_test_params{ CASE_CONV_S8S8_8, 2, 4 }, - - bc_test_params{ CASE_CONV3D_U8S8_1, 2, 4 }, - bc_test_params{ CASE_CONV3D_U8S8_2, 2, 4 }, - bc_test_params{ CASE_CONV3D_U8S8_3, 2, 4 }, - bc_test_params{ CASE_CONV3D_U8S8_4, 2, 4 }, - bc_test_params{ CASE_CONV3D_U8S8_5, 2, 4 }, - bc_test_params{ CASE_CONV3D_S8S8_1, 2, 4 }, - bc_test_params{ CASE_CONV3D_S8S8_2, 2, 4 }, - bc_test_params{ CASE_CONV3D_S8S8_3, 2, 4 }, - bc_test_params{ CASE_CONV3D_S8S8_4, 2, 4 }, - bc_test_params{ CASE_CONV3D_S8S8_5, 2, 4 }, -})); - -class conv_int8_activation_eltwise_quantize : public ConvFusingTest {}; -TEST_P(conv_int8_activation_eltwise_quantize, fsv16) { - auto p = GetParam(); - create_topologies( - input_layout("input", get_input_layout(p)), - data("weights", get_mem(get_weights_layout(p))), - data("bias", get_mem(get_bias_layout(p))), - data("eltwise_data", get_mem(get_output_layout(p))), - data("in_lo", get_mem(get_per_channel_layout(p), min_random, 0)), - data("in_hi", get_mem(get_per_channel_layout(p), 1, max_random)), - data("out_lo", get_mem(get_single_element_layout(p), -127)), - data("out_hi", get_mem(get_single_element_layout(p), 127)), - convolution("conv_prim", "input", { "weights" }, { "bias" }, p.groups, p.stride, p.pad, p.dilation), - activation("activation", "conv_prim", activation_func::negative), - eltwise("eltwise", 
"activation", "eltwise_data", eltwise_mode::sum), - quantize("quantize", "eltwise", "in_lo", "in_hi", "out_lo", "out_hi", 255, data_types::i8), - reorder("reorder_bfyx", "quantize", p.default_format, data_types::f32) - ); - - if (p.default_format.dimension() == 4) { - implementation_desc conv_impl = { format::b_fs_yx_fsv16, "" }; - bo_fused.set_option(build_option::force_implementations({ { "conv_prim", conv_impl } })); - } else { - // TODO Add 5D int8 optimized convolution implementations - return; - } - - tolerance = 1.f; - execute(p); -} - -TEST_P(conv_int8_activation_eltwise_quantize, fsv32) { - auto p = GetParam(); - create_topologies( - input_layout("input", get_input_layout(p)), - data("weights", get_mem(get_weights_layout(p))), - data("bias", get_mem(get_bias_layout(p))), - data("eltwise_data", get_mem(get_output_layout(p))), - data("in_lo", get_mem(get_per_channel_layout(p), min_random, 0)), - data("in_hi", get_mem(get_per_channel_layout(p), 1, max_random)), - data("out_lo", get_mem(get_single_element_layout(p), -127)), - data("out_hi", get_mem(get_single_element_layout(p), 127)), - convolution("conv_prim", "input", { "weights" }, { "bias" }, p.groups, p.stride, p.pad, p.dilation), - activation("activation", "conv_prim", activation_func::negative), - eltwise("eltwise", "activation", "eltwise_data", eltwise_mode::sum), - quantize("quantize", "eltwise", "in_lo", "in_hi", "out_lo", "out_hi", 255, data_types::i8), - reorder("reorder_bfyx", "quantize", p.default_format, data_types::f32) - ); - - if (p.default_format.dimension() == 4) { - implementation_desc conv_impl = { format::b_fs_yx_fsv32, "" }; - bo_fused.set_option(build_option::force_implementations({ { "conv_prim", conv_impl } })); - } else { - // TODO Add 5D int8 optimized convolution implementations - return; - } - - tolerance = 1.f; - execute(p); -} - -INSTANTIATE_TEST_SUITE_P(fusings_gpu, conv_int8_activation_eltwise_quantize, ::testing::ValuesIn(std::vector{ - bc_test_params{ CASE_CONV_U8S8_1, 2, 5 
}, - bc_test_params{ CASE_CONV_U8S8_2, 2, 5 }, - bc_test_params{ CASE_CONV_U8S8_3, 2, 5 }, - bc_test_params{ CASE_CONV_U8S8_4, 2, 5 }, - bc_test_params{ CASE_CONV_U8S8_7, 2, 5 }, - bc_test_params{ CASE_CONV_U8S8_8, 2, 5 }, - bc_test_params{ CASE_CONV_S8S8_1, 2, 5 }, - bc_test_params{ CASE_CONV_S8S8_2, 2, 5 }, - bc_test_params{ CASE_CONV_S8S8_3, 2, 5 }, - bc_test_params{ CASE_CONV_S8S8_4, 2, 5 }, - bc_test_params{ CASE_CONV_S8S8_7, 2, 5 }, - bc_test_params{ CASE_CONV_S8S8_8, 2, 5 }, -})); - -class conv_int8_activation_eltwise : public ConvFusingTest {}; -TEST_P(conv_int8_activation_eltwise, fsv16) { - auto p = GetParam(); - create_topologies( - input_layout("input", get_input_layout(p)), - data("weights", get_mem(get_weights_layout(p))), - data("bias", get_mem(get_bias_layout(p))), - data("eltwise_data", get_mem(get_output_layout(p))), - convolution("conv_prim", "input", { "weights" }, { "bias" }, p.groups, p.stride, p.pad, p.dilation), - activation("activation", "conv_prim", activation_func::negative), - eltwise("eltwise", "activation", "eltwise_data", eltwise_mode::sum), - reorder("reorder_bfyx", "eltwise", p.default_format, data_types::f32) - ); - - if (p.default_format.dimension() == 4) { - implementation_desc conv_impl = { format::b_fs_yx_fsv16, "" }; - bo_fused.set_option(build_option::force_implementations({ { "conv_prim", conv_impl } })); - } else { - // TODO Add 5D int8 optimized convolution implementations - return; - } - - tolerance = 1e-5f; - execute(p); -} - -TEST_P(conv_int8_activation_eltwise, fsv32) { - auto p = GetParam(); - create_topologies( - input_layout("input", get_input_layout(p)), - data("weights", get_mem(get_weights_layout(p))), - data("bias", get_mem(get_bias_layout(p))), - data("eltwise_data", get_mem(get_output_layout(p))), - convolution("conv_prim", "input", { "weights" }, { "bias" }, p.groups, p.stride, p.pad, p.dilation), - activation("activation", "conv_prim", activation_func::negative), - eltwise("eltwise", "activation", 
"eltwise_data", eltwise_mode::sum), - reorder("reorder_bfyx", "eltwise", p.default_format, data_types::f32) - ); - - if (p.default_format.dimension() == 4) { - implementation_desc conv_impl = { format::b_fs_yx_fsv32, "" }; - bo_fused.set_option(build_option::force_implementations({ { "conv_prim", conv_impl } })); - } else { - // TODO Add 5D int8 optimized convolution implementations - return; - } - - tolerance = 1e-5f; - execute(p); -} - -INSTANTIATE_TEST_SUITE_P(fusings_gpu, conv_int8_activation_eltwise, ::testing::ValuesIn(std::vector{ - bc_test_params{ CASE_CONV_U8S8_1, 2, 4 }, - bc_test_params{ CASE_CONV_U8S8_2, 2, 4 }, - bc_test_params{ CASE_CONV_U8S8_3, 2, 4 }, - bc_test_params{ CASE_CONV_U8S8_4, 2, 4 }, - bc_test_params{ CASE_CONV_U8S8_7, 2, 4 }, - bc_test_params{ CASE_CONV_U8S8_8, 2, 4 }, - bc_test_params{ CASE_CONV_S8S8_1, 2, 4 }, - bc_test_params{ CASE_CONV_S8S8_2, 2, 4 }, - bc_test_params{ CASE_CONV_S8S8_3, 2, 4 }, - bc_test_params{ CASE_CONV_S8S8_4, 2, 4 }, - bc_test_params{ CASE_CONV_S8S8_7, 2, 4 }, - bc_test_params{ CASE_CONV_S8S8_8, 2, 4 }, -})); - -class conv_int8_quantize_u8 : public ConvFusingTest {}; -TEST_P(conv_int8_quantize_u8, per_channel) { - auto p = GetParam(); - create_topologies( - input_layout("input", get_input_layout(p)), - data("weights", get_mem(get_weights_layout(p))), - data("bias", get_mem(get_bias_layout(p))), - data("in_lo", get_mem(get_per_channel_layout(p), min_random, 0)), - data("in_hi", get_mem(get_per_channel_layout(p), 1, max_random)), - data("out_lo", get_mem(get_single_element_layout(p), 0)), - data("out_hi", get_mem(get_single_element_layout(p), 255)), - convolution("conv_prim", "input", { "weights" }, { "bias" }, p.groups, p.stride, p.pad, p.dilation), - quantize("quantize", "conv_prim", "in_lo", "in_hi", "out_lo", "out_hi", 256, data_types::u8), - reorder("reorder_bfyx", "quantize", p.default_format, data_types::f32) - ); - - tolerance = 1.0f; - execute(p); -} - -TEST_P(conv_int8_quantize_u8, per_tensor) { - auto p 
= GetParam(); - create_topologies( - input_layout("input", get_input_layout(p)), - data("weights", get_mem(get_weights_layout(p))), - data("bias", get_mem(get_bias_layout(p))), - data("in_lo", get_mem(get_single_element_layout(p), -10)), - data("in_hi", get_mem(get_single_element_layout(p), 10)), - data("out_lo", get_mem(get_single_element_layout(p), 0)), - data("out_hi", get_mem(get_single_element_layout(p), 255)), - convolution("conv_prim", "input", { "weights" }, { "bias" }, p.groups, p.stride, p.pad, p.dilation), - quantize("quantize", "conv_prim", "in_lo", "in_hi", "out_lo", "out_hi", 256, data_types::u8), - reorder("reorder_bfyx", "quantize", p.default_format, data_types::f32) - ); - - tolerance = 1.0f; - execute(p); -} - -INSTANTIATE_TEST_SUITE_P(fusings_gpu, conv_int8_quantize_u8, ::testing::ValuesIn(std::vector{ - bc_test_params{ CASE_CONV_U8S8_1, 2, 3 }, - bc_test_params{ CASE_CONV_U8S8_2, 2, 3 }, - bc_test_params{ CASE_CONV_U8S8_3, 2, 3 }, - bc_test_params{ CASE_CONV_U8S8_4, 2, 3 }, - bc_test_params{ CASE_CONV_S8S8_1, 2, 3 }, - bc_test_params{ CASE_CONV_S8S8_2, 2, 3 }, - bc_test_params{ CASE_CONV_S8S8_3, 2, 3 }, - bc_test_params{ CASE_CONV_S8S8_4, 2, 3 }, - bc_test_params{ CASE_CONV_S8S8_8, 2, 3 }, - - bc_test_params{ CASE_CONV3D_U8S8_1, 2, 3 }, - bc_test_params{ CASE_CONV3D_U8S8_2, 2, 3 }, - bc_test_params{ CASE_CONV3D_U8S8_3, 2, 3 }, - bc_test_params{ CASE_CONV3D_U8S8_4, 2, 3 }, - bc_test_params{ CASE_CONV3D_U8S8_5, 2, 3 }, - bc_test_params{ CASE_CONV3D_S8S8_1, 2, 3 }, - bc_test_params{ CASE_CONV3D_S8S8_2, 2, 3 }, - bc_test_params{ CASE_CONV3D_S8S8_3, 2, 3 }, - bc_test_params{ CASE_CONV3D_S8S8_4, 2, 3 }, - bc_test_params{ CASE_CONV3D_S8S8_5, 2, 3 }, -})); - -class conv_int8_scale_quantize_i8 : public ConvFusingTest {}; -TEST_P(conv_int8_scale_quantize_i8, basic) { - auto p = GetParam(); - create_topologies( - input_layout("input", get_input_layout(p)), - data("weights", get_mem(get_weights_layout(p))), - data("bias", get_mem(get_bias_layout(p))), - 
data("in_lo", get_mem(get_per_channel_layout(p), min_random, 0)), - data("in_hi", get_mem(get_per_channel_layout(p), 1, max_random)), - data("out_lo", get_mem(get_single_element_layout(p), -127)), - data("out_hi", get_mem(get_single_element_layout(p), 127)), - data("scale_data", get_mem(get_per_channel_layout(p), 1.0f/p.kernel.count()/255)), - convolution("conv_prim", "input", { "weights" }, { "bias" }, p.groups, p.stride, p.pad, p.dilation), - scale("scale", "conv_prim", "scale_data"), - quantize("quantize", "scale", "in_lo", "in_hi", "out_lo", "out_hi", 255, data_types::i8), - reorder("reorder_bfyx", "quantize", p.default_format, data_types::f32) - ); - // Output elements are in range [-127, 127] - // 1.0f difference is allowed, since quantize can return different values in ref and scale_shift kernels - // due to big error of division (in ref kernel). - tolerance = 1.0f; - execute(p); -} - -INSTANTIATE_TEST_SUITE_P(fusings_gpu, conv_int8_scale_quantize_i8, ::testing::ValuesIn(std::vector{ - bc_test_params{ CASE_CONV_U8S8_1, 2, 4 }, - bc_test_params{ CASE_CONV_U8S8_2, 2, 4 }, - bc_test_params{ CASE_CONV_U8S8_3, 2, 4 }, - bc_test_params{ CASE_CONV_U8S8_4, 2, 4 }, - bc_test_params{ CASE_CONV_U8S8_9, 2, 4 }, - bc_test_params{ CASE_CONV_U8S8_10, 2, 4 }, - bc_test_params{ CASE_CONV_S8S8_1, 2, 4 }, - bc_test_params{ CASE_CONV_S8S8_2, 2, 4 }, - bc_test_params{ CASE_CONV_S8S8_3, 2, 4 }, - bc_test_params{ CASE_CONV_S8S8_4, 2, 4 }, - bc_test_params{ CASE_CONV_S8S8_9, 2, 4 }, - bc_test_params{ CASE_CONV_S8S8_10, 2, 4 }, - - bc_test_params{ CASE_CONV3D_U8S8_1, 2, 4 }, - bc_test_params{ CASE_CONV3D_U8S8_2, 2, 4 }, - bc_test_params{ CASE_CONV3D_U8S8_3, 2, 4 }, - bc_test_params{ CASE_CONV3D_U8S8_4, 2, 4 }, - bc_test_params{ CASE_CONV3D_U8S8_5, 2, 4 }, - bc_test_params{ CASE_CONV3D_S8S8_1, 2, 4 }, - bc_test_params{ CASE_CONV3D_S8S8_2, 2, 4 }, - bc_test_params{ CASE_CONV3D_S8S8_3, 2, 4 }, - bc_test_params{ CASE_CONV3D_S8S8_4, 2, 4 }, - bc_test_params{ CASE_CONV3D_S8S8_5, 2, 4 }, 
-})); - -class conv_int8_scale_quantize_i8_conv_b_fs_yx_fsv4_int8 : public ConvFusingTest {}; -TEST_P(conv_int8_scale_quantize_i8_conv_b_fs_yx_fsv4_int8, basic) { - auto p = GetParam(); - create_topologies( - input_layout("input", get_input_layout(p)), - data("weights", get_mem(get_weights_layout(p))), - data("bias", get_mem(get_bias_layout(p))), - data("in_lo", get_mem(get_per_channel_layout(p), min_random, 0)), - data("in_hi", get_mem(get_per_channel_layout(p), 1, max_random)), - data("out_lo", get_mem(get_single_element_layout(p), -127)), - data("out_hi", get_mem(get_single_element_layout(p), 127)), - data("scale_data", get_mem(get_per_channel_layout(p), 1.0f / p.kernel.count() / 255)), - convolution("conv_prim", "input", { "weights" }, { "bias" }, p.groups, p.stride, p.pad, p.dilation), - scale("scale", "conv_prim", "scale_data"), - quantize("quantize", "scale", "in_lo", "in_hi", "out_lo", "out_hi", 255, data_types::i8), - reorder("reorder_bfyx", "quantize", p.default_format, data_types::f32) - ); - implementation_desc conv_impl = { format::b_fs_yx_fsv4, "convolution_gpu_b_fs_yx_fsv4_int8" }; - bo_fused.set_option(build_option::force_implementations({ { "conv_prim", conv_impl } })); - - tolerance = 1.0f; - execute(p); -} - -INSTANTIATE_TEST_SUITE_P(fusings_gpu, conv_int8_scale_quantize_i8_conv_b_fs_yx_fsv4_int8, ::testing::ValuesIn(std::vector{ - bc_test_params{ CASE_CONV_S8S8_11, 2, 4 }, -})); - -class conv_int8_relu_quantize : public ConvFusingTest {}; -TEST_P(conv_int8_relu_quantize, i8) { - auto p = GetParam(); - create_topologies( - input_layout("input", get_input_layout(p)), - data("weights", get_mem(get_weights_layout(p))), - data("bias", get_mem(get_bias_layout(p))), - data("in_lo", get_mem(get_per_channel_layout(p), min_random, 0)), - data("in_hi", get_mem(get_per_channel_layout(p), 1, max_random)), - data("out_lo", get_mem(get_single_element_layout(p), -127)), - data("out_hi", get_mem(get_single_element_layout(p), 127)), - convolution("conv_prim", 
"input", { "weights" }, { "bias" }, p.groups, p.stride, p.pad, p.dilation), - activation("relu", "conv_prim", activation_func::relu), - quantize("quantize", "relu", "in_lo", "in_hi", "out_lo", "out_hi", 256, data_types::i8), - reorder("reorder_bfyx", "quantize", p.default_format, data_types::f32) - ); - // Output elements are in range [-127, 127] - // 1.0f difference is allowed, since quantize can return different values in ref and scale_shift kernels - // due to big error of division (in ref kernel). - tolerance = 1.0f; - execute(p); -} - -TEST_P(conv_int8_relu_quantize, u8) { - auto p = GetParam(); - create_topologies( - input_layout("input", get_input_layout(p)), - data("weights", get_mem(get_weights_layout(p))), - data("bias", get_mem(get_bias_layout(p))), - data("in_lo", get_mem(get_per_channel_layout(p), 0)), - data("in_hi", get_mem(get_per_channel_layout(p), 1, max_random)), - data("out_lo", get_mem(get_single_element_layout(p), 0)), - data("out_hi", get_mem(get_single_element_layout(p), 255)), - convolution("conv_prim", "input", { "weights" }, { "bias" }, p.groups, p.stride, p.pad, p.dilation), - activation("relu", "conv_prim", activation_func::relu), - quantize("quantize", "relu", "in_lo", "in_hi", "out_lo", "out_hi", 256, data_types::u8), - reorder("reorder_bfyx", "quantize", p.default_format, data_types::f32) - ); - - tolerance = 1.0f; - execute(p); -} - -INSTANTIATE_TEST_SUITE_P(fusings_gpu, conv_int8_relu_quantize, ::testing::ValuesIn(std::vector{ - bc_test_params{ CASE_CONV_U8S8_1, 2, 4 }, - bc_test_params{ CASE_CONV_U8S8_2, 2, 4 }, - bc_test_params{ CASE_CONV_U8S8_3, 2, 4 }, - bc_test_params{ CASE_CONV_U8S8_4, 2, 4 }, - bc_test_params{ CASE_CONV_S8S8_1, 2, 4 }, - bc_test_params{ CASE_CONV_S8S8_2, 2, 4 }, - bc_test_params{ CASE_CONV_S8S8_3, 2, 4 }, - bc_test_params{ CASE_CONV_S8S8_4, 2, 4 }, - - bc_test_params{ CASE_CONV3D_U8S8_1, 2, 4 }, - bc_test_params{ CASE_CONV3D_U8S8_2, 2, 4 }, - bc_test_params{ CASE_CONV3D_U8S8_3, 2, 4 }, - bc_test_params{ 
CASE_CONV3D_U8S8_4, 2, 4 }, - bc_test_params{ CASE_CONV3D_U8S8_5, 2, 4 }, - bc_test_params{ CASE_CONV3D_S8S8_1, 2, 4 }, - bc_test_params{ CASE_CONV3D_S8S8_2, 2, 4 }, - bc_test_params{ CASE_CONV3D_S8S8_3, 2, 4 }, - bc_test_params{ CASE_CONV3D_S8S8_4, 2, 4 }, - bc_test_params{ CASE_CONV3D_S8S8_5, 2, 4 }, -})); - -class conv_int8_scale_activation_quantize_i8 : public ConvFusingTest {}; -TEST_P(conv_int8_scale_activation_quantize_i8, basic) { - auto p = GetParam(); - create_topologies( - input_layout("input", get_input_layout(p)), - data("weights", get_mem(get_weights_layout(p))), - data("bias", get_mem(get_bias_layout(p))), - data("in_lo", get_mem(get_per_channel_layout(p), min_random, 0)), - data("in_hi", get_mem(get_per_channel_layout(p), 1, max_random)), - data("out_lo", get_mem(get_single_element_layout(p), -127)), - data("out_hi", get_mem(get_single_element_layout(p), 127)), - data("scale_data", get_mem(get_per_channel_layout(p), 1.0f/p.kernel.count()/255)), - convolution("conv_prim", "input", { "weights" }, { "bias" }, p.groups, p.stride, p.pad, p.dilation), - scale("scale", "conv_prim", "scale_data"), - activation("activation_scale", "scale", activation_func::exp), - quantize("quantize", "activation_scale", "in_lo", "in_hi", "out_lo", "out_hi", 255, data_types::i8), - reorder("reorder_bfyx", "quantize", p.default_format, data_types::f32) - ); - - tolerance = 2.f; - execute(p); -} - -INSTANTIATE_TEST_SUITE_P(fusings_gpu, conv_int8_scale_activation_quantize_i8, ::testing::ValuesIn(std::vector{ - bc_test_params{ CASE_CONV_U8S8_1, 2, 5 }, - bc_test_params{ CASE_CONV_U8S8_2, 2, 5 }, - bc_test_params{ CASE_CONV_U8S8_3, 2, 5 }, - bc_test_params{ CASE_CONV_U8S8_4, 2, 5 }, - bc_test_params{ CASE_CONV_S8S8_1, 2, 5 }, - bc_test_params{ CASE_CONV_S8S8_2, 2, 5 }, - bc_test_params{ CASE_CONV_S8S8_3, 2, 5 }, - bc_test_params{ CASE_CONV_S8S8_4, 2, 5 }, - - bc_test_params{ CASE_CONV3D_U8S8_1, 2, 5 }, - bc_test_params{ CASE_CONV3D_U8S8_2, 2, 5 }, - bc_test_params{ 
CASE_CONV3D_U8S8_3, 2, 5 }, - bc_test_params{ CASE_CONV3D_U8S8_4, 2, 5 }, - bc_test_params{ CASE_CONV3D_U8S8_5, 2, 5 }, - bc_test_params{ CASE_CONV3D_S8S8_1, 2, 5 }, - bc_test_params{ CASE_CONV3D_S8S8_2, 2, 5 }, - bc_test_params{ CASE_CONV3D_S8S8_3, 2, 5 }, - bc_test_params{ CASE_CONV3D_S8S8_4, 2, 5 }, - bc_test_params{ CASE_CONV3D_S8S8_5, 2, 5 }, -})); - -class conv_int8_scale_activation_quantize_i8_eltwise_fp32 : public ConvFusingTest {}; -TEST_P(conv_int8_scale_activation_quantize_i8_eltwise_fp32, basic) { - auto p = GetParam(); - create_topologies( - input_layout("input", get_input_layout(p)), - data("weights", get_mem(get_weights_layout(p))), - data("bias", get_mem(get_bias_layout(p))), - data("in_lo", get_mem(get_per_channel_layout(p), min_random, 0)), - data("in_hi", get_mem(get_per_channel_layout(p), 1, max_random)), - data("out_lo", get_mem(get_single_element_layout(p), -127)), - data("out_hi", get_mem(get_single_element_layout(p), 127)), - data("scale_data", get_mem(get_per_channel_layout(p), 1.0f/p.kernel.count()/255)), - data("eltwise_data", get_mem(get_output_layout(p))), - convolution("conv_prim", "input", { "weights" }, { "bias" }, p.groups, p.stride, p.pad, p.dilation), - scale("scale", "conv_prim", "scale_data"), - activation("activation_scale", "scale", activation_func::exp), - quantize("quantize", "activation_scale", "in_lo", "in_hi", "out_lo", "out_hi", 255, data_types::i8), - eltwise("sum", { "quantize", "eltwise_data" }, eltwise_mode::sum, data_types::f32), - reorder("reorder_bfyx", "sum", p.default_format, data_types::f32) - ); - - tolerance = 1.f; - execute(p); -} - -INSTANTIATE_TEST_SUITE_P(fusings_gpu, conv_int8_scale_activation_quantize_i8_eltwise_fp32, ::testing::ValuesIn(std::vector{ - bc_test_params{ CASE_CONV_U8S8_1, 2, 6 }, - bc_test_params{ CASE_CONV_U8S8_2, 2, 6 }, - bc_test_params{ CASE_CONV_U8S8_3, 2, 6 }, - bc_test_params{ CASE_CONV_U8S8_4, 2, 6 }, - bc_test_params{ CASE_CONV_S8S8_1, 2, 6 }, - bc_test_params{ CASE_CONV_S8S8_2, 
2, 6 }, - bc_test_params{ CASE_CONV_S8S8_3, 2, 6 }, - bc_test_params{ CASE_CONV_S8S8_4, 2, 6 }, - - bc_test_params{ CASE_CONV3D_U8S8_1, 2, 6 }, - bc_test_params{ CASE_CONV3D_U8S8_2, 2, 6 }, - bc_test_params{ CASE_CONV3D_U8S8_3, 2, 6 }, - bc_test_params{ CASE_CONV3D_U8S8_4, 2, 6 }, - bc_test_params{ CASE_CONV3D_U8S8_5, 2, 6 }, - bc_test_params{ CASE_CONV3D_S8S8_1, 2, 6 }, - bc_test_params{ CASE_CONV3D_S8S8_2, 2, 6 }, - bc_test_params{ CASE_CONV3D_S8S8_3, 2, 6 }, - bc_test_params{ CASE_CONV3D_S8S8_4, 2, 6 }, - bc_test_params{ CASE_CONV3D_S8S8_5, 2, 6 }, -})); - -class conv_int8_scale_activation_quantize_i8_activation : public ConvFusingTest {}; -TEST_P(conv_int8_scale_activation_quantize_i8_activation, basic) { - auto p = GetParam(); - create_topologies( - input_layout("input", get_input_layout(p)), - data("weights", get_mem(get_weights_layout(p))), - data("bias", get_mem(get_bias_layout(p))), - data("in_lo", get_mem(get_per_channel_layout(p), min_random, 0)), - data("in_hi", get_mem(get_per_channel_layout(p), 1, max_random)), - data("out_lo", get_mem(get_single_element_layout(p), -127)), - data("out_hi", get_mem(get_single_element_layout(p), 127)), - data("scale_data", get_mem(get_per_channel_layout(p), 1.0f/p.kernel.count()/255)), - data("slope_data", get_mem(get_per_channel_layout(p))), - convolution("conv_prim", "input", { "weights" }, { "bias" }, p.groups, p.stride, p.pad, p.dilation), - scale("scale", "conv_prim", "scale_data"), - activation("activation_scale", "scale", "slope_data", activation_func::relu_negative_slope), - quantize("quantize", "activation_scale", "in_lo", "in_hi", "out_lo", "out_hi", 255, data_types::i8), - activation("activation_quantize", "quantize", activation_func::relu), - reorder("reorder_bfyx", "activation_quantize", p.default_format, data_types::f32) - ); - - tolerance = 1.f; - execute(p); -} - -INSTANTIATE_TEST_SUITE_P(fusings_gpu, conv_int8_scale_activation_quantize_i8_activation, ::testing::ValuesIn(std::vector{ - bc_test_params{ 
CASE_CONV_U8S8_1, 2, 6 }, - bc_test_params{ CASE_CONV_U8S8_2, 2, 6 }, - bc_test_params{ CASE_CONV_U8S8_3, 2, 6 }, - bc_test_params{ CASE_CONV_U8S8_4, 2, 6 }, - bc_test_params{ CASE_CONV_S8S8_1, 2, 6 }, - bc_test_params{ CASE_CONV_S8S8_2, 2, 6 }, - bc_test_params{ CASE_CONV_S8S8_3, 2, 6 }, - bc_test_params{ CASE_CONV_S8S8_4, 2, 6 }, - - bc_test_params{ CASE_CONV3D_U8S8_1, 2, 6 }, - bc_test_params{ CASE_CONV3D_U8S8_2, 2, 6 }, - bc_test_params{ CASE_CONV3D_U8S8_3, 2, 6 }, - bc_test_params{ CASE_CONV3D_U8S8_4, 2, 6 }, - bc_test_params{ CASE_CONV3D_U8S8_5, 2, 6 }, - bc_test_params{ CASE_CONV3D_S8S8_1, 2, 6 }, - bc_test_params{ CASE_CONV3D_S8S8_2, 2, 6 }, - bc_test_params{ CASE_CONV3D_S8S8_3, 2, 6 }, - bc_test_params{ CASE_CONV3D_S8S8_4, 2, 6 }, - bc_test_params{ CASE_CONV3D_S8S8_5, 2, 6 }, -})); - - -class conv_int8_scale_activation_quantize_i8_eltwise_fp32_quantize_i8 : public ConvFusingTest {}; -// With some input values accuracy error might be = 2, so the test is disabled. -TEST_P(conv_int8_scale_activation_quantize_i8_eltwise_fp32_quantize_i8, DISABLED_basic) { - auto p = GetParam(); - create_topologies( - input_layout("input", get_input_layout(p)), - data("weights", get_mem(get_weights_layout(p))), - data("bias", get_mem(get_bias_layout(p))), - data("in_lo", get_mem(get_per_channel_layout(p), min_random, 0)), - data("in_lo1", get_mem(get_per_channel_layout(p), min_random, 0)), - data("in_hi", get_mem(get_per_channel_layout(p), 1, max_random)), - data("in_hi1", get_mem(get_per_channel_layout(p), 1, max_random)), - data("out_lo", get_mem(get_single_element_layout(p), -127)), - data("out_lo1", get_mem(get_single_element_layout(p), -127)), - data("out_hi", get_mem(get_single_element_layout(p), 127)), - data("out_hi1", get_mem(get_single_element_layout(p), 127)), - data("scale_data", get_mem(get_per_channel_layout(p), 1.0f/p.kernel.count()/255)), - data("eltwise_data", get_mem(layout{ data_types::i8, p.input_format, p.out_shape })), - convolution("conv_prim", "input", { 
"weights" }, { "bias" }, p.groups, p.stride, p.pad, p.dilation), - scale("scale", "conv_prim", "scale_data"), - activation("activation_scale", "scale", activation_func::exp), - quantize("quantize", "activation_scale", "in_lo", "in_hi", "out_lo", "out_hi", 255, data_types::i8), - eltwise("sum", { "quantize", "eltwise_data" }, eltwise_mode::sum, data_types::f32), - quantize("quantize_1", "sum", "in_lo1", "in_hi1", "out_lo1", "out_hi1", 255, data_types::i8), - reorder("reorder_bfyx", "quantize_1", p.default_format, data_types::f32) - ); - - tolerance = 1.f; - execute(p); -} - -INSTANTIATE_TEST_SUITE_P(fusings_gpu, conv_int8_scale_activation_quantize_i8_eltwise_fp32_quantize_i8, ::testing::ValuesIn(std::vector{ - bc_test_params{ CASE_CONV_U8S8_1, 2, 7 }, - bc_test_params{ CASE_CONV_U8S8_2, 2, 7 }, - bc_test_params{ CASE_CONV_U8S8_3, 2, 7 }, - bc_test_params{ CASE_CONV_U8S8_4, 2, 7 }, - bc_test_params{ CASE_CONV_S8S8_1, 2, 7 }, - bc_test_params{ CASE_CONV_S8S8_2, 2, 7 }, - bc_test_params{ CASE_CONV_S8S8_3, 2, 7 }, - bc_test_params{ CASE_CONV_S8S8_4, 2, 7 }, - - bc_test_params{ CASE_CONV3D_U8S8_1, 2, 7 }, - bc_test_params{ CASE_CONV3D_U8S8_2, 2, 7 }, - bc_test_params{ CASE_CONV3D_U8S8_3, 2, 7 }, - bc_test_params{ CASE_CONV3D_U8S8_4, 2, 7 }, - bc_test_params{ CASE_CONV3D_U8S8_5, 2, 7 }, - bc_test_params{ CASE_CONV3D_S8S8_1, 2, 7 }, - bc_test_params{ CASE_CONV3D_S8S8_2, 2, 7 }, - bc_test_params{ CASE_CONV3D_S8S8_3, 2, 7 }, - bc_test_params{ CASE_CONV3D_S8S8_4, 2, 7 }, - bc_test_params{ CASE_CONV3D_S8S8_5, 2, 7 }, -})); - -class conv_int8_scale_prelu_quantize_i8_eltwise_fp32_quantize_i8_vec : public ConvFusingTest {}; -TEST_P(conv_int8_scale_prelu_quantize_i8_eltwise_fp32_quantize_i8_vec, vector_ops) { - auto p = GetParam(); - create_topologies( - input_layout("input", get_input_layout(p)), - data("weights", get_mem(get_weights_layout(p))), - data("bias", get_mem(get_bias_layout(p))), - data("in_lo", get_mem(get_per_channel_layout(p), min_random, 0)), - data("in_lo1", 
get_mem(get_per_channel_layout(p), min_random, 0)), - data("in_hi", get_mem(get_per_channel_layout(p), 1, max_random)), - data("in_hi1", get_mem(get_per_channel_layout(p), 1, max_random)), - data("out_lo", get_mem(get_single_element_layout(p), -127)), - data("out_lo1", get_mem(get_single_element_layout(p), -127)), - data("out_hi", get_mem(get_single_element_layout(p), 127)), - data("out_hi1", get_mem(get_single_element_layout(p), 127)), - data("scale_data", get_mem(get_per_channel_layout(p), 1.0f/p.kernel.count()/255)), - data("slope_data", get_mem(get_per_channel_layout(p))), - data("eltwise_data", get_mem(layout{ data_types::i8, format::b_fs_yx_fsv4, p.out_shape })), - convolution("conv_prim", "input", { "weights" }, { "bias" }, p.groups, p.stride, p.pad, p.dilation), - scale("scale", "conv_prim", "scale_data"), - activation("activation_scale", "scale", "slope_data", activation_func::relu_negative_slope), - quantize("quantize", "activation_scale", "in_lo", "in_hi", "out_lo", "out_hi", 255, data_types::i8), - eltwise("sum", { "quantize", "eltwise_data" }, eltwise_mode::sum, data_types::f32), - quantize("quantize_1", "sum", "in_lo1", "in_hi1", "out_lo1", "out_hi1", 255, data_types::i8), - reorder("reorder_bfyx", "quantize_1", p.default_format, data_types::f32) - ); - - implementation_desc conv_impl = { format::b_fs_yx_fsv4, "convolution_gpu_b_fs_yx_fsv4_1x1" }; - bo_fused.set_option(build_option::force_implementations({ { "conv_prim", conv_impl } })); - - tolerance = 1.f; - execute(p); -} - -TEST_P(conv_int8_scale_prelu_quantize_i8_eltwise_fp32_quantize_i8_vec, vector_ops_mixed_types) { - auto p = GetParam(); - create_topologies( - input_layout("input", get_input_layout(p)), - data("weights", get_mem(get_weights_layout(p))), - data("bias", get_mem(get_bias_layout(p))), - data("in_lo", get_mem(get_per_channel_layout(p), min_random, 0)), - data("in_lo1", get_mem(get_per_channel_layout(p), min_random, 0)), - data("in_hi", get_mem(get_per_channel_layout(p), 1, 
max_random)), - data("in_hi1", get_mem(get_per_channel_layout(p), 1, max_random)), - data("out_lo", get_mem(get_single_element_layout(p), -127)), - data("out_lo1", get_mem(get_single_element_layout(p), -127)), - data("out_hi", get_mem(get_single_element_layout(p), 127)), - data("out_hi1", get_mem(get_single_element_layout(p), 127)), - data("scale_data", get_mem(get_per_channel_layout(p), 1.0f/p.kernel.count()/255)), - data("slope_data", get_mem(layout{ data_types::f16, p.default_format, tensor{ 1, p.out_shape.feature[0], 1, 1 } })), - data("eltwise_data", get_mem(layout{ data_types::u8, format::b_fs_yx_fsv4, p.out_shape })), - convolution("conv_prim", "input", { "weights" }, { "bias" }, p.groups, p.stride, p.pad, p.dilation), - scale("scale", "conv_prim", "scale_data"), - activation("activation_scale", "scale", "slope_data", activation_func::relu_negative_slope), - quantize("quantize", "activation_scale", "in_lo", "in_hi", "out_lo", "out_hi", 255, data_types::i8), - eltwise("sum", { "quantize", "eltwise_data" }, eltwise_mode::sum, data_types::f32), - quantize("quantize_1", "sum", "in_lo1", "in_hi1", "out_lo1", "out_hi1", 255, data_types::i8), - reorder("reorder_bfyx", "quantize_1", p.default_format, data_types::f32) - ); - - implementation_desc conv_impl = { format::b_fs_yx_fsv4, "convolution_gpu_b_fs_yx_fsv4_1x1" }; - bo_fused.set_option(build_option::force_implementations({ { "conv_prim", conv_impl } })); - - tolerance = 1.f; - execute(p); -} - -INSTANTIATE_TEST_SUITE_P(fusings_gpu, conv_int8_scale_prelu_quantize_i8_eltwise_fp32_quantize_i8_vec, ::testing::ValuesIn(std::vector{ - bc_test_params{ CASE_CONV_U8S8_3, 2, 7 }, - bc_test_params{ CASE_CONV_U8S8_5, 2, 7 }, - bc_test_params{ CASE_CONV_S8S8_3, 2, 7 }, - bc_test_params{ CASE_CONV_S8S8_5, 2, 7 }, -})); - -class conv_int8_asymmetric_weights : public ConvFusingTest {}; -TEST_P(conv_int8_asymmetric_weights, basic) { - auto p = GetParam(); - auto weights_format = (p.weights_format == format::goiyx) ? 
format::bfyx : format::bfzyx; - auto weights_layout = (p.groups > 1) ? get_weights_layout(p, 1, weights_format) : - get_weights_layout(p); - create_topologies( - input_layout("input", get_input_layout(p)), - data("weights", get_mem(weights_layout)), - data("bias", get_mem(get_bias_layout(p))), - data("w_zp", get_mem(get_weights_zp_layout(p), 1, 127)), - eltwise("w_sub", { "weights", "w_zp" }, eltwise_mode::sub, data_types::f32), - convolution("conv_prim", "input", { "w_sub" }, { "bias" }, p.groups, p.stride, p.pad, p.dilation, p.out_shape, data_types::f32, false), - reorder("reorder_bfyx", "conv_prim", p.default_format, data_types::f32) - ); - - tolerance = 1.f; - - auto input_prim = get_mem(get_input_layout(p)); - network network_not_fused(this->engine, this->topology_non_fused, bo_not_fused); - network network_fused(this->engine, this->topology_fused, bo_fused); - network_fused.set_input_data("input", input_prim); - network_not_fused.set_input_data("input", input_prim); - - ASSERT_FALSE(network_fused.get_primitives_info().empty()); - ASSERT_FALSE(network_not_fused.get_primitives_info().empty()); - - // Search for both conv_prim and reorder_bfyx, as in case of fused topology convolution will be merged with the last reorder - auto find_conv = [](primitive_info& p) -> bool { - if (p.original_id == "conv_prim" || p.original_id == "reorder_bfyx") - return true; - return false; - }; - - auto pi_fused = network_fused.get_primitives_info(); - auto pi_not_fused = network_not_fused.get_primitives_info(); - auto info_fused = std::find_if(pi_fused.begin(), pi_fused.end(), find_conv); - auto info_not_fused = std::find_if(pi_not_fused.begin(), pi_not_fused.end(), find_conv); - - ASSERT_TRUE(info_fused != pi_fused.end()); - ASSERT_TRUE(info_not_fused != pi_not_fused.end()); - - ASSERT_EQ(info_fused->c_dependencies.size(), 4lu); // input + weights + bias + w_zp - ASSERT_EQ(info_not_fused->c_dependencies.size(), 3lu); // input + weights + bias - - compare(network_not_fused, 
network_fused, p); -} - -INSTANTIATE_TEST_SUITE_P(fusings_gpu, conv_int8_asymmetric_weights, ::testing::ValuesIn(std::vector{ - bc_test_params{ CASE_CONV_U8S8_1, 2, 2 }, - bc_test_params{ CASE_CONV_U8S8_2, 2, 2 }, - bc_test_params{ CASE_CONV_U8S8_3, 2, 2 }, - bc_test_params{ CASE_CONV_U8S8_4, 2, 2 }, - bc_test_params{ CASE_CONV_S8S8_1, 2, 2 }, - bc_test_params{ CASE_CONV_S8S8_2, 2, 2 }, - bc_test_params{ CASE_CONV_S8S8_3, 2, 2 }, - bc_test_params{ CASE_CONV_S8S8_4, 2, 2 }, - - bc_test_params{ CASE_CONV3D_U8S8_1, 2, 2 }, - bc_test_params{ CASE_CONV3D_U8S8_2, 2, 2 }, - bc_test_params{ CASE_CONV3D_U8S8_3, 2, 2 }, - bc_test_params{ CASE_CONV3D_U8S8_4, 2, 2 }, - bc_test_params{ CASE_CONV3D_S8S8_1, 2, 2 }, - bc_test_params{ CASE_CONV3D_S8S8_2, 2, 2 }, - bc_test_params{ CASE_CONV3D_S8S8_3, 2, 2 }, - bc_test_params{ CASE_CONV3D_S8S8_4, 2, 2 }, -})); - -class conv_int8_asymmetric_data : public ConvFusingTest {}; -TEST_P(conv_int8_asymmetric_data, basic) { - auto p = GetParam(); - auto weights_format = (p.weights_format == format::goiyx) ? format::bfyx : format::bfzyx; - auto weights_layout = (p.groups > 1) ? 
get_weights_layout(p, 1, weights_format) : - get_weights_layout(p); - create_topologies( - input_layout("input", get_input_layout(p)), - data("weights", get_mem(weights_layout)), - data("bias", get_mem(get_bias_layout(p))), - data("a_zp", get_mem(get_activations_zp_layout(p), 1, 127)), - eltwise("a_sub", { "input", "a_zp" }, eltwise_mode::sub, data_types::f32), - convolution("conv_prim", "a_sub", { "weights" }, { "bias" }, p.groups, p.stride, p.pad, p.dilation, p.out_shape, data_types::f32, false), - reorder("reorder_bfyx", "conv_prim", p.default_format, data_types::f32) - ); - - tolerance = 1.f; - - auto input_prim = get_mem(get_input_layout(p)); - network network_not_fused(this->engine, this->topology_non_fused, bo_not_fused); - network network_fused(this->engine, this->topology_fused, bo_fused); - network_fused.set_input_data("input", input_prim); - network_not_fused.set_input_data("input", input_prim); - - ASSERT_FALSE(network_fused.get_primitives_info().empty()); - ASSERT_FALSE(network_not_fused.get_primitives_info().empty()); - - // Search for both conv_prim and reorder_bfyx, as in case of fused topology convolution will be merged with the last reorder - auto find_conv = [](primitive_info& p) -> bool { - if (p.original_id == "conv_prim" || p.original_id == "reorder_bfyx") - return true; - return false; - }; - - auto pi_fused = network_fused.get_primitives_info(); - auto pi_not_fused = network_not_fused.get_primitives_info(); - auto info_fused = std::find_if(pi_fused.begin(), pi_fused.end(), find_conv); - auto info_not_fused = std::find_if(pi_not_fused.begin(), pi_not_fused.end(), find_conv); - - ASSERT_TRUE(info_fused != pi_fused.end()); - ASSERT_TRUE(info_not_fused != pi_not_fused.end()); - - ASSERT_EQ(info_fused->c_dependencies.size(), 5lu); // input + weights + bias + a_zp + comp - ASSERT_EQ(info_not_fused->c_dependencies.size(), 3lu); // input + weights + bias - - compare(network_not_fused, network_fused, p); -} - -INSTANTIATE_TEST_SUITE_P(fusings_gpu, 
conv_int8_asymmetric_data, ::testing::ValuesIn(std::vector{ - bc_test_params{ CASE_CONV_U8S8_1, 2, 3 }, - bc_test_params{ CASE_CONV_U8S8_2, 2, 3 }, - bc_test_params{ CASE_CONV_U8S8_3, 2, 3 }, - bc_test_params{ CASE_CONV_U8S8_4, 2, 3 }, - bc_test_params{ CASE_CONV_S8S8_1, 2, 3 }, - bc_test_params{ CASE_CONV_S8S8_2, 2, 3 }, - bc_test_params{ CASE_CONV_S8S8_3, 2, 3 }, - bc_test_params{ CASE_CONV_S8S8_4, 2, 3 }, - - bc_test_params{ CASE_CONV3D_U8S8_1, 2, 3 }, - bc_test_params{ CASE_CONV3D_U8S8_2, 2, 3 }, - bc_test_params{ CASE_CONV3D_U8S8_3, 2, 3 }, - bc_test_params{ CASE_CONV3D_U8S8_4, 2, 3 }, - bc_test_params{ CASE_CONV3D_U8S8_5, 2, 3 }, - bc_test_params{ CASE_CONV3D_S8S8_1, 2, 3 }, - bc_test_params{ CASE_CONV3D_S8S8_2, 2, 3 }, - bc_test_params{ CASE_CONV3D_S8S8_3, 2, 3 }, - bc_test_params{ CASE_CONV3D_S8S8_4, 2, 3 }, - bc_test_params{ CASE_CONV3D_S8S8_5, 2, 3 }, -})); - -class conv_int8_asymmetric_data_and_weights : public ConvFusingTest {}; -TEST_P(conv_int8_asymmetric_data_and_weights, basic) { - auto p = GetParam(); - auto weights_format = (p.weights_format == format::goiyx) ? format::bfyx : format::bfzyx; - auto weights_layout = (p.groups > 1) ? 
get_weights_layout(p, 1, weights_format) : - get_weights_layout(p); - create_topologies( - input_layout("input", get_input_layout(p)), - data("weights", get_mem(weights_layout)), - data("bias", get_mem(get_bias_layout(p))), - data("a_zp", get_mem(get_activations_zp_layout(p), 1, 127)), - data("w_zp", get_mem(get_weights_zp_layout(p), 1, 127)), - eltwise("a_sub", { "input", "a_zp" }, eltwise_mode::sub, data_types::f32), - eltwise("w_sub", { "weights", "w_zp" }, eltwise_mode::sub, data_types::f32), - convolution("conv_prim", "a_sub", { "w_sub" }, { "bias" }, p.groups, p.stride, p.pad, p.dilation, p.out_shape, data_types::f32, false), - reorder("reorder_bfyx", "conv_prim", p.default_format, data_types::f32) - ); - - tolerance = 1.f; - - auto input_prim = get_mem(get_input_layout(p)); - network network_not_fused(this->engine, this->topology_non_fused, bo_not_fused); - network network_fused(this->engine, this->topology_fused, bo_fused); - network_fused.set_input_data("input", input_prim); - network_not_fused.set_input_data("input", input_prim); - - ASSERT_FALSE(network_fused.get_primitives_info().empty()); - ASSERT_FALSE(network_not_fused.get_primitives_info().empty()); - - // Search for both conv_prim and reorder_bfyx, as in case of fused topology convolution will be merged with the last reorder - auto find_conv = [](primitive_info& p) -> bool { - if (p.original_id == "conv_prim" || p.original_id == "reorder_bfyx") - return true; - return false; - }; - - auto pi_fused = network_fused.get_primitives_info(); - auto pi_not_fused = network_not_fused.get_primitives_info(); - auto info_fused = std::find_if(pi_fused.begin(), pi_fused.end(), find_conv); - auto info_not_fused = std::find_if(pi_not_fused.begin(), pi_not_fused.end(), find_conv); - - ASSERT_TRUE(info_fused != pi_fused.end()); - ASSERT_TRUE(info_not_fused != pi_not_fused.end()); - - ASSERT_EQ(info_fused->c_dependencies.size(), 6lu); // input + weights + bias + a_zp + w_zp + comp - 
ASSERT_EQ(info_not_fused->c_dependencies.size(), 3lu); // input + weights + bias - - compare(network_not_fused, network_fused, p); -} - -INSTANTIATE_TEST_SUITE_P(fusings_gpu, conv_int8_asymmetric_data_and_weights, ::testing::ValuesIn(std::vector{ - bc_test_params{ CASE_CONV_U8S8_1, 2, 3 }, - bc_test_params{ CASE_CONV_U8S8_2, 2, 3 }, - bc_test_params{ CASE_CONV_U8S8_3, 2, 3 }, - bc_test_params{ CASE_CONV_U8S8_4, 2, 3 }, - bc_test_params{ CASE_CONV_S8S8_1, 2, 3 }, - bc_test_params{ CASE_CONV_S8S8_2, 2, 3 }, - bc_test_params{ CASE_CONV_S8S8_3, 2, 3 }, - bc_test_params{ CASE_CONV_S8S8_4, 2, 3 }, - - bc_test_params{ CASE_CONV3D_U8S8_1, 2, 3 }, - bc_test_params{ CASE_CONV3D_U8S8_2, 2, 3 }, - bc_test_params{ CASE_CONV3D_U8S8_3, 2, 3 }, - bc_test_params{ CASE_CONV3D_U8S8_4, 2, 3 }, - bc_test_params{ CASE_CONV3D_U8S8_5, 2, 3 }, - bc_test_params{ CASE_CONV3D_S8S8_1, 2, 3 }, - bc_test_params{ CASE_CONV3D_S8S8_2, 2, 3 }, - bc_test_params{ CASE_CONV3D_S8S8_3, 2, 3 }, - bc_test_params{ CASE_CONV3D_S8S8_4, 2, 3 }, - bc_test_params{ CASE_CONV3D_S8S8_5, 2, 3 }, -})); - - -class conv_i8_activation_eltwise_diff_sizes : public ConvEltwTest {}; -TEST_P(conv_i8_activation_eltwise_diff_sizes, basic) { - auto p = GetParam(); - create_topologies( - input_layout("input", get_input_layout(p)), - data("weights", get_mem(get_weights_layout(p))), - data("bias", get_mem(get_bias_layout(p))), - data("eltwise_data", get_mem(layout{ p.data_type, p.input_format, p.eltw_shape })), - convolution("conv_prim", "input", { "weights" }, { "bias" }, p.groups, p.stride, p.pad, p.dilation), - activation("activation", "conv_prim", activation_func::abs), - eltwise("sum", { "activation", "eltwise_data" }, eltwise_mode::sum, data_types::f32), - reorder("reorder_bfyx", "sum", p.default_format, data_types::f32) - ); - - tolerance = 1e-5f; - execute(p); -} - -INSTANTIATE_TEST_SUITE_P(fusings_gpu, conv_i8_activation_eltwise_diff_sizes, ::testing::ValuesIn(std::vector{ - conv_eltw_test_params{ CASE_CONV_ELTW_i8_1, 3, 
4 }, - conv_eltw_test_params{ CASE_CONV_ELTW_i8_2, 2, 4 }, - conv_eltw_test_params{ CASE_CONV_ELTW_i8_3, 2, 4 }, - conv_eltw_test_params{ CASE_CONV_ELTW_i8_4, 2, 4 }, - conv_eltw_test_params{ CASE_CONV_ELTW_i8_5, 3, 4 }, -})); - -/* ----------------------------------------------------------------------------------------------------- */ -/* ----------------------------------- Force convolution kernel cases ---------------------------------- */ -/* ----------------------------------------------------------------------------------------------------- */ - -class ConvFusingForceKernelTest : public ::WeightsPrimitiveFusingTest { - public: - void execute(bc_force_kernel_params& p) { - auto input_prim = get_mem(get_input_layout(p)); - build_options options; - options.set_option(build_option::optimize_data(true)); - implementation_desc conv_impl = { p.input_format, p.kernel_name }; - options.set_option(build_option::force_implementations({ { "conv_prim", conv_impl } })); - - network network_not_fused(this->engine, this->topology_non_fused, bo_not_fused); - network network_fused(this->engine, this->topology_fused, options); - network_fused.set_input_data("input", input_prim); - network_not_fused.set_input_data("input", input_prim); - - compare(network_not_fused, network_fused, p); - auto find_conv = [](primitive_info& p) -> bool { - if (p.original_id == "conv_prim") - return true; - return false; - }; - - auto pi_fused = network_fused.get_primitives_info(); - auto info_fused = std::find_if(pi_fused.begin(), pi_fused.end(), find_conv); - if (info_fused != pi_fused.end()) - std::cout << "kernel: " << info_fused->kernel_id << std::endl; - } -}; - -class conv_fp16_activation : public ConvFusingForceKernelTest {}; -TEST_P(conv_fp16_activation, basic) { - auto p = GetParam(); - create_topologies( - input_layout("input", get_input_layout(p)), - data("weights", get_mem(get_weights_layout(p))), - data("bias", get_mem(get_bias_layout(p))), - convolution("conv_prim", "input", { 
"weights" }, { "bias" }, p.groups, p.stride, p.pad, p.dilation), - activation("activation", "conv_prim", activation_func::abs), - reorder("reorder_bfyx", "activation", p.default_format, data_types::f32) - ); - - execute(p); -} - -INSTANTIATE_TEST_SUITE_P(fusings_gpu, conv_fp16_activation, ::testing::ValuesIn(std::vector{ - bc_force_kernel_params{ CASE_CONV_FP16_13, 2, 3, "convolution_gpu_fs_byx_fsv32" }, -})); - - -class conv_fp16_scale : public ConvFusingForceKernelTest {}; -TEST_P(conv_fp16_scale, basic) { - auto p = GetParam(); - create_topologies( - input_layout("input", get_input_layout(p)), - data("weights", get_mem(get_weights_layout(p))), - data("bias", get_mem(get_bias_layout(p))), - data("scale_data", get_mem(get_per_channel_layout(p), 1.0f/p.kernel.count())), - convolution("conv_prim", "input", { "weights" }, { "bias" }, p.groups, p.stride, p.pad, p.dilation), - scale("scale", "conv_prim", "scale_data"), - reorder("reorder_bfyx", "scale", p.default_format, data_types::f32) - ); - - tolerance = 1e-5f; - execute(p); -} - -INSTANTIATE_TEST_SUITE_P(fusings_gpu, conv_fp16_scale, ::testing::ValuesIn(std::vector{ - bc_force_kernel_params{ CASE_CONV_FP16_13, 2, 3, "convolution_gpu_fs_byx_fsv32" }, -})); - -/* ----------------------------------------------------------------------------------------------------- */ -/* ---------------------------------------- FC cases --------------------------------------------------- */ -/* ----------------------------------------------------------------------------------------------------- */ - -class FCFusingTest : public WeightsPrimitiveFusingTest {}; -class fc_fp32_activation : public FCFusingTest {}; -TEST_P(fc_fp32_activation, basic) { - auto p = GetParam(); - create_topologies( - input_layout("input", get_input_layout(p)), - data("weights", get_mem(get_fc_weights_layout(p))), - data("bias", get_mem(get_fc_bias_layout(p))), - fully_connected("fc_prim", "input", "weights", "bias", "", padding(), get_fc_output_dim_size(p)), - 
activation("activation", "fc_prim", activation_func::abs), - reorder("reorder_bfyx", "activation", p.default_format, data_types::f32) - ); - - tolerance = 1e-5f; - execute(p); -} - -INSTANTIATE_TEST_SUITE_P(fusings_gpu, fc_fp32_activation, ::testing::ValuesIn(std::vector{ - bc_test_params{ CASE_FC_FP32_1, 2, 3 }, - bc_test_params{ CASE_FC_FP32_2, 2, 3 }, - bc_test_params{ CASE_FC_FP32_3, 2, 3 }, - bc_test_params{ CASE_FC_FP32_3D_1, 2, 3 }, - bc_test_params{ CASE_FC_FP32_3D_2, 2, 3 }, - bc_test_params{ CASE_FC_FP32_3D_3, 2, 3 }, -})); - -class fc_fp32_bias : public FCFusingTest {}; -TEST_P(fc_fp32_bias, basic) { - auto p = GetParam(); - create_topologies( - input_layout("input", get_input_layout(p)), - data("weights", get_mem(get_fc_weights_layout(p))), - data("bias", get_mem(get_fc_bias_layout(p))), - fully_connected("fc_prim", "input", "weights", ""), - eltwise("bias_add", { "fc_prim", "bias" }, eltwise_mode::sum), - reorder("reorder_bfyx", "bias_add", p.default_format, data_types::f32) - ); - - tolerance = 1e-5f; - execute(p); -} - -INSTANTIATE_TEST_SUITE_P(fusings_gpu, fc_fp32_bias, ::testing::ValuesIn(std::vector{ - bc_test_params{ CASE_FC_FP32_1, 2, 3 }, - bc_test_params{ CASE_FC_FP32_2, 2, 3 }, - bc_test_params{ CASE_FC_FP32_3, 2, 3 }, -})); - -class fc_int8_scale : public FCFusingTest {}; -TEST_P(fc_int8_scale, basic) { - auto p = GetParam(); - create_topologies( - input_layout("input", get_input_layout(p)), - data("weights", get_mem(get_fc_weights_layout(p))), - data("bias", get_mem(get_fc_bias_layout(p))), - data("scale_data", get_mem(get_per_channel_layout(p), 1.0f / p.kernel.count())), - fully_connected("fc_prim", "input", "weights", "bias", data_types::f32, "", padding(), get_fc_output_dim_size(p)), - scale("scale", "fc_prim", "scale_data"), - reorder("reorder_bfyx", "scale", p.default_format, data_types::f32) - ); - - tolerance = 1e-5f; - execute(p); -} - -TEST_P(fc_int8_scale, fp16_scale_out) { - auto p = GetParam(); - create_topologies( - 
input_layout("input", get_input_layout(p)), - data("weights", get_mem(get_fc_weights_layout(p))), - data("bias", get_mem(get_fc_bias_layout(p))), - data("scale_data", get_mem(get_per_channel_layout(p), 1.0f / p.kernel.count())), - fully_connected("fc_prim", "input", "weights", "bias", data_types::f32, "", padding(), get_fc_output_dim_size(p)), - scale("scale", "fc_prim", "scale_data", optional_data_type{ data_types::f16 }), - reorder("reorder_bfyx", "scale", p.default_format, data_types::f32) - ); - - tolerance = 1e-5f; - execute(p); -} - -INSTANTIATE_TEST_SUITE_P(fusings_gpu, fc_int8_scale, ::testing::ValuesIn(std::vector{ - bc_test_params{ CASE_FC_U8S8_1, 2, 3 }, - bc_test_params{ CASE_FC_U8S8_2, 2, 3 }, - bc_test_params{ CASE_FC_U8S8_3, 2, 3 }, - bc_test_params{ CASE_FC_U8S8_3D_1, 2, 3 }, - bc_test_params{ CASE_FC_U8S8_3D_2, 2, 3 }, - bc_test_params{ CASE_FC_U8S8_3D_3, 2, 3 }, -})); - -class fc_int8_quantize_u8 : public FCFusingTest {}; -TEST_P(fc_int8_quantize_u8, basic) { - auto p = GetParam(); - create_topologies( - input_layout("input", get_input_layout(p)), - data("weights", get_mem(get_fc_weights_layout(p))), - data("bias", get_mem(get_fc_bias_layout(p))), - data("in_lo", get_mem(get_per_channel_layout(p), min_random, 0)), - data("in_hi", get_mem(get_per_channel_layout(p), 1, max_random)), - data("out_lo", get_mem(get_single_element_layout(p), 0)), - data("out_hi", get_mem(get_single_element_layout(p), 255)), - fully_connected("fc_prim", "input", "weights", "bias", data_types::f32, "", padding(), get_fc_output_dim_size(p)), - quantize("quantize", "fc_prim", "in_lo", "in_hi", "out_lo", "out_hi", 256, data_types::u8), - reorder("reorder_bfyx", "quantize", p.default_format, data_types::f32) - ); - - tolerance = 1.f; - execute(p); -} - -INSTANTIATE_TEST_SUITE_P(fusings_gpu_fc, fc_int8_quantize_u8, ::testing::ValuesIn(std::vector{ - bc_test_params{ CASE_FC_U8S8_1, 2, 3 }, - bc_test_params{ CASE_FC_U8S8_2, 2, 3 }, - bc_test_params{ CASE_FC_U8S8_3, 2, 3 }, - 
bc_test_params{ CASE_FC_U8S8_3D_1, 2, 3 }, - bc_test_params{ CASE_FC_U8S8_3D_2, 2, 3 }, - bc_test_params{ CASE_FC_U8S8_3D_3, 2, 3 }, -})); - -class fc_int8_scale_quantize_i8 : public FCFusingTest {}; -TEST_P(fc_int8_scale_quantize_i8, basic) { - auto p = GetParam(); - create_topologies( - input_layout("input", get_input_layout(p)), - data("weights", get_mem(get_fc_weights_layout(p))), - data("bias", get_mem(get_fc_bias_layout(p))), - data("in_lo", get_mem(get_per_channel_layout(p), min_random, 0)), - data("in_hi", get_mem(get_per_channel_layout(p), 1, max_random)), - data("out_lo", get_mem(get_single_element_layout(p), -127)), - data("out_hi", get_mem(get_single_element_layout(p), 127)), - data("scale_data", get_mem(get_per_channel_layout(p), 1.0f / p.kernel.count() / 255)), - fully_connected("fc_prim", "input", "weights", "bias", data_types::f32, "", padding(), get_fc_output_dim_size(p)), - scale("scale", "fc_prim", "scale_data"), - quantize("quantize", "scale", "in_lo", "in_hi", "out_lo", "out_hi", 255, data_types::i8), - reorder("reorder_bfyx", "quantize", p.default_format, data_types::f32) - ); - - tolerance = 1e-5f; - execute(p); -} - -INSTANTIATE_TEST_SUITE_P(fusings_gpu, fc_int8_scale_quantize_i8, ::testing::ValuesIn(std::vector{ - bc_test_params{ CASE_FC_U8S8_1, 2, 4 }, - bc_test_params{ CASE_FC_U8S8_2, 2, 4 }, - bc_test_params{ CASE_FC_U8S8_3, 2, 4 }, - bc_test_params{ CASE_FC_U8S8_3D_1, 2, 4 }, - bc_test_params{ CASE_FC_U8S8_3D_2, 2, 4 }, - bc_test_params{ CASE_FC_U8S8_3D_3, 2, 4 }, -})); - - - -class fc_int8_scale_activation_quantize_i8 : public FCFusingTest {}; -TEST_P(fc_int8_scale_activation_quantize_i8, basic) { - auto p = GetParam(); - create_topologies( - input_layout("input", get_input_layout(p)), - data("weights", get_mem(get_fc_weights_layout(p))), - data("bias", get_mem(get_fc_bias_layout(p))), - data("in_lo", get_mem(get_per_channel_layout(p), min_random, 0)), - data("in_hi", get_mem(get_per_channel_layout(p), 1, max_random)), - data("out_lo", 
get_mem(get_single_element_layout(p), -127)), - data("out_hi", get_mem(get_single_element_layout(p), 127)), - data("scale_data", get_mem(get_per_channel_layout(p), 1.0f / p.kernel.count() / 255)), - fully_connected("fc_prim", "input", "weights", "bias", data_types::f32, "", padding(), get_fc_output_dim_size(p)), - scale("scale", "fc_prim", "scale_data"), - activation("activation_scale", "scale", activation_func::exp), - quantize("quantize", "activation_scale", "in_lo", "in_hi", "out_lo", "out_hi", 255, data_types::i8), - reorder("reorder_bfyx", "quantize", p.default_format, data_types::f32) - ); - - tolerance = 1e-5f; - execute(p); -} - -INSTANTIATE_TEST_SUITE_P(fusings_gpu, fc_int8_scale_activation_quantize_i8, ::testing::ValuesIn(std::vector{ - bc_test_params{ CASE_FC_U8S8_1, 2, 5 }, - bc_test_params{ CASE_FC_U8S8_2, 2, 5 }, - bc_test_params{ CASE_FC_U8S8_3, 2, 5 }, - - bc_test_params{ CASE_FC_U8S8_3D_1, 2, 5 }, - bc_test_params{ CASE_FC_U8S8_3D_2, 2, 5 }, - bc_test_params{ CASE_FC_U8S8_3D_3, 2, 5 }, - - bc_test_params{ CASE_FC_FP32_3D_1, 3, 5 }, - bc_test_params{ CASE_FC_FP32_3D_2, 3, 5 }, - bc_test_params{ CASE_FC_FP32_3D_3, 3, 5 }, -})); - - -/* ----------------------------------------------------------------------------------------------------- */ -/* ---------------------------------------- Gemm cases ------------------------------------------------- */ -/* ----------------------------------------------------------------------------------------------------- */ - -#define CASE_GEMM_3IN_FP32_1 { { 1, 1, 2, 2 }, { 1, 1, 2, 2 }, { 1, 1, 2, 2 } }, { 1, 1, 2, 2 }, tensor{ 1 }, tensor{ 0 }, data_types::f32, data_types::f32, data_types::f32, format::bfyx, data_types::f32, format::bfyx -#define CASE_GEMM_3IN_FP32_2 { { 1, 1, 63, 63 }, { 1, 1, 63, 63 }, { 1, 1, 63, 63 } }, { 1, 1, 63, 63 }, tensor{ 1 }, tensor{ 0 }, data_types::f32, data_types::f32, data_types::f32, format::bfyx, data_types::f32, format::bfyx -#define CASE_GEMM_3IN_FP32_3 { { 1, 1, 128, 128 }, { 1, 1, 
128, 128 }, { 1, 1, 128, 128 } }, { 1, 1, 128, 128 }, tensor{ 1 }, tensor{ 0 }, data_types::f32, data_types::f32, data_types::f32, format::bfyx, data_types::f32, format::bfyx -#define CASE_GEMM_3IN_FP32_4 { { 1, 2, 64, 128 }, { 1, 2, 256, 64 }, { 1, 2, 256, 128 } }, { 1, 2, 256, 128 }, tensor{ 1 }, tensor{ 0 }, data_types::f32, data_types::f32, data_types::f32, format::bfyx, data_types::f32, format::bfyx -#define CASE_GEMM_3IN_FP16_1 { { 1, 1, 2, 2 }, { 1, 1, 2, 2 }, { 1, 1, 2, 2 } }, { 1, 1, 2, 2 }, tensor{ 1 }, tensor{ 0 }, data_types::f16, data_types::f16, data_types::f16, format::bfyx, data_types::f16, format::bfyx -#define CASE_GEMM_3IN_FP16_2 { { 1, 1, 31, 31 }, { 1, 1, 31, 31 }, { 1, 1, 31, 31 } }, { 1, 1, 31, 31 }, tensor{ 1 }, tensor{ 0 }, data_types::f16, data_types::f16, data_types::f16, format::bfyx, data_types::f16, format::bfyx -#define CASE_GEMM_3IN_FP16_3 { { 1, 1, 64, 64 }, { 1, 1, 64, 64 }, { 1, 1, 64, 64 } }, { 1, 1, 64, 64 }, tensor{ 1 }, tensor{ 0 }, data_types::f16, data_types::f16, data_types::f16, format::bfyx, data_types::f16, format::bfyx -#define CASE_GEMM_3IN_FP16_4 { { 1, 2, 64, 128 }, { 1, 2, 256, 64 }, { 1, 2, 256, 128 } }, { 1, 2, 256, 128 }, tensor{ 1 }, tensor{ 0 }, data_types::f16, data_types::f16, data_types::f16, format::bfyx, data_types::f16, format::bfyx -#define CASE_GEMM_3IN_S8S8_1 { { 1, 1, 2, 2 }, { 1, 1, 2, 2 }, { 1, 1, 2, 2 } }, { 1, 1, 2, 2 }, tensor{ 1 }, tensor{ 0 }, data_types::i8, data_types::i8, data_types::i8, format::bfyx, data_types::f32, format::bfyx -#define CASE_GEMM_3IN_S8S8_2 { { 1, 2, 64, 128 }, { 1, 2, 256, 64 }, { 1, 2, 256, 128 } }, { 1, 2, 256, 128 }, tensor{ 1 }, tensor{ 0 }, data_types::i8, data_types::i8, data_types::i8, format::bfyx, data_types::f32, format::bfyx -#define CASE_GEMM_3IN_S8S8_3 { { 1, 1, 8, 16 }, { 1, 1, 32, 8 }, { 1, 1, 32, 16 } }, { 1, 1, 32, 16 }, tensor{ 1 }, tensor{ 0 }, data_types::i8, data_types::i8, data_types::i8, format::bfyx, data_types::f32, format::bfyx - -#define 
CASE_GEMM_2IN_FP32_1 { { 1, 1, 2, 2 }, { 1, 1, 2, 2 } }, { 1, 1, 2, 2 }, tensor{ 1 }, tensor{ 0 }, data_types::f32, data_types::f32, data_types::f32, format::bfyx, data_types::f32, format::bfyx -#define CASE_GEMM_2IN_FP32_2 { { 1, 1, 63, 63 }, { 1, 1, 63, 63 } }, { 1, 1, 63, 63 }, tensor{ 1 }, tensor{ 0 }, data_types::f32, data_types::f32, data_types::f32, format::bfyx, data_types::f32, format::bfyx -#define CASE_GEMM_2IN_FP32_3 { { 1, 1, 128, 128 }, { 1, 1, 128, 128 } }, { 1, 1, 128, 128 }, tensor{ 1 }, tensor{ 0 }, data_types::f32, data_types::f32, data_types::f32, format::bfyx, data_types::f32, format::bfyx -#define CASE_GEMM_2IN_FP32_4 { { 1, 2, 64, 128 }, { 1, 2, 256, 64 } }, { 1, 2, 256, 128 }, tensor{ 1 }, tensor{ 0 }, data_types::f32, data_types::f32, data_types::f32, format::bfyx, data_types::f32, format::bfyx -#define CASE_GEMM_2IN_FP16_1 { { 1, 1, 2, 2 }, { 1, 1, 2, 2 } }, { 1, 1, 2, 2 }, tensor{ 1 }, tensor{ 0 }, data_types::f16, data_types::f16, data_types::f16, format::bfyx, data_types::f16, format::bfyx -#define CASE_GEMM_2IN_FP16_2 { { 1, 1, 31, 31 }, { 1, 1, 31, 31 } }, { 1, 1, 31, 31 }, tensor{ 1 }, tensor{ 0 }, data_types::f16, data_types::f16, data_types::f16, format::bfyx, data_types::f16, format::bfyx -#define CASE_GEMM_2IN_FP16_3 { { 1, 1, 64, 64 }, { 1, 1, 64, 64 } }, { 1, 1, 64, 64 }, tensor{ 1 }, tensor{ 0 }, data_types::f16, data_types::f16, data_types::f16, format::bfyx, data_types::f16, format::bfyx -#define CASE_GEMM_2IN_FP16_4 { { 1, 2, 64, 128 }, { 1, 2, 256, 64 } }, { 1, 2, 256, 128 }, tensor{ 1 }, tensor{ 0 }, data_types::f16, data_types::f16, data_types::f16, format::bfyx, data_types::f16, format::bfyx -#define CASE_GEMM_2IN_U8U8_1 { { 1, 1, 2, 2 }, { 1, 1, 2, 2 } }, { 1, 1, 2, 2 }, tensor{ 1 }, tensor{ 0 }, data_types::u8, data_types::u8, data_types::u8, format::bfyx, data_types::f32, format::bfyx -#define CASE_GEMM_2IN_U8U8_2 { { 1, 2, 64, 128 }, { 1, 2, 256, 64 } }, { 1, 2, 256, 128 }, tensor{ 1 }, tensor{ 0 }, data_types::u8, 
data_types::u8, data_types::u8, format::bfyx, data_types::f32, format::bfyx -#define CASE_GEMM_2IN_U8U8_3 { { 1, 1, 16, 32 }, { 1, 1, 32, 16 } }, { 1, 1, 32, 32 }, tensor{ 1 }, tensor{ 0 }, data_types::u8, data_types::u8, data_types::u8, format::bfyx, data_types::f32, format::bfyx - -#define CASE_GEMM_2IN_U8S8_1 { { 1, 1, 4, 2 }, { 1, 1, 8, 4 } }, { 1, 1, 8, 4 }, tensor{ 1 }, tensor{ 0 }, data_types::u8, data_types::i8, data_types::u8, format::bfyx, data_types::f32, format::bfyx -#define CASE_GEMM_2IN_S8U8_1 { { 1, 2, 64, 128 }, { 1, 2, 256, 64 } }, { 1, 2, 256, 128 }, tensor{ 1 }, tensor{ 0 }, data_types::i8, data_types::u8, data_types::u8, format::bfyx, data_types::f32, format::bfyx - -#define CASE_GEMM_ELTWISE_2IN_FP32_1 { { 1, 1, 4, 4 }, { 1, 1, 4, 4 } }, { 1, 1, 4, 4 }, tensor{ 1 }, tensor{ 0 }, data_types::f32, data_types::f32, data_types::f32, format::bfyx, data_types::f32, format::bfyx -#define CASE_GEMM_ELTWISE_2IN_FP16_1 { { 1, 1, 32, 32 }, { 1, 1, 32, 32 } }, { 1, 1, 32, 32 }, tensor{ 1 }, tensor{ 0 }, data_types::f16, data_types::f16, data_types::f16, format::bfyx, data_types::f16, format::bfyx -#define CASE_GEMM_ELTWISE_2IN_U8S8_1 { { 1, 1, 4, 4 }, { 1, 1, 4, 4 } }, { 1, 1, 4, 4 }, tensor{ 1 }, tensor{ 0 }, data_types::u8, data_types::i8, data_types::u8, format::bfyx, data_types::f32, format::bfyx -#define CASE_GEMM_ELTWISE_2IN_S8U8_1 { { 1, 1, 32, 32 }, { 1, 1, 32, 32 } }, { 1, 1, 32, 32 }, tensor{ 1 }, tensor{ 0 }, data_types::i8, data_types::u8, data_types::u8, format::bfyx, data_types::f32, format::bfyx - -class gemm_3in_quantize_i8 : public GemmFusingTest {}; -TEST_P(gemm_3in_quantize_i8, basic) { - auto p = GetParam(); - create_topologies( - input_layout("input0", get_input_layout(p, 0)), - input_layout("input1", get_input_layout(p, 1)), - input_layout("input2", get_input_layout(p, 2)), - data("in_lo", get_mem(get_per_channel_layout(p), min_random, 0)), - data("in_hi", get_mem(get_per_channel_layout(p), 1, max_random)), - data("out_lo", 
get_mem(get_single_element_layout(p), -127)), - data("out_hi", get_mem(get_single_element_layout(p), 127)), - gemm("gemm_prim", { "input0", "input1", "input2" }, data_types::f32), - quantize("quantize", "gemm_prim", "in_lo", "in_hi", "out_lo", "out_hi", 255, data_types::i8), - reorder("reorder_bfyx", "quantize", p.default_format, data_types::f32) - ); - - tolerance = 1.0f; - execute(p); -} - -INSTANTIATE_TEST_SUITE_P(fusings_gpu, gemm_3in_quantize_i8, ::testing::ValuesIn(std::vector{ - gemm_test_params{ CASE_GEMM_3IN_FP16_1, 4, 5 }, - gemm_test_params{ CASE_GEMM_3IN_FP16_2, 4, 5 }, - gemm_test_params{ CASE_GEMM_3IN_FP16_3, 4, 5 }, - gemm_test_params{ CASE_GEMM_3IN_FP16_4, 4, 5 }, - gemm_test_params{ CASE_GEMM_3IN_FP32_1, 4, 5 }, - gemm_test_params{ CASE_GEMM_3IN_FP32_2, 4, 5 }, - gemm_test_params{ CASE_GEMM_3IN_FP32_3, 4, 5 }, - gemm_test_params{ CASE_GEMM_3IN_FP32_4, 4, 5 }, - gemm_test_params{ CASE_GEMM_3IN_S8S8_1, 4, 5 }, - gemm_test_params{ CASE_GEMM_3IN_S8S8_2, 4, 5 }, - gemm_test_params{ CASE_GEMM_3IN_S8S8_3, 4, 5 }, -})); - -class gemm_2in_quantize_u8 : public GemmFusingTest {}; -TEST_P(gemm_2in_quantize_u8, basic) { - auto p = GetParam(); - create_topologies( - input_layout("input0", get_input_layout(p, 0)), - input_layout("input1", get_input_layout(p, 1)), - data("in_lo", get_mem(get_per_channel_layout(p), 0)), - data("in_hi", get_mem(get_per_channel_layout(p), 1, max_random)), - data("out_lo", get_mem(get_single_element_layout(p), 0)), - data("out_hi", get_mem(get_single_element_layout(p), 255)), - gemm("gemm_prim", { "input0", "input1" }, data_types::f32), - quantize("quantize", "gemm_prim", "in_lo", "in_hi", "out_lo", "out_hi", 256, data_types::u8), - reorder("reorder_bfyx", "quantize", p.default_format, data_types::f32) - ); - - tolerance = 1.0f; - execute(p); -} - -INSTANTIATE_TEST_SUITE_P(fusings_gpu, gemm_2in_quantize_u8, ::testing::ValuesIn(std::vector{ - gemm_test_params{ CASE_GEMM_2IN_FP16_1, 3, 4 }, - gemm_test_params{ CASE_GEMM_2IN_FP16_2, 3, 4 
}, - gemm_test_params{ CASE_GEMM_2IN_FP16_3, 3, 4 }, - gemm_test_params{ CASE_GEMM_2IN_FP16_4, 3, 4 }, - gemm_test_params{ CASE_GEMM_2IN_FP32_1, 3, 4 }, - gemm_test_params{ CASE_GEMM_2IN_FP32_2, 3, 4 }, - gemm_test_params{ CASE_GEMM_2IN_FP32_3, 3, 4 }, - gemm_test_params{ CASE_GEMM_2IN_FP32_4, 3, 4 }, - gemm_test_params{ CASE_GEMM_2IN_U8U8_1, 3, 4 }, - gemm_test_params{ CASE_GEMM_2IN_U8U8_2, 3, 4 }, - gemm_test_params{ CASE_GEMM_2IN_U8U8_3, 3, 4 }, -})); - -class gemm_2in_quantize_float_in : public GemmFusingTest {}; -TEST_P(gemm_2in_quantize_float_in, basic) { - auto p = GetParam(); - create_topologies( - input_layout("input0", get_input_layout(p, 0)), - input_layout("input1", get_input_layout(p, 1)), - data("in_lo", get_mem(get_per_channel_layout(p), 0)), - data("in_hi", get_mem(get_per_channel_layout(p), 1, max_random)), - data("out_lo", get_mem(get_single_element_layout(p), 0)), - data("out_hi", get_mem(get_single_element_layout(p), 255)), - gemm("gemm_prim", { "input0", "input1" }, data_types::f32), - quantize("quantize", "gemm_prim", "in_lo", "in_hi", "out_lo", "out_hi", 256, data_types::u8), - reorder("reorder_bfyx", "quantize", p.default_format, data_types::f32) - ); - - implementation_desc gemm_impl = { format::bfyx, "gemm_tiled_opt" }; - bo_fused.set_option(build_option::force_implementations({ { "gemm_prim", gemm_impl } })); - - tolerance = 1.0f; - execute(p); -} - -INSTANTIATE_TEST_SUITE_P(fusings_gpu, gemm_2in_quantize_float_in, ::testing::ValuesIn(std::vector{ - gemm_test_params{ CASE_GEMM_2IN_FP16_1, 3, 4 }, - gemm_test_params{ CASE_GEMM_2IN_FP16_2, 3, 4 }, - gemm_test_params{ CASE_GEMM_2IN_FP16_3, 3, 4 }, - gemm_test_params{ CASE_GEMM_2IN_FP16_4, 3, 4 }, - gemm_test_params{ CASE_GEMM_2IN_FP32_1, 3, 4 }, - gemm_test_params{ CASE_GEMM_2IN_FP32_2, 3, 4 }, - gemm_test_params{ CASE_GEMM_2IN_FP32_3, 3, 4 }, - gemm_test_params{ CASE_GEMM_2IN_FP32_4, 3, 4 }, - gemm_test_params{ CASE_GEMM_ELTWISE_2IN_FP16_1, 3, 4 }, - gemm_test_params{ 
CASE_GEMM_ELTWISE_2IN_FP32_1, 3, 4 }, -})); - -class gemm_2in_scale : public GemmFusingTest {}; -TEST_P(gemm_2in_scale, basic) { - auto p = GetParam(); - create_topologies( - input_layout("input0", get_input_layout(p, 0)), - input_layout("input1", get_input_layout(p, 1)), - data("scale_data", get_mem(get_per_channel_layout(p), 1.0f/p.kernel.count())), - gemm("gemm_prim", { "input0", "input1" }, data_types::f32), - scale("scale", "gemm_prim", "scale_data"), - reorder("reorder_bfyx", "scale", p.default_format, data_types::f32) - ); - - tolerance = 1e-5f; - execute(p); -} - -TEST_P(gemm_2in_scale, fp16_scale_out) { - auto p = GetParam(); - create_topologies( - input_layout("input0", get_input_layout(p, 0)), - input_layout("input1", get_input_layout(p, 1)), - data("scale_data", get_mem(get_per_channel_layout(p), 1.0f/p.kernel.count())), - gemm("gemm_prim", { "input0", "input1" }, data_types::f32), - scale("scale", "gemm_prim", "scale_data", optional_data_type{ data_types::f16 }), - reorder("reorder_bfyx", "scale", p.default_format, data_types::f32) - ); - - tolerance = 1e-5f; - execute(p); -} - -INSTANTIATE_TEST_SUITE_P(fusings_gpu, gemm_2in_scale, ::testing::ValuesIn(std::vector{ - gemm_test_params{ CASE_GEMM_2IN_FP32_1, 3, 4 }, - gemm_test_params{ CASE_GEMM_2IN_FP32_2, 3, 4 }, - gemm_test_params{ CASE_GEMM_2IN_FP32_3, 3, 4 }, - gemm_test_params{ CASE_GEMM_2IN_FP32_4, 3, 4 }, - gemm_test_params{ CASE_GEMM_2IN_FP16_1, 3, 4 }, - gemm_test_params{ CASE_GEMM_2IN_FP16_2, 3, 4 }, - gemm_test_params{ CASE_GEMM_2IN_FP16_3, 3, 4 }, - gemm_test_params{ CASE_GEMM_2IN_FP16_4, 3, 4 }, - gemm_test_params{ CASE_GEMM_2IN_U8U8_1, 3, 4 }, - gemm_test_params{ CASE_GEMM_2IN_U8U8_2, 3, 4 }, - gemm_test_params{ CASE_GEMM_2IN_U8U8_3, 3, 4 }, -})); - -class gemm_2in_act_scale_quantize_i8 : public GemmFusingTest {}; -TEST_P(gemm_2in_act_scale_quantize_i8, basic) { - auto p = GetParam(); - create_topologies( - input_layout("input0", get_input_layout(p, 0)), - input_layout("input1", 
get_input_layout(p, 1)), - data("in_lo", get_mem(get_per_channel_layout(p), min_random, 0)), - data("in_hi", get_mem(get_per_channel_layout(p), 1, max_random)), - data("out_lo", get_mem(get_single_element_layout(p), -127)), - data("out_hi", get_mem(get_single_element_layout(p), 127)), - data("scale_data", get_mem(get_per_channel_layout(p), 1.0f / p.kernel.count() / 255)), - gemm("gemm_prim", { "input0", "input1" }, data_types::f32), - activation("activation", "gemm_prim", activation_func::exp), - scale("scale", "activation", "scale_data"), - quantize("quantize", "scale", "in_lo", "in_hi", "out_lo", "out_hi", 255, data_types::i8), - reorder("reorder_bfyx", "quantize", p.default_format, data_types::f32) - ); - - tolerance = 1.0f; - execute(p); -} - -INSTANTIATE_TEST_SUITE_P(fusings_gpu, gemm_2in_act_scale_quantize_i8, ::testing::ValuesIn(std::vector{ - gemm_test_params{ CASE_GEMM_2IN_FP32_1, 3, 6 }, - gemm_test_params{ CASE_GEMM_2IN_FP32_2, 3, 6 }, - gemm_test_params{ CASE_GEMM_2IN_FP32_3, 3, 6 }, - gemm_test_params{ CASE_GEMM_2IN_FP32_4, 3, 6 }, - gemm_test_params{ CASE_GEMM_2IN_FP16_1, 3, 6 }, - gemm_test_params{ CASE_GEMM_2IN_FP16_2, 3, 6 }, - gemm_test_params{ CASE_GEMM_2IN_FP16_3, 3, 6 }, - gemm_test_params{ CASE_GEMM_2IN_FP16_4, 3, 6 }, - gemm_test_params{ CASE_GEMM_2IN_U8S8_1, 3, 6 }, - gemm_test_params{ CASE_GEMM_2IN_S8U8_1, 3, 6 }, -})); - -class gemm_2in_act_scale_quantize_eltwise_i8 : public GemmFusingTest {}; -TEST_P(gemm_2in_act_scale_quantize_eltwise_i8, basic) { - auto p = GetParam(); - create_topologies( - input_layout("input0", get_input_layout(p, 0)), - input_layout("input1", get_input_layout(p, 1)), - data("in_lo", get_mem(get_per_channel_layout(p), min_random, 0)), - data("in_hi", get_mem(get_per_channel_layout(p), 1, max_random)), - data("out_lo", get_mem(get_single_element_layout(p), -127)), - data("out_hi", get_mem(get_single_element_layout(p), 127)), - data("scale_data", get_mem(get_per_channel_layout(p), 1.0f / p.kernel.count() / 255)), - 
data("eltwise_data", get_mem(get_output_layout(p))), - gemm("gemm_prim", { "input0", "input1" }, data_types::f32), - activation("activation", "gemm_prim", activation_func::exp), - scale("scale", "activation", "scale_data"), - quantize("quantize", "scale", "in_lo", "in_hi", "out_lo", "out_hi", 255, data_types::i8), - eltwise("sum", { "quantize", "eltwise_data" }, eltwise_mode::sum, data_types::f32), - reorder("reorder_bfyx", "sum", p.default_format, data_types::f32) - ); - - tolerance = 1.0f; - execute(p); -} - -INSTANTIATE_TEST_SUITE_P(fusings_gpu, gemm_2in_act_scale_quantize_eltwise_i8, ::testing::ValuesIn(std::vector{ - gemm_test_params{ CASE_GEMM_ELTWISE_2IN_FP32_1, 3, 7 }, - gemm_test_params{ CASE_GEMM_ELTWISE_2IN_FP16_1, 3, 7 }, - gemm_test_params{ CASE_GEMM_ELTWISE_2IN_U8S8_1, 3, 7 }, - gemm_test_params{ CASE_GEMM_ELTWISE_2IN_S8U8_1, 3, 7 }, -})); - -class gemm_2in_act_scale_eltwise : public GemmFusingTest {}; -TEST_P(gemm_2in_act_scale_eltwise, basic) { - auto p = GetParam(); - create_topologies( - input_layout("input0", get_input_layout(p, 0)), - input_layout("input1", get_input_layout(p, 1)), - data("scale_data", get_mem(get_per_channel_layout(p), 1.0f / p.kernel.count() / 255)), - data("eltwise_data", get_mem(get_output_layout(p))), - gemm("gemm_prim", { "input0", "input1" }, data_types::f32), - scale("scale", "gemm_prim", "scale_data"), - activation("activation", "scale", activation_func::negative), - eltwise("sum", { "activation", "eltwise_data" }, eltwise_mode::sum, data_types::f32), - reorder("reorder_bfyx", "sum", p.default_format, data_types::f32) - ); - - tolerance = 1e-4f; - execute(p); -} - -TEST_P(gemm_2in_act_scale_eltwise, broadcast_eltwise) { - auto p = GetParam(); - create_topologies( - input_layout("input0", get_input_layout(p, 0)), - input_layout("input1", get_input_layout(p, 1)), - data("scale_data", get_mem(get_per_channel_layout(p), 1.0f / p.kernel.count() / 255)), - data("eltwise_data", get_mem(get_single_element_layout(p))), - 
gemm("gemm_prim", { "input0", "input1" }, data_types::f32), - scale("scale", "gemm_prim", "scale_data"), - activation("activation", "scale", activation_func::negative), - eltwise("sum", { "activation", "eltwise_data" }, eltwise_mode::sum, data_types::f32), - reorder("reorder_bfyx", "sum", p.default_format, data_types::f32) - ); - - tolerance = 1e-4f; - execute(p); -} - -INSTANTIATE_TEST_SUITE_P(fusings_gpu, gemm_2in_act_scale_eltwise, ::testing::ValuesIn(std::vector{ - gemm_test_params{ CASE_GEMM_ELTWISE_2IN_FP32_1, 3, 6 }, - gemm_test_params{ CASE_GEMM_ELTWISE_2IN_FP16_1, 3, 6 }, - gemm_test_params{ CASE_GEMM_ELTWISE_2IN_U8S8_1, 3, 6 }, - gemm_test_params{ CASE_GEMM_ELTWISE_2IN_S8U8_1, 3, 6 }, -})); - -/* ----------------------------------------------------------------------------------------------------- */ -/* ---------------------------------------- Resample cases --------------------------------------------- */ -/* ----------------------------------------------------------------------------------------------------- */ - -#define CASE_RESAMPLE_FP32_1 { 1, 15, 4, 5 }, { 1, 15, 2, 3 }, data_types::f32, format::bfyx, resample_type::nearest, data_types::f32, format::bfyx -#define CASE_RESAMPLE_FP32_2 { 1, 15, 4, 5 }, { 1, 15, 2, 3 }, data_types::f32, format::bfyx, resample_type::bilinear, data_types::f32, format::bfyx -#define CASE_RESAMPLE_FP32_3 { 1, 15, 4, 5 }, { 1, 15, 2, 3 }, data_types::f32, format::bfyx, resample_type::caffe_bilinear, data_types::f32, format::bfyx -#define CASE_RESAMPLE_FP32_4 { 1, 16, 4, 5 }, { 1, 16, 7, 8 }, data_types::f32, format::bfyx, resample_type::nearest, data_types::f32, format::bfyx -#define CASE_RESAMPLE_FP32_5 { 1, 16, 4, 5 }, { 1, 16, 7, 8 }, data_types::f32, format::bfyx, resample_type::bilinear, data_types::f32, format::bfyx -#define CASE_RESAMPLE_FP32_6 { 1, 16, 4, 5 }, { 1, 16, 7, 8 }, data_types::f32, format::bfyx, resample_type::caffe_bilinear, data_types::f32, format::bfyx -#define CASE_RESAMPLE_FP32_7 { 1, 16, 4, 5, 4 
}, { 1, 16, 2, 3, 2 }, data_types::f32, format::bfzyx, resample_type::nearest, data_types::f32, format::bfzyx -#define CASE_RESAMPLE_FP32_8 { 1, 16, 4, 5, 4 }, { 1, 16, 2, 3, 2 }, data_types::f32, format::bfzyx, resample_type::caffe_bilinear, data_types::f32, format::bfzyx -#define CASE_RESAMPLE_FP32_9 { 1, 16, 4, 5 }, { 1, 16, 7, 8 }, data_types::f32, format::b_fs_yx_fsv16, resample_type::bilinear, data_types::f32, format::bfyx -#define CASE_RESAMPLE_FP32_10 { 1, 16, 4, 5 }, { 1, 16, 7, 8 }, data_types::f32, format::b_fs_yx_fsv16, resample_type::caffe_bilinear, data_types::f32, format::bfyx - -#define CASE_RESAMPLE_FP16_1 { 1, 15, 4, 5 }, { 1, 15, 2, 3 }, data_types::f16, format::bfyx, resample_type::nearest, data_types::f16, format::bfyx -#define CASE_RESAMPLE_FP16_2 { 1, 15, 4, 5 }, { 1, 15, 2, 3 }, data_types::f16, format::bfyx, resample_type::bilinear, data_types::f16, format::bfyx -#define CASE_RESAMPLE_FP16_3 { 1, 15, 4, 5 }, { 1, 15, 2, 3 }, data_types::f16, format::bfyx, resample_type::caffe_bilinear, data_types::f16, format::bfyx -#define CASE_RESAMPLE_FP16_4 { 1, 16, 4, 5 }, { 1, 16, 7, 8 }, data_types::f16, format::bfyx, resample_type::nearest, data_types::f16, format::bfyx -#define CASE_RESAMPLE_FP16_5 { 1, 16, 4, 5 }, { 1, 16, 7, 8 }, data_types::f16, format::bfyx, resample_type::bilinear, data_types::f16, format::bfyx -#define CASE_RESAMPLE_FP16_6 { 1, 16, 4, 5 }, { 1, 16, 7, 8 }, data_types::f16, format::bfyx, resample_type::caffe_bilinear, data_types::f16, format::bfyx -#define CASE_RESAMPLE_FP16_7 { 1, 16, 4, 5, 4 }, { 1, 16, 2, 3, 2 }, data_types::f16, format::bfzyx, resample_type::nearest, data_types::f16, format::bfzyx -#define CASE_RESAMPLE_FP16_8 { 1, 16, 4, 5, 4 }, { 1, 16, 2, 3, 2 }, data_types::f16, format::bfzyx, resample_type::caffe_bilinear, data_types::f16, format::bfzyx -#define CASE_RESAMPLE_FP16_9 { 1, 16, 4, 5 }, { 1, 16, 7, 8 }, data_types::f16, format::b_fs_yx_fsv16, resample_type::bilinear, data_types::f16, format::bfyx -#define 
CASE_RESAMPLE_FP16_10 { 2, 32, 4, 5 }, { 2, 32, 7, 8 }, data_types::f16, format::fs_b_yx_fsv32, resample_type::bilinear, data_types::f16, format::bfyx -#define CASE_RESAMPLE_FP16_11 { 1, 16, 4, 5 }, { 1, 16, 7, 8 }, data_types::f16, format::b_fs_yx_fsv16, resample_type::caffe_bilinear, data_types::f16, format::bfyx -#define CASE_RESAMPLE_FP16_12 { 2, 32, 4, 5 }, { 2, 32, 7, 8 }, data_types::f16, format::fs_b_yx_fsv32, resample_type::caffe_bilinear, data_types::f16, format::bfyx -#define CASE_RESAMPLE_FP16_13 { 1, 16, 4, 5 }, { 1, 16, 7, 8 }, data_types::f16, format::b_fs_yx_fsv16, resample_type::caffe_bilinear, data_types::f16, format::bfyx -#define CASE_RESAMPLE_FP16_14 { 1, 32, 4, 5 }, { 1, 32, 2, 3 }, data_types::f16, format::fs_b_yx_fsv32, resample_type::caffe_bilinear, data_types::f16, format::bfyx - -#define CASE_RESAMPLE_I8_1 { 1, 16, 4, 5 }, { 1, 16, 2, 3 }, data_types::i8, format::b_fs_yx_fsv16, resample_type::nearest, data_types::f32, format::bfyx -#define CASE_RESAMPLE_I8_2 { 2, 32, 4, 5 }, { 2, 32, 2, 3 }, data_types::i8, format::b_fs_yx_fsv16, resample_type::nearest, data_types::f32, format::bfyx -#define CASE_RESAMPLE_I8_3 { 1, 16, 4, 5 }, { 1, 16, 2, 3 }, data_types::i8, format::b_fs_yx_fsv16, resample_type::bilinear, data_types::f32, format::bfyx -#define CASE_RESAMPLE_I8_4 { 2, 32, 4, 5 }, { 2, 32, 2, 3 }, data_types::i8, format::b_fs_yx_fsv16, resample_type::bilinear, data_types::f32, format::bfyx - -#define CASE_RESAMPLE_U8_1 { 1, 16, 4, 5 }, { 1, 16, 2, 3 }, data_types::u8, format::b_fs_yx_fsv16, resample_type::nearest, data_types::f32, format::bfyx -#define CASE_RESAMPLE_U8_2 { 2, 32, 4, 5 }, { 2, 32, 2, 3 }, data_types::u8, format::b_fs_yx_fsv16, resample_type::nearest, data_types::f32, format::bfyx -#define CASE_RESAMPLE_U8_3 { 1, 16, 4, 5 }, { 1, 16, 2, 3 }, data_types::u8, format::b_fs_yx_fsv16, resample_type::bilinear, data_types::f32, format::bfyx -#define CASE_RESAMPLE_U8_4 { 2, 32, 4, 5 }, { 2, 32, 2, 3 }, data_types::u8, 
format::b_fs_yx_fsv16, resample_type::bilinear, data_types::f32, format::bfyx - -class resample_quantize : public ResamplePrimitiveFusingTest {}; -TEST_P(resample_quantize, basic) { - auto p = GetParam(); - create_topologies( - input_layout("input", get_input_layout(p)), - data("in_lo", get_mem(get_per_channel_layout(p), min_random, 0)), - data("in_hi", get_mem(get_per_channel_layout(p), 1, max_random)), - data("out_lo", get_mem(get_single_element_layout(p), -127)), - data("out_hi", get_mem(get_single_element_layout(p), 127)), - resample("resample_prim", "input", p.out_shape, p.in_shape.feature[0], p.type), - quantize("quantize", "resample_prim", "in_lo", "in_hi", "out_lo", "out_hi", 255, data_types::i8), - reorder("reorder_bfyx", "quantize", p.default_format, data_types::f32) - ); - - tolerance = 1.f; - execute(p); -} - -INSTANTIATE_TEST_SUITE_P(fusings_gpu, resample_quantize, ::testing::ValuesIn(std::vector{ - resample_test_params{ CASE_RESAMPLE_FP32_1, 2, 3 }, - resample_test_params{ CASE_RESAMPLE_FP32_2, 2, 3 }, - resample_test_params{ CASE_RESAMPLE_FP32_3, 2, 3 }, - resample_test_params{ CASE_RESAMPLE_FP32_4, 2, 3 }, - resample_test_params{ CASE_RESAMPLE_FP32_5, 2, 3 }, - resample_test_params{ CASE_RESAMPLE_FP32_6, 2, 3 }, - resample_test_params{ CASE_RESAMPLE_FP32_7, 2, 3 }, - resample_test_params{ CASE_RESAMPLE_FP32_8, 2, 3 }, - resample_test_params{ CASE_RESAMPLE_FP32_9, 2, 3 }, - resample_test_params{ CASE_RESAMPLE_FP32_10, 2, 3 }, - - // FQ can't be fused to FP16 primitive for now - // resample_test_params{ CASE_RESAMPLE_FP16_1, 2, 3 }, - // resample_test_params{ CASE_RESAMPLE_FP16_2, 2, 3 }, - // resample_test_params{ CASE_RESAMPLE_FP16_3, 2, 3 }, - // resample_test_params{ CASE_RESAMPLE_FP16_4, 2, 3 }, - // resample_test_params{ CASE_RESAMPLE_FP16_5, 2, 3 }, - // resample_test_params{ CASE_RESAMPLE_FP16_6, 2, 3 }, - // resample_test_params{ CASE_RESAMPLE_FP16_7, 2, 3 }, - // resample_test_params{ CASE_RESAMPLE_FP16_8, 2, 3 }, - // resample_test_params{ 
CASE_RESAMPLE_FP16_9, 2, 3 }, -})); - -class resample_scale_activation_eltwise : public ResamplePrimitiveFusingTest {}; -TEST_P(resample_scale_activation_eltwise, basic) { - auto p = GetParam(); - create_topologies( - input_layout("input", get_input_layout(p)), - data("scale_data", get_mem(get_per_channel_layout(p), -10, 10)), - data("eltwise_data", get_mem(get_output_layout(p), -10, 10)), - resample("resample_prim", "input", p.out_shape, p.in_shape.feature[0], p.type), - scale("scale", "resample_prim", "scale_data"), - activation("activation", "scale", activation_func::abs), - eltwise("eltwise", { "activation", "eltwise_data" }, eltwise_mode::sum), - reorder("reorder_bfyx", "eltwise", p.default_format, data_types::f32) - ); - - tolerance = 1e-5f; - execute(p); -} - -INSTANTIATE_TEST_SUITE_P(fusings_gpu, resample_scale_activation_eltwise, ::testing::ValuesIn(std::vector{ - resample_test_params{ CASE_RESAMPLE_FP32_1, 2, 5 }, - resample_test_params{ CASE_RESAMPLE_FP32_2, 2, 5 }, - resample_test_params{ CASE_RESAMPLE_FP32_3, 2, 5 }, - resample_test_params{ CASE_RESAMPLE_FP32_4, 2, 5 }, - resample_test_params{ CASE_RESAMPLE_FP32_5, 2, 5 }, - resample_test_params{ CASE_RESAMPLE_FP32_6, 2, 5 }, - resample_test_params{ CASE_RESAMPLE_FP32_7, 2, 5 }, - resample_test_params{ CASE_RESAMPLE_FP32_8, 2, 5 }, - resample_test_params{ CASE_RESAMPLE_FP32_9, 2, 5 }, - resample_test_params{ CASE_RESAMPLE_FP32_10, 2, 5 }, - - resample_test_params{ CASE_RESAMPLE_FP16_1, 2, 5 }, - resample_test_params{ CASE_RESAMPLE_FP16_2, 2, 5 }, - resample_test_params{ CASE_RESAMPLE_FP16_3, 2, 5 }, - resample_test_params{ CASE_RESAMPLE_FP16_4, 2, 5 }, - resample_test_params{ CASE_RESAMPLE_FP16_5, 2, 5 }, - resample_test_params{ CASE_RESAMPLE_FP16_6, 2, 5 }, - resample_test_params{ CASE_RESAMPLE_FP16_7, 2, 5 }, - resample_test_params{ CASE_RESAMPLE_FP16_8, 2, 5 }, - resample_test_params{ CASE_RESAMPLE_FP16_9, 2, 5 }, - resample_test_params{ CASE_RESAMPLE_FP16_10, 2, 5 }, - resample_test_params{ 
CASE_RESAMPLE_FP16_11, 2, 5 }, - resample_test_params{ CASE_RESAMPLE_FP16_12, 2, 5 }, - resample_test_params{ CASE_RESAMPLE_FP16_13, 2, 5 }, - resample_test_params{ CASE_RESAMPLE_FP16_14, 2, 5 }, - - resample_test_params{ CASE_RESAMPLE_I8_1, 2, 5 }, - resample_test_params{ CASE_RESAMPLE_I8_2, 2, 5 }, - resample_test_params{ CASE_RESAMPLE_I8_3, 2, 5 }, - resample_test_params{ CASE_RESAMPLE_I8_4, 2, 5 }, - - resample_test_params{ CASE_RESAMPLE_U8_1, 2, 5 }, - resample_test_params{ CASE_RESAMPLE_U8_2, 2, 5 }, - resample_test_params{ CASE_RESAMPLE_U8_3, 2, 5 }, - resample_test_params{ CASE_RESAMPLE_U8_4, 2, 5 }, -})); - -class resample_quantize_concat : public ResamplePrimitiveFusingTest {}; -TEST_P(resample_quantize_concat, along_f) { - auto p = GetParam(); - create_topologies( - input_layout("input", get_input_layout(p)), - resample("resample1", "input", p.out_shape, p.in_shape.feature[0], p.type), - data("in_lo_1", get_mem(get_per_channel_layout(p), min_random, 0)), - data("in_hi_1", get_mem(get_per_channel_layout(p), 1, max_random)), - data("out_lo_1", get_mem(get_single_element_layout(p), -128)), - data("out_hi_1", get_mem(get_single_element_layout(p), 127)), - quantize("quant1", "resample1", "in_lo_1", "in_hi_1", "out_lo_1", "out_hi_1", 256, data_types::i8), - resample("resample2", "input", p.out_shape, p.in_shape.feature[0], p.type), - data("in_lo_2", get_mem(get_per_channel_layout(p), min_random, 0)), - data("in_hi_2", get_mem(get_per_channel_layout(p), 1, max_random)), - data("out_lo_2", get_mem(get_single_element_layout(p), -127)), - data("out_hi_2", get_mem(get_single_element_layout(p), 127)), - quantize("quant2", "resample2", "in_lo_2", "in_hi_2", "out_lo_2", "out_hi_2", 255, data_types::i8), - concatenation("concat", { "quant1", "quant2" }, cldnn::concatenation::along_f), - reorder("reorder_bfyx", "concat", cldnn::format::bfyx, p.default_type) - ); - - tolerance = 1.f; - execute(p); -} - -INSTANTIATE_TEST_SUITE_P(fusings_gpu, resample_quantize_concat, 
::testing::ValuesIn(std::vector{ - resample_test_params{ CASE_RESAMPLE_FP32_1, 3, 6 }, - resample_test_params{ CASE_RESAMPLE_FP32_2, 3, 6 }, - resample_test_params{ CASE_RESAMPLE_FP32_3, 3, 6 }, - resample_test_params{ CASE_RESAMPLE_FP32_4, 3, 6 }, - resample_test_params{ CASE_RESAMPLE_FP32_5, 3, 6 }, - resample_test_params{ CASE_RESAMPLE_FP32_6, 3, 6 }, - resample_test_params{ CASE_RESAMPLE_FP32_7, 3, 6 }, - resample_test_params{ CASE_RESAMPLE_FP32_8, 3, 6 }, - resample_test_params{ CASE_RESAMPLE_FP32_9, 3, 6 }, - resample_test_params{ CASE_RESAMPLE_FP32_10, 3, 6 }, - - resample_test_params{ CASE_RESAMPLE_FP16_1, 3, 6 }, - resample_test_params{ CASE_RESAMPLE_FP16_2, 3, 6 }, - resample_test_params{ CASE_RESAMPLE_FP16_3, 3, 6 }, - resample_test_params{ CASE_RESAMPLE_FP16_4, 3, 6 }, - resample_test_params{ CASE_RESAMPLE_FP16_5, 3, 6 }, - resample_test_params{ CASE_RESAMPLE_FP16_6, 3, 6 }, - resample_test_params{ CASE_RESAMPLE_FP16_7, 3, 6 }, - resample_test_params{ CASE_RESAMPLE_FP16_8, 3, 6 }, - resample_test_params{ CASE_RESAMPLE_FP16_9, 3, 6 }, - resample_test_params{ CASE_RESAMPLE_FP16_10, 3, 6 }, - resample_test_params{ CASE_RESAMPLE_FP16_11, 3, 6 }, - resample_test_params{ CASE_RESAMPLE_FP16_12, 3, 6 }, - resample_test_params{ CASE_RESAMPLE_FP16_13, 3, 6 }, - resample_test_params{ CASE_RESAMPLE_FP16_14, 3, 6 }, - - resample_test_params{ CASE_RESAMPLE_I8_3, 3, 6 }, - resample_test_params{ CASE_RESAMPLE_I8_4, 3, 6 }, - - resample_test_params{ CASE_RESAMPLE_U8_3, 3, 6 }, - resample_test_params{ CASE_RESAMPLE_U8_4, 3, 6 }, -})); - -class resample_scale_concat : public ResamplePrimitiveFusingTest {}; -TEST_P(resample_scale_concat, along_f) { - auto p = GetParam(); - create_topologies( - input_layout("input", get_input_layout(p)), - resample("resample1", "input", p.out_shape, p.in_shape.feature[0], p.type), - data("scale1_scale", get_mem(get_per_channel_layout(p), -10, 10)), - data("scale1_shift", get_mem(get_per_channel_layout(p), -10, 10)), - scale("scale1", 
"resample1", "scale1_scale", "scale1_shift"), - resample("resample2", "input", p.out_shape, p.in_shape.feature[0], p.type), - data("scale2_scale", get_mem(get_per_channel_layout(p), -10, 10)), - data("scale2_shift", get_mem(get_per_channel_layout(p), -10, 10)), - scale("scale2", "resample2", "scale2_scale", "scale2_shift"), - concatenation("concat", { "scale1", "scale2" }, cldnn::concatenation::along_f), - reorder("reorder_bfyx", "concat", cldnn::format::bfyx, p.default_type) - ); - - tolerance = 1e-5f; - execute(p); -} - -INSTANTIATE_TEST_SUITE_P(fusings_gpu, resample_scale_concat, ::testing::ValuesIn(std::vector{ - resample_test_params{ CASE_RESAMPLE_FP32_1, 3, 6 }, - resample_test_params{ CASE_RESAMPLE_FP32_2, 3, 6 }, - resample_test_params{ CASE_RESAMPLE_FP32_3, 3, 6 }, - resample_test_params{ CASE_RESAMPLE_FP32_4, 3, 6 }, - resample_test_params{ CASE_RESAMPLE_FP32_5, 3, 6 }, - resample_test_params{ CASE_RESAMPLE_FP32_6, 3, 6 }, - resample_test_params{ CASE_RESAMPLE_FP32_7, 3, 6 }, - resample_test_params{ CASE_RESAMPLE_FP32_8, 3, 6 }, - resample_test_params{ CASE_RESAMPLE_FP32_9, 3, 6 }, - resample_test_params{ CASE_RESAMPLE_FP32_10, 3, 6 }, - - resample_test_params{ CASE_RESAMPLE_FP16_1, 3, 6 }, - resample_test_params{ CASE_RESAMPLE_FP16_2, 3, 6 }, - resample_test_params{ CASE_RESAMPLE_FP16_3, 3, 6 }, - resample_test_params{ CASE_RESAMPLE_FP16_4, 3, 6 }, - resample_test_params{ CASE_RESAMPLE_FP16_5, 3, 6 }, - resample_test_params{ CASE_RESAMPLE_FP16_6, 3, 6 }, - resample_test_params{ CASE_RESAMPLE_FP16_7, 3, 6 }, - resample_test_params{ CASE_RESAMPLE_FP16_8, 3, 6 }, - resample_test_params{ CASE_RESAMPLE_FP16_9, 3, 6 }, - resample_test_params{ CASE_RESAMPLE_FP16_10, 3, 6 }, - resample_test_params{ CASE_RESAMPLE_FP16_11, 3, 6 }, - resample_test_params{ CASE_RESAMPLE_FP16_12, 3, 6 }, - resample_test_params{ CASE_RESAMPLE_FP16_13, 3, 6 }, - resample_test_params{ CASE_RESAMPLE_FP16_14, 3, 6 }, - - resample_test_params{ CASE_RESAMPLE_I8_1, 3, 6 }, - 
resample_test_params{ CASE_RESAMPLE_I8_2, 3, 6 }, - resample_test_params{ CASE_RESAMPLE_I8_3, 3, 6 }, - resample_test_params{ CASE_RESAMPLE_I8_4, 3, 6 }, - - resample_test_params{ CASE_RESAMPLE_U8_1, 3, 6 }, - resample_test_params{ CASE_RESAMPLE_U8_2, 3, 6 }, - resample_test_params{ CASE_RESAMPLE_U8_3, 3, 6 }, - resample_test_params{ CASE_RESAMPLE_U8_4, 3, 6 }, -})); - -/* ----------------------------------------------------------------------------------------------------- */ -/* --------------------------------------- MVN cases --------------------------------------------------- */ -/* ----------------------------------------------------------------------------------------------------- */ - -struct mvn_test_params { - tensor input_size; - tensor elwise_size; - data_types input_type; - format input_format; - bool across_channels; - bool normalize_variance; - data_types default_type; - format default_format; - size_t expected_fused_primitives; - size_t expected_not_fused_primitives; -}; - -#define CASE_MVN_F32_1 { 1, 16, 8, 8 }, { 1, 16, 8, 8 }, data_types::f32, format::bfyx, false, true, data_types::f32, format::bfyx -#define CASE_MVN_F32_2 { 2, 16, 8, 8 }, { 2, 16, 8, 8 }, data_types::f32, format::bfyx, true, true, data_types::f32, format::bfyx -#define CASE_MVN_3D_F32_1 { 1, 16, 8, 8, 8 }, { 1, 16, 8, 8, 8 }, data_types::f32, format::bfzyx, false, true, data_types::f32, format::bfzyx -#define CASE_MVN_3D_F32_2 { 2, 16, 8, 8, 8 }, { 2, 16, 8, 8, 8 }, data_types::f32, format::bfzyx, true, true, data_types::f32, format::bfzyx -#define CASE_MVN_3D_F32_3 { 2, 8, 4, 4, 4 }, { 2, 8, 1, 1, 1 }, data_types::f32, format::bfzyx, true, true, data_types::f32, format::bfzyx -#define CASE_MVN_F16_1 { 1, 16, 8, 8 }, { 1, 16, 8, 8 }, data_types::f16, format::bfyx, false, true, data_types::f16, format::bfyx -#define CASE_MVN_F16_2 { 2, 16, 8, 8 }, { 2, 16, 8, 8 }, data_types::f16, format::bfyx, true, true, data_types::f16, format::bfyx -#define CASE_MVN_3D_F16_1 { 1, 16, 8, 8, 8 
}, { 1, 16, 8, 8, 8 }, data_types::f16, format::bfzyx, false, true, data_types::f16, format::bfzyx -#define CASE_MVN_3D_F16_2 { 2, 16, 8, 8, 8 }, { 2, 16, 8, 8, 8 }, data_types::f16, format::bfzyx, true, true, data_types::f16, format::bfzyx -#define CASE_MVN_I8_1 { 1, 16, 8, 8 }, { 1, 16, 8, 8 }, data_types::i8, format::bfyx, false, true, data_types::f32, format::bfyx -#define CASE_MVN_I8_2 { 2, 16, 8, 8 }, { 2, 16, 8, 8 }, data_types::i8, format::bfyx, true, true, data_types::f32, format::bfyx -#define CASE_MVN_I8_3 { 1, 16, 8, 8 }, { 1, 16, 8, 8 }, data_types::i8, format::b_fs_yx_fsv16, false, true, data_types::f32, format::bfyx -#define CASE_MVN_I8_4 { 2, 16, 8, 8 }, { 2, 16, 8, 8 }, data_types::i8, format::b_fs_yx_fsv16, true, true, data_types::f32, format::bfyx -#define CASE_MVN_I8_5 { 2, 16, 8, 8 }, { 1, 1, 1, 8 }, data_types::i8, format::b_fs_yx_fsv16, false, true, data_types::f32, format::bfyx -#define CASE_MVN_I8_6 { 2, 16, 8, 8 }, { 1, 1, 1, 1 }, data_types::i8, format::b_fs_yx_fsv16, true, true, data_types::f32, format::bfyx -#define CASE_MVN_I8_7 { 2, 16, 1, 8 }, { 1, 1, 8, 1 }, data_types::i8, format::b_fs_yx_fsv16, true, true, data_types::f32, format::bfyx -#define CASE_MVN_3D_I8_1 { 1, 16, 8, 8, 8 }, { 1, 16, 8, 8, 8 }, data_types::i8, format::bfzyx, false, true, data_types::f32, format::bfzyx -#define CASE_MVN_3D_I8_2 { 2, 16, 8, 8, 8 }, { 2, 16, 8, 8, 8 }, data_types::i8, format::bfzyx, true, true, data_types::f32, format::bfzyx -#define CASE_MVN_3D_I8_3 { 2, 16, 8, 8, 8 }, { 2, 1, 8, 8, 1 }, data_types::i8, format::bfzyx, true, true, data_types::f32, format::bfzyx -#define CASE_MVN_3D_I8_4 { 2, 16, 8, 8, 8 }, { 2, 16, 8, 1, 8 }, data_types::i8, format::bfzyx, false, true, data_types::f32, format::bfzyx -#define CASE_MVN_3D_I8_5 { 2, 2, 1, 2, 1 }, { 2, 2, 2, 2, 2 }, data_types::i8, format::bfzyx, false, true, data_types::f32, format::bfzyx -#define CASE_MVN_U8_1 { 1, 16, 8, 8 }, { 1, 16, 8, 8 }, data_types::u8, format::bfyx, false, true, 
data_types::f32, format::bfyx -#define CASE_MVN_U8_2 { 2, 16, 8, 8 }, { 2, 16, 8, 8 }, data_types::u8, format::bfyx, true, true, data_types::f32, format::bfyx -#define CASE_MVN_U8_3 { 1, 16, 8, 8 }, { 1, 16, 8, 8 }, data_types::u8, format::b_fs_yx_fsv16, false, true, data_types::f32, format::bfyx -#define CASE_MVN_U8_4 { 2, 16, 8, 8 }, { 2, 16, 8, 8 }, data_types::u8, format::b_fs_yx_fsv16, true, true, data_types::f32, format::bfyx -#define CASE_MVN_U8_5 { 2, 16, 8, 8 }, { 2, 1, 8, 8 }, data_types::u8, format::b_fs_yx_fsv16, false, true, data_types::f32, format::bfyx -#define CASE_MVN_U8_6 { 2, 16, 8, 8 }, { 1, 1, 1, 8 }, data_types::u8, format::b_fs_yx_fsv16, true, true, data_types::f32, format::bfyx -#define CASE_MVN_U8_7 { 1, 16, 16, 1 }, { 1, 16, 1, 16 }, data_types::u8, format::b_fs_yx_fsv16, true, true, data_types::f32, format::bfyx -#define CASE_MVN_3D_U8_1 { 1, 16, 8, 8, 8 }, { 1, 16, 8, 8, 8 }, data_types::u8, format::bfzyx, false, true, data_types::f32, format::bfzyx -#define CASE_MVN_3D_U8_2 { 2, 16, 8, 8, 8 }, { 2, 16, 8, 8, 8 }, data_types::u8, format::bfzyx, true, true, data_types::f32, format::bfzyx -#define CASE_MVN_3D_U8_3 { 2, 16, 8, 8, 8 }, { 2, 1, 1, 1, 1 }, data_types::u8, format::bfzyx, true, true, data_types::f32, format::bfzyx -#define CASE_MVN_3D_U8_4 { 2, 16, 8, 8, 8 }, { 1, 1, 1, 1, 1 }, data_types::u8, format::bfzyx, false, true, data_types::f32, format::bfzyx -#define CASE_MVN_3D_U8_5 { 2, 16, 1, 8, 8 }, { 1, 1, 8, 1, 1 }, data_types::u8, format::bfzyx, false, true, data_types::f32, format::bfzyx - -class MVNFusingTest : public ::BaseFusingTest { -public: - void execute(mvn_test_params& p) { - auto input_prim = get_mem(get_input_layout(p)); - - network network_not_fused(this->engine, this->topology_non_fused, bo_not_fused); - network network_fused(this->engine, this->topology_fused, bo_fused); - - network_fused.set_input_data("input", input_prim); - network_not_fused.set_input_data("input", input_prim); - - compare(network_not_fused, 
network_fused, p); - } - - layout get_input_layout(mvn_test_params& p) { - return layout{ p.input_type, p.input_format, p.input_size }; - } - - layout get_per_channel_layout(mvn_test_params& p) { - return layout{ p.default_type, p.default_format, tensor{ 1, p.input_size.feature[0], 1, 1 } }; - } -}; - -class mvn_activation : public MVNFusingTest {}; -TEST_P(mvn_activation, basic) { - auto p = GetParam(); - create_topologies( - input_layout("input", get_input_layout(p)), - mvn("mvn", "input", p.normalize_variance, 1e-10f, false, false), - activation("act", "mvn", activation_func::hyperbolic_tan), - reorder("reorder_bfyx", "act", format::bfyx, data_types::f32) - ); - - tolerance = 1e-5f; - execute(p); -} - -INSTANTIATE_TEST_SUITE_P(fusings_gpu, mvn_activation, ::testing::ValuesIn(std::vector{ - mvn_test_params{ CASE_MVN_F32_1, 2, 3 }, - mvn_test_params{ CASE_MVN_F32_2, 2, 3 }, - mvn_test_params{ CASE_MVN_3D_F32_1, 2, 3 }, - mvn_test_params{ CASE_MVN_3D_F32_2, 2, 3 }, - mvn_test_params{ CASE_MVN_F16_1, 2, 3 }, - mvn_test_params{ CASE_MVN_F16_2, 2, 3 }, - mvn_test_params{ CASE_MVN_3D_F16_1, 2, 3 }, - mvn_test_params{ CASE_MVN_3D_F16_2, 2, 3 }, - mvn_test_params{ CASE_MVN_I8_1, 2, 3 }, - mvn_test_params{ CASE_MVN_I8_2, 2, 3 }, - mvn_test_params{ CASE_MVN_I8_3, 2, 3 }, - mvn_test_params{ CASE_MVN_I8_4, 2, 3 }, - mvn_test_params{ CASE_MVN_3D_I8_1, 2, 3 }, - mvn_test_params{ CASE_MVN_3D_I8_2, 2, 3 }, - mvn_test_params{ CASE_MVN_U8_1, 2, 3 }, - mvn_test_params{ CASE_MVN_U8_2, 2, 3 }, - mvn_test_params{ CASE_MVN_U8_3, 2, 3 }, - mvn_test_params{ CASE_MVN_U8_4, 2, 3 }, - mvn_test_params{ CASE_MVN_3D_U8_1, 2, 3 }, - mvn_test_params{ CASE_MVN_3D_U8_2, 2, 3 }, -})); - -class mvn_scale_quantize_i8 : public MVNFusingTest {}; -TEST_P(mvn_scale_quantize_i8, basic) { - auto p = GetParam(); - create_topologies( - input_layout("input", get_input_layout(p)), - mvn("mvn", "input", p.normalize_variance, 1e-10f, false, false), - data("scale_data", get_mem(get_per_channel_layout(p))), - 
scale("scale", "mvn", "scale_data"), - data("in_low", get_mem(get_per_channel_layout(p), min_random, 0)), - data("in_high", get_mem(get_per_channel_layout(p), 1, max_random)), - data("out_low", get_mem(get_single_element_layout(p), -127, 127)), - data("out_high", get_mem(get_single_element_layout(p), -127, 127)), - quantize("quant", "scale", "in_low", "in_high", "out_low", "out_high", 255, data_types::i8), - reorder("reorder_bfyx", "quant", format::bfyx, data_types::f32) - ); - - tolerance = 1.f; - execute(p); -} - -INSTANTIATE_TEST_SUITE_P(fusings_gpu, mvn_scale_quantize_i8, ::testing::ValuesIn(std::vector{ - // Full fusing for fp input not supported yet, it may lead to output padding and non-optimal kernel - // mvn_test_params{ CASE_MVN_F32_1, 2, 4 }, - // mvn_test_params{ CASE_MVN_F32_2, 2, 4 }, - // mvn_test_params{ CASE_MVN_3D_F32_1, 2, 4 }, - // mvn_test_params{ CASE_MVN_3D_F32_2, 2, 4 }, - // mvn_test_params{ CASE_MVN_F16_1, 2, 4 }, - // mvn_test_params{ CASE_MVN_F16_2, 2, 4 }, - // mvn_test_params{ CASE_MVN_3D_F16_1, 2, 4 }, - // mvn_test_params{ CASE_MVN_3D_F16_2, 2, 4 }, - mvn_test_params{ CASE_MVN_I8_1, 2, 4 }, - mvn_test_params{ CASE_MVN_I8_2, 2, 4 }, - mvn_test_params{ CASE_MVN_I8_3, 2, 4 }, - mvn_test_params{ CASE_MVN_I8_4, 2, 4 }, - mvn_test_params{ CASE_MVN_3D_I8_1, 2, 4 }, - mvn_test_params{ CASE_MVN_3D_I8_2, 2, 4 }, - mvn_test_params{ CASE_MVN_U8_1, 2, 4 }, - mvn_test_params{ CASE_MVN_U8_2, 2, 4 }, - mvn_test_params{ CASE_MVN_U8_3, 2, 4 }, - mvn_test_params{ CASE_MVN_U8_4, 2, 4 }, - mvn_test_params{ CASE_MVN_3D_U8_1, 2, 4 }, - mvn_test_params{ CASE_MVN_3D_U8_2, 2, 4 }, -})); - -class mvn_scale_activation_eltwise_fp32_quantize_i8 : public MVNFusingTest {}; -TEST_P(mvn_scale_activation_eltwise_fp32_quantize_i8, basic) { - auto p = GetParam(); - create_topologies( - input_layout("input", get_input_layout(p)), - mvn("mvn", "input", p.normalize_variance, 1e-10f, false, false), - data("scale_data", get_mem(get_per_channel_layout(p))), - scale("scale", 
"mvn", "scale_data"), - activation("act", "scale", activation_func::hyperbolic_tan), - data("eltw_data", get_mem(layout{ p.input_type, p.default_format, p.elwise_size })), - eltwise("eltw", { "act", "eltw_data" }, eltwise_mode::sum, data_types::f32), - data("in_low", get_mem(get_per_channel_layout(p), min_random, 0)), - data("in_high", get_mem(get_per_channel_layout(p), 1, max_random)), - data("out_low", get_mem(get_single_element_layout(p), -128)), - data("out_high", get_mem(get_single_element_layout(p), 127)), - quantize("quant", "eltw", "in_low", "in_high", "out_low", "out_high", 256, data_types::i8), - reorder("reorder_bfyx", "quant", format::bfyx, data_types::f32) - ); - - tolerance = 1.f; - execute(p); -} - -INSTANTIATE_TEST_SUITE_P(fusings_gpu, mvn_scale_activation_eltwise_fp32_quantize_i8, ::testing::ValuesIn(std::vector{ - // Full using for fp input not supported yet, it may lead to output padding and non-optimal kernel - // mvn_test_params{ CASE_MVN_F32_1, 2, 7 }, - // mvn_test_params{ CASE_MVN_F32_2, 2, 7 }, - // mvn_test_params{ CASE_MVN_3D_F32_1, 2, 7 }, - // mvn_test_params{ CASE_MVN_3D_F32_2, 2, 7 }, - // mvn_test_params{ CASE_MVN_F16_1, 2, 7 }, - // mvn_test_params{ CASE_MVN_F16_2, 2, 7 }, - // mvn_test_params{ CASE_MVN_3D_F16_1, 2, 7 }, - // mvn_test_params{ CASE_MVN_3D_F16_2, 2, 7 }, - mvn_test_params{ CASE_MVN_I8_1, 2, 6 }, - mvn_test_params{ CASE_MVN_I8_2, 2, 6 }, - mvn_test_params{ CASE_MVN_I8_3, 2, 6 }, - mvn_test_params{ CASE_MVN_I8_4, 2, 6 }, - mvn_test_params{ CASE_MVN_I8_5, 2, 6 }, - mvn_test_params{ CASE_MVN_I8_6, 2, 6 }, - mvn_test_params{ CASE_MVN_I8_7, 3, 6 }, - mvn_test_params{ CASE_MVN_3D_I8_1, 2, 6 }, - mvn_test_params{ CASE_MVN_3D_I8_2, 2, 6 }, - mvn_test_params{ CASE_MVN_3D_I8_3, 2, 6 }, - mvn_test_params{ CASE_MVN_3D_I8_4, 2, 6 }, - mvn_test_params{ CASE_MVN_3D_I8_5, 3, 6 }, - mvn_test_params{ CASE_MVN_U8_1, 2, 6 }, - mvn_test_params{ CASE_MVN_U8_2, 2, 6 }, - mvn_test_params{ CASE_MVN_U8_3, 2, 6 }, - mvn_test_params{ 
CASE_MVN_U8_4, 2, 6 }, - mvn_test_params{ CASE_MVN_U8_5, 2, 6 }, - mvn_test_params{ CASE_MVN_U8_6, 2, 6 }, - mvn_test_params{ CASE_MVN_U8_7, 3, 6 }, - mvn_test_params{ CASE_MVN_3D_U8_1, 2, 6 }, - mvn_test_params{ CASE_MVN_3D_U8_2, 2, 6 }, - mvn_test_params{ CASE_MVN_3D_U8_3, 2, 6 }, - mvn_test_params{ CASE_MVN_3D_U8_4, 2, 6 }, - mvn_test_params{ CASE_MVN_3D_U8_5, 3, 6 }, -})); - -class mvn_eltwise : public MVNFusingTest {}; -TEST_P(mvn_eltwise, basic) { - auto p = GetParam(); - create_topologies( - input_layout("input", layout{ p.input_type, p.input_format, p.input_size }), - mvn("mvn", "input", p.normalize_variance, 1e-10f, false, false), - data("eltw_data", get_mem(layout{ p.input_type, p.default_format, p.elwise_size })), - eltwise("eltw", { "mvn", "eltw_data" }, eltwise_mode::sum, data_types::f32), - reorder("reorder_bfyx", "eltw", p.default_format, data_types::f32) - ); - - tolerance = 1e-5f; - execute(p); -} - -INSTANTIATE_TEST_SUITE_P(fusings_gpu, mvn_eltwise, ::testing::ValuesIn(std::vector{ - mvn_test_params{ CASE_MVN_I8_5, 2, 3 }, - mvn_test_params{ CASE_MVN_I8_6, 2, 3 }, - mvn_test_params{ CASE_MVN_I8_7, 3, 3 }, - mvn_test_params{ CASE_MVN_3D_I8_3, 2, 3 }, - mvn_test_params{ CASE_MVN_3D_I8_4, 2, 3 }, - mvn_test_params{ CASE_MVN_3D_I8_5, 3, 3 }, - mvn_test_params{ CASE_MVN_U8_1, 2, 3 }, - mvn_test_params{ CASE_MVN_U8_2, 2, 3 }, - mvn_test_params{ CASE_MVN_U8_3, 2, 3 }, - mvn_test_params{ CASE_MVN_U8_4, 2, 3 }, - mvn_test_params{ CASE_MVN_U8_5, 2, 3 }, - mvn_test_params{ CASE_MVN_U8_6, 2, 3 }, - mvn_test_params{ CASE_MVN_U8_7, 3, 3 }, - mvn_test_params{ CASE_MVN_3D_U8_1, 2, 3 }, - mvn_test_params{ CASE_MVN_3D_U8_2, 2, 3 }, - mvn_test_params{ CASE_MVN_3D_U8_3, 2, 3 }, - mvn_test_params{ CASE_MVN_3D_U8_4, 2, 3 }, - mvn_test_params{ CASE_MVN_3D_U8_5, 3, 3 }, -})); - -class mvn_eltwise_f16 : public MVNFusingTest {}; -TEST_P(mvn_eltwise_f16, basic) { - auto p = GetParam(); - create_topologies( - input_layout("input", layout{ p.input_type, p.input_format, 
p.input_size }), - mvn("mvn", "input", p.normalize_variance, 1e-10f, false, false), - data("eltw_data", get_mem(layout{ p.input_type, p.default_format, p.elwise_size })), - eltwise("eltw", { "mvn", "eltw_data" }, eltwise_mode::sum, data_types::f16), - reorder("reorder_bfyx", "eltw", p.default_format, data_types::f32) - ); - - tolerance = 0.1f; - execute(p); -} - -INSTANTIATE_TEST_SUITE_P(fusings_gpu, mvn_eltwise_f16, ::testing::ValuesIn(std::vector{ - mvn_test_params{ CASE_MVN_I8_6, 2, 3 }, - mvn_test_params{ CASE_MVN_U8_2, 2, 3 }, -})); - -/* ----------------------------------------------------------------------------------------------------- */ -/* ---------------------------------------- LRN cases -------------------------------------------------- */ -/* ----------------------------------------------------------------------------------------------------- */ - -struct lrn_test_params { - tensor in_shape; - data_types data_type; - format input_format; - data_types default_type; - format default_format; - size_t expected_fused_primitives; - size_t expected_not_fused_primitives; - lrn_norm_region lrn_type; - std::string kernel_name; -}; - -#define CASE_LRN_FP32_1 { 2, 16, 4, 4 }, data_types::f32, format::bfyx, data_types::f32, format::bfyx -#define CASE_LRN_FP32_2 { 8, 16, 4, 4 }, data_types::f32, format::yxfb, data_types::f32, format::yxfb -#define CASE_LRN_FP32_3 { 2, 16, 4, 4 }, data_types::f32, format::byxf, data_types::f32, format::byxf -#define CASE_LRN_FP32_4 { 2, 16, 4, 4 }, data_types::f32, format::b_fs_yx_fsv4, data_types::f32, format::bfyx -#define CASE_LRN_FP32_5 { 2, 16, 4, 4 }, data_types::f32, format::b_fs_yx_fsv16, data_types::f32, format::bfyx - -#define CASE_LRN_FP32_TO_FP16_1 { 2, 16, 5, 5 }, data_types::f32, format::bfyx, data_types::f16, format::bfyx -#define CASE_LRN_FP32_TO_FP16_2 { 2, 16, 5, 5 }, data_types::f32, format::byxf, data_types::f16, format::byxf -#define CASE_LRN_FP32_TO_FP16_3 { 8, 16, 4, 4 }, data_types::f32, format::yxfb, 
data_types::f16, format::byxf -#define CASE_LRN_FP32_TO_FP16_4 { 2, 16, 4, 4 }, data_types::f32, format::b_fs_yx_fsv4, data_types::f16, format::bfyx -#define CASE_LRN_FP32_TO_FP16_5 { 2, 16, 4, 4 }, data_types::f32, format::b_fs_yx_fsv16, data_types::f16, format::bfyx - -#define CASE_LRN_FP16_1 { 2, 16, 4, 4 }, data_types::f16, format::bfyx, data_types::f16, format::bfyx -#define CASE_LRN_FP16_2 { 8, 16, 4, 4 }, data_types::f16, format::yxfb, data_types::f16, format::yxfb -#define CASE_LRN_FP16_3 { 2, 16, 4, 4 }, data_types::f16, format::byxf, data_types::f16, format::byxf -#define CASE_LRN_FP16_4 { 2, 16, 4, 4 }, data_types::f16, format::b_fs_yx_fsv4, data_types::f16, format::bfyx -#define CASE_LRN_FP16_5 { 2, 16, 4, 4 }, data_types::f16, format::b_fs_yx_fsv16, data_types::f16, format::bfyx - -class LrnFusingTest : public ::BaseFusingTest { -public: - void execute(lrn_test_params& p) { - auto input_prim = get_mem(get_input_layout(p)); - - build_options options; - implementation_desc lrn_impl = { p.input_format, p.kernel_name }; - options.set_option(build_option::optimize_data(true)); - options.set_option(build_option::force_implementations({ { "lrn_norm", lrn_impl } })); - network network_fused(this->engine, this->topology_fused, options); - network network_not_fused(this->engine, this->topology_non_fused, this->bo_not_fused); - - network_fused.set_input_data("input", input_prim); - network_not_fused.set_input_data("input", input_prim); - - ASSERT_FALSE(network_fused.get_primitives_info().empty()); - ASSERT_FALSE(network_not_fused.get_primitives_info().empty()); - - auto find_lrn = [&](primitive_info& p) -> bool { - if (p.original_id == "lrn_norm" || p.original_id == "reorder") - return true; - return false; - }; - - auto pi_fused = network_fused.get_primitives_info(); - auto pi_not_fused = network_not_fused.get_primitives_info(); - auto info_fused = std::find_if(pi_fused.begin(), pi_fused.end(), find_lrn); - auto info_not_fused = 
std::find_if(pi_not_fused.begin(), pi_not_fused.end(), find_lrn); - - ASSERT_TRUE(info_fused != pi_fused.end()); - ASSERT_TRUE(info_not_fused != pi_not_fused.end()); - - compare(network_not_fused, network_fused, p); - } - - layout get_input_layout(lrn_test_params& p) { - return layout{ p.data_type, p.input_format, p.in_shape }; - } - - layout get_per_channel_layout(lrn_test_params& p) { - return layout{ p.default_type, p.default_format, tensor{ 1, p.in_shape.feature[0], 1, 1 } }; - } -}; - -class lrn_fp32_quantize_u8_scale_activation : public LrnFusingTest {}; -TEST_P(lrn_fp32_quantize_u8_scale_activation, basic) { - auto p = GetParam(); - - uint32_t size = 5; - float k = 1.0f; - float alpha = (float)9.9e-05; - float beta = 0.75; - - create_topologies( - input_layout("input", get_input_layout(p)), - data("in_lo", get_mem(get_single_element_layout(p), min_random, 0)), - data("in_hi", get_mem(get_single_element_layout(p), 1, max_random)), - data("out_lo", get_mem(get_single_element_layout(p), 0)), - data("out_hi", get_mem(get_single_element_layout(p), 255)), - data("scale_data", get_mem(get_single_element_layout(p), 1.0f / 255)), - lrn("lrn_norm", "input", size, k, alpha, beta, p.lrn_type), - quantize("quantize", "lrn_norm", "in_lo", "in_hi", "out_lo", "out_hi", 256, data_types::u8), - scale("scale", "quantize", "scale_data"), - activation("activation", "scale", activation_func::exp), - reorder("reorder", "activation", p.default_format, data_types::f32) - ); - - tolerance = 1.0f; - execute(p); -} - -TEST_P(lrn_fp32_quantize_u8_scale_activation, per_channel) { - auto p = GetParam(); - - uint32_t size = 5; - float k = 1.0f; - float alpha = (float)9.9e-05; - float beta = 0.75; - - create_topologies( - input_layout("input", get_input_layout(p)), - data("in_lo", get_mem(get_per_channel_layout(p), min_random, 0)), - data("in_hi", get_mem(get_per_channel_layout(p), 1, max_random)), - data("out_lo", get_mem(get_single_element_layout(p), 0)), - data("out_hi", 
get_mem(get_single_element_layout(p), 255)), - data("scale_data", get_mem(get_per_channel_layout(p), 1.0f / 255)), - lrn("lrn_norm", "input", size, k, alpha, beta, p.lrn_type), - quantize("quantize", "lrn_norm", "in_lo", "in_hi", "out_lo", "out_hi", 256, data_types::u8), - scale("scale", "quantize", "scale_data"), - activation("activation", "scale", activation_func::exp), - reorder("reorder", "activation", p.default_format, data_types::f32) - ); - - tolerance = 1.0f; - execute(p); -} - -INSTANTIATE_TEST_SUITE_P(fusings_gpu, lrn_fp32_quantize_u8_scale_activation, ::testing::ValuesIn(std::vector{ - // InputDataType = FP32 OutputDataType = FP32 - lrn_test_params{ CASE_LRN_FP32_1, 2, 5, lrn_norm_region_across_channel, "lrn_ref" }, - lrn_test_params{ CASE_LRN_FP32_1, 2, 5, lrn_norm_region_within_channel, "lrn_gpu_within_channel_opt" }, - lrn_test_params{ CASE_LRN_FP32_1, 2, 5, lrn_norm_region_within_channel, "lrn_gpu_within_channel" }, - lrn_test_params{ CASE_LRN_FP32_1, 2, 5, lrn_norm_region_across_channel, "lrn_gpu_across_channel_ref" }, - lrn_test_params{ CASE_LRN_FP32_1, 2, 5, lrn_norm_region_across_channel, "lrn_gpu_across_channel_multiple_features" }, - lrn_test_params{ CASE_LRN_FP32_2, 2, 5, lrn_norm_region_across_channel, "lrn_gpu_across_channel_yxfb_b8_opt" }, - lrn_test_params{ CASE_LRN_FP32_3, 2, 5, lrn_norm_region_within_channel, "lrn_within_channel_byxf_opt" }, - lrn_test_params{ CASE_LRN_FP32_4, 2, 5, lrn_norm_region_across_channel, "lrn_gpu_across_channel_multiple_features" }, - lrn_test_params{ CASE_LRN_FP32_5, 2, 5, lrn_norm_region_across_channel, "lrn_gpu_across_channel_multiple_features_fsv16" }, - - // InputDataType = FP32 OutputDataType = FP16 - lrn_test_params{ CASE_LRN_FP32_TO_FP16_1, 2, 5, lrn_norm_region_across_channel, "lrn_ref" }, - lrn_test_params{ CASE_LRN_FP32_TO_FP16_1, 2, 5, lrn_norm_region_across_channel, "lrn_gpu_across_channel_multiple_features" }, - lrn_test_params{ CASE_LRN_FP32_TO_FP16_1, 2, 5, lrn_norm_region_across_channel, 
"lrn_gpu_across_channel_ref" }, - lrn_test_params{ CASE_LRN_FP32_TO_FP16_1, 2, 5, lrn_norm_region_within_channel, "lrn_gpu_within_channel_opt" }, - lrn_test_params{ CASE_LRN_FP32_TO_FP16_1, 2, 5, lrn_norm_region_within_channel, "lrn_gpu_within_channel" }, - lrn_test_params{ CASE_LRN_FP32_TO_FP16_3, 2, 5, lrn_norm_region_across_channel, "lrn_gpu_across_channel_yxfb_b8_opt" }, - lrn_test_params{ CASE_LRN_FP32_TO_FP16_4, 2, 5, lrn_norm_region_across_channel, "lrn_gpu_across_channel_multiple_features" }, - lrn_test_params{ CASE_LRN_FP32_TO_FP16_5, 2, 5, lrn_norm_region_across_channel, "lrn_gpu_across_channel_multiple_features_fsv16" }, -})); - -class lrn_fp32_quantize_i8_scale_activation : public LrnFusingTest {}; -TEST_P(lrn_fp32_quantize_i8_scale_activation, basic) { - auto p = GetParam(); - - uint32_t size = 5; - float k = 1.0f; - float alpha = (float)9.9e-05; - float beta = 0.75; - - create_topologies( - input_layout("input", get_input_layout(p)), - data("in_lo", get_mem(get_single_element_layout(p), min_random, 0)), - data("in_hi", get_mem(get_single_element_layout(p), 1, max_random)), - data("out_lo", get_mem(get_single_element_layout(p), -127)), - data("out_hi", get_mem(get_single_element_layout(p), 127)), - data("scale_data", get_mem(get_single_element_layout(p), 1.0f / 255)), - lrn("lrn_norm", "input", size, k, alpha, beta, p.lrn_type), - scale("scale", "lrn_norm", "scale_data"), - activation("activation", "scale", activation_func::exp), - quantize("quantize", "activation", "in_lo", "in_hi", "out_lo", "out_hi", 256, data_types::i8), - reorder("reorder", "quantize", p.default_format, data_types::f32) - ); - - tolerance = 1.0f; - execute(p); -} - -INSTANTIATE_TEST_SUITE_P(fusings_gpu, lrn_fp32_quantize_i8_scale_activation, ::testing::ValuesIn(std::vector{ - // InputDataType = FP32 OutputDataType = INT8 - lrn_test_params{ CASE_LRN_FP32_1, 2, 5, lrn_norm_region_within_channel, "lrn_gpu_within_channel_opt" }, - lrn_test_params{ CASE_LRN_FP32_1, 2, 5, 
lrn_norm_region_within_channel, "lrn_gpu_within_channel" }, - lrn_test_params{ CASE_LRN_FP32_1, 2, 5, lrn_norm_region_across_channel, "lrn_ref" }, - lrn_test_params{ CASE_LRN_FP32_1, 2, 5, lrn_norm_region_across_channel, "lrn_gpu_across_channel_multiple_features" }, - lrn_test_params{ CASE_LRN_FP32_1, 2, 5, lrn_norm_region_across_channel, "lrn_gpu_across_channel_ref" }, - lrn_test_params{ CASE_LRN_FP32_2, 2, 5, lrn_norm_region_across_channel, "lrn_gpu_across_channel_yxfb_b8_opt" }, - lrn_test_params{ CASE_LRN_FP32_3, 2, 5, lrn_norm_region_within_channel, "lrn_within_channel_byxf_opt" }, - lrn_test_params{ CASE_LRN_FP32_4, 2, 5, lrn_norm_region_across_channel, "lrn_gpu_across_channel_multiple_features" }, - lrn_test_params{ CASE_LRN_FP32_5, 2, 5, lrn_norm_region_across_channel, "lrn_gpu_across_channel_multiple_features_fsv16" }, - - // InputDataType = FP16 OutputDataType = INT8/UINT8 can't be tested for now, because quantize - // primitive doesn't support input type FP16 while fusing (prepare_quantization.cpp :114 -> prepare_primitive_fusing.cpp :474) -})); - -class lrn_fp32_scale_activation_quantize_u8 : public LrnFusingTest {}; -TEST_P(lrn_fp32_scale_activation_quantize_u8, basic) { - auto p = GetParam(); - - uint32_t size = 5; - float k = 1.0f; - float alpha = (float)9.9e-05; - float beta = 0.75; - - create_topologies( - input_layout("input", get_input_layout(p)), - data("in_lo", get_mem(get_single_element_layout(p), min_random, 0)), - data("in_hi", get_mem(get_single_element_layout(p), 1, max_random)), - data("out_lo", get_mem(get_single_element_layout(p), 0)), - data("out_hi", get_mem(get_single_element_layout(p), 255)), - data("scale_data", get_mem(get_single_element_layout(p), 1.0f / 255)), - lrn("lrn_norm", "input", size, k, alpha, beta, p.lrn_type), - scale("scale", "lrn_norm", "scale_data"), - activation("activation", "scale", activation_func::exp), - quantize("quantize", "activation", "in_lo", "in_hi", "out_lo", "out_hi", 256, data_types::u8), - 
reorder("reorder", "quantize", p.default_format, data_types::f32) - ); - - tolerance = 1.0f; - execute(p); -} - -INSTANTIATE_TEST_SUITE_P(fusings_gpu, lrn_fp32_scale_activation_quantize_u8, ::testing::ValuesIn(std::vector{ - // InputDataType = FP32 OutputDataType = UINT8 - lrn_test_params{ CASE_LRN_FP32_1, 2, 5, lrn_norm_region_across_channel, "lrn_gpu_across_channel_ref" }, - lrn_test_params{ CASE_LRN_FP32_1, 2, 5, lrn_norm_region_within_channel, "lrn_gpu_within_channel_opt" }, - lrn_test_params{ CASE_LRN_FP32_1, 2, 5, lrn_norm_region_within_channel, "lrn_gpu_within_channel" }, - lrn_test_params{ CASE_LRN_FP32_1, 2, 5, lrn_norm_region_across_channel, "lrn_ref" }, - lrn_test_params{ CASE_LRN_FP32_1, 2, 5, lrn_norm_region_across_channel, "lrn_gpu_across_channel_multiple_features" }, - lrn_test_params{ CASE_LRN_FP32_2, 2, 5, lrn_norm_region_across_channel, "lrn_gpu_across_channel_yxfb_b8_opt" }, - lrn_test_params{ CASE_LRN_FP32_3, 2, 5, lrn_norm_region_within_channel, "lrn_within_channel_byxf_opt" }, - lrn_test_params{ CASE_LRN_FP32_4, 2, 5, lrn_norm_region_across_channel, "lrn_gpu_across_channel_multiple_features" }, - lrn_test_params{ CASE_LRN_FP32_5, 2, 5, lrn_norm_region_across_channel, "lrn_gpu_across_channel_multiple_features_fsv16" }, -})); - -class lrn_fp16_scale_activation : public LrnFusingTest {}; -TEST_P(lrn_fp16_scale_activation, basic) { - auto p = GetParam(); - - uint32_t size = 5; - float k = 1.0f; - float alpha = (float)9.9e-05; - float beta = 0.75; - - create_topologies( - input_layout("input", get_input_layout(p)), - data("scale_data", get_mem(get_single_element_layout(p), 1.0f / 255)), - lrn("lrn_norm", "input", size, k, alpha, beta, p.lrn_type), - scale("scale", "lrn_norm", "scale_data"), - activation("activation", "scale", activation_func::exp), - reorder("reorder", "activation", p.default_format, data_types::f32) - ); - - tolerance = 1e-05f; - execute(p); -} - -INSTANTIATE_TEST_SUITE_P(fusings_gpu, lrn_fp16_scale_activation, 
::testing::ValuesIn(std::vector{ - // InputDataType = FP16 OutputDataType = FP16 - lrn_test_params{ CASE_LRN_FP16_1, 2, 4, lrn_norm_region_within_channel, "lrn_gpu_within_channel_opt" }, - lrn_test_params{ CASE_LRN_FP16_1, 2, 4, lrn_norm_region_within_channel, "lrn_gpu_within_channel" }, - lrn_test_params{ CASE_LRN_FP16_1, 2, 4, lrn_norm_region_across_channel, "lrn_ref" }, - lrn_test_params{ CASE_LRN_FP16_1, 2, 4, lrn_norm_region_across_channel, "lrn_gpu_across_channel_multiple_features" }, - lrn_test_params{ CASE_LRN_FP16_1, 2, 4, lrn_norm_region_across_channel, "lrn_gpu_across_channel_ref" }, - lrn_test_params{ CASE_LRN_FP16_3, 2, 4, lrn_norm_region_within_channel, "lrn_within_channel_byxf_opt" }, - lrn_test_params{ CASE_LRN_FP16_4, 2, 4, lrn_norm_region_across_channel, "lrn_gpu_across_channel_multiple_features" }, - lrn_test_params{ CASE_LRN_FP16_5, 2, 4, lrn_norm_region_across_channel, "lrn_gpu_across_channel_multiple_features_fsv16" }, -})); - -/* ----------------------------------------------------------------------------------------------------- */ -/* -------------------------------- Activation cases --------------------------------------------------- */ -/* ----------------------------------------------------------------------------------------------------- */ - -struct activation_test_params { - tensor input_size; - data_types input_type; - format input_format; - data_types default_type; - format default_format; - size_t expected_fused_primitives; - size_t expected_not_fused_primitives; - std::string kernel_name; -}; - -#define CASE_ACTIVATION_F32_0 { 7, 32, 3, 3 }, data_types::f32, format::bfyx, data_types::f32, format::bfyx -#define CASE_ACTIVATION_F32_1 { 1, 16, 8, 8 }, data_types::f32, format::bfyx, data_types::f32, format::bfyx -#define CASE_ACTIVATION_F32_2 { 7, 3, 7, 7 }, data_types::f32, format::b_fs_yx_fsv16, data_types::f32, format::bfyx -#define CASE_ACTIVATION_F32_3 { 1, 14, 8, 8 }, data_types::f32, format::b_fs_yx_fsv16, data_types::f32, 
format::bfyx -#define CASE_ACTIVATION_F32_4 { 1, 17, 31, 29 }, data_types::f32, format::yxfb, data_types::f32, format::bfyx -#define CASE_ACTIVATION_F32_5 { 1, 17, 31, 29 }, data_types::f32, format::b_fs_yx_fsv4, data_types::f32, format::bfyx -#define CASE_ACTIVATION_F32_6 { 1, 17, 31, 29 }, data_types::f32, format::b_fs_yx_fsv32, data_types::f32, format::bfyx -#define CASE_ACTIVATION_F32_7 { 1, 17, 31, 29 }, data_types::f32, format::fyxb, data_types::f32, format::bfyx -#define CASE_ACTIVATION_3D_F32_0 { 3, 16, 13, 13, 13 }, data_types::f32, format::bfzyx, data_types::f32, format::bfzyx -#define CASE_ACTIVATION_3D_F32_1 { 2, 16, 8, 8, 8 }, data_types::f32, format::bfzyx, data_types::f32, format::bfzyx -#define CASE_ACTIVATION_3D_F32_2 { 1, 16, 7, 7, 7 }, data_types::f32, format::b_fs_zyx_fsv16, data_types::f32, format::bfzyx -#define CASE_ACTIVATION_3D_F32_3 { 1, 17, 7, 7, 7 }, data_types::f32, format::b_fs_zyx_fsv32, data_types::f32, format::bfzyx -#define CASE_ACTIVATION_3D_F32_4 { 1, 17, 7, 7, 7 }, data_types::f32, format::bs_fs_yx_bsv16_fsv16, data_types::f32, format::bfzyx -#define CASE_ACTIVATION_3D_F32_5 { 1, 17, 7, 7, 7 }, data_types::f32, format::fs_b_yx_fsv32, data_types::f32, format::bfzyx - -#define CASE_ACTIVATION_F16_0 { 7, 32, 5, 5 }, data_types::f16, format::bfyx, data_types::f32, format::bfyx -#define CASE_ACTIVATION_F16_1 { 1, 16, 8, 8 }, data_types::f16, format::bfyx, data_types::f32, format::bfyx -#define CASE_ACTIVATION_F16_2 { 7, 16, 7, 7 }, data_types::f16, format::b_fs_yx_fsv16, data_types::f32, format::bfyx -#define CASE_ACTIVATION_F16_3 { 1, 14, 8, 8 }, data_types::f16, format::b_fs_yx_fsv16, data_types::f32, format::bfyx -#define CASE_ACTIVATION_F16_4 { 1, 17, 31, 29 }, data_types::f16, format::yxfb, data_types::f32, format::bfyx -#define CASE_ACTIVATION_F16_5 { 1, 17, 31, 29 }, data_types::f16, format::b_fs_yx_fsv4, data_types::f32, format::bfyx -#define CASE_ACTIVATION_F16_6 { 1, 17, 31, 29 }, data_types::f16, format::b_fs_yx_fsv32, 
data_types::f32, format::bfyx -#define CASE_ACTIVATION_F16_7 { 1, 17, 31, 29 }, data_types::f16, format::fyxb, data_types::f32, format::bfyx -#define CASE_ACTIVATION_3D_F16_0 { 3, 16, 13, 13, 13 }, data_types::f16, format::bfzyx, data_types::f32, format::bfzyx -#define CASE_ACTIVATION_3D_F16_1 { 2, 16, 8, 8, 8 }, data_types::f16, format::bfzyx, data_types::f32, format::bfzyx -#define CASE_ACTIVATION_3D_F16_2 { 1, 16, 7, 7, 7 }, data_types::f16, format::b_fs_zyx_fsv16, data_types::f32, format::bfzyx -#define CASE_ACTIVATION_3D_F16_3 { 1, 17, 7, 7, 7 }, data_types::f16, format::b_fs_zyx_fsv32, data_types::f32, format::bfzyx -#define CASE_ACTIVATION_3D_F16_4 { 1, 17, 7, 7, 7 }, data_types::f16, format::bs_fs_yx_bsv16_fsv16, data_types::f32, format::bfzyx -#define CASE_ACTIVATION_3D_F16_5 { 1, 17, 7, 7, 7 }, data_types::f16, format::fs_b_yx_fsv32, data_types::f32, format::bfzyx - -#define CASE_ACTIVATION_U8_1 { 1, 16, 8, 8 }, data_types::u8, format::bfyx, data_types::f32, format::bfyx -#define CASE_ACTIVATION_U8_2 { 1, 12, 8, 8 }, data_types::u8, format::b_fs_yx_fsv16, data_types::f32, format::bfyx -#define CASE_ACTIVATION_I8_1 { 1, 16, 8, 8 }, data_types::i8, format::bfyx, data_types::f32, format::bfyx -#define CASE_ACTIVATION_I8_2 { 1, 14, 8, 8 }, data_types::i8, format::b_fs_yx_fsv16, data_types::f32, format::bfyx -#define CASE_ACTIVATION_3D_I8_1 { 1, 17, 8, 8, 8 }, data_types::i8, format::bfzyx, data_types::f32, format::bfzyx - -class ActivationFusingTest : public ::BaseFusingTest { -public: - void execute(activation_test_params& p) { - auto input_prim = get_mem(get_input_layout(p)); - - build_options options; - implementation_desc activation_impl = { p.input_format, p.kernel_name }; - options.set_option(build_option::optimize_data(true)); - options.set_option(build_option::force_implementations({ { "act", activation_impl } })); - network network_fused(this->engine, this->topology_fused, options); - network network_not_fused(this->engine, this->topology_non_fused, 
bo_not_fused); - - network_fused.set_input_data("input", input_prim); - network_not_fused.set_input_data("input", input_prim); - - compare(network_not_fused, network_fused, p); - } - - layout get_input_layout(activation_test_params& p) { - return layout{ p.input_type, p.input_format, p.input_size }; - } - - layout get_per_channel_layout(activation_test_params& p) { - return layout{ p.default_type, p.default_format, tensor{ 1, p.input_size.feature[0], 1, 1 } }; - } - - format get_input_format(activation_test_params &p) { return p.input_format; } -}; - -class activation_quantize_i8 : public ActivationFusingTest {}; -TEST_P(activation_quantize_i8, basic) { - auto p = GetParam(); - create_topologies( - input_layout("input", get_input_layout(p)), - activation("act", "input", activation_func::relu), - data("in_low", get_mem(get_single_element_layout(p), min_random, 0)), - data("in_high", get_mem(get_single_element_layout(p), 1, max_random)), - data("out_low", get_mem(get_single_element_layout(p), -127, 0)), - data("out_high", get_mem(get_single_element_layout(p), 0, 127)), - quantize("quant", "act", "in_low", "in_high", "out_low", "out_high", 255, data_types::i8), - reorder("reorder_bfyx", "quant", p.default_format, data_types::f32) - ); - - tolerance = 1.0f; - execute(p); -} - -TEST_P(activation_quantize_i8, per_channel) { - auto p = GetParam(); - create_topologies( - input_layout("input", get_input_layout(p)), - activation("act", "input", activation_func::relu), - data("in_low", get_mem(get_per_channel_layout(p), min_random, 0)), - data("in_high", get_mem(get_per_channel_layout(p), 1, max_random)), - data("out_low", get_mem(get_single_element_layout(p), -127, 0)), - data("out_high", get_mem(get_single_element_layout(p), 0, 127)), - quantize("quant", "act", "in_low", "in_high", "out_low", "out_high", 255, data_types::i8), - reorder("reorder_bfyx", "quant", p.default_format, data_types::f32) - ); - - tolerance = 1.0f; - execute(p); -} - 
-INSTANTIATE_TEST_SUITE_P(fusings_gpu, activation_quantize_i8, ::testing::ValuesIn(std::vector{ - // InputDataType = FP32 - activation_test_params{ CASE_ACTIVATION_F32_0, 2, 3, "activation_opt" }, - activation_test_params{ CASE_ACTIVATION_F32_1, 2, 3, "activation_opt" }, - activation_test_params{ CASE_ACTIVATION_3D_F32_0, 2, 3, "activation_opt" }, - activation_test_params{ CASE_ACTIVATION_3D_F32_1, 2, 3, "activation_opt" }, - - activation_test_params{ CASE_ACTIVATION_F32_0, 2, 3, "activation_ref" }, - activation_test_params{ CASE_ACTIVATION_F32_1, 2, 3, "activation_ref" }, - activation_test_params{ CASE_ACTIVATION_F32_2, 2, 3, "activation_ref" }, - activation_test_params{ CASE_ACTIVATION_F32_3, 2, 3, "activation_ref" }, - activation_test_params{ CASE_ACTIVATION_F32_4, 2, 3, "activation_ref" }, - activation_test_params{ CASE_ACTIVATION_3D_F32_0, 2, 3, "activation_ref" }, - activation_test_params{ CASE_ACTIVATION_3D_F32_1, 2, 3, "activation_ref" }, - activation_test_params{ CASE_ACTIVATION_3D_F32_2, 2, 3, "activation_ref" }, -})); - -INSTANTIATE_TEST_SUITE_P(DISABLED_fusings_gpu, activation_quantize_i8, ::testing::ValuesIn(std::vector{ - activation_test_params{ CASE_ACTIVATION_F32_5, 2, 3, "activation_ref" }, // FIXME - accuracy bug - activation_test_params{ CASE_ACTIVATION_F32_6, 2, 3, "activation_ref" }, // FIXME - accuracy bug - activation_test_params{ CASE_ACTIVATION_F32_7, 2, 3, "activation_ref" }, // FIXME - accuracy bug - activation_test_params{ CASE_ACTIVATION_3D_F32_3, 2, 3, "activation_ref" }, // FIXME - accuracy bug - activation_test_params{ CASE_ACTIVATION_3D_F32_5, 2, 3, "activation_ref" }, // FIXME - accuracy bug -})); - -class activation_scale_activation_quantize_u8 : public ActivationFusingTest {}; -TEST_P(activation_scale_activation_quantize_u8, basic) { - auto p = GetParam(); - create_topologies( - input_layout("input", get_input_layout(p)), - activation("act", "input", activation_func::relu), - data("scale_data", 
get_mem(get_single_element_layout(p), 1.0f / 255)), - data("in_low", get_mem(get_single_element_layout(p), 0)), - data("in_high", get_mem(get_single_element_layout(p), 1, max_random)), - data("out_low", get_mem(get_single_element_layout(p), -127)), - data("out_high", get_mem(get_single_element_layout(p), 127)), - scale("scale", "act", "scale_data"), - activation("act2", "scale", activation_func::softsign), - quantize("quant", "act2", "in_low", "in_high", "out_low", "out_high", 256, data_types::u8), - reorder("reorder_bfyx", "quant", p.default_format, data_types::f32) - ); - - tolerance = 1.f; - execute(p); -} - -TEST_P(activation_scale_activation_quantize_u8, per_channel) { - auto p = GetParam(); - create_topologies( - input_layout("input", get_input_layout(p)), - activation("act", "input", activation_func::relu), - data("scale_data", get_mem(get_single_element_layout(p), 1.0f / 255)), - data("in_low", get_mem(get_per_channel_layout(p), 0)), - data("in_high", get_mem(get_per_channel_layout(p), 1, max_random)), - data("out_low", get_mem(get_single_element_layout(p), -127)), - data("out_high", get_mem(get_single_element_layout(p), 127)), - scale("scale", "act", "scale_data"), - activation("act2", "scale", activation_func::softsign), - quantize("quant", "act2", "in_low", "in_high", "out_low", "out_high", 256, data_types::u8), - reorder("reorder_bfyx", "quant", p.default_format, data_types::f32) - ); - - tolerance = 1.f; - execute(p); -} - -INSTANTIATE_TEST_SUITE_P(fusings_gpu, activation_scale_activation_quantize_u8, ::testing::ValuesIn(std::vector{ - // InputDataType = FP32 - activation_test_params{ CASE_ACTIVATION_F32_0, 2, 5, "activation_opt" }, - activation_test_params{ CASE_ACTIVATION_F32_1, 2, 5, "activation_opt" }, - activation_test_params{ CASE_ACTIVATION_3D_F32_0, 2, 5, "activation_opt" }, - activation_test_params{ CASE_ACTIVATION_3D_F32_1, 2, 5, "activation_opt" }, - - activation_test_params{ CASE_ACTIVATION_F32_0, 2, 5, "activation_ref" }, - 
activation_test_params{ CASE_ACTIVATION_F32_1, 2, 5, "activation_ref" }, - activation_test_params{ CASE_ACTIVATION_F32_2, 2, 5, "activation_ref" }, - activation_test_params{ CASE_ACTIVATION_F32_3, 2, 5, "activation_ref" }, - activation_test_params{ CASE_ACTIVATION_F32_4, 2, 5, "activation_ref" }, - activation_test_params{ CASE_ACTIVATION_F32_5, 2, 5, "activation_ref" }, - activation_test_params{ CASE_ACTIVATION_F32_6, 2, 5, "activation_ref" }, - activation_test_params{ CASE_ACTIVATION_F32_7, 2, 5, "activation_ref" }, - activation_test_params{ CASE_ACTIVATION_3D_F32_0, 2, 5, "activation_ref" }, - activation_test_params{ CASE_ACTIVATION_3D_F32_1, 2, 5, "activation_ref" }, - activation_test_params{ CASE_ACTIVATION_3D_F32_2, 2, 5, "activation_ref" }, -})); - -INSTANTIATE_TEST_SUITE_P(DISABLED_fusings_gpu, activation_scale_activation_quantize_u8, ::testing::ValuesIn(std::vector{ - activation_test_params{ CASE_ACTIVATION_3D_F32_5, 2, 5, "activation_ref" }, // FIXME - accuracy bug -})); - -class activation_scale_activation : public ActivationFusingTest {}; -TEST_P(activation_scale_activation, basic) { - auto p = GetParam(); - create_topologies( - input_layout("input", get_input_layout(p)), - activation("act", "input", activation_func::relu), - data("scale_data", get_mem(get_single_element_layout(p), 1.0f / 255)), - scale("scale", "act", "scale_data"), - activation("act2", "scale", activation_func::exp), - reorder("reorder_bfyx", "act2", p.default_format, data_types::f32) - ); - - tolerance = 1e-05f; - execute(p); -} - -INSTANTIATE_TEST_SUITE_P(fusings_gpu, activation_scale_activation, ::testing::ValuesIn(std::vector{ - // InputDataType = FP32 - activation_test_params{ CASE_ACTIVATION_F32_0, 2, 4, "activation_opt" }, - activation_test_params{ CASE_ACTIVATION_F32_1, 2, 4, "activation_opt" }, - activation_test_params{ CASE_ACTIVATION_3D_F32_0, 2, 4, "activation_opt" }, - activation_test_params{ CASE_ACTIVATION_3D_F32_1, 2, 4, "activation_opt" }, - - activation_test_params{ 
CASE_ACTIVATION_F32_0, 2, 4, "activation_ref" }, - activation_test_params{ CASE_ACTIVATION_F32_1, 2, 4, "activation_ref" }, - activation_test_params{ CASE_ACTIVATION_F32_2, 2, 4, "activation_ref" }, - activation_test_params{ CASE_ACTIVATION_F32_3, 2, 4, "activation_ref" }, - activation_test_params{ CASE_ACTIVATION_F32_4, 2, 4, "activation_ref" }, - activation_test_params{ CASE_ACTIVATION_F32_5, 2, 4, "activation_ref" }, - activation_test_params{ CASE_ACTIVATION_F32_6, 2, 4, "activation_ref" }, - activation_test_params{ CASE_ACTIVATION_F32_7, 2, 4, "activation_ref" }, - activation_test_params{ CASE_ACTIVATION_3D_F32_0, 2, 4, "activation_ref" }, - activation_test_params{ CASE_ACTIVATION_3D_F32_1, 2, 4, "activation_ref" }, - activation_test_params{ CASE_ACTIVATION_3D_F32_2, 2, 4, "activation_ref" }, - - // InputDataType = FP16 - activation_test_params{ CASE_ACTIVATION_F16_0, 2, 4, "activation_opt" }, - activation_test_params{ CASE_ACTIVATION_F16_1, 2, 4, "activation_opt" }, - activation_test_params{ CASE_ACTIVATION_3D_F16_0, 2, 4, "activation_opt" }, - activation_test_params{ CASE_ACTIVATION_3D_F16_1, 2, 4, "activation_opt" }, - - activation_test_params{ CASE_ACTIVATION_F16_0, 2, 4, "activation_ref" }, - activation_test_params{ CASE_ACTIVATION_F16_1, 2, 4, "activation_ref" }, - activation_test_params{ CASE_ACTIVATION_F16_2, 2, 4, "activation_ref" }, - activation_test_params{ CASE_ACTIVATION_F16_3, 2, 4, "activation_ref" }, - activation_test_params{ CASE_ACTIVATION_F16_4, 2, 4, "activation_ref" }, - activation_test_params{ CASE_ACTIVATION_F16_5, 2, 4, "activation_ref" }, - activation_test_params{ CASE_ACTIVATION_F16_6, 2, 4, "activation_ref" }, - activation_test_params{ CASE_ACTIVATION_F16_7, 2, 4, "activation_ref" }, - activation_test_params{ CASE_ACTIVATION_3D_F16_0, 2, 4, "activation_ref" }, - activation_test_params{ CASE_ACTIVATION_3D_F16_1, 2, 4, "activation_ref" }, - activation_test_params{ CASE_ACTIVATION_3D_F16_2, 2, 4, "activation_ref" }, - 
activation_test_params{ CASE_ACTIVATION_3D_F16_3, 2, 4, "activation_ref" }, - activation_test_params{ CASE_ACTIVATION_3D_F16_4, 2, 4, "activation_ref" }, - - // InputDataType = UINT8 - activation_test_params{ CASE_ACTIVATION_U8_1, 2, 4, "activation_ref" }, - activation_test_params{ CASE_ACTIVATION_U8_2, 2, 4, "activation_ref" }, - - // InputDataType = INT8 - activation_test_params{ CASE_ACTIVATION_I8_1, 2, 4, "activation_opt" }, - activation_test_params{ CASE_ACTIVATION_3D_I8_1, 2, 4, "activation_opt" }, - - activation_test_params{ CASE_ACTIVATION_I8_1, 2, 4, "activation_ref" }, - activation_test_params{ CASE_ACTIVATION_I8_2, 2, 4, "activation_ref" }, - activation_test_params{ CASE_ACTIVATION_3D_I8_1, 2, 4, "activation_ref" } -})); - -INSTANTIATE_TEST_SUITE_P(DISABLED_fusings_gpu, activation_scale_activation, ::testing::ValuesIn(std::vector{ - activation_test_params{ CASE_ACTIVATION_3D_F32_4, 2, 4, "activation_ref" }, // FIXME - accuracy bug - activation_test_params{ CASE_ACTIVATION_3D_F32_5, 2, 4, "activation_ref" }, // FIXME - accuracy bug -})); - -/* ----------------------------------------------------------------------------------------------------- */ -/* --------------------------------------- Deconvolution cases ----------------------------------------- */ -/* ----------------------------------------------------------------------------------------------------- */ - -using deconv_test_params = bc_test_params; - -// in_shape; out_shape; kernel; stride; pad; dilation; groups; data_type; input_format; weights_type; weights_format; default_type; default_format; -#define CASE_DECONV_FP32_1 { 1, 15, 4, 5 }, { 1, 30, 6, 7 }, { 1, 1, 3, 3 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::f32, format::bfyx, data_types::f32, format::oiyx, data_types::f32, format::bfyx -#define CASE_DECONV_FP32_2 { 1, 16, 4, 5 }, { 1, 32, 6, 7 }, { 1, 1, 3, 3 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::f32, format::b_fs_yx_fsv16, data_types::f32, 
format::is_os_yx_isv16_osv16, data_types::f32, format::bfyx -#define CASE_DECONV_FP32_3 { 1, 16, 4, 5 }, { 1, 32, 4, 5 }, { 1, 1, 1, 1 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::f32, format::b_fs_yx_fsv16, data_types::f32, format::is_os_yx_isv16_osv16, data_types::f32, format::bfyx -#define CASE_DECONV_FP32_4 { 1, 32, 4, 5 }, { 1, 32, 4, 5 }, { 1, 1, 3, 3 }, tensor{ 1 }, tensor{ { 0, 0, 1, 1, 0, 0 }, 0 }, tensor{ 1 }, 32, data_types::f32, format::b_fs_yx_fsv16, data_types::f32, format::gs_oiyx_gsv16, data_types::f32, format::bfyx -#define CASE_DECONV_FP32_5 { 1, 15, 4, 5 }, { 1, 30, 9, 11 }, { 1, 1, 3, 3 }, tensor{ 1, 1, 2, 2 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::f32, format::bfyx, data_types::f32, format::oiyx, data_types::f32, format::bfyx -#define CASE_DECONV_FP32_6 { 1, 16, 4, 5 }, { 1, 32, 9, 11 }, { 1, 1, 3, 3 }, tensor{ 1, 1, 2, 2 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::f32, format::b_fs_yx_fsv16, data_types::f32, format::is_os_yx_isv16_osv16, data_types::f32, format::bfyx -#define CASE_DECONV_FP32_7 { 1, 16, 4, 5 }, { 1, 32, 7, 9 }, { 1, 1, 1, 1 }, tensor{ 1, 1, 2, 2 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::f32, format::b_fs_yx_fsv16, data_types::f32, format::is_os_yx_isv16_osv16, data_types::f32, format::bfyx -#define CASE_DECONV_FP32_8 { 1, 32, 4, 5 }, { 1, 32, 7, 9 }, { 1, 1, 3, 3 }, tensor{ 1, 1, 2, 2 }, tensor{ { 0, 0, 1, 1, 0, 0 }, 0 }, tensor{ 1 }, 32, data_types::f32, format::b_fs_yx_fsv16, data_types::f32, format::gs_oiyx_gsv16, data_types::f32, format::bfyx - -#define CASE_DECONV_FP16_1 { 1, 15, 4, 5 }, { 1, 30, 6, 7 }, { 1, 1, 3, 3 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::f16, format::bfyx, data_types::f16, format::oiyx, data_types::f16, format::bfyx -#define CASE_DECONV_FP16_2 { 1, 16, 4, 5 }, { 1, 32, 6, 7 }, { 1, 1, 3, 3 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::f16, format::b_fs_yx_fsv16, data_types::f16, format::is_os_yx_isv16_osv16, data_types::f16, format::bfyx -#define 
CASE_DECONV_FP16_3 { 1, 16, 4, 5 }, { 1, 32, 4, 5 }, { 1, 1, 1, 1 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::f16, format::b_fs_yx_fsv16, data_types::f16, format::is_os_yx_isv16_osv16, data_types::f16, format::bfyx -#define CASE_DECONV_FP16_4 { 1, 32, 4, 5 }, { 1, 32, 4, 5 }, { 1, 1, 3, 3 }, tensor{ 1 }, tensor{ { 0, 0, 1, 1, 0, 0 }, 0 }, tensor{ 1 }, 32, data_types::f16, format::b_fs_yx_fsv16, data_types::f16, format::gs_oiyx_gsv16, data_types::f16, format::bfyx -#define CASE_DECONV_FP16_5 { 1, 15, 4, 5 }, { 1, 30, 9, 11 }, { 1, 1, 3, 3 }, tensor{ 1, 1, 2, 2 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::f16, format::bfyx, data_types::f16, format::oiyx, data_types::f16, format::bfyx -#define CASE_DECONV_FP16_6 { 1, 16, 4, 5 }, { 1, 32, 9, 11 }, { 1, 1, 3, 3 }, tensor{ 1, 1, 2, 2 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::f16, format::b_fs_yx_fsv16, data_types::f16, format::is_os_yx_isv16_osv16, data_types::f16, format::bfyx -#define CASE_DECONV_FP16_7 { 1, 16, 4, 5 }, { 1, 32, 7, 9 }, { 1, 1, 1, 1 }, tensor{ 1, 1, 2, 2 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::f16, format::b_fs_yx_fsv16, data_types::f16, format::is_os_yx_isv16_osv16, data_types::f16, format::bfyx -#define CASE_DECONV_FP16_8 { 1, 32, 4, 5 }, { 1, 32, 7, 9 }, { 1, 1, 3, 3 }, tensor{ 1, 1, 2, 2 }, tensor{ { 0, 0, 1, 1, 0, 0 }, 0 }, tensor{ 1 }, 32, data_types::f16, format::b_fs_yx_fsv16, data_types::f16, format::gs_oiyx_gsv16, data_types::f16, format::bfyx - -#define CASE_DECONV_S8S8_1 { 1, 15, 4, 5 }, { 1, 30, 6, 7 }, { 1, 1, 3, 3 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::i8, format::bfyx, data_types::i8, format::oiyx, data_types::f32, format::bfyx -#define CASE_DECONV_S8S8_2 { 1, 16, 4, 5 }, { 1, 32, 6, 7 }, { 1, 1, 3, 3 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::i8, format::b_fs_yx_fsv16, data_types::i8, format::oiyx, data_types::f32, format::bfyx -#define CASE_DECONV_S8S8_3 { 1, 16, 4, 5 }, { 1, 32, 4, 5 }, { 1, 1, 1, 1 }, tensor{ 1 }, tensor{ 0 }, 
tensor{ 1 }, 1, data_types::i8, format::b_fs_yx_fsv16, data_types::i8, format::oiyx, data_types::f32, format::bfyx -#define CASE_DECONV_S8S8_4 { 1, 32, 4, 5 }, { 1, 32, 4, 5 }, { 1, 1, 3, 3 }, tensor{ 1 }, tensor{ { 0, 0, 1, 1 }, 0 }, tensor{ 1 }, 32, data_types::i8, format::b_fs_yx_fsv16, data_types::i8, format::goiyx, data_types::f32, format::bfyx -#define CASE_DECONV_S8S8_5 { 1, 15, 4, 5 }, { 1, 30, 9, 11 }, { 1, 1, 3, 3 }, tensor{ 1, 1, 2, 2 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::i8, format::bfyx, data_types::i8, format::oiyx, data_types::f32, format::bfyx -#define CASE_DECONV_S8S8_6 { 1, 16, 4, 5 }, { 1, 32, 9, 11 }, { 1, 1, 3, 3 }, tensor{ 1, 1, 2, 2 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::i8, format::b_fs_yx_fsv16, data_types::i8, format::oiyx, data_types::f32, format::bfyx -#define CASE_DECONV_S8S8_7 { 1, 16, 4, 5 }, { 1, 32, 7, 9 }, { 1, 1, 1, 1 }, tensor{ 1, 1, 2, 2 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::i8, format::b_fs_yx_fsv16, data_types::i8, format::oiyx, data_types::f32, format::bfyx -#define CASE_DECONV_S8S8_8 { 1, 32, 4, 5 }, { 1, 32, 7, 9 }, { 1, 1, 3, 3 }, tensor{ 1, 1, 2, 2 }, tensor{ { 0, 0, 1, 1, 0, 0 }, 0 }, tensor{ 1 }, 32, data_types::i8, format::b_fs_yx_fsv16, data_types::i8, format::goiyx, data_types::f32, format::bfyx - -#define CASE_DECONV_U8S8_1 { 1, 15, 4, 5 }, { 1, 30, 6, 7 }, { 1, 1, 3, 3 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::u8, format::bfyx, data_types::i8, format::oiyx, data_types::f32, format::bfyx -#define CASE_DECONV_U8S8_2 { 1, 16, 4, 5 }, { 1, 32, 6, 7 }, { 1, 1, 3, 3 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::u8, format::b_fs_yx_fsv16, data_types::i8, format::oiyx, data_types::f32, format::bfyx -#define CASE_DECONV_U8S8_3 { 1, 16, 4, 5 }, { 1, 32, 4, 5 }, { 1, 1, 1, 1 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::u8, format::b_fs_yx_fsv16, data_types::i8, format::oiyx, data_types::f32, format::bfyx -#define CASE_DECONV_U8S8_4 { 1, 32, 4, 5 }, { 1, 32, 4, 5 }, { 1, 
1, 3, 3 }, tensor{ 1 }, tensor{ { 0, 0, 1, 1 }, 0 }, tensor{ 1 }, 32, data_types::u8, format::b_fs_yx_fsv16, data_types::i8, format::goiyx, data_types::f32, format::bfyx -#define CASE_DECONV_U8S8_5 { 1, 15, 4, 5 }, { 1, 30, 9, 11 }, { 1, 1, 3, 3 }, tensor{ 1, 1, 2, 2 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::u8, format::bfyx, data_types::i8, format::oiyx, data_types::f32, format::bfyx -#define CASE_DECONV_U8S8_6 { 1, 16, 4, 5 }, { 1, 32, 9, 11 }, { 1, 1, 3, 3 }, tensor{ 1, 1, 2, 2 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::u8, format::b_fs_yx_fsv16, data_types::i8, format::oiyx, data_types::f32, format::bfyx -#define CASE_DECONV_U8S8_7 { 1, 16, 4, 5 }, { 1, 32, 7, 9 }, { 1, 1, 1, 1 }, tensor{ 1, 1, 2, 2 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::u8, format::b_fs_yx_fsv16, data_types::i8, format::oiyx, data_types::f32, format::bfyx -#define CASE_DECONV_U8S8_8 { 1, 32, 4, 5 }, { 1, 32, 7, 9 }, { 1, 1, 3, 3 }, tensor{ 1, 1, 2, 2 }, tensor{ { 0, 0, 1, 1, 0, 0 }, 0 }, tensor{ 1 }, 32, data_types::u8, format::b_fs_yx_fsv16, data_types::i8, format::goiyx, data_types::f32, format::bfyx - - -// 3D -// in_shape; out_shape; kernel; stride; pad; dilation; groups; data_type; input_format; weights_type; weights_format; default_type; default_format; -#define CASE_DECONV_FP32_3D_1 { 1, 15, 4, 5, 3 }, { 1, 30, 6, 7, 5 }, { 1, 1, 3, 3, 3 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::f32, format::bfzyx, data_types::f32, format::oizyx, data_types::f32, format::bfzyx -#define CASE_DECONV_FP32_3D_2 { 1, 16, 4, 5, 3 }, { 1, 32, 6, 7, 5 }, { 1, 1, 3, 3, 3 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::f32, format::b_fs_zyx_fsv16, data_types::f32, format::is_os_zyx_isv16_osv16, data_types::f32, format::bfzyx -#define CASE_DECONV_FP32_3D_3 { 1, 16, 4, 5, 3 }, { 1, 32, 4, 5, 3 }, { 1, 1, 1, 1, 1 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::f32, format::b_fs_zyx_fsv16, data_types::f32, format::is_os_zyx_isv16_osv16, data_types::f32, format::bfzyx -#define 
CASE_DECONV_FP32_3D_4 { 1, 32, 4, 5, 3 }, { 1, 32, 4, 5, 3 }, { 1, 1, 3, 3, 3 }, tensor{ 1 }, tensor{ { 0, 0, 1, 1, 1 }, 0 }, tensor{ 1 }, 32, data_types::f32, format::b_fs_zyx_fsv16, data_types::f32, format::gs_oizyx_gsv16, data_types::f32, format::bfzyx -#define CASE_DECONV_FP32_3D_5 { 1, 15, 4, 5, 3 }, { 1, 30, 9, 11, 7 }, { 1, 1, 3, 3, 3 }, tensor{ 1, 1, 2, 2, 2 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::f32, format::bfzyx, data_types::f32, format::oizyx, data_types::f32, format::bfzyx -#define CASE_DECONV_FP32_3D_6 { 1, 16, 4, 5, 3 }, { 1, 32, 9, 11, 7 }, { 1, 1, 3, 3, 3 }, tensor{ 1, 1, 2, 2, 2 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::f32, format::b_fs_zyx_fsv16, data_types::f32, format::is_os_zyx_isv16_osv16, data_types::f32, format::bfzyx -#define CASE_DECONV_FP32_3D_7 { 1, 16, 4, 5, 3 }, { 1, 32, 7, 9, 5 }, { 1, 1, 1, 1, 1 }, tensor{ 1, 1, 2, 2, 2 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::f32, format::b_fs_zyx_fsv16, data_types::f32, format::is_os_zyx_isv16_osv16, data_types::f32, format::bfzyx -#define CASE_DECONV_FP32_3D_8 { 1, 32, 4, 5, 3 }, { 1, 32, 7, 9, 5 }, { 1, 1, 3, 3, 3 }, tensor{ 1, 1, 2, 2, 2 }, tensor{ { 0, 0, 1, 1, 1 }, 0 }, tensor{ 1 }, 32, data_types::f32, format::b_fs_zyx_fsv16, data_types::f32, format::gs_oizyx_gsv16, data_types::f32, format::bfzyx -#define CASE_DECONV_FP32_3D_9 { 16, 16, 4, 5, 3 }, { 16, 32, 7, 9, 5 }, { 1, 1, 1, 1, 1 }, tensor{ 1, 1, 2, 2, 2 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::f32, format::bs_fs_zyx_bsv16_fsv16, data_types::f32, format::is_os_zyx_isv16_osv16, data_types::f32, format::bfzyx - -#define CASE_DECONV_FP16_3D_1 { 1, 15, 4, 5, 3 }, { 1, 30, 6, 7, 5 }, { 1, 1, 3, 3, 3 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::f16, format::bfzyx, data_types::f16, format::oizyx, data_types::f16, format::bfzyx -#define CASE_DECONV_FP16_3D_2 { 1, 16, 4, 5, 3 }, { 1, 32, 6, 7, 5 }, { 1, 1, 3, 3, 3 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::f16, format::b_fs_zyx_fsv16, data_types::f16, 
format::is_os_zyx_isv16_osv16, data_types::f16, format::bfzyx -#define CASE_DECONV_FP16_3D_3 { 1, 16, 4, 5, 3 }, { 1, 32, 4, 5, 3 }, { 1, 1, 1, 1, 1 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::f16, format::b_fs_zyx_fsv16, data_types::f16, format::is_os_zyx_isv16_osv16, data_types::f16, format::bfzyx -#define CASE_DECONV_FP16_3D_4 { 1, 32, 4, 5, 3 }, { 1, 32, 4, 5, 3 }, { 1, 1, 3, 3, 3 }, tensor{ 1 }, tensor{ { 0, 0, 1, 1, 1 }, 0 }, tensor{ 1 }, 32, data_types::f16, format::b_fs_zyx_fsv16, data_types::f16, format::gs_oizyx_gsv16, data_types::f16, format::bfzyx -#define CASE_DECONV_FP16_3D_5 { 1, 15, 4, 5, 3 }, { 1, 30, 9, 11, 7 }, { 1, 1, 3, 3, 3 }, tensor{ 1, 1, 2, 2, 2 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::f16, format::bfzyx, data_types::f16, format::oizyx, data_types::f16, format::bfzyx -#define CASE_DECONV_FP16_3D_6 { 1, 16, 4, 5, 3 }, { 1, 32, 9, 11, 7 }, { 1, 1, 3, 3, 3 }, tensor{ 1, 1, 2, 2, 2 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::f16, format::b_fs_zyx_fsv16, data_types::f16, format::is_os_zyx_isv16_osv16, data_types::f16, format::bfzyx -#define CASE_DECONV_FP16_3D_7 { 1, 16, 4, 5, 3 }, { 1, 32, 7, 9, 5 }, { 1, 1, 1, 1, 1 }, tensor{ 1, 1, 2, 2, 2 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::f16, format::b_fs_zyx_fsv16, data_types::f16, format::is_os_zyx_isv16_osv16, data_types::f16, format::bfzyx -#define CASE_DECONV_FP16_3D_8 { 1, 32, 4, 5, 3 }, { 1, 32, 7, 9, 5 }, { 1, 1, 3, 3, 3 }, tensor{ 1, 1, 2, 2, 2 }, tensor{ { 0, 0, 1, 1, 1 }, 0 }, tensor{ 1 }, 32, data_types::f16, format::b_fs_zyx_fsv16, data_types::f16, format::gs_oizyx_gsv16, data_types::f16, format::bfzyx -#define CASE_DECONV_FP16_3D_9 { 16, 16, 4, 5, 3 }, { 16, 32, 7, 9, 5 }, { 1, 1, 1, 1, 1 }, tensor{ 1, 1, 2, 2, 2 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::f16, format::bs_fs_zyx_bsv16_fsv16, data_types::f16, format::is_os_zyx_isv16_osv16, data_types::f16, format::bfzyx - -#define CASE_DECONV_S8S8_3D_1 { 1, 15, 4, 5, 3 }, { 1, 30, 6, 7, 5 }, { 1, 1, 3, 3, 3 }, tensor{ 1 
}, tensor{ 0 }, tensor{ 1 }, 1, data_types::i8, format::bfzyx, data_types::i8, format::oizyx, data_types::f32, format::bfzyx -#define CASE_DECONV_S8S8_3D_2 { 1, 16, 4, 5, 3 }, { 1, 32, 6, 7, 5 }, { 1, 1, 3, 3, 3 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::i8, format::b_fs_zyx_fsv16, data_types::i8, format::oizyx, data_types::f32, format::bfzyx -#define CASE_DECONV_S8S8_3D_3 { 1, 16, 4, 5, 3 }, { 1, 32, 4, 5, 3 }, { 1, 1, 1, 1, 1 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::i8, format::b_fs_zyx_fsv16, data_types::i8, format::oizyx, data_types::f32, format::bfzyx -#define CASE_DECONV_S8S8_3D_4 { 1, 32, 4, 5, 3 }, { 1, 32, 4, 5, 3 }, { 1, 1, 3, 3, 3 }, tensor{ 1 }, tensor{ { 0, 0, 1, 1, 1 }, 0 }, tensor{ 1 }, 32, data_types::i8, format::b_fs_zyx_fsv16, data_types::i8, format::goizyx, data_types::f32, format::bfzyx -#define CASE_DECONV_S8S8_3D_5 { 1, 15, 4, 5, 3 }, { 1, 30, 9, 11, 7 }, { 1, 1, 3, 3, 3 }, tensor{ 1, 1, 2, 2, 2 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::i8, format::bfzyx, data_types::i8, format::oizyx, data_types::f32, format::bfzyx -#define CASE_DECONV_S8S8_3D_6 { 1, 16, 4, 5, 3 }, { 1, 32, 9, 11, 7 }, { 1, 1, 3, 3, 3 }, tensor{ 1, 1, 2, 2, 2 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::i8, format::b_fs_zyx_fsv16, data_types::i8, format::oizyx, data_types::f32, format::bfzyx -#define CASE_DECONV_S8S8_3D_7 { 1, 16, 4, 5, 3 }, { 1, 32, 7, 9, 5 }, { 1, 1, 1, 1, 1 }, tensor{ 1, 1, 2, 2, 2 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::i8, format::b_fs_zyx_fsv16, data_types::i8, format::oizyx, data_types::f32, format::bfzyx -#define CASE_DECONV_S8S8_3D_8 { 1, 32, 4, 5, 3 }, { 1, 32, 7, 9, 5 }, { 1, 1, 3, 3, 3 }, tensor{ 1, 1, 2, 2, 2 }, tensor{ { 0, 0, 1, 1, 1 }, 0 }, tensor{ 1 }, 32, data_types::i8, format::b_fs_zyx_fsv16, data_types::i8, format::goizyx, data_types::f32, format::bfzyx - -#define CASE_DECONV_U8S8_3D_1 { 1, 15, 4, 5, 3 }, { 1, 30, 6, 7, 5 }, { 1, 1, 3, 3, 3 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::u8, 
format::bfzyx, data_types::i8, format::oizyx, data_types::f32, format::bfzyx -#define CASE_DECONV_U8S8_3D_2 { 1, 16, 4, 5, 3 }, { 1, 32, 6, 7, 5 }, { 1, 1, 3, 3, 3 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::u8, format::b_fs_zyx_fsv16, data_types::i8, format::oizyx, data_types::f32, format::bfzyx -#define CASE_DECONV_U8S8_3D_3 { 1, 16, 4, 5, 3 }, { 1, 32, 4, 5, 3 }, { 1, 1, 1, 1, 1 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::u8, format::b_fs_zyx_fsv16, data_types::i8, format::oizyx, data_types::f32, format::bfzyx -#define CASE_DECONV_U8S8_3D_4 { 1, 32, 4, 5, 3 }, { 1, 32, 4, 5, 3 }, { 1, 1, 3, 3, 3 }, tensor{ 1 }, tensor{ { 0, 0, 1, 1, 1 }, 0 }, tensor{ 1 }, 32, data_types::u8, format::b_fs_zyx_fsv16, data_types::i8, format::goizyx, data_types::f32, format::bfzyx -#define CASE_DECONV_U8S8_3D_5 { 1, 15, 4, 5, 3 }, { 1, 30, 9, 11, 7 }, { 1, 1, 3, 3, 3 }, tensor{ 1, 1, 2, 2, 2 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::u8, format::bfzyx, data_types::i8, format::oizyx, data_types::f32, format::bfzyx -#define CASE_DECONV_U8S8_3D_6 { 1, 16, 4, 5, 3 }, { 1, 32, 9, 11, 7 }, { 1, 1, 3, 3, 3 }, tensor{ 1, 1, 2, 2, 2 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::u8, format::b_fs_zyx_fsv16, data_types::i8, format::oizyx, data_types::f32, format::bfzyx -#define CASE_DECONV_U8S8_3D_7 { 1, 16, 4, 5, 3 }, { 1, 32, 7, 9, 5 }, { 1, 1, 1, 1, 1 }, tensor{ 1, 1, 2, 2, 2 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::u8, format::b_fs_zyx_fsv16, data_types::i8, format::oizyx, data_types::f32, format::bfzyx -#define CASE_DECONV_U8S8_3D_8 { 1, 32, 4, 5, 3 }, { 1, 32, 7, 9, 5 }, { 1, 1, 3, 3, 3 }, tensor{ 1, 1, 2, 2, 2 }, tensor{ { 0, 0, 1, 1, 1 }, 0 }, tensor{ 1 }, 32, data_types::u8, format::b_fs_zyx_fsv16, data_types::i8, format::goizyx, data_types::f32, format::bfzyx - -#define CASE_DECONV_ELTW_FP32_1 { 1, 16, 4, 5 }, { 1, 32, 6, 7 }, { 1, 32, 1, 1 }, { 1, 1, 3, 3 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::f32, format::b_fs_yx_fsv16, data_types::f32, 
format::oiyx, data_types::f32, format::bfyx -#define CASE_DECONV_ELTW_FP32_2 { 1, 16, 4, 5 }, { 1, 32, 6, 7 }, { 1, 1, 1, 1 }, { 1, 1, 3, 3 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::f32, format::b_fs_yx_fsv16, data_types::f32, format::os_is_yx_isv16_osv16, data_types::f32, format::bfyx -#define CASE_DECONV_ELTW_FP32_3 { 1, 16, 4, 5 }, { 1, 32, 4, 5 }, { 1, 1, 1, 1 }, { 1, 1, 1, 1 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::f32, format::b_fs_yx_fsv16, data_types::f32, format::is_os_yx_isv16_osv16, data_types::f32, format::bfyx -#define CASE_DECONV_ELTW_FP32_4 { 1, 15, 4, 5, 3 }, { 1, 30, 6, 7, 5 }, { 1, 1, 6, 7, 5 }, { 1, 1, 3, 3, 3 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::f32, format::bfzyx, data_types::f32, format::oizyx, data_types::f32, format::bfzyx -#define CASE_DECONV_ELTW_FP32_5 { 1, 15, 4, 5, 4 }, { 1, 30, 6, 7, 6 }, { 1, 30, 6, 1, 6 }, { 1, 1, 3, 3, 3 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::f32, format::bfzyx, data_types::f32, format::oizyx, data_types::f32, format::bfzyx -#define CASE_DECONV_ELTW_FP32_6 { 1, 32, 2, 2, 2 }, { 1, 16, 4, 4, 4 }, { 1, 16, 1, 4, 1 }, { 1, 1, 3, 3, 3 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::f32, format::b_fs_zyx_fsv16, data_types::f32, format::os_is_zyx_isv16_osv16, data_types::f32, format::bfzyx -#define CASE_DECONV_ELTW_FP32_7 { 1, 16, 3, 5 }, { 1, 32, 5, 7 }, { 1, 32, 1, 7 }, { 1, 1, 3, 3 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::f32, format::b_fs_yx_fsv16, data_types::f32, format::os_is_yx_isv16_osv16, data_types::f32, format::bfyx -#define CASE_DECONV_ELTW_FP32_8 { 1, 32, 4, 5 }, { 1, 32, 7, 9 }, { 1, 32, 1, 1 }, { 1, 1, 3, 3 }, tensor{ 1, 1, 2, 2 }, tensor{ { 0, 0, 1, 1, 0, 0 }, 0 }, tensor{ 1 }, 32, data_types::f32, format::b_fs_yx_fsv16, data_types::f32, format::gs_oiyx_gsv16, data_types::f32, format::bfyx - -#define CASE_DECONV_ELTW_i8_1 { 1, 16, 3, 5 }, { 1, 32, 5, 7 }, { 1, 32, 5, 1 }, { 1, 1, 3, 3 }, tensor{ 1 }, tensor{ 0 }, 
tensor{ 1 }, 1, data_types::i8, format::b_fs_yx_fsv16, data_types::i8, format::os_is_yx_osv16_isv16, data_types::f32, format::bfyx -#define CASE_DECONV_ELTW_i8_2 { 1, 32, 4, 5, 3 }, { 1, 32, 6, 7, 5 }, { 1, 32, 1, 1, 1 }, { 1, 1, 3, 3, 3 }, tensor{ 1, 1, 2, 2, 2 }, tensor{ { 0, 0, 1, 1, 1 }, 0 }, tensor{ 1 }, 32, data_types::u8, format::b_fs_zyx_fsv16, data_types::i8, format::goizyx, data_types::f32, format::bfzyx -#define CASE_DECONV_ELTW_i8_3 { 1, 5, 5, 5, 5 }, { 1, 5, 5, 5, 5 }, { 1, 1, 1, 1, 1 }, { 1, 1, 1, 1, 1 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::i8, format::bfzyx, data_types::i8, format::oiyx, data_types::f32, format::bfzyx -#define CASE_DECONV_ELTW_i8_4 { 1, 16, 1, 4 }, { 1, 16, 1, 6 }, { 1, 16, 1, 1 }, { 1, 1, 3, 3 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::i8, format::b_fs_yx_fsv16, data_types::i8, format::os_is_yx_osv16_isv16, data_types::f32, format::bfyx -#define CASE_DECONV_ELTW_i8_5 { 1, 16, 2, 4 }, { 1, 16, 4, 6 }, { 1, 16, 4, 1 }, { 1, 1, 3, 3 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::i8, format::b_fs_yx_fsv16, data_types::i8, format::os_is_yx_osv16_isv16, data_types::f32, format::bfyx - - -class DeconvolutionFusingTest : public ::WeightsPrimitiveFusingTest { -public: - void execute(deconv_test_params& p) { - auto input_prim = get_mem(get_input_layout(p)); - network network_not_fused(this->engine, this->topology_non_fused, bo_not_fused); - network network_fused(this->engine, this->topology_fused, bo_fused); - network_fused.set_input_data("input", input_prim); - network_not_fused.set_input_data("input", input_prim); - - compare(network_not_fused, network_fused, p); - auto find_conv = [](primitive_info& p) -> bool { - if (p.original_id == "deconv") - return true; - return false; - }; - - auto pi_fused = network_fused.get_primitives_info(); - auto info_fused = std::find_if(pi_fused.begin(), pi_fused.end(), find_conv); - if (info_fused != pi_fused.end()) - std::cout << "kernel: " << info_fused->kernel_id 
<< std::endl; - } -}; - -class deconv_actv : public DeconvolutionFusingTest {}; -TEST_P(deconv_actv, basic) { - auto p = GetParam(); - create_topologies( - input_layout("input", get_input_layout(p)), - data("weights", get_mem(get_weights_layout(p))), - deconvolution("deconv", "input", { "weights" }, p.groups, p.stride, p.pad), - activation("act", "deconv", activation_func::relu), - reorder("out", "act", p.default_format, data_types::f32) - ); - // Need much higher tolerance because of deconvolution -> convolution optimization - tolerance = 1.f; - execute(p); -} - -INSTANTIATE_TEST_SUITE_P(fusings_gpu, deconv_actv, ::testing::ValuesIn(std::vector{ - deconv_test_params{ CASE_DECONV_FP32_1, 2, 3 }, - deconv_test_params{ CASE_DECONV_FP32_2, 2, 3 }, - deconv_test_params{ CASE_DECONV_FP32_3, 2, 3 }, - deconv_test_params{ CASE_DECONV_FP32_4, 2, 3 }, - deconv_test_params{ CASE_DECONV_FP32_5, 2, 3 }, - deconv_test_params{ CASE_DECONV_FP32_6, 2, 3 }, - deconv_test_params{ CASE_DECONV_FP32_7, 2, 3 }, - deconv_test_params{ CASE_DECONV_FP32_8, 2, 3 }, - - deconv_test_params{ CASE_DECONV_FP16_1, 2, 3 }, - deconv_test_params{ CASE_DECONV_FP16_2, 2, 3 }, - deconv_test_params{ CASE_DECONV_FP16_3, 2, 3 }, - deconv_test_params{ CASE_DECONV_FP16_4, 2, 3 }, - deconv_test_params{ CASE_DECONV_FP16_5, 2, 3 }, - deconv_test_params{ CASE_DECONV_FP16_6, 2, 3 }, - deconv_test_params{ CASE_DECONV_FP16_7, 2, 3 }, - deconv_test_params{ CASE_DECONV_FP16_8, 2, 3 }, - - deconv_test_params{ CASE_DECONV_U8S8_1, 2, 3 }, - deconv_test_params{ CASE_DECONV_U8S8_2, 2, 3 }, - deconv_test_params{ CASE_DECONV_U8S8_3, 2, 3 }, - // Here and below this test case and CASE_DECONV_S8S8_4 are commented because they fail for z_pad=0 which is unexpected - // deconv_test_params{ CASE_DECONV_U8S8_4, 2, 3 }, - deconv_test_params{ CASE_DECONV_U8S8_5, 2, 3 }, - deconv_test_params{ CASE_DECONV_U8S8_6, 2, 3 }, - deconv_test_params{ CASE_DECONV_U8S8_7, 2, 3 }, - deconv_test_params{ CASE_DECONV_U8S8_8, 2, 3 }, - - 
deconv_test_params{ CASE_DECONV_S8S8_1, 2, 3 }, - deconv_test_params{ CASE_DECONV_S8S8_2, 2, 3 }, - deconv_test_params{ CASE_DECONV_S8S8_3, 2, 3 }, - // deconv_test_params{ CASE_DECONV_S8S8_4, 2, 3 }, - deconv_test_params{ CASE_DECONV_S8S8_5, 2, 3 }, - deconv_test_params{ CASE_DECONV_S8S8_6, 2, 3 }, - deconv_test_params{ CASE_DECONV_S8S8_7, 2, 3 }, - deconv_test_params{ CASE_DECONV_S8S8_8, 2, 3 }, - - deconv_test_params{ CASE_DECONV_FP32_3D_1, 2, 3 }, - deconv_test_params{ CASE_DECONV_FP32_3D_2, 2, 3 }, - deconv_test_params{ CASE_DECONV_FP32_3D_3, 2, 3 }, - deconv_test_params{ CASE_DECONV_FP32_3D_4, 2, 3 }, - deconv_test_params{ CASE_DECONV_FP32_3D_5, 2, 3 }, - deconv_test_params{ CASE_DECONV_FP32_3D_6, 2, 3 }, - deconv_test_params{ CASE_DECONV_FP32_3D_7, 2, 3 }, - deconv_test_params{ CASE_DECONV_FP32_3D_8, 2, 3 }, - deconv_test_params{ CASE_DECONV_FP32_3D_9, 2, 3 }, - - deconv_test_params{ CASE_DECONV_FP16_3D_1, 2, 3 }, - deconv_test_params{ CASE_DECONV_FP16_3D_2, 2, 3 }, - deconv_test_params{ CASE_DECONV_FP16_3D_3, 2, 3 }, - deconv_test_params{ CASE_DECONV_FP16_3D_4, 2, 3 }, - deconv_test_params{ CASE_DECONV_FP16_3D_5, 2, 3 }, - deconv_test_params{ CASE_DECONV_FP16_3D_6, 2, 3 }, - deconv_test_params{ CASE_DECONV_FP16_3D_7, 2, 3 }, - deconv_test_params{ CASE_DECONV_FP16_3D_8, 2, 3 }, - deconv_test_params{ CASE_DECONV_FP16_3D_9, 2, 3 }, - - deconv_test_params{ CASE_DECONV_U8S8_3D_1, 2, 3 }, - deconv_test_params{ CASE_DECONV_U8S8_3D_2, 2, 3 }, - deconv_test_params{ CASE_DECONV_U8S8_3D_3, 2, 3 }, - deconv_test_params{ CASE_DECONV_U8S8_3D_4, 2, 3 }, - deconv_test_params{ CASE_DECONV_U8S8_3D_5, 2, 3 }, - deconv_test_params{ CASE_DECONV_U8S8_3D_6, 2, 3 }, - deconv_test_params{ CASE_DECONV_U8S8_3D_7, 2, 3 }, - deconv_test_params{ CASE_DECONV_U8S8_3D_8, 2, 3 }, - - deconv_test_params{ CASE_DECONV_S8S8_3D_1, 2, 3 }, - deconv_test_params{ CASE_DECONV_S8S8_3D_2, 2, 3 }, - deconv_test_params{ CASE_DECONV_S8S8_3D_3, 2, 3 }, - deconv_test_params{ CASE_DECONV_S8S8_3D_4, 2, 3 }, 
- deconv_test_params{ CASE_DECONV_S8S8_3D_5, 2, 3 }, - deconv_test_params{ CASE_DECONV_S8S8_3D_6, 2, 3 }, - deconv_test_params{ CASE_DECONV_S8S8_3D_7, 2, 3 }, - deconv_test_params{ CASE_DECONV_S8S8_3D_8, 2, 3 }, -})); - - -class deconv_bias : public DeconvolutionFusingTest {}; -TEST_P(deconv_bias, basic) { - auto p = GetParam(); - create_topologies( - input_layout("input", get_input_layout(p)), - data("weights", get_mem(get_weights_layout(p))), - data("bias", get_mem(get_bias_layout(p))), - deconvolution("deconv", "input", { "weights" }, p.groups, p.stride, p.pad), - eltwise("bias_add", { "deconv", "bias" }, eltwise_mode::sum), - reorder("out", "bias_add", p.default_format, data_types::f32) - ); - - // Need much higher tolerance because of deconvolution -> convolution optimization - tolerance = 1.f; - execute(p); -} - -INSTANTIATE_TEST_SUITE_P(fusings_gpu, deconv_bias, ::testing::ValuesIn(std::vector{ - deconv_test_params{ CASE_DECONV_FP32_1, 2, 3 }, - deconv_test_params{ CASE_DECONV_FP32_2, 2, 3 }, - deconv_test_params{ CASE_DECONV_FP32_3, 2, 3 }, - - deconv_test_params{ CASE_DECONV_FP16_1, 2, 3 }, - deconv_test_params{ CASE_DECONV_FP16_2, 2, 3 }, - deconv_test_params{ CASE_DECONV_FP16_3, 2, 3 }, - - deconv_test_params{ CASE_DECONV_U8S8_1, 2, 3 }, - deconv_test_params{ CASE_DECONV_U8S8_2, 2, 3 }, - deconv_test_params{ CASE_DECONV_U8S8_3, 2, 3 }, - - deconv_test_params{ CASE_DECONV_S8S8_1, 2, 3 }, - deconv_test_params{ CASE_DECONV_S8S8_2, 2, 3 }, - deconv_test_params{ CASE_DECONV_S8S8_3, 2, 3 }, - - deconv_test_params{ CASE_DECONV_FP32_3D_1, 2, 3 }, - deconv_test_params{ CASE_DECONV_FP32_3D_2, 2, 3 }, - deconv_test_params{ CASE_DECONV_FP32_3D_3, 2, 3 }, - - deconv_test_params{ CASE_DECONV_FP16_3D_1, 2, 3 }, - deconv_test_params{ CASE_DECONV_FP16_3D_2, 2, 3 }, - deconv_test_params{ CASE_DECONV_FP16_3D_3, 2, 3 }, - - deconv_test_params{ CASE_DECONV_U8S8_3D_1, 2, 3 }, - deconv_test_params{ CASE_DECONV_U8S8_3D_2, 2, 3 }, - deconv_test_params{ CASE_DECONV_U8S8_3D_3, 2, 
3 }, - - deconv_test_params{ CASE_DECONV_S8S8_3D_1, 2, 3 }, - deconv_test_params{ CASE_DECONV_S8S8_3D_2, 2, 3 }, - deconv_test_params{ CASE_DECONV_S8S8_3D_3, 2, 3 }, -})); - -class deconv_scale : public DeconvolutionFusingTest {}; -TEST_P(deconv_scale, basic) { - auto p = GetParam(); - create_topologies( - input_layout("input", get_input_layout(p)), - data("weights", get_mem(get_weights_layout(p))), - data("scale_data", get_mem(get_per_channel_layout(p), 1.0f/p.kernel.count())), - deconvolution("deconv", "input", { "weights" }, p.groups, p.stride, p.pad), - scale("scale", "deconv", "scale_data"), - reorder("out", "scale", p.default_format, data_types::f32) - ); - - tolerance = 1e-5f; - execute(p); -} - -TEST_P(deconv_scale, fp16_scale_out) { - auto p = GetParam(); - create_topologies( - input_layout("input", get_input_layout(p)), - data("weights", get_mem(get_weights_layout(p))), - data("scale_data", get_mem(get_per_channel_layout(p), 1.0f/p.kernel.count())), - deconvolution("deconv", "input", { "weights" }, p.groups, p.stride, p.pad), - scale("scale", "deconv", "scale_data", optional_data_type{ data_types::f16 }), - reorder("out", "scale", p.default_format, data_types::f32) - ); - - tolerance = 1e-5f; - execute(p); -} - -INSTANTIATE_TEST_SUITE_P(fusings_gpu, deconv_scale, ::testing::ValuesIn(std::vector{ - deconv_test_params{ CASE_DECONV_U8S8_1, 2, 3 }, - deconv_test_params{ CASE_DECONV_U8S8_2, 2, 3 }, - deconv_test_params{ CASE_DECONV_U8S8_3, 2, 3 }, - // deconv_test_params{ CASE_DECONV_U8S8_4, 2, 3 }, - deconv_test_params{ CASE_DECONV_U8S8_5, 2, 3 }, - deconv_test_params{ CASE_DECONV_U8S8_6, 2, 3 }, - deconv_test_params{ CASE_DECONV_U8S8_7, 2, 3 }, - deconv_test_params{ CASE_DECONV_U8S8_8, 2, 3 }, - - deconv_test_params{ CASE_DECONV_S8S8_1, 2, 3 }, - deconv_test_params{ CASE_DECONV_S8S8_2, 2, 3 }, - deconv_test_params{ CASE_DECONV_S8S8_3, 2, 3 }, - // deconv_test_params{ CASE_DECONV_S8S8_4, 2, 3 }, - deconv_test_params{ CASE_DECONV_S8S8_5, 2, 3 }, - 
deconv_test_params{ CASE_DECONV_S8S8_6, 2, 3 }, - deconv_test_params{ CASE_DECONV_S8S8_7, 2, 3 }, - deconv_test_params{ CASE_DECONV_S8S8_8, 2, 3 }, - - deconv_test_params{ CASE_DECONV_U8S8_3D_1, 2, 3 }, - deconv_test_params{ CASE_DECONV_U8S8_3D_2, 2, 3 }, - deconv_test_params{ CASE_DECONV_U8S8_3D_3, 2, 3 }, - deconv_test_params{ CASE_DECONV_U8S8_3D_4, 2, 3 }, - deconv_test_params{ CASE_DECONV_U8S8_3D_5, 2, 3 }, - deconv_test_params{ CASE_DECONV_U8S8_3D_6, 2, 3 }, - deconv_test_params{ CASE_DECONV_U8S8_3D_7, 2, 3 }, - deconv_test_params{ CASE_DECONV_U8S8_3D_8, 2, 3 }, - - deconv_test_params{ CASE_DECONV_S8S8_3D_1, 2, 3 }, - deconv_test_params{ CASE_DECONV_S8S8_3D_2, 2, 3 }, - deconv_test_params{ CASE_DECONV_S8S8_3D_3, 2, 3 }, - deconv_test_params{ CASE_DECONV_S8S8_3D_4, 2, 3 }, - deconv_test_params{ CASE_DECONV_S8S8_3D_5, 2, 3 }, - deconv_test_params{ CASE_DECONV_S8S8_3D_6, 2, 3 }, - deconv_test_params{ CASE_DECONV_S8S8_3D_7, 2, 3 }, - deconv_test_params{ CASE_DECONV_S8S8_3D_8, 2, 3 }, -})); - -class deconv_actv_eltw_actv : public DeconvolutionFusingTest {}; -TEST_P(deconv_actv_eltw_actv, basic) { - auto p = GetParam(); - create_topologies( - input_layout("input", get_input_layout(p)), - data("weights", get_mem(get_weights_layout(p))), - data("eltw_data", get_mem(get_output_layout(p))), - deconvolution("deconv", "input", { "weights" }, p.groups, p.stride, p.pad), - activation("act1", "deconv", activation_func::relu), - eltwise("eltw", { "act1", "eltw_data" }, eltwise_mode::sum), - activation("act2", "eltw", activation_func::relu), - reorder("out", "act2", p.default_format, data_types::f32) - ); - - // Need much higher tolerance because of deconvolution -> convolution optimization - tolerance = 1.f; - execute(p); -} - -INSTANTIATE_TEST_SUITE_P(fusings_gpu, deconv_actv_eltw_actv, ::testing::ValuesIn(std::vector{ - // Some fusings disabled under deconvolution -> convolution optimization - deconv_test_params{ CASE_DECONV_FP32_1, 3, 5 }, - deconv_test_params{ 
CASE_DECONV_FP32_2, 2, 5 }, - deconv_test_params{ CASE_DECONV_FP32_3, 2, 5 }, - deconv_test_params{ CASE_DECONV_FP32_4, 2, 5 }, - deconv_test_params{ CASE_DECONV_FP32_5, 2, 5 }, - deconv_test_params{ CASE_DECONV_FP32_6, 2, 5 }, - deconv_test_params{ CASE_DECONV_FP32_7, 2, 5 }, - deconv_test_params{ CASE_DECONV_FP32_8, 2, 5 }, - - deconv_test_params{ CASE_DECONV_FP16_1, 3, 5 }, - deconv_test_params{ CASE_DECONV_FP16_2, 2, 5 }, - deconv_test_params{ CASE_DECONV_FP16_3, 2, 5 }, - deconv_test_params{ CASE_DECONV_FP16_4, 2, 5 }, - deconv_test_params{ CASE_DECONV_FP16_5, 2, 5 }, - deconv_test_params{ CASE_DECONV_FP16_6, 2, 5 }, - deconv_test_params{ CASE_DECONV_FP16_7, 2, 5 }, - deconv_test_params{ CASE_DECONV_FP16_8, 2, 5 }, - - deconv_test_params{ CASE_DECONV_U8S8_1, 2, 5 }, - deconv_test_params{ CASE_DECONV_U8S8_2, 2, 5 }, - deconv_test_params{ CASE_DECONV_U8S8_3, 2, 5 }, - // deconv_test_params{ CASE_DECONV_U8S8_4, 2, 5 }, - deconv_test_params{ CASE_DECONV_U8S8_5, 2, 5 }, - deconv_test_params{ CASE_DECONV_U8S8_6, 2, 5 }, - deconv_test_params{ CASE_DECONV_U8S8_7, 2, 5 }, - deconv_test_params{ CASE_DECONV_U8S8_8, 2, 5 }, - - deconv_test_params{ CASE_DECONV_S8S8_1, 2, 5 }, - deconv_test_params{ CASE_DECONV_S8S8_2, 2, 5 }, - deconv_test_params{ CASE_DECONV_S8S8_3, 2, 5 }, - // deconv_test_params{ CASE_DECONV_S8S8_4, 2, 5 }, - deconv_test_params{ CASE_DECONV_S8S8_5, 2, 5 }, - deconv_test_params{ CASE_DECONV_S8S8_6, 2, 5 }, - deconv_test_params{ CASE_DECONV_S8S8_7, 2, 5 }, - deconv_test_params{ CASE_DECONV_S8S8_8, 2, 5 }, - - deconv_test_params{ CASE_DECONV_FP32_3D_1, 2, 5 }, - deconv_test_params{ CASE_DECONV_FP32_3D_2, 2, 5 }, - deconv_test_params{ CASE_DECONV_FP32_3D_3, 2, 5 }, - deconv_test_params{ CASE_DECONV_FP32_3D_4, 2, 5 }, - deconv_test_params{ CASE_DECONV_FP32_3D_5, 2, 5 }, - deconv_test_params{ CASE_DECONV_FP32_3D_6, 2, 5 }, - deconv_test_params{ CASE_DECONV_FP32_3D_7, 2, 5 }, - deconv_test_params{ CASE_DECONV_FP32_3D_8, 2, 5 }, - deconv_test_params{ 
CASE_DECONV_FP32_3D_9, 2, 5 }, - - deconv_test_params{ CASE_DECONV_FP16_3D_1, 2, 5 }, - deconv_test_params{ CASE_DECONV_FP16_3D_2, 2, 5 }, - deconv_test_params{ CASE_DECONV_FP16_3D_3, 2, 5 }, - deconv_test_params{ CASE_DECONV_FP16_3D_4, 2, 5 }, - deconv_test_params{ CASE_DECONV_FP16_3D_5, 2, 5 }, - deconv_test_params{ CASE_DECONV_FP16_3D_6, 2, 5 }, - deconv_test_params{ CASE_DECONV_FP16_3D_7, 2, 5 }, - deconv_test_params{ CASE_DECONV_FP16_3D_8, 2, 5 }, - deconv_test_params{ CASE_DECONV_FP16_3D_9, 2, 5 }, - - deconv_test_params{ CASE_DECONV_U8S8_3D_1, 2, 5 }, - deconv_test_params{ CASE_DECONV_U8S8_3D_2, 2, 5 }, - deconv_test_params{ CASE_DECONV_U8S8_3D_3, 2, 5 }, - deconv_test_params{ CASE_DECONV_U8S8_3D_4, 2, 5 }, - deconv_test_params{ CASE_DECONV_U8S8_3D_5, 2, 5 }, - deconv_test_params{ CASE_DECONV_U8S8_3D_6, 2, 5 }, - deconv_test_params{ CASE_DECONV_U8S8_3D_7, 2, 5 }, - deconv_test_params{ CASE_DECONV_U8S8_3D_8, 2, 5 }, - - deconv_test_params{ CASE_DECONV_S8S8_3D_1, 2, 5 }, - deconv_test_params{ CASE_DECONV_S8S8_3D_2, 2, 5 }, - deconv_test_params{ CASE_DECONV_S8S8_3D_3, 2, 5 }, - deconv_test_params{ CASE_DECONV_S8S8_3D_4, 2, 5 }, - deconv_test_params{ CASE_DECONV_S8S8_3D_5, 2, 5 }, - deconv_test_params{ CASE_DECONV_S8S8_3D_6, 2, 5 }, - deconv_test_params{ CASE_DECONV_S8S8_3D_7, 2, 5 }, - deconv_test_params{ CASE_DECONV_S8S8_3D_8, 2, 5 }, -})); - -class deconv_scale_actv_quant_i8 : public DeconvolutionFusingTest {}; -TEST_P(deconv_scale_actv_quant_i8, basic) { - auto p = GetParam(); - create_topologies( - input_layout("input", get_input_layout(p)), - data("weights", get_mem(get_weights_layout(p))), - data("scale_data", get_mem(get_per_channel_layout(p), 1.f/p.kernel.count())), - data("in_lo", get_mem(get_per_channel_layout(p), min_random, 0)), - data("in_hi", get_mem(get_per_channel_layout(p), 1, max_random)), - data("out_lo", get_mem(get_single_element_layout(p), -127)), - data("out_hi", get_mem(get_single_element_layout(p), 127)), - deconvolution("deconv", 
"input", { "weights" }, p.groups, p.stride, p.pad), - scale("scale", "deconv", "scale_data"), - activation("actv", "scale", activation_func::softsign), - quantize("quant", "actv", "in_lo", "in_hi", "out_lo", "out_hi", 255, data_types::i8), - reorder("out", "quant", p.default_format, data_types::f32) - ); - - tolerance = 1.f; - execute(p); -} - -INSTANTIATE_TEST_SUITE_P(fusings_gpu, deconv_scale_actv_quant_i8, ::testing::ValuesIn(std::vector{ - deconv_test_params{ CASE_DECONV_FP32_1, 2, 5 }, - deconv_test_params{ CASE_DECONV_FP32_2, 2, 5 }, - deconv_test_params{ CASE_DECONV_FP32_3, 2, 5 }, - deconv_test_params{ CASE_DECONV_FP32_4, 2, 5 }, - deconv_test_params{ CASE_DECONV_FP32_5, 2, 5 }, - deconv_test_params{ CASE_DECONV_FP32_6, 2, 5 }, - deconv_test_params{ CASE_DECONV_FP32_7, 2, 5 }, - deconv_test_params{ CASE_DECONV_FP32_8, 2, 5 }, - - deconv_test_params{ CASE_DECONV_FP16_1, 2, 5 }, - deconv_test_params{ CASE_DECONV_FP16_2, 2, 5 }, - deconv_test_params{ CASE_DECONV_FP16_3, 2, 5 }, - deconv_test_params{ CASE_DECONV_FP16_4, 2, 5 }, - deconv_test_params{ CASE_DECONV_FP16_5, 2, 5 }, - deconv_test_params{ CASE_DECONV_FP16_6, 2, 5 }, - deconv_test_params{ CASE_DECONV_FP16_7, 2, 5 }, - deconv_test_params{ CASE_DECONV_FP16_8, 2, 5 }, - - deconv_test_params{ CASE_DECONV_U8S8_1, 2, 5 }, - deconv_test_params{ CASE_DECONV_U8S8_2, 2, 5 }, - deconv_test_params{ CASE_DECONV_U8S8_3, 2, 5 }, - // deconv_test_params{ CASE_DECONV_U8S8_4, 2, 5 }, - deconv_test_params{ CASE_DECONV_U8S8_5, 2, 5 }, - deconv_test_params{ CASE_DECONV_U8S8_6, 2, 5 }, - deconv_test_params{ CASE_DECONV_U8S8_7, 2, 5 }, - deconv_test_params{ CASE_DECONV_U8S8_8, 2, 5 }, - - deconv_test_params{ CASE_DECONV_S8S8_1, 2, 5 }, - deconv_test_params{ CASE_DECONV_S8S8_2, 2, 5 }, - deconv_test_params{ CASE_DECONV_S8S8_3, 2, 5 }, - // deconv_test_params{ CASE_DECONV_S8S8_4, 2, 5 }, - deconv_test_params{ CASE_DECONV_S8S8_5, 2, 5 }, - deconv_test_params{ CASE_DECONV_S8S8_6, 2, 5 }, - deconv_test_params{ CASE_DECONV_S8S8_7, 
2, 5 }, - deconv_test_params{ CASE_DECONV_S8S8_8, 2, 5 }, - - deconv_test_params{ CASE_DECONV_FP32_3D_1, 2, 5 }, - deconv_test_params{ CASE_DECONV_FP32_3D_2, 2, 5 }, - deconv_test_params{ CASE_DECONV_FP32_3D_3, 2, 5 }, - deconv_test_params{ CASE_DECONV_FP32_3D_4, 2, 5 }, - deconv_test_params{ CASE_DECONV_FP32_3D_5, 2, 5 }, - deconv_test_params{ CASE_DECONV_FP32_3D_6, 2, 5 }, - deconv_test_params{ CASE_DECONV_FP32_3D_7, 2, 5 }, - deconv_test_params{ CASE_DECONV_FP32_3D_8, 2, 5 }, - // FIXME no quantize implementation for bs_fs_yx_bsv16_fsv16 format AND add_required_reorders pass completely ruins data types - // add_required_reorders pass tries to reorder everything to output type if no format exists, this ruins fp32 -> int8 quantize - //deconv_test_params{ CASE_DECONV_FP32_3D_9, 3, 5 }, - - deconv_test_params{ CASE_DECONV_FP16_3D_1, 2, 5 }, - deconv_test_params{ CASE_DECONV_FP16_3D_2, 2, 5 }, - deconv_test_params{ CASE_DECONV_FP16_3D_3, 2, 5 }, - deconv_test_params{ CASE_DECONV_FP16_3D_4, 2, 5 }, - deconv_test_params{ CASE_DECONV_FP16_3D_5, 2, 5 }, - deconv_test_params{ CASE_DECONV_FP16_3D_6, 2, 5 }, - deconv_test_params{ CASE_DECONV_FP16_3D_7, 2, 5 }, - deconv_test_params{ CASE_DECONV_FP16_3D_8, 2, 5 }, - //deconv_test_params{ CASE_DECONV_FP16_3D_9, 3, 5 }, - - deconv_test_params{ CASE_DECONV_U8S8_3D_1, 2, 5 }, - deconv_test_params{ CASE_DECONV_U8S8_3D_2, 2, 5 }, - deconv_test_params{ CASE_DECONV_U8S8_3D_3, 2, 5 }, - deconv_test_params{ CASE_DECONV_U8S8_3D_4, 2, 5 }, - deconv_test_params{ CASE_DECONV_U8S8_3D_5, 2, 5 }, - deconv_test_params{ CASE_DECONV_U8S8_3D_6, 2, 5 }, - deconv_test_params{ CASE_DECONV_U8S8_3D_7, 2, 5 }, - deconv_test_params{ CASE_DECONV_U8S8_3D_8, 2, 5 }, - - deconv_test_params{ CASE_DECONV_S8S8_3D_1, 2, 5 }, - deconv_test_params{ CASE_DECONV_S8S8_3D_2, 2, 5 }, - deconv_test_params{ CASE_DECONV_S8S8_3D_3, 2, 5 }, - deconv_test_params{ CASE_DECONV_S8S8_3D_4, 2, 5 }, - deconv_test_params{ CASE_DECONV_S8S8_3D_5, 2, 5 }, - deconv_test_params{ 
CASE_DECONV_S8S8_3D_6, 2, 5 }, - deconv_test_params{ CASE_DECONV_S8S8_3D_7, 2, 5 }, - deconv_test_params{ CASE_DECONV_S8S8_3D_8, 2, 5 }, -})); - -class deconv_scale_actv_quant_u8_eltw_scale_actv_quant_i8 : public DeconvolutionFusingTest {}; -TEST_P(deconv_scale_actv_quant_u8_eltw_scale_actv_quant_i8, basic) { - auto p = GetParam(); - create_topologies( - input_layout("input", get_input_layout(p)), - data("weights", get_mem(get_weights_layout(p))), - data("scale1_data", get_mem(get_per_channel_layout(p), 1.f / p.kernel.count())), - data("in1_lo", get_mem(get_per_channel_layout(p), 0)), - data("in1_hi", get_mem(get_per_channel_layout(p), 1, max_random)), - data("out1_lo", get_mem(get_single_element_layout(p), 0)), - data("out1_hi", get_mem(get_single_element_layout(p), 255)), - data("eltw_data", get_mem(layout(p.default_type, p.input_format, p.out_shape))), - data("scale2_data", get_mem(get_per_channel_layout(p), 1.f / p.kernel.count())), - data("in2_lo", get_mem(get_per_channel_layout(p), min_random, 0)), - data("in2_hi", get_mem(get_per_channel_layout(p), 1, max_random)), - data("out2_lo", get_mem(get_single_element_layout(p), -127)), - data("out2_hi", get_mem(get_single_element_layout(p), 127)), - deconvolution("deconv", "input", { "weights" }, p.groups, p.stride, p.pad), - scale("scale1", "deconv", "scale1_data"), - activation("actv1", "scale1", activation_func::relu), - quantize("quant1", "actv1", "in1_lo", "in1_hi", "out1_lo", "out1_hi", 256, data_types::u8), - eltwise("eltw", { "quant1", "eltw_data" }, eltwise_mode::sum, p.default_type), - scale("scale2", "eltw", "scale2_data"), - activation("actv2", "scale2", activation_func::relu), - quantize("quant2", "actv2", "in2_lo", "in2_hi", "out2_lo", "out2_hi", 255, data_types::i8), - reorder("out", "quant2", p.default_format, data_types::f32) - ); - - tolerance = 2.1f; - execute(p); -} - -INSTANTIATE_TEST_SUITE_P(fusings_gpu, deconv_scale_actv_quant_u8_eltw_scale_actv_quant_i8, ::testing::ValuesIn(std::vector{ - 
deconv_test_params{ CASE_DECONV_FP32_1, 2, 9 }, - deconv_test_params{ CASE_DECONV_FP32_2, 2, 9 }, - // deconv_test_params{ CASE_DECONV_FP32_3, 2, 9 }, - deconv_test_params{ CASE_DECONV_FP32_4, 2, 9 }, - deconv_test_params{ CASE_DECONV_FP32_5, 2, 9 }, - deconv_test_params{ CASE_DECONV_FP32_6, 2, 9 }, - deconv_test_params{ CASE_DECONV_FP32_7, 2, 9 }, - deconv_test_params{ CASE_DECONV_FP32_8, 2, 9 }, - - deconv_test_params{ CASE_DECONV_FP16_1, 2, 9 }, - deconv_test_params{ CASE_DECONV_FP16_2, 2, 9 }, - deconv_test_params{ CASE_DECONV_FP16_3, 2, 9 }, - deconv_test_params{ CASE_DECONV_FP16_4, 2, 9 }, - deconv_test_params{ CASE_DECONV_FP16_5, 2, 9 }, - deconv_test_params{ CASE_DECONV_FP16_6, 2, 9 }, - deconv_test_params{ CASE_DECONV_FP16_7, 2, 9 }, - deconv_test_params{ CASE_DECONV_FP16_8, 2, 9 }, - - deconv_test_params{ CASE_DECONV_U8S8_1, 2, 9 }, - deconv_test_params{ CASE_DECONV_U8S8_2, 2, 9 }, - deconv_test_params{ CASE_DECONV_U8S8_3, 2, 9 }, - // deconv_test_params{ CASE_DECONV_U8S8_4, 2, 9 }, - deconv_test_params{ CASE_DECONV_U8S8_5, 2, 9 }, - deconv_test_params{ CASE_DECONV_U8S8_6, 2, 9 }, - deconv_test_params{ CASE_DECONV_U8S8_7, 2, 9 }, - deconv_test_params{ CASE_DECONV_U8S8_8, 2, 9 }, - - deconv_test_params{ CASE_DECONV_S8S8_1, 2, 9 }, - deconv_test_params{ CASE_DECONV_S8S8_2, 2, 9 }, - deconv_test_params{ CASE_DECONV_S8S8_3, 2, 9 }, - // deconv_test_params{ CASE_DECONV_S8S8_4, 2, 9 }, - deconv_test_params{ CASE_DECONV_S8S8_5, 2, 9 }, - deconv_test_params{ CASE_DECONV_S8S8_6, 2, 9 }, - deconv_test_params{ CASE_DECONV_S8S8_7, 2, 9 }, - deconv_test_params{ CASE_DECONV_S8S8_8, 2, 9 }, - - deconv_test_params{ CASE_DECONV_FP32_3D_1, 2, 9 }, - // deconv_test_params{ CASE_DECONV_FP32_3D_2, 2, 9 }, - deconv_test_params{ CASE_DECONV_FP32_3D_3, 2, 9 }, - deconv_test_params{ CASE_DECONV_FP32_3D_4, 2, 9 }, - deconv_test_params{ CASE_DECONV_FP32_3D_5, 2, 9 }, - deconv_test_params{ CASE_DECONV_FP32_3D_6, 2, 9 }, - // deconv_test_params{ CASE_DECONV_FP32_3D_7, 2, 9 }, - 
deconv_test_params{ CASE_DECONV_FP32_3D_8, 2, 9 }, - // deconv_test_params{ CASE_DECONV_FP32_3D_9, 6, 9 }, - - deconv_test_params{ CASE_DECONV_FP16_3D_1, 2, 9 }, - deconv_test_params{ CASE_DECONV_FP16_3D_2, 2, 9 }, - deconv_test_params{ CASE_DECONV_FP16_3D_3, 2, 9 }, - deconv_test_params{ CASE_DECONV_FP16_3D_4, 2, 9 }, - deconv_test_params{ CASE_DECONV_FP16_3D_5, 2, 9 }, - deconv_test_params{ CASE_DECONV_FP16_3D_6, 2, 9 }, - deconv_test_params{ CASE_DECONV_FP16_3D_7, 2, 9 }, - deconv_test_params{ CASE_DECONV_FP16_3D_8, 2, 9 }, - // deconv_test_params{ CASE_DECONV_FP16_3D_9, 6, 9 }, - - deconv_test_params{ CASE_DECONV_U8S8_3D_1, 2, 9 }, - deconv_test_params{ CASE_DECONV_U8S8_3D_2, 2, 9 }, - deconv_test_params{ CASE_DECONV_U8S8_3D_3, 2, 9 }, - deconv_test_params{ CASE_DECONV_U8S8_3D_4, 2, 9 }, - deconv_test_params{ CASE_DECONV_U8S8_3D_5, 2, 9 }, - deconv_test_params{ CASE_DECONV_U8S8_3D_6, 2, 9 }, - deconv_test_params{ CASE_DECONV_U8S8_3D_7, 2, 9 }, - deconv_test_params{ CASE_DECONV_U8S8_3D_8, 2, 9 }, - - deconv_test_params{ CASE_DECONV_S8S8_3D_1, 2, 9 }, - deconv_test_params{ CASE_DECONV_S8S8_3D_2, 2, 9 }, - deconv_test_params{ CASE_DECONV_S8S8_3D_3, 2, 9 }, - deconv_test_params{ CASE_DECONV_S8S8_3D_4, 2, 9 }, - deconv_test_params{ CASE_DECONV_S8S8_3D_5, 2, 9 }, - deconv_test_params{ CASE_DECONV_S8S8_3D_6, 2, 9 }, - deconv_test_params{ CASE_DECONV_S8S8_3D_7, 2, 9 }, - deconv_test_params{ CASE_DECONV_S8S8_3D_8, 2, 9 }, -})); - -class deconv_scale_activation_quantize_i8_eltwise_quantize_u8 : public ConvEltwTest {}; -TEST_P(deconv_scale_activation_quantize_i8_eltwise_quantize_u8, basic) { - auto p = GetParam(); - create_topologies( - input_layout("input", get_input_layout(p)), - data("weights", get_mem(get_weights_layout(p))), - deconvolution("deconv_prim", "input", { "weights" }, p.groups, p.stride, p.pad), - data("scale_data", get_mem(get_per_channel_layout(p), 1.f / p.kernel.count())), - scale("scale", "deconv_prim", "scale_data"), - activation("activation", 
"scale", activation_func::relu), - data("in_low", get_mem(get_per_channel_layout(p), min_random, 0)), - data("in_high", get_mem(get_per_channel_layout(p), 1, max_random)), - data("out_low", get_mem(get_single_element_layout(p), -127)), - data("out_high", get_mem(get_single_element_layout(p), 127)), - quantize("quant", "activation", "in_low", "in_high", "out_low", "out_high", 255, data_types::i8), - data("eltwise_data", get_mem(layout{ p.data_type, p.input_format, p.eltw_shape })), - eltwise("eltw", { "quant", "eltwise_data" }, eltwise_mode::sum, p.default_type), - data("in_low2", get_mem(get_per_channel_layout(p), min_random, 0)), - data("in_high2", get_mem(get_per_channel_layout(p), 1, max_random)), - data("out_low2", get_mem(get_single_element_layout(p), 0)), - data("out_high2", get_mem(get_single_element_layout(p), 255)), - quantize("quant2", "eltw", "in_low2", "in_high2", "out_low2", "out_high2", 256, data_types::u8), - reorder("reorder_bfyx", "quant2", p.default_format, data_types::f32) - ); - - tolerance = 1.f; - execute(p); -} - -INSTANTIATE_TEST_SUITE_P(fusings_gpu, deconv_scale_activation_quantize_i8_eltwise_quantize_u8, ::testing::ValuesIn(std::vector{ - conv_eltw_test_params{ CASE_DECONV_ELTW_FP32_1, 2, 7 }, - conv_eltw_test_params{ CASE_DECONV_ELTW_FP32_2, 2, 7 }, - conv_eltw_test_params{ CASE_DECONV_ELTW_FP32_3, 2, 7 }, - conv_eltw_test_params{ CASE_DECONV_ELTW_FP32_4, 2, 7 }, - conv_eltw_test_params{ CASE_DECONV_ELTW_FP32_5, 2, 7 }, - conv_eltw_test_params{ CASE_DECONV_ELTW_FP32_6, 2, 7 }, - conv_eltw_test_params{ CASE_DECONV_ELTW_FP32_7, 2, 7 }, - conv_eltw_test_params{ CASE_DECONV_ELTW_FP32_8, 2, 7 }, - - conv_eltw_test_params{ CASE_DECONV_ELTW_i8_1, 2, 7 }, - conv_eltw_test_params{ CASE_DECONV_ELTW_i8_2, 2, 7 }, - conv_eltw_test_params{ CASE_DECONV_ELTW_i8_3, 2, 7 }, - conv_eltw_test_params{ CASE_DECONV_ELTW_i8_4, 2, 7 }, - conv_eltw_test_params{ CASE_DECONV_ELTW_i8_5, 2, 7 }, - -})); - -class deconv_activation_eltwise_diff_sizes : public 
ConvEltwTest {}; -TEST_P(deconv_activation_eltwise_diff_sizes, basic) { - auto p = GetParam(); - create_topologies( - input_layout("input", get_input_layout(p)), - data("weights", get_mem(get_weights_layout(p))), - data("eltwise_data", get_mem(layout{ p.data_type, p.input_format, p.eltw_shape })), - deconvolution("deconv_prim", "input", { "weights" }, p.groups, p.stride, p.pad), - activation("activation", "deconv_prim", activation_func::relu), - eltwise("sum", { "activation", "eltwise_data" }, eltwise_mode::sum, p.default_type), - reorder("reorder_bfyx", "sum", p.default_format, data_types::f32) - ); - - tolerance = 1e-5f; - execute(p); -} - -INSTANTIATE_TEST_SUITE_P(fusings_gpu, deconv_activation_eltwise_diff_sizes, ::testing::ValuesIn(std::vector{ - conv_eltw_test_params{ CASE_DECONV_ELTW_FP32_1, 2, 4 }, - conv_eltw_test_params{ CASE_DECONV_ELTW_FP32_2, 2, 4 }, - conv_eltw_test_params{ CASE_DECONV_ELTW_FP32_3, 2, 4 }, - conv_eltw_test_params{ CASE_DECONV_ELTW_FP32_4, 2, 4 }, - conv_eltw_test_params{ CASE_DECONV_ELTW_FP32_5, 2, 4 }, - conv_eltw_test_params{ CASE_DECONV_ELTW_FP32_6, 2, 4 }, - conv_eltw_test_params{ CASE_DECONV_ELTW_FP32_7, 2, 4 }, - conv_eltw_test_params{ CASE_DECONV_ELTW_FP32_8, 2, 4 }, - - conv_eltw_test_params{ CASE_DECONV_ELTW_i8_1, 2, 4 }, - conv_eltw_test_params{ CASE_DECONV_ELTW_i8_2, 2, 4 }, - conv_eltw_test_params{ CASE_DECONV_ELTW_i8_3, 2, 4 }, - conv_eltw_test_params{ CASE_DECONV_ELTW_i8_4, 2, 4 }, - conv_eltw_test_params{ CASE_DECONV_ELTW_i8_5, 2, 4 }, -})); - -#ifdef ENABLE_ONEDNN_FOR_GPU -/* ----------------------------------------------------------------------------------------------------- */ -/* --------------------------------------- Concat cases ------------------------------------------------ */ -/* ----------------------------------------------------------------------------------------------------- */ - -struct concat_test_params { - tensor in_shape; - data_types data_type; - format input_format; - data_types default_type; - 
format default_format; - size_t expected_fused_primitives; - size_t expected_not_fused_primitives; - std::string kernel_name; -}; - -#define CASE_CONCAT_F32_1 { 1, 8, 4, 4 }, data_types::f32, format::bfyx, data_types::f32, format::bfyx -#define CASE_CONCAT_F16_1 { 1, 8, 4, 4 }, data_types::f16, format::bfyx, data_types::f16, format::bfyx - -class ConcatOneDNNFusingTest : public ::BaseFusingTest { -public: - void execute(concat_test_params& p) { - // Onednn post operation has issue in a machine that does not support imad. - if (!engine.get_device_info().supports_imad) - return; - - auto input0_prim = get_mem(get_input_layout(p)); - auto input1_prim = get_mem(get_input_layout(p)); - - build_options onednn_options; - build_options cldnn_options; - - onednn_options.set_option(build_option::optimize_data(true)); - cldnn_options.set_option(build_option::optimize_data(true)); - - implementation_desc onednn_impl = { p.input_format, "", impl_types::onednn }; - implementation_desc cldnn_impl = { p.input_format, "", impl_types::ocl }; - onednn_options.set_option(build_option::force_implementations({ { "concat", onednn_impl } })); - cldnn_options.set_option(build_option::force_implementations({ { "concat", cldnn_impl } })); - - // for onednn fusing test, topology_non_fused means cldnn, topology_fused is onednn - network network_fused_cldnn(this->engine, this->topology_non_fused, cldnn_options); - network network_fused_onednn(this->engine, this->topology_fused, onednn_options); - - network_fused_cldnn.set_input_data("input0", input0_prim); - network_fused_cldnn.set_input_data("input1", input1_prim); - network_fused_onednn.set_input_data("input0", input0_prim); - network_fused_onednn.set_input_data("input1", input1_prim); - - ASSERT_FALSE(network_fused_cldnn.get_primitives_info().empty()); - ASSERT_FALSE(network_fused_onednn.get_primitives_info().empty()); - - auto find_and_check = [&](primitive_info& p) -> bool { - if (p.original_id == "concat" || p.original_id == 
"reorder_bfyx") - return true; - return false; - }; - - auto pi_fused_onednn = network_fused_onednn.get_primitives_info(); - auto pi_fused_cldnn = network_fused_cldnn.get_primitives_info(); - auto info_fused_onednn = std::find_if(pi_fused_onednn.begin(), pi_fused_onednn.end(), find_and_check); - auto info_fused_cldnn = std::find_if(pi_fused_cldnn.begin(), pi_fused_cldnn.end(), find_and_check); - - ASSERT_TRUE(info_fused_onednn != pi_fused_onednn.end()); - ASSERT_TRUE(info_fused_cldnn != pi_fused_cldnn.end()); - - compare(network_fused_cldnn, network_fused_onednn, p); - } - - layout get_input_layout(concat_test_params& p) { - return layout{ p.data_type, p.input_format, p.in_shape }; - } - - layout get_per_channel_layout(concat_test_params& p) { - return layout{ p.default_type, p.default_format, tensor{ 1, p.in_shape.feature[0], 1, 1 } }; - } -}; - -class concat_onednn_activation : public ConcatOneDNNFusingTest {}; -TEST_P(concat_onednn_activation, along_f) { - auto p = GetParam(); - create_topologies( - input_layout("input0", get_input_layout(p)), - input_layout("input1", get_input_layout(p)), - concatenation("concat", - { "input0", "input1" }, - concatenation::concatenation_axis::along_f, - data_types::f16, - "", - padding{ { 0, 0, 0, 0 }, 0 }), - activation("act", "concat", activation_func::relu), - reorder("reorder_bfyx", "act", cldnn::format::bfyx, p.default_type) - ); - - tolerance = 1.f; - execute(p); -} - -class concat_onednn_eltwise : public ConcatOneDNNFusingTest {}; -TEST_P(concat_onednn_eltwise, along_f) { - auto p = GetParam(); - layout data_layout(p.default_type, p.default_format, tensor{ 1, p.in_shape.feature[0]*2, 1, 1 }); - - create_topologies( - input_layout("input0", get_input_layout(p)), - input_layout("input1", get_input_layout(p)), - data("scale_data", get_mem(data_layout, 1.0f / tensor{ 1, 1, 4, 4 }.count())), - concatenation("concat", - { "input0", "input1" }, - concatenation::concatenation_axis::along_f, - data_types::f16, - "", - padding{ { 
0, 0, 0, 0 }, 0 }), - eltwise("scale", { "concat", "scale_data" }, eltwise_mode::prod, p.default_type), - reorder("reorder_bfyx", "scale", cldnn::format::bfyx, p.default_type) - ); - - tolerance = 1.f; - execute(p); -} - -INSTANTIATE_TEST_SUITE_P(fusings_gpu, concat_onednn_activation, ::testing::ValuesIn(std::vector{ - concat_test_params{ CASE_CONCAT_F16_1, 3, 3, "" }, -})); - -INSTANTIATE_TEST_SUITE_P(fusings_gpu, concat_onednn_eltwise, ::testing::ValuesIn(std::vector{ - concat_test_params{ CASE_CONCAT_F32_1, 4, 4, "" }, - concat_test_params{ CASE_CONCAT_F16_1, 4, 4, "" }, -})); -#endif - -/* ----------------------------------------------------------------------------------------------------- */ -/* --------------------------------------- Pooling cases ----------------------------------------------- */ -/* ----------------------------------------------------------------------------------------------------- */ - -struct pooling_test_params { - tensor in_shape; - data_types data_type; - format input_format; - data_types default_type; - format default_format; - size_t expected_fused_primitives; - size_t expected_not_fused_primitives; - pooling_mode pool_mode; - std::string kernel_name; -}; - -#define CASE_POOLING_F32_1 { 1, 16, 8, 8 }, data_types::f32, format::bfyx, data_types::f32, format::bfyx -#define CASE_POOLING_F32_2 { 2, 16, 8, 8 }, data_types::f32, format::bfyx, data_types::f32, format::bfyx -#define CASE_POOLING_F32_3 { 1, 32, 10, 10 }, data_types::f32, format::bfyx, data_types::f32, format::bfyx -#define CASE_POOLING_F32_4 { 1, 32, 10, 10 }, data_types::f32, format::fs_b_yx_fsv32, data_types::f32, format::bfyx -#define CASE_POOLING_F32_5 { 1, 32, 10, 10 }, data_types::f32, format::byxf, data_types::f32, format::bfyx -#define CASE_POOLING_F32_6 { 1, 32, 40, 40 }, data_types::f32, format::byxf, data_types::f32, format::bfyx -#define CASE_POOLING_F32_7 { 16, 32, 10, 10 }, data_types::f32, format::bs_fs_yx_bsv16_fsv16, data_types::f32, format::bfyx -#define 
CASE_POOLING_F32_8 { 16, 32, 10, 10 }, data_types::f32, format::b_fs_yx_fsv16, data_types::f32, format::bfyx -#define CASE_POOLING_F32_9 { 16, 32, 10, 10 }, data_types::f32, format::b_fs_zyx_fsv16, data_types::f32, format::bfyx -#define CASE_POOLING_F32_10 { 16, 32, 10, 10, 10 }, data_types::f32, format::bs_fs_zyx_bsv16_fsv16, data_types::f32, format::bfyx -#define CASE_POOLING_F32_11 { 1, 1, 3, 3 }, data_types::f32, format::bfyx, data_types::f32, format::bfyx - -#define CASE_POOLING_F32_F16_1 { 1, 16, 8, 8 }, data_types::f32, format::bfyx, data_types::f16, format::bfyx -#define CASE_POOLING_F32_F16_2 { 2, 16, 8, 8 }, data_types::f32, format::bfyx, data_types::f16, format::bfyx -#define CASE_POOLING_F32_F16_3 { 1, 32, 10, 10 }, data_types::f32, format::bfyx, data_types::f16, format::bfyx -#define CASE_POOLING_F32_F16_4 { 1, 32, 10, 10 }, data_types::f32, format::fs_b_yx_fsv32, data_types::f16, format::bfyx -#define CASE_POOLING_F32_F16_5 { 1, 32, 10, 10 }, data_types::f32, format::byxf, data_types::f16, format::bfyx -#define CASE_POOLING_F32_F16_6 { 1, 32, 40, 40 }, data_types::f32, format::byxf, data_types::f16, format::bfyx -#define CASE_POOLING_F32_F16_7 { 16, 32, 10, 10 }, data_types::f32, format::bs_fs_yx_bsv16_fsv16, data_types::f16, format::bfyx -#define CASE_POOLING_F32_F16_8 { 16, 32, 10, 10 }, data_types::f32, format::b_fs_yx_fsv16, data_types::f16, format::bfyx -#define CASE_POOLING_F32_F16_9 { 16, 32, 10, 10 }, data_types::f32, format::b_fs_zyx_fsv16, data_types::f16, format::bfyx -#define CASE_POOLING_F32_F16_10 { 16, 32, 10, 10, 10 }, data_types::f32, format::bs_fs_zyx_bsv16_fsv16, data_types::f16, format::bfyx - -#define CASE_POOLING_F16_1 { 1, 16, 8, 8 }, data_types::f16, format::bfyx, data_types::f32, format::bfyx -#define CASE_POOLING_F16_3 { 1, 32, 10, 10 }, data_types::f16, format::bfyx, data_types::f32, format::bfyx -#define CASE_POOLING_F16_4 { 1, 32, 10, 10 }, data_types::f16, format::fs_b_yx_fsv32, data_types::f32, format::bfyx -#define 
CASE_POOLING_F16_5 { 1, 32, 10, 10 }, data_types::f16, format::byxf, data_types::f32, format::bfyx -#define CASE_POOLING_F16_6 { 1, 32, 40, 40 }, data_types::f16, format::byxf, data_types::f32, format::bfyx -#define CASE_POOLING_F16_7 { 16, 32, 10, 10 }, data_types::f16, format::bs_fs_yx_bsv16_fsv16, data_types::f32, format::bfyx -#define CASE_POOLING_F16_8 { 16, 32, 10, 10 }, data_types::f16, format::b_fs_yx_fsv16, data_types::f32, format::bfyx -#define CASE_POOLING_F16_9 { 16, 32, 10, 10, 10 }, data_types::f32, format::b_fs_zyx_fsv16, data_types::f32, format::bfyx -#define CASE_POOLING_F16_10 { 16, 32, 10, 10, 10 }, data_types::f32, format::bs_fs_zyx_bsv16_fsv16, data_types::f32, format::bfyx - -#define CASE_POOLING_F16_FP16_1 { 1, 32, 10, 10 }, data_types::f16, format::bfyx, data_types::f16, format::bfyx -#define CASE_POOLING_F16_FP16_2 { 1, 32, 10, 10 }, data_types::f16, format::fs_b_yx_fsv32, data_types::f16, format::bfyx -#define CASE_POOLING_F16_FP16_3 { 1, 32, 10, 10 }, data_types::f16, format::byxf, data_types::f16, format::bfyx -#define CASE_POOLING_F16_FP16_4 { 1, 32, 40, 40 }, data_types::f16, format::byxf, data_types::f16, format::bfyx -#define CASE_POOLING_F16_FP16_5 { 16, 32, 10, 10 }, data_types::f16, format::bs_fs_yx_bsv16_fsv16, data_types::f16, format::bfyx -#define CASE_POOLING_F16_FP16_6 { 16, 32, 10, 10 }, data_types::f16, format::b_fs_yx_fsv16, data_types::f16, format::bfyx -#define CASE_POOLING_F16_FP16_7 { 16, 32, 10, 10, 10 }, data_types::f16, format::b_fs_zyx_fsv16, data_types::f16, format::bfyx -#define CASE_POOLING_F16_FP16_8 { 16, 32, 10, 10, 10 }, data_types::f16, format::bs_fs_zyx_bsv16_fsv16, data_types::f16, format::bfyx - -#define CASE_POOLING_U8_1 { 1, 16, 8, 8 }, data_types::u8, format::bfyx, data_types::f32, format::bfyx -#define CASE_POOLING_U8_2 { 2, 16, 8, 8 }, data_types::u8, format::b_fs_yx_fsv16, data_types::f32, format::bfyx -#define CASE_POOLING_U8_3 { 1, 32, 10, 10 }, data_types::u8, format::b_fs_yx_fsv4, 
data_types::f32, format::b_fs_yx_fsv4 -#define CASE_POOLING_U8_5 { 16, 32, 10, 10, 10 }, data_types::u8, format::b_fs_zyx_fsv32, data_types::f32, format::bfyx -#define CASE_POOLING_U8_6 { 16, 32, 10, 10, 10 }, data_types::u8, format::b_fs_zyx_fsv32, data_types::f32, format::bfyx - -#define CASE_POOLING_U8_FP16_3 { 1, 32, 10, 10 }, data_types::u8, format::b_fs_yx_fsv4, data_types::f16, format::b_fs_yx_fsv4 -#define CASE_POOLING_U8_FP16_5 { 16, 32, 10, 10, 10 }, data_types::u8, format::b_fs_zyx_fsv32, data_types::f16, format::bfyx -#define CASE_POOLING_U8_FP16_6 { 16, 32, 10, 10, 10 }, data_types::u8, format::b_fs_zyx_fsv32, data_types::f16, format::bfyx - -#define CASE_POOLING_I8_1 { 1, 16, 8, 8 }, data_types::i8, format::bfyx, data_types::f32, format::bfyx -#define CASE_POOLING_I8_2 { 2, 16, 8, 8 }, data_types::i8, format::b_fs_yx_fsv16, data_types::f32, format::bfyx -#define CASE_POOLING_I8_5 { 1, 32, 10, 10 }, data_types::i8, format::b_fs_yx_fsv4, data_types::f32, format::b_fs_yx_fsv4 -#define CASE_POOLING_I8_6 { 16, 32, 10, 10, 10 }, data_types::i8, format::b_fs_zyx_fsv32, data_types::f32, format::bfyx - -#define CASE_POOLING_I8_FP16_5 { 1, 32, 10, 10 }, data_types::i8, format::b_fs_yx_fsv4, data_types::f16, format::b_fs_yx_fsv4 -#define CASE_POOLING_I8_FP16_6 { 16, 32, 10, 10, 10 }, data_types::i8, format::b_fs_zyx_fsv32, data_types::f16, format::bfyx - -class PoolingFusingTest : public ::BaseFusingTest { -public: - void execute(pooling_test_params& p) { - auto input_prim = get_mem(get_input_layout(p)); - build_options options; - options.set_option(build_option::optimize_data(true)); - if (!p.kernel_name.empty()) { - implementation_desc impl = { p.input_format, p.kernel_name }; - options.set_option(build_option::force_implementations({ { "pooling", impl } })); - } - network network_not_fused(this->engine, this->topology_non_fused, bo_not_fused); - network network_fused(this->engine, this->topology_fused, options); - - network_fused.set_input_data("input", 
input_prim); - network_not_fused.set_input_data("input", input_prim); - - ASSERT_FALSE(network_fused.get_primitives_info().empty()); - ASSERT_FALSE(network_not_fused.get_primitives_info().empty()); - - auto find_and_check = [&](primitive_info& p) -> bool { - if (p.original_id == "pooling" || p.original_id == "output_reorder") - return true; - return false; - }; - - auto pi_fused = network_fused.get_primitives_info(); - auto pi_not_fused = network_not_fused.get_primitives_info(); - auto info_fused = std::find_if(pi_fused.begin(), pi_fused.end(), find_and_check); - auto info_not_fused = std::find_if(pi_not_fused.begin(), pi_not_fused.end(), find_and_check); - - ASSERT_TRUE(info_fused != pi_fused.end()); - ASSERT_TRUE(info_not_fused != pi_not_fused.end()); - - compare(network_not_fused, network_fused, p); - } - - layout get_input_layout(pooling_test_params& p) { - return layout{ p.data_type, p.input_format, p.in_shape }; - } - - layout get_per_channel_layout(pooling_test_params& p) { - return layout{ p.default_type, p.default_format, tensor{ 1, p.in_shape.feature[0], 1, 1 } }; - } -}; - -class pooling_f32_activation : public PoolingFusingTest {}; -TEST_P(pooling_f32_activation, basic) { - auto p = GetParam(); - create_topologies( - input_layout("input", get_input_layout(p)), - pooling("pooling", "input", p.pool_mode, tensor{ 1, 1, 3, 3 }, tensor{ 1 }, tensor{ { 0, 0, 1, 1, 0, 0 }, 0 }), - activation("act", "pooling", activation_func::relu), - reorder("output_reorder", "act", format::bfyx, data_types::f32) - ); - - tolerance = 1e-05f; - execute(p); -} - -INSTANTIATE_TEST_SUITE_P(fusings_gpu, pooling_f32_activation, ::testing::ValuesIn(std::vector{ - pooling_test_params{ CASE_POOLING_F32_1, 2, 3, pooling_mode::max, "" }, - pooling_test_params{ CASE_POOLING_F32_1, 2, 3, pooling_mode::average, "" }, - pooling_test_params{ CASE_POOLING_F16_1, 2, 3, pooling_mode::max, "" }, - pooling_test_params{ CASE_POOLING_F16_1, 2, 3, pooling_mode::average, "" }, - pooling_test_params{ 
CASE_POOLING_I8_1, 2, 3, pooling_mode::max, "" }, - pooling_test_params{ CASE_POOLING_I8_1, 2, 3, pooling_mode::average, "" }, - pooling_test_params{ CASE_POOLING_U8_1, 2, 3, pooling_mode::max, "" }, - pooling_test_params{ CASE_POOLING_U8_1, 2, 3, pooling_mode::average, "" }, - pooling_test_params{ CASE_POOLING_U8_2, 2, 3, pooling_mode::max, "" }, - pooling_test_params{ CASE_POOLING_U8_2, 2, 3, pooling_mode::average, "" }, - pooling_test_params{ CASE_POOLING_I8_1, 2, 3, pooling_mode::max, "" }, - pooling_test_params{ CASE_POOLING_I8_1, 2, 3, pooling_mode::average, "" }, - pooling_test_params{ CASE_POOLING_I8_2, 2, 3, pooling_mode::max, "" }, - pooling_test_params{ CASE_POOLING_I8_2, 2, 3, pooling_mode::average, "" }, -})); - -class pooling_f32_scale : public PoolingFusingTest {}; -TEST_P(pooling_f32_scale, basic) { - auto p = GetParam(); - create_topologies( - input_layout("input", get_input_layout(p)), - data("scale_data", get_mem(get_per_channel_layout(p), 1.0f / tensor{ 1, 1, 3, 3 }.count())), - pooling("pooling", "input", p.pool_mode, tensor{ 1, 1, 3, 3 }, tensor{ 1 }, tensor{ { 0, 0, 1, 1, 0, 0 }, 0 }), - scale("scale", "pooling", "scale_data"), - reorder("output_reorder", "scale", format::bfyx, data_types::f32) - ); - - tolerance = 1e-05f; - execute(p); -} - -TEST_P(pooling_f32_scale, fp16_scale_out) { - auto p = GetParam(); - create_topologies( - input_layout("input", get_input_layout(p)), - data("scale_data", get_mem(get_per_channel_layout(p), 1.0f / tensor{ 1, 1, 3, 3 }.count())), - pooling("pooling", "input", p.pool_mode, tensor{ 1, 1, 3, 3 }, tensor{ 1 }, tensor{ { 0, 0, 1, 1, 0, 0 }, 0 }), - scale("scale", "pooling", "scale_data", optional_data_type{ data_types::f16 }), - reorder("output_reorder", "scale", format::bfyx, data_types::f32) - ); - - tolerance = 1e-5f; - execute(p); -} - -INSTANTIATE_TEST_SUITE_P(fusings_gpu, pooling_f32_scale, ::testing::ValuesIn(std::vector{ - pooling_test_params{ CASE_POOLING_F32_1, 2, 3, pooling_mode::max, "" }, - 
pooling_test_params{ CASE_POOLING_F32_1, 2, 3, pooling_mode::average, "" }, - pooling_test_params{ CASE_POOLING_F16_1, 2, 3, pooling_mode::max, "" }, - pooling_test_params{ CASE_POOLING_F16_1, 2, 3, pooling_mode::average, "" }, - pooling_test_params{ CASE_POOLING_U8_1, 2, 3, pooling_mode::max, "" }, - pooling_test_params{ CASE_POOLING_U8_1, 2, 3, pooling_mode::average, "" }, - pooling_test_params{ CASE_POOLING_U8_2, 2, 3, pooling_mode::max, "" }, - pooling_test_params{ CASE_POOLING_U8_2, 2, 3, pooling_mode::average, "" }, - pooling_test_params{ CASE_POOLING_I8_1, 2, 3, pooling_mode::max, "" }, - pooling_test_params{ CASE_POOLING_I8_1, 2, 3, pooling_mode::average, "" }, - pooling_test_params{ CASE_POOLING_I8_2, 2, 3, pooling_mode::max, "" }, - pooling_test_params{ CASE_POOLING_I8_2, 2, 3, pooling_mode::average, "" }, -})); - -class pooling_scale_activation_quantize : public PoolingFusingTest {}; -TEST_P(pooling_scale_activation_quantize, basic) { - auto p = GetParam(); - - create_topologies( - input_layout("input", get_input_layout(p)), - data("in_lo", get_mem(get_single_element_layout(p), min_random, 0)), - data("in_hi", get_mem(get_single_element_layout(p), 1, max_random)), - data("out_lo", get_mem(get_single_element_layout(p), 0)), - data("out_hi", get_mem(get_single_element_layout(p), 255)), - data("scale_data", get_mem(get_per_channel_layout(p), 1.0f / tensor{ 1, 1, 4, 4 }.count())), - pooling("pooling", "input", "", p.pool_mode, tensor(1, 1, 4, 4), tensor(1, 1, 2, 2)), - scale("scale", "pooling", "scale_data"), - activation("activation", "scale", activation_func::relu), - quantize("quantize", "activation", "in_lo", "in_hi", "out_lo", "out_hi", 255, data_types::u8), - reorder("output_reorder", "quantize", p.default_format, data_types::f32) - ); - - tolerance = 1.0f; - execute(p); -} - -TEST_P(pooling_scale_activation_quantize, i8_output_data_type) { - auto p = GetParam(); - - create_topologies( - input_layout("input", get_input_layout(p)), - data("in_lo", 
get_mem(get_per_channel_layout(p), min_random, 0)), - data("in_hi", get_mem(get_per_channel_layout(p), 1, max_random)), - data("out_lo", get_mem(get_single_element_layout(p), -127, 127)), - data("out_hi", get_mem(get_single_element_layout(p), -127, 127)), - data("scale_data", get_mem(get_per_channel_layout(p), 1.0f / tensor{ 1, 1, 4, 4 }.count())), - pooling("pooling", "input", "", p.pool_mode, tensor(1, 1, 4, 4), tensor(1, 1, 2, 2)), - scale("scale", "pooling", "scale_data"), - activation("activation", "scale", activation_func::relu), - quantize("quantize", "activation", "in_lo", "in_hi", "out_lo", "out_hi", 255, data_types::i8), - reorder("output_reorder", "quantize", p.default_format, data_types::f32) - ); - - tolerance = 1.0f; - execute(p); -} - -TEST_P(pooling_scale_activation_quantize, per_channel) { - auto p = GetParam(); - - create_topologies( - input_layout("input", get_input_layout(p)), - data("in_lo", get_mem(get_per_channel_layout(p), min_random, 0)), - data("in_hi", get_mem(get_per_channel_layout(p), 1, max_random)), - data("out_lo", get_mem(get_single_element_layout(p), 0)), - data("out_hi", get_mem(get_single_element_layout(p), 255)), - data("scale_data", get_mem(get_per_channel_layout(p), 1.0f / tensor{ 1, 1, 4, 4 }.count())), - pooling("pooling", "input", "", p.pool_mode, tensor(1, 1, 4, 4), tensor(1, 1, 2, 2)), - scale("scale", "pooling", "scale_data"), - activation("activation", "scale", activation_func::atan), - quantize("quantize", "activation", "in_lo", "in_hi", "out_lo", "out_hi", 255, data_types::u8), - reorder("output_reorder", "quantize", p.default_format, data_types::f32) - ); - - tolerance = 1.0f; - execute(p); -} - -INSTANTIATE_TEST_SUITE_P(fusings_gpu, pooling_scale_activation_quantize, ::testing::ValuesIn(std::vector{ - // Input type: FP32 - pooling_test_params{ CASE_POOLING_F32_3, 2, 5, pooling_mode::average, "pooling_gpu_bfyx_block_opt" }, - pooling_test_params{ CASE_POOLING_F32_3, 2, 5, pooling_mode::max, 
"pooling_gpu_bfyx_block_opt" }, - pooling_test_params{ CASE_POOLING_F32_3, 2, 5, pooling_mode::average, "pooling_gpu_ref" }, - pooling_test_params{ CASE_POOLING_F32_3, 2, 5, pooling_mode::max, "pooling_gpu_ref" }, - pooling_test_params{ CASE_POOLING_F32_4, 2, 5, pooling_mode::average, "pooling_gpu_fs_b_yx_fsv32" }, - pooling_test_params{ CASE_POOLING_F32_4, 2, 5, pooling_mode::max, "pooling_gpu_fs_b_yx_fsv32" }, - pooling_test_params{ CASE_POOLING_F32_5, 2, 5, pooling_mode::average, "pooling_gpu_byxf_padding_opt" }, - pooling_test_params{ CASE_POOLING_F32_5, 2, 5, pooling_mode::max, "pooling_gpu_byxf_padding_opt" }, - pooling_test_params{ CASE_POOLING_F32_6, 2, 5, pooling_mode::average, "pooling_gpu_byxf_opt" }, - pooling_test_params{ CASE_POOLING_F32_6, 2, 5, pooling_mode::max, "pooling_gpu_byxf_opt" }, - pooling_test_params{ CASE_POOLING_F32_7, 2, 5, pooling_mode::average, "pooling_gpu_bsv16_fsv16" }, - pooling_test_params{ CASE_POOLING_F32_7, 2, 5, pooling_mode::max, "pooling_gpu_bsv16_fsv16" }, - pooling_test_params{ CASE_POOLING_F32_8, 2, 5, pooling_mode::average, "pooling_gpu_blocked" }, - pooling_test_params{ CASE_POOLING_F32_8, 2, 5, pooling_mode::max, "pooling_gpu_blocked" }, - pooling_test_params{ CASE_POOLING_F32_9, 2, 5, pooling_mode::average, "pooling_gpu_ref" }, - pooling_test_params{ CASE_POOLING_F32_9, 2, 5, pooling_mode::max, "pooling_gpu_ref" }, - pooling_test_params{ CASE_POOLING_F32_10, 2, 5, pooling_mode::average, "pooling_gpu_bsv16_fsv16" }, - pooling_test_params{ CASE_POOLING_F32_10, 2, 5, pooling_mode::max, "pooling_gpu_bsv16_fsv16" }, - - // Input type: INT8 - pooling_test_params{ CASE_POOLING_I8_5, 2, 5, pooling_mode::average, "pooling_gpu_b_fs_yx_fsv4" }, - pooling_test_params{ CASE_POOLING_I8_5, 2, 5, pooling_mode::max, "pooling_gpu_b_fs_yx_fsv4" }, - pooling_test_params{ CASE_POOLING_I8_6, 2, 5, pooling_mode::average, "pooling_gpu_int8_ref" }, - pooling_test_params{ CASE_POOLING_I8_6, 2, 5, pooling_mode::max, "pooling_gpu_int8_ref" }, - 
- // Input type: UINT8 - pooling_test_params{ CASE_POOLING_U8_3, 2, 5, pooling_mode::average, "pooling_gpu_int8_ref" }, - pooling_test_params{ CASE_POOLING_U8_3, 2, 5, pooling_mode::max, "pooling_gpu_int8_ref" }, - pooling_test_params{ CASE_POOLING_U8_3, 2, 5, pooling_mode::average, "pooling_gpu_b_fs_yx_fsv4" }, - pooling_test_params{ CASE_POOLING_U8_3, 2, 5, pooling_mode::max, "pooling_gpu_b_fs_yx_fsv4" }, - pooling_test_params{ CASE_POOLING_U8_5, 2, 5, pooling_mode::average, "pooling_gpu_int8_ref" }, - pooling_test_params{ CASE_POOLING_U8_5, 2, 5, pooling_mode::max, "pooling_gpu_int8_ref" }, - pooling_test_params{ CASE_POOLING_U8_6, 2, 5, pooling_mode::average, "pooling_gpu_int8_ref" }, - pooling_test_params{ CASE_POOLING_U8_6, 2, 5, pooling_mode::max, "pooling_gpu_int8_ref" }, -})); - -INSTANTIATE_TEST_SUITE_P(DISABLED_fusings_gpu, pooling_scale_activation_quantize, ::testing::ValuesIn(std::vector{ - pooling_test_params{ CASE_POOLING_F32_3, 2, 5, pooling_mode::average, "pooling_gpu_average_opt" }, //currently not enabled, fusing not upported -})); - -class pooling_scale_activation : public PoolingFusingTest {}; -TEST_P(pooling_scale_activation, basic) { - auto p = GetParam(); - - create_topologies( - input_layout("input", get_input_layout(p)), - data("scale_data", get_mem(get_per_channel_layout(p), 1.0f / tensor{ 1, 1, 4, 4 }.count())), - pooling("pooling", "input", "", p.pool_mode, tensor(1, 1, 4, 4), tensor(1, 1, 2, 2)), - scale("scale", "pooling", "scale_data"), - activation("activation", "scale", activation_func::relu), - reorder("output_reorder", "activation", p.default_format, data_types::f32) - ); - - tolerance = 1e-05f; - execute(p); -} - -TEST_P(pooling_scale_activation, eltwise_mul) { - auto p = GetParam(); - - create_topologies( - input_layout("input", get_input_layout(p)), - data("scale_data", get_mem(get_per_channel_layout(p), 1.0f / tensor{ 1, 1, 4, 4 }.count())), - pooling("pooling", "input", "", p.pool_mode, tensor(1, 1, 4, 4), tensor(1, 1, 2, 
2)), - eltwise("scale", { "pooling", "scale_data" }, eltwise_mode::prod, p.default_type), - activation("activation", "scale", activation_func::relu), - reorder("output_reorder", "activation", p.default_format, data_types::f32) - ); - - tolerance = 1e-05f; - execute(p); -} - -INSTANTIATE_TEST_SUITE_P(fusings_gpu, pooling_scale_activation, ::testing::ValuesIn(std::vector{ - // Input type: F32 - pooling_test_params{ CASE_POOLING_F32_3, 2, 4, pooling_mode::average, "pooling_gpu_bfyx_block_opt" }, - pooling_test_params{ CASE_POOLING_F32_3, 2, 4, pooling_mode::max, "pooling_gpu_bfyx_block_opt" }, - pooling_test_params{ CASE_POOLING_F32_3, 2, 4, pooling_mode::average, "pooling_gpu_ref" }, - pooling_test_params{ CASE_POOLING_F32_3, 2, 4, pooling_mode::max, "pooling_gpu_ref" }, - pooling_test_params{ CASE_POOLING_F32_4, 2, 4, pooling_mode::average, "pooling_gpu_fs_b_yx_fsv32" }, - pooling_test_params{ CASE_POOLING_F32_4, 2, 4, pooling_mode::max, "pooling_gpu_fs_b_yx_fsv32" }, - pooling_test_params{ CASE_POOLING_F32_5, 2, 4, pooling_mode::average, "pooling_gpu_byxf_padding_opt" }, - pooling_test_params{ CASE_POOLING_F32_5, 2, 4, pooling_mode::max, "pooling_gpu_byxf_padding_opt" }, - pooling_test_params{ CASE_POOLING_F32_6, 2, 4, pooling_mode::average, "pooling_gpu_byxf_opt" }, - pooling_test_params{ CASE_POOLING_F32_6, 2, 4, pooling_mode::max, "pooling_gpu_byxf_opt" }, - pooling_test_params{ CASE_POOLING_F32_7, 2, 4, pooling_mode::average, "pooling_gpu_bsv16_fsv16" }, - pooling_test_params{ CASE_POOLING_F32_7, 2, 4, pooling_mode::max, "pooling_gpu_bsv16_fsv16" }, - pooling_test_params{ CASE_POOLING_F32_8, 2, 4, pooling_mode::average, "pooling_gpu_blocked" }, - pooling_test_params{ CASE_POOLING_F32_8, 2, 4, pooling_mode::max, "pooling_gpu_blocked" }, - pooling_test_params{ CASE_POOLING_F32_9, 2, 4, pooling_mode::average, "pooling_gpu_ref" }, - pooling_test_params{ CASE_POOLING_F32_9, 2, 4, pooling_mode::max, "pooling_gpu_ref" }, - pooling_test_params{ CASE_POOLING_F32_10, 2, 
4, pooling_mode::average, "pooling_gpu_bsv16_fsv16" }, - pooling_test_params{ CASE_POOLING_F32_10, 2, 4, pooling_mode::max, "pooling_gpu_bsv16_fsv16" }, - - // Input type: INT8 - pooling_test_params{ CASE_POOLING_I8_5, 2, 4, pooling_mode::average, "pooling_gpu_b_fs_yx_fsv4" }, - pooling_test_params{ CASE_POOLING_I8_5, 2, 4, pooling_mode::max, "pooling_gpu_b_fs_yx_fsv4" }, - pooling_test_params{ CASE_POOLING_I8_6, 2, 4, pooling_mode::average, "pooling_gpu_int8_ref" }, - pooling_test_params{ CASE_POOLING_I8_6, 2, 4, pooling_mode::max, "pooling_gpu_int8_ref" }, - - // Input type: UINT8 - pooling_test_params{ CASE_POOLING_U8_3, 2, 4, pooling_mode::average, "pooling_gpu_int8_ref" }, - pooling_test_params{ CASE_POOLING_U8_3, 2, 4, pooling_mode::max, "pooling_gpu_int8_ref" }, - pooling_test_params{ CASE_POOLING_U8_3, 2, 4, pooling_mode::average, "pooling_gpu_b_fs_yx_fsv4" }, - pooling_test_params{ CASE_POOLING_U8_3, 2, 4, pooling_mode::max, "pooling_gpu_b_fs_yx_fsv4" }, - pooling_test_params{ CASE_POOLING_U8_5, 2, 4, pooling_mode::average, "pooling_gpu_int8_ref" }, - pooling_test_params{ CASE_POOLING_U8_5, 2, 4, pooling_mode::max, "pooling_gpu_int8_ref" }, - pooling_test_params{ CASE_POOLING_U8_6, 2, 4, pooling_mode::average, "pooling_gpu_int8_ref" }, - pooling_test_params{ CASE_POOLING_U8_6, 2, 4, pooling_mode::max, "pooling_gpu_int8_ref" }, - - // Input type: FP16 Output type: F32 - pooling_test_params{ CASE_POOLING_F16_3, 2, 4, pooling_mode::average, "pooling_gpu_bfyx_block_opt" }, - pooling_test_params{ CASE_POOLING_F16_3, 2, 4, pooling_mode::max, "pooling_gpu_bfyx_block_opt" }, - pooling_test_params{ CASE_POOLING_F16_3, 2, 4, pooling_mode::average, "pooling_gpu_ref" }, - pooling_test_params{ CASE_POOLING_F16_3, 2, 4, pooling_mode::max, "pooling_gpu_ref" }, - pooling_test_params{ CASE_POOLING_F16_4, 2, 4, pooling_mode::average, "pooling_gpu_fs_b_yx_fsv32" }, - pooling_test_params{ CASE_POOLING_F16_4, 2, 4, pooling_mode::max, "pooling_gpu_fs_b_yx_fsv32" }, - 
pooling_test_params{ CASE_POOLING_F16_5, 2, 4, pooling_mode::average, "pooling_gpu_byxf_padding_opt" }, - pooling_test_params{ CASE_POOLING_F16_5, 2, 4, pooling_mode::max, "pooling_gpu_byxf_padding_opt" }, - pooling_test_params{ CASE_POOLING_F16_6, 2, 4, pooling_mode::average, "pooling_gpu_byxf_opt" }, - pooling_test_params{ CASE_POOLING_F16_6, 2, 4, pooling_mode::max, "pooling_gpu_byxf_opt" }, - pooling_test_params{ CASE_POOLING_F16_7, 2, 4, pooling_mode::average, "pooling_gpu_bsv16_fsv16" }, - pooling_test_params{ CASE_POOLING_F16_7, 2, 4, pooling_mode::max, "pooling_gpu_bsv16_fsv16" }, - pooling_test_params{ CASE_POOLING_F16_8, 2, 4, pooling_mode::average, "pooling_gpu_blocked" }, - pooling_test_params{ CASE_POOLING_F16_8, 2, 4, pooling_mode::max, "pooling_gpu_blocked" }, - pooling_test_params{ CASE_POOLING_F16_9, 2, 4, pooling_mode::average, "pooling_gpu_ref" }, - pooling_test_params{ CASE_POOLING_F16_9, 2, 4, pooling_mode::max, "pooling_gpu_ref" }, - pooling_test_params{ CASE_POOLING_F16_10, 2, 4, pooling_mode::average, "pooling_gpu_bsv16_fsv16" }, - pooling_test_params{ CASE_POOLING_F16_10, 2, 4, pooling_mode::max, "pooling_gpu_bsv16_fsv16" }, - - // Input type: FP16 - pooling_test_params{ CASE_POOLING_F16_FP16_1, 2, 4, pooling_mode::average, "pooling_gpu_bfyx_block_opt" }, - pooling_test_params{ CASE_POOLING_F16_FP16_1, 2, 4, pooling_mode::max, "pooling_gpu_bfyx_block_opt" }, - pooling_test_params{ CASE_POOLING_F16_FP16_1, 2, 4, pooling_mode::average, "pooling_gpu_ref" }, - pooling_test_params{ CASE_POOLING_F16_FP16_1, 2, 4, pooling_mode::max, "pooling_gpu_ref" }, - pooling_test_params{ CASE_POOLING_F16_FP16_2, 2, 4, pooling_mode::average, "pooling_gpu_fs_b_yx_fsv32" }, - pooling_test_params{ CASE_POOLING_F16_FP16_2, 2, 4, pooling_mode::max, "pooling_gpu_fs_b_yx_fsv32" }, - pooling_test_params{ CASE_POOLING_F16_FP16_3, 2, 4, pooling_mode::average, "pooling_gpu_byxf_padding_opt" }, - pooling_test_params{ CASE_POOLING_F16_FP16_3, 2, 4, pooling_mode::max, 
"pooling_gpu_byxf_padding_opt" }, - pooling_test_params{ CASE_POOLING_F16_FP16_4, 2, 4, pooling_mode::average, "pooling_gpu_byxf_opt" }, - pooling_test_params{ CASE_POOLING_F16_FP16_4, 2, 4, pooling_mode::max, "pooling_gpu_byxf_opt" }, - pooling_test_params{ CASE_POOLING_F16_FP16_5, 2, 4, pooling_mode::average, "pooling_gpu_bsv16_fsv16" }, - pooling_test_params{ CASE_POOLING_F16_FP16_5, 2, 4, pooling_mode::max, "pooling_gpu_bsv16_fsv16" }, - pooling_test_params{ CASE_POOLING_F16_FP16_6, 2, 4, pooling_mode::average, "pooling_gpu_blocked" }, - pooling_test_params{ CASE_POOLING_F16_FP16_6, 2, 4, pooling_mode::max, "pooling_gpu_blocked" }, - pooling_test_params{ CASE_POOLING_F16_FP16_7, 2, 4, pooling_mode::average, "pooling_gpu_ref" }, - pooling_test_params{ CASE_POOLING_F16_FP16_7, 2, 4, pooling_mode::max, "pooling_gpu_ref" }, - pooling_test_params{ CASE_POOLING_F16_FP16_8, 2, 4, pooling_mode::average, "pooling_gpu_bsv16_fsv16" }, - pooling_test_params{ CASE_POOLING_F16_FP16_8, 2, 4, pooling_mode::max, "pooling_gpu_bsv16_fsv16" }, - - // Input type: FP32 - pooling_test_params{ CASE_POOLING_F32_F16_3, 2, 4, pooling_mode::average, "pooling_gpu_bfyx_block_opt" }, - pooling_test_params{ CASE_POOLING_F32_F16_3, 2, 4, pooling_mode::max, "pooling_gpu_bfyx_block_opt" }, - pooling_test_params{ CASE_POOLING_F32_F16_3, 2, 4, pooling_mode::average, "pooling_gpu_ref" }, - pooling_test_params{ CASE_POOLING_F32_F16_3, 2, 4, pooling_mode::max, "pooling_gpu_ref" }, - pooling_test_params{ CASE_POOLING_F32_F16_4, 2, 4, pooling_mode::average, "pooling_gpu_fs_b_yx_fsv32" }, - pooling_test_params{ CASE_POOLING_F32_F16_4, 2, 4, pooling_mode::max, "pooling_gpu_fs_b_yx_fsv32" }, - pooling_test_params{ CASE_POOLING_F32_F16_5, 2, 4, pooling_mode::average, "pooling_gpu_byxf_padding_opt" }, - pooling_test_params{ CASE_POOLING_F32_F16_5, 2, 4, pooling_mode::max, "pooling_gpu_byxf_padding_opt" }, - pooling_test_params{ CASE_POOLING_F32_F16_6, 2, 4, pooling_mode::average, "pooling_gpu_byxf_opt" }, - 
pooling_test_params{ CASE_POOLING_F32_F16_6, 2, 4, pooling_mode::max, "pooling_gpu_byxf_opt" }, - pooling_test_params{ CASE_POOLING_F32_F16_7, 2, 4, pooling_mode::average, "pooling_gpu_bsv16_fsv16" }, - pooling_test_params{ CASE_POOLING_F32_F16_7, 2, 4, pooling_mode::max, "pooling_gpu_bsv16_fsv16" }, - pooling_test_params{ CASE_POOLING_F32_F16_8, 2, 4, pooling_mode::average, "pooling_gpu_blocked" }, - pooling_test_params{ CASE_POOLING_F32_F16_8, 2, 4, pooling_mode::max, "pooling_gpu_blocked" }, - pooling_test_params{ CASE_POOLING_F32_F16_9, 2, 4, pooling_mode::average, "pooling_gpu_ref" }, - pooling_test_params{ CASE_POOLING_F32_F16_9, 2, 4, pooling_mode::max, "pooling_gpu_ref" }, - pooling_test_params{ CASE_POOLING_F32_F16_10, 2, 4, pooling_mode::average, "pooling_gpu_bsv16_fsv16" }, - pooling_test_params{ CASE_POOLING_F32_F16_10, 2, 4, pooling_mode::max, "pooling_gpu_bsv16_fsv16" }, - - // Input type: INT8 - pooling_test_params{ CASE_POOLING_I8_FP16_5, 2, 4, pooling_mode::average, "pooling_gpu_b_fs_yx_fsv4" }, - pooling_test_params{ CASE_POOLING_I8_FP16_5, 2, 4, pooling_mode::max, "pooling_gpu_b_fs_yx_fsv4" }, - pooling_test_params{ CASE_POOLING_I8_FP16_6, 2, 4, pooling_mode::average, "pooling_gpu_int8_ref" }, - pooling_test_params{ CASE_POOLING_I8_FP16_6, 2, 4, pooling_mode::max, "pooling_gpu_int8_ref" }, - - // Input type: UINT8 - pooling_test_params{ CASE_POOLING_U8_FP16_3, 2, 4, pooling_mode::max, "pooling_gpu_int8_ref" }, - pooling_test_params{ CASE_POOLING_U8_FP16_3, 2, 4, pooling_mode::average, "pooling_gpu_int8_ref" }, - pooling_test_params{ CASE_POOLING_U8_FP16_3, 2, 4, pooling_mode::average, "pooling_gpu_b_fs_yx_fsv4" }, - pooling_test_params{ CASE_POOLING_U8_FP16_3, 2, 4, pooling_mode::max, "pooling_gpu_b_fs_yx_fsv4" }, - pooling_test_params{ CASE_POOLING_U8_FP16_5, 2, 4, pooling_mode::average, "pooling_gpu_int8_ref" }, - pooling_test_params{ CASE_POOLING_U8_FP16_5, 2, 4, pooling_mode::max, "pooling_gpu_int8_ref" }, - pooling_test_params{ 
CASE_POOLING_U8_FP16_6, 2, 4, pooling_mode::average, "pooling_gpu_int8_ref" }, - pooling_test_params{ CASE_POOLING_U8_FP16_6, 2, 4, pooling_mode::max, "pooling_gpu_int8_ref" }, -})); - -#ifdef ENABLE_ONEDNN_FOR_GPU -class PoolingOneDNNFusingTest : public ::BaseFusingTest { -public: - void execute(pooling_test_params& p) { - // Onednn post operation has issue in a machine that does not support imad. - if (!engine.get_device_info().supports_imad) - return; - - auto input_prim = get_mem(get_input_layout(p)); - - build_options onednn_options; - build_options cldnn_options; - - onednn_options.set_option(build_option::optimize_data(true)); - cldnn_options.set_option(build_option::optimize_data(true)); - - implementation_desc onednn_impl = { p.input_format, "", impl_types::onednn }; - implementation_desc cldnn_impl = { p.input_format, "", impl_types::ocl }; - onednn_options.set_option(build_option::force_implementations({ { "pooling", onednn_impl } })); - cldnn_options.set_option(build_option::force_implementations({ { "pooling", cldnn_impl } })); - - // for onednn fusing test, topology_non_fused means cldnn, topology_fused is onednn - network network_fused_cldnn(this->engine, this->topology_non_fused, cldnn_options); - network network_fused_onednn(this->engine, this->topology_fused, onednn_options); - - network_fused_cldnn.set_input_data("input", input_prim); - network_fused_onednn.set_input_data("input", input_prim); - - ASSERT_FALSE(network_fused_cldnn.get_primitives_info().empty()); - ASSERT_FALSE(network_fused_onednn.get_primitives_info().empty()); - - auto find_and_check = [&](primitive_info& p) -> bool { - if (p.original_id == "pooling" || p.original_id == "output_reorder") - return true; - return false; - }; - - auto pi_fused_onednn = network_fused_onednn.get_primitives_info(); - auto pi_fused_cldnn = network_fused_onednn.get_primitives_info(); - auto info_fused_onednn = std::find_if(pi_fused_onednn.begin(), pi_fused_onednn.end(), find_and_check); - auto 
info_fused_cldnn = std::find_if(pi_fused_cldnn.begin(), pi_fused_cldnn.end(), find_and_check); - - ASSERT_TRUE(info_fused_onednn != pi_fused_onednn.end()); - ASSERT_TRUE(info_fused_cldnn != pi_fused_cldnn.end()); - - compare(network_fused_cldnn, network_fused_onednn, p); - } - - layout get_input_layout(pooling_test_params& p) { - return layout{ p.data_type, p.input_format, p.in_shape }; - } - - layout get_per_channel_layout(pooling_test_params& p) { - return layout{ p.default_type, p.default_format, tensor{ 1, p.in_shape.feature[0], 1, 1 } }; - } -}; - -class pooling_onednn_activation1 : public PoolingOneDNNFusingTest {}; -TEST_P(pooling_onednn_activation1, basic) { - auto p = GetParam(); - create_topologies( - input_layout("input", get_input_layout(p)), - pooling("pooling", "input", p.pool_mode, tensor{ 1, 1, 3, 3 }, tensor{ 1 }, tensor{ { 0, 0, 1, 1, 0, 0 }, 0 }), - activation("act", "pooling", activation_func::relu), - reorder("output_reorder", "act", format::bfyx, data_types::f32) - ); - - tolerance = 1e-05f; - execute(p); -} - -class pooling_onednn_activation2 : public PoolingOneDNNFusingTest {}; -TEST_P(pooling_onednn_activation2, basic) { - auto p = GetParam(); - create_topologies( - input_layout("input", get_input_layout(p)), - pooling("pooling", "input", p.pool_mode, { 1, 1, 3, 3 }, { 1, 1, 1, 1 }), - activation("act", "pooling", activation_func::relu), - reorder("output_reorder", "act", format::bfyx, data_types::f32) - ); - - tolerance = 1e-05f; - execute(p); -} - -INSTANTIATE_TEST_SUITE_P(fusings_gpu, pooling_onednn_activation1, ::testing::ValuesIn(std::vector{ - // pooling_test_params{ CASE_POOLING_F32_1, 2, 2, pooling_mode::max, "" }, - pooling_test_params{ CASE_POOLING_F16_1, 2, 2, pooling_mode::max, "" }, - pooling_test_params{ CASE_POOLING_I8_1, 2, 2, pooling_mode::max, "" }, - pooling_test_params{ CASE_POOLING_U8_1, 2, 2, pooling_mode::max, "" }, - pooling_test_params{ CASE_POOLING_U8_2, 2, 2, pooling_mode::max, "" }, - pooling_test_params{ 
CASE_POOLING_I8_1, 2, 2, pooling_mode::max, "" }, - pooling_test_params{ CASE_POOLING_I8_2, 2, 2, pooling_mode::max, "" }, -})); - -INSTANTIATE_TEST_SUITE_P(fusings_gpu, pooling_onednn_activation2, ::testing::ValuesIn(std::vector{ - pooling_test_params{ CASE_POOLING_F32_11, 2, 2, pooling_mode::max, "" }, - pooling_test_params{ CASE_POOLING_F32_11, 2, 2, pooling_mode::average, "" }, - pooling_test_params{ CASE_POOLING_F32_11, 2, 2, pooling_mode::average_no_padding, "" }, -})); -#endif - -/* ----------------------------------------------------------------------------------------------------- */ -/* -------------------------------- DepthToSpace cases ------------------------------------------------- */ -/* ----------------------------------------------------------------------------------------------------- */ - -struct depth_to_space_test_params { - tensor input_size; - tensor output_size; - depth_to_space_mode mode; - data_types input_type; - format input_format; - size_t block_size; - data_types default_type; - format default_format; - size_t expected_fused_primitives; - size_t expected_not_fused_primitives; -}; - -#define CASE_DEPTH_TO_SPACE_F32_1 { 1, 16, 8, 10 }, { 1, 4, 16, 20 }, depth_to_space_mode::blocks_first, data_types::f32, format::bfyx, 2, data_types::f32, format::bfyx -#define CASE_DEPTH_TO_SPACE_F32_2 { 1, 32, 8, 8 }, { 1, 2, 32, 32 }, depth_to_space_mode::blocks_first, data_types::f32, format::b_fs_yx_fsv16, 4, data_types::f32, format::bfyx -#define CASE_DEPTH_TO_SPACE_F16_1 { 1, 12, 8, 8 }, { 1, 3, 16, 16 }, depth_to_space_mode::blocks_first, data_types::f16, format::bfyx, 2, data_types::f32, format::bfyx -#define CASE_DEPTH_TO_SPACE_F16_2 { 1, 16, 9, 8 }, { 1, 1, 36, 32 }, depth_to_space_mode::blocks_first, data_types::f16, format::b_fs_yx_fsv16, 4, data_types::f32, format::bfyx -#define CASE_DEPTH_TO_SPACE_U8_1 { 1, 128, 8, 8 }, { 1, 2, 64, 64 }, depth_to_space_mode::blocks_first, data_types::u8, format::bfyx, 8, data_types::f32, format::bfyx 
-#define CASE_DEPTH_TO_SPACE_U8_2 { 1, 128, 4, 8 }, { 1, 8, 16, 32 }, depth_to_space_mode::blocks_first, data_types::u8, format::b_fs_yx_fsv16, 4, data_types::f32, format::bfyx -#define CASE_DEPTH_TO_SPACE_I8_1 { 1, 16, 8, 8 }, { 1, 4, 16, 16 }, depth_to_space_mode::blocks_first, data_types::i8, format::bfyx, 2, data_types::f32, format::bfyx -#define CASE_DEPTH_TO_SPACE_I8_2 { 1, 256, 8, 8 }, { 1, 4, 64, 64 }, depth_to_space_mode::blocks_first, data_types::i8, format::b_fs_yx_fsv16, 8, data_types::f32, format::bfyx - -class DepthToSpaceFusingsTest : public ::BaseFusingTest { -public: - void execute(depth_to_space_test_params& p) { - auto input_prim = get_mem(get_input_layout(p)); - - network network_not_fused(this->engine, this->topology_non_fused, bo_not_fused); - network network_fused(this->engine, this->topology_fused, bo_fused); - - network_fused.set_input_data("input", input_prim); - network_not_fused.set_input_data("input", input_prim); - - compare(network_not_fused, network_fused, p); - } - - layout get_input_layout(depth_to_space_test_params& p) { - return layout{ p.input_type, p.input_format, p.input_size }; - } - - layout get_per_channel_layout(depth_to_space_test_params& p) { - return layout{ p.default_type, p.default_format, tensor{ 1, p.output_size.feature[0], 1, 1 } }; - } - - format get_input_format(depth_to_space_test_params &p) { - return p.input_format; - } -}; - -class depth_to_space_quantize_i8 : public DepthToSpaceFusingsTest {}; -TEST_P(depth_to_space_quantize_i8, basic) { - auto p = GetParam(); - create_topologies( - input_layout("input", get_input_layout(p)), - depth_to_space("depth_to_space", "input", p.block_size, p.mode), - data("in_low", get_mem(get_per_channel_layout(p), min_random, 0)), - data("in_high", get_mem(get_per_channel_layout(p), 1, max_random)), - data("out_low", get_mem(get_single_element_layout(p), -128)), - data("out_high", get_mem(get_single_element_layout(p), 127)), - quantize("quant", "depth_to_space", "in_low", 
"in_high", "out_low", "out_high", 256, data_types::i8), - reorder("reorder_bfyx", "quant", format::bfyx, data_types::f32) - ); - - tolerance = 1.f; - execute(p); -} - -INSTANTIATE_TEST_SUITE_P(fusings_gpu, depth_to_space_quantize_i8, ::testing::ValuesIn(std::vector{ - depth_to_space_test_params{ CASE_DEPTH_TO_SPACE_F32_1, 2, 3 }, - depth_to_space_test_params{ CASE_DEPTH_TO_SPACE_F32_2, 2, 3 }, - depth_to_space_test_params{ CASE_DEPTH_TO_SPACE_F16_1, 2, 3 }, - depth_to_space_test_params{ CASE_DEPTH_TO_SPACE_F16_2, 2, 3 }, -})); - -class depth_to_space_scale_act_eltwise_quantize_u8 : public DepthToSpaceFusingsTest {}; -TEST_P(depth_to_space_scale_act_eltwise_quantize_u8, basic) { - auto p = GetParam(); - create_topologies( - input_layout("input", get_input_layout(p)), - depth_to_space("depth_to_space", "input", p.block_size, p.mode), - data("scale1_data", get_mem(get_per_channel_layout(p), -0.125f)), - scale("scale1", "depth_to_space", "scale1_data"), - activation("actv1", "scale1", activation_func::relu), - data("eltw_data", get_mem(layout(p.default_type, p.input_format, p.output_size))), - eltwise("eltw", { "actv1", "eltw_data" }, eltwise_mode::sum, p.default_type), - data("in_low", get_mem(get_per_channel_layout(p), min_random, 0)), - data("in_high", get_mem(get_per_channel_layout(p), 1, max_random)), - data("out_low", get_mem(get_single_element_layout(p), 0)), - data("out_high", get_mem(get_single_element_layout(p), 255)), - quantize("quant", "eltw", "in_low", "in_high", "out_low", "out_high", 256, data_types::u8), - reorder("reorder_bfyx", "quant", format::bfyx, data_types::f32) - ); - - tolerance = 1.f; - execute(p); -} - -INSTANTIATE_TEST_SUITE_P(fusings_gpu, depth_to_space_scale_act_eltwise_quantize_u8, ::testing::ValuesIn(std::vector{ - depth_to_space_test_params{ CASE_DEPTH_TO_SPACE_F32_1, 2, 6 }, - depth_to_space_test_params{ CASE_DEPTH_TO_SPACE_F32_2, 2, 6 }, - depth_to_space_test_params{ CASE_DEPTH_TO_SPACE_F16_1, 2, 6 }, - depth_to_space_test_params{ 
CASE_DEPTH_TO_SPACE_F16_2, 2, 6 }, - depth_to_space_test_params{ CASE_DEPTH_TO_SPACE_U8_1, 2, 6 }, - depth_to_space_test_params{ CASE_DEPTH_TO_SPACE_U8_2, 2, 6 }, - depth_to_space_test_params{ CASE_DEPTH_TO_SPACE_I8_1, 2, 6 }, - depth_to_space_test_params{ CASE_DEPTH_TO_SPACE_I8_2, 2, 6 }, -})); - - -class depth_to_space_scale_act_eltw : public DepthToSpaceFusingsTest {}; -TEST_P(depth_to_space_scale_act_eltw, basic) { - auto p = GetParam(); - create_topologies( - input_layout("input", get_input_layout(p)), - depth_to_space("depth_to_space", "input", p.block_size, p.mode), - data("scale1_data", get_mem(get_per_channel_layout(p), -0.125f)), - scale("scale1", "depth_to_space", "scale1_data"), - activation("actv1", "scale1", activation_func::relu), - data("eltw_data", get_mem(layout(p.default_type, p.input_format, p.output_size))), - eltwise("eltw", { "actv1", "eltw_data" }, eltwise_mode::sum, p.default_type), - reorder("reorder_bfyx", "eltw", format::bfyx, data_types::f32) - ); - - tolerance = 1e-5f; - execute(p); -} - -INSTANTIATE_TEST_SUITE_P(fusings_gpu, depth_to_space_scale_act_eltw, ::testing::ValuesIn(std::vector{ - depth_to_space_test_params{ CASE_DEPTH_TO_SPACE_F32_1, 2, 5 }, - depth_to_space_test_params{ CASE_DEPTH_TO_SPACE_F32_2, 2, 5 }, - depth_to_space_test_params{ CASE_DEPTH_TO_SPACE_F16_1, 2, 5 }, - depth_to_space_test_params{ CASE_DEPTH_TO_SPACE_F16_2, 2, 5 }, - depth_to_space_test_params{ CASE_DEPTH_TO_SPACE_U8_1, 2, 5 }, - depth_to_space_test_params{ CASE_DEPTH_TO_SPACE_U8_2, 2, 5 }, - depth_to_space_test_params{ CASE_DEPTH_TO_SPACE_I8_1, 2, 5 }, - depth_to_space_test_params{ CASE_DEPTH_TO_SPACE_I8_2, 2, 5 }, -})); - -/* ----------------------------------------------------------------------------------------------------- */ -/* -------------------------------- SpaceToDepth cases ------------------------------------------------- */ -/* ----------------------------------------------------------------------------------------------------- */ - -struct 
space_to_depth_params { - tensor input_size; - tensor output_size; - space_to_depth::depth_mode mode; - data_types input_type; - format input_format; - size_t block_size; - data_types default_type; - format default_format; - size_t expected_fused_primitives; - size_t expected_not_fused_primitives; -}; - -#define CASE_SPACE_TO_DEPTH_F32_1 { 2, 2, 8, 10 }, { 2, 8, 4, 5 }, space_to_depth::depth_mode::blocks_first, data_types::f32, format::bfyx, 2, data_types::f32, format::bfyx -#define CASE_SPACE_TO_DEPTH_F32_2 { 1, 2, 6, 6, 6 }, { 1, 54, 2, 2, 2 }, space_to_depth::depth_mode::depth_first, data_types::f32, format::bfzyx, 3, data_types::f32, format::bfyx -#define CASE_SPACE_TO_DEPTH_F16_1 { 1, 3, 6, 6 }, { 1, 12, 3, 3 }, space_to_depth::depth_mode::blocks_first, data_types::f16, format::bfyx, 2, data_types::f32, format::bfyx -#define CASE_SPACE_TO_DEPTH_F16_2 { 2, 1, 3, 3 }, { 2, 9, 1, 1 }, space_to_depth::depth_mode::blocks_first, data_types::f16, format::b_fs_yx_fsv16, 3, data_types::f32, format::bfyx -#define CASE_SPACE_TO_DEPTH_U8_1 { 2, 2, 8, 10 }, { 2, 8, 4, 5 }, space_to_depth::depth_mode::blocks_first, data_types::u8, format::bfyx, 2, data_types::f32, format::bfyx -#define CASE_SPACE_TO_DEPTH_U8_2 { 1, 2, 6, 6, 6 }, { 1, 54, 2, 2, 2 }, space_to_depth::depth_mode::depth_first, data_types::u8, format::bfzyx, 3, data_types::f32, format::bfyx -#define CASE_SPACE_TO_DEPTH_I8_1 { 1, 3, 6, 6 }, { 1, 12, 3, 3 }, space_to_depth::depth_mode::blocks_first, data_types::i8, format::bfyx, 2, data_types::f32, format::bfyx -#define CASE_SPACE_TO_DEPTH_I8_2 { 2, 1, 3, 3 }, { 2, 9, 1, 1 }, space_to_depth::depth_mode::blocks_first, data_types::i8, format::b_fs_yx_fsv16, 3, data_types::f32, format::bfyx - -class SpaceToDepthFusingsTest : public ::BaseFusingTest { -public: - void execute(space_to_depth_params& p) { - auto input_prim = get_mem(get_input_layout(p)); - - network network_not_fused(this->engine, this->topology_non_fused, bo_not_fused); - network 
network_fused(this->engine, this->topology_fused, bo_fused); - - network_fused.set_input_data("input", input_prim); - network_not_fused.set_input_data("input", input_prim); - - compare(network_not_fused, network_fused, p); - } - - layout get_input_layout(space_to_depth_params& p) { - return layout{ p.input_type, p.input_format, p.input_size }; - } - - layout get_per_channel_layout(space_to_depth_params& p) { - return layout{ p.default_type, p.default_format, tensor{ 1, p.output_size.feature[0], 1, 1 } }; - } - - format get_input_format(space_to_depth_params &p) { - return p.input_format; - } -}; - -class space_to_depth_quantize_i8 : public SpaceToDepthFusingsTest {}; -TEST_P(space_to_depth_quantize_i8, basic) { - auto p = GetParam(); - create_topologies( - input_layout("input", get_input_layout(p)), - space_to_depth("space_to_depth", "input", p.mode, p.block_size), - data("in_low", get_mem(get_per_channel_layout(p), min_random, 0)), - data("in_high", get_mem(get_per_channel_layout(p), 1, max_random)), - data("out_low", get_mem(get_single_element_layout(p), -128)), - data("out_high", get_mem(get_single_element_layout(p), 127)), - quantize("quant", "space_to_depth", "in_low", "in_high", "out_low", "out_high", 256, data_types::i8), - reorder("reorder_bfyx", "quant", format::bfyx, data_types::f32) - ); - - tolerance = 1.f; - execute(p); -} - -INSTANTIATE_TEST_SUITE_P(fusings_gpu, space_to_depth_quantize_i8, ::testing::ValuesIn(std::vector{ - space_to_depth_params{ CASE_SPACE_TO_DEPTH_F32_1, 2, 3 }, - space_to_depth_params{ CASE_SPACE_TO_DEPTH_F32_2, 2, 3 }, - space_to_depth_params{ CASE_SPACE_TO_DEPTH_F16_1, 2, 3 }, - space_to_depth_params{ CASE_SPACE_TO_DEPTH_F16_2, 2, 3 }, -})); - -class space_to_depth_scale_act_eltwise_quantize_u8 : public SpaceToDepthFusingsTest {}; -TEST_P(space_to_depth_scale_act_eltwise_quantize_u8, basic) { - auto p = GetParam(); - create_topologies( - input_layout("input", get_input_layout(p)), - space_to_depth("space_to_depth", "input", 
p.mode, p.block_size), - data("scale1_data", get_mem(get_per_channel_layout(p), -0.125f)), - scale("scale1", "space_to_depth", "scale1_data"), - activation("actv1", "scale1", activation_func::relu), - data("eltw_data", get_mem(layout(p.default_type, p.input_format, p.output_size))), - eltwise("eltw", { "actv1", "eltw_data" }, eltwise_mode::sum, p.default_type), - data("in_low", get_mem(get_per_channel_layout(p), min_random, 0)), - data("in_high", get_mem(get_per_channel_layout(p), 1, max_random)), - data("out_low", get_mem(get_single_element_layout(p), 0)), - data("out_high", get_mem(get_single_element_layout(p), 255)), - quantize("quant", "eltw", "in_low", "in_high", "out_low", "out_high", 256, data_types::u8), - reorder("reorder_bfyx", "quant", format::bfyx, data_types::f32) - ); - - tolerance = 1.f; - execute(p); -} - -INSTANTIATE_TEST_SUITE_P(fusings_gpu, space_to_depth_scale_act_eltwise_quantize_u8, ::testing::ValuesIn(std::vector{ - space_to_depth_params{ CASE_SPACE_TO_DEPTH_F32_1, 2, 6 }, - space_to_depth_params{ CASE_SPACE_TO_DEPTH_F32_2, 2, 6 }, - space_to_depth_params{ CASE_SPACE_TO_DEPTH_F16_1, 2, 6 }, - space_to_depth_params{ CASE_SPACE_TO_DEPTH_F16_2, 2, 6 }, - space_to_depth_params{ CASE_SPACE_TO_DEPTH_U8_1, 2, 6 }, - space_to_depth_params{ CASE_SPACE_TO_DEPTH_U8_2, 2, 6 }, - space_to_depth_params{ CASE_SPACE_TO_DEPTH_I8_1, 2, 6 }, - space_to_depth_params{ CASE_SPACE_TO_DEPTH_I8_2, 2, 6 }, -})); - - -class space_to_depth_scale_act_eltw : public SpaceToDepthFusingsTest {}; -TEST_P(space_to_depth_scale_act_eltw, basic) { - auto p = GetParam(); - create_topologies( - input_layout("input", get_input_layout(p)), - space_to_depth("space_to_depth", "input", p.mode, p.block_size), - data("scale1_data", get_mem(get_per_channel_layout(p), -0.125f)), - scale("scale1", "space_to_depth", "scale1_data"), - activation("actv1", "scale1", activation_func::relu), - data("eltw_data", get_mem(layout(p.default_type, p.input_format, p.output_size))), - eltwise("eltw", { 
"actv1", "eltw_data" }, eltwise_mode::sum, p.default_type), - reorder("reorder_bfyx", "eltw", format::bfyx, data_types::f32) - ); - - tolerance = 1e-5f; - execute(p); -} - -INSTANTIATE_TEST_SUITE_P(fusings_gpu, space_to_depth_scale_act_eltw, ::testing::ValuesIn(std::vector{ - space_to_depth_params{ CASE_SPACE_TO_DEPTH_F32_1, 2, 5 }, - space_to_depth_params{ CASE_SPACE_TO_DEPTH_F32_2, 2, 5 }, - space_to_depth_params{ CASE_SPACE_TO_DEPTH_F16_1, 2, 5 }, - space_to_depth_params{ CASE_SPACE_TO_DEPTH_F16_2, 2, 5 }, - space_to_depth_params{ CASE_SPACE_TO_DEPTH_U8_1, 2, 5 }, - space_to_depth_params{ CASE_SPACE_TO_DEPTH_U8_2, 2, 5 }, - space_to_depth_params{ CASE_SPACE_TO_DEPTH_I8_1, 2, 5 }, - space_to_depth_params{ CASE_SPACE_TO_DEPTH_I8_2, 2, 5 }, -})); - -/* ----------------------------------------------------------------------------------------------------- */ -/* ------------------------------------------ Gather cases --------------------------------------------- */ -/* ----------------------------------------------------------------------------------------------------- */ - -struct gather_test_params { - tensor dictionary_shape; - tensor indices_shape; - tensor out_shape; - format out_format; - cldnn::gather::gather_axis axis; - data_types data_type; - format input_format; - data_types default_type; - format default_format; - size_t expected_fused_primitives; - size_t expected_not_fused_primitives; -}; - -#define CASE_GATHER_FP32_1 { 2, 3, 1, 4 }, { 4, 1, 1, 1 }, { 4, 3, 1, 4 }, format::bfyx, cldnn::gather::gather_axis::along_b, data_types::f32, format::bfyx, data_types::f32, format::bfyx -#define CASE_GATHER_FP32_2 { 3, 2, 1, 2 }, { 2, 3, 1, 1 }, { 2, 3, 2, 2 }, format::bfyx, cldnn::gather::gather_axis::along_b, data_types::f32, format::bfyx, data_types::f32, format::bfyx -#define CASE_GATHER_FP32_3 { 3, 1, 1, 2 }, { 2, 1, 1, 1 }, { 3, 2, 1, 2 }, format::bfyx, cldnn::gather::gather_axis::along_f, data_types::f32, format::bfyx, data_types::f32, format::bfyx -#define 
CASE_GATHER_FP32_4 { 5, 3, 2, 2 }, { 3, 1, 1, 1 }, { 5, 2, 2, 3 }, format::bfyx, cldnn::gather::gather_axis::along_y, data_types::f32, format::bfyx, data_types::f32, format::bfyx -#define CASE_GATHER_FP32_5 { 2, 3, 1, 2 }, { 1, 3, 1, 1 }, { 2, 3, 3, 1 }, format::bfyx, cldnn::gather::gather_axis::along_y, data_types::f32, format::bfyx, data_types::f32, format::bfyx - -#define CASE_GATHER_FP16_1 { 2, 3, 1, 4 }, { 4, 1, 1, 1 }, { 4, 3, 1, 4 }, format::bfyx, cldnn::gather::gather_axis::along_b, data_types::f16, format::bfyx, data_types::f16, format::bfyx -#define CASE_GATHER_FP16_2 { 3, 2, 1, 2 }, { 2, 3, 1, 1 }, { 2, 3, 2, 2 }, format::bfyx, cldnn::gather::gather_axis::along_b, data_types::f16, format::bfyx, data_types::f16, format::bfyx -#define CASE_GATHER_FP16_3 { 3, 1, 1, 2 }, { 2, 1, 1, 1 }, { 3, 2, 1, 2 }, format::bfyx, cldnn::gather::gather_axis::along_f, data_types::f16, format::bfyx, data_types::f16, format::bfyx -#define CASE_GATHER_FP16_4 { 5, 3, 2, 2 }, { 3, 1, 1, 1 }, { 5, 2, 2, 3 }, format::bfyx, cldnn::gather::gather_axis::along_y, data_types::f16, format::bfyx, data_types::f16, format::bfyx -#define CASE_GATHER_FP16_5 { 2, 3, 1, 2 }, { 1, 3, 1, 1 }, { 2, 3, 3, 1 }, format::bfyx, cldnn::gather::gather_axis::along_y, data_types::f16, format::bfyx, data_types::f16, format::bfyx - -#define CASE_GATHER_5D_FP32_1 { 2, 3, 1, 4, 1 }, { 4, 1, 1, 1 }, { 4, 3, 1, 4, 1 }, format::bfzyx, cldnn::gather::gather_axis::along_b, data_types::f32, format::bfzyx, data_types::f32, format::bfzyx -#define CASE_GATHER_5D_FP32_2 { 2, 3, 2, 2, 2 }, { 2, 1, 1, 1 }, { 2, 2, 2, 2, 2 }, format::bfzyx, cldnn::gather::gather_axis::along_f, data_types::f32, format::bfzyx, data_types::f32, format::bfzyx -#define CASE_GATHER_5D_FP32_3 { 5, 3, 2, 2, 2 }, { 3, 1, 1, 1 }, { 5, 3, 2, 3, 2 }, format::bfzyx, cldnn::gather::gather_axis::along_y, data_types::f32, format::bfzyx, data_types::f32, format::bfzyx -#define CASE_GATHER_5D_FP32_4 { 2, 3, 1, 4, 4 }, { 2, 1, 1, 1 }, { 2, 3, 1, 4, 2 }, 
format::bfzyx, cldnn::gather::gather_axis::along_z, data_types::f32, format::bfzyx, data_types::f32, format::bfzyx -#define CASE_GATHER_5D_FP32_5 { 3, 1, 5, 2, 1 }, { 2, 1, 1, 1 }, { 3, 1, 2, 2, 1 }, format::bfzyx, cldnn::gather::gather_axis::along_x, data_types::f32, format::bfzyx, data_types::f32, format::bfzyx - -#define CASE_GATHER_5D_FP16_1 { 3, 2, 1, 2, 1 }, { 2, 1, 1, 1 }, { 2, 2, 2, 2, 1 }, format::bfzyx, cldnn::gather::gather_axis::along_b, data_types::f16, format::bfzyx, data_types::f16, format::bfzyx -#define CASE_GATHER_5D_FP16_2 { 1, 3, 1, 2, 1 }, { 2, 1, 1, 1 }, { 1, 2, 1, 2, 1 }, format::bfzyx, cldnn::gather::gather_axis::along_f, data_types::f16, format::bfzyx, data_types::f16, format::bfzyx -#define CASE_GATHER_5D_FP16_3 { 2, 3, 1, 3, 3 }, { 1, 2, 1, 1 }, { 2, 3, 1, 2, 3 }, format::bfzyx, cldnn::gather::gather_axis::along_y, data_types::f16, format::bfzyx, data_types::f16, format::bfzyx -#define CASE_GATHER_5D_FP16_4 { 3, 2, 2, 2, 2 }, { 2, 1, 1, 1 }, { 3, 2, 2, 2, 2 }, format::bfzyx, cldnn::gather::gather_axis::along_z, data_types::f16, format::bfzyx, data_types::f16, format::bfzyx -#define CASE_GATHER_5D_FP16_5 { 1, 1, 2, 1, 1 }, { 3, 1, 1, 1 }, { 1, 1, 3, 1, 1 }, format::bfzyx, cldnn::gather::gather_axis::along_x, data_types::f16, format::bfzyx, data_types::f16, format::bfzyx - -class GatherPrimitiveFusingTest : public ::BaseFusingTest { -public: - void execute(gather_test_params& p) { - auto input_prim = get_mem(get_input_layout(p)); - network network_not_fused(this->engine, this->topology_non_fused, bo_not_fused); - network network_fused(this->engine, this->topology_fused, bo_fused); - network_fused.set_input_data("input", input_prim); - network_not_fused.set_input_data("input", input_prim); - - compare(network_not_fused, network_fused, p); - } - - layout get_input_layout(gather_test_params& p) { - return layout{ p.data_type, p.input_format, p.dictionary_shape }; - } - - layout get_indices_layout(gather_test_params& p) { - return layout{ 
p.data_type, format::bfyx, p.indices_shape }; - } - - size_t get_axis_dim(gather_test_params& p) { - switch (p.axis) { - case cldnn::gather::gather_axis::along_x: - return p.dictionary_shape.spatial[0]; - case cldnn::gather::gather_axis::along_y: - return p.dictionary_shape.spatial[1]; - case cldnn::gather::gather_axis::along_z: - return p.dictionary_shape.spatial[2]; - case cldnn::gather::gather_axis::along_w: - return p.dictionary_shape.spatial[3]; - case cldnn::gather::gather_axis::along_f: - return p.dictionary_shape.feature[0]; - case cldnn::gather::gather_axis::along_b: - return p.dictionary_shape.batch[0]; - default: - return 1; - } - } - - layout get_per_channel_layout(gather_test_params& p) { - return layout{ p.default_type, p.default_format, tensor{ 1, p.out_shape.feature[0], 1, 1 } }; - } -}; - -class gather_quantize : public GatherPrimitiveFusingTest {}; -TEST_P(gather_quantize, basic) { - auto p = GetParam(); - create_topologies( - input_layout("input", get_input_layout(p)), - data("gather_indices", get_mem(get_indices_layout(p), 0, static_cast(get_axis_dim(p)))), - data("in_lo", get_mem(get_per_channel_layout(p), min_random, 0)), - data("in_hi", get_mem(get_per_channel_layout(p), 1, max_random)), - data("out_lo", get_mem(get_single_element_layout(p), -127)), - data("out_hi", get_mem(get_single_element_layout(p), 127)), - gather("gather_prim", "input", "gather_indices", p.axis, p.out_format, p.out_shape), - quantize("quantize", "gather_prim", "in_lo", "in_hi", "out_lo", "out_hi", 255, data_types::i8), - reorder("reorder_bfyx", "quantize", p.default_format, data_types::f32) - ); - - tolerance = 1.f; - execute(p); -} - -INSTANTIATE_TEST_SUITE_P(fusings_gpu, gather_quantize, ::testing::ValuesIn(std::vector{ - gather_test_params{ CASE_GATHER_FP32_1, 2, 3 }, - gather_test_params{ CASE_GATHER_FP32_2, 2, 3 }, - gather_test_params{ CASE_GATHER_FP32_3, 2, 3 }, - gather_test_params{ CASE_GATHER_FP32_4, 2, 3 }, - gather_test_params{ CASE_GATHER_FP32_5, 2, 3 }, - 
- gather_test_params{ CASE_GATHER_FP16_1, 2, 3 }, - gather_test_params{ CASE_GATHER_FP16_2, 2, 3 }, - gather_test_params{ CASE_GATHER_FP16_3, 2, 3 }, - gather_test_params{ CASE_GATHER_FP16_4, 2, 3 }, - gather_test_params{ CASE_GATHER_FP16_5, 2, 3 }, - - gather_test_params{ CASE_GATHER_5D_FP32_1, 2, 3 }, - gather_test_params{ CASE_GATHER_5D_FP32_2, 2, 3 }, - gather_test_params{ CASE_GATHER_5D_FP32_3, 2, 3 }, - gather_test_params{ CASE_GATHER_5D_FP32_4, 2, 3 }, - gather_test_params{ CASE_GATHER_5D_FP32_5, 2, 3 }, - - gather_test_params{ CASE_GATHER_5D_FP16_1, 2, 3 }, - gather_test_params{ CASE_GATHER_5D_FP16_2, 2, 3 }, - gather_test_params{ CASE_GATHER_5D_FP16_3, 2, 3 }, - gather_test_params{ CASE_GATHER_5D_FP16_4, 2, 3 }, - gather_test_params{ CASE_GATHER_5D_FP16_5, 2, 3 }, -})); - -class gather_scale_activation : public GatherPrimitiveFusingTest {}; -TEST_P(gather_scale_activation, basic) { - auto p = GetParam(); - create_topologies( - input_layout("input", get_input_layout(p)), - data("gather_indices", get_mem(get_indices_layout(p), 0, static_cast(get_axis_dim(p)))), - data("scale_data", get_mem(get_per_channel_layout(p), -10, 10)), - gather("gather_prim", "input", "gather_indices", p.axis, p.out_format, p.out_shape), - activation("activation", "gather_prim", activation_func::abs), - scale("scale", "activation", "scale_data"), - reorder("reorder_bfyx", "scale", p.default_format, data_types::f32) - ); - - tolerance = 1e-5f; - execute(p); -} - -INSTANTIATE_TEST_SUITE_P(fusings_gpu, gather_scale_activation, ::testing::ValuesIn(std::vector{ - gather_test_params{ CASE_GATHER_FP32_1, 2, 4 }, - gather_test_params{ CASE_GATHER_FP32_2, 2, 4 }, - gather_test_params{ CASE_GATHER_FP32_3, 2, 4 }, - gather_test_params{ CASE_GATHER_FP32_4, 2, 4 }, - gather_test_params{ CASE_GATHER_FP32_5, 2, 4 }, - - gather_test_params{ CASE_GATHER_FP16_1, 2, 4 }, - gather_test_params{ CASE_GATHER_FP16_2, 2, 4 }, - gather_test_params{ CASE_GATHER_FP16_3, 2, 4 }, - gather_test_params{ 
CASE_GATHER_FP16_4, 2, 4 }, - gather_test_params{ CASE_GATHER_FP16_5, 2, 4 }, - - gather_test_params{ CASE_GATHER_5D_FP32_1, 2, 4 }, - gather_test_params{ CASE_GATHER_5D_FP32_2, 2, 4 }, - gather_test_params{ CASE_GATHER_5D_FP32_3, 2, 4 }, - gather_test_params{ CASE_GATHER_5D_FP32_4, 2, 4 }, - gather_test_params{ CASE_GATHER_5D_FP32_5, 2, 4 }, - - gather_test_params{ CASE_GATHER_5D_FP16_1, 2, 4 }, - gather_test_params{ CASE_GATHER_5D_FP16_2, 2, 4 }, - gather_test_params{ CASE_GATHER_5D_FP16_3, 2, 4 }, - gather_test_params{ CASE_GATHER_5D_FP16_4, 2, 4 }, - gather_test_params{ CASE_GATHER_5D_FP16_5, 2, 4 }, -})); - - -/* ----------------------------------------------------------------------------------------------------- */ -/* ------------------------------------------ ScatterUpdate cases -------------------------------------- */ -/* ----------------------------------------------------------------------------------------------------- */ - -struct scatter_update_test_params { - tensor dictionary_shape; - tensor indices_shape; - tensor updates_shape; - cldnn::scatter_update::scatter_update_axis axis; - data_types data_type; - format input_format; - data_types default_type; - format default_format; - size_t expected_fused_primitives; - size_t expected_not_fused_primitives; -}; - -#define CASE_SCATTER_UPDATE_FP32_1 { 2, 4, 1, 1 }, { 2, 1, 1, 1 }, { 2, 4, 1, 1 }, cldnn::scatter_update::scatter_update_axis::along_b, data_types::f32, format::bfyx, data_types::f32, format::bfyx -#define CASE_SCATTER_UPDATE_FP32_2 { 8, 1, 1, 1 }, { 4, 1, 1, 1 }, { 4, 1, 1, 1 }, cldnn::scatter_update::scatter_update_axis::along_b, data_types::f32, format::bfyx, data_types::f32, format::bfyx -#define CASE_SCATTER_UPDATE_FP32_3 { 4, 3, 1, 1 }, { 2, 2, 1, 1 }, { 2, 2, 1, 3 }, cldnn::scatter_update::scatter_update_axis::along_b, data_types::f32, format::bfyx, data_types::f32, format::bfyx -#define CASE_SCATTER_UPDATE_FP32_4 { 2, 5, 1, 2 }, { 2, 2, 1, 1 }, { 2, 2, 2, 2 }, 
cldnn::scatter_update::scatter_update_axis::along_f, data_types::f32, format::bfyx, data_types::f32, format::bfyx -#define CASE_SCATTER_UPDATE_FP32_5 { 2, 2, 1, 4 }, { 2, 2, 1, 1 }, { 2, 2, 2, 2 }, cldnn::scatter_update::scatter_update_axis::along_y, data_types::f32, format::bfyx, data_types::f32, format::bfyx - -#define CASE_SCATTER_UPDATE_FP16_1 { 2, 4, 1, 1 }, { 1, 1, 1, 2 }, { 2, 1, 2, 1 }, cldnn::scatter_update::scatter_update_axis::along_f, data_types::f16, format::bfyx, data_types::f16, format::bfyx -#define CASE_SCATTER_UPDATE_FP16_2 { 8, 2, 1, 20 }, { 2, 3, 1, 1 }, { 2, 3, 20, 2 }, cldnn::scatter_update::scatter_update_axis::along_b, data_types::f16, format::bfyx, data_types::f16, format::bfyx -#define CASE_SCATTER_UPDATE_FP16_3 { 2, 2, 4, 1 }, { 3, 1, 1, 1 }, { 2, 2, 3, 1 }, cldnn::scatter_update::scatter_update_axis::along_x, data_types::f16, format::bfyx, data_types::f16, format::bfyx -#define CASE_SCATTER_UPDATE_FP16_4 { 6, 2, 1, 1 }, { 1, 2, 1, 2 }, { 1, 2, 2, 2 }, cldnn::scatter_update::scatter_update_axis::along_b, data_types::f16, format::bfyx, data_types::f16, format::bfyx -#define CASE_SCATTER_UPDATE_FP16_5 { 3, 1, 1, 5 }, { 2, 2, 1, 1 }, { 3, 1, 2, 2 }, cldnn::scatter_update::scatter_update_axis::along_y, data_types::f16, format::bfyx, data_types::f16, format::bfyx - -#define CASE_SCATTER_UPDATE_5D_FP32_1 { 4, 3, 1, 4, 1 }, { 4, 1, 1, 1 }, { 4, 3, 1, 4, 1 }, cldnn::scatter_update::scatter_update_axis::along_b, data_types::f32, format::bfzyx, data_types::f32, format::bfzyx -#define CASE_SCATTER_UPDATE_5D_FP32_2 { 2, 3, 2, 2, 2 }, { 2, 1, 1, 1 }, { 2, 2, 2, 2, 2 }, cldnn::scatter_update::scatter_update_axis::along_f, data_types::f32, format::bfzyx, data_types::f32, format::bfzyx -#define CASE_SCATTER_UPDATE_5D_FP32_3 { 5, 3, 2, 4, 2 }, { 3, 1, 1, 1 }, { 5, 3, 2, 3, 2 }, cldnn::scatter_update::scatter_update_axis::along_y, data_types::f32, format::bfzyx, data_types::f32, format::bfzyx -#define CASE_SCATTER_UPDATE_5D_FP32_4 { 2, 3, 1, 4, 4 }, { 2, 
1, 1, 1 }, { 2, 3, 1, 4, 2 }, cldnn::scatter_update::scatter_update_axis::along_z, data_types::f32, format::bfzyx, data_types::f32, format::bfzyx -#define CASE_SCATTER_UPDATE_5D_FP32_5 { 3, 1, 5, 2, 1 }, { 2, 1, 1, 1 }, { 3, 1, 2, 2, 1 }, cldnn::scatter_update::scatter_update_axis::along_x, data_types::f32, format::bfzyx, data_types::f32, format::bfzyx - -#define CASE_SCATTER_UPDATE_5D_FP16_1 { 3, 2, 1, 2, 1 }, { 2, 1, 1, 1 }, { 2, 2, 2, 2, 1 }, cldnn::scatter_update::scatter_update_axis::along_b, data_types::f16, format::bfzyx, data_types::f16, format::bfzyx -#define CASE_SCATTER_UPDATE_5D_FP16_2 { 1, 3, 1, 2, 1 }, { 2, 1, 1, 1 }, { 1, 2, 1, 2, 1 }, cldnn::scatter_update::scatter_update_axis::along_f, data_types::f16, format::bfzyx, data_types::f16, format::bfzyx -#define CASE_SCATTER_UPDATE_5D_FP16_3 { 2, 3, 1, 3, 3 }, { 1, 2, 1, 1 }, { 2, 3, 1, 2, 3 }, cldnn::scatter_update::scatter_update_axis::along_y, data_types::f16, format::bfzyx, data_types::f16, format::bfzyx -#define CASE_SCATTER_UPDATE_5D_FP16_4 { 3, 2, 2, 2, 2 }, { 2, 1, 1, 1 }, { 3, 2, 2, 2, 2 }, cldnn::scatter_update::scatter_update_axis::along_z, data_types::f16, format::bfzyx, data_types::f16, format::bfzyx -#define CASE_SCATTER_UPDATE_5D_FP16_5 { 1, 1, 4, 1, 1 }, { 3, 1, 1, 1 }, { 1, 1, 3, 1, 1 }, cldnn::scatter_update::scatter_update_axis::along_x, data_types::f16, format::bfzyx, data_types::f16, format::bfzyx - -class ScatterUpdatePrimitiveFusingTest : public ::BaseFusingTest { -public: - void execute(scatter_update_test_params& p) { - auto input_prim = get_mem(get_input_layout(p)); - network network_not_fused(this->engine, this->topology_non_fused, bo_not_fused); - network network_fused(this->engine, this->topology_fused, bo_fused); - network_fused.set_input_data("input", input_prim); - network_not_fused.set_input_data("input", input_prim); - compare(network_not_fused, network_fused, p); - } - - layout get_input_layout(scatter_update_test_params& p) { - return layout{ p.data_type, 
p.input_format, p.dictionary_shape }; - } - - layout get_indices_layout(scatter_update_test_params& p) { - return layout{ p.data_type, format::bfyx, p.indices_shape }; - } - - layout get_updates_layout(scatter_update_test_params& p) { - return layout{ p.data_type, p.input_format, p.updates_shape }; - } - - size_t get_axis_dim(scatter_update_test_params& p) { - switch (p.axis) { - case cldnn::scatter_update::scatter_update_axis::along_x: - return p.dictionary_shape.spatial[0]; - case cldnn::scatter_update::scatter_update_axis::along_y: - return p.dictionary_shape.spatial[1]; - case cldnn::scatter_update::scatter_update_axis::along_z: - return p.dictionary_shape.spatial[2]; - case cldnn::scatter_update::scatter_update_axis::along_w: - return p.dictionary_shape.spatial[3]; - case cldnn::scatter_update::scatter_update_axis::along_f: - return p.dictionary_shape.feature[0]; - case cldnn::scatter_update::scatter_update_axis::along_b: - return p.dictionary_shape.batch[0]; - default: - return 1; - } - } - - layout get_per_channel_layout(scatter_update_test_params& p) { - return layout{ p.default_type, p.default_format, tensor{ 1, p.dictionary_shape.feature[0], 1, 1 } }; - } -}; - -class scatter_update_quantize : public ScatterUpdatePrimitiveFusingTest {}; -TEST_P(scatter_update_quantize, basic) { - auto p = GetParam(); - create_topologies( - input_layout("input", get_input_layout(p)), - data("scatter_update_indices", get_repeatless_mem(get_indices_layout(p), 0, static_cast(get_axis_dim(p)) - 1)), - data("scatter_update_updates", get_mem(get_updates_layout(p), 0, 1000)), - data("in_lo", get_mem(get_per_channel_layout(p), min_random, 0)), - data("in_hi", get_mem(get_per_channel_layout(p), 1, max_random)), - data("out_lo", get_mem(get_single_element_layout(p), -127)), - data("out_hi", get_mem(get_single_element_layout(p), 127)), - scatter_update("scatter_update_prim", "input", "scatter_update_indices", "scatter_update_updates", p.axis), - quantize("quantize", 
"scatter_update_prim", "in_lo", "in_hi", "out_lo", "out_hi", 255, data_types::i8), - reorder("reorder_bfyx", "quantize", p.default_format, data_types::f32) - ); - - tolerance = 1.f; - execute(p); -} - -INSTANTIATE_TEST_SUITE_P(fusings_gpu, scatter_update_quantize, ::testing::ValuesIn(std::vector{ - scatter_update_test_params{ CASE_SCATTER_UPDATE_FP32_1, 2, 3 }, - scatter_update_test_params{ CASE_SCATTER_UPDATE_FP32_2, 2, 3 }, - scatter_update_test_params{ CASE_SCATTER_UPDATE_FP32_3, 2, 3 }, - scatter_update_test_params{ CASE_SCATTER_UPDATE_FP32_4, 2, 3 }, - scatter_update_test_params{ CASE_SCATTER_UPDATE_FP32_5, 2, 3 }, - - scatter_update_test_params{ CASE_SCATTER_UPDATE_FP16_1, 2, 3 }, - scatter_update_test_params{ CASE_SCATTER_UPDATE_FP16_2, 2, 3 }, - scatter_update_test_params{ CASE_SCATTER_UPDATE_FP16_3, 2, 3 }, - scatter_update_test_params{ CASE_SCATTER_UPDATE_FP16_4, 2, 3 }, - scatter_update_test_params{ CASE_SCATTER_UPDATE_FP16_5, 2, 3 }, - - scatter_update_test_params{ CASE_SCATTER_UPDATE_5D_FP32_1, 2, 3 }, - scatter_update_test_params{ CASE_SCATTER_UPDATE_5D_FP32_2, 2, 3 }, - scatter_update_test_params{ CASE_SCATTER_UPDATE_5D_FP32_3, 2, 3 }, - scatter_update_test_params{ CASE_SCATTER_UPDATE_5D_FP32_4, 2, 3 }, - scatter_update_test_params{ CASE_SCATTER_UPDATE_5D_FP32_5, 2, 3 }, - - scatter_update_test_params{ CASE_SCATTER_UPDATE_5D_FP16_1, 2, 3 }, - scatter_update_test_params{ CASE_SCATTER_UPDATE_5D_FP16_2, 2, 3 }, - scatter_update_test_params{ CASE_SCATTER_UPDATE_5D_FP16_3, 2, 3 }, - scatter_update_test_params{ CASE_SCATTER_UPDATE_5D_FP16_4, 2, 3 }, - scatter_update_test_params{ CASE_SCATTER_UPDATE_5D_FP16_5, 2, 3 }, -})); - -class scatter_update_scale_activation : public ScatterUpdatePrimitiveFusingTest {}; -TEST_P(scatter_update_scale_activation, basic) { - auto p = GetParam(); - create_topologies( - input_layout("input", get_input_layout(p)), - data("scatter_update_indices", get_repeatless_mem(get_indices_layout(p), 0, static_cast(get_axis_dim(p)) - 
1)), - data("scatter_update_updates", get_mem(get_updates_layout(p), 0, 1000)), - data("scale_data", get_mem(get_per_channel_layout(p), -10, 10)), - scatter_update("scatter_update_prim", "input", "scatter_update_indices", "scatter_update_updates", p.axis), - activation("activation", "scatter_update_prim", activation_func::abs), - scale("scale", "activation", "scale_data"), - reorder("reorder_bfyx", "scale", p.default_format, data_types::f32) - ); - - tolerance = 1e-5f; - execute(p); -} - -INSTANTIATE_TEST_SUITE_P(fusings_gpu, scatter_update_scale_activation, ::testing::ValuesIn(std::vector{ - scatter_update_test_params{ CASE_SCATTER_UPDATE_FP32_1, 2, 4 }, - scatter_update_test_params{ CASE_SCATTER_UPDATE_FP32_2, 2, 4 }, - scatter_update_test_params{ CASE_SCATTER_UPDATE_FP32_3, 2, 4 }, - scatter_update_test_params{ CASE_SCATTER_UPDATE_FP32_4, 2, 4 }, - scatter_update_test_params{ CASE_SCATTER_UPDATE_FP32_5, 2, 4 }, - - scatter_update_test_params{ CASE_SCATTER_UPDATE_FP16_1, 2, 4 }, - scatter_update_test_params{ CASE_SCATTER_UPDATE_FP16_2, 2, 4 }, - scatter_update_test_params{ CASE_SCATTER_UPDATE_FP16_3, 2, 4 }, - scatter_update_test_params{ CASE_SCATTER_UPDATE_FP16_4, 2, 4 }, - scatter_update_test_params{ CASE_SCATTER_UPDATE_FP16_5, 2, 4 }, - - scatter_update_test_params{ CASE_SCATTER_UPDATE_5D_FP32_1, 2, 4 }, - scatter_update_test_params{ CASE_SCATTER_UPDATE_5D_FP32_2, 2, 4 }, - scatter_update_test_params{ CASE_SCATTER_UPDATE_5D_FP32_3, 2, 4 }, - scatter_update_test_params{ CASE_SCATTER_UPDATE_5D_FP32_4, 2, 4 }, - scatter_update_test_params{ CASE_SCATTER_UPDATE_5D_FP32_5, 2, 4 }, - - scatter_update_test_params{ CASE_SCATTER_UPDATE_5D_FP16_1, 2, 4 }, - scatter_update_test_params{ CASE_SCATTER_UPDATE_5D_FP16_2, 2, 4 }, - scatter_update_test_params{ CASE_SCATTER_UPDATE_5D_FP16_3, 2, 4 }, - scatter_update_test_params{ CASE_SCATTER_UPDATE_5D_FP16_4, 2, 4 }, - scatter_update_test_params{ CASE_SCATTER_UPDATE_5D_FP16_5, 2, 4 }, -})); - -class 
scatter_update_scale_activation_eltwise : public ScatterUpdatePrimitiveFusingTest {}; -TEST_P(scatter_update_scale_activation_eltwise, basic) { - auto p = GetParam(); - create_topologies( - input_layout("input", get_input_layout(p)), - data("scatter_update_indices", get_repeatless_mem(get_indices_layout(p), 0, static_cast(get_axis_dim(p)) - 1)), - data("scatter_update_updates", get_mem(get_updates_layout(p), 0, 1000)), - data("scale_data", get_mem(get_per_channel_layout(p), -10, 10)), - data("eltw_data", get_mem(layout(p.default_type, p.default_format, p.dictionary_shape))), - scatter_update("scatter_update_prim", "input", "scatter_update_indices", "scatter_update_updates", p.axis), - activation("activation", "scatter_update_prim", activation_func::abs), - eltwise("eltw", { "activation", "eltw_data" }, eltwise_mode::sum, p.default_type), - scale("scale", "eltw", "scale_data"), - reorder("reorder_bfyx", "scale", p.default_format, data_types::f32) - ); - tolerance = 1e-5f; - execute(p); -} - -INSTANTIATE_TEST_SUITE_P(fusings_gpu, scatter_update_scale_activation_eltwise, ::testing::ValuesIn(std::vector{ - scatter_update_test_params{ CASE_SCATTER_UPDATE_FP32_1, 3, 5 }, - scatter_update_test_params{ CASE_SCATTER_UPDATE_FP32_2, 3, 5 }, - scatter_update_test_params{ CASE_SCATTER_UPDATE_FP32_3, 3, 5 }, - scatter_update_test_params{ CASE_SCATTER_UPDATE_FP32_4, 3, 5 }, - scatter_update_test_params{ CASE_SCATTER_UPDATE_FP32_5, 3, 5 }, - - scatter_update_test_params{ CASE_SCATTER_UPDATE_FP16_1, 3, 5 }, - scatter_update_test_params{ CASE_SCATTER_UPDATE_FP16_2, 3, 5 }, - scatter_update_test_params{ CASE_SCATTER_UPDATE_FP16_3, 3, 5 }, - scatter_update_test_params{ CASE_SCATTER_UPDATE_FP16_4, 3, 5 }, - scatter_update_test_params{ CASE_SCATTER_UPDATE_FP16_5, 3, 5 }, - - scatter_update_test_params{ CASE_SCATTER_UPDATE_5D_FP32_1, 3, 5 }, - scatter_update_test_params{ CASE_SCATTER_UPDATE_5D_FP32_2, 3, 5 }, - scatter_update_test_params{ CASE_SCATTER_UPDATE_5D_FP32_3, 3, 5 }, - 
scatter_update_test_params{ CASE_SCATTER_UPDATE_5D_FP32_4, 3, 5 }, - scatter_update_test_params{ CASE_SCATTER_UPDATE_5D_FP32_5, 3, 5 }, - - scatter_update_test_params{ CASE_SCATTER_UPDATE_5D_FP16_1, 3, 5 }, - scatter_update_test_params{ CASE_SCATTER_UPDATE_5D_FP16_2, 3, 5 }, - scatter_update_test_params{ CASE_SCATTER_UPDATE_5D_FP16_3, 3, 5 }, - scatter_update_test_params{ CASE_SCATTER_UPDATE_5D_FP16_4, 3, 5 }, - scatter_update_test_params{ CASE_SCATTER_UPDATE_5D_FP16_5, 3, 5 }, -})); - -/* ----------------------------------------------------------------------------------------------------- */ -/* ------------------------------------------ ScatterElementsUpdate cases ------------------------------ */ -/* ----------------------------------------------------------------------------------------------------- */ - -struct scatter_elements_update_test_params { - tensor input_shape; - tensor indices_shape; - cldnn::scatter_elements_update::scatter_elements_update_axis axis; - data_types data_type; - format input_format; - data_types default_type; - format default_format; - size_t expected_fused_primitives; - size_t expected_not_fused_primitives; -}; - -// input shape along the update axis should be larger than the total number of elements in the update tensor. -// This is not a limitation of operation itself, but a limitation of test implementation. 
-#define CASE_SCATTER_ELEMENTS_UPDATE_FP32_1 { 8, 4, 1, 1 }, { 2, 4, 1, 1 }, cldnn::scatter_elements_update::scatter_elements_update_axis::along_b, data_types::f32, format::bfyx, data_types::f32, format::bfyx -#define CASE_SCATTER_ELEMENTS_UPDATE_FP32_2 { 2, 8, 1, 2 }, { 2, 2, 1, 2 }, cldnn::scatter_elements_update::scatter_elements_update_axis::along_f, data_types::f32, format::bfyx, data_types::f32, format::bfyx -#define CASE_SCATTER_ELEMENTS_UPDATE_FP32_3 { 2, 3, 10, 10 }, { 2, 2, 1, 2 }, cldnn::scatter_elements_update::scatter_elements_update_axis::along_y, data_types::f32, format::bfyx, data_types::f32, format::bfyx - -#define CASE_SCATTER_ELEMENTS_UPDATE_FP16_1 { 2, 2, 14, 12 }, { 2, 2, 3, 1 }, cldnn::scatter_elements_update::scatter_elements_update_axis::along_x, data_types::f16, format::bfyx, data_types::f16, format::bfyx - -#define CASE_SCATTER_ELEMENTS_UPDATE_5D_FP32_1 { 24, 3, 1, 4, 1 }, { 4, 3, 1, 2, 1 }, cldnn::scatter_elements_update::scatter_elements_update_axis::along_b, data_types::f32, format::bfzyx, data_types::f32, format::bfzyx -#define CASE_SCATTER_ELEMENTS_UPDATE_5D_FP32_2 { 2, 17, 2, 2, 2 }, { 1, 2, 2, 2, 2 }, cldnn::scatter_elements_update::scatter_elements_update_axis::along_f, data_types::f32, format::bfzyx, data_types::f32, format::bfzyx -#define CASE_SCATTER_ELEMENTS_UPDATE_5D_FP32_3 { 5, 3, 2, 20, 22 }, { 5, 1, 1, 2, 2 }, cldnn::scatter_elements_update::scatter_elements_update_axis::along_y, data_types::f32, format::bfzyx, data_types::f32, format::bfzyx - -#define CASE_SCATTER_ELEMENTS_UPDATE_5D_FP16_1 { 13, 2, 1, 2, 1 }, { 2, 2, 1, 2, 1 }, cldnn::scatter_elements_update::scatter_elements_update_axis::along_b, data_types::f16, format::bfzyx, data_types::f16, format::bfzyx -#define CASE_SCATTER_ELEMENTS_UPDATE_5D_FP16_2 { 1, 13, 1, 2, 1 }, { 1, 2, 1, 2, 1 }, cldnn::scatter_elements_update::scatter_elements_update_axis::along_f, data_types::f16, format::bfzyx, data_types::f16, format::bfzyx -#define CASE_SCATTER_ELEMENTS_UPDATE_5D_FP16_3 
{ 2, 3, 1, 13, 13 }, { 2, 3, 1, 2, 1 }, cldnn::scatter_elements_update::scatter_elements_update_axis::along_y, data_types::f16, format::bfzyx, data_types::f16, format::bfzyx - -class ScatterElementsUpdatePrimitiveFusingTest : public ::BaseFusingTest{ -public: - void execute(scatter_elements_update_test_params& p) { - - auto input_prim = get_mem(get_input_layout(p), -5, 5); - network network_not_fused(this->engine, this->topology_non_fused, bo_not_fused); - network network_fused(this->engine, this->topology_fused, bo_fused); - network_fused.set_input_data("input", input_prim); - network_not_fused.set_input_data("input", input_prim); - compare(network_not_fused, network_fused, p); - } - - layout get_input_layout(scatter_elements_update_test_params& p) { - return layout{ p.data_type, p.input_format, p.input_shape }; - } - - layout get_indices_layout(scatter_elements_update_test_params& p) { - return layout{ p.data_type, p.input_format, p.indices_shape }; - } - - layout get_updates_layout(scatter_elements_update_test_params& p) { - return layout{ p.data_type, p.input_format, p.indices_shape }; - } - - size_t get_axis_dim(scatter_elements_update_test_params& p) { - switch (p.axis) { - case cldnn::scatter_elements_update::scatter_elements_update_axis::along_x: - return p.input_shape.spatial[0]; - case cldnn::scatter_elements_update::scatter_elements_update_axis::along_y: - return p.input_shape.spatial[1]; - case cldnn::scatter_elements_update::scatter_elements_update_axis::along_z: - return p.input_shape.spatial[2]; - case cldnn::scatter_elements_update::scatter_elements_update_axis::along_w: - return p.input_shape.spatial[3]; - case cldnn::scatter_elements_update::scatter_elements_update_axis::along_f: - return p.input_shape.feature[0]; - case cldnn::scatter_elements_update::scatter_elements_update_axis::along_b: - return p.input_shape.batch[0]; - default: - return 1; - } - } - - layout get_per_channel_layout(scatter_elements_update_test_params& p) { - return layout{ 
p.default_type, p.default_format, tensor{ 1, p.input_shape.feature[0], 1, 1 } }; - } -}; - -class scatter_elements_update_quantize : public ScatterElementsUpdatePrimitiveFusingTest {}; -TEST_P(scatter_elements_update_quantize, basic) { - auto p = GetParam(); - const auto &seu = scatter_elements_update("scatter_elements_update_prim", "input", "scatter_elements_update_indices", "scatter_elements_update_updates", p.axis); - const auto &q = quantize("quantize", "scatter_elements_update_prim", "in_lo", "in_hi", "out_lo", "out_hi", 255, data_types::i8); - const auto &r = reorder("reorder_bfyx", "quantize", p.default_format, data_types::f32); - create_topologies( - input_layout("input", get_input_layout(p)), - data("scatter_elements_update_indices", get_repeatless_mem(get_indices_layout(p), 0, static_cast(get_axis_dim(p)) - 1)), - data("scatter_elements_update_updates", get_mem(get_updates_layout(p), 0, 100)), - data("in_lo", get_mem(get_per_channel_layout(p), min_random, 0)), - data("in_hi", get_mem(get_per_channel_layout(p), 1, max_random)), - data("out_lo", get_mem(get_single_element_layout(p), -127)), - data("out_hi", get_mem(get_single_element_layout(p), 127)), - seu, - q, - r - ); - - tolerance = 1.f; - execute(p); -} - -INSTANTIATE_TEST_SUITE_P(fusings_gpu, scatter_elements_update_quantize, ::testing::ValuesIn(std::vector{ - scatter_elements_update_test_params{ CASE_SCATTER_ELEMENTS_UPDATE_FP32_1, 2, 3 }, - scatter_elements_update_test_params{ CASE_SCATTER_ELEMENTS_UPDATE_FP32_2, 2, 3 }, - scatter_elements_update_test_params{ CASE_SCATTER_ELEMENTS_UPDATE_FP32_3, 2, 3 }, - - scatter_elements_update_test_params{ CASE_SCATTER_ELEMENTS_UPDATE_FP16_1, 2, 3 }, - - scatter_elements_update_test_params{ CASE_SCATTER_ELEMENTS_UPDATE_5D_FP32_1, 2, 3 }, - scatter_elements_update_test_params{ CASE_SCATTER_ELEMENTS_UPDATE_5D_FP32_2, 2, 3 }, - scatter_elements_update_test_params{ CASE_SCATTER_ELEMENTS_UPDATE_5D_FP32_3, 2, 3 }, - - scatter_elements_update_test_params{ 
CASE_SCATTER_ELEMENTS_UPDATE_5D_FP16_1, 2, 3 }, - scatter_elements_update_test_params{ CASE_SCATTER_ELEMENTS_UPDATE_5D_FP16_2, 2, 3 }, - scatter_elements_update_test_params{ CASE_SCATTER_ELEMENTS_UPDATE_5D_FP16_3, 2, 3 }, -})); - -class scatter_elements_update_scale_activation_eltwise : public ScatterElementsUpdatePrimitiveFusingTest {}; -TEST_P(scatter_elements_update_scale_activation_eltwise, basic) { - auto p = GetParam(); - create_topologies( - input_layout("input", get_input_layout(p)), - data("scatter_elements_update_indices", get_repeatless_mem(get_indices_layout(p), 0, static_cast(get_axis_dim(p)) - 1)), - data("scatter_elements_update_updates", get_mem(get_updates_layout(p), 0, 5)), - data("scale_data", get_mem(get_per_channel_layout(p), -1, 1)), - data("eltwise_data", get_mem(layout{ p.data_type, p.input_format, p.input_shape })), - scatter_elements_update("scatter_elements_update_prim", "input", "scatter_elements_update_indices", "scatter_elements_update_updates", p.axis), - activation("activation", "scatter_elements_update_prim", activation_func::abs), - scale("scale", "activation", "scale_data"), - eltwise("eltwise", { "scale", "eltwise_data" }, eltwise_mode::sum, p.data_type), - reorder("reorder_bfyx", "eltwise", p.default_format, data_types::f32) - ); - - tolerance = 1e-2f; - execute(p); -} - -INSTANTIATE_TEST_SUITE_P(fusings_gpu, scatter_elements_update_scale_activation_eltwise, ::testing::ValuesIn(std::vector{ - scatter_elements_update_test_params{ CASE_SCATTER_ELEMENTS_UPDATE_FP32_1, 2, 5 }, - scatter_elements_update_test_params{ CASE_SCATTER_ELEMENTS_UPDATE_FP32_2, 2, 5 }, - scatter_elements_update_test_params{ CASE_SCATTER_ELEMENTS_UPDATE_FP32_3, 2, 5 }, - - scatter_elements_update_test_params{ CASE_SCATTER_ELEMENTS_UPDATE_FP16_1, 2, 5 }, - - scatter_elements_update_test_params{ CASE_SCATTER_ELEMENTS_UPDATE_5D_FP32_1, 2, 5 }, - scatter_elements_update_test_params{ CASE_SCATTER_ELEMENTS_UPDATE_5D_FP32_2, 2, 5 }, - 
scatter_elements_update_test_params{ CASE_SCATTER_ELEMENTS_UPDATE_5D_FP32_3, 2, 5 }, - - scatter_elements_update_test_params{ CASE_SCATTER_ELEMENTS_UPDATE_5D_FP16_1, 2, 5 }, - scatter_elements_update_test_params{ CASE_SCATTER_ELEMENTS_UPDATE_5D_FP16_2, 2, 5 }, - scatter_elements_update_test_params{ CASE_SCATTER_ELEMENTS_UPDATE_5D_FP16_3, 2, 5 }, -})); - -/* ------------------------------------------------------------------------------------------------------------ */ -/* ---------------------------------------- PERMUTE FUSE cases ------------------------------------------------ */ -/* ------------------------------------------------------------------------------------------------------------ */ - -struct permute_params { - tensor in_shape; - tensor out_shape; - std::vector permute_order; - tensor eltw_in_shape; - data_types data_type; - format input_format; - data_types default_type; - format default_format; - size_t expected_fused_primitives; - size_t expected_not_fused_primitives; -}; - -#define CASE_PERMUTE_F32_0 { 1, 16, 2, 2 }, { 1, 16, 2, 2 }, { 0, 1, 2, 3 }, tensor{ 0 }, data_types::f32, format::bfyx, data_types::f32, format::bfyx -#define CASE_PERMUTE_F32_1 { 1, 15, 16, 16 }, { 1, 15, 16, 16 }, { 0, 1, 2, 3 }, tensor{ 0 }, data_types::f32, format::bfyx, data_types::f32, format::bfyx -#define CASE_PERMUTE_F32_2 { 1, 8, 16, 16 }, { 16, 16, 8, 1 }, { 3, 2, 1, 0 }, tensor{ 0 }, data_types::f32, format::bfyx, data_types::f32, format::bfyx -#define CASE_PERMUTE_F32_3 { 1, 1, 3, 4 }, { 1, 3, 4, 1 }, { 1, 2, 3, 0 }, tensor{ 0 }, data_types::f32, format::bfyx, data_types::f32, format::bfyx -#define CASE_PERMUTE_F32_4 { 2, 16, 16, 16 }, { 2, 16, 16, 16 }, { 0, 1, 2, 3 }, tensor{ 0 }, data_types::f32, format::b_fs_yx_fsv16, data_types::f32, format::bfyx -#define CASE_PERMUTE_F32_5 { 1, 32, 4, 5 }, { 32, 4, 5, 1 }, { 1, 2, 3, 0 }, tensor{ 0 }, data_types::f32, format::b_fs_yx_fsv16, data_types::f32, format::bfyx -#define CASE_PERMUTE_F32_6 { 1, 16, 4, 5 }, { 5, 16, 4, 
1 }, { 3, 1, 2, 0 }, tensor{ 0 }, data_types::f32, format::b_fs_yx_fsv16, data_types::f32, format::bfyx -#define CASE_PERMUTE_F32_7 { 1, 16, 1, 1 }, { 1, 1, 1, 16 }, { 2, 3, 0, 1 }, tensor{ 0 }, data_types::f32, format::b_fs_yx_fsv16, data_types::f32, format::bfyx - -#define CASE_PERMUTE_F16_0 { 1, 16, 4, 5 }, { 1, 16, 4, 5 }, { 0, 1, 2, 3 }, tensor{ 0 }, data_types::f16, format::b_fs_yx_fsv16, data_types::f32, format::bfyx -#define CASE_PERMUTE_F16_1 { 2, 16, 4, 5 }, { 16, 4, 5, 2 }, { 1, 2, 3, 0 }, tensor{ 0 }, data_types::f16, format::b_fs_yx_fsv16, data_types::f32, format::bfyx -#define CASE_PERMUTE_F16_2 { 1, 32, 2, 3 }, { 2, 3, 32, 1 }, { 2, 3, 1, 0 }, tensor{ 0 }, data_types::f16, format::b_fs_yx_fsv16, data_types::f32, format::bfyx -#define CASE_PERMUTE_F16_3 { 3, 16, 1, 1 }, { 1, 1, 16, 3 }, { 3, 2, 1, 0 }, tensor{ 0 }, data_types::f16, format::b_fs_yx_fsv16, data_types::f32, format::bfyx -#define CASE_PERMUTE_F16_4 { 2, 15, 4, 5 }, { 4, 2, 5, 15 }, { 2, 0, 3, 1 }, tensor{ 0 }, data_types::f16, format::bfyx, data_types::f32, format::bfyx -#define CASE_PERMUTE_F16_5 { 1, 15, 1, 2 }, { 15, 2, 1, 1 }, { 1, 3, 2, 0 }, tensor{ 0 }, data_types::f16, format::bfyx, data_types::f32, format::bfyx -#define CASE_PERMUTE_F16_6 { 1, 15, 4, 4 }, { 4, 4, 1, 15 }, { 2, 3, 0, 1 }, tensor{ 0 }, data_types::f16, format::bfyx, data_types::f32, format::bfyx - -#define CASE_PERMUTE_S8_0 { 1, 15, 4, 5 }, { 1, 15, 4, 5 }, { 0, 1, 2, 3 }, tensor{ 0 }, data_types::i8, format::bfyx, data_types::f32, format::bfyx -#define CASE_PERMUTE_S8_1 { 1, 15, 4, 5 }, { 5, 4, 15, 1 }, { 3, 2, 1, 0 }, tensor{ 0 }, data_types::i8, format::bfyx, data_types::f32, format::bfyx -#define CASE_PERMUTE_S8_2 { 1, 16, 1, 2 }, { 1, 1, 16, 2 }, { 2, 0, 1, 3 }, tensor{ 0 }, data_types::i8, format::b_fs_yx_fsv16, data_types::f32, format::bfyx -#define CASE_PERMUTE_S8_3 { 1, 16, 2, 2 }, { 2, 2, 16, 1 }, { 2, 3, 1, 0 }, tensor{ 0 }, data_types::i8, format::b_fs_yx_fsv16, data_types::f32, format::bfyx -#define 
CASE_PERMUTE_U8_0 { 1, 15, 4, 5 }, { 15, 5, 1, 4 }, { 1, 3, 0, 2 }, tensor{ 0 }, data_types::u8, format::bfyx, data_types::f32, format::bfyx -#define CASE_PERMUTE_U8_1 { 1, 15, 16, 16 }, { 15, 16, 1, 16 }, { 1, 2, 0, 3 }, tensor{ 0 }, data_types::u8, format::bfyx, data_types::f32, format::bfyx -#define CASE_PERMUTE_U8_2 { 1, 32, 5, 4 }, { 1, 32, 5, 4 }, { 0, 1, 2, 3 }, tensor{ 0 }, data_types::u8, format::b_fs_yx_fsv16, data_types::f32, format::bfyx -#define CASE_PERMUTE_U8_3 { 1, 16, 4, 5 }, { 5, 4, 16, 1 }, { 3, 2, 1, 0 }, tensor{ 0 }, data_types::u8, format::b_fs_yx_fsv16, data_types::f32, format::bfyx - -// 3d -#define CASE_PERMUTE_F32_3D_0 { 1, 15, 4, 4, 5 }, { 1, 15, 4, 4, 5 }, { 0, 1, 2, 3, 4 }, tensor{ 0 }, data_types::f32, format::bfzyx, data_types::f32, format::bfzyx -#define CASE_PERMUTE_F32_3D_1 { 2, 15, 2, 3, 4 }, { 15, 2, 3, 4, 2 }, { 1, 2, 3, 4, 0 }, tensor{ 0 }, data_types::f32, format::bfzyx, data_types::f32, format::bfzyx -#define CASE_PERMUTE_F32_3D_2 { 2, 16, 4, 4, 5 }, { 4, 2, 4, 5, 16 }, { 3, 0, 2, 4, 1 }, tensor{ 0 }, data_types::f32, format::bfzyx, data_types::f32, format::bfzyx -#define CASE_PERMUTE_F32_3D_3 { 1, 32, 4, 2, 2 }, { 2, 2, 32, 1, 4 }, { 4, 3, 1, 0, 2 }, tensor{ 0 }, data_types::f32, format::bfzyx, data_types::f32, format::bfzyx -#define CASE_PERMUTE_F32_3D_4 { 1, 16, 1, 1, 1 }, { 1, 1, 1, 16, 1 }, { 2, 4, 0, 1, 3 }, tensor{ 0 }, data_types::f32, format::bfzyx, data_types::f32, format::bfzyx - -#define CASE_PERMUTE_F16_3D_0 { 1, 15, 4, 4, 5 }, { 1, 15, 4, 4, 5 }, { 0, 1, 2, 3, 4 }, tensor{ 0 }, data_types::f16, format::bfzyx, data_types::f32, format::bfzyx -#define CASE_PERMUTE_F16_3D_1 { 2, 15, 4, 3, 4 }, { 4, 4, 2, 15, 3 }, { 2, 4, 0, 1, 3 }, tensor{ 0 }, data_types::f16, format::bfzyx, data_types::f32, format::bfzyx -#define CASE_PERMUTE_F16_3D_2 { 2, 16, 4, 4, 3 }, { 2, 4, 3, 16, 4 }, { 0, 3, 4, 1, 2 }, tensor{ 0 }, data_types::f16, format::bfzyx, data_types::f32, format::bfzyx -#define CASE_PERMUTE_F16_3D_3 { 1, 32, 4, 2, 1 
}, { 2, 32, 4, 1, 1 }, { 3, 1, 2, 4, 0 }, tensor{ 0 }, data_types::f16, format::bfzyx, data_types::f32, format::bfzyx -#define CASE_PERMUTE_F16_3D_4 { 16, 16, 1, 1, 1 },{ 1, 16, 1, 1, 16 },{ 4, 0, 3, 2, 1 }, tensor{ 0 }, data_types::f16, format::bfzyx, data_types::f32, format::bfzyx - -#define CASE_PERMUTE_S8_3D_0 { 1, 15, 4, 4, 5 }, { 1, 15, 4, 4, 5 }, { 0, 1, 2, 3, 4 }, tensor{ 0 }, data_types::i8, format::bfzyx, data_types::f32, format::bfzyx -#define CASE_PERMUTE_S8_3D_1 { 2, 15, 4, 3, 4 }, { 4, 4, 15, 2, 3 }, { 4, 2, 1, 0, 3 }, tensor{ 0 }, data_types::i8, format::bfzyx, data_types::f32, format::bfzyx -#define CASE_PERMUTE_S8_3D_2 { 2, 16, 4, 4, 3 }, { 2, 4, 3, 16, 4 }, { 0, 3, 4, 1, 2 }, tensor{ 0 }, data_types::i8, format::bfzyx, data_types::f32, format::bfzyx -#define CASE_PERMUTE_S8_3D_3 { 1, 32, 4, 2, 1 }, { 2, 32, 4, 1, 1 }, { 3, 1, 2, 4, 0 }, tensor{ 0 }, data_types::i8, format::bfzyx, data_types::f32, format::bfzyx -#define CASE_PERMUTE_U8_3D_0 { 16, 16, 1, 1, 1 }, { 1, 1, 16, 16, 1 }, { 2, 4, 0, 1, 3 }, tensor{ 0 }, data_types::u8, format::bfzyx, data_types::f32, format::bfzyx -#define CASE_PERMUTE_U8_3D_1 { 16, 16, 1, 1, 1 }, { 1, 1, 1, 16, 16 }, { 4, 3, 2, 1, 0 }, tensor{ 0 }, data_types::u8, format::bfzyx, data_types::f32, format::bfzyx -#define CASE_PERMUTE_U8_3D_2 { 2, 16, 4, 4, 3 }, { 4, 2, 4, 3, 16 }, { 3, 0, 2, 4, 1 }, tensor{ 0 }, data_types::u8, format::bfzyx, data_types::f32, format::bfzyx -#define CASE_PERMUTE_U8_3D_3 { 1, 32, 4, 2, 1 }, { 1, 2, 32, 1, 4 }, { 4, 3, 1, 0, 2 }, tensor{ 0 }, data_types::u8, format::bfzyx, data_types::f32, format::bfzyx - -// permute_tile_8x8_4x4 -#define CASE_PERMUTE_TILE_8x8_4x4_4D_0 { 1, 8, 8, 2 }, { 1, 2, 8, 8 }, { 0, 3, 1, 2 }, tensor{ 0 }, data_types::f32, format::bfyx, data_types::f32, format::bfyx -#define CASE_PERMUTE_TILE_8x8_4x4_4D_1 { 1, 5, 8, 2 }, { 1, 2, 5, 8 }, { 0, 3, 1, 2 }, tensor{ 0 }, data_types::f32, format::bfyx, data_types::f32, format::bfyx -#define CASE_PERMUTE_TILE_8x8_4x4_4D_2 { 1, 
8, 5, 2 }, { 1, 2, 8, 5 }, { 0, 3, 1, 2 }, tensor{ 0 }, data_types::f32, format::bfyx, data_types::f32, format::bfyx -#define CASE_PERMUTE_TILE_8x8_4x4_4D_3 { 1, 5, 5, 2 }, { 1, 2, 5, 5 }, { 0, 3, 1, 2 }, tensor{ 0 }, data_types::f32, format::bfyx, data_types::f32, format::bfyx -#define CASE_PERMUTE_TILE_8x8_4x4_5D_0 { 1, 8, 8, 2, 2 }, { 1, 2, 8, 8, 2 }, { 0, 4, 1, 2, 3 }, tensor{ 0 }, data_types::f32, format::bfzyx, data_types::f32, format::bfzyx -#define CASE_PERMUTE_TILE_8x8_4x4_5D_1 { 1, 5, 8, 2, 2 }, { 1, 2, 5, 8, 2 }, { 0, 4, 1, 2, 3 }, tensor{ 0 }, data_types::f32, format::bfzyx, data_types::f32, format::bfzyx -#define CASE_PERMUTE_TILE_8x8_4x4_5D_2 { 1, 8, 5, 2, 2 }, { 1, 2, 8, 5, 2 }, { 0, 4, 1, 2, 3 }, tensor{ 0 }, data_types::f32, format::bfzyx, data_types::f32, format::bfzyx -#define CASE_PERMUTE_TILE_8x8_4x4_5D_3 { 1, 5, 5, 2, 2 }, { 1, 2, 5, 5, 2 }, { 0, 4, 1, 2, 3 }, tensor{ 0 }, data_types::f32, format::bfzyx, data_types::f32, format::bfzyx -#define CASE_PERMUTE_TILE_8x8_4x4_6D_0 { 1, 8, 8, 2, 2, 2 }, { 1, 2, 8, 8, 2, 2 }, { 0, 5, 1, 2, 3, 4 }, tensor{ 0 }, data_types::f32, format::bfwzyx, data_types::f32, format::bfwzyx -#define CASE_PERMUTE_TILE_8x8_4x4_6D_1 { 1, 5, 8, 2, 2, 2 }, { 1, 2, 5, 8, 2, 2 }, { 0, 5, 1, 2, 3, 4 }, tensor{ 0 }, data_types::f32, format::bfwzyx, data_types::f32, format::bfwzyx -#define CASE_PERMUTE_TILE_8x8_4x4_6D_2 { 1, 8, 5, 2, 2, 2 }, { 1, 2, 8, 5, 2, 2 }, { 0, 5, 1, 2, 3, 4 }, tensor{ 0 }, data_types::f32, format::bfwzyx, data_types::f32, format::bfwzyx -#define CASE_PERMUTE_TILE_8x8_4x4_6D_3 { 1, 5, 5, 2, 2, 2 }, { 1, 2, 5, 5, 2, 2 }, { 0, 5, 1, 2, 3, 4 }, tensor{ 0 }, data_types::f32, format::bfwzyx, data_types::f32, format::bfwzyx - -// permute_tile_8x8_4x4_fsv16 -#define CASE_PERMUTE_TILE_8x8_4x4_FSV16_4D_0 { 1, 16, 16, 2 }, { 1, 2, 16, 16 }, { 0, 3, 1, 2 }, tensor{ 0 }, data_types::f32, format::b_fs_yx_fsv16, data_types::f32, format::b_fs_yx_fsv16 -#define CASE_PERMUTE_TILE_8x8_4x4_FSV16_4D_1 { 1, 15, 16, 2 }, { 1, 
2, 15, 16 }, { 0, 3, 1, 2 }, tensor{ 0 }, data_types::f32, format::b_fs_yx_fsv16, data_types::f32, format::b_fs_yx_fsv16 -#define CASE_PERMUTE_TILE_8x8_4x4_FSV16_4D_2 { 1, 16, 3, 2 }, { 1, 2, 16, 3 }, { 0, 3, 1, 2 }, tensor{ 0 }, data_types::f32, format::b_fs_yx_fsv16, data_types::f32, format::b_fs_yx_fsv16 -#define CASE_PERMUTE_TILE_8x8_4x4_FSV16_4D_3 { 1, 5, 7, 2 }, { 1, 2, 5, 7 }, { 0, 3, 1, 2 }, tensor{ 0 }, data_types::f32, format::b_fs_yx_fsv16, data_types::f32, format::b_fs_yx_fsv16 -#define CASE_PERMUTE_TILE_8x8_4x4_FSV16_5D_0 { 1, 16, 16, 2, 2 }, { 1, 2, 16, 16, 2 }, { 0, 4, 1, 2, 3 }, tensor{ 0 }, data_types::f32, format::b_fs_zyx_fsv16, data_types::f32, format::b_fs_zyx_fsv16 -#define CASE_PERMUTE_TILE_8x8_4x4_FSV16_5D_1 { 1, 15, 16, 2, 2 }, { 1, 2, 15, 16, 2 }, { 0, 4, 1, 2, 3 }, tensor{ 0 }, data_types::f32, format::b_fs_zyx_fsv16, data_types::f32, format::b_fs_zyx_fsv16 -#define CASE_PERMUTE_TILE_8x8_4x4_FSV16_5D_2 { 1, 16, 3, 2, 2 }, { 1, 2, 16, 3, 2 }, { 0, 4, 1, 2, 3 }, tensor{ 0 }, data_types::f32, format::b_fs_zyx_fsv16, data_types::f32, format::b_fs_zyx_fsv16 -#define CASE_PERMUTE_TILE_8x8_4x4_FSV16_5D_3 { 1, 5, 7, 2, 2 }, { 1, 2, 5, 7, 2 }, { 0, 4, 1, 2, 3 }, tensor{ 0 }, data_types::f32, format::b_fs_zyx_fsv16, data_types::f32, format::b_fs_zyx_fsv16 - -class PermuteFusingTest : public ::BaseFusingTest { -public: - - void execute(permute_params& p) { - auto input_prim = get_mem(get_input_layout(p)); - network network_not_fused(this->engine, this->topology_non_fused, bo_not_fused); - network network_fused(this->engine, this->topology_fused, bo_fused); - network_fused.set_input_data("input", input_prim); - network_not_fused.set_input_data("input", input_prim); - - compare(network_not_fused, network_fused, p); - } - - layout get_input_layout(permute_params& p) { - return layout{ p.data_type, p.input_format, p.in_shape, padding{} }; - } - - layout get_per_channel_layout(permute_params& p) { - return layout{ p.default_type, p.default_format, 
tensor{ 1, p.out_shape.feature[0], 1, 1 } }; - } -}; - -class permute_activation_scale_eltwise: public PermuteFusingTest {}; -TEST_P(permute_activation_scale_eltwise, basic) { - auto p = GetParam(); - - create_topologies( - input_layout("input", get_input_layout(p)), - data("eltwise_data", get_mem(layout{ p.data_type, p.input_format, p.out_shape })), - data("scale_data", get_mem(get_per_channel_layout(p), 5e-1f)), - permute("permute", "input", p.permute_order), - scale("scale", "permute", "scale_data"), - activation("actv", "scale", activation_func::relu), - eltwise("eltwise", { "actv", "eltwise_data" }, eltwise_mode::sum, p.data_type), - reorder("reorder_bfyx", "eltwise", p.default_format, p.default_type) - ); - - tolerance = 1e-5f; - execute(p); -} - -INSTANTIATE_TEST_SUITE_P(fusings_gpu, permute_activation_scale_eltwise, ::testing::ValuesIn(std::vector{ - permute_params{ CASE_PERMUTE_F32_0, 2, 5 }, - permute_params{ CASE_PERMUTE_F32_1, 2, 5 }, - permute_params{ CASE_PERMUTE_F32_2, 2, 5 }, - permute_params{ CASE_PERMUTE_F32_3, 2, 5 }, - permute_params{ CASE_PERMUTE_F32_4, 2, 5 }, - permute_params{ CASE_PERMUTE_F32_5, 2, 5 }, - permute_params{ CASE_PERMUTE_F32_6, 2, 5 }, - permute_params{ CASE_PERMUTE_F32_7, 2, 5 }, - - permute_params{ CASE_PERMUTE_F16_0, 2, 5 }, - permute_params{ CASE_PERMUTE_F16_1, 2, 5 }, - permute_params{ CASE_PERMUTE_F16_2, 2, 5 }, - permute_params{ CASE_PERMUTE_F16_3, 2, 5 }, - permute_params{ CASE_PERMUTE_F16_4, 2, 5 }, - permute_params{ CASE_PERMUTE_F16_5, 2, 5 }, - permute_params{ CASE_PERMUTE_F16_6, 2, 5 }, - - permute_params{ CASE_PERMUTE_S8_0, 2, 5 }, - permute_params{ CASE_PERMUTE_S8_1, 2, 5 }, - permute_params{ CASE_PERMUTE_S8_2, 2, 5 }, - permute_params{ CASE_PERMUTE_S8_3, 2, 5 }, - - permute_params{ CASE_PERMUTE_U8_0, 2, 5 }, - permute_params{ CASE_PERMUTE_U8_1, 2, 5 }, - permute_params{ CASE_PERMUTE_U8_2, 2, 5 }, - permute_params{ CASE_PERMUTE_U8_3, 2, 5 }, - - permute_params{ CASE_PERMUTE_F32_3D_0, 2, 5 }, - permute_params{ 
CASE_PERMUTE_F32_3D_1, 2, 5 }, - permute_params{ CASE_PERMUTE_F32_3D_2, 2, 5 }, - permute_params{ CASE_PERMUTE_F32_3D_3, 2, 5 }, - permute_params{ CASE_PERMUTE_F32_3D_4, 2, 5 }, - - permute_params{ CASE_PERMUTE_F16_3D_0, 2, 5 }, - permute_params{ CASE_PERMUTE_F16_3D_1, 2, 5 }, - permute_params{ CASE_PERMUTE_F16_3D_2, 2, 5 }, - permute_params{ CASE_PERMUTE_F16_3D_3, 2, 5 }, - permute_params{ CASE_PERMUTE_F16_3D_4, 2, 5 }, - - permute_params{ CASE_PERMUTE_S8_3D_0, 2, 5 }, - permute_params{ CASE_PERMUTE_S8_3D_1, 2, 5 }, - permute_params{ CASE_PERMUTE_S8_3D_2, 2, 5 }, - permute_params{ CASE_PERMUTE_S8_3D_3, 2, 5 }, - - permute_params{ CASE_PERMUTE_U8_3D_0, 2, 5 }, - permute_params{ CASE_PERMUTE_U8_3D_1, 2, 5 }, - permute_params{ CASE_PERMUTE_U8_3D_2, 2, 5 }, - permute_params{ CASE_PERMUTE_U8_3D_3, 2, 5 }, - - // Fusing tests for permute_tile_8x8_4x4 - permute_params{ CASE_PERMUTE_TILE_8x8_4x4_4D_0, 2, 5 }, - permute_params{ CASE_PERMUTE_TILE_8x8_4x4_4D_1, 2, 5 }, - permute_params{ CASE_PERMUTE_TILE_8x8_4x4_4D_2, 2, 5 }, - permute_params{ CASE_PERMUTE_TILE_8x8_4x4_4D_3, 2, 5 }, - permute_params{ CASE_PERMUTE_TILE_8x8_4x4_5D_0, 2, 5 }, - permute_params{ CASE_PERMUTE_TILE_8x8_4x4_5D_1, 2, 5 }, - permute_params{ CASE_PERMUTE_TILE_8x8_4x4_5D_2, 2, 5 }, - permute_params{ CASE_PERMUTE_TILE_8x8_4x4_5D_3, 2, 5 }, - permute_params{ CASE_PERMUTE_TILE_8x8_4x4_6D_0, 2, 5 }, - permute_params{ CASE_PERMUTE_TILE_8x8_4x4_6D_1, 2, 5 }, - permute_params{ CASE_PERMUTE_TILE_8x8_4x4_6D_2, 2, 5 }, - permute_params{ CASE_PERMUTE_TILE_8x8_4x4_6D_3, 2, 5 }, - - // Fusing tests for permute_tile_8x8_4x4_fsv16 - permute_params{ CASE_PERMUTE_TILE_8x8_4x4_FSV16_4D_0, 2, 5 }, - permute_params{ CASE_PERMUTE_TILE_8x8_4x4_FSV16_4D_1, 2, 5 }, - permute_params{ CASE_PERMUTE_TILE_8x8_4x4_FSV16_4D_2, 2, 5 }, - permute_params{ CASE_PERMUTE_TILE_8x8_4x4_FSV16_4D_3, 2, 5 }, - permute_params{ CASE_PERMUTE_TILE_8x8_4x4_FSV16_5D_0, 2, 5 }, - permute_params{ CASE_PERMUTE_TILE_8x8_4x4_FSV16_5D_1, 2, 5 }, - 
permute_params{ CASE_PERMUTE_TILE_8x8_4x4_FSV16_5D_2, 2, 5 }, - permute_params{ CASE_PERMUTE_TILE_8x8_4x4_FSV16_5D_3, 2, 5 }, -})); - -class permute_quant_u8: public PermuteFusingTest {}; -TEST_P(permute_quant_u8, basic) { - auto p = GetParam(); - create_topologies( - input_layout("input", get_input_layout(p)), - data("in_lo", get_mem(get_single_element_layout(p), min_random, 0)), - data("in_hi", get_mem(get_single_element_layout(p), 1, max_random)), - data("out_lo", get_mem(get_single_element_layout(p), 0)), - data("out_hi", get_mem(get_single_element_layout(p), 255)), - permute("permute", "input", p.permute_order), - quantize("quant", "permute", "in_lo", "in_hi", "out_lo", "out_hi", 256, data_types::u8), - reorder("reorder_bfyx", "quant", p.default_format, p.default_type) - ); - - tolerance = 1.f; - execute(p); -} - -INSTANTIATE_TEST_SUITE_P(fusings_gpu, permute_quant_u8, ::testing::ValuesIn(std::vector{ - permute_params{ CASE_PERMUTE_F32_0, 2, 3 }, - permute_params{ CASE_PERMUTE_F32_1, 2, 3 }, - - permute_params{ CASE_PERMUTE_F16_0, 2, 3 }, - permute_params{ CASE_PERMUTE_F16_1, 2, 3 }, -})); - -class permute_scale_actv_eltw_scale_actv_quant_i8: public PermuteFusingTest {}; -TEST_P(permute_scale_actv_eltw_scale_actv_quant_i8, basic) { - auto p = GetParam(); - create_topologies( - input_layout("input", get_input_layout(p)), - data("scale1_data", get_mem(get_per_channel_layout(p), 1e-1f)), - data("in_lo", get_mem(get_per_channel_layout(p), min_random, 0)), - data("in_hi", get_mem(get_per_channel_layout(p), 1, max_random)), - data("out_lo", get_mem(get_single_element_layout(p), -127)), - data("out_hi", get_mem(get_single_element_layout(p), 127)), - data("eltw_data", get_mem(layout(p.data_type, p.input_format, p.out_shape))), - data("scale2_data", get_mem(get_per_channel_layout(p), 1e-1f)), - permute("permute", "input", p.permute_order), - scale("scale1", "permute", "scale1_data"), - activation("actv1", "scale1", activation_func::relu), - eltwise("eltw", { "actv1", 
"eltw_data" }, eltwise_mode::sum, p.data_type), - scale("scale2", "eltw", "scale2_data"), - activation("actv2", "scale2", activation_func::relu), - quantize("quant", "actv2", "in_lo", "in_hi", "out_lo", "out_hi", 255, data_types::i8), - reorder("out", "quant", p.default_format, p.default_type) - ); - - tolerance = 1.f; - execute(p); -} - -INSTANTIATE_TEST_SUITE_P(fusings_gpu, permute_scale_actv_eltw_scale_actv_quant_i8, ::testing::ValuesIn(std::vector{ - permute_params{ CASE_PERMUTE_F32_0, 2, 8 }, - permute_params{ CASE_PERMUTE_F32_1, 2, 8 }, - permute_params{ CASE_PERMUTE_F32_2, 2, 8 }, - permute_params{ CASE_PERMUTE_F32_3, 2, 8 }, - permute_params{ CASE_PERMUTE_F32_4, 2, 8 }, - permute_params{ CASE_PERMUTE_F32_5, 2, 8 }, - permute_params{ CASE_PERMUTE_F32_6, 2, 8 }, - permute_params{ CASE_PERMUTE_F32_7, 2, 8 }, - - permute_params{ CASE_PERMUTE_F16_0, 2, 8 }, - permute_params{ CASE_PERMUTE_F16_1, 2, 8 }, - permute_params{ CASE_PERMUTE_F16_2, 2, 8 }, - permute_params{ CASE_PERMUTE_F16_3, 2, 8 }, - permute_params{ CASE_PERMUTE_F16_4, 2, 8 }, - permute_params{ CASE_PERMUTE_F16_5, 2, 8 }, - permute_params{ CASE_PERMUTE_F16_6, 2, 8 }, - - permute_params{ CASE_PERMUTE_S8_0, 2, 8 }, - permute_params{ CASE_PERMUTE_S8_1, 2, 8 }, - permute_params{ CASE_PERMUTE_S8_2, 2, 8 }, - permute_params{ CASE_PERMUTE_S8_3, 2, 8 }, - - permute_params{ CASE_PERMUTE_U8_0, 2, 8 }, - permute_params{ CASE_PERMUTE_U8_1, 2, 8 }, - permute_params{ CASE_PERMUTE_U8_2, 2, 8 }, - permute_params{ CASE_PERMUTE_U8_3, 2, 8 }, - - permute_params{ CASE_PERMUTE_F32_3D_0, 2, 8 }, - permute_params{ CASE_PERMUTE_F32_3D_1, 2, 8 }, - permute_params{ CASE_PERMUTE_F32_3D_2, 2, 8 }, - permute_params{ CASE_PERMUTE_F32_3D_3, 2, 8 }, - permute_params{ CASE_PERMUTE_F32_3D_4, 2, 8 }, - - permute_params{ CASE_PERMUTE_F16_3D_0, 2, 8 }, - permute_params{ CASE_PERMUTE_F16_3D_1, 2, 8 }, - permute_params{ CASE_PERMUTE_F16_3D_2, 2, 8 }, - permute_params{ CASE_PERMUTE_F16_3D_3, 2, 8 }, - permute_params{ CASE_PERMUTE_F16_3D_4, 
2, 8 }, - - permute_params{ CASE_PERMUTE_S8_3D_0, 2, 8 }, - permute_params{ CASE_PERMUTE_S8_3D_1, 2, 8 }, - permute_params{ CASE_PERMUTE_S8_3D_2, 2, 8 }, - permute_params{ CASE_PERMUTE_S8_3D_3, 2, 8 }, - - permute_params{ CASE_PERMUTE_U8_3D_0, 2, 8 }, - permute_params{ CASE_PERMUTE_U8_3D_1, 2, 8 }, - permute_params{ CASE_PERMUTE_U8_3D_2, 2, 8 }, - permute_params{ CASE_PERMUTE_U8_3D_3, 2, 8 }, -})); - -class permute_scale_eltwise_actv_scale_actv: public PermuteFusingTest {}; -TEST_P(permute_scale_eltwise_actv_scale_actv, basic) { - auto p = GetParam(); - - create_topologies( - input_layout("input", get_input_layout(p)), - data("eltwise_data", get_mem(layout{ p.data_type, p.input_format, p.out_shape })), - data("scale_data1", get_mem(get_per_channel_layout(p), 1e-1f)), - data("scale_data2", get_mem(get_per_channel_layout(p), 1e-1f)), - permute("permute", "input", p.permute_order), - scale("scale1", "permute", "scale_data1"), - activation("actv1", "scale1", activation_func::relu), - eltwise("eltwise", { "actv1", "eltwise_data" }, eltwise_mode::sum, p.default_type), - scale("scale2", "eltwise", "scale_data2"), - activation("actv2", "scale2", activation_func::relu), - reorder("reorder_bfyx", "actv2", p.default_format, p.default_type) - ); - - tolerance = 1e-5f; - execute(p); -} - -INSTANTIATE_TEST_SUITE_P(fusings_gpu, permute_scale_eltwise_actv_scale_actv, ::testing::ValuesIn(std::vector{ - permute_params{ CASE_PERMUTE_F32_0, 2, 7 }, - permute_params{ CASE_PERMUTE_F32_1, 2, 7 }, - permute_params{ CASE_PERMUTE_F32_2, 2, 7 }, - permute_params{ CASE_PERMUTE_F32_3, 2, 7 }, - permute_params{ CASE_PERMUTE_F32_4, 2, 7 }, - permute_params{ CASE_PERMUTE_F32_5, 2, 7 }, - permute_params{ CASE_PERMUTE_F32_6, 2, 7 }, - permute_params{ CASE_PERMUTE_F32_7, 2, 7 }, - - permute_params{ CASE_PERMUTE_F16_0, 2, 7 }, - permute_params{ CASE_PERMUTE_F16_1, 2, 7 }, - permute_params{ CASE_PERMUTE_F16_2, 2, 7 }, - permute_params{ CASE_PERMUTE_F16_3, 2, 7 }, - permute_params{ CASE_PERMUTE_F16_4, 
2, 7 }, - permute_params{ CASE_PERMUTE_F16_5, 2, 7 }, - permute_params{ CASE_PERMUTE_F16_6, 2, 7 }, - - permute_params{ CASE_PERMUTE_S8_0, 2, 7 }, - permute_params{ CASE_PERMUTE_S8_1, 2, 7 }, - permute_params{ CASE_PERMUTE_S8_2, 2, 7 }, - permute_params{ CASE_PERMUTE_S8_3, 2, 7 }, - - permute_params{ CASE_PERMUTE_U8_0, 2, 7 }, - permute_params{ CASE_PERMUTE_U8_1, 2, 7 }, - permute_params{ CASE_PERMUTE_U8_2, 2, 7 }, - permute_params{ CASE_PERMUTE_U8_3, 2, 7 }, - - permute_params{ CASE_PERMUTE_F32_3D_0, 2, 7 }, - permute_params{ CASE_PERMUTE_F32_3D_1, 2, 7 }, - permute_params{ CASE_PERMUTE_F32_3D_2, 2, 7 }, - permute_params{ CASE_PERMUTE_F32_3D_3, 2, 7 }, - permute_params{ CASE_PERMUTE_F32_3D_4, 2, 7 }, - - permute_params{ CASE_PERMUTE_F16_3D_0, 2, 7 }, - permute_params{ CASE_PERMUTE_F16_3D_1, 2, 7 }, - permute_params{ CASE_PERMUTE_F16_3D_2, 2, 7 }, - permute_params{ CASE_PERMUTE_F16_3D_3, 2, 7 }, - permute_params{ CASE_PERMUTE_F16_3D_4, 2, 7 }, - - permute_params{ CASE_PERMUTE_S8_3D_0, 2, 7 }, - permute_params{ CASE_PERMUTE_S8_3D_1, 2, 7 }, - permute_params{ CASE_PERMUTE_S8_3D_2, 2, 7 }, - permute_params{ CASE_PERMUTE_S8_3D_3, 2, 7 }, - - permute_params{ CASE_PERMUTE_U8_3D_0, 2, 7 }, - permute_params{ CASE_PERMUTE_U8_3D_1, 2, 7 }, - permute_params{ CASE_PERMUTE_U8_3D_2, 2, 7 }, - permute_params{ CASE_PERMUTE_U8_3D_3, 2, 7 }, - - // Fusing tests for permute_tile_8x8_4x4 - permute_params{ CASE_PERMUTE_TILE_8x8_4x4_4D_0, 2, 7 }, - permute_params{ CASE_PERMUTE_TILE_8x8_4x4_4D_1, 2, 7 }, - permute_params{ CASE_PERMUTE_TILE_8x8_4x4_4D_2, 2, 7 }, - permute_params{ CASE_PERMUTE_TILE_8x8_4x4_4D_3, 2, 7 }, - permute_params{ CASE_PERMUTE_TILE_8x8_4x4_5D_0, 2, 7 }, - permute_params{ CASE_PERMUTE_TILE_8x8_4x4_5D_1, 2, 7 }, - permute_params{ CASE_PERMUTE_TILE_8x8_4x4_5D_2, 2, 7 }, - permute_params{ CASE_PERMUTE_TILE_8x8_4x4_5D_3, 2, 7 }, - permute_params{ CASE_PERMUTE_TILE_8x8_4x4_6D_0, 2, 7 }, - permute_params{ CASE_PERMUTE_TILE_8x8_4x4_6D_1, 2, 7 }, - permute_params{ 
CASE_PERMUTE_TILE_8x8_4x4_6D_2, 2, 7 }, - permute_params{ CASE_PERMUTE_TILE_8x8_4x4_6D_3, 2, 7 }, - - // Fusing tests for permute_tile_8x8_4x4_fsv16 - permute_params{ CASE_PERMUTE_TILE_8x8_4x4_FSV16_4D_0, 2, 7 }, - permute_params{ CASE_PERMUTE_TILE_8x8_4x4_FSV16_4D_1, 2, 7 }, - permute_params{ CASE_PERMUTE_TILE_8x8_4x4_FSV16_4D_2, 2, 7 }, - permute_params{ CASE_PERMUTE_TILE_8x8_4x4_FSV16_4D_3, 2, 7 }, - permute_params{ CASE_PERMUTE_TILE_8x8_4x4_FSV16_5D_0, 2, 7 }, - permute_params{ CASE_PERMUTE_TILE_8x8_4x4_FSV16_5D_1, 2, 7 }, - permute_params{ CASE_PERMUTE_TILE_8x8_4x4_FSV16_5D_2, 2, 7 }, - permute_params{ CASE_PERMUTE_TILE_8x8_4x4_FSV16_5D_3, 2, 7 }, -})); - -/* ------------------------------------------------------------------------------------------------------------ */ -/* ---------------------------- PERMUTE FUSE REDUNDANT REORDER cases ------------------------------------------ */ -/* ------------------------------------------------------------------------------------------------------------ */ - -struct permute_reorder_params { - tensor in_shape; - std::vector permute_order1; - std::vector permute_order2; - data_types permute_type; - data_types output_type; - format permute_format; - format output_format; - size_t expected_fused_primitives; - size_t expected_not_fused_primitives; -}; - -#define CASE_PERMUTE_REORDER_F32_0 { 1, 16, 32, 2 }, { 0, 3, 2, 1 }, { 0, 3, 2, 1 }, data_types::f32, data_types::f32, format::b_fs_yx_fsv16, format::bfyx -#define CASE_PERMUTE_REORDER_F32_1 { 2, 7, 9, 27 }, { 0, 3, 2, 1 }, { 0, 3, 2, 1 }, data_types::f32, data_types::f32, format::b_fs_yx_fsv4, format::bfyx -#define CASE_PERMUTE_REORDER_F32_2 { 1, 16, 4, 5, 16 }, { 0, 3, 2, 1, 4 }, { 0, 3, 2, 1, 4 }, data_types::f32, data_types::f32, format::b_fs_zyx_fsv16, format::bfzyx -#define CASE_PERMUTE_REORDER_F16_0 { 1, 16, 2, 4 }, { 0, 3, 2, 1 }, { 0, 3, 2, 1 }, data_types::f16, data_types::f16, format::b_fs_yx_fsv16, format::bfyx -#define CASE_PERMUTE_REORDER_F16_1 { 1, 16, 4, 5, 
16 }, { 0, 4, 2, 3, 1 }, { 0, 1, 2, 3, 4 }, data_types::f16, data_types::f16, format::b_fs_zyx_fsv16, format::bfzyx -#define CASE_PERMUTE_REORDER_F16_2 { 1, 5, 1, 2, 14 }, { 0, 3, 2, 1, 4 }, { 0, 3, 2, 1, 4 }, data_types::f16, data_types::f16, format::b_fs_zyx_fsv16, format::bfzyx - -// type change -#define CASE_PERMUTE_REORDER_S8_TO_F32_0 { 1, 15, 4, 5 }, { 0, 3, 2, 1 }, { 0, 3, 2, 1 }, data_types::i8, data_types::f32, format::b_fs_yx_fsv4, format::bfyx -#define CASE_PERMUTE_REORDER_S8_TO_F32_1 { 1, 2, 15, 4, 5 }, { 0, 3, 2, 1, 4 }, { 0, 3, 2, 1, 4 }, data_types::i8, data_types::f32, format::b_fs_zyx_fsv16, format::bfzyx -#define CASE_PERMUTE_REORDER_F32_TO_F16_0 { 1, 5, 1, 2, 14 }, { 0, 4, 2, 3, 1 }, { 0, 1, 2, 3, 4 }, data_types::f32, data_types::f16, format::b_fs_zyx_fsv16, format::bfzyx -#define CASE_PERMUTE_REORDER_U8_TO_F16_0 { 1, 17, 1, 2, 7 }, { 0, 4, 2, 3, 1 }, { 0, 1, 2, 3, 4 }, data_types::u8, data_types::f16, format::b_fs_zyx_fsv16, format::bfzyx - -// dim change -#define CASE_PERMUTE_REORDER_4D_TO_5D_F32_0 { 1, 16, 8, 16 }, { 1, 2, 0, 3 }, { 0, 3, 1, 4, 2 }, data_types::f32, data_types::f32, format::bfyx, format::bfzyx -#define CASE_PERMUTE_REORDER_4D_TO_6D_F32_1 { 1, 16, 8, 16 }, { 0, 2, 3, 1 }, { 0, 3, 5, 4, 1, 2 }, data_types::f32, data_types::f32, format::bfyx, format::bfwzyx -#define CASE_PERMUTE_REORDER_5D_TO_4D_F32_0 { 1, 16, 4, 5, 18 },{ 0, 4, 2, 3, 1 }, { 0, 3, 1, 2 }, data_types::f32, data_types::f32, format::b_fs_zyx_fsv16, format::bfyx -#define CASE_PERMUTE_REORDER_5D_TO_4D_F32_1 { 1, 16, 4, 5, 16 },{ 0, 2, 3, 4, 1 }, { 0, 3, 1, 2 }, data_types::f32, data_types::f32, format::bfzyx, format::bfyx -#define CASE_PERMUTE_REORDER_5D_TO_6D_F32_2 { 1, 16, 8, 4, 16 }, { 0, 4, 2, 3, 1 }, { 0, 3, 5, 4, 1, 2 }, data_types::f32, data_types::f32, format::b_fs_zyx_fsv16, format::bfwzyx -#define CASE_PERMUTE_REORDER_6D_TO_4D_F32_0 { 1, 16, 4, 5, 4, 16 }, { 0, 2, 5, 3, 4, 1 }, { 0, 3, 1, 2 }, data_types::f32, data_types::f32, format::bfwzyx, format::bfyx 
-#define CASE_PERMUTE_REORDER_6D_TO_5D_F32_1 { 1, 16, 4, 5, 4, 16 }, { 0, 2, 5, 3, 4, 1 }, { 0, 3, 4, 1, 2 }, data_types::f32, data_types::f32, format::bfwzyx, format::bfzyx - -// permute_opt for blocked format -#define CASE_PERMUTE_REORDER_TILED_F32_0 { 1, 256, 2, 64 }, { 0, 3, 1, 2 }, { 0, 2, 3, 1 }, data_types::f32, data_types::f32, format::b_fs_yx_fsv16, format::bfyx -#define CASE_PERMUTE_REORDER_TILED_F32_1 { 1, 78, 2, 259 }, { 0, 3, 1, 2 }, { 0, 2, 3, 1 }, data_types::f32, data_types::f32, format::b_fs_yx_fsv16, format::bfyx -#define CASE_PERMUTE_REORDER_TILED_F32_2 { 1, 48, 1, 3, 259 }, { 0, 4, 1, 2, 3 }, { 0, 2, 3, 4, 1 }, data_types::f32, data_types::f32, format::b_fs_zyx_fsv16, format::bfzyx - -// permute_opt for blocked format => reorder to differnt dim -#define CASE_PERMUTE_REORDER_TILED_F32_3 { 1, 45, 1, 3, 259 }, { 0, 4, 1, 2, 3 }, { 0, 2, 3, 1 }, data_types::f32, data_types::f32, format::b_fs_zyx_fsv16, format::bfyx -#define CASE_PERMUTE_REORDER_TILED_F32_4 { 2, 273, 19, 19 }, { 0, 3, 1, 2 }, { 0, 2, 3, 1 }, data_types::f32, data_types::f32, format::b_fs_yx_fsv16, format::bfyx -#define CASE_PERMUTE_REORDER_TILED_F32_5 { 2, 546, 2, 2 }, { 0, 3, 1, 2 }, { 0, 2, 3, 1 }, data_types::f32, data_types::f32, format::b_fs_yx_fsv16, format::bfyx - -// permute opt for blocked format => reorder to different dim/type -#define CASE_PERMUTE_REORDER_TILED_I8_4 { 1, 45, 1, 3, 259 }, { 0, 4, 1, 2, 3 }, { 0, 2, 3, 1 }, data_types::i8, data_types::f32, format::b_fs_zyx_fsv16, format::bfyx -#define CASE_PERMUTE_REORDER_TILED_F16_5 { 1, 48, 3, 256 }, { 0, 3, 1, 2 }, { 0, 2, 4, 3, 1 }, data_types::f16, data_types::f32, format::b_fs_yx_fsv16, format::bfzyx -#define CASE_PERMUTE_REORDER_TILED_F16_6 { 1, 48, 2, 3, 256 }, { 0, 4, 1, 2, 3 }, { 0, 2, 5, 4, 3, 1 }, data_types::f16, data_types::f32, format::b_fs_zyx_fsv16, format::bfwzyx - -// permute opt for non_blocked format => reorder to differnt dim/type -#define CASE_PERMUTE_REORDER_TILED_F16_7 { 1, 48, 2, 3, 256 }, { 0, 4, 
1, 2, 3 }, { 0, 2, 3, 1 }, data_types::f16, data_types::f32, format::bfzyx, format::bfyx -#define CASE_PERMUTE_REORDER_TILED_F16_8 { 1, 28, 2, 2, 3, 256 }, { 0, 5, 1, 2, 3, 4 }, { 0, 2, 3, 1 }, data_types::f16, data_types::f32, format::bfwzyx, format::bfyx -#define CASE_PERMUTE_REORDER_TILED_F16_9 { 1, 24, 2, 3, 256 }, { 0, 4, 1, 2, 3 }, { 0, 2, 3, 1 }, data_types::f16, data_types::f32, format::bfzyx, format::bfyx -#define CASE_PERMUTE_REORDER_TILED_F16_10 { 1, 35, 3, 253 }, { 0, 3, 1, 2 }, { 0, 2, 4, 3, 1 }, data_types::f16, data_types::f32, format::bfyx, format::bfzyx -#define CASE_PERMUTE_REORDER_TILED_F16_11 { 1, 32, 3, 253 }, { 0, 3, 1, 2 }, { 0, 2, 4, 5, 3, 1 }, data_types::f16, data_types::f32, format::bfyx, format::bfwzyx - -class PermuteReorderFusingTest : public ::BaseFusingTest { -public: - - void execute(permute_reorder_params& p) { - auto input_prim = get_mem(get_input_layout(p)); - network network_not_fused(this->engine, this->topology_non_fused, bo_not_fused); - network network_fused(this->engine, this->topology_fused, bo_fused); - network_fused.set_input_data("input", input_prim); - network_not_fused.set_input_data("input", input_prim); - compare(network_not_fused, network_fused, p, true); - } - - layout get_input_layout(permute_reorder_params& p) { - return layout{ p.permute_type, p.permute_format, p.in_shape, padding{} }; - } -}; - -class permute_redundant_reorder : public PermuteReorderFusingTest {}; - -TEST_P(permute_redundant_reorder, basic) { - auto p = GetParam(); - create_topologies( - input_layout("input", get_input_layout(p)), - permute("permute1", "input", p.permute_order1), - reorder("reorder1", "permute1", p.output_format, p.output_type), // to be fused - permute("permute2", "reorder1", p.permute_order2) // dummy last op to make reorder fused - ); - - tolerance = 1e-5f; - execute(p); -} - -INSTANTIATE_TEST_SUITE_P(fusings_gpu, permute_redundant_reorder, ::testing::ValuesIn(std::vector{ - permute_reorder_params{ 
CASE_PERMUTE_REORDER_F32_0, 3, 4 }, - permute_reorder_params{ CASE_PERMUTE_REORDER_F32_1, 3, 4 }, - permute_reorder_params{ CASE_PERMUTE_REORDER_F32_2, 3, 4 }, - permute_reorder_params{ CASE_PERMUTE_REORDER_F16_0, 3, 4 }, - permute_reorder_params{ CASE_PERMUTE_REORDER_F16_1, 3, 4 }, - permute_reorder_params{ CASE_PERMUTE_REORDER_F16_2, 3, 4 }, - permute_reorder_params{ CASE_PERMUTE_REORDER_S8_TO_F32_0, 3, 4 }, - permute_reorder_params{ CASE_PERMUTE_REORDER_S8_TO_F32_1, 3, 4 }, - permute_reorder_params{ CASE_PERMUTE_REORDER_F32_TO_F16_0, 3, 4 }, - permute_reorder_params{ CASE_PERMUTE_REORDER_U8_TO_F16_0, 3, 4 }, - permute_reorder_params{ CASE_PERMUTE_REORDER_4D_TO_5D_F32_0, 3, 3 }, - permute_reorder_params{ CASE_PERMUTE_REORDER_4D_TO_6D_F32_1, 3, 3 }, - permute_reorder_params{ CASE_PERMUTE_REORDER_5D_TO_4D_F32_0, 3, 4 }, - permute_reorder_params{ CASE_PERMUTE_REORDER_5D_TO_4D_F32_1, 3, 3 }, - permute_reorder_params{ CASE_PERMUTE_REORDER_5D_TO_6D_F32_2, 3, 4 }, - permute_reorder_params{ CASE_PERMUTE_REORDER_6D_TO_4D_F32_0, 3, 3 }, - permute_reorder_params{ CASE_PERMUTE_REORDER_6D_TO_5D_F32_1, 3, 3 }, - permute_reorder_params{ CASE_PERMUTE_REORDER_TILED_F32_0, 3, 4 }, - permute_reorder_params{ CASE_PERMUTE_REORDER_TILED_F32_1, 3, 4 }, - permute_reorder_params{ CASE_PERMUTE_REORDER_TILED_F32_2, 3, 4 }, - permute_reorder_params{ CASE_PERMUTE_REORDER_TILED_F32_3, 3, 4 }, - permute_reorder_params{ CASE_PERMUTE_REORDER_TILED_F32_4, 3, 4 }, - permute_reorder_params{ CASE_PERMUTE_REORDER_TILED_I8_4, 3, 4 }, - permute_reorder_params{ CASE_PERMUTE_REORDER_TILED_F16_5, 3, 4 }, - permute_reorder_params{ CASE_PERMUTE_REORDER_TILED_F16_6, 3, 4 }, - permute_reorder_params{ CASE_PERMUTE_REORDER_TILED_F16_7, 3, 4 }, - permute_reorder_params{ CASE_PERMUTE_REORDER_TILED_F16_8, 3, 4 }, - permute_reorder_params{ CASE_PERMUTE_REORDER_TILED_F16_9, 3, 4 }, - permute_reorder_params{ CASE_PERMUTE_REORDER_TILED_F16_10, 3, 4 }, - permute_reorder_params{ CASE_PERMUTE_REORDER_TILED_F16_11, 3, 4 
}, -})); - -class permute_act_reorder : public PermuteReorderFusingTest {}; - -TEST_P(permute_act_reorder, basic) { - auto p = GetParam(); - create_topologies( - input_layout("input", get_input_layout(p)), - permute("permute1", "input", p.permute_order1), - activation("activation", "permute1", activation_func::abs), - reorder("reorder1", "activation", p.output_format, p.output_type), // to be fused - permute("permute2", "reorder1", p.permute_order2) // dummy last op to make reorder fused - ); - - tolerance = 1e-5f; - execute(p); -} - -INSTANTIATE_TEST_SUITE_P(fusings_gpu, permute_act_reorder, ::testing::ValuesIn(std::vector{ - permute_reorder_params{ CASE_PERMUTE_REORDER_F32_0, 3, 5 }, - permute_reorder_params{ CASE_PERMUTE_REORDER_F32_1, 3, 5 }, - permute_reorder_params{ CASE_PERMUTE_REORDER_F32_2, 3, 5 }, - permute_reorder_params{ CASE_PERMUTE_REORDER_F16_0, 3, 5 }, - permute_reorder_params{ CASE_PERMUTE_REORDER_F16_1, 3, 5 }, - permute_reorder_params{ CASE_PERMUTE_REORDER_F16_2, 3, 5 }, - permute_reorder_params{ CASE_PERMUTE_REORDER_4D_TO_5D_F32_0, 3, 4 }, - permute_reorder_params{ CASE_PERMUTE_REORDER_4D_TO_6D_F32_1, 3, 4 }, - permute_reorder_params{ CASE_PERMUTE_REORDER_5D_TO_4D_F32_0, 3, 5 }, - permute_reorder_params{ CASE_PERMUTE_REORDER_5D_TO_4D_F32_1, 3, 4 }, - permute_reorder_params{ CASE_PERMUTE_REORDER_5D_TO_6D_F32_2, 3, 5 }, - permute_reorder_params{ CASE_PERMUTE_REORDER_6D_TO_4D_F32_0, 3, 4 }, - permute_reorder_params{ CASE_PERMUTE_REORDER_6D_TO_5D_F32_1, 3, 4 }, - permute_reorder_params{ CASE_PERMUTE_REORDER_TILED_F32_0, 3, 5 }, - permute_reorder_params{ CASE_PERMUTE_REORDER_TILED_F32_1, 3, 5 }, - permute_reorder_params{ CASE_PERMUTE_REORDER_TILED_F32_2, 3, 5 }, - permute_reorder_params{ CASE_PERMUTE_REORDER_TILED_F32_3, 3, 5 }, - permute_reorder_params{ CASE_PERMUTE_REORDER_TILED_F16_5, 3, 5 }, - permute_reorder_params{ CASE_PERMUTE_REORDER_TILED_F16_6, 3, 5 }, - permute_reorder_params{ CASE_PERMUTE_REORDER_TILED_F16_7, 3, 5 }, - 
permute_reorder_params{ CASE_PERMUTE_REORDER_TILED_F16_8, 3, 5 }, - permute_reorder_params{ CASE_PERMUTE_REORDER_TILED_F16_9, 3, 5 }, - permute_reorder_params{ CASE_PERMUTE_REORDER_TILED_F16_10, 3, 5 }, - permute_reorder_params{ CASE_PERMUTE_REORDER_TILED_F16_11, 3, 5 }, -})); - -class NormalizeFusingTest : public ::BaseFusingTest { -public: - void execute(normalize_test_params& p) { - auto input_prim = get_mem(get_input_layout(p)); - network network_not_fused(this->engine, this->topology_non_fused, bo_not_fused); - network network_fused(this->engine, this->topology_fused, bo_fused); - network_fused.set_input_data("input", input_prim); - network_not_fused.set_input_data("input", input_prim); - - compare(network_not_fused, network_fused, p); - } - - layout get_input_layout(normalize_test_params& p) { - return layout{ p.data_type, p.input_format, p.in_shape }; - } - - layout get_per_channel_layout(normalize_test_params& p) { - return layout{ p.default_type, p.default_format, tensor{ 1, p.in_shape.feature[0], 1, 1 } }; - } - - layout get_weights_layout(normalize_test_params& p) { - return layout { p.default_type, p.default_format, tensor{ 1, p.in_shape.feature[0], 1, 1 } }; - } -}; - -class normalize_i8_quantize : public NormalizeFusingTest {}; -TEST_P(normalize_i8_quantize, basic) { - auto p = GetParam(); - create_topologies( - input_layout("input", get_input_layout(p)), - data("weights", get_mem(get_weights_layout(p))), - data("in_lo", get_mem(get_single_element_layout(p), min_random, 0)), - data("in_hi", get_mem(get_single_element_layout(p), 1, max_random)), - data("out_lo", get_mem(get_single_element_layout(p), 0)), - data("out_hi", get_mem(get_single_element_layout(p), 255)), - normalize("normalizel2", "input", "weights", p.across_spatial), - quantize("quantize", "normalizel2", "in_lo", "in_hi", "out_lo", "out_hi", 255, data_types::u8), - reorder("output_reorder", "quantize", p.default_format, data_types::f32) - ); - - tolerance = 1; - execute(p); -} - 
-INSTANTIATE_TEST_SUITE_P(fusings_gpu, normalize_i8_quantize, ::testing::ValuesIn(std::vector{ - normalize_test_params{ CASE_NORMALIZE_I8_1, false, 2, 3 }, - normalize_test_params{ CASE_NORMALIZE_I8_1, true, 2, 3 }, -})); - -class normalize_i8_float : public NormalizeFusingTest {}; -TEST_P(normalize_i8_float, basic) { - auto p = GetParam(); - create_topologies( - input_layout("input", get_input_layout(p)), - data("weights", get_mem(get_weights_layout(p))), - data("scale_data", get_mem(get_per_channel_layout(p), 1.0f/255)), - normalize("normalizel2", "input", "weights", p.across_spatial), - scale("scale", "normalizel2", "scale_data"), - activation("activation", "scale", activation_func::abs), - reorder("output_reorder", "activation", p.default_format, data_types::f32) - ); - - tolerance = 1e-05f; - execute(p); -} - -INSTANTIATE_TEST_SUITE_P(fusings_gpu, normalize_i8_float, ::testing::ValuesIn(std::vector{ - normalize_test_params{ CASE_NORMALIZE_I8_1, false, 2, 4 }, - normalize_test_params{ CASE_NORMALIZE_I8_1, true, 2, 4 }, -})); - -/* ----------------------------------------------------------------------------------------------------- */ -/* ---------------------------------------- BatchToSpace cases ----------------------------------------- */ -/* ----------------------------------------------------------------------------------------------------- */ - -struct batch_to_space_test_params { - tensor input_size; - tensor output_size; - data_types input_type; - format input_format; - tensor block_shape; - tensor crops_begin; - tensor crops_end; - data_types default_type; - format default_format; - size_t expected_fused_primitives; - size_t expected_not_fused_primitives; -}; - -#define CASE_BATCH_TO_SPACE_F32_1 { 8, 1, 1, 1 }, { 2, 1, 2, 2 }, data_types::f32, format::bfyx, { 1, 1, 2, 2 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, data_types::f32, format::bfyx -#define CASE_BATCH_TO_SPACE_F32_2 { 64, 16, 2, 2 }, { 2, 112, 4, 4 }, data_types::f32, format::b_fs_yx_fsv16, { 1, 8, 2, 
2 }, { 0, 8, 0, 0 }, { 0, 8, 0, 0 }, data_types::f32, format::bfyx -#define CASE_BATCH_TO_SPACE_F16_1 { 16, 4, 1, 2 }, { 2, 12, 1, 2 }, data_types::f16, format::bfyx, { 1, 4, 2, 1 }, { 0, 2, 1, 0 }, { 0, 2, 0, 0 }, data_types::f32, format::bfyx -#define CASE_BATCH_TO_SPACE_F16_2 { 32, 16, 2, 1 }, { 1, 16, 32, 2 }, data_types::f16, format::b_fs_yx_fsv16, { 1, 1, 16, 2 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, data_types::f32, format::bfyx -#define CASE_BATCH_TO_SPACE_U8_1 { 30, 12, 4, 6 }, { 1, 52, 8, 9 }, data_types::u8, format::bfyx, { 1, 5, 2, 3 }, { 0, 8, 0, 9 }, { 0, 0, 0, 0 }, data_types::f32, format::bfyx -#define CASE_BATCH_TO_SPACE_U8_2 { 24, 32, 4, 5 }, { 2, 64, 12, 8 }, data_types::u8, format::b_fs_yx_fsv16, { 1, 2, 3, 2 }, { 0, 0, 0, 2 }, { 0, 0, 0, 0 }, data_types::f32, format::bfyx -#define CASE_BATCH_TO_SPACE_I8_1 { 32, 1, 3, 4 }, { 1, 8, 6, 8 }, data_types::i8, format::bfyx, { 1, 8, 2, 2 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, data_types::f32, format::bfyx -#define CASE_BATCH_TO_SPACE_I8_2 { 16, 16, 2, 1 }, { 2, 32, 4, 2 }, data_types::i8, format::b_fs_yx_fsv16, { 1, 2, 2, 2 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, data_types::f32, format::bfyx - -class BatchToSpaceFusingsTest : public ::BaseFusingTest { -public: - void execute(batch_to_space_test_params& p) { - auto input_prim = get_mem(get_input_layout(p)); - - network network_not_fused(this->engine, this->topology_non_fused, bo_not_fused); - network network_fused(this->engine, this->topology_fused, bo_fused); - - network_fused.set_input_data("input", input_prim); - network_not_fused.set_input_data("input", input_prim); - - compare(network_not_fused, network_fused, p); - } - - layout get_input_layout(batch_to_space_test_params& p) { - return layout{ p.input_type, p.input_format, p.input_size }; - } - - layout get_per_channel_layout(batch_to_space_test_params& p) { - return layout{ p.default_type, p.default_format, tensor{ 1, p.output_size.feature[0], 1, 1 } }; - } -}; - -class batch_to_space_quantize_i8 : public 
BatchToSpaceFusingsTest {}; -TEST_P(batch_to_space_quantize_i8, basic) { - auto p = GetParam(); - create_topologies( - input_layout("input", get_input_layout(p)), - batch_to_space("batch_to_space", "input", p.block_shape, p.crops_begin, p.crops_end, p.output_size), - data("in_low", get_mem(get_per_channel_layout(p), min_random, 0)), - data("in_high", get_mem(get_per_channel_layout(p), 1, max_random)), - data("out_low", get_mem(get_single_element_layout(p), -128)), - data("out_high", get_mem(get_single_element_layout(p), 127)), - quantize("quant", "batch_to_space", "in_low", "in_high", "out_low", "out_high", 256, data_types::i8), - reorder("reorder_bfyx", "quant", p.default_format, data_types::f32) - ); - - tolerance = 1.f; - execute(p); -} - -INSTANTIATE_TEST_SUITE_P(fusings_gpu, batch_to_space_quantize_i8, ::testing::ValuesIn(std::vector{ - batch_to_space_test_params{ CASE_BATCH_TO_SPACE_F32_1, 2, 3 }, - batch_to_space_test_params{ CASE_BATCH_TO_SPACE_F32_2, 2, 3 }, - batch_to_space_test_params{ CASE_BATCH_TO_SPACE_F16_1, 2, 3 }, - batch_to_space_test_params{ CASE_BATCH_TO_SPACE_F16_2, 2, 3 }, -})); - -class batch_to_space_scale_act_eltwise_quantize_u8 : public BatchToSpaceFusingsTest {}; -TEST_P(batch_to_space_scale_act_eltwise_quantize_u8, basic) { - auto p = GetParam(); - create_topologies( - input_layout("input", get_input_layout(p)), - batch_to_space("batch_to_space", "input", p.block_shape, p.crops_begin, p.crops_end, p.output_size), - data("scale1_data", get_mem(get_per_channel_layout(p), -0.125f)), - scale("scale1", "batch_to_space", "scale1_data"), - activation("actv1", "scale1", activation_func::relu), - data("eltw_data", get_mem(layout(p.default_type, p.input_format, p.output_size))), - eltwise("eltw", { "actv1", "eltw_data" }, eltwise_mode::sum, p.default_type), - data("in_low", get_mem(get_per_channel_layout(p), min_random, 0)), - data("in_high", get_mem(get_per_channel_layout(p), 1, max_random)), - data("out_low", 
get_mem(get_single_element_layout(p), 0)), - data("out_high", get_mem(get_single_element_layout(p), 255)), - quantize("quant", "eltw", "in_low", "in_high", "out_low", "out_high", 256, data_types::u8), - reorder("reorder_bfyx", "quant", p.default_format, data_types::f32) - ); - - tolerance = 1.f; - execute(p); -} - -INSTANTIATE_TEST_SUITE_P(fusings_gpu, batch_to_space_scale_act_eltwise_quantize_u8, ::testing::ValuesIn(std::vector{ - batch_to_space_test_params{ CASE_BATCH_TO_SPACE_F32_1, 2, 6 }, - batch_to_space_test_params{ CASE_BATCH_TO_SPACE_F32_2, 2, 6 }, - batch_to_space_test_params{ CASE_BATCH_TO_SPACE_F16_1, 2, 6 }, - batch_to_space_test_params{ CASE_BATCH_TO_SPACE_F16_2, 2, 6 }, - batch_to_space_test_params{ CASE_BATCH_TO_SPACE_U8_1, 2, 6 }, - batch_to_space_test_params{ CASE_BATCH_TO_SPACE_U8_2, 2, 6 }, - batch_to_space_test_params{ CASE_BATCH_TO_SPACE_I8_1, 2, 6 }, - batch_to_space_test_params{ CASE_BATCH_TO_SPACE_I8_2, 2, 6 }, -})); - -class batch_to_space_scale_act_eltw : public BatchToSpaceFusingsTest {}; -TEST_P(batch_to_space_scale_act_eltw, basic) { - auto p = GetParam(); - create_topologies( - input_layout("input", get_input_layout(p)), - batch_to_space("batch_to_space", "input", p.block_shape, p.crops_begin, p.crops_end, p.output_size), - data("scale1_data", get_mem(get_per_channel_layout(p), -0.125f)), - scale("scale1", "batch_to_space", "scale1_data"), - activation("actv1", "scale1", activation_func::relu), - data("eltw_data", get_mem(layout(p.default_type, p.input_format, p.output_size))), - eltwise("eltw", { "actv1", "eltw_data" }, eltwise_mode::sum, p.default_type), - reorder("reorder_bfyx", "eltw", p.default_format, data_types::f32) - ); - - tolerance = 1e-5f; - execute(p); -} - -INSTANTIATE_TEST_SUITE_P(fusings_gpu, batch_to_space_scale_act_eltw, ::testing::ValuesIn(std::vector{ - batch_to_space_test_params{ CASE_BATCH_TO_SPACE_F32_1, 2, 5 }, - batch_to_space_test_params{ CASE_BATCH_TO_SPACE_F32_2, 2, 5 }, - batch_to_space_test_params{ 
CASE_BATCH_TO_SPACE_F16_1, 2, 5 }, - batch_to_space_test_params{ CASE_BATCH_TO_SPACE_F16_2, 2, 5 }, - batch_to_space_test_params{ CASE_BATCH_TO_SPACE_U8_1, 2, 5 }, - batch_to_space_test_params{ CASE_BATCH_TO_SPACE_U8_2, 2, 5 }, - batch_to_space_test_params{ CASE_BATCH_TO_SPACE_I8_1, 2, 5 }, - batch_to_space_test_params{ CASE_BATCH_TO_SPACE_I8_2, 2, 5 }, -})); - -/* ----------------------------------------------------------------------------------------------------- */ -/* ---------------------------------------- SpaceToBatch cases ----------------------------------------- */ -/* ----------------------------------------------------------------------------------------------------- */ - -struct space_to_batch_test_params { - tensor input_size; - tensor output_size; - data_types input_type; - format input_format; - tensor block_shape; - tensor pads_begin; - tensor pads_end; - data_types default_type; - format default_format; - size_t expected_fused_primitives; - size_t expected_not_fused_primitives; -}; - -#define CASE_SPACE_TO_BATCH_F32_1 { 1, 4, 8, 8 }, { 16, 2, 3, 8 }, data_types::f32, format::bfyx, { 1, 2, 4, 1 }, { 0, 0, 4, 0 }, { 0, 0, 0, 0 }, data_types::f32, format::bfyx -#define CASE_SPACE_TO_BATCH_F32_2 { 2, 16, 4, 6 }, { 24, 4, 4, 3 }, data_types::f32, format::b_fs_yx_fsv16, { 1, 4, 1, 3 }, { 0, 0, 0, 0 }, { 0, 0, 0, 3 }, data_types::f32, format::bfyx -#define CASE_SPACE_TO_BATCH_F16_1 { 1, 1, 6, 8 }, { 48, 1, 1, 1 }, data_types::f16, format::bfyx, { 1, 1, 6, 8 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, data_types::f32, format::bfyx -#define CASE_SPACE_TO_BATCH_F16_2 { 1, 32, 1, 5 }, { 20, 4, 1, 4 }, data_types::f16, format::b_fs_yx_fsv16, { 1, 10, 1, 2 }, { 0, 8, 0, 0 }, { 0, 0, 0, 3 }, data_types::f32, format::bfyx -#define CASE_SPACE_TO_BATCH_U8_1 { 3, 12, 4, 8 }, { 48, 6, 2, 3 }, data_types::u8, format::bfyx, { 1, 2, 2, 4 }, { 0, 0, 0, 4 }, { 0, 0, 0, 0 }, data_types::f32, format::bfyx -#define CASE_SPACE_TO_BATCH_U8_2 { 2, 16, 3, 6 }, { 30, 4, 1, 6 }, 
data_types::u8, format::b_fs_yx_fsv16, { 1, 5, 3, 1 }, { 0, 4, 0, 0 }, { 0, 0, 0, 0 }, data_types::f32, format::bfyx -#define CASE_SPACE_TO_BATCH_I8_1 { 1, 2, 8, 1 }, { 4, 2, 2, 1 }, data_types::i8, format::bfyx, { 1, 1, 4, 1 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, data_types::f32, format::bfyx -#define CASE_SPACE_TO_BATCH_I8_2 { 1, 32, 4, 8 }, { 48, 2, 6, 3 }, data_types::i8, format::b_fs_yx_fsv16, { 1, 16, 1, 3 }, { 0, 0, 2, 0 }, { 0, 0, 0, 1 }, data_types::f32, format::bfyx - -class SpaceToBatchFusingsTest : public ::BaseFusingTest { -public: - void execute(space_to_batch_test_params& p) { - auto input_prim = get_mem(get_input_layout(p)); - - network network_not_fused(this->engine, this->topology_non_fused, bo_not_fused); - network network_fused(this->engine, this->topology_fused, bo_fused); - - network_fused.set_input_data("input", input_prim); - network_not_fused.set_input_data("input", input_prim); - - compare(network_not_fused, network_fused, p); - } - - layout get_input_layout(space_to_batch_test_params& p) { - return layout{ p.input_type, p.input_format, p.input_size }; - } - - layout get_per_channel_layout(space_to_batch_test_params& p) { - return layout{ p.default_type, p.default_format, tensor{ 1, p.output_size.feature[0], 1, 1 } }; - } -}; - -class space_to_batch_quantize_i8 : public SpaceToBatchFusingsTest {}; -TEST_P(space_to_batch_quantize_i8, basic) { - auto p = GetParam(); - create_topologies( - input_layout("input", get_input_layout(p)), - space_to_batch("space_to_batch", "input", p.block_shape, p.pads_begin, p.pads_end, p.output_size), - data("in_low", get_mem(get_per_channel_layout(p), min_random, 0)), - data("in_high", get_mem(get_per_channel_layout(p), 1, max_random)), - data("out_low", get_mem(get_single_element_layout(p), -128)), - data("out_high", get_mem(get_single_element_layout(p), 127)), - quantize("quant", "space_to_batch", "in_low", "in_high", "out_low", "out_high", 256, data_types::i8), - reorder("reorder_bfyx", "quant", 
p.default_format, data_types::f32) - ); - - tolerance = 1.f; - execute(p); -} - -INSTANTIATE_TEST_SUITE_P(fusings_gpu, space_to_batch_quantize_i8, ::testing::ValuesIn(std::vector{ - space_to_batch_test_params{ CASE_SPACE_TO_BATCH_F32_1, 2, 3 }, - space_to_batch_test_params{ CASE_SPACE_TO_BATCH_F32_2, 2, 3 }, - space_to_batch_test_params{ CASE_SPACE_TO_BATCH_F16_1, 2, 3 }, - space_to_batch_test_params{ CASE_SPACE_TO_BATCH_F16_2, 2, 3 }, -})); - -class space_to_batch_scale_act_eltwise_quantize_u8 : public SpaceToBatchFusingsTest {}; -TEST_P(space_to_batch_scale_act_eltwise_quantize_u8, basic) { - auto p = GetParam(); - create_topologies( - input_layout("input", get_input_layout(p)), - space_to_batch("space_to_batch", "input", p.block_shape, p.pads_begin, p.pads_end, p.output_size), - data("scale1_data", get_mem(get_per_channel_layout(p), -0.125f)), - scale("scale1", "space_to_batch", "scale1_data"), - activation("actv1", "scale1", activation_func::relu), - data("eltw_data", get_mem(layout(p.default_type, p.input_format, p.output_size))), - eltwise("eltw", { "actv1", "eltw_data" }, eltwise_mode::sum, p.default_type), - data("in_low", get_mem(get_per_channel_layout(p), min_random, 0)), - data("in_high", get_mem(get_per_channel_layout(p), 1, max_random)), - data("out_low", get_mem(get_single_element_layout(p), 0)), - data("out_high", get_mem(get_single_element_layout(p), 255)), - quantize("quant", "eltw", "in_low", "in_high", "out_low", "out_high", 256, data_types::u8), - reorder("reorder_bfyx", "quant", p.default_format, data_types::f32) - ); - - tolerance = 1.f; - execute(p); -} - -INSTANTIATE_TEST_SUITE_P(fusings_gpu, space_to_batch_scale_act_eltwise_quantize_u8, ::testing::ValuesIn(std::vector{ - space_to_batch_test_params{ CASE_SPACE_TO_BATCH_F32_1, 2, 6 }, - space_to_batch_test_params{ CASE_SPACE_TO_BATCH_F32_2, 2, 6 }, - space_to_batch_test_params{ CASE_SPACE_TO_BATCH_F16_1, 2, 6 }, - space_to_batch_test_params{ CASE_SPACE_TO_BATCH_F16_2, 2, 6 }, - 
space_to_batch_test_params{ CASE_SPACE_TO_BATCH_U8_1, 2, 6 }, - space_to_batch_test_params{ CASE_SPACE_TO_BATCH_U8_2, 2, 6 }, - space_to_batch_test_params{ CASE_SPACE_TO_BATCH_I8_1, 2, 6 }, - space_to_batch_test_params{ CASE_SPACE_TO_BATCH_I8_2, 2, 6 }, -})); - - -class space_to_batch_scale_act_eltw : public SpaceToBatchFusingsTest {}; -TEST_P(space_to_batch_scale_act_eltw, basic) { - auto p = GetParam(); - create_topologies( - input_layout("input", get_input_layout(p)), - space_to_batch("space_to_batch", "input", p.block_shape, p.pads_begin, p.pads_end, p.output_size), - data("scale1_data", get_mem(get_per_channel_layout(p), -0.125f)), - scale("scale1", "space_to_batch", "scale1_data"), - activation("actv1", "scale1", activation_func::relu), - data("eltw_data", get_mem(layout(p.default_type, p.input_format, p.output_size))), - eltwise("eltw", { "actv1", "eltw_data" }, eltwise_mode::sum, p.default_type), - reorder("reorder_bfyx", "eltw", p.default_format, data_types::f32) - ); - - tolerance = 1e-5f; - execute(p); -} - -INSTANTIATE_TEST_SUITE_P(fusings_gpu, space_to_batch_scale_act_eltw, ::testing::ValuesIn(std::vector{ - space_to_batch_test_params{ CASE_SPACE_TO_BATCH_F32_1, 2, 5 }, - space_to_batch_test_params{ CASE_SPACE_TO_BATCH_F32_2, 2, 5 }, - space_to_batch_test_params{ CASE_SPACE_TO_BATCH_F16_1, 2, 5 }, - space_to_batch_test_params{ CASE_SPACE_TO_BATCH_F16_2, 2, 5 }, - space_to_batch_test_params{ CASE_SPACE_TO_BATCH_U8_1, 2, 5 }, - space_to_batch_test_params{ CASE_SPACE_TO_BATCH_U8_2, 2, 5 }, - space_to_batch_test_params{ CASE_SPACE_TO_BATCH_I8_1, 2, 5 }, - space_to_batch_test_params{ CASE_SPACE_TO_BATCH_I8_2, 2, 5 }, -})); - -/* ----------------------------------------------------------------------------------------------------- */ -/* ---------------------------------------- Eltwise cases ---------------------------------------------- */ -/* ----------------------------------------------------------------------------------------------------- */ - -struct 
eltwise_test_params { - tensor input_size; - data_types input_type; - data_types input_type2; - format input_format; - data_types default_type; - format default_format; - eltwise_mode mode; - size_t expected_fused_primitives; - size_t expected_not_fused_primitives; -}; - -#define CASE_ELTWISE_FP32_1 { 2, 16, 4, 4 }, data_types::f32, data_types::f32, format::bfyx, data_types::f32, format::bfyx, eltwise_mode::sum -#define CASE_ELTWISE_FP32_2 { 2, 16, 4, 4 }, data_types::f32, data_types::f32, format::bfzyx, data_types::f32, format::bfzyx, eltwise_mode::sum -#define CASE_ELTWISE_FP32_3 { 2, 32, 4, 8 }, data_types::f32, data_types::f32, format::b_fs_yx_fsv16, data_types::f32, format::b_fs_yx_fsv16, eltwise_mode::sum -#define CASE_ELTWISE_FP32_4 { 2, 16, 4, 4 }, data_types::f32, data_types::f32, format::bfwzyx, data_types::f32, format::bfwzyx, eltwise_mode::sum -#define CASE_ELTWISE_FP16_1 { 2, 16, 4, 4 }, data_types::f16, data_types::f16, format::bfyx, data_types::f16, format::bfyx, eltwise_mode::sum -#define CASE_ELTWISE_FP16_2 { 2, 16, 4, 4 }, data_types::f16, data_types::f16, format::bfzyx, data_types::f16, format::bfzyx, eltwise_mode::sum -#define CASE_ELTWISE_FP16_3 { 2, 32, 4, 8 }, data_types::f16, data_types::f16, format::b_fs_yx_fsv16, data_types::f16, format::b_fs_yx_fsv16, eltwise_mode::sum -#define CASE_ELTWISE_FP16_4 { 3, 32, 4, 4 }, data_types::f16, data_types::f16, format::fs_b_yx_fsv32, data_types::f16, format::fs_b_yx_fsv32, eltwise_mode::sum -#define CASE_ELTWISE_I8_1 { 2, 16, 4, 4 }, data_types::i8, data_types::i8, format::bfyx, data_types::f32, format::bfyx, eltwise_mode::sum -#define CASE_ELTWISE_I8_2 { 2, 16, 4, 4 }, data_types::i8, data_types::i8, format::bfzyx, data_types::f32, format::bfzyx, eltwise_mode::sum -#define CASE_ELTWISE_I8_3 { 2, 16, 4, 4 }, data_types::i8, data_types::i8, format::b_fs_yx_fsv16, data_types::f32, format::b_fs_yx_fsv16, eltwise_mode::sum -#define CASE_ELTWISE_U8_1 { 2, 16, 4, 4 }, data_types::u8, data_types::u8, 
format::bfyx, data_types::f32, format::bfyx, eltwise_mode::sum -#define CASE_ELTWISE_U8_2 { 2, 16, 4, 4 }, data_types::u8, data_types::u8, format::bfzyx, data_types::f32, format::bfzyx, eltwise_mode::sum -#define CASE_ELTWISE_U8_3 { 2, 16, 4, 4 }, data_types::u8, data_types::u8, format::b_fs_yx_fsv16, data_types::f32, format::b_fs_yx_fsv16, eltwise_mode::sum -#define CASE_ELTWISE_FP32_FP16_1 { 2, 16, 4, 4 }, data_types::f32, data_types::f16, format::bfyx, data_types::f32, format::bfyx, eltwise_mode::sum -#define CASE_ELTWISE_FP32_FP16_2 { 2, 16, 4, 4 }, data_types::f32, data_types::f16, format::bfzyx, data_types::f32, format::bfzyx, eltwise_mode::sum -#define CASE_ELTWISE_FP32_FP16_3 { 2, 32, 4, 4 }, data_types::f32, data_types::f16, format::b_fs_yx_fsv16, data_types::f32, format::b_fs_yx_fsv16, eltwise_mode::sum -#define CASE_ELTWISE_FP16_FP32_1 { 2, 16, 4, 4 }, data_types::f16, data_types::f32, format::bfyx, data_types::f16, format::bfyx, eltwise_mode::sum -#define CASE_ELTWISE_FP16_FP32_2 { 2, 16, 4, 4 }, data_types::f16, data_types::f32, format::bfzyx, data_types::f16, format::bfzyx, eltwise_mode::sum -#define CASE_ELTWISE_FP16_FP32_3 { 2, 32, 4, 4 }, data_types::f16, data_types::f32, format::b_fs_yx_fsv16, data_types::f16, format::b_fs_yx_fsv16, eltwise_mode::sum -#define CASE_ELTWISE_I8_FP16_1 { 2, 16, 4, 4 }, data_types::i8, data_types::f16, format::bfyx, data_types::f32, format::bfyx, eltwise_mode::sum -#define CASE_ELTWISE_I8_FP16_2 { 2, 16, 4, 4 }, data_types::i8, data_types::f16, format::bfzyx, data_types::f32, format::bfzyx, eltwise_mode::sum -#define CASE_ELTWISE_I8_FP16_3 { 2, 32, 4, 4 }, data_types::i8, data_types::f16, format::b_fs_yx_fsv16, data_types::f32, format::b_fs_yx_fsv16, eltwise_mode::sum -#define CASE_ELTWISE_I8_FP32_1 { 2, 16, 4, 4 }, data_types::i8, data_types::f32, format::bfyx, data_types::f16, format::bfyx, eltwise_mode::sum -#define CASE_ELTWISE_I8_FP32_2 { 2, 16, 4, 4 }, data_types::i8, data_types::f32, format::bfzyx, 
data_types::f16, format::bfzyx, eltwise_mode::sum -#define CASE_ELTWISE_I8_FP32_3 { 2, 32, 4, 4 }, data_types::i8, data_types::f32, format::b_fs_yx_fsv16, data_types::f16, format::b_fs_yx_fsv16, eltwise_mode::sum -#define CASE_ELTWISE_U8_FP16_1 { 2, 16, 4, 4 }, data_types::u8, data_types::f16, format::bfyx, data_types::f32, format::bfyx, eltwise_mode::sum -#define CASE_ELTWISE_U8_FP16_2 { 2, 16, 4, 4 }, data_types::u8, data_types::f16, format::bfzyx, data_types::f32, format::bfzyx, eltwise_mode::sum -#define CASE_ELTWISE_U8_FP16_3 { 2, 32, 4, 4 }, data_types::u8, data_types::f16, format::b_fs_yx_fsv16, data_types::f32, format::b_fs_yx_fsv16, eltwise_mode::sum -#define CASE_ELTWISE_U8_FP32_1 { 2, 16, 4, 4 }, data_types::u8, data_types::f32, format::bfyx, data_types::f16, format::bfyx, eltwise_mode::sum -#define CASE_ELTWISE_U8_FP32_2 { 2, 16, 4, 4 }, data_types::u8, data_types::f32, format::bfzyx, data_types::f16, format::bfzyx, eltwise_mode::sum -#define CASE_ELTWISE_U8_FP32_3 { 2, 32, 4, 4 }, data_types::u8, data_types::f32, format::b_fs_yx_fsv16, data_types::f16, format::b_fs_yx_fsv16, eltwise_mode::sum - -#define CASE_ELTWISE_FP32_5 { 1, 5, 4, 4 }, data_types::f32, data_types::f32, format::b_fs_yx_fsv4, data_types::f32, format::b_fs_yx_fsv4, eltwise_mode::sum -#define CASE_ELTWISE_FP32_6 { 2, 32, 4, 8 }, data_types::f32, data_types::f32, format::b_fs_yx_fsv4, data_types::f32, format::b_fs_yx_fsv4, eltwise_mode::sum -#define CASE_ELTWISE_FP16_5 { 2, 32, 4, 8 }, data_types::f16, data_types::f16, format::b_fs_yx_fsv4, data_types::f16, format::b_fs_yx_fsv4, eltwise_mode::sum -#define CASE_ELTWISE_FP16_6 { 1, 32, 4, 8 }, data_types::f16, data_types::f16, format::byxf, data_types::f16, format::byxf, eltwise_mode::sum -#define CASE_ELTWISE_I8_4 { 2, 16, 4, 4 }, data_types::i8, data_types::i8, format::b_fs_yx_fsv4, data_types::f32, format::b_fs_yx_fsv4, eltwise_mode::sum -#define CASE_ELTWISE_U8_4 { 2, 16, 4, 4 }, data_types::u8, data_types::u8, format::b_fs_yx_fsv4, 
data_types::f32, format::b_fs_yx_fsv4, eltwise_mode::sum - - -class EltwiseFusingTest : public ::BaseFusingTest { -public: - void execute(eltwise_test_params& p) { - auto input_prim = get_mem(get_input_layout(p)); - auto input_prim2 = get_mem(get_input_layout2(p)); - - network network_not_fused(this->engine, this->topology_non_fused, bo_not_fused); - network network_fused(this->engine, this->topology_fused, bo_fused); - - auto inputs = network_fused.get_input_ids(); - network_fused.set_input_data("input", input_prim); - network_not_fused.set_input_data("input", input_prim); - if (std::find(inputs.begin(), inputs.end(), "input2") != inputs.end()) { - network_fused.set_input_data("input2", input_prim2); - network_not_fused.set_input_data("input2", input_prim2); - } - - compare(network_not_fused, network_fused, p); - } - - layout get_input_layout(eltwise_test_params& p) { - return layout{ p.input_type, p.input_format, p.input_size }; - } - - layout get_input_layout2(eltwise_test_params& p) { - return layout{ p.input_type2, p.input_format, p.input_size }; - } - - layout get_per_channel_layout(eltwise_test_params& p) { - return layout{ p.default_type, p.default_format, tensor{ 1, p.input_size.feature[0], 1, 1 } }; - } -}; - -class eltwise_quantize : public EltwiseFusingTest {}; -TEST_P(eltwise_quantize, u8) { - auto p = GetParam(); - create_topologies( - input_layout("input", get_input_layout(p)), - input_layout("input2", get_input_layout2(p)), - eltwise("eltwise", { "input", "input2" }, p.mode, p.default_type), - data("in_lo", get_mem(get_single_element_layout(p), min_random, 0)), - data("in_hi", get_mem(get_single_element_layout(p), 1, max_random)), - data("out_lo", get_mem(get_single_element_layout(p), 0)), - data("out_hi", get_mem(get_single_element_layout(p), 255)), - quantize("quantize", "eltwise", "in_lo", "in_hi", "out_lo", "out_hi", 256, data_types::u8), - reorder("out", "quantize", p.default_format, data_types::f32) - ); - - tolerance = 1.f; - execute(p); -} - 
-TEST_P(eltwise_quantize, i8_per_channel) { - auto p = GetParam(); - create_topologies( - input_layout("input", get_input_layout(p)), - input_layout("input2", get_input_layout2(p)), - eltwise("eltwise", { "input", "input2" }, p.mode, p.default_type), - data("in_lo", get_mem(get_per_channel_layout(p), min_random, 0)), - data("in_hi", get_mem(get_per_channel_layout(p), 1, max_random)), - data("out_lo", get_mem(get_single_element_layout(p), -128)), - data("out_hi", get_mem(get_single_element_layout(p), 127)), - quantize("quantize", "eltwise", "in_lo", "in_hi", "out_lo", "out_hi", 256, data_types::i8), - reorder("out", "quantize", p.default_format, data_types::f32) - ); - - tolerance = 1.f; - execute(p); -} - -INSTANTIATE_TEST_SUITE_P(fusings_gpu, eltwise_quantize, ::testing::ValuesIn(std::vector{ - eltwise_test_params{ CASE_ELTWISE_FP16_1, 3, 4 }, - eltwise_test_params{ CASE_ELTWISE_FP16_2, 3, 4 }, - eltwise_test_params{ CASE_ELTWISE_FP16_3, 3, 4 }, - eltwise_test_params{ CASE_ELTWISE_FP32_1, 3, 4 }, - eltwise_test_params{ CASE_ELTWISE_FP32_2, 3, 4 }, - eltwise_test_params{ CASE_ELTWISE_FP32_3, 3, 4 }, - eltwise_test_params{ CASE_ELTWISE_FP32_FP16_1, 3, 4 }, - eltwise_test_params{ CASE_ELTWISE_FP32_FP16_2, 3, 4 }, - eltwise_test_params{ CASE_ELTWISE_FP32_FP16_3, 3, 4 }, - eltwise_test_params{ CASE_ELTWISE_FP16_FP32_1, 3, 4 }, - eltwise_test_params{ CASE_ELTWISE_FP16_FP32_2, 3, 4 }, - eltwise_test_params{ CASE_ELTWISE_FP16_FP32_3, 3, 4 }, - eltwise_test_params{ CASE_ELTWISE_I8_FP32_1, 3, 4 }, - eltwise_test_params{ CASE_ELTWISE_I8_FP32_2, 3, 4 }, - eltwise_test_params{ CASE_ELTWISE_I8_FP32_3, 3, 4 }, - eltwise_test_params{ CASE_ELTWISE_U8_FP32_1, 3, 4 }, - eltwise_test_params{ CASE_ELTWISE_U8_FP32_2, 3, 4 }, - eltwise_test_params{ CASE_ELTWISE_U8_FP32_3, 3, 4 }, - eltwise_test_params{ CASE_ELTWISE_I8_FP16_1, 3, 4 }, - eltwise_test_params{ CASE_ELTWISE_I8_FP16_2, 3, 4 }, - eltwise_test_params{ CASE_ELTWISE_I8_FP16_3, 3, 4 }, - eltwise_test_params{ 
CASE_ELTWISE_U8_FP16_1, 3, 4 }, - eltwise_test_params{ CASE_ELTWISE_U8_FP16_2, 3, 4 }, - eltwise_test_params{ CASE_ELTWISE_U8_FP16_3, 3, 4 }, - // fsv4 - eltwise_test_params{ CASE_ELTWISE_FP16_5, 3, 4 }, - eltwise_test_params{ CASE_ELTWISE_FP32_5, 3, 4 }, - eltwise_test_params{ CASE_ELTWISE_FP32_6, 3, 4 }, - eltwise_test_params{ CASE_ELTWISE_I8_4, 3, 4 }, - eltwise_test_params{ CASE_ELTWISE_U8_4, 3, 4 }, -})); - -class eltwise_const_path : public EltwiseFusingTest {}; -TEST_P(eltwise_const_path, not_fuse_to_const_eltwise) { - auto p = GetParam(); - create_topologies( - data("const1", get_mem(get_input_layout2(p), -10, 10)), - data("const2", get_mem(get_input_layout2(p), -10, 10)), - input_layout("input", get_input_layout2(p)), - eltwise("eltwise", { "const1", "const2" }, p.mode, p.default_type), - eltwise("add", { "eltwise", "input" }, eltwise_mode::sum), - activation("activation", "add", activation_func::negative), - reorder("out", "activation", p.default_format, data_types::f32) - ); - - tolerance = 1e-5f; - execute(p); -} - -INSTANTIATE_TEST_SUITE_P(fusings_gpu, eltwise_const_path, ::testing::ValuesIn(std::vector{ - eltwise_test_params{ CASE_ELTWISE_FP16_3, 2, 3 }, - eltwise_test_params{ CASE_ELTWISE_FP32_3, 2, 3 }, - eltwise_test_params{ CASE_ELTWISE_FP32_5, 2, 3 }, - eltwise_test_params{ CASE_ELTWISE_FP32_6, 2, 3 }, - eltwise_test_params{ CASE_ELTWISE_I8_4, 2, 3 }, - eltwise_test_params{ CASE_ELTWISE_U8_4, 2, 3 }, -})); - -class eltwise_fp32_fsv16 : public EltwiseFusingTest {}; -TEST_P(eltwise_fp32_fsv16, add) { - auto p = GetParam(); - create_topologies( - input_layout("input", get_input_layout(p)), - input_layout("input2", get_input_layout2(p)), - data("add_data", get_mem(get_per_channel_layout(p), -10, 10)), - eltwise("eltwise", { "input", "input2" }, p.mode, p.default_type), - eltwise("add", { "eltwise", "add_data" }, eltwise_mode::sum), - activation("activation", "add", activation_func::negative), - reorder("out", "activation", p.default_format, 
data_types::f32) - ); - - implementation_desc eltw_impl = { format::b_fs_yx_fsv16, "eltwise_b_fs_yx_fsv16" }; - bo_fused.set_option(build_option::force_implementations({ { "eltwise", eltw_impl } })); - - tolerance = 1e-5f; - execute(p); -} - -TEST_P(eltwise_fp32_fsv16, add_per_element) { - auto p = GetParam(); - create_topologies( - input_layout("input", get_input_layout(p)), - input_layout("input2", get_input_layout2(p)), - data("add_data", get_mem(get_input_layout(p), -10, 10)), - eltwise("eltwise", { "input", "input2" }, p.mode, p.default_type), - eltwise("add", { "eltwise", "add_data" }, eltwise_mode::sum), - activation("activation", "add", activation_func::negative), - reorder("out", "activation", p.default_format, data_types::f32) - ); - - implementation_desc eltw_impl = { format::b_fs_yx_fsv16, "eltwise_b_fs_yx_fsv16" }; - bo_fused.set_option(build_option::force_implementations({ { "eltwise", eltw_impl } })); - - tolerance = 1e-5f; - execute(p); -} - -INSTANTIATE_TEST_SUITE_P(fusings_gpu, eltwise_fp32_fsv16, ::testing::ValuesIn(std::vector{ - eltwise_test_params{ CASE_ELTWISE_FP16_3, 3, 5 }, - eltwise_test_params{ CASE_ELTWISE_FP32_3, 3, 5 }, -})); - -class eltwise_fp32_fsv32 : public EltwiseFusingTest {}; -TEST_P(eltwise_fp32_fsv32, add) { - auto p = GetParam(); - create_topologies( - input_layout("input", get_input_layout(p)), - input_layout("input2", get_input_layout2(p)), - data("add_data", get_mem(get_per_channel_layout(p), -10, 10)), - eltwise("eltwise", { "input", "input2" }, p.mode, p.default_type), - eltwise("add", { "eltwise", "add_data" }, eltwise_mode::sum), - activation("activation", "add", activation_func::negative), - reorder("out", "activation", p.default_format, data_types::f32) - ); - - implementation_desc eltw_impl = { format::fs_b_yx_fsv32, "eltwise_fs_b_yx_fsv32" }; - bo_fused.set_option(build_option::force_implementations({ { "eltwise", eltw_impl } })); - - tolerance = 1e-5f; - execute(p); -} - -TEST_P(eltwise_fp32_fsv32, 
add_per_element) { - auto p = GetParam(); - create_topologies( - input_layout("input", get_input_layout(p)), - input_layout("input2", get_input_layout2(p)), - data("add_data", get_mem(get_input_layout(p), -10, 10)), - eltwise("eltwise", { "input", "input2" }, p.mode, p.default_type), - eltwise("add", { "eltwise", "add_data" }, eltwise_mode::sum), - activation("activation", "add", activation_func::negative), - reorder("out", "activation", p.default_format, data_types::f32) - ); - - implementation_desc eltw_impl = { format::fs_b_yx_fsv32, "eltwise_fs_b_yx_fsv32" }; - bo_fused.set_option(build_option::force_implementations({ { "eltwise", eltw_impl } })); - - tolerance = 1e-5f; - execute(p); -} - -INSTANTIATE_TEST_SUITE_P(fusings_gpu, eltwise_fp32_fsv32, ::testing::ValuesIn(std::vector{ - // There's no optimized eltwise kernel yet for fsv32 layout that supports fused_ops - // So only activation is fused via legacy mechanism - eltwise_test_params{ CASE_ELTWISE_FP16_4, 4, 5 }, -})); - -class eltwise_fp32_fsv4 : public EltwiseFusingTest {}; -TEST_P(eltwise_fp32_fsv4, add) { - auto p = GetParam(); - create_topologies( - input_layout("input", get_input_layout(p)), - input_layout("input2", get_input_layout2(p)), - data("add_data", get_mem(get_per_channel_layout(p), -10, 10)), - eltwise("eltwise", { "input", "input2" }, p.mode, p.default_type), - eltwise("add", { "eltwise", "add_data" }, eltwise_mode::sum), - activation("activation", "add", activation_func::negative), - reorder("out", "activation", p.default_format, data_types::f32) - ); - - implementation_desc eltw_impl = { format::b_fs_yx_fsv4, "eltwise_b_fs_yx_fsv4" }; - bo_fused.set_option(build_option::force_implementations({ { "eltwise", eltw_impl } })); - - tolerance = 1e-5f; - execute(p); -} - -TEST_P(eltwise_fp32_fsv4, add_per_element) { - auto p = GetParam(); - create_topologies( - input_layout("input", get_input_layout(p)), - input_layout("input2", get_input_layout2(p)), - data("add_data", 
get_mem(get_input_layout(p), -10, 10)), - eltwise("eltwise", { "input", "input2" }, p.mode, p.default_type), - eltwise("add", { "eltwise", "add_data" }, eltwise_mode::sum), - activation("activation", "add", activation_func::negative), - reorder("out", "activation", p.default_format, data_types::f32) - ); - - implementation_desc eltw_impl = { format::b_fs_yx_fsv4, "eltwise_b_fs_yx_fsv4" }; - bo_fused.set_option(build_option::force_implementations({ { "eltwise", eltw_impl } })); - - tolerance = 1e-5f; - execute(p); -} - -INSTANTIATE_TEST_SUITE_P(fusings_gpu, eltwise_fp32_fsv4, ::testing::ValuesIn(std::vector{ - eltwise_test_params{ CASE_ELTWISE_FP32_5, 3, 5 }, - eltwise_test_params{ CASE_ELTWISE_FP32_6, 3, 5 }, - eltwise_test_params{ CASE_ELTWISE_I8_4, 3, 5 }, - eltwise_test_params{ CASE_ELTWISE_U8_4, 3, 5 }, -})); - -class eltwise_fp32_fused_prims : public EltwiseFusingTest {}; -TEST_P(eltwise_fp32_fused_prims, scale_activation) { - auto p = GetParam(); - create_topologies( - input_layout("input", get_input_layout(p)), - input_layout("input2", get_input_layout2(p)), - data("scale_data", get_mem(get_per_channel_layout(p), -10, 10)), - eltwise("eltwise", { "input", "input2" }, p.mode, p.default_type), - scale("scale", "eltwise", "scale_data"), - activation("activation", "scale", activation_func::abs), - reorder("out", "activation", p.default_format, data_types::f32) - ); - - tolerance = 1e-5f; - execute(p); -} - -TEST_P(eltwise_fp32_fused_prims, eltwise_activation) { - auto p = GetParam(); - create_topologies( - input_layout("input", get_input_layout(p)), - input_layout("input2", get_input_layout2(p)), - data("eltwise_data", get_mem(get_input_layout2(p), -10, 10)), - eltwise("eltwise1", { "input", "input2" }, p.mode, data_types::f32), - eltwise("eltwise2", { "eltwise1", "eltwise_data" }, eltwise_mode::prod, p.default_type), - activation("activation", "eltwise2", activation_func::abs), - reorder("out", "activation", p.default_format, data_types::f32) - ); - - tolerance 
= 1e-5f; - execute(p); -} - -TEST_P(eltwise_fp32_fused_prims, eltwise_activation_with_broadcast) { - auto p = GetParam(); - create_topologies( - input_layout("input", get_input_layout(p)), - input_layout("input2", get_input_layout2(p)), - data("eltwise_data", get_mem(get_per_channel_layout(p), -10, 10)), - eltwise("eltwise1", { "input", "input2" }, p.mode, p.default_type), - eltwise("eltwise2", { "eltwise1", "eltwise_data" }, eltwise_mode::prod, p.default_type), - activation("activation", "eltwise2", activation_func::abs), - reorder("out", "activation", p.default_format, data_types::f32) - ); - - tolerance = 1e-5f; - execute(p); -} - -INSTANTIATE_TEST_SUITE_P(fusings_gpu, eltwise_fp32_fused_prims, ::testing::ValuesIn(std::vector{ - eltwise_test_params{ CASE_ELTWISE_FP16_1, 3, 5 }, - eltwise_test_params{ CASE_ELTWISE_FP16_2, 3, 5 }, - eltwise_test_params{ CASE_ELTWISE_FP16_3, 3, 5 }, - eltwise_test_params{ CASE_ELTWISE_FP32_1, 3, 5 }, - eltwise_test_params{ CASE_ELTWISE_FP32_2, 3, 5 }, - eltwise_test_params{ CASE_ELTWISE_FP32_3, 3, 5 }, - eltwise_test_params{ CASE_ELTWISE_FP32_FP16_1, 3, 5 }, - eltwise_test_params{ CASE_ELTWISE_FP32_FP16_2, 3, 5 }, - eltwise_test_params{ CASE_ELTWISE_FP32_FP16_3, 3, 5 }, - eltwise_test_params{ CASE_ELTWISE_FP16_FP32_1, 3, 5 }, - eltwise_test_params{ CASE_ELTWISE_FP16_FP32_2, 3, 5 }, - eltwise_test_params{ CASE_ELTWISE_FP16_FP32_3, 3, 5 }, - eltwise_test_params{ CASE_ELTWISE_I8_FP32_1, 3, 5 }, - eltwise_test_params{ CASE_ELTWISE_I8_FP32_2, 3, 5 }, - eltwise_test_params{ CASE_ELTWISE_I8_FP32_3, 3, 5 }, - eltwise_test_params{ CASE_ELTWISE_U8_FP32_1, 3, 5 }, - eltwise_test_params{ CASE_ELTWISE_U8_FP32_2, 3, 5 }, - eltwise_test_params{ CASE_ELTWISE_U8_FP32_3, 3, 5 }, - eltwise_test_params{ CASE_ELTWISE_I8_FP16_1, 3, 5 }, - eltwise_test_params{ CASE_ELTWISE_I8_FP16_2, 3, 5 }, - eltwise_test_params{ CASE_ELTWISE_I8_FP16_3, 3, 5 }, - eltwise_test_params{ CASE_ELTWISE_U8_FP16_1, 3, 5 }, - eltwise_test_params{ CASE_ELTWISE_U8_FP16_2, 3, 5 }, 
- eltwise_test_params{ CASE_ELTWISE_U8_FP16_3, 3, 5 }, - // fsv4 - eltwise_test_params{ CASE_ELTWISE_FP32_5, 3, 5 }, - eltwise_test_params{ CASE_ELTWISE_FP32_6, 3, 5 }, - eltwise_test_params{ CASE_ELTWISE_I8_4, 3, 5 }, - eltwise_test_params{ CASE_ELTWISE_U8_4, 3, 5 }, -})); - -class eltwise_fp32_scale : public EltwiseFusingTest {}; -TEST_P(eltwise_fp32_scale, 6d) { - auto p = GetParam(); - create_topologies( - input_layout("input", get_input_layout(p)), - input_layout("input2", get_input_layout2(p)), - data("scale_data", get_mem(get_per_channel_layout(p), -10, 10)), - eltwise("eltwise", { "input", "input2" }, p.mode, p.default_type), - scale("scale", "eltwise", "scale_data"), - reorder("out", "scale", p.default_format, data_types::f32) - ); - - tolerance = 1e-5f; - execute(p); -} - -INSTANTIATE_TEST_SUITE_P(fusings_gpu, eltwise_fp32_scale, ::testing::ValuesIn(std::vector{ - eltwise_test_params{ CASE_ELTWISE_FP32_4, 3, 4 }, -})); - -class eltwise_fp16_byxf : public EltwiseFusingTest {}; -TEST_P(eltwise_fp16_byxf, add) { - auto p = GetParam(); - create_topologies( - input_layout("input", get_input_layout(p)), - input_layout("input2", get_input_layout2(p)), - data("add_data", get_mem(get_per_channel_layout(p), -10, 10)), - eltwise("eltwise", { "input", "input2" }, p.mode, p.default_type), - eltwise("add", { "eltwise", "add_data" }, eltwise_mode::sum), - activation("activation", "add", activation_func::negative), - reorder("out", "activation", p.default_format, data_types::f32) - ); - - implementation_desc eltw_impl = { format::byxf, "generic_eltwise_ref" }; - bo_fused.set_option(build_option::force_implementations({ { "eltwise", eltw_impl } })); - - tolerance = 1e-5f; - execute(p); -} - -INSTANTIATE_TEST_SUITE_P(fusings_gpu, eltwise_fp16_byxf, ::testing::ValuesIn(std::vector{ - eltwise_test_params{ CASE_ELTWISE_FP16_6, 3, 5 } -})); - -/* ----------------------------------------------------------------------------------------------------- */ -/* 
---------------------------------------- Scale cases ------------------------------------------------ */ -/* ----------------------------------------------------------------------------------------------------- */ - -struct scale_test_params { - tensor input_size; - data_types input_type; - format input_format; - data_types default_type; - format default_format; - size_t expected_fused_primitives; - size_t expected_not_fused_primitives; -}; - -// Scale uses the same kernel as eltwise primitive, so the kernel is well covered by the eltwise tests above -// So here we can just check that fused scale kernel is constructed correctly (inputs are set correctly, fused precision is propagated, etc) -// and fusing conditions in the graph are correct -#define CASE_SCALE_FP32_1 { 2, 16, 4, 4 }, data_types::f32, format::bfyx, data_types::f32, format::bfyx -#define CASE_SCALE_FP32_2 { 2, 16, 4, 4 }, data_types::f32, format::bfzyx, data_types::f32, format::bfzyx -#define CASE_SCALE_FP32_3 { 2, 16, 4, 4 }, data_types::f32, format::b_fs_yx_fsv16, data_types::f32, format::b_fs_yx_fsv16 - -class ScaleFusingTest : public ::BaseFusingTest { -public: - void execute(scale_test_params& p) { - auto input_prim = get_mem(get_input_layout(p)); - - network network_not_fused(this->engine, this->topology_non_fused, bo_not_fused); - network network_fused(this->engine, this->topology_fused, bo_fused); - - network_fused.set_input_data("input", input_prim); - network_not_fused.set_input_data("input", input_prim); - - compare(network_not_fused, network_fused, p); - } - - layout get_input_layout(scale_test_params& p) { - return layout{ p.input_type, p.input_format, p.input_size }; - } - - layout get_per_channel_layout(scale_test_params& p) { - return layout{ p.default_type, p.default_format, tensor{ 1, p.input_size.feature[0], 1, 1 } }; - } -}; - -class scale_basic : public ScaleFusingTest {}; -TEST_P(scale_basic, no_bias_act_eltwise) { - auto p = GetParam(); - create_topologies( - 
input_layout("input", get_input_layout(p)), - data("scale_data", get_mem(get_per_channel_layout(p), -10, 10)), - scale("scale", "input", "scale_data"), - activation("activation", "scale", activation_func::negative), - data("eltwise_data", get_mem(get_per_channel_layout(p), -10, 10)), - eltwise("eltwise", { "activation", "eltwise_data" }, eltwise_mode::prod, p.default_type), - reorder("out", "eltwise", p.default_format, data_types::f32) - ); - - tolerance = 1e-5f; - execute(p); -} - -TEST_P(scale_basic, bias_act_eltwise) { - auto p = GetParam(); - create_topologies( - input_layout("input", get_input_layout(p)), - data("scale_data", get_mem(get_per_channel_layout(p), -10, 10)), - data("bias_data", get_mem(get_per_channel_layout(p), -10, 10)), - scale("scale", "input", "scale_data", "bias_data"), - activation("activation", "scale", activation_func::negative), - data("eltwise_data", get_mem(get_per_channel_layout(p), -10, 10)), - eltwise("eltwise", { "activation", "eltwise_data" }, eltwise_mode::prod, p.default_type), - reorder("out", "eltwise", p.default_format, data_types::f32) - ); - - tolerance = 1e-5f; - execute(p); -} - -TEST_P(scale_basic, bias_act_scale) { - auto p = GetParam(); - create_topologies( - input_layout("input", get_input_layout(p)), - data("scale_data", get_mem(get_per_channel_layout(p), -10, 10)), - data("bias_data", get_mem(get_per_channel_layout(p), -10, 10)), - scale("scale", "input", "scale_data", "bias_data"), - activation("activation", "scale", activation_func::negative), - data("scale_data2", get_mem(get_per_channel_layout(p), -10, 10)), - scale("scale2", "activation", "scale_data2"), - reorder("out", "scale2", p.default_format, data_types::f32) - ); - - tolerance = 1e-5f; - execute(p); -} - -TEST_P(scale_basic, bias_act_quantize) { - auto p = GetParam(); - create_topologies( - input_layout("input", get_input_layout(p)), - data("scale_data", get_mem(get_per_channel_layout(p), -10, 10)), - data("bias_data", get_mem(get_per_channel_layout(p), 
-10, 10)), - scale("scale", "input", "scale_data", "bias_data"), - activation("activation", "scale", activation_func::negative), - data("in_lo", get_mem(get_per_channel_layout(p), min_random, 0)), - data("in_hi", get_mem(get_per_channel_layout(p), 1, max_random)), - data("out_lo", get_mem(get_single_element_layout(p), -128)), - data("out_hi", get_mem(get_single_element_layout(p), 127)), - quantize("quantize", "activation", "in_lo", "in_hi", "out_lo", "out_hi", 256, data_types::i8), - reorder("out", "quantize", p.default_format, data_types::f32) - ); - - tolerance = 1.f; - execute(p); -} - -INSTANTIATE_TEST_SUITE_P(fusings_gpu, scale_basic, ::testing::ValuesIn(std::vector{ - scale_test_params{ CASE_SCALE_FP32_1, 2, 4 }, - scale_test_params{ CASE_SCALE_FP32_2, 2, 4 }, - scale_test_params{ CASE_SCALE_FP32_3, 2, 4 }, -})); - -class eltwise_no_pitches_same_dims_quantize : public EltwiseFusingTest {}; -TEST_P(eltwise_no_pitches_same_dims_quantize, quantize_f32_output) { - auto p = GetParam(); - create_topologies( - input_layout("input", get_input_layout(p)), - input_layout("input2", get_input_layout2(p)), - eltwise("eltwise", { "input", "input2" }, p.mode, p.default_type), - data("in_lo", get_mem(get_per_channel_layout(p), min_random, 0)), - data("in_hi", get_mem(get_per_channel_layout(p), 1, max_random)), - data("out_lo", get_mem(get_single_element_layout(p), -128)), - data("out_hi", get_mem(get_single_element_layout(p), 127)), - quantize("quantize", "eltwise", "in_lo", "in_hi", "out_lo", "out_hi", 256, p.input_type), - reorder("out", "quantize", p.default_format, data_types::f32) - ); - - tolerance = 1.f; - execute(p); -} - -INSTANTIATE_TEST_SUITE_P(fusings_gpu, eltwise_no_pitches_same_dims_quantize, ::testing::ValuesIn(std::vector{ - eltwise_test_params{ CASE_ELTWISE_FP16_1, 3, 4 }, - eltwise_test_params{ CASE_ELTWISE_FP16_2, 3, 4 }, - eltwise_test_params{ CASE_ELTWISE_FP16_3, 3, 4 }, - eltwise_test_params{ CASE_ELTWISE_FP32_1, 3, 4 }, - eltwise_test_params{ 
CASE_ELTWISE_FP32_2, 3, 4 }, - eltwise_test_params{ CASE_ELTWISE_FP32_3, 3, 4 }, -})); - -class eltwise_activation : public EltwiseFusingTest {}; -TEST_P(eltwise_activation, basic) { - auto p = GetParam(); - create_topologies( - input_layout("input", get_input_layout(p)), - input_layout("input2", get_input_layout2(p)), - eltwise("eltwise", { "input", "input2" }, p.mode, p.default_type), - activation("activation", "eltwise", activation_func::relu, { 6.0f, 0.0f }), - reorder("out", "activation", p.default_format, data_types::f32) - ); - - tolerance = 1e-5f; - execute(p); -} - -TEST_P(eltwise_activation, fp16_out) { - auto p = GetParam(); - create_topologies( - input_layout("input", get_input_layout(p)), - input_layout("input2", get_input_layout2(p)), - eltwise("eltwise", { "input", "input2" }, p.mode, data_types::f16), - activation("activation", "eltwise", activation_func::relu, { 6.0f, 0.0f }), - reorder("out", "activation", p.default_format, data_types::f32) - ); - - tolerance = 1e-5f; - execute(p); -} - -INSTANTIATE_TEST_SUITE_P(fusings_gpu, eltwise_activation, ::testing::ValuesIn(std::vector{ - eltwise_test_params{ CASE_ELTWISE_FP16_1, 3, 4 }, - eltwise_test_params{ CASE_ELTWISE_FP16_2, 3, 4 }, - eltwise_test_params{ CASE_ELTWISE_FP16_3, 3, 4 }, - eltwise_test_params{ CASE_ELTWISE_FP32_1, 3, 4 }, - eltwise_test_params{ CASE_ELTWISE_FP32_2, 3, 4 }, - eltwise_test_params{ CASE_ELTWISE_FP32_3, 3, 4 }, - eltwise_test_params{ CASE_ELTWISE_FP32_FP16_1, 3, 4 }, - eltwise_test_params{ CASE_ELTWISE_FP32_FP16_2, 3, 4 }, - eltwise_test_params{ CASE_ELTWISE_FP32_FP16_3, 3, 4 }, - eltwise_test_params{ CASE_ELTWISE_FP16_FP32_1, 3, 4 }, - eltwise_test_params{ CASE_ELTWISE_FP16_FP32_2, 3, 4 }, - eltwise_test_params{ CASE_ELTWISE_FP16_FP32_3, 3, 4 } -})); - -/* ----------------------------------------------------------------------------------------------------- */ -/* ---------------------------------------- Reduce cases ----------------------------------------------- */ -/* 
----------------------------------------------------------------------------------------------------- */ - -struct reduce_test_params { - cldnn::tensor in_shape; - cldnn::tensor out_shape; - cldnn::data_types data_type; - cldnn::format input_format; - data_types default_type; - cldnn::format default_format; - size_t expected_fused_primitives; - size_t expected_not_fused_primitives; - cldnn::reduce_mode reduce_mode; - std::vector reduce_axes; - bool keep_dims; - std::string kernel_name; -}; - -#define CASE_REDUCE_F32_0 { 3, 7, 5, 7 }, { 3, 7, 5, 7 }, data_types::f32, format::b_fs_yx_fsv16, data_types::f32, format::bfyx -#define CASE_REDUCE_F32_1 { 3, 7, 5, 7 }, { 3, 7, 5, 7 }, data_types::f32, format::bfyx, data_types::f32, format::bfyx -#define CASE_REDUCE_F32_2 { 2, 4, 8, 4, 4 }, { 2, 4, 8, 4, 4 }, data_types::f32, format::bfzyx, data_types::f32, format::bfyx -#define CASE_REDUCE_F32_3 { 16, 16, 16, 8, 8, 8 }, { 16, 16, 16, 8, 8, 8 }, data_types::f32, format::bfwzyx, data_types::f32, format::bfyx -#define CASE_REDUCE_F32_4 { 2, 8, 4, 4 }, { 2, 8, 4, 4 }, data_types::f32, format::b_fs_yx_fsv16, data_types::f32, format::bfyx - -#define CASE_REDUCE_F16_0 { 3, 7, 5, 7 }, { 3, 7, 5, 7 }, data_types::f16, format::b_fs_yx_fsv16, data_types::f32, format::bfyx -#define CASE_REDUCE_F16_1 { 2, 8, 4, 4 }, { 2, 8, 4, 4 }, data_types::f16, format::bfyx, data_types::f32, format::bfyx -#define CASE_REDUCE_F16_2 { 2, 4, 8, 4, 4 }, { 2, 4, 8, 4, 4 }, data_types::f16, format::bfzyx, data_types::f32, format::bfyx -#define CASE_REDUCE_F16_3 { 3, 5, 3, 5, 7, 7 }, { 3, 5, 3, 5, 7, 7 }, data_types::f16, format::bfwzyx, data_types::f32, format::bfyx -#define CASE_REDUCE_F16_4 { 2, 8, 4, 4 }, { 2, 8, 4, 4 }, data_types::f16, format::b_fs_yx_fsv16, data_types::f32, format::bfyx - -#define CASE_REDUCE_I32_0 { 3, 7, 5, 7 }, { 3, 7, 5, 7 }, data_types::i32, format::b_fs_yx_fsv16, data_types::f32, format::bfyx -#define CASE_REDUCE_I32_1 { 2, 8, 4, 4 }, { 2, 8, 4, 4 }, data_types::i32, 
format::bfyx, data_types::f32, format::bfyx -#define CASE_REDUCE_I32_2 { 2, 4, 8, 4, 4 }, { 2, 4, 8, 4, 4 }, data_types::i32, format::bfzyx, data_types::f32, format::bfyx -#define CASE_REDUCE_I32_3 { 3, 5, 3, 5, 7, 7 }, { 3, 5, 3, 5, 7, 7 }, data_types::i32, format::bfwzyx, data_types::f32, format::bfyx -#define CASE_REDUCE_I32_4 { 3, 5, 3, 5, 7, 7 }, { 3, 5, 3, 5, 7, 7 }, data_types::i32, format::bfwzyx, data_types::f32, format::bfyx - -#define CASE_REDUCE_I8_0 { 3, 7, 5, 7 }, { 3, 7, 5, 7 }, data_types::i8, format::b_fs_yx_fsv16, data_types::f32, format::bfyx -#define CASE_REDUCE_I8_1 { 2, 8, 4, 4 }, { 2, 8, 4, 4 }, data_types::i8, format::bfyx, data_types::f32, format::bfyx -#define CASE_REDUCE_I8_2 { 2, 4, 8, 4, 4 }, { 2, 4, 8, 4, 4 }, data_types::i8, format::bfzyx, data_types::f32, format::bfyx -#define CASE_REDUCE_I8_3 { 3, 5, 3, 5, 7, 7 }, { 3, 5, 3, 5, 7, 7 }, data_types::i8, format::bfwzyx, data_types::f32, format::bfyx -#define CASE_REDUCE_I8_4 { 2, 8, 4, 4 }, { 2, 8, 4, 4 }, data_types::i8, format::b_fs_yx_fsv16, data_types::f32, format::bfyx - -#define CASE_REDUCE_U8_0 { 3, 7, 5, 7 }, { 3, 7, 5, 7 },data_types::u8, format::b_fs_yx_fsv16, data_types::f32, format::bfyx -#define CASE_REDUCE_U8_1 { 2, 8, 4, 4 }, { 2, 8, 4, 4 }, data_types::u8, format::bfyx, data_types::f32, format::bfyx -#define CASE_REDUCE_U8_2 { 2, 4, 8, 4, 4 }, { 2, 4, 8, 4, 4 }, data_types::u8, format::bfzyx, data_types::f32, format::bfyx -#define CASE_REDUCE_U8_3 { 3, 5, 3, 5, 7, 7 }, { 3, 5, 3, 5, 7, 7 }, data_types::u8, format::bfwzyx, data_types::f32, format::bfyx -#define CASE_REDUCE_U8_4 { 2, 8, 4, 4 }, { 2, 8, 4, 4 }, data_types::u8, format::b_fs_yx_fsv16, data_types::f32, format::bfyx - - -class ReduceFusingTest : public ::BaseFusingTest { -public: - void execute(reduce_test_params& p) { - auto input_prim = get_mem(get_input_layout(p)); - - network network_not_fused(this->engine, this->topology_non_fused, bo_not_fused); - network network_fused(this->engine, this->topology_fused, 
bo_fused); - - network_fused.set_input_data("input", input_prim); - network_not_fused.set_input_data("input", input_prim); - - compare(network_not_fused, network_fused, p); - } - - void update_out_shape(reduce_test_params& p) { - for (auto& axis : p.reduce_axes) { - switch (axis) { - case 0: // batch - p.out_shape.batch[0] = 1; - break; - case 1: // feature - p.out_shape.feature[0] = 1; - break; - case 2: // x - p.out_shape.spatial[0] = 1; - break; - case 3: // y - p.out_shape.spatial[1] = 1; - break; - case 4: // z - p.out_shape.spatial[2] = 1; - break; - case 5: // w - p.out_shape.spatial[3] = 1; - break; - } - } - } - - layout get_input_layout(reduce_test_params& p) { - return layout{ p.data_type, p.input_format, p.in_shape }; - } - - layout get_per_channel_layout(reduce_test_params& p) { - return layout{ p.default_type, p.default_format, tensor{ 1, p.in_shape.feature[0], 1, 1 } }; - } -}; - -class reduce_eltwise_activation_quantize : public ReduceFusingTest {}; -TEST_P(reduce_eltwise_activation_quantize, basic) { - auto p = GetParam(); - update_out_shape(p); - create_topologies( - input_layout("input", get_input_layout(p)), - data("in_lo", get_mem(get_single_element_layout(p), min_random, 0)), - data("in_hi", get_mem(get_single_element_layout(p), 1, max_random)), - data("out_lo", get_mem(get_single_element_layout(p), -128)), - data("out_hi", get_mem(get_single_element_layout(p), 127)), - data("eltwise_data", get_mem(get_output_layout(p))), - reduce("reduce", "input", p.reduce_mode, p.reduce_axes, p.keep_dims), - eltwise("eltwise", { "reduce", "eltwise_data" }, eltwise_mode::sum, p.default_type), - activation("activation", "eltwise", activation_func::relu), - quantize("quantize", "activation", "in_lo", "in_hi", "out_lo", "out_hi", 256, data_types::i8), - reorder("output_reorder", "quantize", p.default_format, data_types::f32) - ); - - tolerance = 1.f; - execute(p); -} - -TEST_P(reduce_eltwise_activation_quantize, per_channel) { - auto p = GetParam(); - 
update_out_shape(p); - create_topologies( - input_layout("input", get_input_layout(p)), - data("in_lo", get_mem(get_per_channel_layout(p), min_random, 0)), - data("in_hi", get_mem(get_per_channel_layout(p), 1, max_random)), - data("out_lo", get_mem(get_single_element_layout(p), -128)), - data("out_hi", get_mem(get_single_element_layout(p), 127)), - data("eltwise_data", get_mem(get_output_layout(p))), - reduce("reduce", "input", p.reduce_mode, p.reduce_axes, p.keep_dims), - eltwise("eltwise", { "reduce", "eltwise_data" }, eltwise_mode::sum, p.default_type), - activation("activation", "eltwise", activation_func::relu), - quantize("quantize", "activation", "in_lo", "in_hi", "out_lo", "out_hi", 256, data_types::i8), - reorder("output_reorder", "quantize", p.default_format, data_types::f32) - ); - - tolerance = 1.f; - execute(p); -} - -INSTANTIATE_TEST_SUITE_P(fusings_gpu, reduce_eltwise_activation_quantize, ::testing::ValuesIn(std::vector{ - reduce_test_params{ CASE_REDUCE_F32_0, 2, 5, reduce_mode::mean, { reduce::along_x, reduce::along_f, reduce::along_y, reduce::along_b }, true, "reduce_gpu_b_fs_yx_fsv16" }, - reduce_test_params{ CASE_REDUCE_F32_4, 2, 5, reduce_mode::sum, { reduce::along_x, reduce::along_f, reduce::along_b }, true, "reduce_gpu_b_fs_yx_fsv16" }, - reduce_test_params{ CASE_REDUCE_F32_0, 2, 5, reduce_mode::max, { reduce::along_y, reduce::along_f, reduce::along_b }, true, "reduce_gpu_b_fs_yx_fsv16" }, - reduce_test_params{ CASE_REDUCE_F32_4, 2, 5, reduce_mode::sum, { reduce::along_x, reduce::along_y, reduce::along_b }, true, "reduce_gpu_b_fs_yx_fsv16" }, - reduce_test_params{ CASE_REDUCE_F32_0, 2, 5, reduce_mode::min, { reduce::along_x, reduce::along_y, reduce::along_f }, true, "reduce_gpu_b_fs_yx_fsv16" }, - reduce_test_params{ CASE_REDUCE_F32_1, 2, 5, reduce_mode::sum, { reduce::along_f, reduce::along_b }, true, "reduce_ref" }, - reduce_test_params{ CASE_REDUCE_F32_2, 2, 5, reduce_mode::mean, { reduce::along_f, reduce::along_x }, true, "reduce_ref" }, 
- reduce_test_params{ CASE_REDUCE_F32_1, 2, 5, reduce_mode::max, { reduce::along_y, reduce::along_b }, true, "reduce_ref" }, - reduce_test_params{ CASE_REDUCE_F32_2, 2, 5, reduce_mode::sum, { reduce::along_x, reduce::along_b }, true, "reduce_ref" }, - reduce_test_params{ CASE_REDUCE_F32_4, 2, 5, reduce_mode::sum, { reduce::along_b }, true, "reduce_gpu_b_fs_yx_fsv16" }, - reduce_test_params{ CASE_REDUCE_F32_0, 2, 5, reduce_mode::max, { reduce::along_f }, true, "reduce_gpu_b_fs_yx_fsv16" }, - reduce_test_params{ CASE_REDUCE_F32_4, 2, 5, reduce_mode::sum, { reduce::along_y }, true, "reduce_gpu_b_fs_yx_fsv16" }, - reduce_test_params{ CASE_REDUCE_F32_0, 2, 5, reduce_mode::min, { reduce::along_x }, true, "reduce_gpu_b_fs_yx_fsv16" }, - reduce_test_params{ CASE_REDUCE_F32_1, 2, 5, reduce_mode::sum, { reduce::along_b }, true, "reduce_ref" }, - reduce_test_params{ CASE_REDUCE_F32_2, 2, 5, reduce_mode::max, { reduce::along_f }, true, "reduce_ref" }, - reduce_test_params{ CASE_REDUCE_F32_1, 2, 5, reduce_mode::mean, { reduce::along_x }, true, "reduce_ref" }, - - reduce_test_params{ CASE_REDUCE_F16_1, 2, 5, reduce_mode::mean, { reduce::along_x, reduce::along_f, reduce::along_y, reduce::along_b }, true, "reduce_ref" }, - reduce_test_params{ CASE_REDUCE_F16_2, 2, 5, reduce_mode::sum, { reduce::along_x, reduce::along_f, reduce::along_b }, true, "reduce_ref" }, - reduce_test_params{ CASE_REDUCE_F16_1, 2, 5, reduce_mode::max, { reduce::along_y, reduce::along_f, reduce::along_b }, true, "reduce_ref" }, - reduce_test_params{ CASE_REDUCE_F16_2, 2, 5, reduce_mode::sum, { reduce::along_x, reduce::along_y, reduce::along_b }, true, "reduce_ref" }, - reduce_test_params{ CASE_REDUCE_F16_1, 2, 5, reduce_mode::min, { reduce::along_x, reduce::along_y, reduce::along_f }, true, "reduce_ref" }, - reduce_test_params{ CASE_REDUCE_F16_0, 2, 5, reduce_mode::sum, { reduce::along_f, reduce::along_b }, true, "reduce_gpu_b_fs_yx_fsv16" }, - reduce_test_params{ CASE_REDUCE_F16_4, 2, 5, reduce_mode::mean, { 
reduce::along_f, reduce::along_x }, true, "reduce_gpu_b_fs_yx_fsv16" }, - reduce_test_params{ CASE_REDUCE_F16_0, 2, 5, reduce_mode::max, { reduce::along_y, reduce::along_b }, true, "reduce_gpu_b_fs_yx_fsv16" }, - reduce_test_params{ CASE_REDUCE_F16_4, 2, 5, reduce_mode::sum, { reduce::along_x, reduce::along_b }, true, "reduce_gpu_b_fs_yx_fsv16" }, - reduce_test_params{ CASE_REDUCE_F16_1, 2, 5, reduce_mode::sum, { reduce::along_b }, true, "reduce_ref" }, - reduce_test_params{ CASE_REDUCE_F16_2, 2, 5, reduce_mode::max, { reduce::along_f }, true, "reduce_ref" }, - reduce_test_params{ CASE_REDUCE_F16_1, 2, 5, reduce_mode::sum, { reduce::along_y }, true, "reduce_ref" }, - reduce_test_params{ CASE_REDUCE_F16_2, 2, 5, reduce_mode::min, { reduce::along_x }, true, "reduce_ref" }, - reduce_test_params{ CASE_REDUCE_F16_4, 2, 5, reduce_mode::sum, { reduce::along_b }, true, "reduce_gpu_b_fs_yx_fsv16" }, - reduce_test_params{ CASE_REDUCE_F16_0, 2, 5, reduce_mode::max, { reduce::along_f }, true, "reduce_gpu_b_fs_yx_fsv16" }, - reduce_test_params{ CASE_REDUCE_F16_4, 2, 5, reduce_mode::mean, { reduce::along_x }, true, "reduce_gpu_b_fs_yx_fsv16" }, - - reduce_test_params{ CASE_REDUCE_I8_0, 2, 5, reduce_mode::mean, { reduce::along_x, reduce::along_f, reduce::along_y, reduce::along_b }, true, "reduce_gpu_b_fs_yx_fsv16" }, - reduce_test_params{ CASE_REDUCE_I8_4, 2, 5, reduce_mode::sum, { reduce::along_x, reduce::along_f, reduce::along_b }, true, "reduce_gpu_b_fs_yx_fsv16" }, - reduce_test_params{ CASE_REDUCE_I8_0, 2, 5, reduce_mode::max, { reduce::along_y, reduce::along_f, reduce::along_b }, true, "reduce_gpu_b_fs_yx_fsv16" }, - reduce_test_params{ CASE_REDUCE_I8_4, 2, 5, reduce_mode::sum, { reduce::along_x, reduce::along_y, reduce::along_b }, true, "reduce_gpu_b_fs_yx_fsv16" }, - reduce_test_params{ CASE_REDUCE_I8_0, 2, 5, reduce_mode::min, { reduce::along_x, reduce::along_y, reduce::along_f }, true, "reduce_gpu_b_fs_yx_fsv16" }, - reduce_test_params{ CASE_REDUCE_I8_1, 2, 5, 
reduce_mode::sum, { reduce::along_f, reduce::along_b }, true, "reduce_ref" }, - reduce_test_params{ CASE_REDUCE_I8_2, 2, 5, reduce_mode::mean, { reduce::along_f, reduce::along_x }, true, "reduce_ref" }, - reduce_test_params{ CASE_REDUCE_I8_1, 2, 5, reduce_mode::max, { reduce::along_y, reduce::along_b }, true, "reduce_ref" }, - reduce_test_params{ CASE_REDUCE_I8_2, 2, 5, reduce_mode::sum, { reduce::along_x, reduce::along_b }, true, "reduce_ref" }, - reduce_test_params{ CASE_REDUCE_I8_4, 2, 5, reduce_mode::sum, { reduce::along_b }, true, "reduce_gpu_b_fs_yx_fsv16" }, - reduce_test_params{ CASE_REDUCE_I8_0, 2, 5, reduce_mode::max, { reduce::along_f }, true, "reduce_gpu_b_fs_yx_fsv16" }, - reduce_test_params{ CASE_REDUCE_I8_4, 2, 5, reduce_mode::sum, { reduce::along_y }, true, "reduce_gpu_b_fs_yx_fsv16" }, - reduce_test_params{ CASE_REDUCE_I8_0, 2, 5, reduce_mode::min, { reduce::along_x }, true, "reduce_gpu_b_fs_yx_fsv16" }, - reduce_test_params{ CASE_REDUCE_I8_1, 2, 5, reduce_mode::sum, { reduce::along_b }, true, "reduce_ref" }, - reduce_test_params{ CASE_REDUCE_I8_2, 2, 5, reduce_mode::max, { reduce::along_f }, true, "reduce_ref" }, - reduce_test_params{ CASE_REDUCE_I8_1, 2, 5, reduce_mode::mean, { reduce::along_x }, true, "reduce_ref" }, - - reduce_test_params{ CASE_REDUCE_U8_1, 2, 5, reduce_mode::mean, { reduce::along_x, reduce::along_f, reduce::along_y, reduce::along_b }, true, "reduce_ref" }, - reduce_test_params{ CASE_REDUCE_U8_2, 2, 5, reduce_mode::sum, { reduce::along_x, reduce::along_f, reduce::along_b }, true, "reduce_ref" }, - reduce_test_params{ CASE_REDUCE_U8_1, 2, 5, reduce_mode::max, { reduce::along_y, reduce::along_f, reduce::along_b }, true, "reduce_ref" }, - reduce_test_params{ CASE_REDUCE_U8_2, 2, 5, reduce_mode::sum, { reduce::along_x, reduce::along_y, reduce::along_b }, true, "reduce_ref" }, - reduce_test_params{ CASE_REDUCE_U8_1, 2, 5, reduce_mode::min, { reduce::along_x, reduce::along_y, reduce::along_f }, true, "reduce_ref" }, - 
reduce_test_params{ CASE_REDUCE_U8_0, 2, 5, reduce_mode::sum, { reduce::along_f, reduce::along_b }, true, "reduce_gpu_b_fs_yx_fsv16" }, - reduce_test_params{ CASE_REDUCE_U8_4, 2, 5, reduce_mode::mean, { reduce::along_f, reduce::along_x }, true, "reduce_gpu_b_fs_yx_fsv16" }, - reduce_test_params{ CASE_REDUCE_U8_0, 2, 5, reduce_mode::max, { reduce::along_y, reduce::along_b }, true, "reduce_gpu_b_fs_yx_fsv16" }, - reduce_test_params{ CASE_REDUCE_U8_4, 2, 5, reduce_mode::sum, { reduce::along_x, reduce::along_b }, true, "reduce_gpu_b_fs_yx_fsv16" }, - reduce_test_params{ CASE_REDUCE_U8_1, 2, 5, reduce_mode::sum, { reduce::along_b }, true, "reduce_ref" }, - reduce_test_params{ CASE_REDUCE_U8_2, 2, 5, reduce_mode::max, { reduce::along_f }, true, "reduce_ref" }, - reduce_test_params{ CASE_REDUCE_U8_1, 2, 5, reduce_mode::sum, { reduce::along_y }, true, "reduce_ref" }, - reduce_test_params{ CASE_REDUCE_U8_2, 2, 5, reduce_mode::min, { reduce::along_x }, true, "reduce_ref" }, - reduce_test_params{ CASE_REDUCE_U8_4, 2, 5, reduce_mode::sum, { reduce::along_b }, true, "reduce_gpu_b_fs_yx_fsv16" }, - reduce_test_params{ CASE_REDUCE_U8_0, 2, 5, reduce_mode::max, { reduce::along_f }, true, "reduce_gpu_b_fs_yx_fsv16" }, - reduce_test_params{ CASE_REDUCE_U8_4, 2, 5, reduce_mode::mean, { reduce::along_x }, true, "reduce_gpu_b_fs_yx_fsv16" } -})); - -class reduce_scale_activation : public ReduceFusingTest {}; -TEST_P(reduce_scale_activation, basic) { - auto p = GetParam(); - create_topologies( - input_layout("input", get_input_layout(p)), - data("scale_data", get_mem(get_single_element_layout(p), -0.125f)), - reduce("reduce", "input", p.reduce_mode, p.reduce_axes, p.keep_dims), - scale("scale", "reduce", "scale_data"), - activation("activation", "scale", activation_func::cos), - reorder("output_reorder", "activation", p.default_format, data_types::f32) - ); - - tolerance = 1e-02f; - execute(p); -} - -TEST_P(reduce_scale_activation, per_channel) { - auto p = GetParam(); - 
create_topologies( - input_layout("input", get_input_layout(p)), - data("scale_data", get_mem(get_per_channel_layout(p), -0.125f)), - reduce("reduce", "input", p.reduce_mode, p.reduce_axes, p.keep_dims), - scale("scale", "reduce", "scale_data"), - activation("activation", "scale", activation_func::cos), - reorder("output_reorder", "activation", p.default_format, data_types::f32) - ); - - tolerance = 1e-02f; - execute(p); -} - -INSTANTIATE_TEST_SUITE_P(fusings_gpu, reduce_scale_activation, ::testing::ValuesIn(std::vector{ - reduce_test_params{ CASE_REDUCE_F32_0, 2, 4, reduce_mode::max, { reduce::along_x, reduce::along_y, reduce::along_b }, true, "reduce_gpu_b_fs_yx_fsv16" }, - reduce_test_params{ CASE_REDUCE_F32_1, 2, 4, reduce_mode::sum, { reduce::along_x, reduce::along_y, reduce::along_b }, true, "reduce_ref" }, - reduce_test_params{ CASE_REDUCE_F32_0, 2, 4, reduce_mode::min, { reduce::along_x, reduce::along_y }, true, "reduce_gpu_b_fs_yx_fsv16" }, - reduce_test_params{ CASE_REDUCE_F32_2, 2, 4, reduce_mode::mean, { reduce::along_x, reduce::along_y }, true, "reduce_ref" }, - reduce_test_params{ CASE_REDUCE_F32_0, 2, 4, reduce_mode::l1, { reduce::along_x }, true, "reduce_gpu_b_fs_yx_fsv16" }, - reduce_test_params{ CASE_REDUCE_F32_0, 2, 4, reduce_mode::l1, { reduce::along_x }, true, "reduce_gpu_b_fs_yx_fsv16" }, - reduce_test_params{ CASE_REDUCE_F32_0, 2, 4, reduce_mode::min, { reduce::along_y }, true, "reduce_gpu_b_fs_yx_fsv16" }, - reduce_test_params{ CASE_REDUCE_F32_0, 2, 4, reduce_mode::sum, { reduce::along_y }, true, "reduce_gpu_b_fs_yx_fsv16" }, - - reduce_test_params{ CASE_REDUCE_F16_0, 2, 4, reduce_mode::max, { reduce::along_x, reduce::along_y, reduce::along_b }, true, "reduce_gpu_b_fs_yx_fsv16" }, - reduce_test_params{ CASE_REDUCE_F16_1, 2, 4, reduce_mode::sum, { reduce::along_x, reduce::along_y, reduce::along_b }, true, "reduce_ref" }, - reduce_test_params{ CASE_REDUCE_F16_0, 2, 4, reduce_mode::min, { reduce::along_x, reduce::along_y }, true, 
"reduce_gpu_b_fs_yx_fsv16" }, - reduce_test_params{ CASE_REDUCE_F16_2, 2, 4, reduce_mode::mean, { reduce::along_x, reduce::along_y }, true, "reduce_ref" }, - reduce_test_params{ CASE_REDUCE_F16_0, 2, 4, reduce_mode::min, { reduce::along_x }, true, "reduce_gpu_b_fs_yx_fsv16" }, - reduce_test_params{ CASE_REDUCE_F16_0, 2, 4, reduce_mode::sum, { reduce::along_x }, true, "reduce_gpu_b_fs_yx_fsv16" }, -})); - -INSTANTIATE_TEST_SUITE_P(DISABLED_fusings_gpu, reduce_eltwise_activation_quantize, ::testing::ValuesIn(std::vector{ - // No layout format available for quantize/scale - reduce_test_params{ CASE_REDUCE_F32_3, 2, 4, reduce_mode::l1, { reduce::along_x }, true, "reduce_ref" }, - reduce_test_params{ CASE_REDUCE_F16_3, 2, 4, reduce_mode::min, { reduce::along_x }, true, "reduce_ref" }, - reduce_test_params{ CASE_REDUCE_I32_2, 2, 4, reduce_mode::max, { reduce::along_x, reduce::along_y }, true, "reduce_ref" }, - reduce_test_params{ CASE_REDUCE_I32_3, 2, 4, reduce_mode::sum, { reduce::along_x }, true, "reduce_ref" }, - reduce_test_params{ CASE_REDUCE_I8_3, 2, 4, reduce_mode::mean, { reduce::along_x }, true, "reduce_ref" }, - reduce_test_params{ CASE_REDUCE_U8_3, 2, 4, reduce_mode::l2, { reduce::along_x }, true, "reduce_ref" } -})); - - -/* ----------------------------------------------------------------------------------------------- */ -/* ------------------------------------ ScatterNDUpdate cases ------------------------------------ */ -/* ----------------------------------------------------------------------------------------------- */ - -struct scatter_nd_update_test_params { - tensor input_shape; - tensor indices_shape; - tensor updates_shape; - int indices_rank; - data_types data_type; - format input_format; - data_types default_type; - format default_format; - size_t expected_fused_primitives; - size_t expected_not_fused_primitives; -}; - -#define CASE_SCATTER_ND_UPDATE_FP16_4D_1 { 6, 1, 1, 1 }, { 3, 1, 1, 1 }, { 3, 1, 1, 1 }, 1, data_types::f16, format::bfyx, 
data_types::f16, format::bfyx -#define CASE_SCATTER_ND_UPDATE_FP16_4D_2 { 6, 6, 1, 1 }, { 3, 2, 1, 1 }, { 3, 1, 1, 1 }, 2, data_types::f16, format::bfyx, data_types::f16, format::bfyx -#define CASE_SCATTER_ND_UPDATE_FP16_4D_3 { 6, 7, 8, 9 }, { 5, 1, 1, 1 }, { 5, 7, 8, 9 }, 2, data_types::f16, format::bfyx, data_types::f16, format::bfyx -#define CASE_SCATTER_ND_UPDATE_FP16_4D_4 { 6, 7, 8, 9 }, { 5, 1, 1, 1 }, { 5, 7, 8, 9 }, 2, data_types::f16, format::bfyx, data_types::f16, format::bfyx -#define CASE_SCATTER_ND_UPDATE_FP16_4D_5 { 6, 7, 8, 9 }, { 6, 2, 1, 1 }, { 6, 9, 1, 8 }, 2, data_types::f16, format::bfyx, data_types::f16, format::bfyx -#define CASE_SCATTER_ND_UPDATE_FP16_4D_6 { 6, 7, 8, 9 }, { 6, 3, 1, 1 }, { 6, 8, 1, 1 }, 2, data_types::f16, format::bfyx, data_types::f16, format::bfyx - -#define CASE_SCATTER_ND_UPDATE_FP16_5D_1 { 6, 7, 8, 9, 10 }, { 5, 1, 1, 1 }, { 5, 7, 8, 9, 10 }, 1, data_types::f16, format::bfzyx, data_types::f16, format::bfyx -#define CASE_SCATTER_ND_UPDATE_FP16_5D_2 { 6, 7, 8, 9, 10 }, { 5, 2, 1, 1 }, { 5, 10, 1, 8, 9 }, 2, data_types::f16, format::bfzyx, data_types::f16, format::bfyx -#define CASE_SCATTER_ND_UPDATE_FP16_5D_3 { 6, 7, 8, 9, 10 }, { 5, 3, 1, 1 }, { 5, 9, 1, 1, 8 }, 2, data_types::f16, format::bfzyx, data_types::f16, format::bfyx -#define CASE_SCATTER_ND_UPDATE_FP16_5D_4 { 6, 7, 8, 9, 10 }, { 5, 4, 1, 1 }, { 5, 8, 1, 1, 1 }, 2, data_types::f16, format::bfzyx, data_types::f16, format::bfyx -#define CASE_SCATTER_ND_UPDATE_FP16_5D_5 { 6, 7, 8, 9, 10 }, { 5, 5, 1, 1 }, { 5, 1, 1, 1, 1 }, 2, data_types::f16, format::bfzyx, data_types::f16, format::bfyx -#define CASE_SCATTER_ND_UPDATE_FP16_5D_6 { 6, 7, 8, 9, 10 }, { 5, 2, 1, 2 }, { 5, 2, 8, 9, 10 }, 3, data_types::f16, format::bfzyx, data_types::f16, format::bfyx -#define CASE_SCATTER_ND_UPDATE_FP16_5D_7 { 6, 7, 8, 9, 10 }, { 5, 2, 1, 3 }, { 5, 2, 1, 8, 9 }, 3, data_types::f16, format::bfzyx, data_types::f16, format::bfyx -#define CASE_SCATTER_ND_UPDATE_FP16_5D_8 { 6, 7, 8, 9, 10 
}, { 5, 2, 4, 3 }, { 5, 2, 1, 8, 3 }, 4, data_types::f16, format::bfzyx, data_types::f16, format::bfyx -#define CASE_SCATTER_ND_UPDATE_FP16_5D_9 { 6, 7, 8, 9, 10 }, { 5, 2, 3, 3 }, { 5, 2, 8, 9, 3 }, 4, data_types::f16, format::bfzyx, data_types::f16, format::bfyx - -#define CASE_SCATTER_ND_UPDATE_FP16_6D_1 { 6, 7, 8, 9, 10, 11 }, { 5, 1, 1, 1 }, { 5, 7, 8, 9, 10, 11 }, 1, data_types::f16, format::bfwzyx, data_types::f16, format::bfyx -#define CASE_SCATTER_ND_UPDATE_FP16_6D_2 { 6, 7, 8, 9, 10, 11 }, { 5, 2, 1, 1 }, { 5, 11, 1, 8, 9, 10 }, 2, data_types::f16, format::bfwzyx, data_types::f16, format::bfyx -#define CASE_SCATTER_ND_UPDATE_FP16_6D_3 { 6, 7, 8, 9, 10, 11 }, { 5, 3, 1, 1 }, { 5, 10, 1, 1, 8, 9 }, 2, data_types::f16, format::bfwzyx, data_types::f16, format::bfyx -#define CASE_SCATTER_ND_UPDATE_FP16_6D_4 { 6, 7, 8, 9, 10, 11 }, { 5, 4, 1, 1 }, { 5, 9, 1, 1, 1, 8 }, 2, data_types::f16, format::bfwzyx, data_types::f16, format::bfyx -#define CASE_SCATTER_ND_UPDATE_FP16_6D_5 { 6, 7, 8, 9, 2, 2 }, { 5, 5, 1, 1 }, { 5, 8, 1, 1, 1, 1 }, 2, data_types::f16, format::bfwzyx, data_types::f16, format::bfyx -#define CASE_SCATTER_ND_UPDATE_FP16_6D_6 { 6, 7, 8, 9, 2, 2 }, { 5, 6, 1, 1 }, { 5, 1, 1, 1, 1, 1 }, 2, data_types::f16, format::bfwzyx, data_types::f16, format::bfyx - -#define CASE_SCATTER_ND_UPDATE_FP32_4D_1 { 6, 1, 1, 1 }, { 3, 1, 1, 1 }, { 3, 1, 1, 1 }, 1, data_types::f32, format::bfyx, data_types::f32, format::bfyx -#define CASE_SCATTER_ND_UPDATE_FP32_4D_2 { 6, 6, 1, 1 }, { 3, 2, 1, 1 }, { 3, 1, 1, 1 }, 2, data_types::f32, format::bfyx, data_types::f32, format::bfyx -#define CASE_SCATTER_ND_UPDATE_FP32_4D_3 { 6, 7, 8, 1 }, { 5, 1, 1, 1 }, { 5, 7, 8, 1 }, 2, data_types::f32, format::bfyx, data_types::f32, format::bfyx -#define CASE_SCATTER_ND_UPDATE_FP32_4D_4 { 6, 7, 8, 9 }, { 5, 1, 1, 1 }, { 5, 7, 8, 9 }, 2, data_types::f32, format::bfyx, data_types::f32, format::bfyx -#define CASE_SCATTER_ND_UPDATE_FP32_4D_5 { 6, 7, 8, 9 }, { 6, 2, 1, 1 }, { 6, 9, 1, 8 }, 2, 
data_types::f32, format::bfyx, data_types::f32, format::bfyx -#define CASE_SCATTER_ND_UPDATE_FP32_4D_6 { 6, 7, 8, 9 }, { 6, 3, 1, 1 }, { 6, 8, 1, 1 }, 2, data_types::f32, format::bfyx, data_types::f32, format::bfyx - -#define CASE_SCATTER_ND_UPDATE_FP32_5D_1 { 6, 7, 8, 9, 10 }, { 5, 1, 1, 1 }, { 5, 7, 8, 9, 10 }, 1, data_types::f32, format::bfzyx, data_types::f32, format::bfyx -#define CASE_SCATTER_ND_UPDATE_FP32_5D_2 { 6, 7, 8, 9, 10 }, { 5, 2, 1, 1 }, { 5, 10, 1, 8, 9 }, 2, data_types::f32, format::bfzyx, data_types::f32, format::bfyx -#define CASE_SCATTER_ND_UPDATE_FP32_5D_3 { 6, 7, 8, 9, 10 }, { 5, 3, 1, 1 }, { 5, 9, 1, 1, 8 }, 2, data_types::f32, format::bfzyx, data_types::f32, format::bfyx -#define CASE_SCATTER_ND_UPDATE_FP32_5D_4 { 6, 7, 8, 9, 10 }, { 5, 4, 1, 1 }, { 5, 8, 1, 1, 1 }, 2, data_types::f32, format::bfzyx, data_types::f32, format::bfyx -#define CASE_SCATTER_ND_UPDATE_FP32_5D_5 { 6, 7, 8, 9, 10 }, { 5, 5, 1, 1 }, { 5, 1, 1, 1, 1 }, 2, data_types::f32, format::bfzyx, data_types::f32, format::bfyx - -#define CASE_SCATTER_ND_UPDATE_FP32_6D_1 { 6, 7, 8, 9, 10, 11 }, { 5, 1, 1, 1 }, { 5, 7, 8, 9, 10, 11 }, 1, data_types::f32, format::bfwzyx, data_types::f32, format::bfyx -#define CASE_SCATTER_ND_UPDATE_FP32_6D_2 { 6, 7, 8, 9, 10, 11 }, { 5, 2, 1, 1 }, { 5, 11, 1, 8, 9, 10 }, 2, data_types::f32, format::bfwzyx, data_types::f32, format::bfyx -#define CASE_SCATTER_ND_UPDATE_FP32_6D_3 { 6, 7, 8, 9, 10, 11 }, { 5, 3, 1, 1 }, { 5, 10, 1, 1, 8, 9 }, 2, data_types::f32, format::bfwzyx, data_types::f32, format::bfyx -#define CASE_SCATTER_ND_UPDATE_FP32_6D_4 { 6, 7, 8, 9, 10, 11 }, { 5, 4, 1, 1 }, { 5, 9, 1, 1, 1, 8 }, 2, data_types::f32, format::bfwzyx, data_types::f32, format::bfyx -#define CASE_SCATTER_ND_UPDATE_FP32_6D_5 { 6, 7, 8, 9, 2, 2 }, { 5, 5, 1, 1 }, { 5, 8, 1, 1, 1, 1 }, 2, data_types::f32, format::bfwzyx, data_types::f32, format::bfyx -#define CASE_SCATTER_ND_UPDATE_FP32_6D_6 { 6, 7, 8, 9, 2, 2 }, { 5, 6, 1, 1 }, { 5, 1, 1, 1, 1, 1 }, 2, 
data_types::f32, format::bfwzyx, data_types::f32, format::bfyx - -class ScatterNDUpdatePrimitiveFusingTest : public ::BaseFusingTest { -public: - void execute(scatter_nd_update_test_params& p) { - auto input_prim = get_mem(get_input_layout(p)); - network network_not_fused(this->engine, this->topology_non_fused, bo_not_fused); - network network_fused(this->engine, this->topology_fused, bo_fused); - network_fused.set_input_data("input", input_prim); - network_not_fused.set_input_data("input", input_prim); - compare(network_not_fused, network_fused, p); - } - - layout get_input_layout(scatter_nd_update_test_params& p) { - return layout{ p.data_type, p.input_format, p.input_shape }; - } - - layout get_indices_layout(scatter_nd_update_test_params& p) { - return layout{ p.data_type, get_default_format(p.indices_rank), p.indices_shape }; - } - - layout get_updates_layout(scatter_nd_update_test_params& p) { - return layout{ p.data_type, p.input_format, p.updates_shape }; - } - - layout get_per_channel_layout(scatter_nd_update_test_params& p) { - return layout{ p.default_type, p.default_format, tensor{ 1, p.input_shape.feature[0], 1, 1 } }; - } - - format get_default_format(int rank = 4) { - if (rank <= 4) - return cldnn::format::bfyx; - else if (rank == 5) - return cldnn::format::bfzyx; - else - return cldnn::format::bfwzyx; - } - - template - T generate_random_val(int min, int max, int k = 8) { - static std::default_random_engine generator(random_seed); - // 1/k is the resolution of the floating point numbers - std::uniform_int_distribution distribution(k * min, k * max); - T val = (T)distribution(generator); - val /= k; - - return val; - } - - template - std::vector generate_unique_indices(scatter_nd_update_test_params& p) { - std::set> unique_indices; - std::vector result; - auto indices_shape = p.indices_shape.sizes(get_default_format(p.indices_rank)); - auto last_indices_dim = indices_shape.back(); - - auto count = 1; - for (size_t i = 0; i < indices_shape.size() - 1; 
i++) - count *= indices_shape[i]; - - while (unique_indices.size() != count) { - std::vector indices; - for (size_t i = 0; i < last_indices_dim; i++) - indices.push_back(generate_random_val(0, indices_shape[i])); - - unique_indices.insert(indices); - } - - std::for_each(unique_indices.begin(), - unique_indices.end(), - [&](const std::vector& indices) { - result.insert(result.end(), indices.begin(), indices.end()); - }); - - return result; - } - - cldnn::memory::ptr get_indices_mem(scatter_nd_update_test_params& p) { - auto indices_layout = get_indices_layout(p); - auto prim = engine.allocate_memory(indices_layout); - if (indices_layout.data_type == data_types::f32) { - VF rnd_vec = generate_unique_indices(p); - set_values(prim, rnd_vec); - } else if (indices_layout.data_type == data_types::f16) { - VF rnd_vec = generate_unique_indices(p); - set_values(prim, rnd_vec); - } else if (indices_layout.data_type == data_types::i8) { - VF rnd_vec = generate_unique_indices(p); - set_values(prim, rnd_vec); - } else { - throw std::runtime_error("Unsupported data type for indicies of scatter_nd_update primitive"); - } - - return prim; - } -}; - -class scatter_nd_update_quantize : public ScatterNDUpdatePrimitiveFusingTest {}; -TEST_P(scatter_nd_update_quantize, basic) { - auto p = GetParam(); - create_topologies( - input_layout("input", get_input_layout(p)), - data("scatter_nd_update_indices", get_indices_mem(p)), - data("scatter_nd_update_updates", get_mem(get_updates_layout(p), 0, 100)), - data("in_lo", get_mem(get_per_channel_layout(p), min_random, 0)), - data("in_hi", get_mem(get_per_channel_layout(p), 1, max_random)), - data("out_lo", get_mem(get_single_element_layout(p), -127)), - data("out_hi", get_mem(get_single_element_layout(p), 127)), - scatter_nd_update("scatter_nd_update_prim", "input", "scatter_nd_update_indices", "scatter_nd_update_updates", p.indices_rank), - quantize("quantize", "scatter_nd_update_prim", "in_lo", "in_hi", "out_lo", "out_hi", 255, 
data_types::i8), - reorder("reorder_bfyx", "quantize", p.input_format, data_types::f32) - ); - - tolerance = 1.f; - execute(p); -} - -INSTANTIATE_TEST_SUITE_P(fusings_gpu, scatter_nd_update_quantize, ::testing::ValuesIn(std::vector{ - scatter_nd_update_test_params{ CASE_SCATTER_ND_UPDATE_FP16_4D_1, 2, 3 }, - scatter_nd_update_test_params{ CASE_SCATTER_ND_UPDATE_FP16_4D_2, 2, 3 }, - scatter_nd_update_test_params{ CASE_SCATTER_ND_UPDATE_FP16_4D_3, 2, 3 }, - scatter_nd_update_test_params{ CASE_SCATTER_ND_UPDATE_FP16_4D_4, 2, 3 }, - scatter_nd_update_test_params{ CASE_SCATTER_ND_UPDATE_FP16_4D_5, 2, 3 }, - scatter_nd_update_test_params{ CASE_SCATTER_ND_UPDATE_FP16_4D_6, 2, 3 }, - - scatter_nd_update_test_params{ CASE_SCATTER_ND_UPDATE_FP16_5D_1, 2, 3 }, - scatter_nd_update_test_params{ CASE_SCATTER_ND_UPDATE_FP16_5D_2, 2, 3 }, - scatter_nd_update_test_params{ CASE_SCATTER_ND_UPDATE_FP16_5D_3, 2, 3 }, - scatter_nd_update_test_params{ CASE_SCATTER_ND_UPDATE_FP16_5D_4, 2, 3 }, - scatter_nd_update_test_params{ CASE_SCATTER_ND_UPDATE_FP16_5D_5, 2, 3 }, - scatter_nd_update_test_params{ CASE_SCATTER_ND_UPDATE_FP16_5D_7, 2, 3 }, - scatter_nd_update_test_params{ CASE_SCATTER_ND_UPDATE_FP16_5D_9, 2, 3 }, - - scatter_nd_update_test_params{ CASE_SCATTER_ND_UPDATE_FP16_6D_1, 2, 3 }, - scatter_nd_update_test_params{ CASE_SCATTER_ND_UPDATE_FP16_6D_2, 2, 3 }, - scatter_nd_update_test_params{ CASE_SCATTER_ND_UPDATE_FP16_6D_3, 2, 3 }, - scatter_nd_update_test_params{ CASE_SCATTER_ND_UPDATE_FP16_6D_4, 2, 3 }, - scatter_nd_update_test_params{ CASE_SCATTER_ND_UPDATE_FP16_6D_5, 2, 3 }, - scatter_nd_update_test_params{ CASE_SCATTER_ND_UPDATE_FP16_6D_6, 2, 3 }, - - scatter_nd_update_test_params{ CASE_SCATTER_ND_UPDATE_FP32_4D_1, 2, 3 }, - scatter_nd_update_test_params{ CASE_SCATTER_ND_UPDATE_FP32_4D_2, 2, 3 }, - scatter_nd_update_test_params{ CASE_SCATTER_ND_UPDATE_FP32_4D_3, 2, 3 }, - scatter_nd_update_test_params{ CASE_SCATTER_ND_UPDATE_FP32_4D_4, 2, 3 }, - scatter_nd_update_test_params{ 
CASE_SCATTER_ND_UPDATE_FP32_4D_5, 2, 3 }, - scatter_nd_update_test_params{ CASE_SCATTER_ND_UPDATE_FP32_4D_6, 2, 3 }, - - scatter_nd_update_test_params{ CASE_SCATTER_ND_UPDATE_FP32_5D_1, 2, 3 }, - scatter_nd_update_test_params{ CASE_SCATTER_ND_UPDATE_FP32_5D_2, 2, 3 }, - scatter_nd_update_test_params{ CASE_SCATTER_ND_UPDATE_FP32_5D_3, 2, 3 }, - scatter_nd_update_test_params{ CASE_SCATTER_ND_UPDATE_FP32_5D_4, 2, 3 }, - scatter_nd_update_test_params{ CASE_SCATTER_ND_UPDATE_FP32_5D_5, 2, 3 }, - - scatter_nd_update_test_params{ CASE_SCATTER_ND_UPDATE_FP32_6D_1, 2, 3 }, - scatter_nd_update_test_params{ CASE_SCATTER_ND_UPDATE_FP32_6D_2, 2, 3 }, - scatter_nd_update_test_params{ CASE_SCATTER_ND_UPDATE_FP32_6D_3, 2, 3 }, - scatter_nd_update_test_params{ CASE_SCATTER_ND_UPDATE_FP32_6D_4, 2, 3 }, - scatter_nd_update_test_params{ CASE_SCATTER_ND_UPDATE_FP32_6D_5, 2, 3 }, - scatter_nd_update_test_params{ CASE_SCATTER_ND_UPDATE_FP32_6D_6, 2, 3 }, -})); - -class scatter_nd_update_scale_activation_eltwise : public ScatterNDUpdatePrimitiveFusingTest {}; -TEST_P(scatter_nd_update_scale_activation_eltwise, basic) { - auto p = GetParam(); - create_topologies( - input_layout("input", get_input_layout(p)), - data("scatter_nd_update_indices", get_indices_mem(p)), - data("scatter_nd_update_updates", get_mem(get_updates_layout(p), 0, 100)), - data("scale_data", get_mem(get_per_channel_layout(p), -1, 1)), - data("eltwise_data", get_mem(layout{ p.data_type, p.input_format, p.input_shape })), - scatter_nd_update("scatter_nd_update_prim", "input", "scatter_nd_update_indices", "scatter_nd_update_updates", p.indices_rank), - activation("activation", "scatter_nd_update_prim", activation_func::abs), - scale("scale", "activation", "scale_data"), - eltwise("eltwise", { "scale", "eltwise_data" }, eltwise_mode::sum, p.data_type), - reorder("reorder_bfyx", "eltwise", p.input_format, data_types::f32) - ); - - tolerance = 1.f; - execute(p); -} - -INSTANTIATE_TEST_SUITE_P(fusings_gpu, 
scatter_nd_update_scale_activation_eltwise, ::testing::ValuesIn(std::vector{ - scatter_nd_update_test_params{ CASE_SCATTER_ND_UPDATE_FP16_4D_1, 2, 5 }, - scatter_nd_update_test_params{ CASE_SCATTER_ND_UPDATE_FP16_4D_2, 2, 5 }, - scatter_nd_update_test_params{ CASE_SCATTER_ND_UPDATE_FP16_4D_3, 2, 5 }, - scatter_nd_update_test_params{ CASE_SCATTER_ND_UPDATE_FP16_4D_4, 2, 5 }, - scatter_nd_update_test_params{ CASE_SCATTER_ND_UPDATE_FP16_4D_5, 2, 5 }, - scatter_nd_update_test_params{ CASE_SCATTER_ND_UPDATE_FP16_4D_6, 2, 5 }, - - scatter_nd_update_test_params{ CASE_SCATTER_ND_UPDATE_FP16_5D_1, 2, 5 }, - scatter_nd_update_test_params{ CASE_SCATTER_ND_UPDATE_FP16_5D_2, 2, 5 }, - scatter_nd_update_test_params{ CASE_SCATTER_ND_UPDATE_FP16_5D_3, 2, 5 }, - scatter_nd_update_test_params{ CASE_SCATTER_ND_UPDATE_FP16_5D_4, 2, 5 }, - scatter_nd_update_test_params{ CASE_SCATTER_ND_UPDATE_FP16_5D_5, 2, 5 }, - scatter_nd_update_test_params{ CASE_SCATTER_ND_UPDATE_FP16_5D_6, 2, 5 }, - - scatter_nd_update_test_params{ CASE_SCATTER_ND_UPDATE_FP16_6D_1, 2, 5 }, - scatter_nd_update_test_params{ CASE_SCATTER_ND_UPDATE_FP16_6D_2, 2, 5 }, - scatter_nd_update_test_params{ CASE_SCATTER_ND_UPDATE_FP16_6D_3, 2, 5 }, - scatter_nd_update_test_params{ CASE_SCATTER_ND_UPDATE_FP16_6D_4, 2, 5 }, - scatter_nd_update_test_params{ CASE_SCATTER_ND_UPDATE_FP16_6D_5, 2, 5 }, - scatter_nd_update_test_params{ CASE_SCATTER_ND_UPDATE_FP16_6D_6, 2, 5 }, - - scatter_nd_update_test_params{ CASE_SCATTER_ND_UPDATE_FP32_4D_1, 2, 5 }, - scatter_nd_update_test_params{ CASE_SCATTER_ND_UPDATE_FP32_4D_2, 2, 5 }, - scatter_nd_update_test_params{ CASE_SCATTER_ND_UPDATE_FP32_4D_3, 2, 5 }, - scatter_nd_update_test_params{ CASE_SCATTER_ND_UPDATE_FP32_4D_4, 2, 5 }, - scatter_nd_update_test_params{ CASE_SCATTER_ND_UPDATE_FP32_4D_5, 2, 5 }, - scatter_nd_update_test_params{ CASE_SCATTER_ND_UPDATE_FP32_4D_6, 2, 5 }, - - scatter_nd_update_test_params{ CASE_SCATTER_ND_UPDATE_FP32_5D_1, 2, 5 }, - scatter_nd_update_test_params{ 
CASE_SCATTER_ND_UPDATE_FP32_5D_2, 2, 5 }, - scatter_nd_update_test_params{ CASE_SCATTER_ND_UPDATE_FP32_5D_3, 2, 5 }, - scatter_nd_update_test_params{ CASE_SCATTER_ND_UPDATE_FP32_5D_4, 2, 5 }, - scatter_nd_update_test_params{ CASE_SCATTER_ND_UPDATE_FP32_5D_5, 2, 5 }, - - scatter_nd_update_test_params{ CASE_SCATTER_ND_UPDATE_FP32_6D_1, 2, 5 }, - scatter_nd_update_test_params{ CASE_SCATTER_ND_UPDATE_FP32_6D_2, 2, 5 }, - scatter_nd_update_test_params{ CASE_SCATTER_ND_UPDATE_FP32_6D_3, 2, 5 }, - scatter_nd_update_test_params{ CASE_SCATTER_ND_UPDATE_FP32_6D_4, 2, 5 }, - scatter_nd_update_test_params{ CASE_SCATTER_ND_UPDATE_FP32_6D_5, 2, 5 }, - scatter_nd_update_test_params{ CASE_SCATTER_ND_UPDATE_FP32_6D_6, 2, 5 }, -})); - - -/* ----------------------------------------------------------------------------------------------------- */ -/* ------------------------------------------ GatherND cases ------------------------------------------- */ -/* ----------------------------------------------------------------------------------------------------- */ - -struct gather_nd_test_params { - data_types data_type; - - format input_format; - tensor input_shape; - - format indices_format; - tensor indices_shape; - - format output_format; - tensor output_shape; - - int max_number_in_indices; - int indices_rank; - int batch_dims; - - data_types default_type; - format default_format; - - size_t expected_fused_primitives; - size_t expected_not_fused_primitives; -}; - -#define CASE_GATHER_ND_FP16_4D_1 data_types::f16, format::bfyx, { 6, 7, 9, 8 }, format::bfyx, { 3, 1, 1, 1 }, format::bfyx, { 3, 7, 9, 8 }, 6, 2, 0, data_types::f16, format::bfyx -#define CASE_GATHER_ND_FP16_4D_2 data_types::f16, format::bfyx, { 6, 7, 9, 8 }, format::bfyx, { 6, 1, 1, 1 }, format::bfyx, { 6, 8, 1, 9 }, 6, 2, 1, data_types::f16, format::bfyx -#define CASE_GATHER_ND_FP16_4D_3 data_types::f16, format::bfyx, { 5, 4, 7, 2 }, format::bfyx, { 5, 4, 1, 2 }, format::bfyx, { 40, 1, 1, 1 }, 6, 4, 3, data_types::f16, 
format::bfyx - -#define CASE_GATHER_ND_FP16_5D_1 data_types::f16, format::bfzyx, { 5, 6, 7, 8, 5 }, format::bfyx, { 5, 1, 1, 1 }, format::bfzyx, { 5, 6, 7, 8, 5 }, 5, 2, 0, data_types::f16, format::bfyx -#define CASE_GATHER_ND_FP16_5D_2 data_types::f16, format::bfzyx, { 5, 6, 7, 8, 5 }, format::bfyx, { 5, 1, 1, 1 }, format::bfyx, { 5, 5, 7, 8 }, 5, 2, 1, data_types::f16, format::bfyx -#define CASE_GATHER_ND_FP16_5D_3 data_types::f16, format::bfzyx, { 5, 4, 7, 8, 5 }, format::bfyx, { 5, 4, 1, 3 }, format::bfyx, { 20, 1, 1, 1 }, 4, 3, 2, data_types::f16, format::bfyx -#define CASE_GATHER_ND_FP16_5D_4 data_types::f16, format::bfzyx, { 5, 4, 7, 8, 3 }, format::bfyx, { 5, 4, 1, 3 }, format::bfyx, { 60, 7, 1, 1 }, 4, 4, 3, data_types::f16, format::bfyx -#define CASE_GATHER_ND_FP16_5D_5 data_types::f16, format::bfzyx, { 5, 4, 7, 2, 3 }, format::bfzyx, { 5, 4, 1, 2, 3 }, format::bfyx, { 120, 1, 1, 1 }, 4, 5, 4, data_types::f16, format::bfyx -#define CASE_GATHER_ND_FP16_5D_6 data_types::f16, format::bfzyx, { 5, 4, 7, 4, 4 }, format::bfzyx, { 5, 4, 1, 1, 3 }, format::bfzyx, { 20, 3, 7, 4, 1 }, 4, 5, 2, data_types::f16, format::bfyx - -#define CASE_GATHER_ND_FP16_6D_1 data_types::f16, format::bfwzyx, { 5, 4, 6, 7, 8, 5 }, format::bfyx, { 5, 4, 2, 2 }, format::bfyx, { 20, 2, 6, 7 }, 5, 4, 2, data_types::f16, format::bfyx -#define CASE_GATHER_ND_FP16_6D_2 data_types::f16, format::bfwzyx, { 5, 4, 6, 7, 8, 2 }, format::bfyx, { 5, 4, 2, 2 }, format::bfyx, { 40, 6, 1, 1 }, 5, 4, 3, data_types::f16, format::bfyx -#define CASE_GATHER_ND_FP16_6D_3 data_types::f16, format::bfwzyx, { 5, 4, 6, 7, 2, 2 }, format::bfzyx, { 5, 4, 1, 2, 2 }, format::bfyx, { 80, 6, 1, 1 }, 5, 5, 4, data_types::f16, format::bfyx -#define CASE_GATHER_ND_FP16_6D_4 data_types::f16, format::bfwzyx, { 5, 4, 6, 3, 2, 2 }, format::bfwzyx, { 5, 4, 1, 3, 2, 2 }, format::bfyx, { 240, 1, 1, 1 }, 5, 6, 5, data_types::f16, format::bfyx - -#define CASE_GATHER_ND_FP32_4D_1 data_types::f32, format::bfyx, { 6, 7, 9, 8 }, 
format::bfyx, { 3, 1, 1, 1 }, format::bfyx, { 3, 7, 9, 8 }, 6, 2, 0, data_types::f32, format::bfyx -#define CASE_GATHER_ND_FP32_4D_2 data_types::f32, format::bfyx, { 6, 7, 9, 8 }, format::bfyx, { 6, 1, 1, 1 }, format::bfyx, { 6, 8, 1, 9 }, 6, 2, 1, data_types::f32, format::bfyx -#define CASE_GATHER_ND_FP32_4D_3 data_types::f32, format::bfyx, { 5, 4, 7, 2 }, format::bfyx, { 5, 4, 1, 2 }, format::bfyx, { 40, 1, 1, 1 }, 6, 4, 3, data_types::f32, format::bfyx - -#define CASE_GATHER_ND_FP32_5D_1 data_types::f32, format::bfzyx, { 5, 6, 7, 8, 5 }, format::bfyx, { 5, 1, 1, 1 }, format::bfzyx, { 5, 6, 7, 8, 5 }, 5, 2, 0, data_types::f32, format::bfyx -#define CASE_GATHER_ND_FP32_5D_2 data_types::f32, format::bfzyx, { 5, 6, 7, 8, 5 }, format::bfyx, { 5, 1, 1, 1 }, format::bfyx, { 5, 5, 7, 8 }, 5, 2, 1, data_types::f32, format::bfyx -#define CASE_GATHER_ND_FP32_5D_3 data_types::f32, format::bfzyx, { 5, 4, 7, 8, 5 }, format::bfyx, { 5, 4, 1, 3 }, format::bfyx, { 20, 1, 1, 1 }, 4, 3, 2, data_types::f32, format::bfyx -#define CASE_GATHER_ND_FP32_5D_4 data_types::f32, format::bfzyx, { 5, 4, 7, 8, 3 }, format::bfyx, { 5, 4, 1, 3 }, format::bfyx, { 60, 7, 1, 1 }, 4, 4, 3, data_types::f32, format::bfyx -#define CASE_GATHER_ND_FP32_5D_5 data_types::f32, format::bfzyx, { 5, 4, 7, 2, 3 }, format::bfzyx, { 5, 4, 1, 2, 3 }, format::bfyx, { 120, 1, 1, 1 }, 4, 5, 4, data_types::f32, format::bfyx -#define CASE_GATHER_ND_FP32_5D_6 data_types::f32, format::bfzyx, { 5, 4, 7, 4, 4 }, format::bfzyx, { 5, 4, 1, 1, 3 }, format::bfzyx, { 20, 3, 7, 4, 1 }, 4, 5, 2, data_types::f32, format::bfyx - -#define CASE_GATHER_ND_FP32_6D_1 data_types::f32, format::bfwzyx, { 5, 4, 6, 7, 8, 5 }, format::bfyx, { 5, 4, 2, 2 }, format::bfyx, { 20, 2, 6, 7 }, 5, 4, 2, data_types::f32, format::bfyx -#define CASE_GATHER_ND_FP32_6D_2 data_types::f32, format::bfwzyx, { 5, 4, 6, 7, 8, 2 }, format::bfyx, { 5, 4, 2, 2 }, format::bfyx, { 40, 6, 1, 1 }, 5, 4, 3, data_types::f32, format::bfyx -#define 
CASE_GATHER_ND_FP32_6D_3 data_types::f32, format::bfwzyx, { 5, 4, 6, 7, 2, 2 }, format::bfzyx, { 5, 4, 1, 2, 2 }, format::bfyx, { 80, 6, 1, 1 }, 5, 5, 4, data_types::f32, format::bfyx -#define CASE_GATHER_ND_FP32_6D_4 data_types::f32, format::bfwzyx, { 5, 4, 6, 3, 2, 2 }, format::bfwzyx, { 5, 4, 1, 3, 2, 2 }, format::bfyx, { 240, 1, 1, 1 }, 5, 6, 5, data_types::f32, format::bfyx - - - -class GatherNDPrimitiveFusingTest : public ::BaseFusingTest { -public: - void execute(gather_nd_test_params& p) { - auto input_prim = get_mem(get_input_layout(p)); - network network_not_fused(this->engine, this->topology_non_fused, bo_not_fused); - network network_fused(this->engine, this->topology_fused, bo_fused); - network_fused.set_input_data("input", input_prim); - network_not_fused.set_input_data("input", input_prim); - compare(network_not_fused, network_fused, p); - } - - layout get_input_layout(gather_nd_test_params& p) { - return layout{ p.data_type, p.input_format, p.input_shape }; - } - - layout get_indices_layout(gather_nd_test_params& p) { - return layout{ p.data_type, p.indices_format, p.indices_shape }; - } - - layout get_output_layout(gather_nd_test_params& p) { - return layout{ p.data_type, p.output_format, p.output_shape }; - } - - layout get_per_channel_layout(gather_nd_test_params& p) { - return layout{ p.default_type, p.default_format, tensor{ 1, p.output_shape.feature[0], 1, 1 } }; - } -}; - -class gather_nd_quantize : public GatherNDPrimitiveFusingTest {}; -TEST_P(gather_nd_quantize, basic) { - auto p = GetParam(); - - auto input_rank = 0; - if (p.input_format == format::bfyx) { - input_rank = 4; - } else if (p.input_format == format::bfzyx) { - input_rank = 5; - } else if (p.input_format == format::bfwzyx) { - input_rank = 6; - } - - create_topologies( - input_layout("input", get_input_layout(p)), - data("gather_nd_indices", get_mem(get_indices_layout(p), 0, p.max_number_in_indices - 1)), - data("in_lo", get_mem(get_per_channel_layout(p), min_random, 0)), - 
data("in_hi", get_mem(get_per_channel_layout(p), 1, max_random)), - data("out_lo", get_mem(get_single_element_layout(p), -127)), - data("out_hi", get_mem(get_single_element_layout(p), 127)), - gather_nd("gather_nd_prim", "input", "gather_nd_indices", input_rank, p.indices_rank, p.batch_dims), - quantize("quantize", "gather_nd_prim", "in_lo", "in_hi", "out_lo", "out_hi", 255, data_types::i8), - reorder("reorder_bfyx", "quantize", p.default_format, data_types::f32) - ); - - tolerance = 1.f; - execute(p); -} - -INSTANTIATE_TEST_SUITE_P(fusings_gpu, gather_nd_quantize, ::testing::ValuesIn(std::vector{ - gather_nd_test_params{ CASE_GATHER_ND_FP16_4D_1, 2, 3 }, - gather_nd_test_params{ CASE_GATHER_ND_FP16_4D_2, 2, 3 }, - gather_nd_test_params{ CASE_GATHER_ND_FP16_4D_3, 2, 3 }, - - gather_nd_test_params{ CASE_GATHER_ND_FP16_5D_1, 2, 3 }, - gather_nd_test_params{ CASE_GATHER_ND_FP16_5D_2, 2, 3 }, - gather_nd_test_params{ CASE_GATHER_ND_FP16_5D_3, 2, 3 }, - gather_nd_test_params{ CASE_GATHER_ND_FP16_5D_4, 2, 3 }, - gather_nd_test_params{ CASE_GATHER_ND_FP16_5D_5, 2, 3 }, - gather_nd_test_params{ CASE_GATHER_ND_FP16_5D_6, 2, 3 }, - - gather_nd_test_params{ CASE_GATHER_ND_FP16_6D_1, 2, 3 }, - gather_nd_test_params{ CASE_GATHER_ND_FP16_6D_2, 2, 3 }, - gather_nd_test_params{ CASE_GATHER_ND_FP16_6D_3, 2, 3 }, - gather_nd_test_params{ CASE_GATHER_ND_FP16_6D_4, 2, 3 }, - - gather_nd_test_params{ CASE_GATHER_ND_FP32_4D_1, 2, 3 }, - gather_nd_test_params{ CASE_GATHER_ND_FP32_4D_2, 2, 3 }, - gather_nd_test_params{ CASE_GATHER_ND_FP32_4D_3, 2, 3 }, - - gather_nd_test_params{ CASE_GATHER_ND_FP32_5D_1, 2, 3 }, - gather_nd_test_params{ CASE_GATHER_ND_FP32_5D_2, 2, 3 }, - gather_nd_test_params{ CASE_GATHER_ND_FP32_5D_3, 2, 3 }, - gather_nd_test_params{ CASE_GATHER_ND_FP32_5D_4, 2, 3 }, - gather_nd_test_params{ CASE_GATHER_ND_FP32_5D_5, 2, 3 }, - gather_nd_test_params{ CASE_GATHER_ND_FP32_5D_6, 2, 3 }, - - gather_nd_test_params{ CASE_GATHER_ND_FP32_6D_1, 2, 3 }, - gather_nd_test_params{ 
CASE_GATHER_ND_FP32_6D_2, 2, 3 }, - gather_nd_test_params{ CASE_GATHER_ND_FP32_6D_3, 2, 3 }, - gather_nd_test_params{ CASE_GATHER_ND_FP32_6D_4, 2, 3 }, -})); - -class gather_nd_activation_scale_eltwise : public GatherNDPrimitiveFusingTest {}; -TEST_P(gather_nd_activation_scale_eltwise, basic) { - auto p = GetParam(); - - auto input_rank = 0; - if (p.input_format == format::bfyx) { - input_rank = 4; - } else if (p.input_format == format::bfzyx) { - input_rank = 5; - } else if (p.input_format == format::bfwzyx) { - input_rank = 6; - } - - create_topologies( - input_layout("input", get_input_layout(p)), - data("gather_nd_indices", get_mem(get_indices_layout(p), 0, p.max_number_in_indices - 1)), - data("scale_data", get_mem(get_per_channel_layout(p), 1.0f / 255)), - data("eltwise_data", get_mem(get_output_layout(p))), - gather_nd("gather_nd_prim", "input", "gather_nd_indices", input_rank, p.indices_rank, p.batch_dims), - activation("activation", "gather_nd_prim", activation_func::abs), - scale("scale", "activation", "scale_data"), - eltwise("eltwise", { "scale", "eltwise_data" }, eltwise_mode::sum, p.data_type), - reorder("reorder_bfyx", "eltwise", p.default_format, data_types::f32) - ); - - tolerance = 1e-5f; - execute(p); -} - -INSTANTIATE_TEST_SUITE_P(fusings_gpu, gather_nd_activation_scale_eltwise, ::testing::ValuesIn(std::vector{ - gather_nd_test_params{ CASE_GATHER_ND_FP16_4D_1, 2, 5 }, - gather_nd_test_params{ CASE_GATHER_ND_FP16_4D_2, 2, 5 }, - gather_nd_test_params{ CASE_GATHER_ND_FP16_4D_3, 2, 5 }, - - gather_nd_test_params{ CASE_GATHER_ND_FP16_5D_1, 2, 5 }, - gather_nd_test_params{ CASE_GATHER_ND_FP16_5D_2, 2, 5 }, - gather_nd_test_params{ CASE_GATHER_ND_FP16_5D_3, 2, 5 }, - gather_nd_test_params{ CASE_GATHER_ND_FP16_5D_4, 2, 5 }, - gather_nd_test_params{ CASE_GATHER_ND_FP16_5D_5, 2, 5 }, - gather_nd_test_params{ CASE_GATHER_ND_FP16_5D_6, 2, 5 }, - - gather_nd_test_params{ CASE_GATHER_ND_FP16_6D_1, 2, 5 }, - gather_nd_test_params{ CASE_GATHER_ND_FP16_6D_2, 
2, 5 }, - gather_nd_test_params{ CASE_GATHER_ND_FP16_6D_3, 2, 5 }, - gather_nd_test_params{ CASE_GATHER_ND_FP16_6D_4, 2, 5 }, - - gather_nd_test_params{ CASE_GATHER_ND_FP32_4D_1, 2, 5 }, - gather_nd_test_params{ CASE_GATHER_ND_FP32_4D_2, 2, 5 }, - gather_nd_test_params{ CASE_GATHER_ND_FP32_4D_3, 2, 5 }, - - gather_nd_test_params{ CASE_GATHER_ND_FP32_5D_1, 2, 5 }, - gather_nd_test_params{ CASE_GATHER_ND_FP32_5D_2, 2, 5 }, - gather_nd_test_params{ CASE_GATHER_ND_FP32_5D_3, 2, 5 }, - gather_nd_test_params{ CASE_GATHER_ND_FP32_5D_4, 2, 5 }, - gather_nd_test_params{ CASE_GATHER_ND_FP32_5D_5, 2, 5 }, - gather_nd_test_params{ CASE_GATHER_ND_FP32_5D_6, 2, 5 }, - - gather_nd_test_params{ CASE_GATHER_ND_FP32_6D_1, 2, 5 }, - gather_nd_test_params{ CASE_GATHER_ND_FP32_6D_2, 2, 5 }, - gather_nd_test_params{ CASE_GATHER_ND_FP32_6D_3, 2, 5 }, - gather_nd_test_params{ CASE_GATHER_ND_FP32_6D_4, 2, 5 }, -})); - - - -/* ----------------------------------------------------------------------------------------------------- */ -/* ------------------------------------------ GatherElements cases ------------------------------------- */ -/* ----------------------------------------------------------------------------------------------------- */ - -struct gather_elements_test_params { - data_types data_type; - - format input_format; - tensor input_shape; - - format indices_format; - tensor indices_shape; - - format output_format; - tensor output_shape; - - cldnn::gather_elements::gather_elements_axis axis; - - data_types default_type; - format default_format; - - size_t expected_fused_primitives; - size_t expected_not_fused_primitives; -}; - -#define CASE_GATHER_ELEMENTS_FP16_4D_1 data_types::f16, format::bfyx, { 3, 7, 9, 8 }, format::bfyx, { 3, 7, 9, 8 }, format::bfyx, { 3, 7, 9, 8 }, cldnn::gather_elements::gather_elements_axis::along_y, data_types::f16, format::bfyx -#define CASE_GATHER_ELEMENTS_FP16_4D_2 data_types::f16, format::bfyx, { 3, 2, 8, 3 }, format::bfyx, { 2, 2, 8, 3 }, 
format::bfyx, { 2, 2, 8, 3 }, cldnn::gather_elements::gather_elements_axis::along_b, data_types::f16, format::bfyx -#define CASE_GATHER_ELEMENTS_FP16_4D_3 data_types::f16, format::bfyx, { 1, 3, 2, 9 }, format::bfyx, { 1, 3, 5, 9 }, format::bfyx, { 1, 3, 5, 9 }, cldnn::gather_elements::gather_elements_axis::along_x, data_types::f16, format::bfyx - -#define CASE_GATHER_ELEMENTS_FP16_5D_1 data_types::f16, format::bfzyx, { 3, 2, 5, 2, 3 }, format::bfzyx, { 3, 2, 2, 2, 3 }, format::bfzyx, { 3, 2, 2, 2, 3 }, cldnn::gather_elements::gather_elements_axis::along_x, data_types::f16, format::bfzyx -#define CASE_GATHER_ELEMENTS_FP16_5D_2 data_types::f16, format::bfzyx, { 5, 4, 7, 4, 4 }, format::bfzyx, { 5, 4, 7, 4, 3 }, format::bfzyx, { 5, 4, 7, 4, 3 }, cldnn::gather_elements::gather_elements_axis::along_z, data_types::f16, format::bfzyx - -#define CASE_GATHER_ELEMENTS_FP16_6D_1 data_types::f16, format::bfwzyx, { 5, 4, 6, 7, 8, 2 }, format::bfwzyx, { 5, 2, 6, 7, 8, 2 }, format::bfwzyx, { 5, 2, 6, 7, 8, 2 }, cldnn::gather_elements::gather_elements_axis::along_f, data_types::f16, format::bfwzyx -#define CASE_GATHER_ELEMENTS_FP16_6D_2 data_types::f16, format::bfwzyx, { 2, 1, 2, 3, 2, 1 }, format::bfwzyx, { 2, 1, 2, 3, 2, 3 }, format::bfwzyx, { 2, 1, 2, 3, 2, 3 }, cldnn::gather_elements::gather_elements_axis::along_w, data_types::f16, format::bfwzyx -#define CASE_GATHER_ELEMENTS_FP16_6D_3 data_types::f16, format::bfwzyx, { 2, 2, 3, 4, 4, 2 }, format::bfwzyx, { 2, 2, 6, 4, 4, 2 }, format::bfwzyx, { 2, 2, 6, 4, 4, 2 }, cldnn::gather_elements::gather_elements_axis::along_x, data_types::f16, format::bfwzyx - - -#define CASE_GATHER_ELEMENTS_FP32_4D_1 data_types::f32, format::bfyx, { 3, 7, 9, 8 }, format::bfyx, { 3, 7, 9, 8 }, format::bfyx, { 3, 7, 9, 8 }, cldnn::gather_elements::gather_elements_axis::along_y, data_types::f32, format::bfyx -#define CASE_GATHER_ELEMENTS_FP32_4D_2 data_types::f32, format::bfyx, { 3, 2, 8, 3 }, format::bfyx, { 2, 2, 8, 3 }, format::bfyx, { 2, 2, 8, 3 }, 
cldnn::gather_elements::gather_elements_axis::along_b, data_types::f32, format::bfyx -#define CASE_GATHER_ELEMENTS_FP32_4D_3 data_types::f32, format::bfyx, { 1, 3, 2, 9 }, format::bfyx, { 1, 3, 5, 9 }, format::bfyx, { 1, 3, 5, 9 }, cldnn::gather_elements::gather_elements_axis::along_x, data_types::f32, format::bfyx - -#define CASE_GATHER_ELEMENTS_FP32_5D_1 data_types::f32, format::bfzyx, { 3, 2, 5, 2, 3 }, format::bfzyx, { 3, 2, 2, 2, 3 }, format::bfzyx, { 3, 2, 2, 2, 3 }, cldnn::gather_elements::gather_elements_axis::along_x, data_types::f32, format::bfzyx -#define CASE_GATHER_ELEMENTS_FP32_5D_2 data_types::f32, format::bfzyx, { 5, 4, 7, 4, 4 }, format::bfzyx, { 5, 4, 7, 4, 3 }, format::bfzyx, { 5, 4, 7, 4, 3 }, cldnn::gather_elements::gather_elements_axis::along_z, data_types::f32, format::bfzyx - -#define CASE_GATHER_ELEMENTS_FP32_6D_1 data_types::f32, format::bfwzyx, { 5, 4, 6, 7, 8, 2 }, format::bfwzyx, { 5, 2, 6, 7, 8, 2 }, format::bfwzyx, { 5, 2, 6, 7, 8, 2 }, cldnn::gather_elements::gather_elements_axis::along_f, data_types::f32, format::bfwzyx -#define CASE_GATHER_ELEMENTS_FP32_6D_2 data_types::f32, format::bfwzyx, { 2, 1, 2, 3, 2, 1 }, format::bfwzyx, { 2, 1, 2, 3, 2, 3 }, format::bfwzyx, { 2, 1, 2, 3, 2, 3 }, cldnn::gather_elements::gather_elements_axis::along_w, data_types::f32, format::bfwzyx -#define CASE_GATHER_ELEMENTS_FP32_6D_3 data_types::f32, format::bfwzyx, { 2, 2, 3, 4, 4, 2 }, format::bfwzyx, { 2, 2, 6, 4, 4, 2 }, format::bfwzyx, { 2, 2, 6, 4, 4, 2 }, cldnn::gather_elements::gather_elements_axis::along_x, data_types::f32, format::bfwzyx - -class GatherElementsPrimitiveFusingTest : public ::BaseFusingTest { -public: - void execute(gather_elements_test_params& p) { - auto input_prim = get_mem(get_input_layout(p)); - network network_not_fused(this->engine, this->topology_non_fused, bo_not_fused); - network network_fused(this->engine, this->topology_fused, bo_fused); - network_fused.set_input_data("input", input_prim); - 
network_not_fused.set_input_data("input", input_prim); - compare(network_not_fused, network_fused, p); - } - - size_t get_axis_dim(gather_elements_test_params& p) { - switch (p.axis) { - case cldnn::gather_elements::gather_elements_axis::along_x: - return p.input_shape.spatial[0]; - case cldnn::gather_elements::gather_elements_axis::along_y: - return p.input_shape.spatial[1]; - case cldnn::gather_elements::gather_elements_axis::along_z: - return p.input_shape.spatial[2]; - case cldnn::gather_elements::gather_elements_axis::along_w: - return p.input_shape.spatial[3]; - case cldnn::gather_elements::gather_elements_axis::along_f: - return p.input_shape.feature[0]; - case cldnn::gather_elements::gather_elements_axis::along_b: - return p.input_shape.batch[0]; - default: - return 1; - } - } - - layout get_input_layout(gather_elements_test_params& p) { - return layout{ p.data_type, p.input_format, p.input_shape }; - } - - layout get_indices_layout(gather_elements_test_params& p) { - return layout{ p.data_type, p.indices_format, p.indices_shape }; - } - - layout get_output_layout(gather_elements_test_params& p) { - return layout{ p.data_type, p.output_format, p.output_shape }; - } - - layout get_per_channel_layout(gather_elements_test_params& p) { - return layout{ p.default_type, p.default_format, tensor{ 1, p.output_shape.feature[0], 1, 1 } }; - } -}; - -class gather_elements_quantize : public GatherElementsPrimitiveFusingTest {}; -TEST_P(gather_elements_quantize, basic) { - auto p = GetParam(); - create_topologies( - input_layout("input", get_input_layout(p)), - data("gather_elements_indices", get_mem(get_indices_layout(p), 0, static_cast(get_axis_dim(p))-1)), - data("in_lo", get_mem(get_per_channel_layout(p), min_random, 0)), - data("in_hi", get_mem(get_per_channel_layout(p), 1, max_random)), - data("out_lo", get_mem(get_single_element_layout(p), -127)), - data("out_hi", get_mem(get_single_element_layout(p), 127)), - gather_elements("gather_elements_prim", "input", 
"gather_elements_indices", p.output_format, p.output_shape, p.axis), - quantize("quantize", "gather_elements_prim", "in_lo", "in_hi", "out_lo", "out_hi", 255, data_types::i8), - reorder("reorder_bfyx", "quantize", p.default_format, data_types::f32) - ); - - tolerance = 1.f; - execute(p); -} - -INSTANTIATE_TEST_SUITE_P(fusings_gpu, gather_elements_quantize, ::testing::ValuesIn(std::vector{ - gather_elements_test_params{ CASE_GATHER_ELEMENTS_FP16_4D_1, 2, 3 }, - gather_elements_test_params{ CASE_GATHER_ELEMENTS_FP16_4D_2, 2, 3 }, - gather_elements_test_params{ CASE_GATHER_ELEMENTS_FP16_4D_3, 2, 3 }, - - gather_elements_test_params{ CASE_GATHER_ELEMENTS_FP16_5D_1, 2, 3 }, - gather_elements_test_params{ CASE_GATHER_ELEMENTS_FP16_5D_2, 2, 3 }, - - gather_elements_test_params{ CASE_GATHER_ELEMENTS_FP16_6D_1, 2, 3 }, - gather_elements_test_params{ CASE_GATHER_ELEMENTS_FP16_6D_2, 2, 3 }, - gather_elements_test_params{ CASE_GATHER_ELEMENTS_FP16_6D_3, 2, 3 }, - - gather_elements_test_params{ CASE_GATHER_ELEMENTS_FP32_4D_1, 2, 3 }, - gather_elements_test_params{ CASE_GATHER_ELEMENTS_FP32_4D_2, 2, 3 }, - gather_elements_test_params{ CASE_GATHER_ELEMENTS_FP32_4D_3, 2, 3 }, - - gather_elements_test_params{ CASE_GATHER_ELEMENTS_FP32_5D_1, 2, 3 }, - gather_elements_test_params{ CASE_GATHER_ELEMENTS_FP32_5D_2, 2, 3 }, - - gather_elements_test_params{ CASE_GATHER_ELEMENTS_FP32_6D_1, 2, 3 }, - gather_elements_test_params{ CASE_GATHER_ELEMENTS_FP32_6D_2, 2, 3 }, - gather_elements_test_params{ CASE_GATHER_ELEMENTS_FP32_6D_3, 2, 3 }, -})); - - -class gather_elements_scale_activation : public GatherElementsPrimitiveFusingTest {}; -TEST_P(gather_elements_scale_activation, basic) { - auto p = GetParam(); - create_topologies( - input_layout("input", get_input_layout(p)), - data("gather_elements_indices", get_mem(get_indices_layout(p), 0, static_cast(get_axis_dim(p))-1)), - data("scale_data", get_mem(get_per_channel_layout(p), -10, 10)), - gather_elements("gather_elements_prim", "input", 
"gather_elements_indices", p.output_format, p.output_shape, p.axis), - activation("activation", "gather_elements_prim", activation_func::abs), - scale("scale", "activation", "scale_data"), - reorder("reorder_bfyx", "scale", p.default_format, data_types::f32) - ); - - tolerance = 1e-5f; - execute(p); -} - -INSTANTIATE_TEST_SUITE_P(fusings_gpu, gather_elements_scale_activation, ::testing::ValuesIn(std::vector{ - gather_elements_test_params{ CASE_GATHER_ELEMENTS_FP16_4D_1, 2, 4 }, - gather_elements_test_params{ CASE_GATHER_ELEMENTS_FP16_4D_2, 2, 4 }, - gather_elements_test_params{ CASE_GATHER_ELEMENTS_FP16_4D_3, 2, 4 }, - - gather_elements_test_params{ CASE_GATHER_ELEMENTS_FP16_5D_1, 2, 4 }, - gather_elements_test_params{ CASE_GATHER_ELEMENTS_FP16_5D_2, 2, 4 }, - - gather_elements_test_params{ CASE_GATHER_ELEMENTS_FP16_6D_1, 2, 4 }, - gather_elements_test_params{ CASE_GATHER_ELEMENTS_FP16_6D_2, 2, 4 }, - gather_elements_test_params{ CASE_GATHER_ELEMENTS_FP16_6D_3, 2, 4 }, - - gather_elements_test_params{ CASE_GATHER_ELEMENTS_FP32_4D_1, 2, 4 }, - gather_elements_test_params{ CASE_GATHER_ELEMENTS_FP32_4D_2, 2, 4 }, - gather_elements_test_params{ CASE_GATHER_ELEMENTS_FP32_4D_3, 2, 4 }, - - gather_elements_test_params{ CASE_GATHER_ELEMENTS_FP32_5D_1, 2, 4 }, - gather_elements_test_params{ CASE_GATHER_ELEMENTS_FP32_5D_2, 2, 4 }, - - gather_elements_test_params{ CASE_GATHER_ELEMENTS_FP32_6D_1, 2, 4 }, - gather_elements_test_params{ CASE_GATHER_ELEMENTS_FP32_6D_2, 2, 4 }, - gather_elements_test_params{ CASE_GATHER_ELEMENTS_FP32_6D_3, 2, 4 }, -})); - - -class gather_elements_activation_scale_eltwise : public GatherElementsPrimitiveFusingTest {}; -TEST_P(gather_elements_activation_scale_eltwise, basic) { - auto p = GetParam(); - - create_topologies( - input_layout("input", get_input_layout(p)), - data("gather_elements_indices", get_mem(get_indices_layout(p), 0, static_cast(get_axis_dim(p))-1)), - data("scale_data", get_mem(get_per_channel_layout(p), 1.0f / 255)), - 
data("eltwise_data", get_mem(get_output_layout(p))), - gather_elements("gather_elements_prim", "input", "gather_elements_indices", p.output_format, p.output_shape, p.axis), - activation("activation", "gather_elements_prim", activation_func::abs), - scale("scale", "activation", "scale_data"), - eltwise("eltwise", { "scale", "eltwise_data" }, eltwise_mode::sum, p.data_type), - reorder("reorder_bfyx", "eltwise", p.default_format, data_types::f32) - ); - - tolerance = 1e-5f; - execute(p); -} - -INSTANTIATE_TEST_SUITE_P(fusings_gpu, gather_elements_activation_scale_eltwise, ::testing::ValuesIn(std::vector{ - gather_elements_test_params{ CASE_GATHER_ELEMENTS_FP16_4D_1, 2, 5 }, - gather_elements_test_params{ CASE_GATHER_ELEMENTS_FP16_4D_2, 2, 5 }, - gather_elements_test_params{ CASE_GATHER_ELEMENTS_FP16_4D_3, 2, 5 }, - - gather_elements_test_params{ CASE_GATHER_ELEMENTS_FP16_5D_1, 2, 5 }, - gather_elements_test_params{ CASE_GATHER_ELEMENTS_FP16_5D_2, 2, 5 }, - - gather_elements_test_params{ CASE_GATHER_ELEMENTS_FP16_6D_1, 2, 5 }, - gather_elements_test_params{ CASE_GATHER_ELEMENTS_FP16_6D_2, 2, 5 }, - gather_elements_test_params{ CASE_GATHER_ELEMENTS_FP16_6D_3, 2, 5 }, - - gather_elements_test_params{ CASE_GATHER_ELEMENTS_FP32_4D_1, 2, 5 }, - gather_elements_test_params{ CASE_GATHER_ELEMENTS_FP32_4D_2, 2, 5 }, - gather_elements_test_params{ CASE_GATHER_ELEMENTS_FP32_4D_3, 2, 5 }, - - gather_elements_test_params{ CASE_GATHER_ELEMENTS_FP32_5D_1, 2, 5 }, - gather_elements_test_params{ CASE_GATHER_ELEMENTS_FP32_5D_2, 2, 5 }, - - gather_elements_test_params{ CASE_GATHER_ELEMENTS_FP32_6D_1, 2, 5 }, - gather_elements_test_params{ CASE_GATHER_ELEMENTS_FP32_6D_2, 2, 5 }, - gather_elements_test_params{ CASE_GATHER_ELEMENTS_FP32_6D_3, 2, 5 }, -})); - -#ifdef ENABLE_ONEDNN_FOR_GPU -class WeightsPrimitiveFusingTestOneDNN : public WeightsPrimitiveFusingTest { -public: - void execute(bc_test_params& p) { - // Onednn post operation has issue in a machine that does not support imad. 
- if (!engine.get_device_info().supports_imad) - return; - - auto input_prim = p.data_type == data_types::u8 ? get_mem(get_input_layout(p), 0, 10) : get_mem(get_input_layout(p)); - - auto impl_forcing_bo = bo_fused.get(); - const auto& impl_forcing = impl_forcing_bo->forcing; - - auto forcing_format = p.input_format; - for (auto& forcing : impl_forcing) { - if (forcing.first == "conv_prim") { - forcing_format = forcing.second.output_format; - } - } - - implementation_desc conv_impl = { forcing_format, "", impl_types::onednn }; - bo_fused.set_option(build_option::force_implementations({ { "conv_prim", conv_impl } })); - - network network_not_fused(this->engine, this->topology_non_fused, bo_not_fused); - network network_fused(this->engine, this->topology_fused, bo_fused); - network_fused.set_input_data("input", input_prim); - network_not_fused.set_input_data("input", input_prim); - - compare(network_not_fused, network_fused, p); - auto find_conv = [](primitive_info& p) -> bool { - if (p.original_id == "conv_prim") - return true; - return false; - }; - - auto pi_fused = network_fused.get_primitives_info(); - auto info_fused = std::find_if(pi_fused.begin(), pi_fused.end(), find_conv); - if (info_fused != pi_fused.end()) - std::cout << "kernel: " << info_fused->kernel_id << std::endl; - } -}; - -class conv_int8_eltwise_onednn : public WeightsPrimitiveFusingTestOneDNN {}; -TEST_P(conv_int8_eltwise_onednn, u8_eltwise_sum_out) { - auto p = GetParam(); - - auto shift_layout = get_output_layout(p); - shift_layout.data_type = data_types::f32; - - create_topologies( - input_layout("input", get_input_layout(p)), - data("weights", get_mem(get_weights_layout(p), 0, 2)), - data("bias", get_mem(get_bias_layout(p))), - data("shift_data", get_mem(shift_layout)), - convolution("conv_prim", "input", { "weights" }, { "bias" }, p.groups, p.stride, p.pad, p.dilation), - eltwise("shift", { "conv_prim", "shift_data" }, eltwise_mode::sum, data_types::f32), - // Add 'not fusable' primitive to 
be able to test full size tensor sum - crop("crop", "shift", get_output_layout(p).size, { 0, 0, 0, 0 }), - reorder("reorder_bfyx", "crop", p.default_format, data_types::f32) - ); - - tolerance = 1.f; - execute(p); -} - -TEST_P(conv_int8_eltwise_onednn, u8_eltwise_prod_out) { - auto p = GetParam(); - - create_topologies( - input_layout("input", get_input_layout(p)), - data("weights", get_mem(get_weights_layout(p), -2, 2)), - data("bias", get_mem(get_bias_layout(p))), - data("scale_data", get_mem(get_per_channel_layout(p), 1.0f/p.kernel.count()) ), - convolution("conv_prim", "input", { "weights" }, { "bias" }, p.groups, p.stride, p.pad, p.dilation), - eltwise("scale", { "conv_prim", "scale_data" }, eltwise_mode::prod, data_types::u8), - crop("crop", "scale", get_output_layout(p).size, { 0, 0, 0, 0 }), - reorder("reorder_bfyx", "crop", p.default_format, data_types::f32) - ); - - tolerance = 1.f; - execute(p); -} - -INSTANTIATE_TEST_SUITE_P(fusings_gpu, conv_int8_eltwise_onednn, ::testing::ValuesIn(std::vector{ - bc_test_params{ CASE_CONV_U8S8_1, 2, 4 }, - bc_test_params{ CASE_CONV_U8S8_2, 2, 4 }, - bc_test_params{ CASE_CONV_U8S8_3, 2, 4 }, - bc_test_params{ CASE_CONV_S8S8_1, 2, 4 }, - bc_test_params{ CASE_CONV_S8S8_2, 2, 4 }, - bc_test_params{ CASE_CONV_S8S8_3, 2, 4 }, - - bc_test_params{ CASE_CONV_U8S8_11, 2, 4 }, - bc_test_params{ CASE_CONV_U8S8_12, 2, 4 }, - bc_test_params{ CASE_CONV_U8S8_13, 2, 4 }, - bc_test_params{ CASE_CONV_S8S8_12, 2, 4 }, - bc_test_params{ CASE_CONV_S8S8_13, 2, 4 }, - bc_test_params{ CASE_CONV_S8S8_14, 2, 4 }, - - bc_test_params{ CASE_CONV3D_U8S8_1, 3, 4 }, - bc_test_params{ CASE_CONV3D_U8S8_2, 3, 4 }, - bc_test_params{ CASE_CONV3D_U8S8_3, 3, 4 }, - bc_test_params{ CASE_CONV3D_U8S8_5, 3, 4 }, - bc_test_params{ CASE_CONV3D_S8S8_1, 3, 4 }, - bc_test_params{ CASE_CONV3D_S8S8_2, 3, 4 }, - bc_test_params{ CASE_CONV3D_S8S8_3, 3, 4 }, - bc_test_params{ CASE_CONV3D_S8S8_5, 3, 4 }, -})); - -class conv_fp32_activation_abs_onednn : public 
WeightsPrimitiveFusingTestOneDNN {}; -TEST_P(conv_fp32_activation_abs_onednn, basic) { - auto p = GetParam(); - create_topologies( - input_layout("input", get_input_layout(p)), - data("weights", get_mem(get_weights_layout(p))), - data("bias", get_mem(get_bias_layout(p))), - convolution("conv_prim", "input", { "weights" }, { "bias" }, p.groups, p.stride, p.pad, p.dilation), - activation("activation", "conv_prim", activation_func::abs), - reorder("reorder_bfyx", "activation", p.default_format, data_types::f32) - ); - - tolerance = 1e-2f; - execute(p); -} - -INSTANTIATE_TEST_SUITE_P(fusings_gpu, conv_fp32_activation_abs_onednn, ::testing::ValuesIn(std::vector{ - bc_test_params{ CASE_CONV_FP16_1, 2, 3 }, - bc_test_params{ CASE_CONV_FP16_2, 2, 3 }, - bc_test_params{ CASE_CONV_FP16_3, 2, 3 }, - bc_test_params{ CASE_CONV_FP16_4, 2, 3 }, -})); - -class conv_fp32_activation_mish_onednn : public WeightsPrimitiveFusingTestOneDNN {}; -TEST_P(conv_fp32_activation_mish_onednn, basic) { - auto p = GetParam(); - create_topologies( - input_layout("input", get_input_layout(p)), - data("weights", get_mem(get_weights_layout(p))), - data("bias", get_mem(get_bias_layout(p))), - convolution("conv_prim", "input", { "weights" }, { "bias" }, p.groups, p.stride, p.pad, p.dilation), - activation("activation", "conv_prim", activation_func::mish), - reorder("reorder_bfyx", "activation", p.default_format, data_types::f32) - ); - - tolerance = 1e-2f; - execute(p); -} - -INSTANTIATE_TEST_SUITE_P(fusings_gpu, conv_fp32_activation_mish_onednn, ::testing::ValuesIn(std::vector{ - bc_test_params{ CASE_CONV_FP16_1, 2, 3 }, - bc_test_params{ CASE_CONV_FP16_2, 2, 3 }, - bc_test_params{ CASE_CONV_FP16_3, 2, 3 }, - bc_test_params{ CASE_CONV_FP16_4, 2, 3 }, -})); - -class conv_fp32_activation_swish_onednn : public WeightsPrimitiveFusingTestOneDNN {}; -TEST_P(conv_fp32_activation_swish_onednn, basic) { - auto p = GetParam(); - create_topologies( - input_layout("input", get_input_layout(p)), - data("weights", 
get_mem(get_weights_layout(p))), - data("bias", get_mem(get_bias_layout(p))), - convolution("conv_prim", "input", { "weights" }, { "bias" }, p.groups, p.stride, p.pad, p.dilation), - activation("activation", "conv_prim", activation_func::swish), - reorder("reorder_bfyx", "activation", p.default_format, data_types::f32) - ); - - tolerance = 1e-2f; - execute(p); -} - -INSTANTIATE_TEST_SUITE_P(fusings_gpu, conv_fp32_activation_swish_onednn, ::testing::ValuesIn(std::vector{ - bc_test_params{ CASE_CONV_FP16_1, 2, 3 }, - bc_test_params{ CASE_CONV_FP16_2, 2, 3 }, - bc_test_params{ CASE_CONV_FP16_3, 2, 3 }, - bc_test_params{ CASE_CONV_FP16_4, 2, 3 }, -})); - -class conv_fp32_activation_hswish_onednn : public WeightsPrimitiveFusingTestOneDNN {}; -TEST_P(conv_fp32_activation_hswish_onednn, basic) { - auto p = GetParam(); - create_topologies( - input_layout("input", get_input_layout(p)), - data("weights", get_mem(get_weights_layout(p))), - data("bias", get_mem(get_bias_layout(p))), - convolution("conv_prim", "input", { "weights" }, { "bias" }, p.groups, p.stride, p.pad, p.dilation), - activation("activation", "conv_prim", activation_func::hswish), - reorder("reorder_bfyx", "activation", p.default_format, data_types::f32) - ); - - tolerance = 1e-2f; - execute(p); -} - -INSTANTIATE_TEST_SUITE_P(fusings_gpu, conv_fp32_activation_hswish_onednn, ::testing::ValuesIn(std::vector{ - bc_test_params{ CASE_CONV_FP16_1, 2, 3 }, - bc_test_params{ CASE_CONV_FP16_2, 2, 3 }, - bc_test_params{ CASE_CONV_FP16_3, 2, 3 }, - bc_test_params{ CASE_CONV_FP16_4, 2, 3 }, -})); - -class conv_fp32_activation_exp_onednn : public WeightsPrimitiveFusingTestOneDNN {}; -TEST_P(conv_fp32_activation_exp_onednn, basic) { - auto p = GetParam(); - create_topologies( - input_layout("input", get_input_layout(p)), - data("weights", get_mem(get_weights_layout(p))), - data("bias", get_mem(get_bias_layout(p))), - convolution("conv_prim", "input", { "weights" }, { "bias" }, p.groups, p.stride, p.pad, p.dilation), - 
activation("activation", "conv_prim", activation_func::exp), - reorder("reorder_bfyx", "activation", p.default_format, data_types::f32) - ); - - tolerance = 1e-2f; - execute(p); -} - -INSTANTIATE_TEST_SUITE_P(fusings_gpu, conv_fp32_activation_exp_onednn, ::testing::ValuesIn(std::vector{ - bc_test_params{ CASE_CONV_FP16_1, 2, 3 }, - bc_test_params{ CASE_CONV_FP16_2, 2, 3 }, - bc_test_params{ CASE_CONV_FP16_3, 2, 3 }, - bc_test_params{ CASE_CONV_FP16_4, 2, 3 }, -})); - -class conv_int8_quantize_u8_onednn : public WeightsPrimitiveFusingTestOneDNN {}; -TEST_P(conv_int8_quantize_u8_onednn, per_channel) { - auto p = GetParam(); - create_topologies( - input_layout("input", get_input_layout(p)), - data("weights", get_mem(get_weights_layout(p), -2, 2)), - data("bias", get_mem(get_bias_layout(p))), - data("in_lo", get_mem(get_per_channel_layout(p), -10, 0)), - data("in_hi", get_mem(get_per_channel_layout(p), 0, 10)), - data("out_lo", get_mem(get_single_element_layout(p), 0)), - data("out_hi", get_mem(get_single_element_layout(p), 255)), - convolution("conv_prim", "input", { "weights" }, { "bias" }, p.groups, p.stride, p.pad, p.dilation), - quantize("quantize", "conv_prim", "in_lo", "in_hi", "out_lo", "out_hi", 256, data_types::u8), - reorder("reorder_bfyx", "quantize", p.default_format, data_types::f32) - ); - - tolerance = 1.0f; - execute(p); -} - -TEST_P(conv_int8_quantize_u8_onednn, per_tensor) { - auto p = GetParam(); - create_topologies( - input_layout("input", get_input_layout(p)), - data("weights", get_mem(get_weights_layout(p), -2, 2)), - data("bias", get_mem(get_bias_layout(p), 0)), - data("in_lo", get_mem(get_single_element_layout(p), -10)), - data("in_hi", get_mem(get_single_element_layout(p), 10)), - data("out_lo", get_mem(get_single_element_layout(p), 0)), - data("out_hi", get_mem(get_single_element_layout(p), 255)), - convolution("conv_prim", "input", { "weights" }, { "bias" }, p.groups, p.stride, p.pad, p.dilation), - quantize("quantize", "conv_prim", "in_lo", 
"in_hi", "out_lo", "out_hi", 256, data_types::u8), - reorder("reorder_bfyx", "quantize", p.default_format, data_types::f32) - ); - - tolerance = 1.0f; - execute(p); -} - -INSTANTIATE_TEST_SUITE_P(fusings_gpu, conv_int8_quantize_u8_onednn, ::testing::ValuesIn(std::vector{ - bc_test_params{ CASE_CONV_U8S8_1, 2, 3 }, - bc_test_params{ CASE_CONV_U8S8_2, 2, 3 }, - bc_test_params{ CASE_CONV_U8S8_3, 2, 3 }, - bc_test_params{ CASE_CONV_S8S8_1, 2, 3 }, - bc_test_params{ CASE_CONV_S8S8_2, 2, 3 }, - bc_test_params{ CASE_CONV_S8S8_3, 2, 3 }, -})); - -class conv_int8_activation_eltwise_quantize_onednn : public WeightsPrimitiveFusingTestOneDNN {}; -TEST_P(conv_int8_activation_eltwise_quantize_onednn, bsv32_fsv32) { - auto p = GetParam(); - layout eltwise_layout = get_output_layout(p); - eltwise_layout.format = format::bs_fs_yx_bsv32_fsv32; - create_topologies( - input_layout("input", get_input_layout(p)), - data("weights", get_mem(get_weights_layout(p), -1, 1)), - data("bias", get_mem(get_bias_layout(p))), - data("eltwise_data", get_mem(eltwise_layout, -0.5, 0.5)), - data("in_lo", get_mem(get_per_channel_layout(p), min_random, 0)), - data("in_hi", get_mem(get_per_channel_layout(p), 1, max_random)), - data("out_lo", get_mem(get_single_element_layout(p), -127)), - data("out_hi", get_mem(get_single_element_layout(p), 127)), - convolution("conv_prim", "input", { "weights" }, { "bias" }, p.groups, p.stride, p.pad, p.dilation), - activation("activation", "conv_prim", activation_func::abs), - eltwise("eltwise", "activation", "eltwise_data", eltwise_mode::sum), - quantize("quantize", "eltwise", "in_lo", "in_hi", "out_lo", "out_hi", 255, data_types::i8), - reorder("reorder_bfyx", "quantize", p.default_format, data_types::f32) - ); - - implementation_desc conv_impl = { format::bs_fs_yx_bsv32_fsv32, "", impl_types::onednn }; - bo_fused.set_option(build_option::force_implementations({ { "conv_prim", conv_impl } })); - - tolerance = 1.f; - execute(p); -} - 
-INSTANTIATE_TEST_SUITE_P(fusings_gpu, conv_int8_activation_eltwise_quantize_onednn, ::testing::ValuesIn(std::vector{ - bc_test_params{ CASE_CONV_U8S8_1, 2, 5 }, - bc_test_params{ CASE_CONV_U8S8_2, 2, 5 }, - bc_test_params{ CASE_CONV_U8S8_3, 2, 5 }, - bc_test_params{ CASE_CONV_U8S8_4, 2, 5 }, - bc_test_params{ CASE_CONV_U8S8_7, 2, 5 }, - bc_test_params{ CASE_CONV_U8S8_8, 2, 5 }, - bc_test_params{ CASE_CONV_U8S8_11, 2, 5 }, - bc_test_params{ CASE_CONV_U8S8_12, 2, 5 }, - bc_test_params{ CASE_CONV_U8S8_13, 2, 5 }, - bc_test_params{ CASE_CONV_U8S8_14, 2, 5 }, - - bc_test_params{ CASE_CONV_S8S8_1, 2, 5 }, - bc_test_params{ CASE_CONV_S8S8_2, 2, 5 }, - bc_test_params{ CASE_CONV_S8S8_3, 2, 5 }, - bc_test_params{ CASE_CONV_S8S8_4, 2, 5 }, - bc_test_params{ CASE_CONV_S8S8_7, 2, 5 }, - bc_test_params{ CASE_CONV_S8S8_8, 2, 5 }, - bc_test_params{ CASE_CONV_S8S8_12, 2, 5 }, - bc_test_params{ CASE_CONV_S8S8_13, 2, 5 }, - bc_test_params{ CASE_CONV_S8S8_14, 2, 5 }, - bc_test_params{ CASE_CONV_S8S8_15, 2, 5 }, -})); - -class conv_int8_scale_shift_swish_onednn : public WeightsPrimitiveFusingTestOneDNN {}; -TEST_P(conv_int8_scale_shift_swish_onednn, bsv32_fsv32) { - auto p = GetParam(); - create_topologies( - input_layout("input", get_input_layout(p)), - data("weights", get_mem(get_weights_layout(p), -1, 1)), - data("bias", get_mem(get_bias_layout(p))), - data("scale_data", get_mem(get_per_channel_layout(p), 1.0f/p.kernel.count())), - data("shift_data", get_mem(get_per_channel_layout(p), 1)), - convolution("conv_prim", "input", { "weights" }, { "bias" }, p.groups, p.stride, p.pad, p.dilation), - eltwise("scale0", { "conv_prim", "scale_data" }, eltwise_mode::sum), - eltwise("shift0", { "scale0", "shift_data" }, eltwise_mode::sum), - activation("sigmoid", "shift0", activation_func::swish), - eltwise("scale1", { "sigmoid", "scale_data" }, eltwise_mode::sum), - eltwise("shift1", { "scale1", "shift_data" }, eltwise_mode::sum), - reorder("reorder_bfyx", "shift1", p.default_format, 
data_types::f32) - ); - - implementation_desc conv_impl = { format::bs_fs_yx_bsv32_fsv32, "", impl_types::onednn }; - bo_fused.set_option(build_option::force_implementations({ { "conv_prim", conv_impl } })); - - tolerance = 1.f; - execute(p); -} - -INSTANTIATE_TEST_SUITE_P(fusings_gpu, conv_int8_scale_shift_swish_onednn, ::testing::ValuesIn(std::vector{ - bc_test_params{ CASE_CONV_U8S8_1, 2, 7 }, - bc_test_params{ CASE_CONV_U8S8_2, 2, 7 }, - bc_test_params{ CASE_CONV_S8S8_1, 2, 7 }, - bc_test_params{ CASE_CONV_S8S8_2, 2, 7 }, - - bc_test_params{ CASE_CONV_U8S8_11, 2, 7 }, - bc_test_params{ CASE_CONV_U8S8_12, 2, 7 }, - bc_test_params{ CASE_CONV_U8S8_14, 2, 7 }, - bc_test_params{ CASE_CONV_S8S8_12, 2, 7 }, - bc_test_params{ CASE_CONV_S8S8_13, 2, 7 }, - bc_test_params{ CASE_CONV_S8S8_15, 2, 7 }, -})); - -class conv_int8_eltwise_scale_onednn : public WeightsPrimitiveFusingTestOneDNN {}; -TEST_P(conv_int8_eltwise_scale_onednn, u8_eltwise_prod_out_reuse) { - auto p = GetParam(); - - create_topologies( - input_layout("input", get_input_layout(p)), - data("weights", get_mem(get_weights_layout(p), -2, 2)), - data("bias", get_mem(get_bias_layout(p))), - data("sum_data", get_mem(get_per_channel_layout(p), 1.0f/p.kernel.count())), - data("scale_data", get_mem(get_per_channel_layout(p), 1.0f/p.kernel.count())), - convolution("conv_prim", "input", { "weights" }, { "bias" }, p.groups, p.stride, p.pad, p.dilation, p.out_shape, data_types::f32, false), - eltwise("sum", { "conv_prim", "sum_data" }, eltwise_mode::sum, data_types::f32), - eltwise("scale", { "sum", "scale_data" }, eltwise_mode::prod, data_types::f32), - crop("crop", "scale", get_output_layout(p).size, { 0, 0, 0, 0 }), - reorder("reorder_bfyx", "crop", p.default_format, data_types::f32) - ); - - tolerance = 1.f; - - auto input_prim = get_mem(get_input_layout(p)); - - auto forcing_format = p.input_format; - implementation_desc conv_impl = { forcing_format, "", impl_types::onednn }; - 
bo_fused.set_option(build_option::force_implementations({ { "conv_prim", conv_impl } })); - - network network_not_fused(this->engine, this->topology_non_fused, bo_not_fused); - network network_fused(this->engine, this->topology_fused, bo_fused); - network_fused.set_input_data("input", input_prim); - network_not_fused.set_input_data("input", input_prim); - - // First network.execute() call - compare(network_not_fused, network_fused, p); - // Second network.execute() call to make sure that scales have not been wrongly overwritten within first iteration - // and don't affect final result of second iteration - compare(network_not_fused, network_fused, p); -} - -INSTANTIATE_TEST_SUITE_P(fusings_gpu, conv_int8_eltwise_scale_onednn, ::testing::ValuesIn(std::vector{ - bc_test_params{ CASE_CONV_U8S8_15, 2, 5 }, -})); - -/* ----------------------------------------------------------------------------------------------------- */ -/* ------------------------------ OneDNN post-ops cases with optimizations ----------------------------- */ -/* ----------------------------------------------------------------------------------------------------- */ - -// Before optimization: eltw_linear + eltw_linear -// After optimization: eltw_linear -// Limitations: no -// DNNL_VERBOSE log without optimization: attr-post-ops:eltwise_linear:12.75:127.5+eltwise_linear:1:-128 -// DNNL_VERBOSE log with optimization: attr-post-ops:eltwise_linear:12.75:-0.5 -class post_ops_optimizations_onednn_eltw_linear_eltw_linear : public WeightsPrimitiveFusingTestOneDNN {}; -TEST_P(post_ops_optimizations_onednn_eltw_linear_eltw_linear, basic) { - auto p = GetParam(); - create_topologies( - input_layout("input", get_input_layout(p)), - data("weights", get_mem(get_weights_layout(p))), - data("bias", get_mem(get_bias_layout(p))), - data("in_lo", get_mem(get_single_element_layout(p), -10)), - data("in_hi", get_mem(get_single_element_layout(p), 10)), - data("out_lo", get_mem(get_single_element_layout(p), -128)), - 
data("out_hi", get_mem(get_single_element_layout(p), 127)), - convolution("conv_prim", "input", { "weights" }, { "bias" }, p.groups, p.stride, p.pad, p.dilation), - quantize("quantize", "conv_prim", "in_lo", "in_hi", "out_lo", "out_hi", 256, data_types::i8), - reorder("reorder_bfyx", "quantize", p.default_format, data_types::f32) - ); - - tolerance = 1.f; - execute(p); -} - -INSTANTIATE_TEST_SUITE_P(fusings_gpu, post_ops_optimizations_onednn_eltw_linear_eltw_linear, ::testing::ValuesIn(std::vector{ - // cases with batch = 1 - bc_test_params{ CASE_CONV_U8S8_1, 2, 3 }, - bc_test_params{ CASE_CONV_U8S8_2, 2, 3 }, - bc_test_params{ CASE_CONV_U8S8_3, 2, 3 }, - bc_test_params{ CASE_CONV_S8S8_1, 2, 3 }, - bc_test_params{ CASE_CONV_S8S8_2, 2, 3 }, - bc_test_params{ CASE_CONV_S8S8_3, 2, 3 }, - - // cases with batch = 16 - bc_test_params{ CASE_CONV_U8S8_9, 2, 3 }, - bc_test_params{ CASE_CONV_U8S8_10, 2, 3 }, - bc_test_params{ CASE_CONV_S8S8_9, 2, 3 }, - bc_test_params{ CASE_CONV_S8S8_10, 2, 3 }, - - // cases with batch = 32 - bc_test_params{ CASE_CONV_U8S8_11, 2, 3 }, - bc_test_params{ CASE_CONV_U8S8_12, 2, 3 }, - bc_test_params{ CASE_CONV_U8S8_13, 2, 3 }, - bc_test_params{ CASE_CONV_U8S8_14, 2, 3 }, - bc_test_params{ CASE_CONV_S8S8_12, 2, 3 }, - bc_test_params{ CASE_CONV_S8S8_13, 2, 3 }, - bc_test_params{ CASE_CONV_S8S8_14, 2, 3 }, - bc_test_params{ CASE_CONV_S8S8_15, 2, 3 }, -})); - -// Before optimization: eltw_non_linear + eltw_linear -// After optimization: eltw_non_linear -// Limitations: beta = 0 in eltw_linear -// DNNL_VERBOSE log without optimization: attr-post-ops:eltwise_linear:12.75:127.5+eltwise_round+eltwise_linear:2.00784+eltwise_clip:0:512 -// DNNL_VERBOSE log with optimization: attr-post-ops:eltwise_linear:12.75:127.5+eltwise_round:0:0:2.00784+eltwise_clip:0:512 -class post_ops_optimizations_onednn_eltw_non_linear_eltw_linear : public WeightsPrimitiveFusingTestOneDNN {}; -TEST_P(post_ops_optimizations_onednn_eltw_non_linear_eltw_linear, basic) { - auto p = 
GetParam(); - create_topologies( - input_layout("input", get_input_layout(p)), - data("weights", get_mem(get_weights_layout(p))), - data("bias", get_mem(get_bias_layout(p))), - data("in_lo", get_mem(get_single_element_layout(p), -10)), - data("in_hi", get_mem(get_single_element_layout(p), 10)), - data("out_lo", get_mem(get_single_element_layout(p), 0)), - data("out_hi", get_mem(get_single_element_layout(p), 512)), - convolution("conv_prim", "input", { "weights" }, { "bias" }, p.groups, p.stride, p.pad, p.dilation), - quantize("quantize", "conv_prim", "in_lo", "in_hi", "out_lo", "out_hi", 256, data_types::f32), - reorder("reorder_bfyx", "quantize", p.default_format, data_types::f32) - ); - - tolerance = 1.f; - execute(p); -} - -INSTANTIATE_TEST_SUITE_P(fusings_gpu, post_ops_optimizations_onednn_eltw_non_linear_eltw_linear, ::testing::ValuesIn(std::vector{ - // cases with batch = 1 - bc_test_params{ CASE_CONV_U8S8_1, 2, 3 }, - bc_test_params{ CASE_CONV_U8S8_2, 2, 3 }, - bc_test_params{ CASE_CONV_U8S8_3, 2, 3 }, - bc_test_params{ CASE_CONV_S8S8_1, 2, 3 }, - bc_test_params{ CASE_CONV_S8S8_2, 2, 3 }, - bc_test_params{ CASE_CONV_S8S8_3, 2, 3 }, - - // cases with batch = 16 - bc_test_params{ CASE_CONV_U8S8_9, 2, 3 }, - bc_test_params{ CASE_CONV_U8S8_10, 2, 3 }, - bc_test_params{ CASE_CONV_S8S8_9, 2, 3 }, - bc_test_params{ CASE_CONV_S8S8_10, 2, 3 }, - - // cases with batch = 32 - bc_test_params{ CASE_CONV_U8S8_11, 2, 3 }, - bc_test_params{ CASE_CONV_U8S8_12, 2, 3 }, - bc_test_params{ CASE_CONV_U8S8_13, 2, 3 }, - bc_test_params{ CASE_CONV_U8S8_14, 2, 3 }, - bc_test_params{ CASE_CONV_S8S8_12, 2, 3 }, - bc_test_params{ CASE_CONV_S8S8_13, 2, 3 }, - bc_test_params{ CASE_CONV_S8S8_14, 2, 3 }, - bc_test_params{ CASE_CONV_S8S8_15, 2, 3 }, -})); - -// Before optimization: binary_add + eltw_linear -// After optimization: binary_add -// Limitations: alpha = 1 and scale = 1 in eltw_linear; binary_add is a constant compile-time buffer -// DNNL_VERBOSE log without optimization: 
attr-oscale:2 attr-post-ops:binary_add:f32:2+eltwise_linear:1:-127+eltwise_clip:-127:127 -// DNNL_VERBOSE log with optimization: attr-oscale:2 attr-post-ops:binary_add:f32:2+eltwise_clip:-127:127 -class post_ops_optimizations_onednn_binary_add_eltw_linear : public WeightsPrimitiveFusingTestOneDNN {}; -TEST_P(post_ops_optimizations_onednn_binary_add_eltw_linear, basic) { - auto p = GetParam(); - create_topologies( - input_layout("input", get_input_layout(p)), - data("weights", get_mem(get_weights_layout(p))), - data("bias", get_mem(get_bias_layout(p))), - data("in_lo", get_mem(get_per_channel_layout(p), min_random, 0)), - data("in_hi", get_mem(get_per_channel_layout(p), 1, max_random)), - data("out_lo", get_mem(get_single_element_layout(p), -127)), - data("out_hi", get_mem(get_single_element_layout(p), 127)), - convolution("conv_prim", "input", { "weights" }, { "bias" }, p.groups, p.stride, p.pad, p.dilation), - quantize("quantize", "conv_prim", "in_lo", "in_hi", "out_lo", "out_hi", 255, data_types::i8), - reorder("reorder_bfyx", "quantize", p.default_format, data_types::f32) - ); - - tolerance = 1.f; - execute(p); -} - -INSTANTIATE_TEST_SUITE_P(fusings_gpu, post_ops_optimizations_onednn_binary_add_eltw_linear, ::testing::ValuesIn(std::vector{ - // cases with batch = 1 - bc_test_params{ CASE_CONV_U8S8_1, 2, 3 }, - bc_test_params{ CASE_CONV_U8S8_2, 2, 3 }, - bc_test_params{ CASE_CONV_U8S8_3, 2, 3 }, - bc_test_params{ CASE_CONV_S8S8_1, 2, 3 }, - bc_test_params{ CASE_CONV_S8S8_2, 2, 3 }, - bc_test_params{ CASE_CONV_S8S8_3, 2, 3 }, - - // cases with batch = 16 - bc_test_params{ CASE_CONV_U8S8_9, 2, 3 }, - bc_test_params{ CASE_CONV_U8S8_10, 2, 3 }, - bc_test_params{ CASE_CONV_S8S8_9, 2, 3 }, - bc_test_params{ CASE_CONV_S8S8_10, 2, 3 }, - - // cases with batch = 32 - bc_test_params{ CASE_CONV_U8S8_11, 2, 3 }, - bc_test_params{ CASE_CONV_U8S8_12, 2, 3 }, - bc_test_params{ CASE_CONV_U8S8_13, 2, 3 }, - bc_test_params{ CASE_CONV_U8S8_14, 2, 3 }, - bc_test_params{ 
CASE_CONV_S8S8_12, 2, 3 }, - bc_test_params{ CASE_CONV_S8S8_13, 2, 3 }, - bc_test_params{ CASE_CONV_S8S8_14, 2, 3 }, - bc_test_params{ CASE_CONV_S8S8_15, 2, 3 }, -})); - -// Before optimization: binary_mul + eltw_linear -// After optimization: binary_mul -// Limitations: beta = 0 in eltw_linear; binary_mul is a constant compile-time buffer -// DNNL_VERBOSE log without optimization: attr-oscale:2 attr-post-ops:binary_mul:f32:2+eltwise_linear:2.01575+eltwise_clip:0:512 -// DNNL_VERBOSE log with optimization: attr-oscale:2 attr-post-ops:binary_mul:f32:2+eltwise_clip:0:512 -class post_ops_optimizations_onednn_binary_mul_eltw_linear : public WeightsPrimitiveFusingTestOneDNN {}; -TEST_P(post_ops_optimizations_onednn_binary_mul_eltw_linear, basic) { - auto p = GetParam(); - create_topologies( - input_layout("input", get_input_layout(p)), - data("weights", get_mem(get_weights_layout(p))), - data("bias", get_mem(get_bias_layout(p))), - data("eltwise_data", get_mem(get_per_channel_layout(p), -1, 1)), - data("in_lo", get_mem(get_per_channel_layout(p), 0)), - data("in_hi", get_mem(get_per_channel_layout(p), 1, max_random)), - data("out_lo", get_mem(get_single_element_layout(p), 0)), - data("out_hi", get_mem(get_single_element_layout(p), 512)), - convolution("conv_prim", "input", { "weights" }, { "bias" }, p.groups, p.stride, p.pad, p.dilation), - eltwise("eltwise", { "conv_prim", "eltwise_data" }, eltwise_mode::prod), - quantize("quantize", "eltwise", "in_lo", "in_hi", "out_lo", "out_hi", 255, data_types::i8), - reorder("reorder_bfyx", "quantize", p.default_format, data_types::f32) - ); - - tolerance = 1.f; - execute(p); -} - -INSTANTIATE_TEST_SUITE_P(fusings_gpu, post_ops_optimizations_onednn_binary_mul_eltw_linear, ::testing::ValuesIn(std::vector{ - // cases with batch = 1 - bc_test_params{ CASE_CONV_U8S8_1, 2, 4 }, - bc_test_params{ CASE_CONV_U8S8_2, 2, 4 }, - bc_test_params{ CASE_CONV_U8S8_3, 2, 4 }, - bc_test_params{ CASE_CONV_S8S8_1, 2, 4 }, - bc_test_params{ 
CASE_CONV_S8S8_2, 2, 4 }, - bc_test_params{ CASE_CONV_S8S8_3, 2, 4 }, - - // cases with batch = 16 - bc_test_params{ CASE_CONV_U8S8_9, 2, 4 }, - bc_test_params{ CASE_CONV_U8S8_10, 2, 4 }, - bc_test_params{ CASE_CONV_S8S8_9, 2, 4 }, - bc_test_params{ CASE_CONV_S8S8_10, 2, 4 }, - - // cases with batch = 32 - bc_test_params{ CASE_CONV_U8S8_11, 2, 4 }, - bc_test_params{ CASE_CONV_U8S8_12, 2, 4 }, - bc_test_params{ CASE_CONV_U8S8_13, 2, 4 }, - bc_test_params{ CASE_CONV_S8S8_12, 2, 4 }, - bc_test_params{ CASE_CONV_S8S8_13, 2, 4 }, - bc_test_params{ CASE_CONV_S8S8_14, 2, 4 }, -})); - -// Before optimization: o_scale + eltw_linear -// After optimization: o_scale -// Limitations: beta = 0 in eltw_linear -// DNNL_VERBOSE log without optimization: attr-oscale:2 attr-post-ops:eltwise_linear:2.01575+eltwise_clip:0:512 -// DNNL_VERBOSE log with optimization: attr-oscale:2 attr-post-ops:eltwise_clip:0:512 -class post_ops_optimizations_onednn_oscale_eltw_linear : public WeightsPrimitiveFusingTestOneDNN {}; -TEST_P(post_ops_optimizations_onednn_oscale_eltw_linear, basic) { - auto p = GetParam(); - create_topologies( - input_layout("input", get_input_layout(p)), - data("weights", get_mem(get_weights_layout(p))), - data("bias", get_mem(get_bias_layout(p))), - data("in_lo", get_mem(get_per_channel_layout(p), 0)), - data("in_hi", get_mem(get_per_channel_layout(p), 1, max_random)), - data("out_lo", get_mem(get_single_element_layout(p), 0)), - data("out_hi", get_mem(get_single_element_layout(p), 512)), - convolution("conv_prim", "input", { "weights" }, { "bias" }, p.groups, p.stride, p.pad, p.dilation), - quantize("quantize", "conv_prim", "in_lo", "in_hi", "out_lo", "out_hi", 255, data_types::i8), - reorder("reorder_bfyx", "quantize", p.default_format, data_types::f32) - ); - - tolerance = 1.f; - execute(p); -} - -INSTANTIATE_TEST_SUITE_P(fusings_gpu, post_ops_optimizations_onednn_oscale_eltw_linear, ::testing::ValuesIn(std::vector{ - // cases with batch = 1 - bc_test_params{ 
CASE_CONV_U8S8_1, 2, 3 }, - bc_test_params{ CASE_CONV_U8S8_2, 2, 3 }, - bc_test_params{ CASE_CONV_U8S8_3, 2, 3 }, - bc_test_params{ CASE_CONV_S8S8_1, 2, 3 }, - bc_test_params{ CASE_CONV_S8S8_2, 2, 3 }, - bc_test_params{ CASE_CONV_S8S8_3, 2, 3 }, - - // cases with batch = 16 - bc_test_params{ CASE_CONV_U8S8_9, 2, 3 }, - bc_test_params{ CASE_CONV_U8S8_10, 2, 3 }, - bc_test_params{ CASE_CONV_S8S8_9, 2, 3 }, - bc_test_params{ CASE_CONV_S8S8_10, 2, 3 }, - - // cases with batch = 32 - bc_test_params{ CASE_CONV_U8S8_11, 2, 3 }, - bc_test_params{ CASE_CONV_U8S8_12, 2, 3 }, - bc_test_params{ CASE_CONV_U8S8_13, 2, 3 }, - bc_test_params{ CASE_CONV_S8S8_12, 2, 3 }, - bc_test_params{ CASE_CONV_S8S8_13, 2, 3 }, - bc_test_params{ CASE_CONV_S8S8_14, 2, 3 }, -})); - -// Before optimization: eltw_any + sum + eltw_linear -// After optimization: eltw_any + sum -// Limitations: beta = 0 in eltw_linear -// DNNL_VERBOSE log without optimization: attr-post-ops:eltwise_relu+sum:1:0:u8+eltwise_linear:12.7+eltwise_clip:0:127 -// DNNL_VERBOSE log with optimization: attr-post-ops:eltwise_relu:0:0:12.7+sum:12.7:0:u8+eltwise_clip:0:127 -class post_ops_optimizations_onednn_eltw_any_sum_eltw_linear : public WeightsPrimitiveFusingTestOneDNN {}; -TEST_P(post_ops_optimizations_onednn_eltw_any_sum_eltw_linear, basic) { - auto p = GetParam(); - create_topologies( - input_layout("input", get_input_layout(p)), - data("weights", get_mem(get_weights_layout(p))), - data("bias", get_mem(get_bias_layout(p))), - data("in_lo", get_mem(get_single_element_layout(p), 0)), - data("in_hi", get_mem(get_single_element_layout(p), 10)), - data("out_lo", get_mem(get_single_element_layout(p), 0)), - data("out_hi", get_mem(get_single_element_layout(p), 127)), - data("eltwise_data", get_mem(get_output_layout(p))), - convolution("conv_prim", "input", { "weights" }, { "bias" }, p.groups, p.stride, p.pad, p.dilation), - activation("activation", "conv_prim", activation_func::relu_negative_slope), - eltwise("sum", { 
"activation", "eltwise_data" }, eltwise_mode::sum), - quantize("quantize", "sum", "in_lo", "in_hi", "out_lo", "out_hi", 128, data_types::u8), - reorder("reorder_bfyx", "quantize", p.default_format, data_types::f32) - ); - - tolerance = 1.f; - execute(p); -} - -INSTANTIATE_TEST_SUITE_P(fusings_gpu, post_ops_optimizations_onednn_eltw_any_sum_eltw_linear, ::testing::ValuesIn(std::vector{ - // cases with batch = 1 - bc_test_params{ CASE_CONV_U8S8_1, 2, 5 }, - bc_test_params{ CASE_CONV_U8S8_2, 2, 5 }, - bc_test_params{ CASE_CONV_U8S8_3, 2, 5 }, - bc_test_params{ CASE_CONV_S8S8_1, 2, 5 }, - bc_test_params{ CASE_CONV_S8S8_2, 2, 5 }, - bc_test_params{ CASE_CONV_S8S8_3, 2, 5 }, - - // cases with batch = 16 - bc_test_params{ CASE_CONV_U8S8_10, 2, 5 }, - bc_test_params{ CASE_CONV_S8S8_10, 2, 5 }, - - // cases with batch = 32 - bc_test_params{ CASE_CONV_U8S8_11, 2, 5 }, - bc_test_params{ CASE_CONV_U8S8_12, 2, 5 }, - bc_test_params{ CASE_CONV_U8S8_13, 2, 5 }, - bc_test_params{ CASE_CONV_U8S8_14, 2, 5 }, - bc_test_params{ CASE_CONV_S8S8_12, 2, 5 }, - bc_test_params{ CASE_CONV_S8S8_13, 2, 5 }, - bc_test_params{ CASE_CONV_S8S8_14, 2, 5 }, - bc_test_params{ CASE_CONV_S8S8_15, 2, 5 }, -})); - -// Input range uses in 2 cases: not per-tensor output range or out_lo > out_hi -// Here's out_lo > out_hi and no optimizations -// DNNL_VERBOSE log: attr-post-ops:eltwise_linear:12.75:127.5+eltwise_round+eltwise_linear:-1:127 -class post_ops_optimizations_input_range : public WeightsPrimitiveFusingTestOneDNN {}; -TEST_P(post_ops_optimizations_input_range, basic) { - auto p = GetParam(); - create_topologies( - input_layout("input", get_input_layout(p)), - data("weights", get_mem(get_weights_layout(p))), - data("bias", get_mem(get_bias_layout(p))), - data("in_lo", get_mem(get_single_element_layout(p), -10)), - data("in_hi", get_mem(get_single_element_layout(p), 10)), - data("out_lo", get_mem(get_single_element_layout(p), 127)), - data("out_hi", get_mem(get_single_element_layout(p), -128)), - 
convolution("conv_prim", "input", { "weights" }, { "bias" }, p.groups, p.stride, p.pad, p.dilation), - quantize("quantize", "conv_prim", "in_lo", "in_hi", "out_lo", "out_hi", 256, data_types::i8), - reorder("reorder_bfyx", "quantize", p.default_format, data_types::f32) - ); - - tolerance = 1.f; - execute(p); -} - -INSTANTIATE_TEST_SUITE_P(fusings_gpu, post_ops_optimizations_input_range, ::testing::ValuesIn(std::vector{ - // cases with batch = 1 - bc_test_params{ CASE_CONV_U8S8_1, 2, 3 }, - bc_test_params{ CASE_CONV_U8S8_2, 2, 3 }, - bc_test_params{ CASE_CONV_U8S8_3, 2, 3 }, - bc_test_params{ CASE_CONV_S8S8_1, 2, 3 }, - bc_test_params{ CASE_CONV_S8S8_2, 2, 3 }, - bc_test_params{ CASE_CONV_S8S8_3, 2, 3 }, - - // cases with batch = 16 - bc_test_params{ CASE_CONV_U8S8_9, 2, 3 }, - bc_test_params{ CASE_CONV_U8S8_10, 2, 3 }, - bc_test_params{ CASE_CONV_S8S8_9, 2, 3 }, - bc_test_params{ CASE_CONV_S8S8_10, 2, 3 }, - - // cases with batch = 32 - bc_test_params{ CASE_CONV_U8S8_11, 2, 3 }, - bc_test_params{ CASE_CONV_U8S8_12, 2, 3 }, - bc_test_params{ CASE_CONV_U8S8_13, 2, 3 }, - bc_test_params{ CASE_CONV_U8S8_14, 2, 3 }, - bc_test_params{ CASE_CONV_S8S8_12, 2, 3 }, - bc_test_params{ CASE_CONV_S8S8_13, 2, 3 }, - bc_test_params{ CASE_CONV_S8S8_14, 2, 3 }, - bc_test_params{ CASE_CONV_S8S8_15, 2, 3 }, -})); - - -// FC onednn sum case -class fc_int8_inputs_fused_fp32_sum : public WeightsPrimitiveFusingTestOneDNN {}; -TEST_P(fc_int8_inputs_fused_fp32_sum, basic) { - auto p = GetParam(); - auto shift_layout = layout{ p.default_type, p.default_format, tensor{ 1, 1, 1, p.kernel.batch[0] } }; - - create_topologies( - input_layout("input", get_input_layout(p)), - data("weights", get_mem(get_fc_weights_layout(p))), - data("bias", get_mem(get_fc_bias_layout(p))), - data("shift_data", get_mem(shift_layout, 1)), - fully_connected("fc_prim", "input", "weights", "bias", cldnn::data_types::f32, "", padding(), get_fc_output_dim_size(p)), - eltwise("shift", { "fc_prim", "shift_data" }, 
eltwise_mode::sum, cldnn::data_types::f32), - crop("crop", "shift", get_output_layout(p).size, { 0, 0, 0, 0 }), - reorder("reorder_bfyx", "crop", p.default_format, data_types::f32) - ); - - tolerance = 1e-5f; - execute(p); -} - -INSTANTIATE_TEST_SUITE_P(fusings_gpu, fc_int8_inputs_fused_fp32_sum, ::testing::ValuesIn(std::vector{ - // OneDNN has issue with small shapes - ticket 7064 - // bc_test_params{ CASE_FC_U8S8_3D_1, 2, 4 }, - // bc_test_params{ CASE_FC_U8S8_3D_2, 2, 4 }, - bc_test_params{ CASE_FC_U8S8_3D_4, 2, 4 }, -})); -#endif - - -// reorder(bfyx to fs_b_yx_fsv32) + conv -#define FSV32_CASE_CONV_FP32_1 { 1, 32, 4, 5 }, { 1, 32, 2, 3 }, { 1, 1, 3, 3 }, tensor{ 1 }, tensor{ 0 }, tensor{ 1 }, 1, data_types::f32, format::bfyx, data_types::f32, format::oiyx, data_types::f32, format::bfyx - -class conv_fp32_reorder_bfyx_to_fsv32_conv_basic : public ConvFusingTest {}; -TEST_P(conv_fp32_reorder_bfyx_to_fsv32_conv_basic, basic) { - auto p = GetParam(); - - create_topologies( - input_layout("input", get_input_layout(p)), - data("weights", get_mem(get_weights_layout(p), -127, 127)), - reorder("reorder_fsv32", "input", format::fs_b_yx_fsv32, data_types::f32), - convolution("conv_output", "reorder_fsv32", { "weights" }, 1, tensor{ 0, 0, 1, 1 }, p.pad, p.dilation), - activation("activation", "conv_output", activation_func::abs) - ); - - implementation_desc conv_impl = { format::fs_b_yx_fsv32, "" }; - bo_fused.set_option(build_option::force_implementations({ { "conv_output", conv_impl } })); - - execute(p); -} -INSTANTIATE_TEST_SUITE_P(fusings_gpu, conv_fp32_reorder_bfyx_to_fsv32_conv_basic, ::testing::ValuesIn(std::vector{ - bc_test_params{ FSV32_CASE_CONV_FP32_1, 3, 3 } -})); - - -class conv_fp32_reorder_bfyx_to_fsv32_conv_mean : public ConvFusingTest {}; -TEST_P(conv_fp32_reorder_bfyx_to_fsv32_conv_mean, have_mean) { - auto p = GetParam(); - memory::ptr mul = engine.allocate_memory({ data_types::f32, format::bfyx, tensor{ 1, 3, 1, 2 } }); - set_values(mul, { 0.5f, 
2.5f, -5.0f, 4.3f, 1.2f, -3.5f }); - - create_topologies( - input_layout("input", get_input_layout(p)), - data("mul", mul), - data("weights", get_mem(get_weights_layout(p), -127, 127)), - reorder("reorder_fsv32", "input", format::fs_b_yx_fsv32, data_types::f32, "mul", reorder_mean_mode::mul), - convolution("conv_output", "reorder_fsv32", { "weights" }, 1, tensor{ 0, 0, 1, 1 }, p.pad, p.dilation), - activation("activation", "conv_output", activation_func::abs) - ); - - implementation_desc conv_impl = { format::fs_b_yx_fsv32, "" }; - bo_fused.set_option(build_option::force_implementations({ { "conv_output", conv_impl } })); - - execute(p); -} -INSTANTIATE_TEST_SUITE_P(fusings_gpu, conv_fp32_reorder_bfyx_to_fsv32_conv_mean, ::testing::ValuesIn(std::vector{ - bc_test_params{ FSV32_CASE_CONV_FP32_1, 3, 3 } -})); - - -class conv_fp32_reorder_bfyx_to_fsv32_conv_subtract : public ConvFusingTest {}; -TEST_P(conv_fp32_reorder_bfyx_to_fsv32_conv_subtract, have_subtract_per_feature) { - auto p = GetParam(); - const std::vector& values_to_subtract = { - 0.1f, 0.2f, 0.1f, 0.1f, 0.1f, 0.2f, 0.1f, 0.1f, - 0.1f, 0.2f, 0.1f, 0.1f, 0.1f, 0.2f, 0.1f, 0.1f, - 0.1f, 0.2f, 0.1f, 0.1f, 0.1f, 0.2f, 0.1f, 0.1f, - 0.1f, 0.2f, 0.1f, 0.1f, 0.1f, 0.2f, 0.1f, 0.1f - }; - - auto dw_tensor = cldnn::tensor(group(p.out_shape.feature[0]), batch(1), feature(1), spatial(3, 3)); - auto dw_weights_layout = layout{ p.default_type, format::goiyx, dw_tensor }; - auto dw_stride = tensor{ 0, 0, 1, 1 }; - - create_topologies( - input_layout("input", get_input_layout(p)), - data("weights", get_mem(get_weights_layout(p), -127, 127)), - data("weights_dw", get_mem(dw_weights_layout, -127, 127)), - convolution("conv_prim", "input", { "weights" }, p.groups, p.stride, p.pad, p.dilation), - reorder("reorder_fsv32", "conv_prim", format::fs_b_yx_fsv32, data_types::f32, values_to_subtract), - convolution("conv_output", "reorder_fsv32", { "weights_dw" }, p.out_shape.feature[0], dw_stride, p.pad, p.dilation), - 
activation("activation", "conv_output", activation_func::abs) - ); - - implementation_desc conv_impl = { format::fs_b_yx_fsv32, "" }; - bo_fused.set_option(build_option::force_implementations({ { "conv_output", conv_impl } })); - - execute(p); -} -INSTANTIATE_TEST_SUITE_P(fusings_gpu, conv_fp32_reorder_bfyx_to_fsv32_conv_subtract, ::testing::ValuesIn(std::vector{ - bc_test_params{ FSV32_CASE_CONV_FP32_1, 4, 4 } -})); - - -class conv_fp32_reorder_bfyx_to_fsv32_conv_fused_activation : public ConvFusingTest {}; -TEST_P(conv_fp32_reorder_bfyx_to_fsv32_conv_fused_activation, have_fused_activation) { - auto p = GetParam(); - - auto dw_tensor = cldnn::tensor(group(p.out_shape.feature[0]), batch(1), feature(1), spatial(3, 3)); - auto dw_weights_layout = layout{ p.default_type, format::goiyx, dw_tensor }; - auto dw_stride = tensor{ 0, 0, 1, 1 }; - - create_topologies( - input_layout("input", get_input_layout(p)), - data("weights", get_mem(get_weights_layout(p), -127, 127)), - data("weights_dw", get_mem(dw_weights_layout, -127, 127)), - convolution("conv_prim", "input", { "weights" }, p.groups, p.stride, p.pad, p.dilation), - reorder("reorder_fsv32", "conv_prim", format::fs_b_yx_fsv32, data_types::f32), - activation("activation_quantize", "reorder_fsv32", activation_func::relu), - convolution("conv_output", "activation_quantize", { "weights_dw" }, p.out_shape.feature[0], dw_stride, p.pad, p.dilation), - activation("activation", "conv_output", activation_func::abs) - ); - - implementation_desc conv_impl = { format::fs_b_yx_fsv32, "" }; - bo_fused.set_option(build_option::force_implementations({ { "conv_output", conv_impl } })); - - execute(p); -} -INSTANTIATE_TEST_SUITE_P(fusings_gpu, conv_fp32_reorder_bfyx_to_fsv32_conv_fused_activation, ::testing::ValuesIn(std::vector{ - bc_test_params{ FSV32_CASE_CONV_FP32_1, 4, 5 } -})); - - -class conv_fp32_reorder_bfyx_to_fsv32_conv_data_padding : public ConvFusingTest {}; -TEST_P(conv_fp32_reorder_bfyx_to_fsv32_conv_data_padding, 
have_data_padding) { - auto p = GetParam(); - - auto dw_tensor = cldnn::tensor(group(p.out_shape.feature[0]), batch(1), feature(1), spatial(3, 3)); - auto dw_weights_layout = layout{ p.default_type, format::goiyx, dw_tensor }; - auto dw_stride = tensor{ 0, 0, 1, 1 }; - - create_topologies( - input_layout("input", get_input_layout(p)), - data("weights", get_mem(get_weights_layout(p), -127, 127)), - data("weights_dw", get_mem(dw_weights_layout, -127, 127)), - convolution("conv_prim", "input", { "weights" }, p.groups, p.stride, p.pad, p.dilation), - reorder("reorder_fsv32", "conv_prim", layout(data_types::f32, format::fs_b_yx_fsv32, dw_tensor, padding{ { 0, 0, 1, 1 }, 0 })), - convolution("conv_output", "reorder_fsv32", { "weights_dw" }, p.out_shape.feature[0], dw_stride, p.pad, p.dilation), - activation("activation", "conv_output", activation_func::abs), - activation("activation2", "conv_prim", activation_func::abs), - eltwise("add_bias", { "activation", "activation2" }, eltwise_mode::sum) - ); - - implementation_desc conv_impl = { format::fs_b_yx_fsv32, "" }; - bo_fused.set_option(build_option::force_implementations({ { "conv_output", conv_impl } })); - - execute(p); -} -INSTANTIATE_TEST_SUITE_P(fusings_gpu, conv_fp32_reorder_bfyx_to_fsv32_conv_data_padding, ::testing::ValuesIn(std::vector{ - bc_test_params{ FSV32_CASE_CONV_FP32_1, 5, 6 } -})); From bd3a996239a0aeeb911e80de2b4b084b6e56fae8 Mon Sep 17 00:00:00 2001 From: Mikhail Nosov Date: Tue, 11 Jan 2022 13:13:37 +0300 Subject: [PATCH 72/78] [OV2.0] Preprocessing: support convert HWC->NCHW layout (#9540) * Initial implementation * Template reference tests * cpu & gpu tests for HWC -> NCHW conversion --- .../subgraph_reference/preprocess.cpp | 65 ++++++++ src/core/src/layout.cpp | 139 +++++++++++++++++- src/core/src/layout_utils.hpp | 16 ++ src/core/src/preprocess/pre_post_process.cpp | 13 +- .../src/preprocess/preprocess_steps_impl.cpp | 21 ++- src/core/tests/preprocess.cpp | 87 +++++++++++ 
.../subgraph_tests/preprocess.cpp | 1 + .../preprocess/preprocess_builders.hpp | 11 ++ 8 files changed, 347 insertions(+), 6 deletions(-) diff --git a/docs/template_plugin/tests/functional/subgraph_reference/preprocess.cpp b/docs/template_plugin/tests/functional/subgraph_reference/preprocess.cpp index ec7f5d74cf5..d1840156bdd 100644 --- a/docs/template_plugin/tests/functional/subgraph_reference/preprocess.cpp +++ b/docs/template_plugin/tests/functional/subgraph_reference/preprocess.cpp @@ -430,6 +430,68 @@ static RefPreprocessParams convert_layout_nhwc_to_nchw() { return res; } +static RefPreprocessParams convert_layout_nhwc_to_nchw_fully_dynamic() { + RefPreprocessParams res("convert_layout_nhwc_to_nchw_fully_dynamic"); + res.function = []() { + auto f = create_simple_function(element::u8, PartialShape::dynamic()); + f->get_parameters()[0]->set_layout("NCHW"); + + auto p = PrePostProcessor(f); + p.input().tensor().set_layout("NHWC"); + p.input().preprocess().convert_layout("NCHW"); + p.build(); + return f; + }; + res.inputs.emplace_back(element::u8, Shape{1, 2, 2, 3}, std::vector{1, 2, 3, // [H=0, W=0, RGB] + 4, 5, 6, // [H=0, W=1] + 7, 8, 9, // [H=1, W=0] + 10, 11, 12}); // [H=1, W=1] + res.expected.emplace_back(Shape{1, 3, 2, 2}, element::u8, std::vector{1, 4, 7, 10, // R + 2, 5, 8, 11, // G + 3, 6, 9, 12}); // B + return res; +} + +static RefPreprocessParams convert_layout_hwc_to_nchw() { + RefPreprocessParams res("convert_layout_hwc_to_nchw"); + res.function = []() { + auto f = create_simple_function(element::f32, {Dimension::dynamic(), 3, 2, 2}); + auto p = PrePostProcessor(f); + p.input().tensor().set_layout("HWC").set_element_type(element::u8); + p.input().model().set_layout("NCHW"); + p.build(); + return f; + }; + res.inputs.emplace_back(Shape{2, 2, 3}, element::u8, std::vector{1, 2, 3, // [H=0, W=0, RGB] + 4, 5, 6, // [H=0, W=1] + 7, 8, 9, // [H=1, W=0] + 10, 11, 12}); // [H=1, W=1] + res.expected.emplace_back(Shape{1, 3, 2, 2}, element::f32, 
std::vector{1, 4, 7, 10, // R + 2, 5, 8, 11, // G + 3, 6, 9, 12}); // B + return res; +} + +static RefPreprocessParams convert_layout_hwc_to_nchw_fully_dynamic() { + RefPreprocessParams res("convert_layout_hwc_to_nchw_fully_dynamic"); + res.function = []() { + auto f = create_simple_function(element::f32, PartialShape::dynamic()); + auto p = PrePostProcessor(f); + p.input().tensor().set_layout("HWC").set_element_type(element::u8); + p.input().model().set_layout("NCHW"); + p.build(); + return f; + }; + res.inputs.emplace_back(element::u8, Shape{2, 2, 3}, std::vector{1, 2, 3, // [H=0, W=0, RGB] + 4, 5, 6, // [H=0, W=1] + 7, 8, 9, // [H=1, W=0] + 10, 11, 12}); // [H=1, W=1] + res.expected.emplace_back(Shape{1, 3, 2, 2}, element::f32, std::vector{1, 4, 7, 10, // R + 2, 5, 8, 11, // G + 3, 6, 9, 12}); // B + return res; +} + static RefPreprocessParams convert_layout_nhwc_to_net_no_tensor_shape() { RefPreprocessParams res("convert_layout_nhwc_to_net_no_tensor_shape"); res.function = []() { @@ -1056,10 +1118,13 @@ std::vector allPreprocessTests() { resize_to_network_width_height(), resize_to_specified_width_height(), convert_layout_nhwc_to_nchw(), + convert_layout_nhwc_to_nchw_fully_dynamic(), convert_layout_nhwc_to_net_no_tensor_shape(), convert_layout_by_dims(), convert_layout_by_dims_multi(), convert_layout_by_dims_multi_layout(), + convert_layout_hwc_to_nchw(), + convert_layout_hwc_to_nchw_fully_dynamic(), resize_and_convert_layout(), convert_color_nv12_to_bgr_two_planes(), convert_color_nv12_single_plane(), diff --git a/src/core/src/layout.cpp b/src/core/src/layout.cpp index 109f1294c28..0f867f308c5 100644 --- a/src/core/src/layout.cpp +++ b/src/core/src/layout.cpp @@ -245,6 +245,12 @@ public: static std::vector find_permutation(const Layout& src_layout, const PartialShape& src_shape, const Layout& dst_layout); + static std::tuple find_squeeze(const Layout& src_layout, + const PartialShape& src_shape, + const Layout& dst_layout); + static std::tuple 
find_unsqueeze(const Layout& src_layout, + const PartialShape& src_shape, + const Layout& dst_layout); }; Layout LayoutUtils::apply_permutation(const Layout& src_layout, const std::vector& dims) { @@ -335,7 +341,10 @@ std::vector LayoutUtils::find_permutation(const Layout& src_layout, auto src_static = to_static(src_layout, rank); auto dst_static = to_static(dst, rank); OPENVINO_ASSERT(src_static.m_left_size == dst_static.m_left_size, - "Conversion is not supported for layouts with different sizes"); + "Conversion is not supported for layouts with different sizes, ", + src_layout.to_string(), + " <-> ", + dst.to_string()); OPENVINO_ASSERT(rank.is_dynamic() || src_static.m_left_size == rank.get_length(), "Conversion layout ", src_layout.to_string(), @@ -393,6 +402,121 @@ std::vector LayoutUtils::find_permutation(const Layout& src_layout, return check_trivial(res); } +std::tuple LayoutUtils::find_squeeze(const Layout& src_layout, + const PartialShape& src_shape, + const Layout& dst_layout) { + if (src_layout.m_dynamic || dst_layout.m_dynamic || src_layout.m_left_size <= dst_layout.m_left_size) { + return {src_shape, src_layout}; + } + + // Don't allow conversions like model_layout=NC??, tensor_layout=HWC + // Though in future such conversions may be possible to implement + OPENVINO_ASSERT(src_layout.m_left_size == src_layout.m_index_map.size(), + "Layout conversion ", + dst_layout.to_string(), + " <-> ", + src_layout.to_string(), + " is not supported. Please use fully specified model layout, current is ", + src_layout.to_string()); + + // Don't allow conversions like model_layout=NCHW, tensor_layout=?HW + OPENVINO_ASSERT(dst_layout.m_left_size == dst_layout.m_index_map.size(), + "Layout conversion ", + dst_layout.to_string(), + " <-> ", + src_layout.to_string(), + " is not supported. 
Please use fully specified tensor layout, current is ", + dst_layout.to_string()); + + bool rank_dynamic = src_shape.rank().is_dynamic(); + OPENVINO_ASSERT(rank_dynamic || src_shape.rank().get_length() == src_layout.m_left_size, + "Model input layout ", + src_layout.to_string(), + " is inconsistent with input shape ", + src_shape, + ". Layout and shape shall have same rank, got ", + src_layout.m_left_size, + " != ", + src_shape.rank().get_length()); + // At this point src_layout and dst_layout don't have '...' or '?' + std::vector res_dims(dst_layout.m_left_size); + Layout res; + res.m_dynamic = false; + res.m_left_size = dst_layout.m_left_size; + int64_t dst_idx = 0; + for (int64_t src_idx = 0; src_idx < src_layout.m_left_size; src_idx++) { + auto src_dim_name = src_layout.m_index_map.at(src_idx); + if (dst_layout.has_name(src_dim_name)) { + if (!rank_dynamic) { + res_dims[dst_idx] = src_shape[src_idx]; + } + res.m_index_map[dst_idx] = src_dim_name; + res.m_names[src_dim_name] = dst_idx; + dst_idx++; + } + } + if (dst_idx != dst_layout.m_left_size) { + std::stringstream missing_names; + missing_names << "( "; + for (const auto& dst_item : dst_layout.m_names) { + const auto& key = dst_item.first; + if (!res.m_names.count(key)) { + missing_names << "'" << key << "' "; + } + } + missing_names << ")"; + OPENVINO_ASSERT(dst_idx == dst_layout.m_left_size, + "Layout conversion failed. Tensor layout", + dst_layout.to_string(), + " has dimensions missing in model layout ", + src_layout.to_string(), + ". 
Missing dimensions are ", + missing_names.str()); + } + if (rank_dynamic) { + return {PartialShape::dynamic(), res}; + } else { + return {PartialShape(res_dims), res}; + } +} + +std::tuple LayoutUtils::find_unsqueeze(const Layout& src_layout, + const PartialShape& src_shape, + const Layout& dst_layout) { + if (src_layout.m_dynamic || dst_layout.m_dynamic || src_layout.m_left_size >= dst_layout.m_left_size) { + return {src_shape, src_layout, {}}; + } + + // find_squeeze already performed necessary validation, no need to repeat here + bool rank_dynamic = src_shape.rank().is_dynamic(); + auto dims_cnt = dst_layout.m_left_size - src_layout.m_left_size; + std::vector res_dims(dst_layout.m_left_size, 1); + Layout res; + res.m_dynamic = false; + res.m_left_size = dst_layout.m_left_size; + int64_t unset_idx = 0; + for (auto i = 0; i < dst_layout.m_left_size; i++) { + auto dim_name = dst_layout.m_index_map.at(i); + if (src_layout.has_name(dim_name)) { + auto src_idx = src_layout.get_index_by_name(dim_name); + res.m_names[dim_name] = src_idx + dims_cnt; + res.m_index_map[src_idx + dims_cnt] = dim_name; + if (!rank_dynamic) { + res_dims[src_idx + dims_cnt] = src_shape[src_idx]; + } + } else { + res.m_names[dim_name] = unset_idx; + res.m_index_map[unset_idx] = dim_name; + unset_idx++; + } + } + if (rank_dynamic) { + return {PartialShape::dynamic(), res, dims_cnt}; + } else { + return {PartialShape(res_dims), res, dims_cnt}; + } +} + namespace layout { namespace utils { Layout apply_permutation(const Layout& src_layout, const std::vector& dims) { @@ -404,6 +528,19 @@ std::vector find_permutation(const Layout& src_layout, const Layout& dst_layout) { return LayoutUtils::find_permutation(src_layout, src_shape, dst_layout); } + +std::tuple find_squeeze(const Layout& src_layout, + const PartialShape& src_shape, + const Layout& dst_layout) { + return LayoutUtils::find_squeeze(src_layout, src_shape, dst_layout); +} + +std::tuple find_unsqueeze(const Layout& src_layout, + const 
PartialShape& src_shape, + const Layout& dst_layout) { + return LayoutUtils::find_unsqueeze(src_layout, src_shape, dst_layout); +} + } // namespace utils // Helper functions diff --git a/src/core/src/layout_utils.hpp b/src/core/src/layout_utils.hpp index c6d9e8e32da..9df48fb48b9 100644 --- a/src/core/src/layout_utils.hpp +++ b/src/core/src/layout_utils.hpp @@ -12,6 +12,22 @@ namespace ov { namespace layout { namespace utils { +// Example is NCHW to HWC. Need to calculate user's shape from (?, 3, 480, 640) to (480, 640, 3) +// src_layout shall be 'bigger' than 'dst_layout' +// Returns shape and layout after 'squeeze' (CHW). Next step will be to apply "find_permutation" CHW->HWC +std::tuple find_squeeze(const Layout& src_layout, + const PartialShape& src_shape, + const Layout& dst_layout); + +// Example is HWC to NCDHW. Needs also to calculate user's shape from (480, 640, 3) to (1, 3, 1, 480, 640) +// src_layout shall be 'smaller' than 'dst_layout' +// Returns shape, layout and number of axis for unsqueeze after 'unsqueeze'. +// In this example, function will return: Shape {1,1,480,640,3}, Layout "NDCHW", axis=2 +// Next step will be to apply "find_permutation" NDCHW->NCDHW +std::tuple find_unsqueeze(const Layout& src_layout, + const PartialShape& src_shape, + const Layout& dst_layout); + std::vector find_permutation(const Layout& src_layout, const PartialShape& src_shape, const Layout& dst_layout); diff --git a/src/core/src/preprocess/pre_post_process.cpp b/src/core/src/preprocess/pre_post_process.cpp index 090708274d2..34af0657458 100644 --- a/src/core/src/preprocess/pre_post_process.cpp +++ b/src/core/src/preprocess/pre_post_process.cpp @@ -397,10 +397,15 @@ std::shared_ptr PrePostProcessor::build() { } if (input->get_tensor_data()->is_layout_set() && !param->get_layout().empty() && param->get_layout() != input->get_tensor_data()->get_layout()) { + auto sq_layout = Layout(); + // Find if some squeeze is needed between model and tensor + // E.g. 
model=NCHW, tensor=HWC + std::tie(new_param_shape, sq_layout) = + layout::utils::find_squeeze(param->get_layout(), net_shape, input->get_tensor_data()->get_layout()); // Find transpose between model and tensor layouts and update tensor shape auto net_to_tensor = - layout::utils::find_permutation(param->get_layout(), net_shape, input->get_tensor_data()->get_layout()); - if (!net_to_tensor.empty()) { + layout::utils::find_permutation(sq_layout, new_param_shape, input->get_tensor_data()->get_layout()); + if (!net_to_tensor.empty() && new_param_shape.rank().is_static()) { std::vector dims(new_param_shape.size()); std::transform(net_to_tensor.begin(), net_to_tensor.end(), dims.begin(), [&](int64_t v) { return new_param_shape[v]; @@ -525,7 +530,9 @@ std::shared_ptr PrePostProcessor::build() { "Resulting shape '", node.get_partial_shape(), "' after preprocessing is not aligned with original parameter's shape: ", - param->get_partial_shape()); + param->get_partial_shape(), + ", input parameter: ", + param->get_friendly_name()); // Replace parameter for (auto consumer : consumers) { diff --git a/src/core/src/preprocess/preprocess_steps_impl.cpp b/src/core/src/preprocess/preprocess_steps_impl.cpp index 39a6ca1ee03..6e8d8b4bbfa 100644 --- a/src/core/src/preprocess/preprocess_steps_impl.cpp +++ b/src/core/src/preprocess/preprocess_steps_impl.cpp @@ -178,7 +178,24 @@ void PreStepsList::add_convert_layout_impl(const Layout& layout) { "Can't convert layout for multi-plane input. Suggesting to convert current image to " "RGB/BGR color format using 'convert_color'"); Layout dst_layout = layout.empty() ? 
context.target_layout() : layout; - auto permutation = layout::utils::find_permutation(context.layout(), nodes[0].get_partial_shape(), dst_layout); + auto node = nodes[0]; + auto shape = node.get_partial_shape(); + size_t add_cnt; + Layout unsqueeze_layout; + std::tie(shape, unsqueeze_layout, add_cnt) = layout::utils::find_unsqueeze(context.layout(), shape, dst_layout); + if (add_cnt) { + std::vector dims; + dims.push_back(add_cnt); + Shape const_shape(dims); + std::vector vals(add_cnt); + for (auto i = 0; i < add_cnt; i++) { + vals[i] = i; + } + auto axes = op::v0::Constant::create(element::i64, const_shape, vals); + // Add unsqueeze on top + node = std::make_shared(node, axes); + } + auto permutation = layout::utils::find_permutation(unsqueeze_layout, shape, dst_layout); if (permutation.empty()) { // No transpose is needed, just update layout if (!layout.empty()) { @@ -187,7 +204,7 @@ void PreStepsList::add_convert_layout_impl(const Layout& layout) { return std::make_tuple(nodes, false); } auto perm_constant = op::v0::Constant::create(element::i64, Shape{permutation.size()}, permutation); - auto transpose = std::make_shared(nodes[0], perm_constant); + auto transpose = std::make_shared(node, perm_constant); context.layout() = dst_layout; // Update context's current layout // return false to avoid excess function revalidations as layout conversion // doesn't require shape or type propagation. 
diff --git a/src/core/tests/preprocess.cpp b/src/core/tests/preprocess.cpp index eb1bb522bc7..2fbbef7d2e4 100644 --- a/src/core/tests/preprocess.cpp +++ b/src/core/tests/preprocess.cpp @@ -967,6 +967,93 @@ TEST(pre_post_process, preprocess_convert_layout_partially_defined_trivial) { EXPECT_EQ(ops_num, f->get_ordered_ops().size()); } +TEST(pre_post_process, preprocess_convert_layout_squeeze) { + auto f = create_n_inputs<3>(element::f32, Shape{1, 3, 1, 480, 640}); + auto p = PrePostProcessor(f); + + p.input(0).tensor().set_layout("HWC"); + p.input(0).model().set_layout("NCDHW"); + + p.input(1).tensor().set_layout("NHWC"); + p.input(1).model().set_layout("NCDHW"); + + p.input(2).tensor().set_layout("WCHD"); + p.input(2).model().set_layout("NCDHW"); + + p.build(); + EXPECT_EQ(ov::layout::get_layout(f->input(0)), "HWC"); + EXPECT_EQ(f->input(0).get_partial_shape(), (PartialShape{480, 640, 3})); + EXPECT_EQ(ov::layout::get_layout(f->input(1)), "NHWC"); + EXPECT_EQ(f->input(1).get_partial_shape(), (PartialShape{1, 480, 640, 3})); + EXPECT_EQ(ov::layout::get_layout(f->input(2)), "WCHD"); + EXPECT_EQ(f->input(2).get_partial_shape(), (PartialShape{640, 3, 480, 1})); +} + +TEST(pre_post_process, preprocess_convert_layout_squeeze_dynamic) { + auto f = create_n_inputs<2>(element::f32, PartialShape{Dimension::dynamic(), 3, 1, 480, 640}); + auto p = PrePostProcessor(f); + + p.input(0).tensor().set_layout("HWC"); + p.input(0).model().set_layout("NCDHW"); + + p.input(1).tensor().set_layout("NHWC"); + p.input(1).model().set_layout("NCDHW"); + + p.build(); + EXPECT_EQ(ov::layout::get_layout(f->input(0)), "HWC"); + EXPECT_EQ(f->input(0).get_partial_shape(), (PartialShape{480, 640, 3})); + EXPECT_EQ(ov::layout::get_layout(f->input(1)), "NHWC"); + EXPECT_EQ(f->input(1).get_partial_shape(), (PartialShape{Dimension::dynamic(), 480, 640, 3})); +} + +TEST(pre_post_process, preprocess_convert_layout_squeeze_unsupported) { + auto f = create_n_inputs<1>(element::f32, 
PartialShape{Dimension::dynamic(), 3, 1, 480, 640}); + EXPECT_THROW( + { + auto p = PrePostProcessor(f); + p.input(0).tensor().set_layout("NCDHWS"); + p.input(0).model().set_layout("NCDHW"); + p.build(); + }, + ov::AssertFailure); + + EXPECT_THROW( + { + auto p = PrePostProcessor(f); + p.input(0).tensor().set_layout("HWC"); + p.input(0).model().set_layout("?????"); + p.build(); + }, + ov::AssertFailure); + + EXPECT_THROW( + { + auto p = PrePostProcessor(f); + p.input(0).tensor().set_layout("...S"); + p.input(0).model().set_layout("NCDHW"); + p.build(); + }, + ov::AssertFailure); + + EXPECT_THROW( + { + auto p = PrePostProcessor(f); + p.input(0).tensor().set_layout("HWC"); + p.input(0).model().set_layout("...NCDHW"); + p.build(); + }, + ov::AssertFailure); + + EXPECT_THROW( + { + auto p = PrePostProcessor(f); + p.input(0).tensor().set_layout("HW?"); + p.input(0).model().set_layout("NCDHW"); + p.build(); + }, + ov::AssertFailure); +} + TEST(pre_post_process, preprocess_convert_layout_partially_defined_error) { auto f = create_simple_function(element::f32, Shape{1, 2, 3, 4, 5}); diff --git a/src/tests/functional/plugin/gpu/shared_tests_instances/subgraph_tests/preprocess.cpp b/src/tests/functional/plugin/gpu/shared_tests_instances/subgraph_tests/preprocess.cpp index af253628e41..f092c192fa0 100644 --- a/src/tests/functional/plugin/gpu/shared_tests_instances/subgraph_tests/preprocess.cpp +++ b/src/tests/functional/plugin/gpu/shared_tests_instances/subgraph_tests/preprocess.cpp @@ -30,6 +30,7 @@ inline std::vector GPU_smoke_preprocess_functions() { preprocess_func(resize_cubic, "resize_cubic", 0.01f), preprocess_func(resize_dynamic, "resize_dynamic", 0.01f, { ov::Shape {1, 3, 123, 123} }), preprocess_func(convert_layout_by_dims, "convert_layout_by_dims", 0.01f), + preprocess_func(convert_layout_hwc_to_nchw, "convert_layout_hwc_to_nchw", 0.01f), preprocess_func(resize_and_convert_layout, "resize_and_convert_layout", 0.01f), 
preprocess_func(cvt_color_nv12_to_rgb_single_plane, "cvt_color_nv12_to_rgb_single_plane", 1.f), preprocess_func(cvt_color_nv12_to_bgr_two_planes, "cvt_color_nv12_to_bgr_two_planes", 1.f), diff --git a/src/tests/ngraph_helpers/ngraph_functions/include/ngraph_functions/preprocess/preprocess_builders.hpp b/src/tests/ngraph_helpers/ngraph_functions/include/ngraph_functions/preprocess/preprocess_builders.hpp index cb8b41ba24d..5a8975c213c 100644 --- a/src/tests/ngraph_helpers/ngraph_functions/include/ngraph_functions/preprocess/preprocess_builders.hpp +++ b/src/tests/ngraph_helpers/ngraph_functions/include/ngraph_functions/preprocess/preprocess_builders.hpp @@ -302,6 +302,16 @@ inline std::shared_ptr convert_layout_by_dims() { return function; } +inline std::shared_ptr convert_layout_hwc_to_nchw() { + using namespace ov::preprocess; + auto function = create_preprocess_1input(element::f32, PartialShape{1, 3, 30, 20}); + auto p = PrePostProcessor(function); + p.input().tensor().set_layout("HWC").set_element_type(element::u8); + p.input().model().set_layout("NCHW"); + function = p.build(); + return function; +} + inline std::shared_ptr resize_and_convert_layout_i8() { using namespace ov::preprocess; auto function = create_preprocess_1input(element::i8, PartialShape{1, 30, 20, 3}); @@ -413,6 +423,7 @@ inline std::vector generic_preprocess_functions() { preprocess_func(resize_cubic, "resize_cubic", 0.01f), preprocess_func(resize_dynamic, "resize_dynamic", 0.01f, { Shape {1, 3, 123, 123} }), preprocess_func(convert_layout_by_dims, "convert_layout_by_dims", 0.01f), + preprocess_func(convert_layout_hwc_to_nchw, "convert_layout_hwc_to_nchw", 0.01f), preprocess_func(resize_and_convert_layout, "resize_and_convert_layout", 0.01f), preprocess_func(resize_and_convert_layout_i8, "resize_and_convert_layout_i8", 0.01f), preprocess_func(cvt_color_nv12_to_rgb_single_plane, "cvt_color_nv12_to_rgb_single_plane", 1.f), From acdbbf4363f06505974dea5800419ebda747fd1b Mon Sep 17 00:00:00 2001 
From: Vladimir Zinoviev Date: Tue, 11 Jan 2022 13:25:36 +0300 Subject: [PATCH 73/78] [LPT] fix build (#9566) --- .../low_precision_transformations/src/quantization_details.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/common/low_precision_transformations/src/quantization_details.cpp b/src/common/low_precision_transformations/src/quantization_details.cpp index cec290ae3c6..ada3df7e1ca 100644 --- a/src/common/low_precision_transformations/src/quantization_details.cpp +++ b/src/common/low_precision_transformations/src/quantization_details.cpp @@ -163,6 +163,7 @@ bool QuantizationDetails::empty() const noexcept { } bool QuantizationDetails::isSupportedLevel(const size_t level) { + using ngraph::pass::low_precision::levels; static const std::unordered_set supported_levels = { levels::int4, levels::int4_narrow_range, levels::int8, levels::int8_narrow_range, From ebcd9eaf0732e06a9d9860080d9104f2fc9c0daa Mon Sep 17 00:00:00 2001 From: Vladimir Gavrilov Date: Tue, 11 Jan 2022 15:20:24 +0300 Subject: [PATCH 74/78] Fixed conversion of some models with (I)DFT when a layer immediately before (I)DFT is a producer for Result (#9489) * Fix in the transformation PreserveRuntimeInfo: now Transpose is inserted before input port 0 of Result only, not after data node of layer before Result layer. * Deleted commented code. * Added more tests for the MO transformation PreserveRuntimeInfo. 
--- .../tools/mo/middle/PreserveRuntimeInfo.py | 2 +- .../mo/middle/PreserveRuntimeInfo_test.py | 126 +++++++++++++++++- 2 files changed, 125 insertions(+), 3 deletions(-) diff --git a/tools/mo/openvino/tools/mo/middle/PreserveRuntimeInfo.py b/tools/mo/openvino/tools/mo/middle/PreserveRuntimeInfo.py index a3ba0657f05..7bdb1c4f02f 100644 --- a/tools/mo/openvino/tools/mo/middle/PreserveRuntimeInfo.py +++ b/tools/mo/openvino/tools/mo/middle/PreserveRuntimeInfo.py @@ -118,7 +118,7 @@ class PreserveRuntimeInfo(MiddleReplacementPattern): transpose.name = in_node.name in_node.name += "/prev" - prev_node_out_port.get_connection().insert_node(transpose) + op.in_port(0).get_connection().insert_node(transpose) else: continue diff --git a/tools/mo/unit_tests/mo/middle/PreserveRuntimeInfo_test.py b/tools/mo/unit_tests/mo/middle/PreserveRuntimeInfo_test.py index 7a39ae62bf4..ac5f822b4ed 100644 --- a/tools/mo/unit_tests/mo/middle/PreserveRuntimeInfo_test.py +++ b/tools/mo/unit_tests/mo/middle/PreserveRuntimeInfo_test.py @@ -6,11 +6,12 @@ import unittest import numpy as np from generator import generator, generate -from openvino.tools.mo.middle.PreserveRuntimeInfo import PreserveRuntimeInfo -from openvino.tools.mo.ops.transpose import Transpose from openvino.tools.mo.front.common.partial_infer.elemental import copy_shape_infer +from openvino.tools.mo.front.common.partial_infer.utils import int64_array from openvino.tools.mo.graph.graph import Node +from openvino.tools.mo.middle.PreserveRuntimeInfo import PreserveRuntimeInfo from openvino.tools.mo.ops.op import PermuteAttrs +from openvino.tools.mo.ops.transpose import Transpose from openvino.tools.mo.utils.ir_engine.compare_graphs import compare_graphs from openvino.tools.mo.utils.runtime_info import RTInfo from unit_tests.utils.graph import build_graph, connect, valued_const_with_data, regular_op_with_empty_data, \ @@ -34,6 +35,65 @@ edges_with_transpose = [*connect('placeholder1', '0:transpose_parameter'), 
*connect('transpose_result', 'result')] +nodes_for_case_with_two_results = { + 'placeholder1': {'type': 'Parameter', 'kind': 'op', 'op': 'Parameter'}, + 'placeholder1_data': {'value': None, 'shape': None, 'kind': 'data', 'data_type': np.float32}, + 'placeholder2': {'type': 'Parameter', 'kind': 'op', 'op': 'Parameter'}, + 'placeholder2_data': {'value': None, 'shape': None, 'kind': 'data', 'data_type': np.float32}, + 'add': {'type': 'Add', 'kind': 'op', 'op': 'Add', 'infer': copy_shape_infer}, + 'add_data': {'value': None, 'shape': None, 'kind': 'data', 'data_type': np.float32}, + 'result1': {'kind': 'op', 'op': 'Result'}, + 'result2': {'kind': 'op', 'op': 'Result'}, + 'fft': {'kind': 'op', 'op': 'IDFT', 'type': 'IDFT', 'infer': copy_shape_infer}, + 'fft_data': {'value': None, 'shape': None, 'kind': 'data', 'data_type': np.float32}, + 'fft_axes': { + 'type': 'Const', 'kind': 'op', 'op': 'Const', 'shape': int64_array([1]), 'value': int64_array([-1]) + }, + 'fft_axes_data': {'value': int64_array([-1]), 'shape': int64_array([1]), 'kind': 'data', 'data_type': np.int64}, + 'transpose_parameter_order': { + 'type': 'Const', 'kind': 'op', 'op': 'Const', 'shape': None, 'value': None + }, + 'transpose_parameter_order_data': {'value': None, 'shape': None, 'kind': 'data', 'data_type': np.int64}, + 'transpose_parameter': {'type': 'Transpose', 'kind': 'op', 'op': 'Transpose', 'infer': Transpose.infer}, + 'transpose_parameter_data': {'value': None, 'shape': None, 'kind': 'data', 'data_type': None}, +} + +edges_for_case_with_two_results = [ + ('transpose_parameter_order', 'transpose_parameter_order_data'), + ('transpose_parameter_order_data', 'transpose_parameter', {'in': 1}), + ('transpose_parameter', 'transpose_parameter_data'), + ('placeholder1', 'placeholder1_data'), + ('placeholder2', 'placeholder2_data'), + ('placeholder1_data', 'add', {'in': 0}), + ('placeholder2_data', 'add', {'in': 1}), + ('add', 'add_data'), + ('add_data', 'result1', {'out': 0, 'in': 0}), + ('add_data', 
'fft', {'out': 0, 'in': 0}), + ('fft_axes', 'fft_axes_data'), + ('fft_axes_data', 'fft', {'in': 1}), + ('fft', 'fft_data'), + ('fft_data', 'result2'), +] + +edges_with_transpose_for_case_with_two_results = [ + ('transpose_parameter_order', 'transpose_parameter_order_data'), + ('placeholder1_data', 'transpose_parameter', {'in': 0}), + ('transpose_parameter_order_data', 'transpose_parameter', {'in': 1}), + ('transpose_parameter', 'transpose_parameter_data'), + ('placeholder1', 'placeholder1_data'), + ('placeholder2', 'placeholder2_data'), + ('transpose_parameter_data', 'add', {'in': 0}), + ('placeholder2_data', 'add', {'in': 1}), + ('add', 'add_data'), + ('add_data', 'result1', {'out': 0, 'in': 0}), + ('add_data', 'fft', {'out': 0, 'in': 0}), + ('fft_axes', 'fft_axes_data'), + ('fft_axes_data', 'fft', {'in': 1}), + ('fft', 'fft_data'), + ('fft_data', 'result2'), +] + + @generator class PreserveRuntimeInfoTest(unittest.TestCase): @generate(*[ @@ -122,3 +182,65 @@ class PreserveRuntimeInfoTest(unittest.TestCase): rt_info = result_node.rt_info.info old_api_map = rt_info[('old_api_map_order', 0)].info self.assertTrue(np.array_equal(old_api_map['order'], [0, 3, 1, 2])) + + @generate(*[ + ([0, 3, 1, 2], [0, 2, 3, 1], True, 'DFT'), + ([0, 3, 1, 2], [0, 2, 3, 1], True, 'IDFT'), + (None, None, False, 'DFT'), + (None, None, False, 'IDFT'), + ([0, 4, 1, 2, 3], [0, 2, 3, 4, 1], True, 'DFT'), + ([0, 4, 1, 2, 3], [0, 2, 3, 4, 1], True, 'IDFT'), + ]) + def test_transpose_insert_with_two_result_nodes(self, nhwc_to_nchw_order, nchw_to_nhwc_order, + add_permutation_attrs, fft_kind): + shape_len = len(nhwc_to_nchw_order) if add_permutation_attrs else 3 + shape = np.array(range(shape_len)) + add_shape = shape if nhwc_to_nchw_order is None else shape[nhwc_to_nchw_order] + graph = build_graph(nodes_attrs=nodes_for_case_with_two_results, + edges=edges_for_case_with_two_results, + update_attributes={ + 'placeholder1_data': {'shape': int64_array(shape)}, + 'placeholder1': {'shape': 
int64_array(shape), 'rt_info': RTInfo()}, + 'transpose_parameter_order': { + 'value': np.array(nhwc_to_nchw_order), + 'shape': int64_array(np.array(nhwc_to_nchw_order).shape) + }, + 'transpose_parameter_order_data': { + 'value': np.array(nhwc_to_nchw_order), + 'shape': int64_array(np.array(nhwc_to_nchw_order).shape) + }, + 'fft': {'op': fft_kind, 'type': fft_kind}, + 'add_data': {'shape': add_shape}, + 'fft_data': {'shape': add_shape}, + 'result1': {'shape': shape, 'rt_info': RTInfo()}, + 'result2': {'shape': shape, 'rt_info': RTInfo()}, + }) + + if add_permutation_attrs: + graph_ref = build_graph(nodes_for_case_with_two_results, edges_with_transpose_for_case_with_two_results) + else: + graph_ref = build_graph(nodes_for_case_with_two_results, edges_for_case_with_two_results) + + param1_node = Node(graph, 'placeholder1') + result1_node = Node(graph, 'result1') + result2_node = Node(graph, 'result2') + + if add_permutation_attrs: + shape_len = len(nhwc_to_nchw_order) + param1_node['permute_attrs'] = PermuteAttrs().update_attrs(attrs=[('shape', 'output:0')]) + param1_node.out_node(0)['permutation'] = PermuteAttrs().get_nhwc_to_nchw_permutation(shape_len) + result1_node.in_node(0)['permutation'] = PermuteAttrs().get_nhwc_to_nchw_permutation(shape_len) + result2_node.in_node(0)['permutation'] = PermuteAttrs().get_nhwc_to_nchw_permutation(shape_len) + + PreserveRuntimeInfo().find_and_replace_pattern(graph) + + (flag, resp) = compare_graphs(graph, graph_ref, 'result1') + self.assertTrue(flag, resp) + + self.assertFalse(param1_node.has_valid('permute_attrs')) + self.assertFalse(param1_node.out_node(0).has_valid('permutation')) + + if add_permutation_attrs: + rt_info = param1_node.rt_info.info + old_api_map = rt_info[('old_api_map_order', 0)].info + self.assertTrue(np.array_equal(old_api_map['inverse_order'], nchw_to_nhwc_order)) From 6ddc1e981b4013fd28818d1ab0e636862471df64 Mon Sep 17 00:00:00 2001 From: Vladislav Volkov Date: Tue, 11 Jan 2022 16:06:37 +0300 Subject: 
[PATCH 75/78] Fix missing declarations for TBB_HYBRID_CPUS (#9567) --- .../threading/ie_parallel_custom_arena.cpp | 29 ++++++++++--------- 1 file changed, 16 insertions(+), 13 deletions(-) diff --git a/src/inference/src/threading/ie_parallel_custom_arena.cpp b/src/inference/src/threading/ie_parallel_custom_arena.cpp index a987351bad2..1546847e090 100644 --- a/src/inference/src/threading/ie_parallel_custom_arena.cpp +++ b/src/inference/src/threading/ie_parallel_custom_arena.cpp @@ -22,6 +22,7 @@ namespace custom { namespace detail { # if USE_TBBBIND_2_5 + extern "C" { void __TBB_internal_initialize_system_topology(std::size_t groups_num, int& numa_nodes_count, @@ -55,7 +56,20 @@ static bool is_binding_environment_valid() { # endif /* _WIN32 && !_WIN64 */ } -# endif +# elif TBB_NUMA_SUPPORT_PRESENT || TBB_HYBRID_CPUS_SUPPORT_PRESENT + +static tbb::task_arena::constraints convert_constraints(const custom::task_arena::constraints& c) { + tbb::task_arena::constraints result{}; +# if TBB_HYBRID_CPUS_SUPPORT_PRESENT + result.core_type = c.core_type; + result.max_threads_per_core = c.max_threads_per_core; +# endif + result.numa_id = c.numa_id; + result.max_concurrency = c.max_concurrency; + return result; +} + +# endif // USE_TBBBIND_2_5 class TBBbindSystemTopology { TBBbindSystemTopology() { @@ -192,18 +206,7 @@ static binding_oberver_ptr construct_binding_observer(tbb::task_arena& ta, int n return observer; } -# elif TBB_NUMA_SUPPORT_PRESENT -static tbb::task_arena::constraints convert_constraints(const custom::task_arena::constraints& c) { - tbb::task_arena::constraints result{}; -# if TBB_HYBRID_CPUS_SUPPORT_PRESENT - result.core_type = c.core_type; - result.max_threads_per_core = c.max_threads_per_core; -# endif - result.numa_id = c.numa_id; - result.max_concurrency = c.max_concurrency; - return result; -} -# endif +# endif // USE_TBBBIND_2_5 } // namespace detail task_arena::task_arena(int max_concurrency_, unsigned reserved_for_masters) From 
42c5be23b1848f073e530e21827c2516ac4defd1 Mon Sep 17 00:00:00 2001 From: Alexey Lebedev Date: Tue, 11 Jan 2022 16:12:11 +0300 Subject: [PATCH 76/78] [PYTHON API] infer helper (#9478) * inputs as list in infer * fix import * fix import 2 * refactor test --- .../python/src/openvino/runtime/ie_api.py | 18 +++++++------- .../test_infer_request.py | 24 +++++++++++++++++++ 2 files changed, 34 insertions(+), 8 deletions(-) diff --git a/src/bindings/python/src/openvino/runtime/ie_api.py b/src/bindings/python/src/openvino/runtime/ie_api.py index 24ef9eddb7b..73e9db2aa25 100644 --- a/src/bindings/python/src/openvino/runtime/ie_api.py +++ b/src/bindings/python/src/openvino/runtime/ie_api.py @@ -22,21 +22,23 @@ def tensor_from_file(path: str) -> Tensor: return Tensor(np.fromfile(path, dtype=np.uint8)) -def normalize_inputs(py_dict: dict, py_types: dict) -> dict: +def normalize_inputs(inputs: Union[dict, list], py_types: dict) -> dict: """Normalize a dictionary of inputs to Tensors.""" - for k, val in py_dict.items(): + if isinstance(inputs, list): + inputs = {index: input for index, input in enumerate(inputs)} + for k, val in inputs.items(): if not isinstance(k, (str, int)): raise TypeError("Incompatible key type for tensor named: {}".format(k)) try: ov_type = py_types[k] except KeyError: raise KeyError("Port for tensor named {} was not found!".format(k)) - py_dict[k] = ( + inputs[k] = ( val if isinstance(val, Tensor) else Tensor(np.array(val, get_dtype(ov_type))) ) - return py_dict + return inputs def get_input_types(obj: Union[InferRequestBase, CompiledModelBase]) -> dict: @@ -55,14 +57,14 @@ def get_input_types(obj: Union[InferRequestBase, CompiledModelBase]) -> dict: class InferRequest(InferRequestBase): """InferRequest wrapper.""" - def infer(self, inputs: dict = None) -> dict: + def infer(self, inputs: Union[dict, list] = None) -> dict: """Infer wrapper for InferRequest.""" inputs = ( {} if inputs is None else normalize_inputs(inputs, get_input_types(self)) ) return 
super().infer(inputs) - def start_async(self, inputs: dict = None, userdata: Any = None) -> None: + def start_async(self, inputs: Union[dict, list] = None, userdata: Any = None) -> None: """Asynchronous infer wrapper for InferRequest.""" inputs = ( {} if inputs is None else normalize_inputs(inputs, get_input_types(self)) @@ -77,7 +79,7 @@ class CompiledModel(CompiledModelBase): """Create new InferRequest object.""" return InferRequest(super().create_infer_request()) - def infer_new_request(self, inputs: dict = None) -> dict: + def infer_new_request(self, inputs: Union[dict, list] = None) -> dict: """Infer wrapper for CompiledModel.""" inputs = ( {} if inputs is None else normalize_inputs(inputs, get_input_types(self)) @@ -92,7 +94,7 @@ class AsyncInferQueue(AsyncInferQueueBase): """Return i-th InferRequest from AsyncInferQueue.""" return InferRequest(super().__getitem__(i)) - def start_async(self, inputs: dict = None, userdata: Any = None) -> None: + def start_async(self, inputs: Union[dict, list] = None, userdata: Any = None) -> None: """Asynchronous infer wrapper for AsyncInferQueue.""" inputs = ( {} diff --git a/src/bindings/python/tests/test_inference_engine/test_infer_request.py b/src/bindings/python/tests/test_inference_engine/test_infer_request.py index 8d9d3c0b50a..c4949e12660 100644 --- a/src/bindings/python/tests/test_inference_engine/test_infer_request.py +++ b/src/bindings/python/tests/test_inference_engine/test_infer_request.py @@ -175,6 +175,30 @@ def test_start_async(device): assert callbacks_info["finished"] == jobs +def test_infer_list_as_inputs(device): + num_inputs = 4 + input_shape = [2, 1] + dtype = np.float32 + params = [ops.parameter(input_shape, dtype) for _ in range(num_inputs)] + model = Model(ops.relu(ops.concat(params, 1)), params) + core = Core() + compiled_model = core.compile_model(model, device) + + def check_fill_inputs(request, inputs): + for input_idx in range(len(inputs)): + assert 
np.array_equal(request.get_input_tensor(input_idx).data, inputs[input_idx]) + + request = compiled_model.create_infer_request() + + inputs = [np.random.normal(size=input_shape).astype(dtype)] + request.infer(inputs) + check_fill_inputs(request, inputs) + + inputs = [np.random.normal(size=input_shape).astype(dtype) for _ in range(num_inputs)] + request.infer(inputs) + check_fill_inputs(request, inputs) + + def test_infer_mixed_keys(device): core = Core() func = core.read_model(test_net_xml, test_net_bin) From ddc40984739a24c127d22ad48bd345a55991efbd Mon Sep 17 00:00:00 2001 From: Dmitrii Khurtin Date: Tue, 11 Jan 2022 16:22:28 +0300 Subject: [PATCH 77/78] [GNA] Fixed symbolic links in tarball (#9545) --- cmake/dependencies.cmake | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cmake/dependencies.cmake b/cmake/dependencies.cmake index 5087a87a9bb..7f6358ec6e3 100644 --- a/cmake/dependencies.cmake +++ b/cmake/dependencies.cmake @@ -276,8 +276,8 @@ if(ENABLE_INTEL_GNA) GNA_LIB_DIR libGNA_INCLUDE_DIRS libGNA_LIBRARIES_BASE_PATH) - set(GNA_VERSION "03.00.00.1455") - set(GNA_HASH "8ac1af18eb32777b00193f4f8c252ee4f8bd64a9069138b4a5aaeebd82ead464") + set(GNA_VERSION "03.00.00.1455.0") + set(GNA_HASH "99891696269d8fa10116c96e6b7bda4362736881f0df8df8b56c751ee18e5820") set(FILES_TO_EXTRACT_LIST gna_${GNA_VERSION}/include) if(WIN32) From ab0913519530f4c7a6ba13f3a93d2a83000ae460 Mon Sep 17 00:00:00 2001 From: Anastasia Kuporosova Date: Tue, 11 Jan 2022 16:55:18 +0300 Subject: [PATCH 78/78] [Python API] Move wheel folder to the python dir (#9125) * [Python API] Move wheel folder to the python dir * codestyle files * one more codestyle * align with master * ignore some flake8 comments --- .ci/azure/linux.yml | 2 +- .github/workflows/py_checks.yml | 4 ++-- .../ie_bridges/python/CMakeLists.txt | 4 ---- src/bindings/python/CMakeLists.txt | 4 ++++ src/bindings/python/setup.cfg | 23 +++++++++++++++++++ .../src/compatibility/ngraph/opset8/ops.py | 3 +-- 
.../python/src/openvino/runtime/ie_api.py | 1 + .../python/src/openvino/runtime/opset8/ops.py | 3 +-- src/bindings/python/tox.ini | 4 ++-- .../bindings}/python/wheel/CMakeLists.txt | 4 ++++ .../bindings}/python/wheel/readme.txt | 0 .../python/wheel/requirements-dev.txt | 0 .../bindings}/python/wheel/setup.cfg | 0 .../bindings}/python/wheel/setup.py | 0 14 files changed, 39 insertions(+), 13 deletions(-) create mode 100644 src/bindings/python/setup.cfg rename {inference-engine/ie_bridges => src/bindings}/python/wheel/CMakeLists.txt (95%) rename {inference-engine/ie_bridges => src/bindings}/python/wheel/readme.txt (100%) rename {inference-engine/ie_bridges => src/bindings}/python/wheel/requirements-dev.txt (100%) rename {inference-engine/ie_bridges => src/bindings}/python/wheel/setup.cfg (100%) rename {inference-engine/ie_bridges => src/bindings}/python/wheel/setup.py (100%) diff --git a/.ci/azure/linux.yml b/.ci/azure/linux.yml index e8831030ed8..a2d670b8504 100644 --- a/.ci/azure/linux.yml +++ b/.ci/azure/linux.yml @@ -113,7 +113,7 @@ jobs: # For opencv-python: python3-setuptools and pip upgrade python3 -m pip install --upgrade pip python3 -m pip install -r $(REPO_DIR)/inference-engine/ie_bridges/python/requirements.txt - python3 -m pip install -r $(REPO_DIR)/inference-engine/ie_bridges/python/wheel/requirements-dev.txt + python3 -m pip install -r $(REPO_DIR)/src/bindings/python/wheel/requirements-dev.txt # For running Python API tests python3 -m pip install -r $(REPO_DIR)/inference-engine/ie_bridges/python/src/requirements-dev.txt # For running Paddle frontend unit tests diff --git a/.github/workflows/py_checks.yml b/.github/workflows/py_checks.yml index 0e9175f2114..5d0c756978d 100644 --- a/.github/workflows/py_checks.yml +++ b/.github/workflows/py_checks.yml @@ -54,13 +54,13 @@ jobs: path: src_diff.diff - name: Run Flake on wheel run: python -m flake8 ./ --config=../setup.cfg - working-directory: inference-engine/ie_bridges/python/wheel + working-directory: 
src/bindings/python/wheel - name: Create code style diff for wheel if: failure() run: | python -m black -l 160 -S ./ git diff > wheel_diff.diff - working-directory: inference-engine/ie_bridges/python/wheel + working-directory: src/bindings/python/wheel - uses: actions/upload-artifact@v2 if: failure() with: diff --git a/inference-engine/ie_bridges/python/CMakeLists.txt b/inference-engine/ie_bridges/python/CMakeLists.txt index 404def10358..9900c48227b 100644 --- a/inference-engine/ie_bridges/python/CMakeLists.txt +++ b/inference-engine/ie_bridges/python/CMakeLists.txt @@ -65,10 +65,6 @@ endfunction() set (PYTHON_BRIDGE_SRC_ROOT ${CMAKE_CURRENT_SOURCE_DIR}) add_subdirectory (src/openvino/inference_engine) -if(ENABLE_WHEEL) - add_subdirectory(wheel) -endif() - if(TARGET _pyngraph) add_dependencies(ie_api _pyngraph) endif() diff --git a/src/bindings/python/CMakeLists.txt b/src/bindings/python/CMakeLists.txt index 0d54e471203..25fb0494e1d 100644 --- a/src/bindings/python/CMakeLists.txt +++ b/src/bindings/python/CMakeLists.txt @@ -16,6 +16,10 @@ set(LIBRARY_OUTPUT_DIRECTORY_BIN ${CMAKE_LIBRARY_OUTPUT_DIRECTORY}) add_subdirectory(src) +if(ENABLE_WHEEL) + add_subdirectory(wheel) +endif() + if(ENABLE_OV_CORE_UNIT_TESTS) add_subdirectory(tests/mock/ov_mock_py_frontend) add_dependencies(pyopenvino ov_mock_py_frontend) diff --git a/src/bindings/python/setup.cfg b/src/bindings/python/setup.cfg new file mode 100644 index 00000000000..15abb7dc3de --- /dev/null +++ b/src/bindings/python/setup.cfg @@ -0,0 +1,23 @@ +[flake8] +filename = *.py, *.pyx +max-line-length = 160 +ignore = E203 +max-parameters-amount = 8 +show_source = True +docstring-convention = google +enable-extensions = G +per-file-ignores = + *.pyx: E225, E226, E251, E999, E800, E265, E203, E266, E227, E211 + tests/*: S101, T001 + *__init__.py: F403, F405, F405 + +[pydocstyle] +convention = google + +[mypy] +ignore_missing_imports = True +disable_error_code = attr-defined +show_column_numbers = True +show_error_context 
= True +show_absolute_path = True +pretty = True \ No newline at end of file diff --git a/src/bindings/python/src/compatibility/ngraph/opset8/ops.py b/src/bindings/python/src/compatibility/ngraph/opset8/ops.py index 78b02dd46c3..e729ac5ab7f 100644 --- a/src/bindings/python/src/compatibility/ngraph/opset8/ops.py +++ b/src/bindings/python/src/compatibility/ngraph/opset8/ops.py @@ -781,8 +781,7 @@ def detection_output( @nameable_op def softmax(data: NodeInput, axis: int, name: Optional[str] = None) -> Node: - """ - Apply softmax operation on each element of input tensor. + """Apply softmax operation on each element of input tensor. @param data: The tensor providing input data. @param axis: An axis along which Softmax should be calculated. Can be positive or negative. diff --git a/src/bindings/python/src/openvino/runtime/ie_api.py b/src/bindings/python/src/openvino/runtime/ie_api.py index 73e9db2aa25..fb911ed77bc 100644 --- a/src/bindings/python/src/openvino/runtime/ie_api.py +++ b/src/bindings/python/src/openvino/runtime/ie_api.py @@ -167,6 +167,7 @@ class OVAny(OVAnyBase): any = OVAny(Test()) print(any.value.data) @endcode + """ def __getitem__(self, key: Union[str, int]) -> Any: diff --git a/src/bindings/python/src/openvino/runtime/opset8/ops.py b/src/bindings/python/src/openvino/runtime/opset8/ops.py index 7282c257b23..2a9d1dd7ba6 100644 --- a/src/bindings/python/src/openvino/runtime/opset8/ops.py +++ b/src/bindings/python/src/openvino/runtime/opset8/ops.py @@ -782,8 +782,7 @@ def detection_output( @nameable_op def softmax(data: NodeInput, axis: int, name: Optional[str] = None) -> Node: - """ - Apply softmax operation on each element of input tensor. + """Apply softmax operation on each element of input tensor. @param data: The tensor providing input data. @param axis: An axis along which Softmax should be calculated. Can be positive or negative. 
diff --git a/src/bindings/python/tox.ini b/src/bindings/python/tox.ini index 22a79879cd3..9f16d05aca0 100644 --- a/src/bindings/python/tox.ini +++ b/src/bindings/python/tox.ini @@ -18,7 +18,7 @@ commands= {envbindir}/python setup.py bdist_wheel {envbindir}/pip install --no-index --pre --find-links=dist/ openvino flake8 {posargs:src/ setup.py} - flake8 --ignore=D100,D101,D102,D103,D104,D105,D107,W503 tests/ tests_compatibility/ # ignore lack of docs in tests + flake8 --ignore=D100,D101,D102,D103,D104,D105,D107,D212,W503 tests/ tests_compatibility/ # ignore lack of docs in tests mypy --config-file=tox.ini {posargs:src/} pytest --backend={env:NGRAPH_BACKEND} tests -v -k 'not _cuda' --ignore=tests/test_onnx/test_zoo_models.py --ignore=tests/test_utils --ignore=tests/test_inference_engine pytest --backend={env:NGRAPH_BACKEND} tests_compatibility -v -k 'not _cuda' --ignore=tests_compatibility/test_onnx/test_zoo_models.py @@ -46,7 +46,7 @@ max-complexity=7 # D412 - No blank lines allowed between a section header and its content # F401 - module imported but unused # W503 - line break before binary operator (prefer line breaks before op, not after) -ignore=D100,D104,D105,D107,D412,F401,W503 +ignore=D100,D104,D105,D107,D410,D411,D412,F401,W503 [mypy] ignore_missing_imports=True diff --git a/inference-engine/ie_bridges/python/wheel/CMakeLists.txt b/src/bindings/python/wheel/CMakeLists.txt similarity index 95% rename from inference-engine/ie_bridges/python/wheel/CMakeLists.txt rename to src/bindings/python/wheel/CMakeLists.txt index 69e49deffb6..b8ae694160b 100644 --- a/inference-engine/ie_bridges/python/wheel/CMakeLists.txt +++ b/src/bindings/python/wheel/CMakeLists.txt @@ -1,8 +1,12 @@ # Copyright (C) 2018-2021 Intel Corporation # SPDX-License-Identifier: Apache-2.0 # + +find_package(PythonInterp 3 REQUIRED) +set(PYTHON_VERSION python${PYTHON_VERSION_MAJOR}.${PYTHON_VERSION_MINOR}) set(WHEEL_VERSION "${IE_VERSION}" CACHE STRING "Version of this release" FORCE) 
set(WHEEL_BUILD "${IE_VERSION_BUILD}" CACHE STRING "Build number of this release" FORCE) +set(PYTHON_BRIDGE_CPACK_PATH "python") set(PY_PACKAGES_DIR ${PYTHON_BRIDGE_CPACK_PATH}/${PYTHON_VERSION}) set(TBB_LIBS_DIR runtime/3rdparty/tbb/lib) if(WIN32) diff --git a/inference-engine/ie_bridges/python/wheel/readme.txt b/src/bindings/python/wheel/readme.txt similarity index 100% rename from inference-engine/ie_bridges/python/wheel/readme.txt rename to src/bindings/python/wheel/readme.txt diff --git a/inference-engine/ie_bridges/python/wheel/requirements-dev.txt b/src/bindings/python/wheel/requirements-dev.txt similarity index 100% rename from inference-engine/ie_bridges/python/wheel/requirements-dev.txt rename to src/bindings/python/wheel/requirements-dev.txt diff --git a/inference-engine/ie_bridges/python/wheel/setup.cfg b/src/bindings/python/wheel/setup.cfg similarity index 100% rename from inference-engine/ie_bridges/python/wheel/setup.cfg rename to src/bindings/python/wheel/setup.cfg diff --git a/inference-engine/ie_bridges/python/wheel/setup.py b/src/bindings/python/wheel/setup.py similarity index 100% rename from inference-engine/ie_bridges/python/wheel/setup.py rename to src/bindings/python/wheel/setup.py