[Snippets] Added support of INT8 models (#12395)

This commit is contained in:
Alexandra Sidorova 2022-10-05 13:05:15 +04:00 committed by GitHub
parent f7e05ad402
commit f6d6f5629f
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
59 changed files with 2073 additions and 577 deletions

View File

@ -24,6 +24,7 @@ namespace op {
class ConvertSaturation : public ov::op::v0::Convert {
public:
OPENVINO_OP("ConvertSaturation", "SnippetsOpset", ov::op::v0::Convert);
BWDCMP_RTTI_DECLARATION;
ConvertSaturation(const Output<Node>& x, const ov::element::Type& destination_type);
ConvertSaturation() = default;

View File

@ -23,6 +23,7 @@ namespace op {
class ConvertTruncation : public ov::op::v0::Convert {
public:
OPENVINO_OP("ConvertTruncation", "SnippetsOpset", ov::op::v0::Convert);
BWDCMP_RTTI_DECLARATION;
ConvertTruncation(const Output<Node>& x, const ov::element::Type& destination_type);
ConvertTruncation() = default;

View File

@ -88,6 +88,17 @@ public:
return m_generator;
}
size_t get_non_scalar_constants_count() const {
return m_non_scalar_constants_count;
}
bool is_quantized() const {
return config.m_is_quantized;
}
bool has_type_relaxed_ops() const {
return config.m_has_type_relaxed_ops;
}
snippets::Schedule generate(const BlockedShapeVector& output_shapes, const BlockedShapeVector& input_shapes, ngraph::pass::Manager& opt,
const void* compile_params = nullptr);
@ -99,6 +110,7 @@ public:
// plugin sets generator for a snippet to some specific generator.
// it's going to be replaced with Jitters table later
void set_generator(std::shared_ptr<ngraph::snippets::Generator> generator);
void set_non_scalar_constants_count(const size_t count);
void print() const;
void print_statistics(bool verbose);
@ -111,9 +123,29 @@ public:
private:
void align_element_types(const BlockedShapeVector& outputShapes, const BlockedShapeVector& inputShapes);
void convert_to_snippet_dialect();
Shape exec_domain;
std::shared_ptr<ov::Model> m_body;
std::shared_ptr<ngraph::snippets::Generator> m_generator;
// Count of potential non-scalar Constants that will be created after some transformations
// At the moment it's relevant only for FakeQuantize decomposition
// NOTE: To avoid overheads in each calculation of this count (for example, in validate_and_type_infer()),
// we should MANUALLY calculate it where it is needed.
size_t m_non_scalar_constants_count = 0;
Shape exec_domain = {};
std::shared_ptr<ov::Model> m_body = nullptr;
std::shared_ptr<ngraph::snippets::Generator> m_generator = nullptr;
// TODO: Change logic of insert Converts. This exec element type can be different for plugins
const ov::element::Type execution_element_type = ov::element::f32;
// Config to know which transformations should be called.
// It helps to avoid overheads of extra transformation calls
struct {
// True if Subgraph contains FakeQuantize -> FQ decomposition should be called
bool m_is_quantized = false;
// True if we should align element types inside body
bool m_is_needed_to_align_precision = false;
// True if Subgraph contains TypeRelaxed nodes -> for several streams in tp mode we should copy body using mutexes
// because TypeRelaxed::copy_with_new_inputs() isn't a thread-safe method
bool m_has_type_relaxed_ops = false;
} config;
};
static inline std::ostream& operator<<(std::ostream& os, const op::Subgraph::BlockedShape& blocked_shape) {
@ -121,10 +153,6 @@ static inline std::ostream& operator<<(std::ostream& os, const op::Subgraph::Blo
return os;
}
static inline auto is_scalar_constant(const std::shared_ptr<ngraph::Node>& source_output_node) -> bool {
return ngraph::is_type<ngraph::opset1::Constant>(source_output_node) && ngraph::shape_size(source_output_node->get_shape()) == 1;
};
static inline auto create_body(std::string name, const ngraph::ResultVector& results, const ngraph::ParameterVector& parameters) ->
std::shared_ptr<ov::Model> {
auto body = std::make_shared<ov::Model>(results, parameters, name);

View File

@ -0,0 +1,46 @@
// Copyright (C) 2018-2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma once

#include <ngraph/pass/graph_rewrite.hpp>
#include <ngraph/pattern/matcher.hpp>

namespace ngraph {
namespace snippets {
namespace pass {

/**
 * @interface AlignElementType
 * @brief Wrap sequences of operations which don't support execution on the original element type with ConvertSaturation
 *        and reset the element type of type relaxed nodes inside the body to align element types between nodes.
 *        Example 1:
 *          - After FQ decomposition there may be Convert[U8/I8]. If after the Convert there are other operations
 *            that don't support U8/I8, a new ConvertSaturation[exec_type] will be inserted after the FQ decomposition
 *            to execute these operations on a supported element type
 *        Example 2:
 *          - Input[I8] -> Unsupported I8 op -> Movement op -> Output[I8]. Two ConvertSaturation ops will be inserted:
 *              * ConvertSaturation[exec_type] before the op which doesn't support I8
 *              * ConvertSaturation[I8] before the Movement op to return the original low precision.
 *        Note: We cannot just remove the original Convert[I8/U8] in Example 1 because we should cover two things:
 *              * allow execution of operations on the element type they support
 *              * keep computations mathematically equivalent to the original function
 *        Thus, for these cases we should have the following pipeline: FP32 -> Convert[I8/U8] -> Convert[FP32] -> FP32
 *        Note: We shouldn't call validate_and_infer_type() after Convert insertions to avoid element type conflicts on inputs of ops
 * @ingroup snippets
 */
class AlignElementType: public ngraph::pass::FunctionPass {
public:
    OPENVINO_RTTI("AlignElementType", "0");
    AlignElementType(const ov::element::Type exec_type = ov::element::f32);
    bool run_on_model(const std::shared_ptr<ngraph::Function>& m) override;

    // Returns true if `n` is an executable op that must run on `exec_type` but currently has a different element type
    static bool opNeedsAlignElementType(const std::shared_ptr<ov::Node>& n, const ov::element::Type exec_type = ov::element::f32);
private:
    // Element type on which the body is executed (f32 by default)
    ov::element::Type exec_type;
};

}  // namespace pass
}  // namespace snippets
}  // namespace ngraph

View File

@ -0,0 +1,22 @@
// Copyright (C) 2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma once

#include <ngraph/pass/graph_rewrite.hpp>
#include <ngraph/pattern/matcher.hpp>

namespace ngraph {
namespace snippets {
namespace pass {

/**
 * @interface CommonOptimizations
 * @brief Matcher pass on Subgraph ops that runs common body transformations:
 *        converts original Converts to ConvertTruncation and, for quantized subgraphs,
 *        applies FakeQuantize decomposition and moves non-scalar Constants out of the body.
 * @ingroup snippets
 */
class CommonOptimizations : public ngraph::pass::MatcherPass {
public:
    NGRAPH_RTTI_DECLARATION;
    CommonOptimizations();
};

}  // namespace pass
}  // namespace snippets
}  // namespace ngraph

View File

@ -14,7 +14,7 @@ namespace pass {
/**
* @interface ConvertConstantsToScalars
* @brief Replace only constants which are should be represented as scalars during code generation.
* Only single-value (0D) constants are currently supported.
* Only single-value (0D) constants are currently supported.
* @ingroup snippets
*/
class ConvertConstantsToScalars: public ngraph::pass::MatcherPass {
@ -24,4 +24,4 @@ public:
} // namespace pass
} // namespace snippets
} // namespace ngraph
} // namespace ngraph

View File

@ -0,0 +1,91 @@
// Copyright (C) 2018-2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma once

#include "ngraph/op/fake_quantize.hpp"
#include "ngraph/pass/graph_rewrite.hpp"
#include "ngraph/pass/constant_folding.hpp"
#include "snippets/pass/transform_convert.hpp"
#include "transformations_visibility.hpp"

namespace ngraph {
namespace snippets {
namespace pass {

/**
 * @interface FakeQuantizeDecomposition
 * @ingroup snippets
 * @brief FakeQuantizeDecomposition transformation decomposes a FakeQuantize layer.
 *
 * Expression from the specification:
 * if x <= min(il, ih):
 *   output = ol
 * elif x > max(il, ih):
 *   output = oh
 * else:
 *   output = round((x - il) / (ih - il) * (levels-1)) / (levels-1) * (oh - ol) + ol
 *
 * Expand brackets:
 * round(x * (levels-1) / (ih - il) - il * (levels-1) / (ih - il)) * (oh - ol) / (levels-1) + ol
 *
 * Marking:
 *   - isc := (levels-1) / (ih - il)
 *   - ish := -il * isc
 *   - osc := (oh - ol) / (levels-1)
 *   - osh := ol
 * Final expression:
 *   round(x * isc + ish) * osc + osh
 *
 * Some optimizations (example for scalars):
 * 1. If the output element type of FQ is U8 and il = 0, ish = 0, osc = 1, osh = 0, the expression x * isc is enough
 * 2. If the output element type of FQ is I8 and ish ~= 128, osc = 1, osh ~= -128, il * isc ~= -128, ih * isc ~= 127, the expression x * isc is enough
 * 3. If osc = 1, osh = 0, there is no dequantization
 * 4. If there is no dequantization and the output element type of FQ isn't FP32, there is no rounding
 *
 * This transformation doesn't support the following cases:
 * 1. At least one 'range' input is not Constant
 * 2. At least one 'il' input value is greater than or equal to the 'ih' input value
 *
 */
class FakeQuantizeDecomposition : public ngraph::pass::MatcherPass {
public:
    FakeQuantizeDecomposition();

    // True if all four range inputs (il, ih, ol, oh) of the FQ node are scalar Constants
    static bool isAllScalarConstant(const std::shared_ptr<const ngraph::Node>& node);
    // Extracts clamp bounds (cl, ch) and input/output scales and shifts (isc, ish, osc, osh) from the FQ Constants;
    // returns false if they cannot be computed (e.g. non-Constant ranges)
    static bool getScalesAndShifts(const std::shared_ptr<const ngraph::op::v0::FakeQuantize>& fq_node,
                                   std::vector<float>& cl,
                                   std::vector<float>& ch,
                                   std::vector<float>& isc,
                                   std::vector<float>& ish,
                                   std::vector<float>& osc,
                                   std::vector<float>& osh);
    // Computes the final scale values for the optimized expression (see "Some optimizations" above)
    // for the given FQ output element type
    static std::vector<float> calculateScales(const ngraph::element::Type& out_type,
                                              const std::vector<float>& cl,
                                              const std::vector<float>& ch,
                                              const std::vector<float>& isc,
                                              const std::vector<float>& ish,
                                              const std::vector<float>& osc,
                                              const std::vector<float>& osh);
};

/**
 * @interface CommonFakeQuantizeDecomposition
 * @ingroup snippets
 * @brief CommonFakeQuantizeDecomposition pass applies all transformations needed for
 * a correct FQ Decomposition:
 * 0. Disable the Validate() pass after each transformation
 * 1. FakeQuantize decomposition
 * 2. ConstantFolding
 * 3. Validate
 */
class CommonFakeQuantizeDecomposition: public ngraph::pass::FunctionPass {
public:
    bool run_on_model(const std::shared_ptr<ngraph::Function>& m) override;
};

}  // namespace pass
}  // namespace snippets
}  // namespace ngraph

View File

@ -1,31 +0,0 @@
// Copyright (C) 2018-2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#pragma once
#include <ngraph/pass/graph_rewrite.hpp>
#include <ngraph/pattern/matcher.hpp>
namespace ngraph {
namespace snippets {
namespace pass {
/**
* @interface InsertConvertOnInputs
* @brief Inserts ConvertSaturation op after Parameters and Scalars to convert data type of inputs
* to supported execution data type.
* Note: ConvertSaturation op isn't covered by specification of "Convert" op
* This op is used for conversion into and from FP32 after the correspoding Load
* and before Store to calculate in FP32 inside subgraph body in CPU Plugin
* @ingroup snippets
*/
class InsertConvertOnInputs: public ngraph::pass::MatcherPass {
public:
InsertConvertOnInputs(const ov::element::Type exec_type = ov::element::f32);
};
} // namespace pass
} // namespace snippets
} // namespace ngraph

View File

@ -13,7 +13,7 @@ namespace pass {
/**
* @interface InsertMoveBroadcast
* @brief Inserts explicit MoveBroadcast instruction if broadcasting by most warying dimension is needed.
* @brief Inserts explicit MoveBroadcast instruction if broadcasting by most varying dimension is needed.
* The pass is used to convert model to a canonical form for code generation
* @ingroup snippets
*/

View File

@ -1,31 +0,0 @@
// Copyright (C) 2018-2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#pragma once
#include <ngraph/pass/graph_rewrite.hpp>
#include <ngraph/pattern/matcher.hpp>
namespace ngraph {
namespace snippets {
namespace pass {
/**
* @interface ResetTypeRelaxedNodePrecision
* @brief Reset precision for type relaxed nodes inside body to align precision between nodes.
* Should be called after all Convert insertions
* @ingroup snippets
*/
class ResetTypeRelaxedNodePrecision: public ngraph::pass::FunctionPass {
public:
OPENVINO_RTTI("ResetTypeRelaxedNodePrecision", "0");
ResetTypeRelaxedNodePrecision(const ov::element::Type exec_type = ov::element::f32);
bool run_on_model(const std::shared_ptr<ngraph::Function>& m) override;
private:
ov::element::Type exec_type;
};
} // namespace pass
} // namespace snippets
} // namespace ngraph

View File

@ -0,0 +1,28 @@
// Copyright (C) 2018-2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

/**
 * @brief A file contains public utilities.
 * @file utils.hpp
 */
#pragma once

#include "snippets_isa.hpp"
#include "emitter.hpp"

namespace ngraph {
namespace snippets {
namespace utils {

// Get the non-scalar Constant count that will be created after FakeQuantize decomposition.
// This count is needed to know the exact count of non-scalar Constants during tokenization
// (plugins may limit the total number of subgraph inputs/outputs).
auto get_non_scalar_constant_count_for_fq(const std::shared_ptr<ngraph::opset1::FakeQuantize>& fq) -> size_t;

// True if the node is a Constant holding exactly one element (scalar or any 1-element shape)
inline auto is_scalar_constant(const std::shared_ptr<ngraph::Node>& source_output_node) -> bool {
    return ngraph::is_type<ngraph::opset1::Constant>(source_output_node) && ngraph::shape_size(source_output_node->get_shape()) == 1;
}

}  // namespace utils
}  // namespace snippets
}  // namespace ngraph

View File

@ -8,6 +8,9 @@
#include "ngraph/runtime/host_tensor.hpp"
BWDCMP_RTTI_DEFINITION(ngraph::snippets::op::ConvertSaturation);
ngraph::snippets::op::ConvertSaturation::ConvertSaturation(const Output<Node>& x, const ov::element::Type& destination_type)
: ov::op::v0::Convert({x}, destination_type) {
}

View File

@ -8,6 +8,9 @@
#include "ngraph/runtime/host_tensor.hpp"
BWDCMP_RTTI_DEFINITION(ngraph::snippets::op::ConvertTruncation);
ngraph::snippets::op::ConvertTruncation::ConvertTruncation(const Output<Node>& x, const ov::element::Type& destination_type)
: ov::op::v0::Convert({x}, destination_type) {
}

View File

@ -11,18 +11,19 @@
#include "snippets/pass/insert_movebroadcast.hpp"
#include "snippets/pass/load_movebroadcast_to_broadcastload.hpp"
#include "snippets/pass/assign_registers.hpp"
#include "snippets/pass/convert_constants_to_scalars.hpp"
#include "snippets/pass/convert_constants.hpp"
#include "snippets/pass/convert_power_to_powerstatic.hpp"
#include "snippets/pass/vector_to_scalar.hpp"
#include "snippets/pass/transform_convert_to_truncation.hpp"
#include "snippets/pass/insert_convert_on_inputs.hpp"
#include "snippets/pass/reset_type_relaxed_node_precision.hpp"
#include "snippets/pass/transform_convert.hpp"
#include "snippets/pass/align_element_type.hpp"
#include "snippets/utils.hpp"
#include "transformations/common_optimizations/nop_elimination.hpp"
#include "transformations/utils/utils.hpp"
#include <ngraph/pass/manager.hpp>
#include "ngraph/pass/constant_folding.hpp"
#include "ngraph_ops/type_relaxed.hpp"
#include <openvino/pass/serialize.hpp>
#include <algorithm>
@ -36,8 +37,20 @@ void snippets::op::Subgraph::set_generator(std::shared_ptr<ngraph::snippets::Gen
m_generator = generator;
}
void snippets::op::Subgraph::set_non_scalar_constants_count(const size_t count) {
m_non_scalar_constants_count = count;
}
snippets::op::Subgraph::Subgraph(const OutputVector& args, std::shared_ptr<ov::Model> body)
: Op(args), m_body(body), m_generator(nullptr) {
const auto ops = m_body->get_ops();
for (const auto& op : ops) {
config.m_is_quantized = config.m_is_quantized || ov::is_type<ov::op::v0::FakeQuantize>(op);
config.m_has_type_relaxed_ops = config.m_has_type_relaxed_ops || std::dynamic_pointer_cast<ngraph::op::TypeRelaxedBase>(op);
config.m_is_needed_to_align_precision = config.m_is_needed_to_align_precision || is_quantized() || has_type_relaxed_ops() ||
snippets::pass::AlignElementType::opNeedsAlignElementType(op, execution_element_type);
}
constructor_validate_and_infer_types();
}
@ -86,7 +99,8 @@ auto snippets::op::Subgraph::wrap_node_as_subgraph(const std::shared_ptr<ov::Nod
ngraph::OutputVector subgraph_inputs;
for (const auto& input : node->input_values()) {
if (is_scalar_constant(input.get_node_shared_ptr())) {
if ((utils::is_scalar_constant(input.get_node_shared_ptr())) ||
(ov::is_type<ov::op::v0::FakeQuantize>(node) && ov::is_type<ov::op::v0::Constant>(input.get_node_shared_ptr()))) {
body_inputs.push_back(input);
} else {
auto parameter = std::make_shared<ngraph::opset1::Parameter>(input.get_element_type(), input.get_partial_shape());
@ -119,6 +133,10 @@ auto snippets::op::Subgraph::wrap_node_as_subgraph(const std::shared_ptr<ov::Nod
auto body = create_body(node->get_friendly_name(), body_results, body_parameters);
auto subgraph = build_subgraph(node, subgraph_inputs, body);
if (auto fq_node = ov::as_type_ptr<ov::op::v0::FakeQuantize>(node)) {
subgraph->set_non_scalar_constants_count(utils::get_non_scalar_constant_count_for_fq(fq_node));
}
for (size_t i = 0; i < body->get_parameters().size(); i++) {
body->get_parameters()[i]->set_friendly_name(body_parameters[i]->get_friendly_name());
}
@ -251,25 +269,18 @@ Shape snippets::op::Subgraph::canonicalize(const BlockedShapeVector& outputShape
void snippets::op::Subgraph::align_element_types(const BlockedShapeVector& outputShapes,
const BlockedShapeVector& inputShapes) {
// TODO: At the moment snippets support execution in only one element type
const auto execution_element_type = ov::element::f32;
ngraph::pass::Manager p_manager;
p_manager.register_pass<snippets::pass::TransformConvertToConvertTruncation>();
p_manager.run_passes(m_body);
const auto& body_results = m_body->get_results();
for (size_t i = 0; i < outputShapes.size(); i++) {
const auto needed_out_type = std::get<2>(outputShapes[i]);
// If there is real Convert from graph (ConvertTruncation) before Result
// If there is real Convert from graph (ConvertTruncation) or after FQ decomp (ConvertSaturation) before Result
// we should check destination type and insert ConvertSaturation before that if needed.
// For example, to return original element type after Convert insertion on inputs
std::shared_ptr<ov::Node> first_convert = body_results[i];
while (ov::is_type<ngraph::snippets::op::ConvertTruncation>(first_convert->get_input_node_ptr(0))) {
while (ov::is_type<ngraph::op::v0::Convert>(first_convert->get_input_node_ptr(0))) {
first_convert = first_convert->get_input_node_shared_ptr(0);
}
if (auto existing_convert_t = ngraph::as_type_ptr<ngraph::snippets::op::ConvertTruncation>(first_convert)) {
if (auto existing_convert_t = ngraph::as_type_ptr<ngraph::op::v0::Convert>(first_convert)) {
const auto original_input_element_type = existing_convert_t->get_input_element_type(0);
if (original_input_element_type != execution_element_type) {
const auto convert = std::make_shared<ngraph::snippets::op::ConvertSaturation>(
@ -283,16 +294,16 @@ void snippets::op::Subgraph::align_element_types(const BlockedShapeVector& outpu
body_results[i]->get_input_node_shared_ptr(0), needed_out_type);
body_results[i]->set_argument(0, convert);
}
// After Convert insertion we should make the following steps:
// - insert ConvertSaturation after inputs and scalar to start aligning of exec data type inside body
// - manually set output element types of type relaxed nodes to align element type inside subgraph body
// - after Convert insertion on inputs and after scalars we should use ConstantFolding pass to convert
// element type of Scalars before inference
// - eliminate redundant Convert that could have been inserted
// We should align element type inside body using the corresponding pass:
// - Insert Convert before operations that doesn't support original element type for execution
// - Insert reverse Convert before operations that support original element type
// but have inputs that doesn't support it (because before them will be inserted Convert with exec_type - first point)
// Then we should use ConstantFolding pass to convert element type of Scalars before inference.
// At the end eliminate redundant Convert that could be inserted
ngraph::pass::Manager manager;
manager.register_pass<snippets::pass::InsertConvertOnInputs>(execution_element_type);
manager.register_pass<snippets::pass::ResetTypeRelaxedNodePrecision>(execution_element_type);
if (config.m_is_needed_to_align_precision) {
manager.register_pass<snippets::pass::AlignElementType>(execution_element_type);
}
manager.register_pass<ngraph::pass::ConstantFolding>();
manager.register_pass<ngraph::pass::EliminateConvert>();
manager.run_passes(m_body);

View File

@ -0,0 +1,97 @@
// Copyright (C) 2018-2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#include <snippets/itt.hpp>
#include "snippets/snippets_isa.hpp"
#include "snippets/op/convert_saturation.hpp"
#include "snippets/pass/align_element_type.hpp"
#include "snippets/utils.hpp"
#include "ngraph_ops/type_relaxed.hpp"
#include "ngraph/op/util/op_types.hpp"
#include <ngraph/rt_info.hpp>

namespace {

// True for graph boundary ops (Parameter/Constant/Result) that only carry data in and out of the body
auto is_in_out_op(const std::shared_ptr<ov::Node>& n) -> bool {
    return ov::is_type<ov::op::v0::Parameter>(n)
        || ov::is_type<ov::op::v0::Constant>(n)
        || ov::is_type<ov::op::v0::Result>(n);
}

// At the moment Subgraph supports only Eltwise, Convert and FQ (which is decomposed into Eltwises and Convert)
// and only Eltwises support execution only in "exec_type". So we can check the op type from the opposite side:
// everything except Convert is assumed to support only the execution element type.
auto op_supports_only_exec_type(const std::shared_ptr<ov::Node>& n) -> bool {
    return !ov::is_type<ov::op::v0::Convert>(n);
}

// Check if an executable operation supports only the execution element type (f32)
// NOTE: an executable op is a node that isn't Parameter/Constant/Result
auto is_executable_op_only_on_exec_type(const std::shared_ptr<ov::Node>& n) -> bool {
    return op_supports_only_exec_type(n) && !is_in_out_op(n);
}

} // namespace

ngraph::snippets::pass::AlignElementType::AlignElementType(const ov::element::Type exec_type) : exec_type(exec_type) { }

bool ngraph::snippets::pass::AlignElementType::run_on_model(const std::shared_ptr<ov::Model> &m) {
    RUN_ON_FUNCTION_SCOPE(AlignElementType);

    // Inserts ConvertSaturation to `element_type` on the idx-th input of `op`, preserving runtime info
    auto insertConvert = [](const std::shared_ptr<ov::Node>& op, const size_t idx, const ov::element::Type& element_type) -> void {
        auto convert = std::make_shared<ngraph::snippets::op::ConvertSaturation>(op->input(idx).get_source_output(), element_type);
        ngraph::copy_runtime_info(op->get_input_node_shared_ptr(idx), convert);
        op->set_argument(idx, convert);
    };

    // NOTE: We don't call validate_and_infer_types() to avoid precision conflicts on inputs
    bool rewritten = false;
    auto ops = m->get_ordered_ops();
    for (auto& op : ops) {
        // Boundary ops and Converts never need alignment themselves
        if (is_in_out_op(op) || ov::is_type<ov::op::v0::Convert>(op)) {
            continue;
        }

        if (op_supports_only_exec_type(op)) {
            // NOTE: size_t loop index to match inputs().size() and avoid a signed/unsigned comparison
            for (size_t i = 0; i < op->inputs().size(); i++) {
                auto shared_input = op->get_input_node_shared_ptr(i);
                auto existing_convert = ov::as_type_ptr<ov::op::v0::Convert>(shared_input);
                // We should insert Convert before ops which support only the exec element type, only when:
                //  - the input is a Convert with an unsupported destination type
                //  - the input is an op which supports any element type
                // We couldn't unite these conditions and just check that the element type isn't the supported exec type
                // because we don't call validate_and_infer_types() so we don't know the new precisions
                if ((existing_convert && existing_convert->get_destination_type() != exec_type) || (!is_executable_op_only_on_exec_type(shared_input))) {
                    insertConvert(op, i, exec_type);
                    rewritten |= true;
                }
            }
            // Type relaxed nodes need their output type overridden manually since we skip shape/type inference
            if (auto tr_node = std::dynamic_pointer_cast<ngraph::op::TypeRelaxedBase>(op)) {
                tr_node->set_overridden_output_type(exec_type, 0);
                rewritten |= true;
            }
        } else { // branch for the Movement ops and MatMul ops in the future
            for (size_t i = 0; i < op->inputs().size(); i++) {
                auto shared_input = op->get_input_node_shared_ptr(i);
                // it's the original element type because we don't use validate_and_infer_type() anywhere
                const auto original_eltype = op->input(i).get_element_type();
                // If before this op there is another op that doesn't support execution on the original element type, we know that
                // before that op a reverse Convert will be inserted to support execution on the supported element type (first branch of the condition).
                // So we should return the original element type for operations that can support low precision
                if (is_executable_op_only_on_exec_type(shared_input) && original_eltype != exec_type) {
                    insertConvert(op, i, original_eltype);
                    rewritten |= true;
                }
            }
        }
    }

    return rewritten;
}

bool ngraph::snippets::pass::AlignElementType::opNeedsAlignElementType(const std::shared_ptr<ov::Node>& op, const ov::element::Type exec_type) {
    // At the moment Snippets support only Eltwise/Convert/FQ which have one output, so we can just call get_element_type()
    return is_executable_op_only_on_exec_type(op) && op->get_element_type() != exec_type;
}

View File

@ -7,6 +7,7 @@
#include "snippets/pass/collapse_subgraph.hpp"
#include "snippets/op/subgraph.hpp"
#include "snippets/utils.hpp"
#include <ngraph/opsets/opset1.hpp>
#include <ngraph/opsets/opset5.hpp>
@ -56,9 +57,19 @@ auto outputs_are_not_broadcastable(const std::shared_ptr<const Node>& node) -> b
return std::find_if_not(std::begin(outputs), std::end(outputs), check_shapes_broadcastable) != std::end(outputs);
}
auto is_layout_oblivious(const std::shared_ptr<const Node> &n) -> bool {
OV_ITT_SCOPED_TASK(ngraph::pass::itt::domains::SnippetsTransform, "Snippets::is_layout_oblivious")
auto is_layout_oblivious_binary = [](const std::shared_ptr<const Node> &n) -> bool {
auto is_supported_op(const std::shared_ptr<const Node> &n) -> bool {
OV_ITT_SCOPED_TASK(ngraph::pass::itt::domains::SnippetsTransform, "Snippets::is_supported_op")
auto is_supported_fq_op = [](const std::shared_ptr<const Node>& n) -> bool {
// TODO [92179]: Add support of FakeQuantize with non-constants inputs and with binarization algorithm.
const auto fq = ov::as_type_ptr<const opset1::FakeQuantize>(n);
return fq && fq->get_levels() != 2 &&
is_type<opset1::Constant>(n->get_input_node_shared_ptr(1)) &&
is_type<opset1::Constant>(n->get_input_node_shared_ptr(2)) &&
is_type<opset1::Constant>(n->get_input_node_shared_ptr(3)) &&
is_type<opset1::Constant>(n->get_input_node_shared_ptr(4));
};
auto is_supported_binary_eltwise_op = [](const std::shared_ptr<const Node> &n) -> bool {
return ov::is_type<opset1::Add>(n)
|| ov::is_type<opset1::Divide>(n)
|| ov::is_type<opset1::Equal>(n)
@ -79,10 +90,11 @@ auto is_layout_oblivious(const std::shared_ptr<const Node> &n) -> bool {
|| ov::is_type<opset1::Power>(n)
|| ov::is_type<opset1::SquaredDifference>(n)
|| ov::is_type<opset1::Subtract>(n)
|| ov::is_type<opset1::Xor>(n);
|| ov::is_type<opset1::Xor>(n)
|| ov::is_type<ngraph::op::v0::Convert>(n);
};
auto is_layout_oblivious_unary = [](const std::shared_ptr<const Node> &n) -> bool {
auto is_supported_unary_eltwise_op = [](const std::shared_ptr<const Node> &n) -> bool {
return ov::is_type<opset1::Abs>(n)
|| ov::is_type<opset1::Clamp>(n)
|| ov::is_type<opset1::Floor>(n)
@ -99,10 +111,10 @@ auto is_layout_oblivious(const std::shared_ptr<const Node> &n) -> bool {
|| ov::is_type<opset1::Tanh>(n)
|| ov::is_type<ngraph::op::v0::Gelu>(n)
|| ov::is_type<ngraph::op::v7::Gelu>(n)
|| ov::is_type<ngraph::op::v4::HSwish>(n)
|| ov::is_type<ngraph::op::v0::Convert>(n);
|| ov::is_type<ngraph::op::v4::Swish>(n)
|| ov::is_type<ngraph::op::v4::HSwish>(n);
};
return is_layout_oblivious_unary(n) || is_layout_oblivious_binary(n);
return is_supported_fq_op(n) || is_supported_unary_eltwise_op(n) || is_supported_binary_eltwise_op(n);
}
auto has_supported_in_out(const std::shared_ptr<const Node> &n) -> bool {
@ -162,7 +174,7 @@ auto update_out_tensor_name(std::shared_ptr<ngraph::snippets::op::Subgraph> &sub
} // namespace
bool AppropriateForSubgraph(const std::shared_ptr<const Node> &node) {
return is_layout_oblivious(node) && has_supported_in_out(node);
return is_supported_op(node) && has_supported_in_out(node);
}
void SetSnippetsNodeType(const std::shared_ptr<Node> &node, SnippetsNodeType nodeType) {
@ -435,7 +447,10 @@ TokenizeSnippets::TokenizeSnippets() {
// Result op has a single input
internal_inputs.push_back(source_result->input_value(0));
} else {
if (op::is_scalar_constant(input_node)) {
// We have to explicitly save FQ Constants to call ConstantFolding after Tokenization.
// After ConstantFolding we will move remaining non-scalar Constants from body using ConvertConstantsToParameters pass
if ((utils::is_scalar_constant(input_node)) ||
(ov::is_type<ov::op::v0::Constant>(input_node) && ov::is_type<ov::op::v0::FakeQuantize>(node))) {
internal_inputs.push_back(input_node->output(0));
} else {
external_inputs.push_back(input_value);
@ -461,10 +476,23 @@ TokenizeSnippets::TokenizeSnippets() {
throw ngraph_error("original node outputs size and extracted node outputs size doesn't much");
}
// After some transformations, a different number of Constants for some operations may be created
// than the actual number of Constants during tokenization.
// To avoid an unsupported number of non-scalar Constants in the future (plugin-specific limitation)
// we should calculate the potential number of non-scalar Constants that will be moved up from body.
size_t hidden_non_scalar_constant_count = 0;
if (const auto fq_node = ov::as_type_ptr<ov::op::v0::FakeQuantize>(node)) {
hidden_non_scalar_constant_count += ngraph::snippets::utils::get_non_scalar_constant_count_for_fq(fq_node);
}
ResultVector body_results;
std::vector<std::set<Input<Node>>> subgraph_result_inputs;
for (auto subgraph : input_subgraphs) {
// we should sum up the non-scalar Constants count from all input subgraphs
// because we will collapse them with our node and we should get the total count of non-scalar Constants
hidden_non_scalar_constant_count += ov::as_type_ptr<ngraph::snippets::op::Subgraph>(subgraph)->get_non_scalar_constants_count();
for (auto output : subgraph->outputs()) {
bool first_side_consumer = true;
@ -502,12 +530,15 @@ TokenizeSnippets::TokenizeSnippets() {
if (body_results.size() != subgraph_result_inputs.size()) {
throw ngraph_error("body results and node results size mismatch during subgraph collaps");
}
// todo: move this plugin-specific constraint to the plugin callback
if (body_parameters.size() + body_results.size() > 12) {
if (body_parameters.size() + body_results.size() + hidden_non_scalar_constant_count > 12) {
const std::string message_reset = "new subgraph is created. Impossible to schedule subgraph with " +
std::to_string(body_parameters.size()) + " inputs and " + std::to_string(body_results.size()) + " outputs.";
std::to_string(body_parameters.size()) + " inputs, " + std::to_string(body_results.size()) + " outputs and " +
std::to_string(hidden_non_scalar_constant_count) + " non-scalar constants.";
const std::string message_abort = "failed to continue subgraph. Impossible to schedule subgraph with " +
std::to_string(body_parameters.size()) + " inputs and " + std::to_string(body_results.size()) + " outputs.";
std::to_string(body_parameters.size()) + " inputs, " + std::to_string(body_results.size()) + " outputs and " +
std::to_string(hidden_non_scalar_constant_count) + " non-scalar constants.";
return abort_with_strategy(message_reset, message_abort);
}
@ -542,6 +573,7 @@ TokenizeSnippets::TokenizeSnippets() {
act_body1->get_parameters()[i]->set_friendly_name(body_parameters[i]->get_friendly_name());
}
subgraph->get_rt_info()["originalLayersNames"] = fusedNames;
subgraph->set_non_scalar_constants_count(hidden_non_scalar_constant_count);
remark(1) << "Replacement (merge) done for: "
<< subgraph->get_friendly_name()

View File

@ -0,0 +1,87 @@
// Copyright (C) 2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include "snippets/pass/common_optimizations.hpp"
#include <memory>
#include <ngraph/opsets/opset1.hpp>
#include <ngraph/pass/constant_folding.hpp>
#include "transformations/utils/utils.hpp"
#include "snippets/pass/fq_decomposition.hpp"
#include "snippets/op/subgraph.hpp"
#include "snippets/itt.hpp"
NGRAPH_RTTI_DEFINITION(ngraph::snippets::pass::CommonOptimizations, "Snippets::CommonOptimizations", 0);
namespace ngraph {
namespace snippets {
namespace pass {
// Move up Constants which aren't scalars from the body to the Subgraph and replace them with Parameters inside the body.
// This keeps only scalar Constants inside the generated kernel; all other Constants become regular subgraph inputs.
void ConvertConstantsToParameters(const std::shared_ptr<ngraph::snippets::op::Subgraph>& subgraph) {
    OV_ITT_SCOPED_TASK(ngraph::pass::itt::domains::SnippetsTransform, "Snippets::ConvertConstantsToParameters");
    auto body = subgraph->get_body();

    ParameterVector new_parameters;
    OutputVector new_external_inputs = subgraph->input_values();

    for (auto& op : body->get_ops()) {
        // Only non-scalar Constants are extracted; scalar Constants stay inside the body
        auto constant = ov::as_type_ptr<ov::op::v0::Constant>(op);
        if (!(constant && ngraph::shape_size(constant->get_shape()) != 1ul))
            continue;

        auto parameter = std::make_shared<opset1::Parameter>(constant->get_element_type(), constant->output(0).get_partial_shape());
        parameter->set_friendly_name(constant->get_friendly_name());
        ngraph::copy_runtime_info(constant, parameter);
        // Rewire body consumers to the new Parameter first, then hand the Constant to the outer graph
        constant->output(0).replace(parameter->output(0));

        new_external_inputs.push_back(constant);
        new_parameters.push_back(parameter);
    }

    if (new_parameters.size() != 0) {
        body->add_parameters(new_parameters);
        // Re-validate the body since its Parameter set changed before updating the Subgraph arguments
        body->validate_nodes_and_infer_types();
        subgraph->set_arguments(new_external_inputs);
    }
}
CommonOptimizations::CommonOptimizations() {
    // Callback executed for every matched Subgraph operation.
    ngraph::graph_rewrite_callback callback = [this](pattern::Matcher& m) {
        OV_ITT_SCOPED_TASK(ngraph::pass::itt::domains::SnippetsTransform, "Snippets::CommonOptimizations");
        auto subgraph = ngraph::as_type_ptr<ngraph::snippets::op::Subgraph>(m.get_match_root());
        if (transformation_callback(subgraph)) {
            return false;
        }

        const auto body = subgraph->get_body();
        const auto is_quantized = subgraph->is_quantized();

        // First transform every original Convert inside the body into ConvertTruncation to
        // keep the original semantics. If the Subgraph contains FakeQuantize, additionally
        // run the decomposition dedicated to quantized subgraphs.
        ngraph::pass::Manager manager;
        manager.register_pass<ngraph::snippets::pass::TransformConvertToConvertTruncation>();
        if (is_quantized) {
            manager.register_pass<ngraph::snippets::pass::CommonFakeQuantizeDecomposition>();
        }
        manager.run_passes(body);

        // At the moment only FakeQuantize can bring non-scalar Constants into a Subgraph,
        // so hoisting them to Parameters is needed for quantized models only.
        if (is_quantized) {
            ConvertConstantsToParameters(subgraph);
        }
        return true;
    };

    auto m = std::make_shared<ngraph::pattern::Matcher>(ngraph::pattern::wrap_type<ngraph::snippets::op::Subgraph>(),
                                                        "snippets::pass::CommonOptimizations");
    this->register_matcher(m, callback);
}
} // namespace pass
} // namespace snippets
} // namespace ngraph

View File

@ -3,9 +3,12 @@
//
#include <snippets/itt.hpp>
#include "snippets/snippets_isa.hpp"
#include "snippets/pass/convert_constants_to_scalars.hpp"
#include <ngraph/rt_info.hpp>
#include <ngraph/pattern/op/wrap_type.hpp>
#include "snippets/snippets_isa.hpp"
#include "snippets/pass/convert_constants.hpp"
#include "snippets/op/subgraph.hpp"
ngraph::snippets::pass::ConvertConstantsToScalars::ConvertConstantsToScalars() {
@ -24,5 +27,5 @@ ngraph::snippets::pass::ConvertConstantsToScalars::ConvertConstantsToScalars() {
return true;
};
register_matcher(std::make_shared<ov::pass::pattern::Matcher>(constants, matcher_name), callback);
register_matcher(std::make_shared<ov::pass::pattern::Matcher>(constants), callback);
}

View File

@ -0,0 +1,308 @@
// Copyright (C) 2018-2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include "snippets/pass/fq_decomposition.hpp"
#include "snippets/op/convert_saturation.hpp"
#include "snippets/itt.hpp"
#include <ngraph/opsets/opset1.hpp>
#include <ngraph/opsets/opset5.hpp>
#include <ngraph/partial_shape.hpp>
#include <ngraph/pattern/op/wrap_type.hpp>
#include <ngraph/rt_info.hpp>
#include <ngraph/pass/manager.hpp>
#include <numeric>
namespace {
// Returns true only when the FakeQuantize input range is valid, i.e. every element of
// input_low is strictly less than the corresponding element of input_high.
// The check is done by constant-folding a GreaterEqual(il, ih) node; if folding fails
// (ranges are not constant) the FQ is treated as invalid for decomposition.
bool isValidRangesInputs(const std::shared_ptr<ngraph::opset1::FakeQuantize>& fq) {
    auto il = fq->input_value(1);
    auto ih = fq->input_value(2);
    auto greater_equal = std::make_shared<ngraph::opset1::GreaterEqual>(il, ih);
    ngraph::OutputVector result(1);
    if (!greater_equal->constant_fold(result, greater_equal->input_values()))
        return false;
    auto res_node = std::dynamic_pointer_cast<const ngraph::opset1::Constant>(result[0].get_node_shared_ptr());
    // Guard against a folding result that is not a Constant: the original code
    // dereferenced res_node unconditionally, which would crash on a null cast.
    if (!res_node)
        return false;
    const std::vector<bool> comp_result = res_node->cast_vector<bool>();
    // Valid iff no element satisfies il >= ih.
    return !std::any_of(comp_result.begin(), comp_result.end(), [](const bool value) {
        return value;
    });
}
// True when the node is a Constant holding exactly one element (scalar or 1-element tensor).
bool is_scalar_constant(const std::shared_ptr<ngraph::Node>& source_output_node) {
    return ngraph::is_type<ngraph::opset1::Constant>(source_output_node) &&
           ngraph::shape_size(source_output_node->get_shape()) == 1;
}
}  // namespace
// Decomposes FakeQuantize into elementwise primitives supported by Snippets:
//   round(x * isc + ish) * osc + osh   (x pre-clamped to [input_low, input_high])
// where isc = (levels-1)/(ih-il), ish = -il*isc, osc = (oh-ol)/(levels-1), osh = ol.
// When the dequantize part is trivial, the whole FQ collapses to a single Multiply
// by precomputed output scales (see calculateScales).
ngraph::snippets::pass::FakeQuantizeDecomposition::FakeQuantizeDecomposition() {
MATCHER_SCOPE(FakeQuantizeDecomposition);
// Match FakeQuantize whose 4 range inputs are Constants (data input is arbitrary).
auto fake_quantize = ngraph::pattern::wrap_type<ngraph::opset1::FakeQuantize>(
OutputVector{ngraph::pattern::any_input(),
ngraph::pattern::wrap_type<opset1::Constant>(),
ngraph::pattern::wrap_type<opset1::Constant>(),
ngraph::pattern::wrap_type<opset1::Constant>(),
ngraph::pattern::wrap_type<opset1::Constant>()});
ngraph::matcher_pass_callback callback = [=](ngraph::pattern::Matcher& m) {
OV_ITT_SCOPED_TASK(ngraph::pass::itt::domains::SnippetsTransform, "Snippets::op::FakeQuantizeDecomposition")
auto& pattern_to_output = m.get_pattern_value_map();
const auto fake_quantize_node = std::dynamic_pointer_cast<ngraph::opset1::FakeQuantize>(
pattern_to_output.at(fake_quantize).get_node_shared_ptr());
// Skip when the callback rejects the node or input ranges are invalid (il >= ih).
if (!fake_quantize_node || transformation_callback(fake_quantize_node) ||
!isValidRangesInputs(fake_quantize_node)) {
return false;
}
Output<Node> data{fake_quantize_node->input_value(0)};
const Output<Node> input_low{fake_quantize_node->input_value(1)};
const Output<Node> input_high{fake_quantize_node->input_value(2)};
const Output<Node> output_low{fake_quantize_node->input_value(3)};
const Output<Node> output_high{fake_quantize_node->input_value(4)};
auto input_type = data.get_element_type();
std::vector<float> out_scales;
std::vector<float> cl, ch, isc, ish, osc, osh;
// Try to precompute scale/shift vectors; if possible, also check whether the FQ
// reduces to a single multiplication by out_scales.
const bool status = getScalesAndShifts(fake_quantize_node, cl, ch, isc, ish, osc, osh);
if (status) {
out_scales = calculateScales(fake_quantize_node->get_output_element_type(0), cl, ch, isc, ish, osc, osh);
}
// Dequantize stage is skipped when osc == 1 and osh == 0 everywhere, or when the
// single-Multiply shortcut (non-empty out_scales) applies.
const bool do_dequantize = !(status && ((std::all_of(osc.cbegin(),
osc.cend(),
[](float val) {
return val == 1.f;
}) &&
std::all_of(osh.cbegin(),
osh.cend(),
[](float val) {
return val == 0.f;
})) ||
out_scales.size() != 0));
// Rounding is required for dequantization or when the result stays in f32.
const bool do_rounding = do_dequantize || fake_quantize_node->get_output_element_type(0) == ngraph::element::f32;
ngraph::NodeVector decomp_ops;
// Align the data element type with the range inputs' type if they differ.
if (input_type != input_low.get_element_type()) {
input_type = input_low.get_element_type();
data = std::make_shared<ngraph::snippets::op::ConvertSaturation>(data, input_type);
decomp_ops.push_back(data.get_node_shared_ptr());
}
// if we set input_low or input_high in formula we got output = output_low and output = output_high
// respectively so we just clamp x
const auto max = std::make_shared<ngraph::opset1::Maximum>(data, input_low);
const auto min = std::make_shared<ngraph::opset1::Minimum>(max, input_high);
decomp_ops.push_back(max);
decomp_ops.push_back(min);
std::shared_ptr<ngraph::Node> result = nullptr;
if (out_scales.size() != 0) {
// Shortcut: the whole FQ is a single Multiply by the precomputed scales.
PartialShape scale_shape = input_low.get_partial_shape();
ngraph::PartialShape::broadcast_merge_into(scale_shape,
input_high.get_partial_shape(),
ov::op::AutoBroadcastType::NUMPY);
const auto scales =
std::make_shared<ngraph::opset1::Constant>(ngraph::element::f32, scale_shape.get_shape(), out_scales);
decomp_ops.push_back(scales);
result = std::make_shared<ngraph::opset1::Multiply>(min, scales);
decomp_ops.push_back(result);
} else {
// General quantization stage: x * isc - ish, built from graph nodes.
// (levels-1)
const auto levels_minus_one =
std::make_shared<ngraph::opset1::Constant>(input_type, Shape{}, fake_quantize_node->get_levels() - 1);
decomp_ops.push_back(levels_minus_one);
// (input_high - input_low)
const auto subInHighLow = std::make_shared<ngraph::opset1::Subtract>(input_high, input_low);
// (levels-1) / (input_high - input_low)
const auto isc = std::make_shared<ngraph::opset1::Divide>(levels_minus_one, subInHighLow);
// input_low * (levels-1) / (input_high - input_low)
const auto ish = std::make_shared<ngraph::opset1::Multiply>(input_low, isc);
decomp_ops.push_back(subInHighLow);
decomp_ops.push_back(isc);
decomp_ops.push_back(ish);
// x * (levels-1) / (input_high - input_low)
const auto after_isc_apply = std::make_shared<ngraph::opset1::Multiply>(min, isc);
// x * (levels-1) / (input_high - input_low) - input_low * (levels-1) / (input_high - input_low)
result = std::make_shared<ngraph::opset1::Subtract>(after_isc_apply, ish);
decomp_ops.push_back(after_isc_apply);
decomp_ops.push_back(result);
}
if (do_rounding) {
// round(x * (levels-1) / (input_high - input_low) - input_low * (levels-1) / (input_high - input_low))
result = std::make_shared<ngraph::opset5::Round>(result, ngraph::opset5::Round::RoundMode::HALF_TO_EVEN);
decomp_ops.push_back(result);
}
if (do_dequantize) {
// Dequantize stage: rounded value * osc + osh, built from graph nodes.
// (levels-1)
const auto levels_minus_one =
std::make_shared<ngraph::opset1::Constant>(input_type, Shape{}, fake_quantize_node->get_levels() - 1);
// (output_high - output_low)
const auto sub_out_high_low = std::make_shared<ngraph::opset1::Subtract>(output_high, output_low);
// (output_high - output_low) / (levels-1)
const auto osc = std::make_shared<ngraph::opset1::Divide>(sub_out_high_low, levels_minus_one);
decomp_ops.push_back(sub_out_high_low);
decomp_ops.push_back(osc);
// round(x * (levels-1) / (input_high - input_low) - input_low * (levels-1) / (input_high - input_low)) *
// (output_high - output_low) / (levels-1)
const auto after_osc_apply = std::make_shared<ngraph::opset1::Multiply>(result, osc);
// round(x * (levels-1) / (input_high - input_low) - input_low * (levels-1) / (input_high - input_low)) *
// (output_high - output_low) / (levels-1) + output_low
result = std::make_shared<ngraph::opset1::Add>(after_osc_apply, output_low);
decomp_ops.push_back(after_osc_apply);
decomp_ops.push_back(result);
}
// Convert back to the FQ's original output type if the decomposition changed it.
if (result->get_output_element_type(0) != fake_quantize_node->get_output_element_type(0)) {
result = std::make_shared<snippets::op::ConvertSaturation>(result, fake_quantize_node->get_output_element_type(0));
decomp_ops.push_back(result);
}
result->set_friendly_name(m.get_match_root()->get_friendly_name());
ngraph::copy_runtime_info(fake_quantize_node, decomp_ops);
ngraph::replace_node(m.get_match_root(), result);
return true;
};
auto m = std::make_shared<ngraph::pattern::Matcher>(fake_quantize, matcher_name);
register_matcher(m, callback);
}
// True when all four FakeQuantize range inputs (input_low/high, output_low/high,
// i.e. inputs 1..4) are scalar Constants.
bool ngraph::snippets::pass::FakeQuantizeDecomposition::isAllScalarConstant(const std::shared_ptr<const ngraph::Node>& node) {
    for (size_t idx = 1; idx <= 4; ++idx) {
        if (!is_scalar_constant(node->get_input_node_shared_ptr(idx)))
            return false;
    }
    return true;
}
// Extracts the FakeQuantize range Constants and converts them into the scale/shift
// form used by the decomposition formula:
//   round(x * isc + ish) * osc + osh, with x pre-clamped to [cl, ch]
// where isc = (levels-1)/(ih-il), ish = -il*isc, osc = (oh-ol)/(levels-1), osh = ol.
// Output vectors are broadcast to max(input sizes) / max(output sizes) respectively.
// Returns false when any of the 4 range inputs is not a Constant (outputs untouched).
bool ngraph::snippets::pass::FakeQuantizeDecomposition::getScalesAndShifts(
    const std::shared_ptr<const ngraph::opset1::FakeQuantize>& fq_node,
    std::vector<float>& cl,
    std::vector<float>& ch,
    std::vector<float>& isc,
    std::vector<float>& ish,
    std::vector<float>& osc,
    std::vector<float>& osh) {
    auto input_low_constant =
        std::dynamic_pointer_cast<ngraph::opset1::Constant>(fq_node->get_input_node_shared_ptr(1));
    auto input_high_constant =
        std::dynamic_pointer_cast<ngraph::opset1::Constant>(fq_node->get_input_node_shared_ptr(2));
    auto output_low_constant =
        std::dynamic_pointer_cast<ngraph::opset1::Constant>(fq_node->get_input_node_shared_ptr(3));
    auto output_high_constant =
        std::dynamic_pointer_cast<ngraph::opset1::Constant>(fq_node->get_input_node_shared_ptr(4));
    if (!input_low_constant || !input_high_constant || !output_low_constant || !output_high_constant)
        return false;

    auto input_low = input_low_constant->cast_vector<float>();
    auto input_high = input_high_constant->cast_vector<float>();
    auto output_low = output_low_constant->cast_vector<float>();
    auto output_high = output_high_constant->cast_vector<float>();
    auto levels = fq_node->get_levels();

    const auto input_size = std::max(input_low.size(), input_high.size());
    const auto output_size = std::max(output_low.size(), output_high.size());

    // Crop bounds are the input range itself.
    cl = input_low;
    ch = input_high;
    isc.resize(input_size, 0);
    ish.resize(input_size, 0);
    osc.resize(output_size, 0);
    osh.resize(output_size, 0);

    // size_t indices: the original `int` loops triggered signed/unsigned comparisons
    // against the size_t bounds.
    for (size_t i = 0; i < input_size; i++) {
        // A 1-element vector is broadcast against the larger one.
        const float il = input_low[input_low.size() == 1 ? 0 : i];
        const float ih = input_high[input_high.size() == 1 ? 0 : i];
        isc[i] = (levels - 1) / (ih - il);
        ish[i] = -il * isc[i];
    }
    for (size_t i = 0; i < output_size; i++) {
        const float ol = output_low[output_low.size() == 1 ? 0 : i];
        const float oh = output_high[output_high.size() == 1 ? 0 : i];
        osc[i] = (oh - ol) / (levels - 1);
        osh[i] = ol;
    }
    return true;
}
// Checks whether the whole FakeQuantize collapses into a single multiplication by the
// input scales (x * isc). That holds when the FQ only quantizes (osc == 1, osh == 0 in
// effect) to u8 or i8 and the crop bounds are already aligned with the target range.
// Returns `isc` when the shortcut applies, an empty vector otherwise.
std::vector<float> ngraph::snippets::pass::FakeQuantizeDecomposition::calculateScales(const ngraph::element::Type& out_type,
                                                                                     const std::vector<float>& cl,
                                                                                     const std::vector<float>& ch,
                                                                                     const std::vector<float>& isc,
                                                                                     const std::vector<float>& ish,
                                                                                     const std::vector<float>& osc,
                                                                                     const std::vector<float>& osh) {
    std::vector<float> out_scales;
    // u8 case: zero crop-low, zero input shift and an identity dequantize stage.
    if (out_type == ngraph::element::u8 &&
        std::all_of(cl.cbegin(),
                    cl.cend(),
                    [](float val) {
                        return val == 0.0f;
                    }) &&
        std::all_of(ish.cbegin(),
                    ish.cend(),
                    [](float val) {
                        return val == 0.0f;
                    }) &&
        std::all_of(osc.cbegin(),
                    osc.cend(),
                    [](float val) {
                        return val == 1.0f;
                    }) &&
        std::all_of(osh.cbegin(), osh.cend(), [](float val) {
            return val == 0.0f;
        })) {
        out_scales = isc;
    }
    // i8 case: input shift of +128, output shift of -128 (the usual symmetric i8 layout),
    // identity output scale, and crop bounds mapping onto [-128, 127] after scaling.
    static const float thr = 0.0001f;
    if (out_type == ngraph::element::i8 &&
        std::all_of(ish.cbegin(), ish.cend(), [](float val) { return std::abs(val - 128.f) < thr; }) &&
        std::all_of(osc.cbegin(), osc.cend(), [](float val) { return val == 1.f; }) &&
        std::all_of(osh.cbegin(), osh.cend(), [](float val) { return std::abs(val + 128.f) < thr; })) {
        bool is_crop_aligned = true;
        // size_t indices: the original `int` loops compared signed vs unsigned bounds.
        for (size_t i = 0; i < std::max(cl.size(), isc.size()); i++) {
            if (std::abs(cl[cl.size() == 1 ? 0 : i] * isc[isc.size() == 1 ? 0 : i] + 128.f) > thr) {
                is_crop_aligned = false;
            }
        }
        for (size_t i = 0; i < std::max(ch.size(), isc.size()); i++) {
            if (std::abs(ch[ch.size() == 1 ? 0 : i] * isc[isc.size() == 1 ? 0 : i] - 127.f) > thr) {
                is_crop_aligned = false;
            }
        }
        if (is_crop_aligned) {
            out_scales = isc;
        }
    }
    return out_scales;
}
// Runs the FakeQuantize decomposition pipeline on the whole model: decompose every FQ,
// constant-fold the new subexpressions, then validate once at the end (per-pass
// validation is disabled for speed).
bool ngraph::snippets::pass::CommonFakeQuantizeDecomposition::run_on_model(const std::shared_ptr<ngraph::Function>& f) {
    RUN_ON_FUNCTION_SCOPE(CommonFakeQuantizeDecomposition);
    ngraph::pass::Manager pipeline;
    pipeline.set_per_pass_validation(false);
    pipeline.register_pass<ngraph::snippets::pass::FakeQuantizeDecomposition>();
    pipeline.register_pass<ngraph::pass::ConstantFolding>();
    pipeline.register_pass<ngraph::pass::Validate>();
    pipeline.run_passes(f);
    // This wrapper pass itself reports no modification.
    return false;
}

View File

@ -1,72 +0,0 @@
// Copyright (C) 2018-2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include <snippets/itt.hpp>
#include "snippets/remarks.hpp"
#include "snippets/pass/insert_convert_on_inputs.hpp"
#include "snippets/snippets_isa.hpp"
#include "ngraph/type.hpp"
#include "ngraph/node.hpp"
#include <ngraph/opsets/opset1.hpp>
#include <ngraph/rt_info.hpp>
#include <ngraph/pattern/op/wrap_type.hpp>
#include <ngraph/pattern/op/or.hpp>
// We should recursively (after full sequences of ConvertTruncation) go through inputs and
// insert ConvertSaturation with supported element type before eltwises
// NOTE: JUST EXAMPLE:
// Parameter I8
// ConvertTruncation U8
// / | \
// ConvertTruncation F32 ConvertTruncation I32 ConvertTruncation BF16
// Eltwise ConvertSaturation FP32 ConvertTruncation I32
// <> Eltwise ConvertSaturation FP32
// <> Eltwise
// Recursively walks the consumers of `node`, descending through chains of
// ConvertTruncation, and inserts a ConvertSaturation to `element_type` in front of
// every consumer that does not already receive that type (Results are skipped, and an
// existing ConvertSaturation with the right destination type is reused).
// Returns true if the graph was modified anywhere in the traversal.
bool insertConvertSaturationAfterNode(const std::shared_ptr<ov::Node>& node, const ov::element::Type element_type) {
    bool rewritten = false;
    for (const auto& output : node->outputs()) {
        for (auto consumer : output.get_target_inputs()) {
            const auto output_shared_node = consumer.get_node()->shared_from_this();
            // Go down through the ConvertTruncation sequence.
            if (auto existing_convert_t = ov::as_type_ptr<ngraph::snippets::op::ConvertTruncation>(output_shared_node)) {
                // Bugfix: accumulate with |= instead of plain assignment — the original
                // overwrote `rewritten`, losing insertions made for earlier consumers.
                rewritten |= insertConvertSaturationAfterNode(existing_convert_t, element_type);
                continue;
            }
            // Insert ConvertSaturation if the consumer receives a different type and no
            // suitable ConvertSaturation already exists.
            auto existing_convert_s = ov::as_type_ptr<ngraph::snippets::op::ConvertSaturation>(output_shared_node);
            if ((!existing_convert_s && !ov::is_type<ov::op::v0::Result>(output_shared_node) && consumer.get_element_type() != element_type) ||
                (existing_convert_s && existing_convert_s->get_destination_type() != element_type)) {
                const auto convert = std::make_shared<ngraph::snippets::op::ConvertSaturation>(node, element_type);
                consumer.replace_source_output(convert);
                rewritten |= true;
            }
        }
    }
    return rewritten;
}
// Matches subgraph entry points (Parameters and scalar Constants) and inserts
// ConvertSaturation to the execution type in front of their consumers.
ngraph::snippets::pass::InsertConvertOnInputs::InsertConvertOnInputs(const ov::element::Type exec_type) {
    MATCHER_SCOPE(InsertConvertOnInputs);
    auto param_pattern = ngraph::pattern::wrap_type<ngraph::opset1::Parameter>();
    // Constants qualify only when they hold a single element.
    auto scalar_pattern = pattern::wrap_type<opset1::Constant>(
        [=](Output<Node> output) -> bool { return ngraph::shape_size(output.get_shape()) == 1; });
    auto input = std::make_shared<pattern::op::Or>(OutputVector{ param_pattern, scalar_pattern });

    ngraph::matcher_pass_callback callback = [this, exec_type](ngraph::pattern::Matcher& m) {
        OV_ITT_SCOPED_TASK(ngraph::pass::itt::domains::SnippetsTransform, "Snippets::op::InsertConvertOnInputs")
        // Reports modification iff at least one ConvertSaturation was inserted.
        return insertConvertSaturationAfterNode(m.get_match_root(), exec_type);
    };
    register_matcher(std::make_shared<ngraph::pattern::Matcher>(input, matcher_name), callback);
}

View File

@ -17,124 +17,43 @@ using namespace ngraph;
namespace {
std::shared_ptr<ngraph::Node> numpy_broadcast_node(const ngraph::Output<ngraph::Node>& value,
const ngraph::Shape& output_shape, const ngraph::Shape& source_shape) {
std::shared_ptr<ngraph::Node> broadcast_node_last_dim(const ngraph::Output<ngraph::Node>& value,
const ov::Shape& target_shape, const ov::Shape& normalized_shape) {
std::shared_ptr<ngraph::Node> broadcasted_node = value.get_node_shared_ptr();
if (output_shape == value.get_shape()) {
if (target_shape == value.get_shape()) {
return broadcasted_node;
}
NGRAPH_CHECK(source_shape.size() == output_shape.size(),
"Ranks of source_shape and output_shape dont match: ",
source_shape.size(),
" vs ",
output_shape.size());
bool do_broadcast = output_shape.size() > value.get_shape().size();
if (!do_broadcast) {
for (size_t index = 0; index < output_shape.size(); ++index) {
if (source_shape.at(index) == 1 && output_shape.at(index) != 1) {
do_broadcast = true;
break;
}
}
}
remark(2) << "Insert explicit broadcast " << value.get_node()->get_type_name()
<< " " << broadcasted_node->get_shape() << " -> " << output_shape << std::endl;
// it shouldn't be a problem for now since we don't consider StridedSlice and Broadcast here
if (auto constant = ngraph::as_type_ptr<ngraph::opset1::Constant>(broadcasted_node)) {
if (constant->get_shape() == ngraph::Shape() || ngraph::shape_size(constant->get_shape()) == 1) {
remark(2) << "Insert explicit broadcast " << value.get_node()->get_type_name()
<< " to scalar constant " << constant->get_shape() << " -- aborting!" << std::endl;
return broadcasted_node;
}
}
if (auto constant = ngraph::as_type_ptr<ngraph::snippets::op::Scalar>(broadcasted_node)) {
if (constant->get_shape() == ngraph::Shape() || ngraph::shape_size(constant->get_shape()) == 1) {
remark(2) << "Insert explicit broadcast " << value.get_node()->get_type_name()
<< " to scalar constant " << constant->get_shape() << " -- aborting!" << std::endl;
return broadcasted_node;
}
}
if (do_broadcast) {
// ShapeOf
broadcasted_node = std::make_shared<ngraph::snippets::op::BroadcastMove>(broadcasted_node, output_shape);
// Insert BroadcastMove only if the last dimension needs to be broadcasted. Higher-level dims broadcasting
// will be handled by pointer arithmetics in TileScheduler
if (*target_shape.rbegin() != *normalized_shape.rbegin()) {
ov::Shape broadcasted_shape = normalized_shape;
*broadcasted_shape.rbegin() = *target_shape.rbegin();
broadcasted_node = std::make_shared<ngraph::snippets::op::BroadcastMove>(broadcasted_node, broadcasted_shape);
}
return broadcasted_node;
}
ngraph::Shape calculate_broadcast_shape(ngraph::Shape lhs_shape, ngraph::Shape rhs_shape) {
ngraph::Shape result;
auto lhs_rank = lhs_shape.size();
auto rhs_rank = rhs_shape.size();
auto max_rank = std::max(lhs_rank, rhs_rank);
// left-pad the lhs_shape with ones
lhs_shape.insert(begin(lhs_shape), max_rank - lhs_rank, 1);
// left-pad the rhs_shape with ones
rhs_shape.insert(begin(rhs_shape), max_rank - rhs_rank, 1);
for (size_t index = 0; index < max_rank; ++index) {
size_t lhs_dim = lhs_shape.at(index);
size_t rhs_dim = rhs_shape.at(index);
if (lhs_dim != rhs_dim && lhs_dim != 1 && rhs_dim != 1) {
throw ngraph::ngraph_error("incompatible shapes");
}
result.push_back(std::max(lhs_dim, rhs_dim));
std::pair<ov::Shape, std::vector<ov::Shape>> get_numpy_broadcast_shapes(const std::vector<ov::Shape>& input_shapes) {
ov::PartialShape target_shape = input_shapes.front();
for (auto i = 1; i < input_shapes.size(); i++) {
if (!ov::PartialShape::broadcast_merge_into(target_shape, input_shapes[i], op::AutoBroadcastType::NUMPY))
throw ngraph::ngraph_error("InsertMoveBroadcast: Failed broadcast-merge input shapes");
}
return result;
}
std::pair<ngraph::Shape, std::vector<ngraph::Shape>> get_numpy_broadcast_shapes(const std::vector<ngraph::Shape>& input_shapes) {
ngraph::Shape target_shape = std::accumulate(begin(input_shapes), end(input_shapes), ngraph::Shape{}, calculate_broadcast_shape);
std::vector<ngraph::Shape> full_shapes;
for (const ngraph::Shape& input : input_shapes) {
ngraph::Shape padded_shape{input};
padded_shape.insert(begin(padded_shape), target_shape.size() - padded_shape.size(), 1);
full_shapes.push_back(move(padded_shape));
std::vector<ov::Shape> normalized_shapes;
for (const auto& input : input_shapes) {
ov::Shape padded_shape{input};
padded_shape.insert(padded_shape.begin(), target_shape.size() - padded_shape.size(), 1);
normalized_shapes.push_back(std::move(padded_shape));
}
return {target_shape, full_shapes};
}
auto reset_broacast_config(const std::shared_ptr<ngraph::Node>& op) -> void {
using namespace ngraph;
bool is_scalar = false;
for (auto input : op->inputs()) {
if (input.get_shape() == Shape() || ngraph::shape_size(input.get_shape()) == 1) {
is_scalar = true;
}
}
if (!is_scalar) {
if (auto binary = std::dynamic_pointer_cast<ngraph::op::util::BinaryElementwiseArithmetic>(op)) {
binary->set_autob(ngraph::op::AutoBroadcastType::NONE);
} else if (auto binary = std::dynamic_pointer_cast<ngraph::op::util::BinaryElementwiseComparison>(op)) {
binary->set_autob(ngraph::op::AutoBroadcastType::NONE);
} else if (auto binary = std::dynamic_pointer_cast<ngraph::op::util::BinaryElementwiseLogical>(op)) {
binary->set_autob(ngraph::op::AutoBroadcastType::NONE);
}
}
return {target_shape.get_shape(), normalized_shapes};
}
} // namespace
// adds explicit broadcasts if needed
// ToDO: this indeed make model not reshapable, need to come up with more clever way to insert fake broadcast,
// well on the other hand, if we replace scalar constant with Scalar op / or ShapeOf, we could have broadcasts that are reshapable
// TODO: generate FakeBroadcast if and only if broadcast is done by w dimension
ngraph::snippets::pass::InsertMoveBroadcast::InsertMoveBroadcast() {
MATCHER_SCOPE(InsertMoveBroadcast);
ngraph::graph_rewrite_callback callback = [this](ngraph::pattern::Matcher &m) {
@ -145,28 +64,39 @@ ngraph::snippets::pass::InsertMoveBroadcast::InsertMoveBroadcast() {
return false;
}
std::vector<ngraph::Shape> input_shapes;
for (const auto& input : values) {
input_shapes.push_back(input.get_shape());
auto is_scalar_constant = [](const ov::Output<ov::Node>& v){
if (auto constant = ov::as_type_ptr<ov::op::v0::Constant>(v.get_node_shared_ptr())) {
if (constant->get_shape().empty() || ngraph::shape_size(constant->get_shape()) == 1) {
return true;
}
}
return false;
};
std::vector<ov::Shape> input_shapes;
std::vector<bool> ignore_as_scalar;
for (const auto& val : values) {
input_shapes.emplace_back(val.get_shape());
ignore_as_scalar.push_back(is_scalar_constant(val));
}
// find the output tensor's shape, then broadcast all inputs so that they are compatible
// find the output tensor's shape, then broadcast all inputs so that they are compatible with respect to the last dim
auto bcast_shapes = get_numpy_broadcast_shapes(input_shapes);
ngraph::OutputVector broadcasted_inputs;
for (size_t i = 0; i < values.size(); ++i) {
auto node = numpy_broadcast_node(values[i], bcast_shapes.first, bcast_shapes.second[i]);
ngraph::copy_runtime_info(root, node);
broadcasted_inputs.push_back(node);
if (ignore_as_scalar[i]) {
broadcasted_inputs.push_back(values[i]);
} else {
auto node = broadcast_node_last_dim(values[i], bcast_shapes.first, bcast_shapes.second[i]);
ngraph::copy_runtime_info(root, node);
broadcasted_inputs.push_back(node);
}
}
auto new_args = ngraph::as_node_vector(broadcasted_inputs);
for (size_t i = 0; i < new_args.size(); i++) {
root->input(i).replace_source_output(new_args[i]->output(0));
}
reset_broacast_config(root);
return true;
};

View File

@ -27,32 +27,20 @@ ngraph::snippets::pass::LoadMoveBroadcastToBroadcastLoad::LoadMoveBroadcastToBro
const auto input = pm.at(load_pattern).get_node_shared_ptr();
const auto param = pm.at(param_pattern).get_node_shared_ptr();
// check if load has more than 1 user to avoid load+broadcast load on the same parameter
if (input->output(0).get_target_inputs().size() != 1) {
// Cannot rewrite Broadcast + Load if load has more than 1 user
// or more than one input, or if Broadcast has several inputs
if (input->output(0).get_target_inputs().size() != 1 ||
root->inputs().size() != 1 || input->inputs().size() != 1) {
return false;
}
if (root->inputs().size() != 1 || input->inputs().size() != 1) {
throw ngraph_error("cannot rewrite Broadcast load with more than one input");
}
auto inshape = root->input(0).get_shape();
auto outshape = root->output(0).get_shape();
auto broadcastload = std::make_shared<snippets::op::BroadcastLoad>(param, outshape);
Shape bct(inshape.size(), 0);
for (size_t k = 0; k < inshape.size(); k++) {
if (inshape[k] != outshape[k] && inshape[k] == 1) {
bct[k] = 1;
}
}
// Todo: consider refactoring BroadcastLoad, it seems we don't need broadcast_info at this point.
broadcastload->set_broadcast_info(bct);
if (inshape.back() == 1 && outshape.back() != 1) {
ngraph::copy_runtime_info(root, broadcastload);
ngraph::replace_node(root, broadcastload);
return true;
} else {
return false;
}
ngraph::copy_runtime_info(root, broadcastload);
ngraph::replace_node(root, broadcastload);
return true;
});
}

View File

@ -1,31 +0,0 @@
// Copyright (C) 2018-2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include <snippets/itt.hpp>
#include "snippets/op/convert_saturation.hpp"
#include "snippets/pass/reset_type_relaxed_node_precision.hpp"
#include "ngraph_ops/type_relaxed.hpp"
#include <ngraph/rt_info.hpp>
ngraph::snippets::pass::ResetTypeRelaxedNodePrecision::ResetTypeRelaxedNodePrecision(const ov::element::Type exec_type) : exec_type(exec_type) { }
// Overrides the output precision of every TypeRelaxed node in the model with
// exec_type; all other nodes are re-validated so the new types propagate downstream
// (the model is walked in topological order).
bool ngraph::snippets::pass::ResetTypeRelaxedNodePrecision::run_on_model(const std::shared_ptr<ov::Model> &m) {
    RUN_ON_FUNCTION_SCOPE(ResetTypeRelaxedNodePrecision);
    bool rewritten = false;
    for (auto& op : m->get_ordered_ops()) {
        if (auto node = std::dynamic_pointer_cast<ngraph::op::TypeRelaxedBase>(op)) {
            // size_t index: the original `int i` compared signed vs unsigned against
            // outputs().size().
            for (size_t i = 0; i < op->outputs().size(); i++) {
                node->set_overridden_output_type(exec_type, i);
                rewritten = true;
            }
        } else {
            op->validate_and_infer_types();
        }
    }
    return rewritten;
}

View File

@ -5,7 +5,7 @@
#include "snippets/remarks.hpp"
#include <snippets/itt.hpp>
#include "snippets/pass/transform_convert_to_truncation.hpp"
#include "snippets/pass/transform_convert.hpp"
#include "snippets/snippets_isa.hpp"
#include <ngraph/opsets/opset1.hpp>
@ -14,15 +14,19 @@
ngraph::snippets::pass::TransformConvertToConvertTruncation::TransformConvertToConvertTruncation() {
MATCHER_SCOPE(TransformConvertToConvertTruncation);
auto convert = std::make_shared<pattern::op::Label>(pattern::any_input(),
[](const std::shared_ptr<const Node> &n) {
return ov::is_type<ngraph::opset1::Convert>(n) &&
!ov::is_type<op::ConvertTruncation>(n) &&
!ov::is_type<op::ConvertSaturation>(n);
});
register_matcher(std::make_shared<ngraph::pattern::Matcher>(
ngraph::pattern::wrap_type<ngraph::opset1::Convert>()),
ngraph::pattern::wrap_type<ngraph::opset1::Convert>(), matcher_name),
[this](ngraph::pattern::Matcher &m) {
OV_ITT_SCOPED_TASK(ngraph::pass::itt::domains::SnippetsTransform, "Snippets::op::TransformConvertToConvertTruncation")
const auto root = m.get_match_root();
const auto convert = ngraph::as_type_ptr<ngraph::opset1::Convert>(root);
if (!convert)
return false;
auto convert_truncation = std::make_shared<op::ConvertTruncation>(convert->get_input_source_output(0),
convert->get_destination_type());
convert_truncation->set_friendly_name(convert->get_friendly_name());
@ -31,4 +35,4 @@ ngraph::snippets::pass::TransformConvertToConvertTruncation::TransformConvertToC
return true;
});
}
}

View File

@ -0,0 +1,57 @@
// Copyright (C) 2018-2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include "snippets/utils.hpp"
#include "snippets/pass/fq_decomposition.hpp"
// Predicts how many non-scalar Constants the FakeQuantize decomposition will leave in
// the graph (after constant folding), so the Subgraph can reserve inputs for them.
auto ngraph::snippets::utils::get_non_scalar_constant_count_for_fq(const std::shared_ptr<ngraph::opset1::FakeQuantize>& fq) -> size_t {
std::vector<float> out_scales;
std::vector<float> cl, ch, isc, ish, osc, osh;
const bool status = ngraph::snippets::pass::FakeQuantizeDecomposition::getScalesAndShifts(fq, cl, ch, isc, ish, osc, osh);
if (status) {
// Single-Multiply shortcut: the FQ folds into one scale Constant, which is
// non-scalar exactly when it has more than one element (hence 0 or 1).
out_scales = ngraph::snippets::pass::FakeQuantizeDecomposition::calculateScales(fq->get_output_element_type(0), cl, ch, isc, ish, osc, osh);
if (out_scales.size() != 0) {
return out_scales.size() != 1;
}
}
// "Only quantized" means the dequantize stage is an identity (osc == 1, osh == 0),
// so output-range Constants produce no new non-scalar Constants.
const bool only_quantized = status &&
std::all_of(osc.cbegin(), osc.cend(),
[](float val) { return val == 1.f; }) &&
std::all_of(osh.cbegin(), osh.cend(),
[](float val) { return val == 0.f; });
const bool il = ngraph::shape_size(fq->input(1).get_shape()) != 1lu;
const bool ih = ngraph::shape_size(fq->input(2).get_shape()) != 1lu;
const bool ol = !only_quantized && ngraph::shape_size(fq->input(3).get_shape()) != 1lu;
const bool oh = !only_quantized && ngraph::shape_size(fq->input(4).get_shape()) != 1lu;
// FakeQuantize decomposition has the following formula:
// round(x * (levels-1) / (ih - il) - il * (levels-1) / (ih - il)) * (oh - ol) / (levels-1) + ol
// After the decomposition there is call of ConstantsFolding pass that generates new Constants:
// - isc := (levels-1) / (ih - il)
// - ish := -il * isc
// - osc := (oh - ol) / (levels-1)
// - osh := ol
// New formula:
// round(x * isc + ish) * osc + osh
// Thus, after FakeQuantize decomposition we have 6 Constants instead of original 4:
// ih, il (for Max/Min), isc, ish, osc, osh
// Some of them can be scalar or non-scalar. It depends on which original 4 Constants are non-scalar
// To sum it up, below conditions check all possible cases to calculate count of new generated non-scalars
if (ol && il && ih)
return 6;
else if ((ol && (il || ih)) || (il && ih && oh))
return 5;
else if ((il && oh) || (ih && oh) || (il && ih))
return 4;
else if (il || ih)
return 3;
else if (ol)
return 2;
else if (oh)
return 1;
return 0;
}

View File

@ -52,40 +52,6 @@ TEST(TransformationTests, FuseLoadWithBroadcastMoveByX) {
ASSERT_TRUE(res.first) << res.second;
}
// Broadcast along a non-last dimension ({1,2} -> {2,2}) must NOT be fused into a
// BroadcastLoad, so the transformed graph is expected to be identical to the input
// (the reference below is built the same way, with BroadcastMove left in place).
TEST(TransformationTests, NotFuseLoadWithBroadcastMoveByY) {
std::shared_ptr<Function> f(nullptr), f_ref(nullptr);
{
auto data0 = std::make_shared<opset1::Parameter>(element::f32, Shape{1, 2});
auto data1 = std::make_shared<opset1::Parameter>(element::f32, Shape{2, 2});
auto load0 = std::make_shared<snippets::isa::Load>(data0);
auto load1 = std::make_shared<snippets::isa::Load>(data1);
auto bct = std::make_shared<snippets::isa::BroadcastMove>(load0, load1->get_shape());
auto add = std::make_shared<opset1::Add>(bct, load1);
auto store = std::make_shared<snippets::isa::Store>(add);
f = std::make_shared<Function>(NodeVector{store}, ParameterVector{data0, data1});
// Run the pass under test; it should leave the graph unchanged here.
pass::Manager m;
m.register_pass<pass::InitNodeInfo>();
m.register_pass<snippets::pass::LoadMoveBroadcastToBroadcastLoad>();
m.run_passes(f);
ASSERT_NO_THROW(check_rt_info(f));
}
{
// Reference: the same graph, untouched.
auto data0 = std::make_shared<opset1::Parameter>(element::f32, Shape{1, 2});
auto data1 = std::make_shared<opset1::Parameter>(element::f32, Shape{2, 2});
auto load0 = std::make_shared<snippets::isa::Load>(data0);
auto load1 = std::make_shared<snippets::isa::Load>(data1);
auto bct = std::make_shared<snippets::isa::BroadcastMove>(load0, load1->get_shape());
auto add = std::make_shared<opset1::Add>(bct, load1);
auto store = std::make_shared<snippets::isa::Store>(add);
f_ref = std::make_shared<Function>(NodeVector{store}, ParameterVector{data0, data1});
}
auto res = compare_functions(f, f_ref);
ASSERT_TRUE(res.first) << res.second;
}
TEST(TransformationTests, NoFuseLoadWithBroadcastMoveMultipleUsers) {
std::shared_ptr<Function> f(nullptr), f_ref(nullptr);
{

View File

@ -22,7 +22,7 @@ using namespace ngraph;
TEST_F(TransformationTestsF, InsertBroadcastMove) {
{
auto data0 = std::make_shared<opset1::Parameter>(element::f32, Shape{2, 3});
auto data1 = std::make_shared<opset1::Parameter>(element::f32, Shape{1, 1, 3});
auto data1 = std::make_shared<opset1::Parameter>(element::f32, Shape{1, 2, 1});
auto add = std::make_shared<opset1::Add>(data0, data1);
function = std::make_shared<Function>(NodeVector{add}, ParameterVector{data0, data1});
@ -30,10 +30,9 @@ TEST_F(TransformationTestsF, InsertBroadcastMove) {
}
{
auto data0 = std::make_shared<opset1::Parameter>(element::f32, Shape{2, 3});
auto data1 = std::make_shared<opset1::Parameter>(element::f32, Shape{1, 1, 3});
auto move0 = std::make_shared<snippets::isa::BroadcastMove>(data0, Shape{1, 2, 3});
auto data1 = std::make_shared<opset1::Parameter>(element::f32, Shape{1, 2, 1});
auto move1 = std::make_shared<snippets::isa::BroadcastMove>(data1, Shape{1, 2, 3});
auto add = std::make_shared<opset1::Add>(move0, move1);
auto add = std::make_shared<opset1::Add>(data0, move1);
function_ref = std::make_shared<Function>(NodeVector{add}, ParameterVector{data0, data1});
}
}

View File

@ -0,0 +1,49 @@
// Copyright (C) 2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include <gtest/gtest.h>
#include "common_test_utils/ngraph_test_utils.hpp"
#include "snippets/pass/common_optimizations.hpp"
#include "snippets/op/subgraph.hpp"
#include "fake_quantize_function.hpp"
#include "function_helper.hpp"
namespace ov {
namespace test {
namespace snippets {
// Fixture that runs the snippets CommonOptimizations pipeline (which performs
// FakeQuantize decomposition) and compares the resulting Subgraph bodies
// instead of the outer functions, since the decomposition lives inside the
// snippets Subgraph op.
class FakeQuantizeDecompositionTest : public TransformationTestsF {
public:
    // Registers the pass pipeline under test on the fixture's pass manager.
    void register_passes() {
        manager.register_pass<ngraph::snippets::pass::CommonOptimizations>();
    }

    void TearDown() override {
        TransformationTestsF::TearDown();

        const auto subgraph = FunctionHelper::getSubgraph(function);
        const auto subgraph_op = std::dynamic_pointer_cast<ngraph::snippets::op::Subgraph>(subgraph);
        // Fix: the original dereferenced the dynamic_pointer_cast result without
        // checking it — undefined behavior if the retrieved node is not a Subgraph.
        ASSERT_TRUE(subgraph == nullptr || subgraph_op != nullptr) << "retrieved node is not a snippets Subgraph";
        const auto body = subgraph_op == nullptr ? nullptr : subgraph_op->get_body();

        const auto subgraph_ref = FunctionHelper::getSubgraph(function_ref);
        const auto subgraph_ref_op = std::dynamic_pointer_cast<ngraph::snippets::op::Subgraph>(subgraph_ref);
        ASSERT_TRUE(subgraph_ref == nullptr || subgraph_ref_op != nullptr) << "reference node is not a snippets Subgraph";
        const auto body_ref = subgraph_ref_op == nullptr ? nullptr : subgraph_ref_op->get_body();

        // Compare the decomposed body against the manually-built reference body.
        const auto res = comparator.compare(body, body_ref);
        ASSERT_TRUE(res.valid) << res.message;
    }
};
TEST_F(FakeQuantizeDecompositionTest, smoke_Snippets_PerTensorFakeQuantizeDecomposition) {
    // Per-tensor case: the four FakeQuantize constant inputs use empty shapes,
    // i.e. scalars. The trailing 1.f is forwarded to the builder
    // (presumably a FQ parameter — see FakeQuantizeFunction for its meaning).
    function = FakeQuantizeFunction::getSubgraphWithFakeQuantize(
        {1, 3, 16, 16}, element::f32, {{}, {}, {}, {}}, 1.f);
    // Reference: the same subgraph built with FakeQuantize already decomposed.
    function_ref = FakeQuantizeFunction::getSubgraphWithDecomposedFakeQuantize(
        {1, 3, 16, 16}, element::f32, {{}, {}, {}, {}}, 1.f);
    // The fixture's TearDown applies the registered passes' result comparison
    // of the Subgraph bodies of `function` and `function_ref`.
    register_passes();
}
} // namespace snippets
} // namespace test
} // namespace ov

View File

@ -41,31 +41,21 @@ TEST_P(InsertLoadStoreTests, ThreeInputsEltwise) {
namespace InsertLoadStoreTestsInstantiation {
using ov::Shape;
std::vector<Shape> inputShapes1{{1, 1, 2, 5, 1}, {1, 4, 1, 5, 1}};
std::vector<Shape> inputShapes2{{1, 1, 2, 5, 1}, {1, 4, 1, 5, 1}, {1, 4, 1, 5, 16}};
std::vector<Shape> inputShapes{{1, 4, 1, 5, 1}, {1, 4, 2, 5, 1}};
std::vector<Shape> broadcastShapes{{1, 4, 1, 5, 16}, {1, 4, 2, 5, 16}};
Shape exec_domain{1, 4, 2, 5, 16};
Shape emptyShape{};
INSTANTIATE_TEST_SUITE_P(smoke_Snippets_BroadcastLoad, InsertLoadStoreTests,
::testing::Combine(
::testing::Values(exec_domain),
::testing::ValuesIn(inputShapes1),
::testing::ValuesIn(inputShapes1),
::testing::Values(inputShapes[0]),
::testing::Values(inputShapes[1]),
::testing::Values(emptyShape),
::testing::Values(exec_domain),
::testing::Values(exec_domain)),
::testing::Values(broadcastShapes[0]),
::testing::Values(broadcastShapes[1])),
InsertLoadStoreTests::getTestCaseName);
INSTANTIATE_TEST_SUITE_P(smoke_Snippets_BroadcastMove, InsertLoadStoreTests,
::testing::Combine(
::testing::Values(exec_domain),
::testing::Values(Shape {1, 4, 1, 5, 16}),
::testing::ValuesIn(inputShapes2),
::testing::Values(emptyShape),
::testing::Values(exec_domain),
::testing::Values(exec_domain)),
InsertLoadStoreTests::getTestCaseName);
} // namespace InsertLoadStoreTestsInstantiation
} // namespace snippets
} // namespace test

View File

@ -39,7 +39,7 @@ TEST_P(InsertMoveBroadcastTests, AddBroadcast) {
namespace InsertMoveBroadcastTestsInstantiation {
using ov::Shape;
std::vector<Shape> inputShapes0 {{1, 1, 1, 3}, {1, 1, 2, 3}, {1, 8, 1, 3}};
std::vector<Shape> inputShapes0 {{1, 8, 2, 1}};
std::vector<Shape> inputShapes1 {{1, 8, 2, 3}};
Shape broadcastShape {1, 8, 2, 3};
Shape emptyShape {};
@ -59,12 +59,12 @@ INSTANTIATE_TEST_SUITE_P(smoke_Snippets_BroadcastOn1, InsertMoveBroadcastTests,
::testing::Values(broadcastShape)),
InsertMoveBroadcastTests::getTestCaseName);
std::vector<Shape> inputShapesBoth0 {{4, 1, 2, 3}, {1, 8, 1, 3}, {1, 1, 2, 3}};
std::vector<Shape> inputShapesBoth1 {{1, 8, 1, 3}, {4, 1, 2, 3}, {4, 8, 1, 3}};
Shape broadcastShapeBoth{4, 8, 2, 3};
std::vector<insertMoveBroadcastParams> params = {std::make_tuple(inputShapesBoth0[0], inputShapesBoth1[0], broadcastShapeBoth, broadcastShapeBoth),
std::make_tuple(inputShapesBoth0[1], inputShapesBoth1[1], broadcastShapeBoth, broadcastShapeBoth),
std::make_tuple(inputShapesBoth0[2], inputShapesBoth1[2], broadcastShapeBoth, broadcastShapeBoth)};
std::vector<Shape> inputShapesBoth0 {{4, 1, 2, 1}, {1, 8, 1, 1}, {1, 1, 2, 3}};
std::vector<Shape> inputShapesBoth1 {{4, 8, 2, 3}, {4, 1, 2, 3}, {4, 8, 1, 1}};
std::vector<Shape> broadcastShapeBoth{{4, 1, 2, 3}, {1, 8, 1, 3}, {4, 8, 1, 3}};
std::vector<insertMoveBroadcastParams> params = {std::make_tuple(inputShapesBoth0[0], inputShapesBoth1[0], broadcastShapeBoth[0], emptyShape),
std::make_tuple(inputShapesBoth0[1], inputShapesBoth1[1], broadcastShapeBoth[1], emptyShape),
std::make_tuple(inputShapesBoth0[2], inputShapesBoth1[2], emptyShape, broadcastShapeBoth[2])};
INSTANTIATE_TEST_SUITE_P(smoke_Snippets_BroadcastOnBoth, InsertMoveBroadcastTests,
::testing::ValuesIn(params),

View File

@ -10,6 +10,7 @@
#include <string>
#include <map>
#include <mutex>
namespace ov {
namespace intel_cpu {

View File

@ -17,6 +17,7 @@
#include "snippets_transformations/op/load_convert.hpp"
#include "snippets_transformations/op/store_convert.hpp"
#include "ngraph_transformations/op/swish_cpu.hpp"
#include <ngraph/opsets/opset5.hpp>
@ -114,6 +115,7 @@ ov::intel_cpu::CPUTargetMachine::CPUTargetMachine(dnnl::impl::cpu::x64::cpu_isa_
// jitters[ngraph::opset1::Tan::get_type_info_static()] = CREATE_EMITTER(); // not supported
jitters[ngraph::opset1::Tanh::get_type_info_static()] = CREATE_EMITTER(ov::intel_cpu::jit_tanh_emitter);
jitters[ov::intel_cpu::SwishNode::get_type_info_static()] = CREATE_EMITTER(ov::intel_cpu::jit_swish_emitter);
jitters[ngraph::op::v4::HSwish::get_type_info_static()] = CREATE_EMITTER(ov::intel_cpu::jit_hswish_emitter);
// jitters[ngraph::opset1::HardSigmoid::get_type_info_static()] = CREATE_EMITTER(); // not supported
// jitters[ngraph::opset1::Selu::get_type_info_static()] = CREATE_EMITTER(); // not supported

View File

@ -5,6 +5,7 @@
#pragma once
#include "ngraph/opsets/opset5.hpp"
#include "ngraph_transformations/op/swish_cpu.hpp"
#include "jit_dnnl_emitters.hpp"
namespace ov {
@ -102,6 +103,20 @@ public:
}
};
// Emitter that configures the dnnl eltwise injector with the swish kind,
// taking `alpha` from the CPU-specific SwishNode and forcing `beta` to 0.
class jit_swish_emitter : public jit_dnnl_emitter {
public:
    jit_swish_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr<ngraph::Node>& n,
                      InferenceEngine::Precision exec_prc = InferenceEngine::Precision::FP32)
        : jit_dnnl_emitter(host, host_isa, n, exec_prc) {
        kind = dnnl_eltwise_swish;
        // NOTE(review): the as_type_ptr result is dereferenced without a null
        // check — assumes `n` is always a SwishNode; confirm at the call site.
        auto op = ngraph::as_type_ptr<ov::intel_cpu::SwishNode>(n);
        alpha = op->get_alpha();
        beta = 0.f;
        set_injector();
    }
};
class jit_hswish_emitter : public jit_dnnl_emitter {
public:
jit_hswish_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr<ngraph::Node>& n,
@ -114,6 +129,7 @@ public:
set_injector();
}
};
class jit_gelu_v0_emitter : public jit_dnnl_emitter {
public:
jit_gelu_v0_emitter(dnnl::impl::cpu::x64::jit_generator *host, dnnl::impl::cpu::x64::cpu_isa_t host_isa, const std::shared_ptr<ngraph::Node>& n,

View File

@ -18,10 +18,12 @@ using namespace Xbyak::util;
namespace ov {
namespace intel_cpu {
namespace {
// heuristic threshold (in bytes) between using a mask load and emulating it with several simple partial loads
const int threshold_for_mask_emu_load = 14;
constexpr int threshold_for_mask_emu_load = 14;
// heuristic threshold (in bytes) between using a mask store and emulating it with several simple partial stores
const int threshold_for_mask_emu_store = 6;
constexpr int threshold_for_mask_emu_store = 6;
} // namespace
size_t load_emitter_params::hash() const {
size_t seed = 0;

View File

@ -387,13 +387,6 @@ void TileEmitter::emit_impl(const std::vector<size_t>& in,
BroadcastMoveEmitter::BroadcastMoveEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa,
const std::shared_ptr<ov::Node>& n) : jit_emitter(h, isa, n) {
if (n->get_input_shape(0).empty())
use_broadcast = true;
else if (*n->get_input_shape(0).rbegin() != *n->get_output_shape(0).rbegin())
use_broadcast = true;
else
use_broadcast = false;
if (n->get_input_element_type(0) != n->get_output_element_type(0))
IE_THROW() << "BroadcastMoveEmitter supports only equal input and output types but gets: "
<< n->get_input_element_type(0) << " and " << n->get_output_element_type(0);
@ -420,20 +413,14 @@ template <dnnl::impl::cpu::x64::cpu_isa_t isa>
void BroadcastMoveEmitter::emit_isa(const std::vector<size_t> &in, const std::vector<size_t> &out) const {
using Vmm = typename dnnl::impl::utils::conditional3<isa == dnnl::impl::cpu::x64::sse41,
Xmm, isa == dnnl::impl::cpu::x64::avx2, Ymm, Zmm>::type;
Vmm vmm_src0 = Vmm(in[0]);
Xmm xmm_src0 = Xmm(in[0]);
Vmm vmm_dst = Vmm(out[0]);
if (use_broadcast) {
switch (byte_size) {
case 4: h->uni_vbroadcastss(vmm_dst, xmm_src0); break;
case 2: h->vpbroadcastw(vmm_dst, xmm_src0); break;
case 1: h->vpbroadcastb(vmm_dst, xmm_src0); break;
default: assert(!"unsupported data type");
}
} else {
if (vmm_src0 != vmm_dst)
h->uni_vmovups(vmm_dst, vmm_src0);
switch (byte_size) {
case 4: h->uni_vbroadcastss(vmm_dst, xmm_src0); break;
case 2: h->vpbroadcastw(vmm_dst, xmm_src0); break;
case 1: h->vpbroadcastb(vmm_dst, xmm_src0); break;
default: assert(!"unsupported data type");
}
}

View File

@ -78,6 +78,7 @@ ExecNetwork::ExecNetwork(const InferenceEngine::CNNNetwork &network,
bool isFloatModel = !ngraph::op::util::has_op_with_type<ngraph::op::FakeQuantize>(function);
_cfg.isNewApi = !isLegacyAPI();
_mutex = std::make_shared<std::mutex>();
// WA for inference dynamic batch cases in new API
if (_cfg.isNewApi) {
@ -176,10 +177,10 @@ ExecNetwork::GraphGuard::Lock ExecNetwork::GetGraph() const {
auto makeGraph = [&] {
try {
{
std::lock_guard<std::mutex> lock{_cfgMutex};
std::lock_guard<std::mutex> lock{*_mutex.get()};
graphLock._graph.setConfig(_cfg);
}
graphLock._graph.CreateGraph(_network, extensionManager, _numaNodesWeights[numaNodeId]);
graphLock._graph.CreateGraph(_network, extensionManager, _numaNodesWeights[numaNodeId], _mutex);
} catch(...) {
exception = std::current_exception();
}
@ -198,7 +199,7 @@ ExecNetwork::GraphGuard::Lock ExecNetwork::GetGraph() const {
void ExecNetwork::setProperty(const std::map<std::string, std::string> &properties) {
{
std::lock_guard<std::mutex> lock{_cfgMutex};
std::lock_guard<std::mutex> lock{*_mutex.get()};
_cfg.readProperties(properties);
}
for (auto& g : _graphs) {

View File

@ -53,7 +53,9 @@ protected:
ExtensionManager::Ptr extensionManager;
std::vector<InferenceEngine::IVariableStateInternal::Ptr> memoryStates;
const InferenceEngine::CNNNetwork _network;
mutable std::mutex _cfgMutex;
// Generic synchronization primitive on ExecNetwork level.
// Usage example: helps to avoid data races during CPU Graph initialization in multi-streams scenario
mutable std::shared_ptr<std::mutex> _mutex;
Config _cfg;
std::atomic_int _numRequests = {0};
std::string _name;
@ -67,7 +69,7 @@ protected:
// WARNING: Do not use _graphs directly.
mutable std::deque<GraphGuard> _graphs;
mutable NumaNodesWeights _numaNodesWeights;
mutable NumaNodesWeights _numaNodesWeights;
/* WARNING: Use GetGraph() function to get access to graph in current stream.
* NOTE: Main thread is interpreted as master thread of external stream so use this function to get access to graphs

View File

@ -25,6 +25,7 @@
#include "nodes/input.h"
#include <nodes/reorder.h>
#include "nodes/convert.h"
#include "nodes/subgraph.h"
#include <ie_algorithm.hpp>
#include <blob_factory.hpp>
@ -68,7 +69,7 @@ Graph::~Graph() {
template<typename NET>
void Graph::CreateGraph(NET &net, const ExtensionManager::Ptr& extMgr,
WeightsSharing::Ptr &w_cache) {
WeightsSharing::Ptr &w_cache, const std::shared_ptr<std::mutex>& mutex) {
OV_ITT_SCOPE(FIRST_INFERENCE, itt::domains::intel_cpu_LT, "CreateGraph");
if (IsReady())
@ -77,6 +78,7 @@ void Graph::CreateGraph(NET &net, const ExtensionManager::Ptr& extMgr,
weightsCache = config.streamExecutorConfig._streams != 1 ? w_cache : nullptr;
rtParamsCache = std::make_shared<MultiCache>(config.rtCacheCapacity);
sharedMutex = mutex;
Replicate(net, extMgr);
InitGraph();
@ -119,9 +121,9 @@ void Graph::CreateGraph(const std::vector<NodePtr> &graphNodes,
}
template void Graph::CreateGraph(const std::shared_ptr<const ngraph::Function>&,
const ExtensionManager::Ptr&, WeightsSharing::Ptr&);
const ExtensionManager::Ptr&, WeightsSharing::Ptr&, const std::shared_ptr<std::mutex>& mutex);
template void Graph::CreateGraph(const CNNNetwork&,
const ExtensionManager::Ptr&, WeightsSharing::Ptr&);
const ExtensionManager::Ptr&, WeightsSharing::Ptr&, const std::shared_ptr<std::mutex>& mutex);
void Graph::Replicate(const std::shared_ptr<const ov::Model> &subgraph, const ExtensionManager::Ptr& extMgr) {
this->_name = "subgraph";
@ -153,7 +155,9 @@ void Graph::Replicate(const std::shared_ptr<const ov::Model> &subgraph, const Ex
if (isQuantized()) {
node->setQuantizedGraphFlag(true);
}
node->setRuntimeCache(rtParamsCache);
node->setSharedMutex(sharedMutex);
graphNodes.push_back(node);
@ -265,7 +269,10 @@ void Graph::Replicate(const CNNNetwork &network, const ExtensionManager::Ptr& ex
if (isQuantized()) {
node->setQuantizedGraphFlag(true);
}
node->setRuntimeCache(rtParamsCache);
node->setSharedMutex(sharedMutex);
graphNodes.push_back(node);
if (op->get_type_info() == ngraph::op::v0::Parameter::get_type_info_static()) {

View File

@ -53,7 +53,8 @@ public:
template<typename NET>
void CreateGraph(NET &network,
const ExtensionManager::Ptr& extMgr,
WeightsSharing::Ptr &w_cache);
WeightsSharing::Ptr &w_cache,
const std::shared_ptr<std::mutex>& mutex);
void CreateGraph(const std::vector<NodePtr> &graphNodes,
const std::vector<EdgePtr> &graphEdges,
@ -262,6 +263,7 @@ private:
std::vector<NodePtr> executableGraphNodes;
MultiCachePtr rtParamsCache;
std::shared_ptr<std::mutex> sharedMutex = nullptr;
void EnforceBF16();
};

View File

@ -2,7 +2,9 @@
// SPDX-License-Identifier: Apache-2.0
//
#include "snippets_mark_skipped.hpp"
#include <snippets/pass/collapse_subgraph.hpp>
#include "snippets/pass/collapse_subgraph.hpp"
#include "snippets/op/subgraph.hpp"
#include "snippets/utils.hpp"
#include <ngraph/opsets/opset1.hpp>
#include <utils/general_utils.h>
#include <utils/cpu_utils.hpp>
@ -15,6 +17,7 @@ namespace ov {
namespace intel_cpu {
namespace {
static const int DEFAULT_AXIS = 1;
NodeFusingType GetNodeFusingType(const std::shared_ptr<const Node> &node) {
auto &rt = node->get_rt_info();
const auto rinfo = rt.find("MayBeFusedInPlugin");
@ -110,13 +113,18 @@ bool canBePerformedAsScaleShift(const std::shared_ptr<const Node> &node, const i
isBroadcastableToDataInput();
}
bool SupportsFusingWithConvolution_Simple(const std::shared_ptr<const Node> &node, const int channelAxis = 1) {
// MatMul can be executed in INT8 when the first input is i8 or u8
// and the second input is i8.
inline bool canBeMatMulExecutedInInt8(const ov::element::Type& firstType, const ov::element::Type& secondType) {
    const bool first_is_int8 = (firstType == ov::element::i8) || (firstType == ov::element::u8);
    const bool second_is_i8 = secondType == ov::element::i8;
    return first_is_int8 && second_is_i8;
}
bool SupportsFusingWithConvolution_Simple(const std::shared_ptr<const Node> &node, const int channelAxis = DEFAULT_AXIS) {
return SupportsFusingWithConvolution_SumActivation(node) ||
ov::is_type<ngraph::op::Tanh>(node) ||
ov::is_type<ngraph::op::v0::Gelu>(node) ||
ov::is_type<ngraph::op::v7::Gelu>(node) ||
ov::is_type<ngraph::op::Abs>(node) ||
ov::is_type<ngraph::op::Sqrt>(node) ||
ov::is_type<ngraph::op::FakeQuantize>(node) ||
canBePerformedAsScaleShift(node, channelAxis);
}
// Convolution is a special case, since it supports peculiar fusings
@ -136,7 +144,7 @@ bool isSuitableBinaryConvolutionParent(const std::shared_ptr<const Node> &node)
return is_suitable_node && has_only_child;
}
int getChannelAxis(const ov::AxisSet &axes, bool keep_dims) {
int channelAxis = 1;
int channelAxis = DEFAULT_AXIS;
if (!keep_dims) {
for (auto &axis : axes) {
if (axis == 1) {
@ -150,7 +158,7 @@ int getChannelAxis(const ov::AxisSet &axes, bool keep_dims) {
}
return channelAxis;
}
bool isSuitableMiscParent(const std::shared_ptr<const Node> &node, int &channelAxis) {
bool isSuitableMiscParent(const std::shared_ptr<const Node> &node) {
const bool is_suitable_node = ov::is_type<ngraph::op::v0::MVN>(node) ||
ov::is_type<ngraph::op::v6::MVN>(node) ||
ov::is_type<ngraph::op::v0::NormalizeL2>(node) ||
@ -160,13 +168,8 @@ bool isSuitableMiscParent(const std::shared_ptr<const Node> &node, int &channelA
ov::is_type<ngraph::op::v4::LSTMCell>(node) ||
ov::is_type<ngraph::opset1::ConvolutionBackpropData>(node) ||
ov::is_type<ngraph::op::util::ArithmeticReductionKeepDims>(node) ||
ov::is_type<ngraph::op::util::LogicalReductionKeepDims>(node) ||
ov::is_type<ngraph::opset1::GroupConvolutionBackpropData>(node);
if (const auto reduce = std::dynamic_pointer_cast<const ngraph::op::util::ArithmeticReductionKeepDims>(node)) {
channelAxis = getChannelAxis(reduce->get_reduction_axes(), reduce->get_keep_dims());
} else if (const auto reduce = std::dynamic_pointer_cast<const ngraph::op::util::LogicalReductionKeepDims>(node)) {
channelAxis = getChannelAxis(reduce->get_reduction_axes(), reduce->get_keep_dims());
}
ov::is_type<ngraph::opset1::GroupConvolutionBackpropData>(node) ||
ov::is_type<ngraph::opset1::AvgPool>(node);
// has a single output, connected to a single child
const auto out = node->outputs();
const bool has_only_child = (out.size() == 1) && (out[0].get_target_inputs().size() == 1);
@ -180,6 +183,13 @@ bool isSuitableMatMulParent(const std::shared_ptr<const Node> &node) {
const bool has_only_child = (out.size() == 1) && (out[0].get_target_inputs().size() == 1);
return is_suitable_node && has_only_child;
}
// From Reduce::canFuse() corner case. CanFuseSimpleOperation is covered by Misc.
inline bool isSuitableReduceParent(const std::shared_ptr<const Node> &node) {
    // Only arithmetic reductions that also pass the generic "misc parent" checks qualify.
    if (!ov::is_type<ov::op::util::ArithmeticReductionKeepDims>(node) || !isSuitableMiscParent(node))
        return false;
    // ReduceMax/ReduceMin are excluded, and the output must stay in f32.
    if (ov::is_type<ov::op::v1::ReduceMax>(node) || ov::is_type<ov::op::v1::ReduceMin>(node))
        return false;
    return node->get_output_element_type(0) == ov::element::f32;
}
// Subtract as ZeroPoints for Convolution
bool isSuitableSubtractAsZeroPointsParent(const std::shared_ptr<const Node> &node) {
const bool is_suitable_node = ov::is_type<ngraph::op::v1::Subtract>(node);
@ -197,21 +207,24 @@ bool isSuitableSubtractAsZeroPointsParent(const std::shared_ptr<const Node> &nod
const auto weight_shape = child->get_input_shape(1);
const bool is_depthwise = is_group_conv && weight_shape[1] == 1 && weight_shape[2] == 1;
const bool deptwise_is_suitable = implication(is_depthwise, child->get_input_shape(0).size() < 5);
if (!(is_conv && deptwise_is_suitable))
if (!deptwise_is_suitable)
return false;
const bool first_input_is_suitable = node->get_input_node_shared_ptr(0)->get_output_element_type(0) == ov::element::u8;
const auto zp_weights = node->get_input_node_shared_ptr(1);
const auto zp_weight_shape = zp_weights->get_output_shape(0);
bool second_input_is_suitable =
ov::is_type<ngraph::op::v0::Constant>(zp_weights) &&
zp_weights->get_output_element_type(0) == ov::element::u8 &&
zp_weight_shape.size() >= 2;
if (!(first_input_is_suitable && second_input_is_suitable))
return false;
auto correct_shape = ov::Shape(zp_weight_shape.size(), 1);
correct_shape[1] = zp_weight_shape[1];
return correct_shape == zp_weight_shape;
if (zp_weight_shape.size() > 1)
correct_shape[1] = zp_weight_shape[1];
const bool zp_weights_is_suitable = ov::is_type<ov::op::v0::Constant>(zp_weights) &&
zp_weights->get_element_type() == ov::element::u8 &&
zp_weight_shape.size() >= 2 && correct_shape == zp_weight_shape;
const bool first_conv_input_is_suitable = node->get_input_element_type(0) == ov::element::u8 &&
zp_weights_is_suitable;
const auto conv_weights = child->get_input_node_shared_ptr(1);
bool second_conv_input_is_suitable = ov::is_type<ngraph::op::v0::Constant>(conv_weights) &&
conv_weights->get_output_element_type(0) == ov::element::i8;
return first_conv_input_is_suitable && second_conv_input_is_suitable;
}
bool isSuitablePoolChild(const std::shared_ptr<const Node> &node) {
const bool is_suitable_node = ov::is_type<ngraph::op::v1::MaxPool>(node);
@ -220,11 +233,12 @@ bool isSuitablePoolChild(const std::shared_ptr<const Node> &node) {
const bool has_only_child = (out.size() == 1) && (out[0].get_target_inputs().size() == 1);
return is_suitable_node && has_only_child;
}
bool isSuitableChildForFusingSimple(const std::shared_ptr<const Node> &node, int channelAxis = 1) {
bool isSuitableChildForFusingSimple(const std::shared_ptr<const Node> &node, const int channelAxis = DEFAULT_AXIS) {
// Note: Fusing child is allowed to have several users, but that must be the end of the chain
return SupportsFusingWithConvolution_Simple(node, channelAxis) && getNumNonConstInputs(node) == 1;
}
bool isSuitableChildForFusingMatMul(const std::shared_ptr<const Node> &node, NodeFusingType &updatedChainType) {
bool isSuitableChildForFusingMatMul(const std::shared_ptr<const Node> &node, const bool canMatMulBeExecutedInI8,
NodeFusingType &updatedChainType, int& fusingAxis) {
int num_non_const_inputs = 0;
bool can_be_converted_to_FC = false;
ov::Shape bias_shape;
@ -255,52 +269,66 @@ bool isSuitableChildForFusingMatMul(const std::shared_ptr<const Node> &node, Nod
if (num_non_const_inputs != 1)
return false;
// Matmul / FC bias fusion
if (ov::is_type<ngraph::opset1::Add>(node) &&
bias_shape.back() == matmul_shape.back() &&
bias_shape.back() == shape_size(bias_shape)) {
return true;
}
// FuseMatMulAndSimpleOperation or FuseFullyConnectedAndSimpleOperation
// Invoke SupportsFusingWithConvolution_Simple directly instead of isSuitableChildForFusingSimple to
// eliminate getNumNonConstInputs() check
int fusingAxis = can_be_converted_to_FC ? (matmul_shape.size() == 3 ? 2 : 1) : matmul_shape.size() - 1;
fusingAxis = can_be_converted_to_FC ? (matmul_shape.size() == 3 ? 2 : 1) : matmul_shape.size() - 1;
if (SupportsFusingWithConvolution_Simple(node, fusingAxis)) {
updatedChainType = NodeFusingType::FusedWithMisc;
return true;
}
// canFuse() from MatMul for case with rank > 2
// Algorithm::EltwisePowerStatic is ignored
if (!can_be_converted_to_FC &&
node->get_output_shape(0).size() > 2) {
if (ov::is_type<ov::op::v1::Add>(node) ||
ov::is_type<ov::op::v1::Multiply>(node) ||
ov::is_type<ov::op::v1::Subtract>(node) ||
ov::is_type<ov::op::v1::Divide>(node) ||
ov::is_type<ov::op::v0::PRelu>(node)) {
const auto const1 = ov::is_type<ov::op::v0::Constant>(node->get_input_node_shared_ptr(0));
const auto const2 = ov::is_type<ov::op::v0::Constant>(node->get_input_node_shared_ptr(1));
int constPort = -1;
if (const2) {
constPort = 1;
} else if (const1) {
constPort = 0;
}
// MatMul specific checks from ::canFuse()
if (!can_be_converted_to_FC) {
// can with rank() > 2
// Algorithm::EltwisePowerStatic is ignored
if (node->get_output_shape(0).size() > 2) {
if (ov::is_type<ov::op::v1::Add>(node) ||
ov::is_type<ov::op::v1::Multiply>(node) ||
ov::is_type<ov::op::v1::Subtract>(node) ||
ov::is_type<ov::op::v1::Divide>(node) ||
ov::is_type<ov::op::v0::PRelu>(node)) {
const auto const1 = ov::is_type<ov::op::v0::Constant>(node->get_input_node_shared_ptr(0));
const auto const2 = ov::is_type<ov::op::v0::Constant>(node->get_input_node_shared_ptr(1));
int constPort = -1;
if (const2) {
constPort = 1;
} else if (const1) {
constPort = 0;
}
if (constPort != -1) {
auto const_shape = node->get_input_shape(constPort);
if (ov::shape_size(const_shape) != 1) {
if (constPort != -1) {
auto const_shape = node->get_input_shape(constPort);
if (ov::shape_size(const_shape) != 1) {
return false;
}
}
} else if (ov::is_type<ov::op::v0::FakeQuantize>(node)) {
const bool is_per_tensor_broadcasting = ngraph::snippets::utils::is_scalar_constant(node->get_input_node_shared_ptr(1)) &&
ngraph::snippets::utils::is_scalar_constant(node->get_input_node_shared_ptr(2)) &&
ngraph::snippets::utils::is_scalar_constant(node->get_input_node_shared_ptr(3)) &&
ngraph::snippets::utils::is_scalar_constant(node->get_input_node_shared_ptr(4));
if (!is_per_tensor_broadcasting) {
return false;
}
}
}
// specific case for FQ
if (ov::is_type<ov::op::v0::FakeQuantize>(node)) {
if (one_of(node->get_output_element_type(0), ov::element::i8, ov::element::u8) && canMatMulBeExecutedInI8) {
return false;
}
}
}
// FullyConnectedBiasFusion
if (!(can_be_converted_to_FC && ov::is_type<ngraph::opset1::Add>(node) &&
bias_shape.back() == matmul_shape.back() &&
bias_shape.back() == shape_size(bias_shape))) {
return false;
}
// Fusing chain must be interrupted after the node, since reshape will be inserted
if (bias_shape.size() >= 2)
updatedChainType = NodeFusingType::FusedTerminator;
return true;
}
bool isSuitableParentForFusingSumActivation(const std::shared_ptr<const Node> &node) {
@ -334,11 +362,21 @@ bool isSuitableParentForFusingSumActivation(const std::shared_ptr<const Node> &n
}
return true;
};
auto isFusedFQNode = [&isFusedBiasNode](std::shared_ptr<Node> n) {
if (!(ov::is_type<ngraph::op::v0::FakeQuantize>(n) &&
GetNodeFusingType(n) == NodeFusingType::FusedWithConvolution))
return false;
const auto& parent = n->get_input_node_shared_ptr(0);
const bool is_suitable_parent = isSuitableConvolutionParent(parent)
|| isFusedBiasNode(parent)
|| (GetNodeFusingType(parent) == NodeFusingType::FusedWithConvolution);
return is_suitable_parent;
};
int num_conv_parents = 0;
for (size_t i = 0; i < node->get_input_size(); i++) {
const auto n = node->get_input_node_shared_ptr(i);
//BinaryConvolution allows other ops to be fused before the Add, while Convolution doesn't
num_conv_parents += (isSuitableConvolutionParent(n) || isFusedBiasNode(n) ||
num_conv_parents += (isSuitableConvolutionParent(n) || isFusedBiasNode(n) || isFusedFQNode(n) ||
GetNodeFusingType(n) == NodeFusingType::FusedWithBinaryConvolution);
}
return getNumNonConstInputs(node) == 2 && num_conv_parents >=1;
@ -346,6 +384,9 @@ bool isSuitableParentForFusingSumActivation(const std::shared_ptr<const Node> &n
bool isSuitableChildForFusingSumActivation(const std::shared_ptr<const Node> &node) {
return SupportsFusingWithConvolution_SumActivation(node);
}
// A Reduce fusing chain may only continue with a child whose output stays in
// f32 and which passes the generic "simple fusing" child checks.
bool isSuitableReduceChild(const std::shared_ptr<const Node> &node, const int channelAxis = DEFAULT_AXIS) {
    if (node->get_output_element_type(0) != ov::element::f32)
        return false;
    return isSuitableChildForFusingSimple(node, channelAxis);
}
// Continue fusing chain of the passed type if the node has one child
// Otherwise mark node as FusedTerminator (Fused, but fusing chain is interrupted)
void PropagateIfHasOnlyChild(const std::shared_ptr<Node> &node, NodeFusingType nodeType) {
@ -378,59 +419,77 @@ void MarkSubgraphOpAsSkipped(const std::shared_ptr<Node> &node) {
bool SnippetsMarkSkipped::run_on_model(const std::shared_ptr<ov::Model> &m) {
RUN_ON_MODEL_SCOPE(SnippetsMarkSkipped);
int channelAxis = 1;
int channelAxis = DEFAULT_AXIS;
for (auto &node : m->get_ordered_ops()) {
if (ngraph::op::is_constant(node))
continue;
if (ngraph::op::is_parameter(node)) {
SetNodeFusingType(node, NodeFusingType::IgnoredAfterInputs);
continue;
} else if (isSuitableConvolutionParent(node)) {
// Initiate fusing chain
SetNodeFusingType(node, NodeFusingType::FusedWithConvolution);
continue;
channelAxis = DEFAULT_AXIS;
} else if (isSuitableBinaryConvolutionParent(node)) {
SetNodeFusingType(node, NodeFusingType::FusedWithBinaryConvolution);
continue;
} else if (isSuitableMiscParent(node, channelAxis)) {
channelAxis = DEFAULT_AXIS;
} else if (isSuitableReduceParent(node)) {
const auto reduce = std::dynamic_pointer_cast<const ngraph::op::util::ArithmeticReductionKeepDims>(node);
channelAxis = getChannelAxis(reduce->get_reduction_axes(), reduce->get_keep_dims());
SetNodeFusingType(node, NodeFusingType::FusedWithReduce);
} else if (isSuitableMiscParent(node)) {
if (const auto reduce = std::dynamic_pointer_cast<const ngraph::op::util::ArithmeticReductionKeepDims>(node)) {
channelAxis = getChannelAxis(reduce->get_reduction_axes(), reduce->get_keep_dims());
} else {
channelAxis = DEFAULT_AXIS;
}
SetNodeFusingType(node, NodeFusingType::FusedWithMisc);
continue;
} else if (isSuitableMatMulParent(node)) {
SetNodeFusingType(node, NodeFusingType::FusedWithMatMul);
continue;
if (canBeMatMulExecutedInInt8(node->get_input_element_type(0), node->get_input_element_type(1)))
SetNodeFusingType(node, NodeFusingType::FusedWithMatMulI8);
else
SetNodeFusingType(node, NodeFusingType::FusedWithMatMul);
channelAxis = DEFAULT_AXIS;
} else if (isSuitableSubtractAsZeroPointsParent(node)) {
SetSnippetsNodeType(node, snippets::pass::SnippetsNodeType::SkippedByPlugin);
continue;
}
for (const auto fusingChainType : getContinuableChains(node)) {
if (isSuitableChildForFusingSimple(node, channelAxis)) {
PropagateIfHasOnlyChild(node, fusingChainType);
} else if (fusingChainType == NodeFusingType::FusedWithConvolution ||
fusingChainType == NodeFusingType::FusedWithBinaryConvolution) {
if (isSuitableParentForFusingSumActivation(node)) {
PropagateIfHasOnlyChild(node, NodeFusingType::FusedWithConvolutionSumActivation);
// Mimic FuseConvolutionAndSimpleOperationThroughMaxPool
} else if (isSuitablePoolChild(node)) {
channelAxis = DEFAULT_AXIS;
} else {
for (const auto fusingChainType : getContinuableChains(node)) {
if (fusingChainType == NodeFusingType::FusedWithReduce) {
if (isSuitableReduceChild(node, channelAxis))
PropagateIfHasOnlyChild(node, fusingChainType);
} else if (isSuitableChildForFusingSimple(node, channelAxis)) {
PropagateIfHasOnlyChild(node, fusingChainType);
} else if (fusingChainType == NodeFusingType::FusedWithConvolution ||
fusingChainType == NodeFusingType::FusedWithBinaryConvolution) {
if (isSuitableParentForFusingSumActivation(node)) {
PropagateIfHasOnlyChild(node, NodeFusingType::FusedWithConvolutionSumActivation);
// Mimic FuseConvolutionAndSimpleOperationThroughMaxPool
} else if (isSuitablePoolChild(node)) {
PropagateIfHasOnlyChild(node, fusingChainType);
}
} else if (fusingChainType == NodeFusingType::FusedWithConvolutionSumActivation &&
isSuitableChildForFusingSumActivation(node)) {
// Todo: Chain could be converted from FusedWithBinaryConvolution to FusedWithConvolution at this point
// Set FusedWithConvolution, so the fusing chain could be propagated
PropagateIfHasOnlyChild(node, NodeFusingType::FusedWithConvolution);
} else if (fusingChainType == NodeFusingType::FusedWithMatMul ||
fusingChainType == NodeFusingType::FusedWithMatMulI8) {
const bool isExecutedInINT8 = fusingChainType == NodeFusingType::FusedWithMatMulI8;
// Handle fusings for both MatMul and FullyConnected
NodeFusingType updatedChainType = fusingChainType;
if (isSuitableChildForFusingMatMul(node, isExecutedInINT8, updatedChainType, channelAxis))
PropagateIfHasOnlyChild(node, updatedChainType);
} else if (fusingChainType == NodeFusingType::IgnoredAfterInputs && (snippets::pass::AppropriateForSubgraph(node) ||
ov::is_type<ngraph::op::v0::Convert>(node) || ov::is_type<ngraph::op::v1::Transpose>(node))) {
// In OV API 2.0, a Convert node is inserted after an Input node with I8/U8 precision; moreover, on TF models a
// Transpose layer is inserted as well. This breaks the idea of leaving an Eltwise node with I8/U8 inputs and FP32 outputs instead of a Subgraph node
// TODO Remove the additional check on Convert/Transpose here after enabling Subgraphs with I8/U8 inputs and FP32 outputs
SetNodeFusingType(node, NodeFusingType::IgnoredAfterInputs);
}
} else if (fusingChainType == NodeFusingType::FusedWithConvolutionSumActivation &&
isSuitableChildForFusingSumActivation(node)) {
// Todo: Chain could be converted from FusedWithBinaryConvolution to FusedWithConvolution at this point
// Set FusedWithConvolution, so the fusing chain could be propagated
PropagateIfHasOnlyChild(node, NodeFusingType::FusedWithConvolution);
} else if (fusingChainType == NodeFusingType::FusedWithMatMul) {
// Handle fusings for both MatMul and FullyConnected
NodeFusingType updatedChainType = fusingChainType;
if (isSuitableChildForFusingMatMul(node, updatedChainType))
PropagateIfHasOnlyChild(node, updatedChainType);
} else if (fusingChainType == NodeFusingType::IgnoredAfterInputs && (snippets::pass::AppropriateForSubgraph(node) ||
ov::is_type<ngraph::op::v0::Convert>(node) || ov::is_type<ngraph::op::v1::Transpose>(node))) {
// In OV API 2.0, a Convert node is inserted after an Input node with I8/U8 precision; moreover, on TF models a
// Transpose layer is inserted as well. This breaks the idea of leaving an Eltwise node with I8/U8 inputs and FP32 outputs instead of a Subgraph node
// TODO Remove the additional check on Convert/Transpose here after enabling Subgraphs with I8/U8 inputs and FP32 outputs
SetNodeFusingType(node, NodeFusingType::IgnoredAfterInputs);
}
}
if (GetNodeFusingType(node) != NodeFusingType::NotSet) {
SetSnippetsNodeType(node, snippets::pass::SnippetsNodeType::SkippedByPlugin);
} else {

View File

@ -37,7 +37,7 @@ enum class NodeFusingType : int64_t {
NotSet,
FusedTerminator,
FusedWithConvolution, FusedWithBinaryConvolution, FusedWithConvolutionSumActivation,
FusedWithMatMul, FusedWithMisc, IgnoredAfterInputs};
FusedWithMatMul, FusedWithMatMulI8, FusedWithReduce, FusedWithMisc, IgnoredAfterInputs};
} // namespace intel_cpu
} // namespace ov

View File

@ -573,6 +573,10 @@ public:
rtParamsCache = cache;
}
void setSharedMutex(const std::shared_ptr<std::mutex>& mutex) {
sharedMutex = mutex;
}
protected:
bool canFuseSimpleOperation(const NodePtr& node) const;
@ -747,6 +751,8 @@ protected:
std::shared_ptr<IShapeInfer> shapeInference;
std::shared_ptr<std::mutex> sharedMutex = nullptr;
private:
std::vector<EdgeWeakPtr> parentEdges;
std::vector<EdgeWeakPtr> childEdges;

View File

@ -70,8 +70,8 @@ void If::getSupportedDescriptors() {
const std::shared_ptr<const ov::Model>& thenBody = ifOp->get_then_body();
const std::shared_ptr<const ov::Model>& elseBody = ifOp->get_else_body();
subGraphThen.CreateGraph(thenBody, ext_mng, weightCache);
subGraphElse.CreateGraph(elseBody, ext_mng, weightCache);
subGraphThen.CreateGraph(thenBody, ext_mng, weightCache, sharedMutex);
subGraphElse.CreateGraph(elseBody, ext_mng, weightCache, sharedMutex);
const auto &inMapThen = subGraphThen.GetInputNodesMap();
for (const auto &param : ifOp->get_then_body()->get_parameters()) {

View File

@ -20,9 +20,12 @@
#include <ngraph/rt_info.hpp>
#include <ie_ngraph_utils.hpp>
#include <shared_mutex>
#include <snippets/op/subgraph.hpp>
#include "emitters/cpu_generator.hpp"
#include "snippets_transformations/fuse_load_store_and_convert.hpp"
#include "ngraph_transformations/convert_to_swish_cpu.hpp"
using namespace InferenceEngine;
using namespace dnnl::impl::utils;
@ -34,30 +37,42 @@ namespace ov {
namespace intel_cpu {
namespace node {
Snippet::Snippet(const std::shared_ptr<ngraph::Node>& op, const dnnl::engine& eng, WeightsSharing::Ptr &cache)
: Node(op, eng, cache) {
host_isa = dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx512_core) ?
dnnl::impl::cpu::x64::avx512_core : dnnl::impl::cpu::x64::avx2;
// Create a deep local copy of the input snippet to perform canonicalization & code generation
// Todo: Probably better to implement a proper copy constructor
if (const auto tmp_snippet = ov::as_type_ptr<ngraph::snippets::op::Subgraph>(op)) {
ngraph::OutputVector subgraph_node_inputs;
for (const auto &input : tmp_snippet->input_values()) {
auto new_input = std::make_shared<ngraph::opset1::Parameter>(input.get_element_type(), input.get_partial_shape());
subgraph_node_inputs.push_back(new_input);
}
auto new_body = ov::clone_model(*tmp_snippet->get_body().get());
snippet = std::make_shared<ngraph::snippets::op::Subgraph>(subgraph_node_inputs, new_body);
ngraph::copy_runtime_info(tmp_snippet, snippet);
snippet->set_friendly_name(tmp_snippet->get_friendly_name());
snippet->set_generator(std::make_shared<CPUGenerator>(host_isa));
} else {
original_snippet = ov::as_type_ptr<ngraph::snippets::op::Subgraph>(op);
if (!original_snippet) {
IE_THROW(NotImplemented) << "Node is not an instance of snippets::op::Subgraph";
}
}
void Snippet::copy_snippet() {
ngraph::OutputVector subgraph_node_inputs;
for (const auto &input : original_snippet->input_values()) {
auto new_input = std::make_shared<ngraph::opset1::Parameter>(input.get_element_type(), input.get_partial_shape());
subgraph_node_inputs.push_back(new_input);
}
std::shared_ptr<ov::Model> new_body = nullptr;
// Ticket[79554]: TypeRelaxed ops aren't thread safe so we use mutex to avoid collision in throughput mode
if (original_snippet->has_type_relaxed_ops()) {
if (!sharedMutex) {
IE_THROW() << "Subgraph doesn't have shared mutex";
}
std::lock_guard<std::mutex> lock(*sharedMutex.get());
new_body = ov::clone_model(*original_snippet->get_body().get());
} else {
new_body = ov::clone_model(*original_snippet->get_body().get());
}
snippet = std::make_shared<ngraph::snippets::op::Subgraph>(subgraph_node_inputs, new_body);
ngraph::copy_runtime_info(original_snippet, snippet);
snippet->set_friendly_name(original_snippet->get_friendly_name());
snippet->set_generator(std::make_shared<CPUGenerator>(host_isa));
}
void Snippet::initSupportedPrimitiveDescriptors() {
copy_snippet();
if (!supportedPrimitiveDescriptors.empty())
return;
@ -488,6 +503,7 @@ void Snippet::generate() {
ov::pass::Manager optManager;
optManager.register_pass<ov::intel_cpu::pass::FuseLoadConvert>();
optManager.register_pass<ov::intel_cpu::pass::FuseStoreConvert>();
optManager.register_pass<ConvertToSwishCPU>();
// LoadConvert uses Load emitter that support conversion from any type to only f32
optManager.get_pass_config()->set_callback<ov::intel_cpu::pass::FuseLoadConvert>(

View File

@ -32,6 +32,10 @@ public:
void selectOptimalPrimitiveDescriptor() override;
InferenceEngine::Precision getRuntimePrecision() const override;
// to avoid collisions in throughput mode with copy of TypeRelaxed nodes
// we should have common shared mutex between streams
void setSharedMutex(const std::shared_ptr<std::mutex>& mutex);
// Here we convert to canonical for & jit everything
void createPrimitive() override;
@ -46,6 +50,11 @@ private:
typedef void (*kernel)(const void *, const void *);
// Create a deep local copy of the input snippet to perform canonicalization & code generation
// TODO: Probably better to implement a proper copy constructor
// NOTE: Before call mutex should be initialized
void copy_snippet();
void define_schedule();
void generate();
@ -54,6 +63,8 @@ private:
void schedule_6d(const jit_snippets_call_args& const_args) const;
void schedule_nt(const jit_snippets_call_args& const_args) const;
// Original subgraph node
std::shared_ptr<ngraph::snippets::op::Subgraph> original_snippet;
// Local copy of subgraph node for canonization & code generation
std::shared_ptr<ngraph::snippets::op::Subgraph> snippet;

View File

@ -363,7 +363,7 @@ void TensorIterator::getSupportedDescriptors() {
THROW_ERROR << "cannot be cast to ov::op::util::SubGraphOp";
}
const std::shared_ptr<const ov::Model> body = tiOp->get_function();
sub_graph.CreateGraph(body, ext_mng, weightCache);
sub_graph.CreateGraph(body, ext_mng, weightCache, sharedMutex);
const auto &inMap = sub_graph.GetInputNodesMap();
for (const auto &param : tiOp->get_function()->get_parameters()) {

View File

@ -82,6 +82,8 @@
#include <transformations/op_conversions/fq_decomposition.hpp>
#include <transformations/utils/utils.hpp>
#include <snippets/pass/collapse_subgraph.hpp>
#include <snippets/pass/common_optimizations.hpp>
#include <snippets/pass/convert_constants.hpp>
#include "ngraph_transformations/snippets_mark_skipped.hpp"
#include <transformations/op_conversions/convert_roi_align_v9_to_v3.hpp>
#include <transformations/op_conversions/convert_roi_align_v3_to_v9.hpp>
@ -579,20 +581,12 @@ static void TransformationUpToCPUSpecificOpSet(std::shared_ptr<ngraph::Function>
}
ngraph::pass::Manager postLPTPassManager;
postLPTPassManager.register_pass<ngraph::pass::FakeQuantizeDecomposition>();
postLPTPassManager.register_pass<ngraph::pass::UnrollTensorIterator>();
postLPTPassManager.register_pass<ReshapePRelu>();
postLPTPassManager.get_pass_config()->set_callback<ngraph::pass::FakeQuantizeDecomposition>([](const_node_ptr &node) -> bool {
std::string errMsg;
return node::FakeQuantize::isSupportedOperation(node, errMsg);
});
postLPTPassManager.get_pass_config()->set_callback<ngraph::pass::UnrollTensorIterator>([](const_node_ptr &node) -> bool {
// UnrollTI transformation is disabled by default, is turned on by LowLatency transformation
return node->get_rt_info().count("UNROLL_TI") == 0;
});
postLPTPassManager.register_pass<MoveEltwiseUpThroughDataMov>();
postLPTPassManager.get_pass_config()->set_callback<MoveEltwiseUpThroughDataMov>([](const std::shared_ptr<const ngraph::Node>& node) -> bool {
if (node->get_input_size() >= 2) {
@ -625,13 +619,19 @@ static void TransformationUpToCPUSpecificOpSet(std::shared_ptr<ngraph::Function>
});
postLPTPassManager.run_passes(nGraphFunc);
if (!useLpt && _enableSnippets && dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx2)) {
ngraph::pass::Manager tokenization_manager;
tokenization_manager.register_pass<SnippetsMarkSkipped>();
tokenization_manager.register_pass<ngraph::snippets::pass::EnumerateNodes>();
tokenization_manager.register_pass<ngraph::snippets::pass::TokenizeSnippets>();
tokenization_manager.get_pass_config()->set_callback<ngraph::snippets::pass::TokenizeSnippets>(
if (_enableSnippets && dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx2)) {
ngraph::pass::Manager snippetsManager;
snippetsManager.register_pass<SnippetsMarkSkipped>();
snippetsManager.register_pass<ngraph::snippets::pass::EnumerateNodes>();
snippetsManager.register_pass<ngraph::snippets::pass::TokenizeSnippets>();
snippetsManager.get_pass_config()->set_callback<ngraph::snippets::pass::TokenizeSnippets>(
[](const std::shared_ptr<const ov::Node>& n) -> bool {
// CPU Plugin supports Swish in Subgraph via conversion to SwishCPU, which assumes the second input to be constant
if (ov::is_type<const ov::op::v4::Swish>(n)) {
if (n->inputs().size() > 1 && !ov::is_type<const ov::op::v0::Constant>(n->get_input_node_shared_ptr(1)))
return true;
}
const auto& inputs = n->inputs();
// todo: clarify whether we can evaluate snippets on const paths
const bool has_only_const_inputs = std::all_of(inputs.begin(), inputs.end(),
@ -650,8 +650,18 @@ static void TransformationUpToCPUSpecificOpSet(std::shared_ptr<ngraph::Function>
[&](const ov::Output<const ov::Node>& out) {return rank_is_too_large(out.get_tensor());});
return has_only_const_inputs || bad_input_rank || bad_output_rank;
});
tokenization_manager.run_passes(nGraphFunc);
snippetsManager.register_pass<ngraph::snippets::pass::CommonOptimizations>();
snippetsManager.run_passes(nGraphFunc);
}
ngraph::pass::Manager postSnippetsManager;
postSnippetsManager.register_pass<ngraph::pass::FakeQuantizeDecomposition>();
postSnippetsManager.get_pass_config()->set_callback<ngraph::pass::FakeQuantizeDecomposition>([](const_node_ptr& node) -> bool {
std::string errMsg;
return node::FakeQuantize::isSupportedOperation(node, errMsg);
});
postSnippetsManager.register_pass<ngraph::pass::ConstantFolding>();
postSnippetsManager.run_passes(nGraphFunc);
}
static void Transformation(CNNNetwork& clonedNetwork, const bool _enableLPT, const bool _enableBF16, const bool _enableSnippets, const bool isLegacyApi) {

View File

@ -0,0 +1,131 @@
// Copyright (C) 2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include <vector>
#include "snippets/fake_quantize_decomposition_test.hpp"
using namespace LayerTestsDefinitions;
using namespace ngraph;
namespace {
namespace decompositionInSubgraph {
// FakeQuantize with scalar (empty-shape) interval constants: expected to be
// decomposed into element-wise operations inside a single Snippets Subgraph.
const std::vector<TestValues> testValuesDecompositionScalars = {
    {
        ov::element::f32,
        ngraph::Shape{1, 3, 16, 16},
        ov::element::f32,
        1.f,
        {{}, {}, {}, {}},
    },
};
// FakeQuantize with per-channel interval constants (shape {1, 3, 1, 1}):
// decomposition in a Subgraph with non-scalar constants.
const std::vector<TestValues> testValuesDecompositionPerChannel = {
    {
        ov::element::f32,
        ngraph::Shape{1, 3, 16, 16},
        ov::element::f32,
        1.f,
        {{1, 3, 1, 1}, {1, 3, 1, 1}, {1, 3, 1, 1}, {1, 3, 1, 1}},
    },
    {
        ov::element::f32,
        ngraph::Shape{1, 3, 16, 16},
        ov::element::f32,
        1.f,
        {{1, 3, 1, 1}, {1, 3, 1, 1}, {}, {}},
    },
};

// Element-wise operation inserted between the prerequisites and the FakeQuantize.
// Pair: operation prototype -> {expected runtime node type, expected originalLayersNames}.
std::vector<std::pair<std::shared_ptr<Node>, std::pair<std::string, std::string> >> operations = {
    {std::make_shared<opset1::Abs>(), {"Subgraph", "Abs,fakeQuantize"}},
    {std::make_shared<ngraph::op::v4::Swish>(), {"Subgraph", "Swish,fakeQuantize"}},
};

INSTANTIATE_TEST_SUITE_P(
    smoke_Snippets_FQDecomposition_Scalars,
    FakeQuantizeDecompositionTest,
    ::testing::Combine(
        ::testing::ValuesIn(testValuesDecompositionScalars),
        ::testing::ValuesIn(operations),
        // reorder (nChw[16|8]c) + MaxPool + Subgraph + reorder(nchw)
        ::testing::Values(std::pair<size_t, size_t>{4, 1}),
        ::testing::Values(CommonTestUtils::DEVICE_CPU)),
    FakeQuantizeDecompositionTest::getTestCaseName);

INSTANTIATE_TEST_SUITE_P(
    smoke_Snippets_FQDecomposition_PerChannel,
    FakeQuantizeDecompositionTest,
    ::testing::Combine(
        ::testing::Values(testValuesDecompositionPerChannel[0]),
        ::testing::ValuesIn(operations),
        // reorder (nChw[16|8]c) + MaxPool + reorder(nChw[16|8]c) x6 + Subgraph + reorder(nchw)
        ::testing::Values(std::pair<size_t, size_t>{10, 1}),
        ::testing::Values(CommonTestUtils::DEVICE_CPU)),
    FakeQuantizeDecompositionTest::getTestCaseName);

INSTANTIATE_TEST_SUITE_P(
    smoke_Snippets_FQDecomposition_PerChannel_Input,
    FakeQuantizeDecompositionTest,
    ::testing::Combine(
        ::testing::Values(testValuesDecompositionPerChannel[1]),
        ::testing::ValuesIn(operations),
        // reorder (nChw[16|8]c) + MaxPool + reorder(nChw[16|8]c) x4 + Subgraph + reorder(nchw)
        ::testing::Values(std::pair<size_t, size_t>{8, 1}),
        ::testing::Values(CommonTestUtils::DEVICE_CPU)),
    FakeQuantizeDecompositionTest::getTestCaseName);
} // namespace decompositionInSubgraph
namespace legacyFuse {
// FakeQuantize placed after a Convolution: expected to be fused by the legacy CPU
// fusing pipeline instead of being tokenized (expected subgraph count is 0 below).
const std::vector<TestValues> testValuesLegacyFuse = {
    {
        ov::element::f32,
        ngraph::Shape{1, 3, 16, 16},
        ov::element::f32,
        1.f,
        {{1, 3, 1, 1}, {1, 3, 1, 1}, {}, {}}
    },
    {
        ov::element::f32,
        ngraph::Shape{1, 3, 16, 16},
        ov::element::f32,
        1.f,
        {{}, {}, {1, 3, 1, 1}, {1, 3, 1, 1}}
    },
    {
        ov::element::f32,
        ngraph::Shape{1, 3, 16, 16},
        ov::element::f32,
        1.f,
        {{}, {}, {}, {}}
    },
    {
        ov::element::f32,
        ngraph::Shape{1, 3, 16, 16},
        ov::element::f32,
        1.f,
        {{1, 3, 1, 1}, {1, 3, 1, 1}, {1, 3, 1, 1}, {1, 3, 1, 1}}
    },
};

// Pair: operation prototype -> {expected runtime node type, expected originalLayersNames}.
std::vector<std::pair<std::shared_ptr<Node>, std::pair<std::string, std::string>>> operations = {
    {std::make_shared<opset1::Convolution>(), {"Convolution", "Convolution,fakeQuantize"}},
};

INSTANTIATE_TEST_SUITE_P(
    smoke_Snippets,
    FakeQuantizeDecompositionTest,
    ::testing::Combine(
        ::testing::ValuesIn(testValuesLegacyFuse),
        ::testing::ValuesIn(operations),
        // reorder (nChw[16|8]c) + MaxPool + reorder(nhwc) + reorder(ABcd16b16a) + Convolution + reorder(nchw)
        ::testing::Values(std::pair<size_t, size_t>{6, 0}),
        ::testing::Values(CommonTestUtils::DEVICE_CPU)),
    FakeQuantizeDecompositionTest::getTestCaseName);
} // namespace legacyFuse
} // namespace

View File

@ -0,0 +1,107 @@
// Copyright (C) 2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include <gtest/gtest.h>
#include "common_test_utils/ngraph_test_utils.hpp"
#include "snippets/pass/fq_decomposition.hpp"
#include "snippets/pass/collapse_subgraph.hpp"
#include "fake_quantize_function.hpp"
#include "snippets/op/subgraph.hpp"
#include "ngraph_transformations/snippets_mark_skipped.hpp"
#include "function_helper.hpp"
namespace ov {
namespace test {
namespace snippets {
class FakeQuantizeTokenizationTest : public TransformationTestsF {
public:
    // Registers the CPU snippets tokenization pipeline on the fixture's pass manager.
    // The TokenizeSnippets callback always returns false, i.e. tokenization is never
    // suppressed by a plugin-specific callback in these tests.
    void register_passes() {
        manager.register_pass<ov::intel_cpu::SnippetsMarkSkipped>();
        manager.register_pass<ngraph::snippets::pass::EnumerateNodes>();
        manager.register_pass<ngraph::snippets::pass::TokenizeSnippets>();
        manager.get_pass_config()->set_callback<ngraph::snippets::pass::TokenizeSnippets>([](const std::shared_ptr<const ov::Node>& n) -> bool {
            return false;
        });
    }

    // After the base class has run the passes and compared the whole graphs,
    // additionally compare the bodies of the tokenized Subgraph ops (if present).
    // Either both models contain a Subgraph, or neither does.
    void TearDown() override {
        TransformationTestsF::TearDown();

        const auto subgraph = FunctionHelper::getSubgraph(function);
        const auto snippet = std::dynamic_pointer_cast<ngraph::snippets::op::Subgraph>(subgraph);
        // Fix: the original dereferenced the dynamic_pointer_cast result without a
        // null check, so a non-Subgraph node returned by getSubgraph() crashed the test.
        ASSERT_TRUE(subgraph == nullptr || snippet != nullptr) << "found node is not a snippets Subgraph";
        const auto body = snippet == nullptr ? nullptr : snippet->get_body();

        const auto subgraph_ref = FunctionHelper::getSubgraph(function_ref);
        const auto snippet_ref = std::dynamic_pointer_cast<ngraph::snippets::op::Subgraph>(subgraph_ref);
        ASSERT_TRUE(subgraph_ref == nullptr || snippet_ref != nullptr) << "found reference node is not a snippets Subgraph";
        const auto body_ref = snippet_ref == nullptr ? nullptr : snippet_ref->get_body();

        if ((body != nullptr) && (body_ref != nullptr)) {
            auto res = comparator.compare(body, body_ref);
            ASSERT_TRUE(res.valid) << res.message;
        } else {
            ASSERT_EQ(nullptr, body);
            ASSERT_EQ(nullptr, body_ref);
        }
    }
};
TEST_F(FakeQuantizeTokenizationTest, smoke_Snippets_FakeQuantize_PerTensor) {
    // Actual model: prerequisites + FakeQuantize with scalar (per-tensor) interval constants.
    // NOTE(review): the literal `true` is passed where the shared test helper declares a
    // float zeroPoint (would convert to 1.0f) -- verify against the included header's signature.
    function = FakeQuantizeFunction::getOperationAndFakeQuantize(
        { {1, 3, 16, 16} },
        element::f32,
        { {}, {}, {}, {} },
        true,
        FunctionHelper::makePrerequisitesOriginal());

    // Reference model: the same FakeQuantize already wrapped into a snippets Subgraph.
    function_ref = FakeQuantizeFunction::getSubgraphWithFakeQuantize(
        { {1, 3, 16, 16} },
        element::f32,
        { {}, {}, {}, {} },
        true,
        FunctionHelper::makePrerequisitesOriginal());

    register_passes();
}
TEST_F(FakeQuantizeTokenizationTest, smoke_Snippets_FakeQuantize_PerChannels) {
    // Actual model: prerequisites + FakeQuantize with per-channel {1, 3, 1, 1} interval constants.
    // NOTE(review): the literal `true` is passed where the shared test helper declares a
    // float zeroPoint (would convert to 1.0f) -- verify against the included header's signature.
    function = FakeQuantizeFunction::getOperationAndFakeQuantize(
        { {1, 3, 16, 16} },
        element::f32,
        { {1, 3, 1, 1}, {1, 3, 1, 1}, {1, 3, 1, 1}, {1, 3, 1, 1} },
        true,
        FunctionHelper::makePrerequisitesOriginal());

    // Reference model: the same per-channel FakeQuantize already wrapped into a snippets Subgraph.
    function_ref = FakeQuantizeFunction::getSubgraphWithFakeQuantize(
        { {1, 3, 16, 16} },
        element::f32,
        { {1, 3, 1, 1}, {1, 3, 1, 1}, {1, 3, 1, 1}, {1, 3, 1, 1} },
        true,
        FunctionHelper::makePrerequisitesOriginal());

    register_passes();
}
TEST_F(FakeQuantizeTokenizationTest, smoke_Snippets_ConvolutionWithFakeQuantize) {
    // Actual model: Convolution followed by FakeQuantize.
    function = FakeQuantizeFunction::getOperationAndFakeQuantize(
        {{1, 3, 16, 16}},
        element::f32,
        {{}, {}, {}, {}},
        true,
        FunctionHelper::makePrerequisitesOriginal(),
        std::make_shared<ngraph::opset1::Convolution>());

    // Reference model is built with the same builder: tokenization is expected to
    // leave the model unchanged (no Subgraph created for a post-Convolution FakeQuantize),
    // so TearDown's Subgraph-body comparison takes the "both bodies are null" branch.
    function_ref = FakeQuantizeFunction::getOperationAndFakeQuantize(
        {{1, 3, 16, 16}},
        element::f32,
        {{}, {}, {}, {}},
        true,
        FunctionHelper::makePrerequisitesOriginal(),
        std::make_shared<ngraph::opset1::Convolution>());

    register_passes();
}
} // namespace snippets
} // namespace test
} // namespace ov

View File

@ -0,0 +1,50 @@
// Copyright (C) 2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#pragma once
#include <tuple>
#include <string>
#include "shared_test_classes/base/layer_test_utils.hpp"
#include "ngraph_functions/utils/ngraph_helpers.hpp"
#include "ngraph_functions/builders.hpp"
#include "shared_test_classes/base/snippets_test_utils.hpp"
namespace LayerTestsDefinitions {

// Input/precision/interval configuration of a FakeQuantize decomposition case.
// NOTE(review): ActualValues and TestValues are field-for-field identical;
// consider merging them if ActualValues has no external users.
class ActualValues {
public:
    ov::element::Type modelType;                    // model (network) precision
    ngraph::Shape inputShape;                       // shape of the test model's input
    ov::element::Type inputType;                    // input tensor precision
    float zeroPoint;                                // initial value of the low interval constants
    std::vector<ngraph::Shape> fakeQuantizeShapes;  // {inputLow, inputHigh, outputLow, outputHigh} constant shapes
};

class TestValues {
public:
    ov::element::Type modelType;                    // model (network) precision
    ngraph::Shape inputShape;                       // shape of the test model's input
    ov::element::Type inputType;                    // input tensor precision
    float zeroPoint;                                // initial value of the low interval constants
    std::vector<ngraph::Shape> fakeQuantizeShapes;  // {inputLow, inputHigh, outputLow, outputHigh} constant shapes
};

// `using` alias (modernized from typedef) for the gtest parameter tuple.
using testsParams = std::tuple<
    TestValues,                                                                     // test values
    std::pair<std::shared_ptr<ngraph::Node>, std::pair<std::string, std::string>>,  // operation prototype -> {node type, originalLayersNames}
    std::pair<size_t, size_t>,                                                      // expected {number of nodes, number of subgraphs}
    std::string                                                                     // target device
>;

// Parameterized test: builds operation + FakeQuantize, runs it on CPU and checks
// how the FakeQuantize was decomposed/fused (see the .cpp for SetUp and the body).
class FakeQuantizeDecompositionTest : public testing::WithParamInterface<testsParams>, virtual public ov::test::SnippetsTestsCommon {
public:
    static std::string getTestCaseName(testing::TestParamInfo<testsParams> obj);

protected:
    void SetUp() override;
};
} // namespace LayerTestsDefinitions

View File

@ -0,0 +1,78 @@
// Copyright (C) 2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include "snippets/fake_quantize_decomposition_test.hpp"
#include <memory>
#include <tuple>
#include <vector>
#include <string>
#include <ie_core.hpp>
#include "ngraph_ops/type_relaxed.hpp"
#include "fake_quantize_function.hpp"
#include "function_helper.hpp"
namespace LayerTestsDefinitions {
// Builds a unique, human-readable test name from the parameter tuple.
std::string FakeQuantizeDecompositionTest::getTestCaseName(testing::TestParamInfo<testsParams> obj) {
    std::ostringstream result;
    const auto values = std::get<0>(obj.param);
    const auto operation = std::get<1>(obj.param);
    const auto targetDevice = std::get<3>(obj.param);
    // Note: std::get<2> (expected node/subgraph counts) intentionally does not
    // participate in the name; the original fetched it into an unused local.

    const auto type_info = operation.first->get_type_info();
    // A Parameter prototype means "no extra operation before FakeQuantize".
    const auto operationString = ngraph::is_type<ngraph::opset1::Parameter>(operation.first) ?
        "nullptr" :
        (std::string(type_info.name) + "_" + std::string(type_info.version_id));

    result << "IS=" << CommonTestUtils::vec2str(values.inputShape) << "_";
    result << "netPRC=" << values.modelType << "_";
    result << "D=" << targetDevice << "_";
    result << "IN=" << values.inputType << "_";
    result << "OP=" << operationString << "_";
    result << "ON1=" << std::string(operation.second.first) << "_";
    // Fix: the second originalLayersNames field was also labeled "ON1=",
    // producing a duplicated key in the generated test name.
    result << "ON2=" << std::string(operation.second.second) << "_";
    result << "LP=" << values.zeroPoint;
    result << "SH1=" << values.fakeQuantizeShapes[0] << "SH2=" << values.fakeQuantizeShapes[1]
           << "SH3=" << values.fakeQuantizeShapes[2] << "SH4=" << values.fakeQuantizeShapes[3];

    return result.str();
}
// Builds the test model (prerequisites + optional operation + FakeQuantize)
// from the gtest parameter tuple and records the expected node/subgraph counts.
void FakeQuantizeDecompositionTest::SetUp() {
    auto& testsParams = this->GetParam();

    const auto values = std::get<0>(testsParams);
    const auto operation = std::get<1>(testsParams);
    const auto operations_number = std::get<2>(testsParams);
    targetDevice = std::get<3>(testsParams);

    // Expected counts checked later by validateNumSubgraphs().
    ref_num_nodes = operations_number.first;
    ref_num_subgraphs = operations_number.second;

    // Static shape == dynamic shape: these tests run with a fixed input shape.
    init_input_shapes({{values.inputShape, {values.inputShape}}});

    // A Parameter prototype encodes "no operation before FakeQuantize" (nullptr).
    std::shared_ptr<ngraph::Node> op = ngraph::is_type<ngraph::opset1::Parameter>(operation.first) ? nullptr : operation.first;
    function = ov::test::snippets::FakeQuantizeFunction::getOperationAndFakeQuantize(
        {values.inputShape},
        values.inputType,
        values.fakeQuantizeShapes,
        values.zeroPoint,
        ov::test::snippets::FunctionHelper::makePrerequisitesOriginal(),
        op);
}
TEST_P(FakeQuantizeDecompositionTest, CompareWithRefImpl) {
    run();

    // operation.second = {expected runtime node type, expected originalLayersNames}.
    const auto operation = std::get<1>(this->GetParam());
    // Despite the name, this holds the expected runtime node type (e.g. "Subgraph"), not a precision.
    auto elementType = std::string(operation.second.first);
    validateOriginalLayersNamesByType(elementType, operation.second.second);

    // Check the compiled model contains the expected number of snippets subgraphs.
    validateNumSubgraphs();
};
} // namespace LayerTestsDefinitions

View File

@ -12,6 +12,9 @@ namespace test {
class SnippetsTestsCommon : virtual public ov::test::SubgraphBaseTest {
protected:
void validateNumSubgraphs();
void validateOriginalLayersNamesByType(const std::string& layerType, const std::string& originalLayersNames);
// Expected num nodes and subgraphs in exec graphs depends on the plugin
// pipeline, tokenization callback for example. Therefore, they have to be provided manually.
size_t ref_num_nodes = 0;

View File

@ -36,5 +36,23 @@ void SnippetsTestsCommon::validateNumSubgraphs() {
ASSERT_EQ(ref_num_subgraphs, num_subgraphs) << "Compiled model contains invalid number of subgraphs.";
}
void SnippetsTestsCommon::validateOriginalLayersNamesByType(const std::string& layerType, const std::string& originalLayersNames) {
const auto& compiled_model = compiledModel.get_runtime_model();
for (const auto& op : compiled_model->get_ops()) {
const auto& rtInfo = op->get_rt_info();
const auto& typeIt = rtInfo.find("layerType");
const auto type = typeIt->second.as<std::string>();
if (type == layerType) {
const auto& nameIt = rtInfo.find("originalLayersNames");
const auto name = nameIt->second.as<std::string>();
ASSERT_EQ(originalLayersNames, name);
return;
}
}
ASSERT_TRUE(false) << "Layer type '" << layerType << "' was not found in compiled model";
}
} // namespace test
} // namespace ov

View File

@ -0,0 +1,43 @@
// Copyright (C) 2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#pragma once
#include "ngraph/ngraph.hpp"
namespace ov {
namespace test {
namespace snippets {
// Builders for test models around a FakeQuantize operation.
// In all builders `fakeQuantizeShapes` holds the shapes of the four interval
// constants {inputLow, inputHigh, outputLow, outputHigh}, and `zeroPoint` is the
// initial value used to fill the low-interval constants.
class FakeQuantizeFunction {
public:
    // Parameter => prerequisites => [operation] => FakeQuantize => Result
    // `operation` is optional: nullptr means the FakeQuantize consumes the prerequisites directly.
    static std::shared_ptr<ov::Model> getOperationAndFakeQuantize(
        const ngraph::Shape& inputShape,
        const element::Type inputType,
        const std::vector<ngraph::Shape>& fakeQuantizeShapes,
        const float zeroPoint,
        const std::vector<std::shared_ptr<ngraph::Node>>& prerequisites,
        std::shared_ptr<ngraph::Node> operation = nullptr);

    // Parameter => prerequisites => Subgraph (Parameter => beforeFakeQuantizeOperations => FakeQuantize => Result) => Result
    static std::shared_ptr<ov::Model> getSubgraphWithFakeQuantize(
        const ngraph::Shape& inputShape,
        const element::Type inputType,
        const std::vector<ngraph::Shape>& fakeQuantizeShapes,
        const float zeroPoint,
        const std::vector<std::shared_ptr<ngraph::Node>>& prerequisites = {},
        const std::vector<std::shared_ptr<Node>>& beforeFakeQuantizeOperations = {});

    // Parameter => Subgraph (Parameter => element-wise ops from FakeQuantize decomposition results => Result) => Result
    static std::shared_ptr<ov::Model> getSubgraphWithDecomposedFakeQuantize(
        const ngraph::Shape& inputShape,
        const element::Type inputType,
        const std::vector<ngraph::Shape>& fakeQuantizeShapes,
        const float zeroPoint);
};
} // namespace snippets
} // namespace test
} // namespace ov

View File

@ -0,0 +1,28 @@
// Copyright (C) 2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#pragma once
#include <ngraph/ngraph.hpp>
namespace ov {
namespace test {
namespace snippets {
// TODO: workaround while element-wise operations after `Parameter` are not added in Subgraph
class FunctionHelper {
public:
    // Creates the standard set of prerequisite operations placed between the test
    // model's Parameter and the operation under test (see callers for expected content).
    static std::vector<std::shared_ptr<Node>> makePrerequisitesOriginal();

    // Attaches `prerequisites` after `parent` and returns the node the caller
    // should continue building from.
    static std::shared_ptr<Node> applyPrerequisites(
        const std::shared_ptr<Node>& parent,
        const std::vector<std::shared_ptr<Node>>& prerequisites);

    // Returns the `index`-th snippets Subgraph operation of model `f`.
    // index: -1 - latest `Subgraph` operation
    static std::shared_ptr<Node> getSubgraph(const std::shared_ptr<Model>& f, const int index = -1);
};
} // namespace snippets
} // namespace test
} // namespace ov

View File

@ -0,0 +1,264 @@
// Copyright (C) 2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include "fake_quantize_function.hpp"
#include "common_test_utils/data_utils.hpp"
#include <snippets/snippets_isa.hpp>
#include <snippets/op/subgraph.hpp>
#include "ngraph_functions/builders.hpp"
#include "function_helper.hpp"
namespace ov {
namespace test {
namespace snippets {
namespace {
// Builds a FakeQuantize with levels = 256 whose four interval constants are
// generated with the requested shapes: the low constants start at `zeroPoint`,
// the high constants start at 20.f, each filled with initialValue, initialValue + 1, ...
// NOTE(review): `inputShape` is currently unused; kept to preserve the helper's interface.
std::shared_ptr<ngraph::op::FakeQuantize> makeFakeQuantize(
    const Output<Node>& parent,
    const ngraph::Shape& inputShape,
    const element::Type inputType,
    const std::vector<ngraph::Shape>& fakeQuantizeShapes,
    const float zeroPoint) {
    // Creates a named Constant of `shape` filled with an increasing sequence.
    auto generate = [](const ov::element::Type precision,
        const ngraph::Shape& shape,
        const float initialValue,
        const std::string& name) {
        const auto size = ngraph::shape_size(shape);
        std::vector<float> values(size);
        // Fix: use size_t for the index -- shape_size() returns an unsigned type,
        // so the original `auto i = 0` caused a signed/unsigned comparison.
        for (size_t i = 0; i < size; ++i) {
            values[i] = static_cast<float>(initialValue + i);
        }
        auto constant = std::make_shared<ngraph::opset1::Constant>(precision, shape, values);
        constant->set_friendly_name(name);
        return constant;
    };

    const auto fakeQuantize = std::make_shared<ngraph::opset1::FakeQuantize>(
        parent,
        generate(inputType, fakeQuantizeShapes[0], zeroPoint, "inputLow"),
        generate(inputType, fakeQuantizeShapes[1], 20.f, "inputHigh"),
        generate(inputType, fakeQuantizeShapes[2], zeroPoint, "outputLow"),
        generate(inputType, fakeQuantizeShapes[3], 20.f, "outputHigh"),
        256ul);
    fakeQuantize->set_friendly_name("fakeQuantize");
    return fakeQuantize;
}
// Builds a Convolution with constant weights of shape {3, 3, 1, 1} (all 1.f),
// unit strides and zero padding, named "Convolution".
std::shared_ptr<ngraph::opset1::Convolution> makeConvolution(const Output<Node>& parent) {
    const auto kernel = ngraph::opset1::Constant::create(ngraph::element::f32, ngraph::Shape{ 3, 3, 1, 1 }, { 1.f });
    auto conv = std::make_shared<ngraph::opset1::Convolution>(parent,
                                                              kernel,
                                                              ngraph::Strides{ 1, 1 },
                                                              ngraph::CoordinateDiff{ 0, 0 },
                                                              ngraph::CoordinateDiff{ 0, 0 },
                                                              ngraph::Strides{ 1, 1 });
    conv->set_friendly_name("Convolution");
    return conv;
}
// Builds a GroupConvolution with constant weights of shape {1, 3, 3, 1, 1} (all 1.f),
// unit strides and zero padding, named "GroupConvolution".
std::shared_ptr<ngraph::opset1::GroupConvolution> makeGroupConvolution(const Output<Node>& parent) {
    const auto kernel = ngraph::opset1::Constant::create(ngraph::element::f32, ngraph::Shape{ 1, 3, 3, 1, 1 }, { 1.f });
    auto conv = std::make_shared<ngraph::opset1::GroupConvolution>(parent,
                                                                   kernel,
                                                                   ngraph::Strides{ 1, 1 },
                                                                   ngraph::CoordinateDiff{ 0, 0 },
                                                                   ngraph::CoordinateDiff{ 0, 0 },
                                                                   ngraph::Strides{ 1, 1 });
    conv->set_friendly_name("GroupConvolution");
    return conv;
}
// Builds a plain MatMul (no transposes) over the two parents, named "MatMul".
std::shared_ptr<ngraph::opset1::MatMul> makeMatMul(const Output<Node>& parent1, const Output<Node>& parent2) {
    auto product = std::make_shared<ngraph::opset1::MatMul>(parent1, parent2);
    product->set_friendly_name("MatMul");
    return product;
}
// Attaches `parents` to the operation prototype and returns the resulting output.
// Convolution / GroupConvolution / MatMul prototypes are replaced by nodes rebuilt
// with constant weights via the dedicated make* helpers above.
Output<Node> initOperation(std::shared_ptr<Node> operation, const std::vector<Output<Node>>& parents) {
    if (is_type<ngraph::opset1::Convolution>(operation)) {
        assert(parents.size() == 1ul);
        return makeConvolution(parents[0]);
    }

    if (is_type<ngraph::opset1::GroupConvolution>(operation)) {
        assert(parents.size() == 1ul);
        return makeGroupConvolution(parents[0]);
    }

    if (is_type<ngraph::opset1::MatMul>(operation)) {
        assert(parents.size() == 2ul);
        return makeMatMul(parents[0], parents[1]);
    }

    // Any other prototype: attach the first parent as input 0 and name the node
    // after its type. NOTE(review): additional parents are ignored on this path.
    operation->set_argument(0, parents[0]);

    auto elementType = std::string(operation->get_type_name());
    operation->set_friendly_name(elementType);
    return operation;
}
// TODO: workaround while element-wise operations after `Parameter` are not added in Subgraph
// Chains the single-input `operations` one after another starting from `parent`
// and returns the last node of the chain (or `parent`'s node when the list is empty).
std::shared_ptr<Node> getOperations(const std::vector<std::shared_ptr<Node>>& operations, const Output<Node>& parent) {
    Output<Node> currentParent = parent;
    // Fix: iterate by const reference -- the original `for (auto operation : ...)`
    // copied a shared_ptr (atomic refcount traffic) on every iteration.
    for (const auto& operation : operations) {
        operation->set_argument(0, currentParent);
        currentParent = operation;
    }
    return currentParent.get_node_shared_ptr();
}
} // namespace
// Builds: Parameter => prerequisites => [operation] => FakeQuantize => Result.
// `fakeQuantizeShapes` must hold the four interval constant shapes
// {inputLow, inputHigh, outputLow, outputHigh}; `operation` is optional (nullptr => none).
std::shared_ptr<ov::Model> FakeQuantizeFunction::getOperationAndFakeQuantize(
    const ngraph::Shape& inputShape,
    const element::Type inputType,
    const std::vector<ngraph::Shape>& fakeQuantizeShapes,
    const float zeroPoint,
    const std::vector<std::shared_ptr<ngraph::Node>>& prerequisites,
    std::shared_ptr<ngraph::Node> operation) {
    assert(fakeQuantizeShapes.size() == 4ul);

    const auto parameter = std::make_shared<ngraph::opset1::Parameter>(inputType, inputShape);
    parameter->set_friendly_name("parameter");

    // Chain the prerequisite operations right after the Parameter.
    auto parent = FunctionHelper::applyPrerequisites(parameter, prerequisites);

    // Insert `operation` (if any) between the prerequisites and the FakeQuantize.
    const auto fakeQuantize = makeFakeQuantize(
        operation == nullptr ? parent : initOperation(operation, { parent }),
        inputShape,
        inputType,
        fakeQuantizeShapes,
        zeroPoint);
    fakeQuantize->set_friendly_name("fakeQuantize");

    const auto result = std::make_shared<ngraph::opset1::Result>(fakeQuantize);
    result->set_friendly_name("result");

    auto function = std::make_shared<ngraph::Function>(ngraph::ResultVector{ result }, ParameterVector{ parameter }, "FakeQuantizeFunction");
    function->validate_nodes_and_infer_types();
    return function;
}
std::shared_ptr<ov::Model> FakeQuantizeFunction::getSubgraphWithFakeQuantize(
    const ngraph::Shape& inputShape,
    const element::Type inputType,
    const std::vector<ngraph::Shape>& fakeQuantizeShapes,
    const float zeroPoint,
    const std::vector<std::shared_ptr<ngraph::Node>>& prerequisites,
    const std::vector<std::shared_ptr<Node>>& beforeFakeQuantizeOperations) {
    // Builds: Parameter -> [prerequisites] -> Subgraph([ops] -> FakeQuantize) -> Result
    assert(fakeQuantizeShapes.size() == 4ul);

    // Inner body executed by the snippets Subgraph: optional element-wise ops followed by FQ.
    const auto makeBody = [&]() -> std::shared_ptr<ngraph::Function> {
        const auto bodyParameter = std::make_shared<ngraph::opset1::Parameter>(inputType, inputShape);
        bodyParameter->set_friendly_name("parameter");
        const auto fakeQuantize = makeFakeQuantize(
            getOperations(beforeFakeQuantizeOperations, {bodyParameter}), inputShape, inputType, fakeQuantizeShapes, zeroPoint);
        const auto bodyResult = std::make_shared<ngraph::opset1::Result>(fakeQuantize);
        bodyResult->set_friendly_name("result");
        return std::make_shared<ngraph::Function>(
            ngraph::ResultVector{bodyResult}, ngraph::ParameterVector{bodyParameter}, "SubgraphWithFakeQuantizeBody");
    };

    const auto parameter = std::make_shared<ngraph::opset1::Parameter>(inputType, inputShape);
    parameter->set_friendly_name("parameter");
    auto parent = FunctionHelper::applyPrerequisites(parameter, prerequisites);

    const auto subgraph = std::make_shared<ngraph::snippets::op::Subgraph>(ngraph::OutputVector{ parent }, makeBody());
    subgraph->set_friendly_name("subgraph");

    const auto result = std::make_shared<ngraph::opset1::Result>(subgraph);
    result->set_friendly_name("result");

    auto model = std::make_shared<ngraph::Function>(ngraph::ResultVector{ result }, ParameterVector{ parameter }, "SubgraphWithFakeQuantize");
    model->validate_nodes_and_infer_types();
    return model;
}
std::shared_ptr<ov::Model> FakeQuantizeFunction::getSubgraphWithDecomposedFakeQuantize(
    const ngraph::Shape& inputShape,
    const element::Type inputType,
    const std::vector<ngraph::Shape>& fakeQuantizeShapes,
    const float zeroPoint) {
    // Builds: Parameter -> Subgraph(manually decomposed FakeQuantize) -> Result
    assert(fakeQuantizeShapes.size() == 4ul);

    // Shorthand for the f32 scalar coefficients used by the decomposition below.
    const auto scalar = [](const float value) {
        return std::make_shared<ngraph::opset1::Constant>(element::f32, Shape{}, std::vector<float>{value});
    };

    // Subgraph body: clamp (Max/Min), scale, shift, round, rescale, shift — a FakeQuantize
    // expressed through primitive element-wise operations.
    const auto makeBody = [&]() -> std::shared_ptr<ngraph::Function> {
        const auto bodyParameter = std::make_shared<ngraph::opset1::Parameter>(inputType, inputShape);
        bodyParameter->set_friendly_name("parameter");

        const auto maximum = std::make_shared<ngraph::opset1::Maximum>(bodyParameter, scalar(1.f));
        maximum->set_friendly_name("inputLow");

        const auto minimum = std::make_shared<ngraph::opset1::Minimum>(maximum, scalar(20.f));
        minimum->set_friendly_name("inputHigh");

        const auto multiply = std::make_shared<ngraph::opset1::Multiply>(minimum, scalar(13.4211f));
        multiply->set_friendly_name("multiply");

        const auto subtract = std::make_shared<ngraph::opset1::Subtract>(multiply, scalar(13.4211f));
        subtract->set_friendly_name("subtract");

        const auto round = std::make_shared<ngraph::opset5::Round>(subtract, ngraph::opset5::Round::RoundMode::HALF_TO_EVEN);
        round->set_friendly_name("round");

        // friendly name intentionally kept as the original's "devide" — other code may match on it
        const auto divide = std::make_shared<ngraph::opset1::Multiply>(round, scalar(0.0745098f));
        divide->set_friendly_name("devide");

        const auto add = std::make_shared<ngraph::opset1::Add>(divide, scalar(1.f));
        add->set_friendly_name("add");

        const auto bodyResult = std::make_shared<ngraph::opset1::Result>(add);
        bodyResult->set_friendly_name("result");
        return std::make_shared<ngraph::Function>(
            ngraph::ResultVector{bodyResult}, ngraph::ParameterVector{bodyParameter}, "SubgraphWithDecomposedFakeQuantizeBody");
    };

    const auto parameter = std::make_shared<ngraph::opset1::Parameter>(inputType, inputShape);
    parameter->set_friendly_name("parameter");

    const auto subgraph = std::make_shared<ngraph::snippets::op::Subgraph>(ngraph::OutputVector{parameter}, makeBody());
    subgraph->set_friendly_name("subgraph");

    const auto result = std::make_shared<ngraph::opset1::Result>(subgraph);
    result->set_friendly_name("result");

    return std::make_shared<ngraph::Function>(ngraph::ResultVector{result}, ngraph::ParameterVector{parameter}, "SubgraphWithDecomposedFakeQuantize");
}
} // namespace snippets
} // namespace test
} // namespace ov

View File

@ -0,0 +1,73 @@
// Copyright (C) 2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include "function_helper.hpp"
#include "common_test_utils/data_utils.hpp"
#include <snippets/snippets_isa.hpp>
#include <snippets/op/subgraph.hpp>
#include "ngraph_functions/builders.hpp"
namespace ov {
namespace test {
namespace snippets {
// TODO: workaround while element-wise operations after `Parameter` are not added in Subgraph
std::vector<std::shared_ptr<Node>> FunctionHelper::makePrerequisitesOriginal() {
std::vector<std::shared_ptr<Node>> nodes;
const auto parameter = std::make_shared<ngraph::opset1::Parameter>();
parameter->set_friendly_name("parameter");
nodes.push_back(parameter);
const auto maxPool = std::make_shared<ngraph::opset1::MaxPool>(
parameter,
Strides{ 1, 1 }, // strides
Shape{ 0, 0 }, // pads_begin
Shape{ 0, 0 }, // pads_end
Shape{ 1, 1 }); // kernel
maxPool->set_friendly_name("maxPool");
nodes.push_back(maxPool);
return nodes;
}
std::shared_ptr<Node> FunctionHelper::applyPrerequisites(const std::shared_ptr<Node>& parent, const std::vector<std::shared_ptr<Node>>& prerequisites) {
std::shared_ptr<ngraph::Node> currentParent;
if (prerequisites.empty()) {
currentParent = parent;
} else {
auto begin = prerequisites[0];
if (is_type<ngraph::opset1::Parameter>(begin)) {
begin = prerequisites[1];
}
begin->set_argument(0, parent);
currentParent = *prerequisites.rbegin();
}
return currentParent;
}
std::shared_ptr<Node> FunctionHelper::getSubgraph(const std::shared_ptr<Model>& f, const int index) {
int currentIndex = 0;
std::shared_ptr<ngraph::snippets::op::Subgraph> subgraph;
for (const auto& op : f->get_ordered_ops()) {
auto tmp_subgraph = as_type_ptr<ngraph::snippets::op::Subgraph>(op);
if (tmp_subgraph != nullptr) {
if (index == currentIndex) {
return tmp_subgraph;
}
subgraph = tmp_subgraph;
currentIndex++;
}
}
if (index != -1) {
return nullptr;
}
return subgraph;
}
} // namespace snippets
} // namespace test
} // namespace ov

View File

@ -13,19 +13,19 @@ namespace snippets {
std::shared_ptr<ov::Model> AddFunctionLoweredBroadcast::initLowered() const {
auto data0 = std::make_shared<op::v0::Parameter>(precision, input_shapes[0]);
auto load0 = std::make_shared<ngraph::snippets::op::Load>(data0);
std::shared_ptr<Node> add_input0 = load0;
if (!broadcast_shapes[0].empty()) {
auto broadcast0 = std::make_shared<ngraph::snippets::op::BroadcastMove>(load0, broadcast_shapes[0]);
add_input0 = broadcast0;
std::shared_ptr<Node> add_input0 = nullptr;
if (!broadcast_shapes[0].empty() && broadcast_shapes[0].back() != input_shapes[0].back()) {
add_input0 = std::make_shared<ngraph::snippets::op::BroadcastLoad>(data0, broadcast_shapes[0]);
} else {
add_input0 = std::make_shared<ngraph::snippets::op::Load>(data0);
}
auto data1 = std::make_shared<op::v0::Parameter>(precision, input_shapes[1]);
auto load1 = std::make_shared<ngraph::snippets::op::Load>(data1);
std::shared_ptr<Node> add_input1 = load1;
if (!broadcast_shapes[1].empty()) {
auto broadcast1 = std::make_shared<ngraph::snippets::op::BroadcastMove>(load1, broadcast_shapes[1]);
add_input1 = broadcast1;
std::shared_ptr<Node> add_input1 = nullptr;
if (!broadcast_shapes[1].empty() && broadcast_shapes[1].back() != input_shapes[1].back()) {
add_input1 = std::make_shared<ngraph::snippets::op::BroadcastLoad>(data1, broadcast_shapes[1]);
} else {
add_input1 = std::make_shared<ngraph::snippets::op::Load>(data1);
}
auto add = std::make_shared<op::v1::Add>(add_input0, add_input1);
auto store = std::make_shared<ngraph::snippets::op::Store>(add);