[CPU] [DEBUG CAPS] Extension for snippets and other ngraph transformations (#14223)

This commit is contained in:
Egor Duplenskii 2022-12-20 06:49:37 +01:00 committed by GitHub
parent 40e19dec00
commit e306cbc67a
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
20 changed files with 1278 additions and 718 deletions

View File

@ -14,10 +14,11 @@
#include "ie_parallel.hpp"
#include "ie_system_conf.h"
#include <cpp_interfaces/interface/ie_internal_plugin_config.hpp>
#include "cpp_interfaces/interface/ie_internal_plugin_config.hpp"
#include "openvino/core/type/element_type_traits.hpp"
#include "openvino/runtime/properties.hpp"
#include <cpu/x64/cpu_isa_traits.hpp>
#include "utils/debug_capabilities.h"
#include "cpu/x64/cpu_isa_traits.hpp"
namespace ov {
namespace intel_cpu {
@ -48,10 +49,24 @@ Config::Config() {
if (!dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx512_core_bf16))
enforceBF16 = false;
CPU_DEBUG_CAP_ENABLE(readDebugCapsProperties());
CPU_DEBUG_CAP_ENABLE(applyDebugCapsProperties());
updateProperties();
}
#ifdef CPU_DEBUG_CAPS
/**
 * Debug capabilities configuration takes priority over the common one.
 * Some debug capabilities additionally require certain common
 * configuration properties to be enabled.
 */
void Config::applyDebugCapsProperties() {
    // Both verbose mode and the performance summary rely on per-node
    // performance counters, so force them on whenever either is requested.
    const bool perfCountersRequired = !debugCaps.verbose.empty() || !debugCaps.summaryPerf.empty();
    if (perfCountersRequired)
        collectPerfCounters = true;
}
#endif
void Config::readProperties(const std::map<std::string, std::string> &prop) {
const auto streamExecutorConfigKeys = streamExecutorConfig.SupportedKeys();
const auto hintsConfigKeys = perfHintsConfig.SupportedKeys();
@ -184,7 +199,7 @@ void Config::readProperties(const std::map<std::string, std::string> &prop) {
if (exclusiveAsyncRequests) // Exclusive request feature disables the streams
streamExecutorConfig._streams = 1;
CPU_DEBUG_CAP_ENABLE(readDebugCapsProperties());
CPU_DEBUG_CAP_ENABLE(applyDebugCapsProperties());
updateProperties();
}
@ -239,58 +254,6 @@ void Config::updateProperties() {
_config.insert({PluginConfigParams::KEY_CACHE_DIR, cache_dir});
}
#ifdef CPU_DEBUG_CAPS
/**
 * Reads the CPU-plugin debug-capability settings from environment variables
 * (OV_CPU_*) and stores them into the corresponding Config fields.
 * Missing variables leave the current values untouched.
 * @throws if OV_CPU_BLOB_DUMP_FORMAT holds an unknown format name.
 */
void Config::readDebugCapsProperties() {
    auto readEnv = [](const char* envVar) {
        return std::getenv(envVar);
    };

    auto parseDumpFormat = [](const std::string& format) {
        if (format == "BIN")
            return FORMAT::BIN;
        if (format == "TEXT")
            return FORMAT::TEXT;
        IE_THROW() << "readDebugCapsProperties: Unknown dump format";
    };

    const char* envVarValue = nullptr;

    // Note: double parentheses make the assignment-in-condition explicit
    // and silence -Wparentheses; each branch applies only when the variable is set.
    if ((envVarValue = readEnv("OV_CPU_EXEC_GRAPH_PATH")))
        execGraphPath = envVarValue;
    if ((envVarValue = readEnv("OV_CPU_VERBOSE")))
        verbose = envVarValue;
    if ((envVarValue = readEnv("OV_CPU_BLOB_DUMP_DIR")))
        blobDumpDir = envVarValue;
    if ((envVarValue = readEnv("OV_CPU_BLOB_DUMP_FORMAT")))
        blobDumpFormat = parseDumpFormat(envVarValue);
    if ((envVarValue = readEnv("OV_CPU_BLOB_DUMP_NODE_EXEC_ID")))
        blobDumpFilters[BY_EXEC_ID] = envVarValue;
    if ((envVarValue = readEnv("OV_CPU_BLOB_DUMP_NODE_PORTS")))
        blobDumpFilters[BY_PORTS] = envVarValue;
    if ((envVarValue = readEnv("OV_CPU_BLOB_DUMP_NODE_TYPE")))
        blobDumpFilters[BY_TYPE] = envVarValue;
    if ((envVarValue = readEnv("OV_CPU_BLOB_DUMP_NODE_NAME")))
        blobDumpFilters[BY_NAME] = envVarValue;
    if ((envVarValue = readEnv("OV_CPU_SUMMARY_PERF"))) {
        // The performance summary needs counters to be collected.
        collectPerfCounters = true;
        summaryPerf = envVarValue;
    }

    // Always enable perf counters for verbose mode as well.
    if (!verbose.empty())
        collectPerfCounters = true;
}
#endif // CPU_DEBUG_CAPS
} // namespace intel_cpu
} // namespace ov

View File

@ -6,8 +6,11 @@
#include <threading/ie_istreams_executor.hpp>
#include <ie_performance_hints.hpp>
#include "utils/debug_capabilities.h"
#include <ie/ie_common.h>
#include <openvino/util/common_util.hpp>
#include "utils/debug_caps_config.h"
#include <bitset>
#include <string>
#include <map>
#include <mutex>
@ -57,31 +60,12 @@ struct Config {
std::map<std::string, std::string> _config;
#ifdef CPU_DEBUG_CAPS
enum FILTER {
BY_PORTS,
BY_EXEC_ID,
BY_TYPE,
BY_NAME,
};
enum class FORMAT {
BIN,
TEXT,
};
std::string execGraphPath;
std::string verbose;
std::string blobDumpDir = "cpu_dump";
FORMAT blobDumpFormat = FORMAT::TEXT;
// std::hash<int> is necessary for Ubuntu-16.04 (gcc-5.4 and defect in C++11 standard)
std::unordered_map<FILTER, std::string, std::hash<int>> blobDumpFilters;
std::string summaryPerf = "";
void readDebugCapsProperties();
#endif
bool isNewApi = true;
#ifdef CPU_DEBUG_CAPS
DebugCapsConfig debugCaps;
void applyDebugCapsProperties();
#endif
};
} // namespace intel_cpu

View File

@ -6,6 +6,7 @@ Use the following cmake option to enable debug capabilities:
* [Verbose mode](verbose.md)
* [Blob dumping](blob_dumping.md)
* [Graph serialization](graph_serialization.md)
* [Graph transformation disabling](feature_disabling.md#graph-transformations)
## Debug log

View File

@ -29,8 +29,8 @@ Default is *cpu_dump*
OV_CPU_BLOB_DUMP_FORMAT=<format> binary ...
```
Options are:
* BIN (default)
* TEXT
* BIN
* TEXT (default)
## Filter input / output blobs
To dump only input / output blobs:

View File

@ -0,0 +1,52 @@
# Filters
Filters described below have the following common format:
```sh
filter_name=<comma_separated_tokens>
```
Tokens are processed from left to right, and each one includes or excludes the corresponding value.\
To exclude a value, prepend the token with a minus sign: *-token*\
All tokens are case insensitive, and an empty token list is treated as *all*\
So the filters below are equivalent:
* filter_name
* filter_name=all
* filter_name=-all,ALL
## IR format filter
IR format filter is used to specify output IR formats, e.g. for [serialization](graph_serialization.md#graph-transformations).
```sh
formats=<comma_separated_tokens>
```
The following tokens are supported:
* all\
equals to <xml,dot,svg>
* xml (default)\
IR in .xml file. Can be opened using, for example, *netron* app. (For now the option is Linux only)
* xmlbin\
IR in .xml and .bin files. Can be opened using, for example, *netron* app.
* dot\
IR in .dot file (.svg.dot file if svg is also specified). Can be inspected using, for example, *graphviz* tools.
* svg\
IR in .svg file. Requires *dot* tool to be installed on the host, not supported on Windows.\
Generation is based on dot representation, so IR is additionally dumped to .svg.dot file.
## Transformation filter
Transformation filter is used to specify main graph transformation stages for different purposes,
e.g. for [disabling](feature_disabling.md#graph-transformation) or [serialization](graph_serialization.md#graph-transformations).
```sh
transformations=<comma_separated_tokens>
```
The following tokens are supported:
* all (default)\
equals to <preLpt,lpt,postLpt,snippets,specific>
* common \
equals to <preLpt,postLpt>
* preLpt
* lpt
* postLpt
* snippets
* specific

View File

@ -0,0 +1,26 @@
# Feature disabling
A common way to disable a feature in the CPU plugin is via the environment variable **OV_CPU_DISABLE**:
```sh
OV_CPU_DISABLE=<space_separated_options> binary ...
```
Option names are case insensitive and processed from left to right,\
so last one overwrites previous ones if duplicated.
Examples:
```sh
OV_CPU_DISABLE="transformations" binary ...
OV_CPU_DISABLE="transformations=lpt" binary ...
OV_CPU_DISABLE="transformations=all,-common" binary ...
```
By means of corresponding options **OV_CPU_DISABLE** controls disabling of the following features:
## Graph transformations
Graph transformation disabling is controlled by the following option inside **OV_CPU_DISABLE**:
```sh
transformations=<comma_separated_tokens>
```
Filter with main transformation stages to disable specified ones.\
See [transformation filter](debug_caps_filters.md#transformation-filter) for more details.

View File

@ -1,17 +1,43 @@
# Graph serialization
This functionality allows serializing the execution graph using an environment variable:
Graph serialization is disabled by default and controlled by environment variables.
## Execution graph
Execution graph could be serialized using environment variable **OV_CPU_EXEC_GRAPH_PATH**:
```sh
OV_CPU_EXEC_GRAPH_PATH=<path> binary ...
OV_CPU_EXEC_GRAPH_PATH=<option> binary ...
```
Possible serialization options:
* cout\
Serialize to console output.
* \<path\>.xml\
Serialize graph into .xml and .bin files. Can be opened using, for example, *netron* app.
* **TBD**: \<path\>.dot\
Serialize graph into .dot file. Can be inspected using, for example, *graphviz* tools.
## Graph transformations
Additionally, IR could be serialized at specified stages using environment variable **OV_CPU_DUMP_IR**:
```sh
OV_CPU_DUMP_IR=<space_separated_options> binary ...
```
Possible serialization options:
* cout
Examples:
```sh
OV_CPU_DUMP_IR="transformations" binary ...
OV_CPU_DUMP_IR="transformations=snippets dir=path/dumpDir" binary ...
OV_CPU_DUMP_IR="transformations=all,-common DIR=path/dumpdir formats=svg,xml" binary ...
```
Serialize to console output
* \<path\>.xml
Option names are case insensitive, the following options are supported:
* dir=\<path\>\
Path to dumped IR files. If omitted, it defaults to *intel_cpu_dump*
* formats=<comma_separated_tokens>\
Filter with IR formats to dump. If omitted, it defaults to *xml*\
See [IR format filter](debug_caps_filters.md#ir-format-filter) for more details.
* transformations=<comma_separated_tokens>\
Filter with main transformation stages to serialize graph before and after specified ones.\
See [transformation filter](debug_caps_filters.md#transformation-filter) for more details.
Serialize graph into .xml and .bin files. Can be opened using, for example, *netron* app
* \<path\>.dot
TBD. Serialize graph into .dot file. Can be inspected using, for example, *graphviz* tools.
Options are processed from left to right, so last one overwrites previous ones if duplicated.

View File

@ -1073,7 +1073,7 @@ void Graph::InferStatic(InferRequestBase* request) {
dnnl::stream stream(eng);
for (const auto& node : executableGraphNodes) {
VERBOSE(node, config.verbose);
VERBOSE(node, config.debugCaps.verbose);
PERF(node, config.collectPerfCounters);
if (request)
@ -1160,7 +1160,7 @@ void Graph::InferDynamic(InferRequestBase* request) {
updateNodes(stopIndx);
for (; inferCounter < stopIndx; ++inferCounter) {
auto& node = executableGraphNodes[inferCounter];
VERBOSE(node, config.verbose);
VERBOSE(node, config.debugCaps.verbose);
PERF(node, config.collectPerfCounters);
if (request)
@ -1171,7 +1171,7 @@ void Graph::InferDynamic(InferRequestBase* request) {
}
inline void Graph::ExecuteNode(const NodePtr& node, const dnnl::stream& stream) const {
DUMP(node, config, infer_count);
DUMP(node, config.debugCaps, infer_count);
OV_ITT_SCOPED_TASK(itt::domains::intel_cpu, node->profiling.execute);
if (node->isDynamicNode()) {

View File

@ -210,7 +210,7 @@ std::shared_ptr<ngraph::Function> dump_graph_as_ie_ngraph_net(const Graph &graph
#ifdef CPU_DEBUG_CAPS
void serialize(const Graph &graph) {
const std::string& path = graph.getConfig().execGraphPath;
const std::string& path = graph.getConfig().debugCaps.execGraphPath;
if (path.empty())
return;
@ -257,7 +257,7 @@ void serializeToCout(const Graph &graph) {
}
void summary_perf(const Graph &graph) {
const std::string& summaryPerf = graph.getConfig().summaryPerf;
const std::string& summaryPerf = graph.getConfig().debugCaps.summaryPerf;
if (summaryPerf.empty())
return;

View File

@ -27,6 +27,7 @@ namespace intel_cpu {
inline void ConvertToCPUSpecificOpset(std::shared_ptr<ngraph::Function> &nGraphFunc) {
RUN_ON_FUNCTION_SCOPE(ConvertToCPUSpecificOpset);
ngraph::pass::Manager manager;
manager.register_pass<ConvertMatMulToFC>();
manager.register_pass<AlignMatMulInputRanks>();

View File

@ -2,140 +2,28 @@
// SPDX-License-Identifier: Apache-2.0
//
#include "ie_metric_helpers.hpp"
#include "plugin.h"
#include "extension_mngr.h"
#include "weights_cache.hpp"
#include "extension.h"
#include "itt.h"
#include "serialize.h"
#include "ie_metric_helpers.hpp" // must be included first
#include "plugin.h"
#include "transformation_pipeline.h"
#include "itt.h"
#include "extension_mngr.h"
#include "extension.h"
#include "serialize.h"
#include "threading/ie_executor_manager.hpp"
#include "ie_icore.hpp"
#include "ie_plugin_config.hpp"
#include "ie_system_conf.h"
#include "cpp_interfaces/interface/ie_internal_plugin_config.hpp"
#include <threading/ie_executor_manager.hpp>
#include <memory>
#include <ie_plugin_config.hpp>
#include <cpp_interfaces/interface/ie_internal_plugin_config.hpp>
#include <ie_icore.hpp>
#include <fstream>
#include <vector>
#include <tuple>
#include <unordered_set>
#include <ie_system_conf.h>
#include <ie_ngraph_utils.hpp>
#include <transformations/common_optimizations/add_fake_quantize_fusion.hpp>
#include <transformations/common_optimizations/common_optimizations.hpp>
#include <transformations/common_optimizations/fq_mul_fusion.hpp>
#include <transformations/common_optimizations/mul_fake_quantize_fusion.hpp>
#include <transformations/common_optimizations/weights_dequantize_to_fake_quantize.hpp>
#include <transformations/common_optimizations/convert_quantize_dequantize.hpp>
#include <transformations/common_optimizations/nop_elimination.hpp>
#include <transformations/common_optimizations/wrap_interpolate_into_transposes.hpp>
#include <transformations/common_optimizations/transpose_sinking.hpp>
#include "transformations/common_optimizations/convert_compression_only_to_legacy.hpp"
#include <transformations/common_optimizations/lin_op_sequence_fusion.hpp>
#include <transformations/opset_conversions/convert_opset3_to_opset2.hpp>
#include <transformations/opset_conversions/convert_opset2_to_opset1.hpp>
#include <transformations/op_conversions/convert_broadcast_to_tiles.hpp>
#include <transformations/op_conversions/convert_depth_to_space.hpp>
#include <transformations/op_conversions/convert_shuffle_channels3.hpp>
#include <transformations/op_conversions/convert_slice_to_strided_slice.hpp>
#include <transformations/op_conversions/convert_space_to_depth.hpp>
#include <transformations/op_conversions/convert_gelu.hpp>
#include <transformations/op_conversions/convert_gather_downgrade.hpp>
#include <transformations/op_conversions/convert_gather_upgrade.hpp>
#include <transformations/op_conversions/detection_output_downgrade.hpp>
#include <transformations/op_conversions/detection_output_upgrade.hpp>
#include <transformations/op_conversions/gelu7_downgrade.hpp>
#include <transformations/op_conversions/hswish_decomposition.hpp>
#include <transformations/op_conversions/hsigmoid_decomposition.hpp>
#include <transformations/op_conversions/mvn6_decomposition.hpp>
#include <transformations/op_conversions/normalize_l2_decomposition.hpp>
#include <transformations/op_conversions/reduce_l1_decomposition.hpp>
#include <transformations/op_conversions/reduce_l2_decomposition.hpp>
#include <transformations/op_conversions/softplus_decomposition.hpp>
#include <transformations/op_conversions/convert_space_to_batch.hpp>
#include <transformations/op_conversions/convert_batch_to_space.hpp>
#include <transformations/op_conversions/convert_sequences_to_tensor_iterator.hpp>
#include <transformations/op_conversions/convert_subtract.hpp>
#include <transformations/op_conversions/softmax_decomposition.hpp>
#include <transformations/control_flow/unroll_tensor_iterator.hpp>
#include <transformations/op_conversions/convert_mod.hpp>
#include <transformations/op_conversions/convert_ti_to_sequences.hpp>
#include <transformations/op_conversions/lstm_cell_decomposition.hpp>
#include <transformations/op_conversions/rnn_cell_decomposition.hpp>
#include <transformations/op_conversions/gru_cell_decomposition.hpp>
#include <transformations/op_conversions/log_softmax_decomposition.hpp>
#include <transformations/op_conversions/convert_interpolate1_to_interpolate4.hpp>
#include <transformations/op_conversions/simplify_ctc_greedy_decoder_seq_len.hpp>
#include <transformations/op_conversions/convert_previous_nms_to_nms_9.hpp>
#include <transformations/op_conversions/convert_nms9_to_nms_ie_internal.hpp>
#include <transformations/op_conversions/convert_multiclass_nms_to_multiclass_nms_ie.hpp>
#include <transformations/op_conversions/convert_matrix_nms_to_matrix_nms_ie.hpp>
#include <transformations/op_conversions/convert_deformable_conv_v8_to_v1.hpp>
#include <transformations/smart_reshape/matmul_sr.hpp>
#include <transformations/op_conversions/convert_minimum_to_power_and_max.hpp>
#include <transformations/op_conversions/convert_reduce_to_pooling.hpp>
#include <transformations/convert_precision.hpp>
#include <transformations/init_node_info.hpp>
#include <transformations/disable_decompression_convert_constant_folding.hpp>
#include <transformations/rt_info/fused_names_attribute.hpp>
#include <transformations/op_conversions/fq_decomposition.hpp>
#include <transformations/utils/utils.hpp>
#include <transformations/op_conversions/convert_roi_align_v9_to_v3.hpp>
#include <transformations/op_conversions/convert_roi_align_v3_to_v9.hpp>
#include <transformations/op_conversions/softsign_decomposition.hpp>
#include "transformations/op_conversions/eye_decomposition.hpp"
#include "transformations/op_conversions/unique_decomposition.hpp"
#include "ngraph_transformations/convert_to_cpu_specific_opset.hpp"
#include "ngraph_transformations/snippets_mark_skipped.hpp"
#include "ngraph_transformations/mha_fusion.hpp"
#include "ngraph_transformations/convert_to_interaction.hpp"
#include "ngraph_transformations/convert_fq_rnn_to_quantized_rnn.hpp"
#include "ngraph_transformations/move_eltwise_up_data_movement.hpp"
#include "ngraph_transformations/swap_convert_transpose.hpp"
#include <snippets/pass/collapse_subgraph.hpp>
#include <snippets/pass/common_optimizations.hpp>
#include <snippets/pass/convert_constants.hpp>
#include <ngraph/opsets/opset1.hpp>
#include <ngraph/opsets/opset2.hpp>
#include <ngraph/opsets/opset3.hpp>
#include <ngraph/opsets/opset4.hpp>
#include <ngraph/opsets/opset5.hpp>
#include <ngraph/opsets/opset6.hpp>
#include <openvino/opsets/opset10.hpp>
#include <ngraph/op/util/op_types.hpp>
#include <ngraph/pass/manager.hpp>
#include <ngraph/graph_util.hpp>
#include <ov_ops/augru_cell.hpp>
#include <ov_ops/augru_sequence.hpp>
#include <transformations/low_precision/mark_dequantization_subgraph.hpp>
#include <low_precision/common/quantization_granularity_restriction.hpp>
#include <low_precision/common/precisions_restriction.hpp>
#include <low_precision/convert_subtract_constant.hpp>
#include <low_precision/convolution.hpp>
#include <low_precision/convolution_backprop_data.hpp>
#include <low_precision/layer_transformation.hpp>
#include <low_precision/low_precision.hpp>
#include <low_precision/multiply_to_group_convolution.hpp>
#include <low_precision/network_helper.hpp>
#include "openvino/runtime/core.hpp"
#include "openvino/util/common_util.hpp"
#include <ie_algorithm.hpp>
#include "performance_heuristics.hpp"
#include "nodes/mvn.h"
#include "nodes/fake_quantize.h"
#include "nodes/normalize.h"
#include "nodes/mha.h"
#include "weights_cache.hpp"
#include "utils/denormals.hpp"
#include "transformations/common_optimizations/augru_cell_fusion.hpp"
#if !defined(__arm__) && !defined(_M_ARM) && !defined(__aarch64__) && !defined(_M_ARM64)
#ifndef __GNUC_PREREQ
@ -262,452 +150,6 @@ Engine::~Engine() {
executorManager()->clear("CPUCallbackExecutor");
}
/**
 * Retargets a Convert node to the element type @p to.
 * For a real -> boolean conversion that is being lowered to an integral type
 * (boolean is materialized as u8), inserts Abs + Ceiling before the Convert so
 * small non-zero values map to 1 instead of truncating to 0
 * (e.g. 0.01 must become 1 for boolean, not 0 for u8).
 * @return true if the node was a Convert and was handled, false otherwise.
 */
static bool fuse_type_to_convert(const std::shared_ptr<ngraph::Node>& node, ov::element::Type to, size_t idx) {
    auto convert = ov::as_type_ptr<ov::opset10::Convert>(node);
    if (!convert)
        return false;

    const bool realToBooleanAsIntegral = convert->input(0).get_element_type().is_real() &&
                                         convert->get_convert_element_type() == ngraph::element::boolean &&
                                         to.is_integral_number();
    if (!realToBooleanAsIntegral) {
        // Safe to simply retarget the Convert's destination type.
        convert->set_convert_element_type(to);
        return true;
    }

    // Build Abs -> Ceiling -> Convert(to) to preserve boolean semantics.
    auto abs_op = std::make_shared<ov::opset10::Abs>(convert->input_value(0).get_node_shared_ptr());
    auto ceil_op = std::make_shared<ov::opset10::Ceiling>(abs_op);
    auto replacement = std::make_shared<ov::opset10::Convert>(ceil_op, to);
    replacement->set_friendly_name(convert->get_friendly_name());
    ov::copy_runtime_info(convert, {abs_op, ceil_op, replacement});
    ov::replace_node(convert, replacement);
    return true;
}
static void TransformationUpToCPUSpecificOpSet(std::shared_ptr<ngraph::Function> nGraphFunc, const bool _enableLPT, const bool _enableBF16,
const bool _enableSnippets, const bool isLegacyApi) {
ov::pass::Manager manager;
manager.set_per_pass_validation(false);
manager.register_pass<ov::pass::InitNodeInfo>();
const bool useLpt =
_enableLPT &&
ngraph::pass::low_precision::LowPrecision::isFunctionQuantized(nGraphFunc);
auto defaultPrecisions = useLpt ? ngraph::pass::low_precision::precision_set::int8_support : std::vector<ov::element::Type>{};
bool hasINT16orINT32Levels = false;
if (useLpt) {
CPU_LPT_SCOPE(LowPrecisionTransformations_Part1);
hasINT16orINT32Levels = ngraph::pass::low_precision::LowPrecision::isFQLevelsPresent(
nGraphFunc,
{ngraph::pass::low_precision::levels::int16, ngraph::pass::low_precision::levels::int16_narrow_range,
ngraph::pass::low_precision::levels::int32, ngraph::pass::low_precision::levels::int32_narrow_range});
if (hasINT16orINT32Levels) {
defaultPrecisions = ngraph::pass::low_precision::precision_set::int8_int16_int32_support;
}
manager.register_pass<ov::pass::MarkDequantizationSubgraph>(defaultPrecisions);
}
auto get_convert_precisions = []() {
precisions_array array = {
{ngraph::element::i64, ngraph::element::i32},
{ngraph::element::u64, ngraph::element::i32},
{ngraph::element::i16, ngraph::element::i32},
{ngraph::element::u16, ngraph::element::i32},
{ngraph::element::u32, ngraph::element::i32},
{ngraph::element::f64, ngraph::element::f32},
{ngraph::element::f16, ngraph::element::f32},
{ngraph::element::boolean, ngraph::element::u8},
{ngraph::element::i4, ngraph::element::i8},
{ngraph::element::u4, ngraph::element::u8}
};
if (!dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx512_core))
array.push_back({ngraph::element::bf16, ngraph::element::f32});
return array;
};
static const auto precisions = get_convert_precisions();
type_to_fuse_map type_to_fuse = {{ov::opset10::Convert::get_type_info_static(), fuse_type_to_convert}};
manager.register_pass<ov::pass::AUGRUCellFusion>();
manager.register_pass<ov::pass::CommonOptimizations>();
manager.register_pass<ov::pass::WrapInterpolateIntoTransposes>();
manager.register_pass<ov::pass::TransposeSinking>();
manager.register_pass<ov::pass::ConvertSequenceToTensorIterator>();
manager.register_pass<ov::pass::ConvertOpSet3ToOpSet2>();
manager.register_pass<ov::pass::ConvertOpSet2ToOpSet1>();
manager.register_pass<ov::pass::LSTMCellDecomposition>();
manager.register_pass<ov::pass::GRUCellDecomposition>();
manager.register_pass<ov::pass::RNNCellDecomposition>();
manager.register_pass<ov::pass::ConvertNMS1ToNMS9>();
manager.register_pass<ov::pass::ConvertNMS3ToNMS9>();
manager.register_pass<ov::pass::ConvertNMS4ToNMS9>();
manager.register_pass<ov::pass::ConvertNMS5ToNMS9>();
manager.register_pass<ov::pass::ConvertNMS9ToNMSIEInternal>();
manager.register_pass<ov::pass::ConvertMulticlassNmsToMulticlassNmsIE>();
manager.register_pass<ov::pass::ConvertMatrixNmsToMatrixNmsIE>();
manager.register_pass<ov::pass::TransposeMatMul>();
manager.register_pass<ov::pass::ConstantFolding>();
if (useLpt) {
CPU_LPT_SCOPE(LowPrecisionTransformations_Part2);
manager.register_pass<ngraph::pass::low_precision::ConvertSubtractConstant>(defaultPrecisions);
}
manager.register_pass<ov::pass::Validate>();
manager.register_pass<ov::pass::ConvertPrecision>(precisions, type_to_fuse);
manager.register_pass<ov::pass::EliminateConvert>();
manager.register_pass<SwapConvertTranspose>();
manager.register_pass<ConvertToInteraction>();
manager.register_pass<ConvertInteractionInt8>();
auto pass_config = manager.get_pass_config();
using const_node_ptr = const std::shared_ptr<const ngraph::Node>;
// SpaceToDepth/ DepthToSpace node implementation supports only equal input/output tensors with rank <= 5
pass_config->set_callback<ov::pass::ConvertSpaceToDepth,
ov::pass::ConvertDepthToSpace>(
[](const_node_ptr &node) -> bool {
return node->input_value(0).get_shape().size() <= 5lu &&
node->input_value(0).get_shape().size() == node->get_output_shape(0).size();
});
pass_config->set_callback<ov::pass::ConvertBatchToSpace,
ov::pass::ConvertSpaceToBatch>(
[](const_node_ptr &node) -> bool {
const auto & rank = node->input(0).get_partial_shape().rank().get_length();
return rank == 4lu || rank == 5lu;
});
auto isCellPrimitiveSupported = [](const_node_ptr &node) -> bool {
if (const auto &rnn_cell = std::dynamic_pointer_cast<const ngraph::opset4::RNNCell>(node)) {
return rnn_cell->get_clip() == 0.0f;
} else if (const auto &gru_cell = std::dynamic_pointer_cast<const ngraph::opset4::GRUCell>(
node)) {
return gru_cell->get_clip() == 0.0f
&& gru_cell->get_activations() == std::vector<std::string>{"sigmoid", "tanh"};
} else if (const auto &augru_cell = std::dynamic_pointer_cast<const ov::op::internal::AUGRUCell>(
node)) {
return augru_cell->get_clip() == 0.0f
&& augru_cell->get_activations() == std::vector<std::string>{"sigmoid", "tanh"};
} else if (const auto &lstm_cell = std::dynamic_pointer_cast<const ngraph::opset4::LSTMCell>(
node)) {
return lstm_cell->get_clip() == 0.0f &&
lstm_cell->get_activations() == std::vector<std::string>{"sigmoid", "tanh", "tanh"};
} else if (const auto &lstm_cell_v1 = std::dynamic_pointer_cast<const ngraph::opset1::LSTMCell>(
node)) {
return lstm_cell_v1->get_clip() == 0.0f &&
lstm_cell_v1->get_activations() == std::vector<std::string>{"sigmoid", "tanh", "tanh"};
}
return false;
};
// Sequences supported by the plugin shouldn't be converted to TensorIterator.
// sequence_length input is not supported in all Sequences, so if is_seq_len_provided() == true, we
// should always convert to TensorIterator.
// RNN/GRU/LSTM Sequences are supported with clip == 0, and with default activations.
auto isSequencePrimitiveSupported = [](const_node_ptr &node) -> bool {
const auto& data = node->input(0);
const auto& data_pshape = data.get_partial_shape();
// WA: dynamic shapes make impossible to check seq_len due to shapeOf subgraphs
// but the sequence is still supported in CPU and doesn't need to be decomposed
if (data_pshape.is_dynamic())
return true;
if (data_pshape.rank().is_static() && data_pshape.rank().get_length() > 1 && !data_pshape[1].is_static())
return false;
auto max_seq_len = data.get_shape().at(1);
if (const auto &rnn_seq = std::dynamic_pointer_cast<const ngraph::opset6::RNNSequence>(node)) {
return rnn_seq->get_clip() == 0.0f &&
!ngraph::op::util::is_seq_len_provided(rnn_seq->get_input_node_shared_ptr(2),
max_seq_len);
} else if (const auto &gru_seq = std::dynamic_pointer_cast<const ngraph::opset6::GRUSequence>(
node)) {
return gru_seq->get_clip() == 0.0f &&
gru_seq->get_activations() == std::vector<std::string>{"sigmoid", "tanh"} &&
!ngraph::op::util::is_seq_len_provided(gru_seq->get_input_node_shared_ptr(2),
max_seq_len);
} else if (const auto &augru_seq = std::dynamic_pointer_cast<const ov::op::internal::AUGRUSequence>(
node)) {
return augru_seq->get_clip() == 0.0f &&
augru_seq->get_activations() == std::vector<std::string>{"sigmoid", "tanh"} &&
!ngraph::op::util::is_seq_len_provided(augru_seq->get_input_node_shared_ptr(2),
max_seq_len);
} else if (const auto &lstm_seq = std::dynamic_pointer_cast<const ngraph::opset6::LSTMSequence>(
node)) {
return lstm_seq->get_clip() == 0.0f &&
lstm_seq->get_activations() == std::vector<std::string>{"sigmoid", "tanh", "tanh"} &&
!ngraph::op::util::is_seq_len_provided(lstm_seq->get_input_node_shared_ptr(3),
max_seq_len);
}
return false;
};
pass_config->set_callback<ov::pass::ConvertRNNSequenceToTensorIterator,
ov::pass::ConvertGRUSequenceToTensorIterator,
ov::pass::ConvertLSTMSequenceToTensorIterator>(
[isSequencePrimitiveSupported](const_node_ptr &node) -> bool {
return isSequencePrimitiveSupported(node);
});
pass_config->set_callback<ov::pass::RNNCellDecomposition, ov::pass::GRUCellDecomposition,
ov::pass::LSTMCellDecomposition>(
[isCellPrimitiveSupported](const_node_ptr &node) -> bool {
return isCellPrimitiveSupported(node);
});
pass_config->set_callback<ov::pass::MVN6Decomposition>(
[](const_node_ptr &node) -> bool {
std::string errorMessage;
return node::MVN::isSupportedOperation(node, errorMessage);
});
pass_config->set_callback<ov::pass::NormalizeL2Decomposition>(
[](const_node_ptr &node) -> bool {
std::string errorMsg;
return node::NormalizeL2::isSupportedOperation(node, errorMsg);
});
pass_config->enable<ov::pass::SoftmaxDecomposition>();
pass_config->set_callback<ov::pass::SoftmaxDecomposition>(
[](const_node_ptr &node) -> bool {
return node->input_value(0).get_partial_shape().rank().get_length() <= 5;
});
if (!isLegacyApi) {
auto nmsCallback = [](const_node_ptr &node) -> bool {
for (size_t i = 0; i < node->get_output_size(); i++) {
const auto outputs = node->get_output_target_inputs(i);
for (const auto &out : outputs) {
if (!ngraph::op::is_output(out.get_node())) {
return false;
}
}
}
return true;
};
pass_config->set_callback<ov::pass::ConvertNMS9ToNMSIEInternal>(nmsCallback);
pass_config->set_callback<ov::pass::ConvertMulticlassNmsToMulticlassNmsIE>(nmsCallback);
pass_config->set_callback<ov::pass::ConvertMatrixNmsToMatrixNmsIE>(nmsCallback);
}
// List of enabled/disabled transformations
// Allow FP16 Converts to be folded and FP16 constants to be upgraded to FP32 data type
pass_config->disable<ov::pass::DisableDecompressionConvertConstantFolding>();
pass_config->disable<ov::pass::ConvertCompressedOnlyToLegacy>();
pass_config->disable<ov::pass::EyeDecomposition>();
pass_config->disable<ov::pass::ConvertGELU>();
pass_config->disable<ov::pass::ConvertShuffleChannels3>();
pass_config->disable<ov::pass::Gelu7Downgrade>();
pass_config->disable<ov::pass::HSwishDecomposition>();
pass_config->disable<ov::pass::ReduceL1Decomposition>();
pass_config->disable<ov::pass::ReduceL2Decomposition>();
pass_config->disable<ov::pass::SoftPlusDecomposition>();
pass_config->disable<ov::pass::HSigmoidDecomposition>();
pass_config->disable<ov::pass::ConvertMod>();
pass_config->disable<ov::pass::ConvertShuffleChannels3>();
pass_config->disable<ov::pass::WeightsDequantizeToFakeQuantize>();
pass_config->disable<ov::pass::SimplifyCTCGreedyDecoderSeqLen>();
pass_config->disable<ov::pass::ConvertGather7ToGather1>();
pass_config->disable<ov::pass::ConvertGather8ToGather7>();
pass_config->disable<ov::pass::ConvertMinimum>();
pass_config->disable<ov::pass::ConvertBroadcastToTiles>();
pass_config->disable<ov::pass::ConvertReduceMeanToPooling>();
pass_config->disable<ov::pass::ConvertReduceMaxToPooling>();
pass_config->disable<ov::pass::ConvertReduceSumToPooling>();
pass_config->disable<ov::pass::SliceToStridedSlice>();
pass_config->disable<ov::pass::ConvertDetectionOutput8ToDetectionOutput1>();
pass_config->disable<ov::pass::ConvertROIAlign9To3>();
pass_config->disable<ov::pass::SoftSignDecomposition>();
pass_config->disable<ov::pass::UniqueDecomposition>();
pass_config->enable<ov::pass::NormalizeL2Decomposition>();
pass_config->enable<ov::pass::ConvertInterpolate1ToInterpolate4>();
pass_config->enable<ov::pass::ConvertGather1ToGather7>();
pass_config->enable<ov::pass::ConvertDetectionOutput1ToDetectionOutput8>();
pass_config->enable<ov::pass::ConvertROIAlign3To9>();
if (useLpt) {
CPU_LPT_SCOPE(LowPrecisionTransformations_Part3);
pass_config->set_callback<ov::pass::AddFakeQuantizeFusion,
ov::pass::MulFakeQuantizeFusion,
ov::pass::FakeQuantizeMulFusion>([](const_node_ptr &node) -> bool {
std::string errMsg;
return !node::FakeQuantize::isSupportedOperation(node, errMsg);
});
pass_config->set_callback<ov::pass::ConvertQuantizeDequantize>([&defaultPrecisions](const_node_ptr &node) -> bool {
return ngraph::pass::low_precision::NetworkHelper::areQuantizeAndDequantizeSupportedForMultiply(node, defaultPrecisions);
});
}
manager.run_passes(nGraphFunc);
using namespace ngraph::pass::low_precision;
if (useLpt) {
CPU_LPT_SCOPE(LowPrecisionTransformations_Part4);
OV_ITT_SCOPE(FIRST_INFERENCE, itt::domains::intel_cpu_LT, "LowPrecisionTransformations");
//Only enable conv/group conv signed input on AMX platform.
std::vector<ngraph::element::Type> input0LowPrecisionList;
if (dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx512_core_amx)) {
input0LowPrecisionList = {ngraph::element::u8, ngraph::element::i8};
} else {
input0LowPrecisionList = {ngraph::element::u8};
}
auto supportedPrecisions = std::vector<PrecisionsRestriction>({
PrecisionsRestriction::create<ngraph::opset1::Convolution>({
{{0}, input0LowPrecisionList},
{{1}, {ngraph::element::i8}},
}),
PrecisionsRestriction::create<ngraph::opset1::ConvolutionBackpropData>({
{{0}, {ngraph::element::u8, ngraph::element::i8}},
{{1}, {ngraph::element::i8}}
}),
PrecisionsRestriction::create<ngraph::opset1::GroupConvolution>({
{{0}, input0LowPrecisionList},
{{1}, {ngraph::element::i8}}
}),
PrecisionsRestriction::create<ngraph::opset1::Multiply>({
{{0}, {ngraph::element::u8}},
{{1}, {ngraph::element::i8}},
}),
PrecisionsRestriction::create<ngraph::opset1::MatMul>({
{{0}, {ngraph::element::u8, ngraph::element::i8}},
{{1}, {ngraph::element::i8}}
}),
PrecisionsRestriction::create<ngraph::opset5::LSTMSequence>({
{{0, 1}, {ngraph::element::u8, ngraph::element::i8}},
}),
PrecisionsRestriction::create<ngraph::opset6::GRUSequence>({
{{0, 1}, {ngraph::element::u8, ngraph::element::i8}},
}),
});
auto quantizationRestrictions = std::vector<QuantizationGranularityRestriction>({
QuantizationGranularityRestriction::create<ngraph::opset1::Convolution>({0}),
QuantizationGranularityRestriction::create<ngraph::opset1::ConvolutionBackpropData>({0})
});
// for GNA networks reference execution
bool updatePrecision = true;
if (hasINT16orINT32Levels) {
updatePrecision = false;
supportedPrecisions = std::vector<PrecisionsRestriction>({});
}
ov::pass::Manager lptManager;
lptManager.register_pass<ngraph::pass::low_precision::LowPrecision>(
supportedPrecisions,
quantizationRestrictions,
LayerTransformation::Params(updatePrecision, ngraph::element::f32, defaultPrecisions));
lptManager.get_pass_config()->set_callback<ngraph::pass::low_precision::MarkupPrecisions>([](const_node_ptr& node) -> bool {
if (const auto mulitply = std::dynamic_pointer_cast<const ngraph::opset1::Multiply>(node)) {
return !MultiplyToGroupConvolutionTransformation::canBeTransformedToGroupConvolution(mulitply);
}
return false;
});
lptManager.get_pass_config()->set_callback<ngraph::pass::low_precision::ConvolutionBackpropDataTransformation>(
[&defaultPrecisions](const_node_ptr& node) -> bool {
return LayerTransformation::isAsymmetricQuantization(node, defaultPrecisions) ||
WeightableLayerTransformation::isAsymmetricOnWeights(node, defaultPrecisions);
});
lptManager.get_pass_config()->set_callback<ngraph::pass::low_precision::MultiplyToGroupConvolutionTransformation>([](const_node_ptr& node) -> bool {
return true;//MultiplyToGroupConvolutionTransformation::isDynamicOrScalar(node);
});
lptManager.run_passes(nGraphFunc);
}
ov::pass::Manager postLPTPassManager;
postLPTPassManager.register_pass<ov::pass::UnrollTensorIterator>();
postLPTPassManager.register_pass<ov::pass::ReshapePRelu>();
postLPTPassManager.get_pass_config()->set_callback<ov::pass::UnrollTensorIterator>([](const_node_ptr &node) -> bool {
// UnrollTI transformation is disabled by default, is turned on by LowLatency transformation
return node->get_rt_info().count("UNROLL_TI") == 0;
});
postLPTPassManager.register_pass<MoveEltwiseUpThroughDataMov>();
postLPTPassManager.get_pass_config()->set_callback<MoveEltwiseUpThroughDataMov>([](const std::shared_ptr<const ngraph::Node>& node) -> bool {
if (node->get_input_size() >= 2) {
return node->get_input_element_type(1) == ngraph::element::i8 || node->get_input_element_type(1) == ngraph::element::u8;
}
return false;
});
postLPTPassManager.register_pass<ov::pass::ConstantFolding>();
// Snippets may brake MHA patterns so the fusion has to performed before
postLPTPassManager.register_pass<MHAFusion>();
postLPTPassManager.register_pass<FuseFQtoInteraction>();
postLPTPassManager.get_pass_config()->set_callback<MHAFloatFusion, MHAFloatFusion2,
MHAQuantFusion, MHAQuantFusion2>([_enableBF16](const std::shared_ptr<const ov::Node>& n) -> bool {
std::string errorMessage;
if (!node::MHA::isSupportedOperation(n, errorMessage))
return true;
// Implementation calls AMX BF16 brgemm only for tensors with K and N aligned on 2, otherwise fallbacks on vector impl
// Vector madd BF16 instruction on SPR has reduced performance on HW level, which results in overall perf degradation
size_t bf16Factor = 2;
if (dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx512_core_bf16_amx_bf16) &&
(n->get_input_element_type(0) == element::bf16 || (n->get_input_element_type(0) == element::f32 && _enableBF16)) &&
(n->get_input_shape(0)[3] % bf16Factor != 0 || n->get_input_shape(1)[1] % bf16Factor != 0 || n->get_input_shape(3)[3] % bf16Factor != 0)) {
return true;
}
return false;
});
// Execute before snippets. Otherwise FQ will be converted to Subgraph
postLPTPassManager.register_pass<ConvertFqRnnToQuantizedRnn>();
postLPTPassManager.run_passes(nGraphFunc);
if (_enableSnippets && dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx2)) {
ov::pass::Manager snippetsManager;
snippetsManager.register_pass<SnippetsMarkSkipped>();
snippetsManager.register_pass<ngraph::snippets::pass::EnumerateNodes>();
snippetsManager.register_pass<ngraph::snippets::pass::TokenizeSnippets>();
snippetsManager.get_pass_config()->set_callback<ngraph::snippets::pass::TokenizeSnippets>(
[](const std::shared_ptr<const ov::Node>& n) -> bool {
// CPU Plugin support Swish in Subgraph via conversion to SwichCPU which assumes second input to be constant
if (ov::is_type<const ov::op::v4::Swish>(n)) {
if (n->inputs().size() > 1 && !ov::is_type<const ov::op::v0::Constant>(n->get_input_node_shared_ptr(1)))
return true;
}
const auto& inputs = n->inputs();
// todo: clarify whether we can evaluate snippets on const paths
const bool has_only_const_inputs = std::all_of(inputs.begin(), inputs.end(),
[](const ov::Input<const ov::Node> &in) {
return ov::is_type<ov::op::v0::Constant>(in.get_source_output().get_node_shared_ptr());
});
// todo: clarify whether we can evaluate snippets on inputs with larger ranks
auto rank_is_too_large = [](const ov::descriptor::Tensor& t ) {
// callback is called has_supported_in_out(), so it's safe to assume that the shapes are static
return t.get_partial_shape().rank().get_length() > 6;
};
const bool bad_input_rank = std::any_of(inputs.begin(), inputs.end(),
[&](const ov::Input<const ov::Node>& in) {return rank_is_too_large(in.get_tensor());});
const auto& outputs = n->outputs();
const bool bad_output_rank = std::any_of(outputs.begin(), outputs.end(),
[&](const ov::Output<const ov::Node>& out) {return rank_is_too_large(out.get_tensor());});
return has_only_const_inputs || bad_input_rank || bad_output_rank;
});
snippetsManager.register_pass<ngraph::snippets::pass::CommonOptimizations>();
snippetsManager.run_passes(nGraphFunc);
}
ov::pass::Manager postSnippetsManager;
postSnippetsManager.register_pass<ov::pass::FakeQuantizeDecomposition>();
postSnippetsManager.get_pass_config()->set_callback<ov::pass::FakeQuantizeDecomposition>([](const_node_ptr& node) -> bool {
std::string errMsg;
return node::FakeQuantize::isSupportedOperation(node, errMsg);
});
postSnippetsManager.register_pass<ov::pass::ConstantFolding>();
postSnippetsManager.run_passes(nGraphFunc);
}
static bool streamsSet(const std::map<std::string, std::string>& config) {
return config.count(PluginConfigParams::KEY_CPU_THROUGHPUT_STREAMS) ||
config.count(ov::num_streams.name());
@ -883,7 +325,7 @@ Engine::LoadExeNetworkImpl(const InferenceEngine::CNNNetwork &network, const std
const bool enableLPT = (lptProp != config.end() && lptProp->second == PluginConfigParams::YES) /* enabled in the orig_config*/
|| Config::LPTransformsMode::On == engConfig.lpTransformsMode /* or already enabled for the plugin */;
const auto& BF16Prop = config.find(InferenceEngine::PluginConfigParams::KEY_ENFORCE_BF16);
bool enableBF16;
bool enableBF16 = false;
if (BF16Prop != config.end()) {
if (BF16Prop->second == PluginConfigParams::YES) {
enableBF16 = dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx512_core);
@ -901,7 +343,8 @@ Engine::LoadExeNetworkImpl(const InferenceEngine::CNNNetwork &network, const std
DEBUG_LOG(PrintableModel(*nGraphFunc, "org_"));
TransformationUpToCPUSpecificOpSet(nGraphFunc, enableLPT, enableBF16, enableSnippets, isLegacyAPI());
Transformations transformations(nGraphFunc, enableLPT, enableSnippets, enableBF16, isLegacyAPI(), engConfig);
transformations.UpToCpuSpecificOpSet();
// need to check that all outputs have static shapes
// checking that all inputs have static shapes is performed in the common part
@ -914,8 +357,7 @@ Engine::LoadExeNetworkImpl(const InferenceEngine::CNNNetwork &network, const std
}
ApplyPerformanceHints(config, nGraphFunc);
ConvertToCPUSpecificOpset(nGraphFunc);
transformations.CpuSpecificOpSet();
DEBUG_LOG(PrintableModel(*nGraphFunc, "cpu_"));
@ -1153,19 +595,20 @@ QueryNetworkResult Engine::QueryNetwork(const CNNNetwork& network, const std::ma
}
auto supported = GetSupportedNodes(model,
[&](std::shared_ptr<ov::Model>& model) {
TransformationUpToCPUSpecificOpSet(model, enableLPT, conf.enforceBF16, enableSnippets, isLegacyAPI());
ConvertToCPUSpecificOpset(model);
},
[&](const std::shared_ptr<ngraph::Node>& op) {
std::unique_ptr<Node> ptr;
try {
ptr.reset(Node::factory().create(op, {dnnl::engine::kind::cpu, 0}, extensionManager, fake_w_cache));
} catch (const InferenceEngine::Exception&) {
return false;
}
return true;
});
[&](std::shared_ptr<ov::Model>& model) {
Transformations transformation(model, enableLPT, enableSnippets, conf.enforceBF16, isLegacyAPI(), engConfig);
transformation.UpToCpuSpecificOpSet();
transformation.CpuSpecificOpSet();
},
[&](const std::shared_ptr<ngraph::Node>& op) {
std::unique_ptr<Node> ptr;
try {
ptr.reset(Node::factory().create(op, {dnnl::engine::kind::cpu, 0}, extensionManager, fake_w_cache));
} catch (const InferenceEngine::Exception&) {
return false;
}
return true;
});
for (auto&& layerName : supported) {
res.supportedLayersMap.emplace(layerName, GetName());

View File

@ -4,16 +4,12 @@
#pragma once
#include <cpp_interfaces/interface/ie_iplugin_internal.hpp>
#include "exec_network.h"
#include <string>
#include <map>
#include <unordered_map>
#include <memory>
#include <functional>
#include <vector>
#include <cfloat>
namespace ov {
namespace intel_cpu {

View File

@ -0,0 +1,609 @@
// Copyright (C) 2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include "transformation_pipeline.h"
// Operations
#include "openvino/opsets/opset1.hpp"
#include "openvino/opsets/opset2.hpp"
#include "openvino/opsets/opset3.hpp"
#include "openvino/opsets/opset4.hpp"
#include "openvino/opsets/opset5.hpp"
#include "openvino/opsets/opset6.hpp"
#include "openvino/opsets/opset10.hpp"
#include <ov_ops/augru_cell.hpp>
#include <ov_ops/augru_sequence.hpp>
// Common transformations
#include "transformations/common_optimizations/add_fake_quantize_fusion.hpp"
#include "transformations/common_optimizations/convert_compression_only_to_legacy.hpp"
#include "transformations/common_optimizations/convert_quantize_dequantize.hpp"
#include "transformations/common_optimizations/fq_mul_fusion.hpp"
#include "transformations/common_optimizations/mul_fake_quantize_fusion.hpp"
#include "transformations/common_optimizations/nop_elimination.hpp"
#include "transformations/common_optimizations/transpose_sinking.hpp"
#include "transformations/common_optimizations/weights_dequantize_to_fake_quantize.hpp"
#include "transformations/common_optimizations/augru_cell_fusion.hpp"
#include "transformations/common_optimizations/common_optimizations.hpp"
#include "transformations/common_optimizations/wrap_interpolate_into_transposes.hpp"
#include "transformations/control_flow/unroll_tensor_iterator.hpp"
#include "transformations/disable_decompression_convert_constant_folding.hpp"
#include "transformations/op_conversions/convert_batch_to_space.hpp"
#include "transformations/op_conversions/convert_broadcast_to_tiles.hpp"
#include "transformations/op_conversions/convert_depth_to_space.hpp"
#include "transformations/op_conversions/convert_gather_downgrade.hpp"
#include "transformations/op_conversions/convert_gather_upgrade.hpp"
#include "transformations/op_conversions/convert_gelu.hpp"
#include "transformations/op_conversions/convert_interpolate1_to_interpolate4.hpp"
#include "transformations/op_conversions/convert_matrix_nms_to_matrix_nms_ie.hpp"
#include "transformations/op_conversions/convert_minimum_to_power_and_max.hpp"
#include "transformations/op_conversions/convert_mod.hpp"
#include "transformations/op_conversions/convert_multiclass_nms_to_multiclass_nms_ie.hpp"
#include "transformations/op_conversions/convert_nms9_to_nms_ie_internal.hpp"
#include "transformations/op_conversions/convert_previous_nms_to_nms_9.hpp"
#include "transformations/op_conversions/convert_reduce_to_pooling.hpp"
#include "transformations/op_conversions/convert_roi_align_v3_to_v9.hpp"
#include "transformations/op_conversions/convert_roi_align_v9_to_v3.hpp"
#include "transformations/op_conversions/convert_sequences_to_tensor_iterator.hpp"
#include "transformations/op_conversions/convert_shuffle_channels3.hpp"
#include "transformations/op_conversions/convert_slice_to_strided_slice.hpp"
#include "transformations/op_conversions/convert_space_to_batch.hpp"
#include "transformations/op_conversions/convert_space_to_depth.hpp"
#include "transformations/op_conversions/convert_subtract.hpp"
#include "transformations/op_conversions/convert_ti_to_sequences.hpp"
#include "transformations/op_conversions/detection_output_downgrade.hpp"
#include "transformations/op_conversions/detection_output_upgrade.hpp"
#include "transformations/op_conversions/eye_decomposition.hpp"
#include "transformations/op_conversions/fq_decomposition.hpp"
#include "transformations/op_conversions/gelu7_downgrade.hpp"
#include "transformations/op_conversions/hsigmoid_decomposition.hpp"
#include "transformations/op_conversions/hswish_decomposition.hpp"
#include "transformations/op_conversions/gru_cell_decomposition.hpp"
#include "transformations/op_conversions/lstm_cell_decomposition.hpp"
#include "transformations/op_conversions/mvn6_decomposition.hpp"
#include "transformations/op_conversions/normalize_l2_decomposition.hpp"
#include "transformations/op_conversions/reduce_l1_decomposition.hpp"
#include "transformations/op_conversions/reduce_l2_decomposition.hpp"
#include "transformations/op_conversions/rnn_cell_decomposition.hpp"
#include "transformations/op_conversions/simplify_ctc_greedy_decoder_seq_len.hpp"
#include "transformations/op_conversions/softplus_decomposition.hpp"
#include "transformations/op_conversions/softsign_decomposition.hpp"
#include "transformations/op_conversions/softmax_decomposition.hpp"
#include "transformations/op_conversions/unique_decomposition.hpp"
#include "transformations/opset_conversions/convert_opset2_to_opset1.hpp"
#include "transformations/opset_conversions/convert_opset3_to_opset2.hpp"
#include "transformations/smart_reshape/matmul_sr.hpp"
#include "transformations/init_node_info.hpp"
#include "utils/ngraph_transformation.hpp"
// LPT transformations
#include "transformations/low_precision/mark_dequantization_subgraph.hpp"
#include "low_precision/convolution_backprop_data.hpp"
#include "low_precision/convert_subtract_constant.hpp"
#include "low_precision/network_helper.hpp"
#include "low_precision/multiply_to_group_convolution.hpp"
#include "low_precision/group_convolution.hpp"
// CPU specific transformations
#include "ngraph_transformations/convert_to_cpu_specific_opset.hpp"
#include "ngraph_transformations/snippets_mark_skipped.hpp"
#include "ngraph_transformations/mha_fusion.hpp"
#include "ngraph_transformations/convert_to_interaction.hpp"
#include "ngraph_transformations/convert_fq_rnn_to_quantized_rnn.hpp"
#include "ngraph_transformations/move_eltwise_up_data_movement.hpp"
#include "ngraph_transformations/swap_convert_transpose.hpp"
// Snippets
#include "snippets/pass/collapse_subgraph.hpp"
#include "snippets/pass/common_optimizations.hpp"
// Misc
#include "nodes/mvn.h"
#include "nodes/normalize.h"
#include "nodes/fake_quantize.h"
#include "nodes/mha.h"
#include "dnnl.hpp"
#include <cpu/x64/cpu_isa_traits.hpp>
namespace ov {
namespace intel_cpu {
using const_node_ptr = const std::shared_ptr<const ov::Node>;
// Re-targets a Convert node to the destination type `to`.
// Registered in the type_to_fuse_map used by ConvertPrecision (the `idx`
// parameter is part of that callback signature and is unused here).
// Returns true when the node was handled (i.e. it is a Convert), false otherwise.
bool Transformations::fuse_type_to_convert(const std::shared_ptr<ngraph::Node>& node, ov::element::Type to, size_t idx) {
    const auto convert = ov::as_type_ptr<ov::opset10::Convert>(node);
    if (!convert)
        return false;

    const bool realToBool = convert->input(0).get_element_type().is_real() &&
                            convert->get_convert_element_type() == ngraph::element::boolean;
    if (realToBool && to.is_integral_number()) {
        // Boolean output is emulated with u8, so a plain precision swap would be
        // mathematically wrong: e.g. 0.01 must become 1 as boolean but would become
        // 0 as u8. Insert Abs + Ceiling before the Convert to keep boolean semantics.
        const auto abs = std::make_shared<ov::opset10::Abs>(convert->input_value(0).get_node_shared_ptr());
        const auto ceil = std::make_shared<ov::opset10::Ceiling>(abs);
        const auto replacement = std::make_shared<ov::opset10::Convert>(ceil, to);
        replacement->set_friendly_name(convert->get_friendly_name());
        ov::copy_runtime_info(convert, {abs, ceil, replacement});
        ov::replace_node(convert, replacement);
    } else {
        // Safe to change the conversion target in place.
        convert->set_convert_element_type(to);
    }
    return true;
}
void Transformations::UpToCpuSpecificOpSet() {
const bool useLpt = enableLpt &&
ngraph::pass::low_precision::LowPrecision::isFunctionQuantized(model) &&
CPU_DEBUG_CAP_IS_TRANSFORMATION_ENABLED(config.debugCaps, Lpt);
const bool useSnippets = enableSnippets &&
CPU_DEBUG_CAP_IS_TRANSFORMATION_ENABLED(config.debugCaps, Snippets);
auto defaultPrecisions = useLpt ? ngraph::pass::low_precision::precision_set::int8_support : std::vector<ov::element::Type>{};
bool hasINT16orINT32Levels = false;
if (useLpt) {
CPU_LPT_SCOPE(LowPrecisionTransformations_Part1);
hasINT16orINT32Levels = ngraph::pass::low_precision::LowPrecision::isFQLevelsPresent(
model,
{ngraph::pass::low_precision::levels::int16, ngraph::pass::low_precision::levels::int16_narrow_range,
ngraph::pass::low_precision::levels::int32, ngraph::pass::low_precision::levels::int32_narrow_range});
if (hasINT16orINT32Levels) {
defaultPrecisions = ngraph::pass::low_precision::precision_set::int8_int16_int32_support;
}
}
PreLpt(defaultPrecisions, isLegacyApi);
if (useLpt)
Lpt(hasINT16orINT32Levels, defaultPrecisions);
PostLpt();
if (useSnippets)
Snippets();
}
// Final stage of the pipeline: converts the model to the CPU plugin's internal
// opset. Can be filtered out via debug capabilities (the "Specific" scope).
void Transformations::CpuSpecificOpSet(void) {
    CPU_DEBUG_CAP_TRANSFORMATION_SCOPE(this, Specific);
    ConvertToCPUSpecificOpset(model);
}
// Common transformations executed before the (optional) low-precision pipeline:
// opset downgrades, decompositions, precision conversion and the callbacks that
// keep operations the CPU plugin can execute natively from being decomposed.
// `defaultPrecisions` is non-empty only when LPT will run afterwards.
// Fix: ov::pass::ConvertShuffleChannels3 was disabled twice in the
// enable/disable list; the duplicate registration has been removed.
void Transformations::PreLpt(const std::vector<ov::element::Type>& defaultPrecisions, const bool isLegacyApi) {
    CPU_DEBUG_CAP_TRANSFORMATION_SCOPE(this, PreLpt);
    ov::pass::Manager manager;
    manager.set_per_pass_validation(false);
    manager.register_pass<ov::pass::InitNodeInfo>();

    // An empty precision list means LPT was not requested for this model.
    const bool useLpt = !defaultPrecisions.empty();
    if (useLpt) {
        manager.register_pass<ov::pass::MarkDequantizationSubgraph>(defaultPrecisions);
    }

    // Table for ConvertPrecision: element types the plugin does not execute
    // natively are mapped onto supported ones.
    auto get_convert_precisions = []() {
        precisions_array array = {
            {ov::element::i64,     ov::element::i32},
            {ov::element::u64,     ov::element::i32},
            {ov::element::i16,     ov::element::i32},
            {ov::element::u16,     ov::element::i32},
            {ov::element::u32,     ov::element::i32},
            {ov::element::f64,     ov::element::f32},
            {ov::element::f16,     ov::element::f32},
            {ov::element::boolean, ov::element::u8},
            {ov::element::i4,      ov::element::i8},
            {ov::element::u4,      ov::element::u8}
        };
        // bf16 execution requires avx512_core; downconvert to f32 elsewhere.
        if (!dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx512_core))
            array.push_back({ov::element::bf16, ov::element::f32});
        return array;
    };
    static const auto precisions = get_convert_precisions();
    // Convert needs custom handling (see fuse_type_to_convert) to keep
    // float->boolean semantics when boolean is lowered to u8.
    type_to_fuse_map type_to_fuse = {{ov::opset10::Convert::get_type_info_static(), fuse_type_to_convert}};

    manager.register_pass<ov::pass::AUGRUCellFusion>();
    manager.register_pass<ov::pass::CommonOptimizations>();
    manager.register_pass<ov::pass::WrapInterpolateIntoTransposes>();
    manager.register_pass<ov::pass::TransposeSinking>();
    manager.register_pass<ov::pass::ConvertSequenceToTensorIterator>();
    manager.register_pass<ov::pass::ConvertOpSet3ToOpSet2>();
    manager.register_pass<ov::pass::ConvertOpSet2ToOpSet1>();
    manager.register_pass<ov::pass::LSTMCellDecomposition>();
    manager.register_pass<ov::pass::GRUCellDecomposition>();
    manager.register_pass<ov::pass::RNNCellDecomposition>();
    // All NMS versions are upgraded to NMS-9 first, then lowered to the internal representation.
    manager.register_pass<ov::pass::ConvertNMS1ToNMS9>();
    manager.register_pass<ov::pass::ConvertNMS3ToNMS9>();
    manager.register_pass<ov::pass::ConvertNMS4ToNMS9>();
    manager.register_pass<ov::pass::ConvertNMS5ToNMS9>();
    manager.register_pass<ov::pass::ConvertNMS9ToNMSIEInternal>();
    manager.register_pass<ov::pass::ConvertMulticlassNmsToMulticlassNmsIE>();
    manager.register_pass<ov::pass::ConvertMatrixNmsToMatrixNmsIE>();
    manager.register_pass<ov::pass::TransposeMatMul>();
    manager.register_pass<ov::pass::ConstantFolding>();

    if (useLpt) {
        CPU_LPT_SCOPE(LowPrecisionTransformations_Part2);
        manager.register_pass<ngraph::pass::low_precision::ConvertSubtractConstant>(defaultPrecisions);
    }
    manager.register_pass<ov::pass::Validate>();
    manager.register_pass<ov::pass::ConvertPrecision>(precisions, type_to_fuse);
    manager.register_pass<ov::pass::EliminateConvert>();
    manager.register_pass<SwapConvertTranspose>();
    manager.register_pass<ConvertToInteraction>();
    manager.register_pass<ConvertInteractionInt8>();

    auto pass_config = manager.get_pass_config();

    // SpaceToDepth/ DepthToSpace node implementation supports only equal input/output tensors with rank <= 5
    pass_config->set_callback<ov::pass::ConvertSpaceToDepth,
                              ov::pass::ConvertDepthToSpace>(
        [](const_node_ptr &node) -> bool {
            return node->input_value(0).get_shape().size() <= 5lu &&
                   node->input_value(0).get_shape().size() == node->get_output_shape(0).size();
        });

    // BatchToSpace/SpaceToBatch are kept as-is only for 4D/5D cases.
    pass_config->set_callback<ov::pass::ConvertBatchToSpace,
                              ov::pass::ConvertSpaceToBatch>(
        [](const_node_ptr &node) -> bool {
            const auto & rank = node->input(0).get_partial_shape().rank().get_length();
            return rank == 4lu || rank == 5lu;
        });

    // RNN/GRU/AUGRU/LSTM cells are supported natively with clip == 0 and default activations.
    auto isCellPrimitiveSupported = [](const_node_ptr &node) -> bool {
        if (const auto &rnn_cell = std::dynamic_pointer_cast<const ov::opset4::RNNCell>(node)) {
            return rnn_cell->get_clip() == 0.0f;
        } else if (const auto &gru_cell = std::dynamic_pointer_cast<const ov::opset4::GRUCell>(
                node)) {
            return gru_cell->get_clip() == 0.0f
                && gru_cell->get_activations() == std::vector<std::string>{"sigmoid", "tanh"};
        } else if (const auto &augru_cell = std::dynamic_pointer_cast<const ov::op::internal::AUGRUCell>(
                node)) {
            return augru_cell->get_clip() == 0.0f
                && augru_cell->get_activations() == std::vector<std::string>{"sigmoid", "tanh"};
        } else if (const auto &lstm_cell = std::dynamic_pointer_cast<const ov::opset4::LSTMCell>(
                node)) {
            return lstm_cell->get_clip() == 0.0f &&
                lstm_cell->get_activations() == std::vector<std::string>{"sigmoid", "tanh", "tanh"};
        } else if (const auto &lstm_cell_v1 = std::dynamic_pointer_cast<const ov::opset1::LSTMCell>(
                node)) {
            return lstm_cell_v1->get_clip() == 0.0f &&
                lstm_cell_v1->get_activations() == std::vector<std::string>{"sigmoid", "tanh", "tanh"};
        }
        return false;
    };

    // Sequences supported by the plugin shouldn't be converted to TensorIterator.
    // sequence_length input is not supported in all Sequences, so if is_seq_len_provided() == true, we
    // should always convert to TensorIterator.
    // RNN/GRU/LSTM Sequences are supported with clip == 0, and with default activations.
    auto isSequencePrimitiveSupported = [](const_node_ptr &node) -> bool {
        const auto& data = node->input(0);
        const auto& data_pshape = data.get_partial_shape();
        // WA: dynamic shapes make impossible to check seq_len due to shapeOf subgraphs
        // but the sequence is still supported in CPU and doesn't need to be decomposed
        if (data_pshape.is_dynamic())
            return true;
        if (data_pshape.rank().is_static() && data_pshape.rank().get_length() > 1 && !data_pshape[1].is_static())
            return false;
        auto max_seq_len = data.get_shape().at(1);
        if (const auto &rnn_seq = std::dynamic_pointer_cast<const ov::opset6::RNNSequence>(node)) {
            return rnn_seq->get_clip() == 0.0f &&
                !ov::op::util::is_seq_len_provided(rnn_seq->get_input_node_shared_ptr(2),
                                                   max_seq_len);
        } else if (const auto &gru_seq = std::dynamic_pointer_cast<const ov::opset6::GRUSequence>(
                node)) {
            return gru_seq->get_clip() == 0.0f &&
                gru_seq->get_activations() == std::vector<std::string>{"sigmoid", "tanh"} &&
                !ov::op::util::is_seq_len_provided(gru_seq->get_input_node_shared_ptr(2),
                                                   max_seq_len);
        } else if (const auto &augru_seq = std::dynamic_pointer_cast<const ov::op::internal::AUGRUSequence>(
                node)) {
            return augru_seq->get_clip() == 0.0f &&
                augru_seq->get_activations() == std::vector<std::string>{"sigmoid", "tanh"} &&
                !ov::op::util::is_seq_len_provided(augru_seq->get_input_node_shared_ptr(2),
                                                   max_seq_len);
        } else if (const auto &lstm_seq = std::dynamic_pointer_cast<const ov::opset6::LSTMSequence>(
                node)) {
            return lstm_seq->get_clip() == 0.0f &&
                lstm_seq->get_activations() == std::vector<std::string>{"sigmoid", "tanh", "tanh"} &&
                !ov::op::util::is_seq_len_provided(lstm_seq->get_input_node_shared_ptr(3),
                                                   max_seq_len);
        }
        return false;
    };

    pass_config->set_callback<ov::pass::ConvertRNNSequenceToTensorIterator,
                              ov::pass::ConvertGRUSequenceToTensorIterator,
                              ov::pass::ConvertLSTMSequenceToTensorIterator>(
        [isSequencePrimitiveSupported](const_node_ptr &node) -> bool {
            return isSequencePrimitiveSupported(node);
        });

    pass_config->set_callback<ov::pass::RNNCellDecomposition, ov::pass::GRUCellDecomposition,
                              ov::pass::LSTMCellDecomposition>(
        [isCellPrimitiveSupported](const_node_ptr &node) -> bool {
            return isCellPrimitiveSupported(node);
        });

    // Natively supported MVN/NormalizeL2 nodes are kept whole.
    pass_config->set_callback<ov::pass::MVN6Decomposition>(
        [](const_node_ptr &node) -> bool {
            std::string errorMessage;
            return node::MVN::isSupportedOperation(node, errorMessage);
        });

    pass_config->set_callback<ov::pass::NormalizeL2Decomposition>(
        [](const_node_ptr &node) -> bool {
            std::string errorMsg;
            return node::NormalizeL2::isSupportedOperation(node, errorMsg);
        });

    // Softmax is decomposed only for inputs with rank > 5.
    pass_config->enable<ngraph::pass::SoftmaxDecomposition>();
    pass_config->set_callback<ngraph::pass::SoftmaxDecomposition>(
        [](const_node_ptr &node) -> bool {
            return node->input_value(0).get_partial_shape().rank().get_length() <= 5;
        });

    if (!isLegacyApi) {
        // Callback fires (returns true) only when every consumer of every NMS
        // output is a model output.
        auto nmsCallback = [](const_node_ptr &node) -> bool {
            for (size_t i = 0; i < node->get_output_size(); i++) {
                const auto outputs = node->get_output_target_inputs(i);
                for (const auto &out : outputs) {
                    if (!ov::op::util::is_output(out.get_node())) {
                        return false;
                    }
                }
            }
            return true;
        };

        pass_config->set_callback<ov::pass::ConvertNMS9ToNMSIEInternal>(nmsCallback);
        pass_config->set_callback<ov::pass::ConvertMulticlassNmsToMulticlassNmsIE>(nmsCallback);
        pass_config->set_callback<ov::pass::ConvertMatrixNmsToMatrixNmsIE>(nmsCallback);
    }

    // List of enabled/disabled transformations

    // Allow FP16 Converts to be folded and FP16 constants to be upgraded to FP32 data type
    pass_config->disable<ov::pass::DisableDecompressionConvertConstantFolding>();
    pass_config->disable<ov::pass::ConvertCompressedOnlyToLegacy>();

    pass_config->disable<ov::pass::EyeDecomposition>();
    pass_config->disable<ov::pass::ConvertGELU>();
    pass_config->disable<ov::pass::ConvertShuffleChannels3>();
    pass_config->disable<ov::pass::Gelu7Downgrade>();
    pass_config->disable<ov::pass::HSwishDecomposition>();
    pass_config->disable<ov::pass::ReduceL1Decomposition>();
    pass_config->disable<ov::pass::ReduceL2Decomposition>();
    pass_config->disable<ov::pass::SoftPlusDecomposition>();
    pass_config->disable<ov::pass::HSigmoidDecomposition>();
    pass_config->disable<ov::pass::ConvertMod>();
    pass_config->disable<ov::pass::WeightsDequantizeToFakeQuantize>();
    pass_config->disable<ov::pass::SimplifyCTCGreedyDecoderSeqLen>();
    pass_config->disable<ov::pass::ConvertGather7ToGather1>();
    pass_config->disable<ov::pass::ConvertGather8ToGather7>();
    pass_config->disable<ov::pass::ConvertMinimum>();
    pass_config->disable<ov::pass::ConvertBroadcastToTiles>();
    pass_config->disable<ov::pass::ConvertReduceMeanToPooling>();
    pass_config->disable<ov::pass::ConvertReduceMaxToPooling>();
    pass_config->disable<ov::pass::ConvertReduceSumToPooling>();
    pass_config->disable<ov::pass::SliceToStridedSlice>();
    pass_config->disable<ov::pass::ConvertDetectionOutput8ToDetectionOutput1>();
    pass_config->disable<ov::pass::ConvertROIAlign9To3>();
    pass_config->disable<ov::pass::SoftSignDecomposition>();
    pass_config->disable<ov::pass::UniqueDecomposition>();

    pass_config->enable<ov::pass::NormalizeL2Decomposition>();
    pass_config->enable<ov::pass::ConvertInterpolate1ToInterpolate4>();
    pass_config->enable<ov::pass::ConvertGather1ToGather7>();
    pass_config->enable<ov::pass::ConvertDetectionOutput1ToDetectionOutput8>();
    pass_config->enable<ov::pass::ConvertROIAlign3To9>();

    if (useLpt) {
        CPU_LPT_SCOPE(LowPrecisionTransformations_Part3);
        // FakeQuantize nodes the plugin executes natively must keep their FQ form for LPT.
        pass_config->set_callback<ov::pass::AddFakeQuantizeFusion,
                                  ov::pass::MulFakeQuantizeFusion,
                                  ov::pass::FakeQuantizeMulFusion>(
            [](const_node_ptr &node) -> bool {
                std::string errMsg;
                return !node::FakeQuantize::isSupportedOperation(node, errMsg);
            });

        pass_config->set_callback<ov::pass::ConvertQuantizeDequantize>([&defaultPrecisions](const_node_ptr &node) -> bool {
            return ngraph::pass::low_precision::NetworkHelper::areQuantizeAndDequantizeSupportedForMultiply(node, defaultPrecisions);
        });
    }

    manager.run_passes(model);
}
/**
 * @brief Runs the low-precision (LPT, INT8) transformation pipeline on the model.
 *
 * Skipped entirely when the Lpt stage is disabled via debug capabilities.
 *
 * @param hasINT16orINT32Levels true when the model contains INT16/INT32 quantization levels
 *        (GNA reference-execution networks): precision restrictions are dropped and
 *        original precisions are preserved (updatePrecision = false).
 * @param defaultPrecisions precisions LPT treats as quantized input precisions.
 */
void Transformations::Lpt(const bool hasINT16orINT32Levels, const std::vector<ov::element::Type>& defaultPrecisions) {
    CPU_DEBUG_CAP_TRANSFORMATION_SCOPE(this, Lpt);

    using namespace ngraph::pass::low_precision;
    CPU_LPT_SCOPE(LowPrecisionTransformations_Part4);
    OV_ITT_SCOPE(FIRST_INFERENCE, itt::domains::intel_cpu_LT, "LowPrecisionTransformations");

    // Only enable conv/group conv signed input on AMX platform.
    std::vector<ov::element::Type> input0LowPrecisionList;
    if (dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx512_core_amx)) {
        input0LowPrecisionList = {ov::element::u8, ov::element::i8};
    } else {
        input0LowPrecisionList = {ov::element::u8};
    }

    // Per-operation restrictions on the precisions LPT may assign to specific input ports.
    auto supportedPrecisions = std::vector<PrecisionsRestriction>({
        PrecisionsRestriction::create<ov::opset1::Convolution>({
            {{0}, input0LowPrecisionList},
            {{1}, {ov::element::i8}},
        }),
        PrecisionsRestriction::create<ov::opset1::ConvolutionBackpropData>({
            {{0}, {ov::element::u8, ov::element::i8}},
            {{1}, {ov::element::i8}}
        }),
        PrecisionsRestriction::create<ov::opset1::GroupConvolution>({
            {{0}, input0LowPrecisionList},
            {{1}, {ov::element::i8}}
        }),
        PrecisionsRestriction::create<ov::opset1::Multiply>({
            {{0}, {ov::element::u8}},
            {{1}, {ov::element::i8}},
        }),
        PrecisionsRestriction::create<ov::opset1::MatMul>({
            {{0}, {ov::element::u8, ov::element::i8}},
            {{1}, {ov::element::i8}}
        }),
        PrecisionsRestriction::create<ov::opset5::LSTMSequence>({
            {{0, 1}, {ov::element::u8, ov::element::i8}},
        }),
        PrecisionsRestriction::create<ov::opset6::GRUSequence>({
            {{0, 1}, {ov::element::u8, ov::element::i8}},
        }),
    });

    // Ops restricted to per-tensor quantization on the listed input ports.
    auto quantizationRestrictions = std::vector<QuantizationGranularityRestriction>({
        QuantizationGranularityRestriction::create<ov::opset1::Convolution>({0}),
        QuantizationGranularityRestriction::create<ov::opset1::ConvolutionBackpropData>({0})
    });

    // for GNA networks reference execution
    bool updatePrecision = true;
    if (hasINT16orINT32Levels) {
        updatePrecision = false;
        supportedPrecisions = std::vector<PrecisionsRestriction>({});
    }

    ov::pass::Manager lptManager;
    lptManager.register_pass<ngraph::pass::low_precision::LowPrecision>(
        supportedPrecisions,
        quantizationRestrictions,
        LayerTransformation::Params(updatePrecision, ov::element::f32, defaultPrecisions));
    // Returning true skips precision markup for Multiply ops that cannot be converted
    // to a GroupConvolution anyway.
    lptManager.get_pass_config()->set_callback<ngraph::pass::low_precision::MarkupPrecisions>([](const_node_ptr& node) -> bool {
        if (const auto mulitply = std::dynamic_pointer_cast<const ov::opset1::Multiply>(node)) {
            return !MultiplyToGroupConvolutionTransformation::canBeTransformedToGroupConvolution(mulitply);
        }
        return false;
    });
    // Skip ConvolutionBackpropData when quantization is asymmetric on data or weights.
    lptManager.get_pass_config()->set_callback<ngraph::pass::low_precision::ConvolutionBackpropDataTransformation>(
        [&defaultPrecisions](const_node_ptr& node) -> bool {
            return LayerTransformation::isAsymmetricQuantization(node, defaultPrecisions) ||
                   WeightableLayerTransformation::isAsymmetricOnWeights(node, defaultPrecisions);
        });
    lptManager.get_pass_config()->disable<ngraph::pass::low_precision::MultiplyToGroupConvolutionTransformation>();
    lptManager.run_passes(model);
}
/**
 * @brief Transformations executed after the LPT stage and before snippets tokenization.
 *
 * Unrolls TensorIterators marked by LowLatency, reshapes PRelu, moves eltwise ops up
 * through data-movement ops, and fuses MHA / FQ patterns ahead of snippets.
 */
void Transformations::PostLpt() {
    CPU_DEBUG_CAP_TRANSFORMATION_SCOPE(this, PostLpt);

    ov::pass::Manager postLPTPassManager;
    postLPTPassManager.register_pass<ov::pass::UnrollTensorIterator>();
    postLPTPassManager.register_pass<ov::pass::ReshapePRelu>();
    postLPTPassManager.get_pass_config()->set_callback<ov::pass::UnrollTensorIterator>([](const_node_ptr &node) -> bool {
        // UnrollTI transformation is disabled by default, is turned on by LowLatency transformation
        return node->get_rt_info().count("UNROLL_TI") == 0;
    });
    postLPTPassManager.register_pass<MoveEltwiseUpThroughDataMov>();
    // Skip moving eltwise ops whose second input is i8/u8 (quantized path).
    postLPTPassManager.get_pass_config()->set_callback<MoveEltwiseUpThroughDataMov>([](const std::shared_ptr<const ov::Node>& node) -> bool {
        if (node->get_input_size() >= 2) {
            return node->get_input_element_type(1) == ov::element::i8 || node->get_input_element_type(1) == ov::element::u8;
        }
        return false;
    });

    postLPTPassManager.register_pass<ov::pass::ConstantFolding>();

    // Snippets may break MHA patterns so the fusion has to be performed before
    postLPTPassManager.register_pass<MHAFusion>();
    postLPTPassManager.register_pass<FuseFQtoInteraction>();
    // Returning true skips the MHA fusion for nodes the CPU MHA node cannot execute.
    postLPTPassManager.get_pass_config()->set_callback<MHAFloatFusion, MHAFloatFusion2,
                                                       MHAQuantFusion, MHAQuantFusion2>
        ([this](const std::shared_ptr<const ov::Node>& n) -> bool {
            std::string errorMessage;
            if (!node::MHA::isSupportedOperation(n, errorMessage))
                return true;

            // Implementation calls AMX BF16 brgemm only for tensors with K and N aligned on 2, otherwise fallbacks on vector impl
            // Vector madd BF16 instruction on SPR has reduced performance on HW level, which results in overall perf degradation
            size_t bf16Factor = 2;
            if (dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx512_core_bf16_amx_bf16) &&
                (n->get_input_element_type(0) == element::bf16 || (n->get_input_element_type(0) == element::f32 && enableBF16)) &&
                (n->get_input_shape(0)[3] % bf16Factor != 0 || n->get_input_shape(1)[1] % bf16Factor != 0 || n->get_input_shape(3)[3] % bf16Factor != 0)) {
                return true;
            }
            return false;
        });

    // Execute before snippets. Otherwise FQ will be converted to Subgraph
    postLPTPassManager.register_pass<ConvertFqRnnToQuantizedRnn>();
    postLPTPassManager.run_passes(model);
}
/**
 * @brief Tokenizes suitable subgraphs into Snippets subgraph operations.
 *
 * No-op when snippets are disabled or the platform lacks AVX2 support.
 */
void Transformations::MainSnippets(void) {
    if (!enableSnippets ||
        !dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx2)) // snippets are implemented only for relevant platforms (avx2+ extensions)
        return;

    ov::pass::Manager snippetsManager;
    snippetsManager.register_pass<SnippetsMarkSkipped>();
    snippetsManager.register_pass<ngraph::snippets::pass::EnumerateNodes>();
    snippetsManager.register_pass<ngraph::snippets::pass::TokenizeSnippets>();
    // The callback returns true for nodes that must NOT be tokenized into a snippet.
    snippetsManager.get_pass_config()->set_callback<ngraph::snippets::pass::TokenizeSnippets>(
        [](const std::shared_ptr<const ov::Node>& n) -> bool {
            // CPU Plugin supports Swish in Subgraph via conversion to SwishCPU which assumes the second input to be constant
            if (ov::is_type<const ov::op::v4::Swish>(n)) {
                if (n->inputs().size() > 1 && !ov::is_type<const ov::op::v0::Constant>(n->get_input_node_shared_ptr(1)))
                    return true;
            }
            const auto& inputs = n->inputs();
            // todo: clarify whether we can evaluate snippets on const paths
            const bool has_only_const_inputs = std::all_of(inputs.begin(), inputs.end(),
                [](const ov::Input<const ov::Node> &in) {
                    return ov::is_type<ov::op::v0::Constant>(in.get_source_output().get_node_shared_ptr());
                });
            // todo: clarify whether we can evaluate snippets on inputs with larger ranks
            auto rank_is_too_large = [](const ov::descriptor::Tensor& t ) {
                // callback is called after has_supported_in_out(), so it's safe to assume that the shapes are static
                return t.get_partial_shape().rank().get_length() > 6;
            };
            const bool bad_input_rank = std::any_of(inputs.begin(), inputs.end(),
                [&](const ov::Input<const ov::Node>& in) {return rank_is_too_large(in.get_tensor());});
            const auto& outputs = n->outputs();
            const bool bad_output_rank = std::any_of(outputs.begin(), outputs.end(),
                [&](const ov::Output<const ov::Node>& out) {return rank_is_too_large(out.get_tensor());});
            return has_only_const_inputs || bad_input_rank || bad_output_rank;
        });
    snippetsManager.register_pass<ngraph::snippets::pass::CommonOptimizations>();
    snippetsManager.run_passes(model);
}
void Transformations::PostSnippets(void) {
ov::pass::Manager postSnippetsManager;
postSnippetsManager.register_pass<ov::pass::FakeQuantizeDecomposition>();
postSnippetsManager.get_pass_config()->set_callback<ov::pass::FakeQuantizeDecomposition>([](const_node_ptr& node) -> bool {
std::string errMsg;
return node::FakeQuantize::isSupportedOperation(node, errMsg);
});
postSnippetsManager.register_pass<ov::pass::ConstantFolding>();
postSnippetsManager.run_passes(model);
}
/**
 * @brief Snippets stage of the pipeline: tokenization (MainSnippets) followed by
 * post-processing (PostSnippets). May be disabled / IR-dumped via debug capabilities.
 */
void Transformations::Snippets(void) {
    CPU_DEBUG_CAP_TRANSFORMATION_SCOPE(this, Snippets);

    MainSnippets();
    PostSnippets();
}
} // namespace intel_cpu
} // namespace ov

View File

@ -0,0 +1,65 @@
// Copyright (C) 2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#pragma once
#include "openvino/core/model.hpp"
#include "low_precision/low_precision.hpp"
#include "config.h"
#include "itt.h"
#include <memory>
#include <vector>
// NOTE(review): a using-directive at header scope leaks InferenceEngine names into
// every includer of this header — consider scoping or removing it.
using namespace InferenceEngine;

// Throws an IE exception whose message starts with the "CPU plugin: " prefix.
#define IE_CPU_PLUGIN_THROW(...) IE_THROW(__VA_ARGS__) << "CPU plugin: "
namespace ov {
namespace intel_cpu {
/**
 * @brief Orchestrates the ngraph/OpenVINO transformation pipeline of the CPU plugin.
 *
 * The pipeline is split into stages (PreLpt, Lpt, PostLpt, Snippets, CPU-specific
 * opset) driven by UpToCpuSpecificOpSet() and CpuSpecificOpSet().
 */
class Transformations {
public:
    Transformations(const std::shared_ptr<ov::Model>& initialModel,
                    const bool enableLpt,
                    const bool enableSnippets,
                    const bool enableBF16,
                    const bool isLegacyApi,
                    const Config& config)
        : model(initialModel),
          enableLpt(enableLpt),
          enableSnippets(enableSnippets),
          enableBF16(enableBF16),
          isLegacyApi(isLegacyApi),
          config(config) {}

    // Runs all common transformation stages preceding the CPU-specific opset conversion.
    void UpToCpuSpecificOpSet();
    // Converts the model to the CPU plugin specific opset.
    void CpuSpecificOpSet(void);

private:
    std::shared_ptr<ov::Model> model;  // the model under transformation (mutated in place)
    const bool enableLpt;              // enable low-precision (INT8) transformations
    const bool enableSnippets;         // enable snippets tokenization
    const bool enableBF16;             // enable BF16 inference
    const bool isLegacyApi;
    const Config& config;              // NOTE(review): held by reference — caller's Config must outlive this object

    // Individual pipeline stages
    void PreLpt(const std::vector<ov::element::Type>& defaultPrecisions, const bool isLegacyApi);
    void Lpt(const bool hasINT16orINT32Levels, const std::vector<ov::element::Type>& defaultPrecisions);
    void PostLpt();
    void MainSnippets(void);
    void PostSnippets(void);
    void Snippets(void);

    static bool fuse_type_to_convert(const std::shared_ptr<ngraph::Node>& node, ov::element::Type to, size_t idx);
};
} // namespace intel_cpu
} // namespace ov

View File

@ -5,7 +5,7 @@
#ifdef CPU_DEBUG_CAPS
#define CPU_DEBUG_CAP_ENABLE(_x) _x;
#define CPU_DEBUG_CAP_ENABLE(...) __VA_ARGS__
#define CPU_DEBUG_CAPS_ALWAYS_TRUE(x) true
#include <string>
@ -147,7 +147,7 @@ static inline std::ostream& write_all_to_stream(std::ostream& os, const T& arg,
#else // !CPU_DEBUG_CAPS
#define CPU_DEBUG_CAP_ENABLE(_x)
#define CPU_DEBUG_CAP_ENABLE(...)
#define CPU_DEBUG_CAPS_ALWAYS_TRUE(x) x
#define DEBUG_LOG(...)

View File

@ -0,0 +1,66 @@
// Copyright (C) 2018-2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#ifdef CPU_DEBUG_CAPS
#include "debug_caps_config.h"
#include <string>
namespace ov {
namespace intel_cpu {
/**
 * @brief Populates the debug-capabilities configuration from OV_CPU_* environment variables.
 *
 * Unset variables leave the corresponding members at their defaults.
 * @throws IE exception when OV_CPU_BLOB_DUMP_FORMAT holds an unknown value or when
 *         OV_CPU_DISABLE / OV_CPU_DUMP_IR fail to parse.
 */
void DebugCapsConfig::readProperties() {
    // nullptr when the variable is not set
    auto readEnv = [](const char* envVar) {
        return std::getenv(envVar);
    };

    auto parseDumpFormat = [](const std::string& format) {
        if (format == "BIN")
            return FORMAT::BIN;
        else if (format == "TEXT")
            return FORMAT::TEXT;
        else
            // fix: the message used to reference the pre-rename name 'readDebugCapsProperties'
            IE_THROW() << "DebugCapsConfig::readProperties: Unknown dump format";
    };

    const char* envVarValue = nullptr;

    if ((envVarValue = readEnv("OV_CPU_EXEC_GRAPH_PATH")))
        execGraphPath = envVarValue;

    if ((envVarValue = readEnv("OV_CPU_VERBOSE")))
        verbose = envVarValue;

    if ((envVarValue = readEnv("OV_CPU_BLOB_DUMP_DIR")))
        blobDumpDir = envVarValue;

    if ((envVarValue = readEnv("OV_CPU_BLOB_DUMP_FORMAT")))
        blobDumpFormat = parseDumpFormat(envVarValue);

    if ((envVarValue = readEnv("OV_CPU_BLOB_DUMP_NODE_EXEC_ID")))
        blobDumpFilters[BY_EXEC_ID] = envVarValue;

    if ((envVarValue = readEnv("OV_CPU_BLOB_DUMP_NODE_PORTS")))
        blobDumpFilters[BY_PORTS] = envVarValue;

    if ((envVarValue = readEnv("OV_CPU_BLOB_DUMP_NODE_TYPE")))
        blobDumpFilters[BY_TYPE] = envVarValue;

    if ((envVarValue = readEnv("OV_CPU_BLOB_DUMP_NODE_NAME")))
        blobDumpFilters[BY_NAME] = envVarValue;

    if ((envVarValue = readEnv("OV_CPU_SUMMARY_PERF"))) {
        summaryPerf = envVarValue;
    }

    if ((envVarValue = readEnv("OV_CPU_DISABLE")))
        disable.parseAndSet(envVarValue);

    if ((envVarValue = readEnv("OV_CPU_DUMP_IR")))
        dumpIR.parseAndSet(envVarValue);
}
} // namespace intel_cpu
} // namespace ov
#endif // CPU_DEBUG_CAPS

View File

@ -0,0 +1,213 @@
// Copyright (C) 2018-2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#pragma once
#ifdef CPU_DEBUG_CAPS
#include "ie_common.h"
#include "openvino/util/common_util.hpp"
#include <bitset>
#include <unordered_map>
namespace ov {
namespace intel_cpu {
/**
 * @brief Debug-capabilities configuration of the CPU plugin.
 *
 * All properties are read from OV_CPU_* environment variables on construction
 * (see DebugCapsConfig::readProperties()). Compiled in only under CPU_DEBUG_CAPS.
 */
class DebugCapsConfig {
private:
    struct PropertySetter;
    using PropertySetterPtr = std::shared_ptr<PropertySetter>;

public:
    DebugCapsConfig() {
        readProperties();
    }

    // Keys of blobDumpFilters (set from the OV_CPU_BLOB_DUMP_NODE_* variables)
    enum FILTER {
        BY_PORTS,
        BY_EXEC_ID,
        BY_TYPE,
        BY_NAME,
    };

    // Blob dump output format (OV_CPU_BLOB_DUMP_FORMAT)
    enum class FORMAT {
        BIN,
        TEXT,
    };

    std::string execGraphPath;             // OV_CPU_EXEC_GRAPH_PATH
    std::string verbose;                   // OV_CPU_VERBOSE
    std::string blobDumpDir = "cpu_dump";  // OV_CPU_BLOB_DUMP_DIR
    FORMAT blobDumpFormat = FORMAT::TEXT;  // OV_CPU_BLOB_DUMP_FORMAT
    // std::hash<int> is necessary for Ubuntu-16.04 (gcc-5.4 and defect in C++11 standard)
    std::unordered_map<FILTER, std::string, std::hash<int>> blobDumpFilters;
    std::string summaryPerf = "";          // OV_CPU_SUMMARY_PERF

    // Bitset filter over the transformation stages of the plugin pipeline
    struct TransformationFilter {
        enum Type : uint8_t {
            PreLpt = 0, Lpt, PostLpt, Snippets, Specific, NumOfTypes
        };
        std::bitset<NumOfTypes> filter;

        PropertySetterPtr getPropertySetter() {
            return PropertySetterPtr(new BitsetFilterPropertySetter<NumOfTypes>("transformations", filter,
                                     {{"all", {PreLpt, Lpt, PostLpt, Snippets, Specific}},
                                      {"common", {PreLpt, PostLpt}},
                                      {"prelpt", {PreLpt}},
                                      {"lpt", {Lpt}},
                                      {"postlpt", {PostLpt}},
                                      {"snippets", {Snippets}},
                                      {"specific", {Specific}}
                                     }));
        }
    };

    // Bitset filter over the supported IR dump formats
    struct IrFormatFilter {
        enum Type : uint8_t {
            Xml = 0, XmlBin, Dot, Svg, NumOfTypes
        };
        std::bitset<NumOfTypes> filter;

        PropertySetterPtr getPropertySetter() {
            return PropertySetterPtr(new BitsetFilterPropertySetter<NumOfTypes>("formats", filter,
                                     {{"all", {XmlBin, Dot, Svg}},
                                      {"xml", {Xml}},
                                      {"xmlbin", {XmlBin}},
                                      {"dot", {Dot}},
                                      {"svg", {Svg}},
                                     }));
        }
    };

    // A group of named properties parsed from one space-separated env-var value,
    // e.g. OV_CPU_DUMP_IR="dir=path formats=xml transformations=lpt"
    struct PropertyGroup {
        virtual std::vector<PropertySetterPtr> getPropertySetters(void) = 0;

        void parseAndSet(const std::string& str) {
            const auto& options = ov::util::split(str, ' ');
            const auto& propertySetters = getPropertySetters();
            bool failed = false;
            auto getHelp = [propertySetters] (void) {
                std::string help;
                for (const auto& property : propertySetters)
                    help.append('\t' + property->getPropertyName() + "=<" + property->getPropertyValueDescription() + ">\n");
                return help;
            };

            for (const auto& option : options) {
                const auto& parts = ov::util::split(option, '=');
                if (parts.size() > 2) {
                    failed = true;
                    break;
                }
                const auto& propertyName = ov::util::to_lower(parts.front());
                const auto& foundSetter = std::find_if(propertySetters.begin(), propertySetters.end(),
                    [propertyName] (const PropertySetterPtr& setter) { return setter->getPropertyName() == propertyName; });
                // an option without '=' is passed to the setter as an empty value
                if (foundSetter == propertySetters.end() ||
                    !(*foundSetter)->parseAndSet(parts.size() == 1 ? "" : parts.back())) {
                    failed = true;
                    break;
                }
            }

            if (failed)
                IE_THROW() << "Wrong syntax: " << str << std::endl
                           << "The following space separated options are supported (option names are case insensitive):" << std::endl
                           << getHelp();
        }
    };

    // OV_CPU_DISABLE
    struct : PropertyGroup {
        TransformationFilter transformations;

        std::vector<PropertySetterPtr> getPropertySetters(void) override {
            return { transformations.getPropertySetter() };
        }
    } disable;

    // OV_CPU_DUMP_IR
    struct : PropertyGroup {
        std::string dir = "intel_cpu_dump";
        IrFormatFilter format = { 1 << IrFormatFilter::Xml };
        TransformationFilter transformations;

        std::vector<PropertySetterPtr> getPropertySetters(void) override {
            return { PropertySetterPtr(new StringPropertySetter("dir", dir, "path to dumped IRs")),
                     format.getPropertySetter(),
                     transformations.getPropertySetter() };
        }
    } dumpIR;

private:
    // One named property inside a PropertyGroup, e.g. "dir" in "dir=path"
    struct PropertySetter {
        virtual bool parseAndSet(const std::string& str) = 0;
        virtual std::string getPropertyValueDescription(void) const = 0;

        PropertySetter(const std::string&& name) : propertyName(name) {}
        const std::string& getPropertyName(void) const { return propertyName; }

    private:
        const std::string propertyName;
    };

    // Setter for a plain string property
    struct StringPropertySetter : PropertySetter {
        // fix: base class listed first in the init list to match the actual initialization order
        StringPropertySetter(const std::string&& name, std::string& ref, const std::string&& valueDescription)
            : PropertySetter(std::move(name)), property(ref), propertyValueDescription(valueDescription) {}

        bool parseAndSet(const std::string& str) override {
            property = str;
            return true;
        }
        std::string getPropertyValueDescription(void) const override { return propertyValueDescription; }

    private:
        std::string& property;
        const std::string propertyValueDescription;
    };

    // Setter for a bitset property parsed from comma-separated tokens, e.g. "all,-svg"
    template<std::size_t NumOfBits>
    struct BitsetFilterPropertySetter : PropertySetter {
        struct Token {
            std::string name;
            std::vector<size_t> bits;
        };

        // fix: base class listed first in the init list to match the actual initialization order
        BitsetFilterPropertySetter(const std::string&& name, std::bitset<NumOfBits>& ref, const std::vector<Token>&& tokens)
            : PropertySetter(std::move(name)), property(ref), propertyTokens(tokens) {}

        bool parseAndSet(const std::string& str) override {
            // empty value is treated as "all"
            const auto& tokens = str.empty() ?
                std::vector<std::string>{"all"} : ov::util::split(ov::util::to_lower(str), ',');
            property.reset();
            for (const auto& token : tokens) {
                const bool tokenVal = (token.front() != '-');  // a leading '-' excludes the token
                const auto& tokenName = tokenVal ? token : token.substr(1);
                const auto& foundToken = std::find_if(propertyTokens.begin(), propertyTokens.end(),
                    [tokenName] (const Token& token) { return token.name == tokenName; });
                if (foundToken == propertyTokens.end())
                    return false;

                for (const auto& bit : foundToken->bits) {
                    property.set(bit, tokenVal);
                }
            }
            return true;
        }
        std::string getPropertyValueDescription(void) const override {
            std::string supportedTokens = "comma separated filter tokens: ";
            for (size_t i = 0; i < propertyTokens.size(); i++) {  // fix: size_t avoids signed/unsigned comparison
                if (i)
                    supportedTokens.push_back(',');
                supportedTokens.append(propertyTokens[i].name);
            }
            supportedTokens.append("; -'token' is used for exclusion, case does not matter, no tokens is treated as 'all'");
            return supportedTokens;
        }

    private:
        std::bitset<NumOfBits>& property;
        const std::vector<Token> propertyTokens;
    };

    void readProperties();
};
} // namespace intel_cpu
} // namespace ov
#endif // CPU_DEBUG_CAPS

View File

@ -0,0 +1,113 @@
// Copyright (C) 2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#pragma once
#ifdef CPU_DEBUG_CAPS
#include "debug_caps_config.h"
#include "openvino/util/file_util.hpp"
#include <openvino/pass/manager.hpp>
#include <openvino/pass/serialize.hpp>
#include <openvino/pass/visualize_tree.hpp>
namespace ov {
namespace intel_cpu {
/**
 * @brief RAII helper that dumps the model IR around one transformation stage.
 *
 * On construction dumps the input graph ("_in") unless the previous enabled stage
 * already dumped an identical graph at its end; on destruction dumps the resulting
 * graph ("_out") and marks the stage as dumped.
 *
 * NOTE(review): 'model' is held by const reference, and 'num' / wasDumped() are
 * function-local statics shared by all instances — presumably dumping is assumed to
 * be single-threaded and the model outlives the dumper; verify against callers.
 */
class TransformationDumper {
public:
    explicit TransformationDumper(const DebugCapsConfig& config, const DebugCapsConfig::TransformationFilter::Type type,
                                  const std::shared_ptr<ov::Model>& model)
        : config(config), type(type), model(model) {
        // Walk back through preceding stages to decide whether the input graph
        // still needs to be serialized.
        for (auto prev = infoMap.at(type).prev; prev != TransformationType::NumOfTypes;
             prev = infoMap.at(prev).prev) {
            // no need to serialize input graph if there was no transformations from previous dump
            if (config.disable.transformations.filter[prev])
                continue;
            if (!config.dumpIR.transformations.filter[prev])
                break;
            if (wasDumped()[prev])
                return;
        }
        dump("_in");
    }
    ~TransformationDumper() {
        dump("_out");
        wasDumped().set(type);
    }

private:
    const DebugCapsConfig& config;
    const std::shared_ptr<ov::Model>& model;
    using TransformationType = DebugCapsConfig::TransformationFilter::Type;
    const TransformationType type;

    struct TransformationInfo {
        std::string name;
        TransformationType prev;  // preceding pipeline stage (NumOfTypes means none)
    };
    // std::hash<std::underlying_type<FILTER>::type> is necessary for Ubuntu-16.04 (gcc-5.4 and defect in C++11 standard)
    const std::unordered_map<TransformationType, TransformationInfo,
                             std::hash<std::underlying_type<TransformationType>::type>> infoMap =
        {{TransformationType::PreLpt, {"preLpt", TransformationType::NumOfTypes}},
         {TransformationType::Lpt, {"lpt", TransformationType::PreLpt}},
         {TransformationType::PostLpt, {"postLpt", TransformationType::Lpt}},
         {TransformationType::Snippets, {"snippets", TransformationType::PostLpt}},
         {TransformationType::Specific, {"cpuSpecific", TransformationType::Snippets}}};

    // Tracks which stages already produced an "_out" dump (shared across all instances).
    std::bitset<TransformationType::NumOfTypes>& wasDumped(void) {
        static std::bitset<TransformationType::NumOfTypes> wasDumped;
        return wasDumped;
    }

    void dump(const std::string&& postfix) {
        static int num = 0; // just to keep dumped IRs ordered in filesystem
        const auto pathAndName = config.dumpIR.dir + "/ir_" + std::to_string(num) + '_' +
                                 infoMap.at(type).name + postfix;
        ov::util::create_directory_recursive(config.dumpIR.dir);

        ov::pass::Manager serializer;
        if (config.dumpIR.format.filter[DebugCapsConfig::IrFormatFilter::XmlBin])
            serializer.register_pass<ov::pass::Serialize>(pathAndName + ".xml", "");
        if (config.dumpIR.format.filter[DebugCapsConfig::IrFormatFilter::Xml]) {
            std::string xmlFile(pathAndName + ".xml");
            std::string binFile("/dev/null"); // @todo make it crossplatform using dummy implementation of std::ostream
            serializer.register_pass<ov::pass::Serialize>(xmlFile, binFile);
        }
        if (config.dumpIR.format.filter[DebugCapsConfig::IrFormatFilter::Svg]) {
            serializer.register_pass<ov::pass::VisualizeTree>(pathAndName + ".svg");
        }
        if (config.dumpIR.format.filter[DebugCapsConfig::IrFormatFilter::Dot]) {
            serializer.register_pass<ov::pass::VisualizeTree>(pathAndName + ".dot");
        }
        serializer.run_passes(model);
        num++;
    }
};
} // namespace intel_cpu
} // namespace ov
// True when transformation stage '_type' is disabled via the OV_CPU_DISABLE debug property.
# define CPU_DEBUG_CAP_IS_TRANSFORMATION_DISABLED(_config, _type) \
    _config.disable.transformations.filter[DebugCapsConfig::TransformationFilter::Type::_type]
# define CPU_DEBUG_CAP_IS_TRANSFORMATION_ENABLED(...) !CPU_DEBUG_CAP_IS_TRANSFORMATION_DISABLED(__VA_ARGS__)

// Declares a scoped TransformationDumper for stage '_type' when IR dumping is
// requested for it via OV_CPU_DUMP_IR; nullptr otherwise.
# define CPU_DEBUG_CAP_TRANSFORMATION_DUMP(_this, _type) \
    IE_ASSERT(CPU_DEBUG_CAP_IS_TRANSFORMATION_ENABLED(_this->config.debugCaps, _type)); \
    auto dumperPtr = _this->config.debugCaps.dumpIR.transformations.filter[DebugCapsConfig::TransformationFilter::Type::_type] ? \
        std::unique_ptr<TransformationDumper>(new TransformationDumper(_this->config.debugCaps, \
                                              DebugCapsConfig::TransformationFilter::Type::_type, _this->model)) : \
        nullptr

// Returns early from the enclosing function when stage '_type' is disabled;
// otherwise sets up the optional IR dumper for the stage.
# define CPU_DEBUG_CAP_TRANSFORMATION_SCOPE(_this, _type) \
    if (CPU_DEBUG_CAP_IS_TRANSFORMATION_DISABLED(_this->config.debugCaps, _type)) \
        return; \
    CPU_DEBUG_CAP_TRANSFORMATION_DUMP(_this, _type)
#else // !CPU_DEBUG_CAPS: all the helpers above become no-ops
# define CPU_DEBUG_CAP_IS_TRANSFORMATION_DISABLED(_config, _type) false
# define CPU_DEBUG_CAP_IS_TRANSFORMATION_ENABLED(...) true
# define CPU_DEBUG_CAP_TRANSFORMATION_DUMP(_this, _type)
# define CPU_DEBUG_CAP_TRANSFORMATION_SCOPE(_this, _type)
#endif // CPU_DEBUG_CAPS

View File

@ -5,6 +5,7 @@
#include "node_dumper.h"
#include "utils/debug_caps_config.h"
#include <node.h>
#include "ie_common.h"
#include "utils/blob_dump.h"
@ -26,20 +27,20 @@ static void formatNodeName(std::string& name) {
std::replace(name.begin(), name.end(), ':', '-');
}
static bool shouldBeDumped(const NodePtr& node, const Config& config, const std::string& portsKind) {
static bool shouldBeDumped(const NodePtr& node, const DebugCapsConfig& config, const std::string& portsKind) {
const auto& dumpFilters = config.blobDumpFilters;
if (dumpFilters.empty())
return false;
if (dumpFilters.count(Config::FILTER::BY_PORTS)) { // filter by ports configured
if (dumpFilters.at(Config::FILTER::BY_PORTS) != "ALL" &&
portsKind != dumpFilters.at(Config::FILTER::BY_PORTS))
if (dumpFilters.count(DebugCapsConfig::FILTER::BY_PORTS)) { // filter by ports configured
if (dumpFilters.at(DebugCapsConfig::FILTER::BY_PORTS) != "ALL" &&
portsKind != dumpFilters.at(DebugCapsConfig::FILTER::BY_PORTS))
return false;
}
if (dumpFilters.count(Config::FILTER::BY_EXEC_ID)) { // filter by exec id configured
std::stringstream ss(dumpFilters.at(Config::FILTER::BY_EXEC_ID));
if (dumpFilters.count(DebugCapsConfig::FILTER::BY_EXEC_ID)) { // filter by exec id configured
std::stringstream ss(dumpFilters.at(DebugCapsConfig::FILTER::BY_EXEC_ID));
int id;
bool matched = false;
@ -54,8 +55,8 @@ static bool shouldBeDumped(const NodePtr& node, const Config& config, const std:
return false;
}
if (dumpFilters.count(Config::FILTER::BY_TYPE)) { // filter by type configured
std::stringstream ss(dumpFilters.at(Config::FILTER::BY_TYPE));
if (dumpFilters.count(DebugCapsConfig::FILTER::BY_TYPE)) { // filter by type configured
std::stringstream ss(dumpFilters.at(DebugCapsConfig::FILTER::BY_TYPE));
std::string type;
bool matched = false;
@ -70,22 +71,22 @@ static bool shouldBeDumped(const NodePtr& node, const Config& config, const std:
return false;
}
if (dumpFilters.count(Config::FILTER::BY_NAME)) { // filter by name configured
if (dumpFilters.at(Config::FILTER::BY_NAME) != "*" && // to have 'single char' option for matching all the names
!std::regex_match(node->getName(), std::regex(dumpFilters.at(Config::FILTER::BY_NAME)))) // name does not match
if (dumpFilters.count(DebugCapsConfig::FILTER::BY_NAME)) { // filter by name configured
if (dumpFilters.at(DebugCapsConfig::FILTER::BY_NAME) != "*" && // to have 'single char' option for matching all the names
!std::regex_match(node->getName(), std::regex(dumpFilters.at(DebugCapsConfig::FILTER::BY_NAME)))) // name does not match
return false;
}
return true;
}
static void dump(const BlobDumper& bd, const std::string& file, const Config& config) {
static void dump(const BlobDumper& bd, const std::string& file, const DebugCapsConfig& config) {
switch (config.blobDumpFormat) {
case Config::FORMAT::BIN: {
case DebugCapsConfig::FORMAT::BIN: {
bd.dump(file);
break;
}
case Config::FORMAT::TEXT: {
case DebugCapsConfig::FORMAT::TEXT: {
bd.dumpAsTxt(file);
break;
}
@ -94,7 +95,7 @@ static void dump(const BlobDumper& bd, const std::string& file, const Config& co
}
}
static void dumpInternalBlobs(const NodePtr& node, const Config& config) {
static void dumpInternalBlobs(const NodePtr& node, const DebugCapsConfig& config) {
std::string nodeName = node->getName();
formatNodeName(nodeName);
@ -116,7 +117,7 @@ static void dumpInternalBlobs(const NodePtr& node, const Config& config) {
}
}
void dumpInputBlobs(const NodePtr& node, const Config& config, int count) {
void dumpInputBlobs(const NodePtr& node, const DebugCapsConfig& config, int count) {
if (!shouldBeDumped(node, config, "IN"))
return;
@ -150,7 +151,7 @@ void dumpInputBlobs(const NodePtr& node, const Config& config, int count) {
dumpInternalBlobs(node, config);
}
void dumpOutputBlobs(const NodePtr& node, const Config& config, int count) {
void dumpOutputBlobs(const NodePtr& node, const DebugCapsConfig& config, int count) {
if (!shouldBeDumped(node, config, "OUT"))
return;

View File

@ -1,25 +1,26 @@
// Copyright (C) 2018-2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#ifdef CPU_DEBUG_CAPS
#pragma once
#ifdef CPU_DEBUG_CAPS
#include "utils/debug_caps_config.h"
#include <node.h>
#include "config.h"
namespace ov {
namespace intel_cpu {
void dumpInputBlobs(const NodePtr &node, const Config& config, int count = -1);
void dumpOutputBlobs(const NodePtr &node, const Config& config, int count = -1);
void dumpInputBlobs(const NodePtr &node, const DebugCapsConfig& config, int count = -1);
void dumpOutputBlobs(const NodePtr &node, const DebugCapsConfig& config, int count = -1);
class DumpHelper {
const NodePtr& node;
const int count;
const Config& config;
const DebugCapsConfig& config;
public:
explicit DumpHelper(const NodePtr& _node, const Config& _config, int _count = -1): node(_node), config(_config), count(_count) {
explicit DumpHelper(const NodePtr& _node, const DebugCapsConfig& _config, int _count = -1):
node(_node), config(_config), count(_count) {
dumpInputBlobs(node, config, count);
}