[CPU] [DEBUG CAPS] Extension for snippets and other ngraph transformations (#14223)
This commit is contained in:
parent
40e19dec00
commit
e306cbc67a
@ -14,10 +14,11 @@
|
||||
#include "ie_parallel.hpp"
|
||||
#include "ie_system_conf.h"
|
||||
|
||||
#include <cpp_interfaces/interface/ie_internal_plugin_config.hpp>
|
||||
#include "cpp_interfaces/interface/ie_internal_plugin_config.hpp"
|
||||
#include "openvino/core/type/element_type_traits.hpp"
|
||||
#include "openvino/runtime/properties.hpp"
|
||||
#include <cpu/x64/cpu_isa_traits.hpp>
|
||||
#include "utils/debug_capabilities.h"
|
||||
#include "cpu/x64/cpu_isa_traits.hpp"
|
||||
|
||||
namespace ov {
|
||||
namespace intel_cpu {
|
||||
@ -48,10 +49,24 @@ Config::Config() {
|
||||
if (!dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx512_core_bf16))
|
||||
enforceBF16 = false;
|
||||
|
||||
CPU_DEBUG_CAP_ENABLE(readDebugCapsProperties());
|
||||
CPU_DEBUG_CAP_ENABLE(applyDebugCapsProperties());
|
||||
|
||||
updateProperties();
|
||||
}
|
||||
|
||||
#ifdef CPU_DEBUG_CAPS
|
||||
/**
|
||||
* Debug capabilities configuration has more priority than common one
|
||||
* Some of the debug capabilities also require to enable some of common
|
||||
* configuration properties
|
||||
*/
|
||||
void Config::applyDebugCapsProperties() {
|
||||
// always enable perf counters for verbose mode and performance summary
|
||||
if (!debugCaps.verbose.empty() || !debugCaps.summaryPerf.empty())
|
||||
collectPerfCounters = true;
|
||||
}
|
||||
#endif
|
||||
|
||||
void Config::readProperties(const std::map<std::string, std::string> &prop) {
|
||||
const auto streamExecutorConfigKeys = streamExecutorConfig.SupportedKeys();
|
||||
const auto hintsConfigKeys = perfHintsConfig.SupportedKeys();
|
||||
@ -184,7 +199,7 @@ void Config::readProperties(const std::map<std::string, std::string> &prop) {
|
||||
if (exclusiveAsyncRequests) // Exclusive request feature disables the streams
|
||||
streamExecutorConfig._streams = 1;
|
||||
|
||||
CPU_DEBUG_CAP_ENABLE(readDebugCapsProperties());
|
||||
CPU_DEBUG_CAP_ENABLE(applyDebugCapsProperties());
|
||||
updateProperties();
|
||||
}
|
||||
|
||||
@ -239,58 +254,6 @@ void Config::updateProperties() {
|
||||
_config.insert({PluginConfigParams::KEY_CACHE_DIR, cache_dir});
|
||||
}
|
||||
|
||||
#ifdef CPU_DEBUG_CAPS
|
||||
void Config::readDebugCapsProperties() {
|
||||
auto readEnv = [](const char* envVar) {
|
||||
return std::getenv(envVar);
|
||||
};
|
||||
|
||||
auto parseDumpFormat = [](const std::string& format) {
|
||||
if (format == "BIN")
|
||||
return FORMAT::BIN;
|
||||
else if (format == "TEXT")
|
||||
return FORMAT::TEXT;
|
||||
else
|
||||
IE_THROW() << "readDebugCapsProperties: Unknown dump format";
|
||||
};
|
||||
|
||||
const char* envVarValue = nullptr;
|
||||
|
||||
if (envVarValue = readEnv("OV_CPU_EXEC_GRAPH_PATH"))
|
||||
execGraphPath = envVarValue;
|
||||
|
||||
if (envVarValue = readEnv("OV_CPU_VERBOSE"))
|
||||
verbose = envVarValue;
|
||||
|
||||
if (envVarValue = readEnv("OV_CPU_BLOB_DUMP_DIR"))
|
||||
blobDumpDir = envVarValue;
|
||||
|
||||
if (envVarValue = readEnv("OV_CPU_BLOB_DUMP_FORMAT"))
|
||||
blobDumpFormat = parseDumpFormat(envVarValue);
|
||||
|
||||
if (envVarValue = readEnv("OV_CPU_BLOB_DUMP_NODE_EXEC_ID"))
|
||||
blobDumpFilters[BY_EXEC_ID] = envVarValue;
|
||||
|
||||
if (envVarValue = readEnv("OV_CPU_BLOB_DUMP_NODE_PORTS"))
|
||||
blobDumpFilters[BY_PORTS] = envVarValue;
|
||||
|
||||
if (envVarValue = readEnv("OV_CPU_BLOB_DUMP_NODE_TYPE"))
|
||||
blobDumpFilters[BY_TYPE] = envVarValue;
|
||||
|
||||
if (envVarValue = readEnv("OV_CPU_BLOB_DUMP_NODE_NAME"))
|
||||
blobDumpFilters[BY_NAME] = envVarValue;
|
||||
|
||||
if (envVarValue = readEnv("OV_CPU_SUMMARY_PERF")) {
|
||||
collectPerfCounters = true;
|
||||
summaryPerf = envVarValue;
|
||||
}
|
||||
|
||||
// always enable perf counters for verbose mode
|
||||
if (!verbose.empty())
|
||||
collectPerfCounters = true;
|
||||
}
|
||||
#endif // CPU_DEBUG_CAPS
|
||||
|
||||
} // namespace intel_cpu
|
||||
} // namespace ov
|
||||
|
||||
|
@ -6,8 +6,11 @@
|
||||
|
||||
#include <threading/ie_istreams_executor.hpp>
|
||||
#include <ie_performance_hints.hpp>
|
||||
#include "utils/debug_capabilities.h"
|
||||
#include <ie/ie_common.h>
|
||||
#include <openvino/util/common_util.hpp>
|
||||
#include "utils/debug_caps_config.h"
|
||||
|
||||
#include <bitset>
|
||||
#include <string>
|
||||
#include <map>
|
||||
#include <mutex>
|
||||
@ -57,31 +60,12 @@ struct Config {
|
||||
|
||||
std::map<std::string, std::string> _config;
|
||||
|
||||
#ifdef CPU_DEBUG_CAPS
|
||||
enum FILTER {
|
||||
BY_PORTS,
|
||||
BY_EXEC_ID,
|
||||
BY_TYPE,
|
||||
BY_NAME,
|
||||
};
|
||||
|
||||
enum class FORMAT {
|
||||
BIN,
|
||||
TEXT,
|
||||
};
|
||||
|
||||
std::string execGraphPath;
|
||||
std::string verbose;
|
||||
std::string blobDumpDir = "cpu_dump";
|
||||
FORMAT blobDumpFormat = FORMAT::TEXT;
|
||||
// std::hash<int> is necessary for Ubuntu-16.04 (gcc-5.4 and defect in C++11 standart)
|
||||
std::unordered_map<FILTER, std::string, std::hash<int>> blobDumpFilters;
|
||||
std::string summaryPerf = "";
|
||||
|
||||
void readDebugCapsProperties();
|
||||
#endif
|
||||
|
||||
bool isNewApi = true;
|
||||
|
||||
#ifdef CPU_DEBUG_CAPS
|
||||
DebugCapsConfig debugCaps;
|
||||
void applyDebugCapsProperties();
|
||||
#endif
|
||||
};
|
||||
|
||||
} // namespace intel_cpu
|
||||
|
@ -6,6 +6,7 @@ Use the following cmake option to enable debug capabilities:
|
||||
* [Verbose mode](verbose.md)
|
||||
* [Blob dumping](blob_dumping.md)
|
||||
* [Graph serialization](graph_serialization.md)
|
||||
* [Graph transformation disabling](feature_disabling.md#graph-transformations)
|
||||
|
||||
## Debug log
|
||||
|
||||
|
@ -29,8 +29,8 @@ Default is *cpu_dump*
|
||||
OV_CPU_BLOB_DUMP_FORMAT=<format> binary ...
|
||||
```
|
||||
Options are:
|
||||
* BIN (default)
|
||||
* TEXT
|
||||
* BIN
|
||||
* TEXT (default)
|
||||
|
||||
## Filter input / output blobs
|
||||
To dump only input / output blobs:
|
||||
|
52
src/plugins/intel_cpu/src/docs/debug_caps_filters.md
Normal file
52
src/plugins/intel_cpu/src/docs/debug_caps_filters.md
Normal file
@ -0,0 +1,52 @@
|
||||
# Filters
|
||||
|
||||
Filters described below have the following common format:
|
||||
```sh
|
||||
filter_name=<comma_separated_tokens>
|
||||
```
|
||||
Tokens are processed from left to right and each one includes or excludes corresponding value.\
|
||||
For exclusion token is just prepended by minus: *-token*\
|
||||
All tokens are case insensitive and no tokens is treated as *all*\
|
||||
So filters below are equal:
|
||||
* filter_name
|
||||
* filter_name=all
|
||||
* filter_name=-all,ALL
|
||||
|
||||
## IR format filter
|
||||
|
||||
IR format filter is used to specify output IR formats, e.g. for [serialization](graph_serialization.md#graph-transformations).
|
||||
```sh
|
||||
formats=<comma_separated_tokens>
|
||||
```
|
||||
|
||||
The following tokens are supported:
|
||||
* all\
|
||||
equals to <xml,dot,svg>
|
||||
* xml (default)\
|
||||
IR in .xml file. Can be opened using, for example, *netron* app. (For now the option is Linux only)
|
||||
* xmlbin\
|
||||
IR in .xml and .bin files. Can be opened using, for example, *netron* app.
|
||||
* dot\
|
||||
IR in .dot file (.svg.dot file if svg is also specified). Can be inspected using, for example, *graphviz* tools.
|
||||
* svg\
|
||||
IR in .svg file. Requires *dot* tool to be installed on the host, not supported on Windows.\
|
||||
Generation is based on dot representation, so IR is additionally dumped to .svg.dot file.
|
||||
|
||||
## Transformation filter
|
||||
|
||||
Transformation filter is used to specify main graph transformation stages for different purposes,
|
||||
e.g. for [disabling](feature_disabling.md#graph-transformation) or [serialization](graph_serialization.md#graph-transformations).
|
||||
```sh
|
||||
transformations=<comma_separated_tokens>
|
||||
```
|
||||
|
||||
The following tokens are supported:
|
||||
* all (default)\
|
||||
equals to <preLpt,lpt,postLpt,snippets,specific>
|
||||
* common \
|
||||
equals to <preLpt,postLpt>
|
||||
* preLpt
|
||||
* lpt
|
||||
* postLpt
|
||||
* snippets
|
||||
* specific
|
26
src/plugins/intel_cpu/src/docs/feature_disabling.md
Normal file
26
src/plugins/intel_cpu/src/docs/feature_disabling.md
Normal file
@ -0,0 +1,26 @@
|
||||
# Feature disabling
|
||||
|
||||
Common way to disable something in CPU plugin is implied by means of environment variable **OV_CPU_DISABLE**:
|
||||
```sh
|
||||
OV_CPU_DISABLE=<space_separated_options> binary ...
|
||||
```
|
||||
Option names are case insensitive and processed from left to right,\
|
||||
so last one overwrites previous ones if duplicated.
|
||||
|
||||
Examples:
|
||||
```sh
|
||||
OV_CPU_DISABLE="transformations" binary ...
|
||||
OV_CPU_DISABLE="transformations=lpt" binary ...
|
||||
OV_CPU_DISABLE="transformations=all,-common" binary ...
|
||||
```
|
||||
|
||||
By means of corresponding options **OV_CPU_DISABLE** controls disabling of the following features:
|
||||
|
||||
## Graph transformations
|
||||
|
||||
Graph transformation disabling is controlled by the following option inside **OV_CPU_DISABLE**:
|
||||
```sh
|
||||
transformations=<comma_separated_tokens>
|
||||
```
|
||||
Filter with main transformation stages to disable specified ones.\
|
||||
See [transformation filter](debug_caps_filters.md#transformation-filter) for more details.
|
@ -1,17 +1,43 @@
|
||||
# Graph serialization
|
||||
|
||||
The functionality allows to serialize execution graph using environment variable:
|
||||
Graph serialization is disabled by default and controlled by environment variables.
|
||||
|
||||
## Execution graph
|
||||
|
||||
Execution graph could be serialized using environment variable **OV_CPU_EXEC_GRAPH_PATH**:
|
||||
```sh
|
||||
OV_CPU_EXEC_GRAPH_PATH=<path> binary ...
|
||||
OV_CPU_EXEC_GRAPH_PATH=<option> binary ...
|
||||
```
|
||||
Possible serialization options:
|
||||
* cout\
|
||||
Serialize to console output.
|
||||
* \<path\>.xml\
|
||||
Serialize graph into .xml and .bin files. Can be opened using, for example, *netron* app.
|
||||
* **TBD**: \<path\>.dot\
|
||||
Serialize graph into .dot file. Can be inspected using, for example, *graphviz* tools.
|
||||
|
||||
## Graph transformations
|
||||
|
||||
Additionally, IR could be serialized at specified stages using environment variable **OV_CPU_DUMP_IR**:
|
||||
```sh
|
||||
OV_CPU_DUMP_IR=<space_separated_options> binary ...
|
||||
```
|
||||
|
||||
Possible serialization options:
|
||||
* cout
|
||||
Examples:
|
||||
```sh
|
||||
OV_CPU_DUMP_IR="transformations" binary ...
|
||||
OV_CPU_DUMP_IR="transformations=snippets dir=path/dumpDir" binary ...
|
||||
OV_CPU_DUMP_IR="transformations=all,-common DIR=path/dumpdir formats=svg,xml" binary ...
|
||||
```
|
||||
|
||||
Serialize to console output
|
||||
* \<path\>.xml
|
||||
Option names are case insensitive, the following options are supported:
|
||||
* dir=\<path\>\
|
||||
Path to dumped IR files. If omitted, it defaults to *intel_cpu_dump*
|
||||
* formats=<comma_separated_tokens>\
|
||||
Filter with IR formats to dump. If omitted, it defaults to *xml*\
|
||||
See [IR format filter](debug_caps_filters.md#ir-format-filter) for more details.
|
||||
* transformations=<comma_separated_tokens>\
|
||||
Filter with main transformation stages to serialize graph before and after specified ones.\
|
||||
See [transformation filter](debug_caps_filters.md#transformation-filter) for more details.
|
||||
|
||||
Serialize graph into .xml and .bin files. Can be opened using, for example, *netron* app
|
||||
* \<path\>.dot
|
||||
|
||||
TBD. Serialize graph into .dot file. Can be inspected using, for example, *graphviz* tools.
|
||||
Options are processed from left to right, so last one overwrites previous ones if duplicated.
|
||||
|
@ -1073,7 +1073,7 @@ void Graph::InferStatic(InferRequestBase* request) {
|
||||
dnnl::stream stream(eng);
|
||||
|
||||
for (const auto& node : executableGraphNodes) {
|
||||
VERBOSE(node, config.verbose);
|
||||
VERBOSE(node, config.debugCaps.verbose);
|
||||
PERF(node, config.collectPerfCounters);
|
||||
|
||||
if (request)
|
||||
@ -1160,7 +1160,7 @@ void Graph::InferDynamic(InferRequestBase* request) {
|
||||
updateNodes(stopIndx);
|
||||
for (; inferCounter < stopIndx; ++inferCounter) {
|
||||
auto& node = executableGraphNodes[inferCounter];
|
||||
VERBOSE(node, config.verbose);
|
||||
VERBOSE(node, config.debugCaps.verbose);
|
||||
PERF(node, config.collectPerfCounters);
|
||||
|
||||
if (request)
|
||||
@ -1171,7 +1171,7 @@ void Graph::InferDynamic(InferRequestBase* request) {
|
||||
}
|
||||
|
||||
inline void Graph::ExecuteNode(const NodePtr& node, const dnnl::stream& stream) const {
|
||||
DUMP(node, config, infer_count);
|
||||
DUMP(node, config.debugCaps, infer_count);
|
||||
OV_ITT_SCOPED_TASK(itt::domains::intel_cpu, node->profiling.execute);
|
||||
|
||||
if (node->isDynamicNode()) {
|
||||
|
@ -210,7 +210,7 @@ std::shared_ptr<ngraph::Function> dump_graph_as_ie_ngraph_net(const Graph &graph
|
||||
|
||||
#ifdef CPU_DEBUG_CAPS
|
||||
void serialize(const Graph &graph) {
|
||||
const std::string& path = graph.getConfig().execGraphPath;
|
||||
const std::string& path = graph.getConfig().debugCaps.execGraphPath;
|
||||
|
||||
if (path.empty())
|
||||
return;
|
||||
@ -257,7 +257,7 @@ void serializeToCout(const Graph &graph) {
|
||||
}
|
||||
|
||||
void summary_perf(const Graph &graph) {
|
||||
const std::string& summaryPerf = graph.getConfig().summaryPerf;
|
||||
const std::string& summaryPerf = graph.getConfig().debugCaps.summaryPerf;
|
||||
|
||||
if (summaryPerf.empty())
|
||||
return;
|
||||
|
@ -27,6 +27,7 @@ namespace intel_cpu {
|
||||
|
||||
inline void ConvertToCPUSpecificOpset(std::shared_ptr<ngraph::Function> &nGraphFunc) {
|
||||
RUN_ON_FUNCTION_SCOPE(ConvertToCPUSpecificOpset);
|
||||
|
||||
ngraph::pass::Manager manager;
|
||||
manager.register_pass<ConvertMatMulToFC>();
|
||||
manager.register_pass<AlignMatMulInputRanks>();
|
||||
|
@ -2,140 +2,28 @@
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#include "ie_metric_helpers.hpp"
|
||||
#include "plugin.h"
|
||||
#include "extension_mngr.h"
|
||||
#include "weights_cache.hpp"
|
||||
#include "extension.h"
|
||||
#include "itt.h"
|
||||
#include "serialize.h"
|
||||
#include "ie_metric_helpers.hpp" // must be included first
|
||||
|
||||
#include "plugin.h"
|
||||
|
||||
#include "transformation_pipeline.h"
|
||||
#include "itt.h"
|
||||
#include "extension_mngr.h"
|
||||
#include "extension.h"
|
||||
#include "serialize.h"
|
||||
#include "threading/ie_executor_manager.hpp"
|
||||
|
||||
#include "ie_icore.hpp"
|
||||
#include "ie_plugin_config.hpp"
|
||||
#include "ie_system_conf.h"
|
||||
#include "cpp_interfaces/interface/ie_internal_plugin_config.hpp"
|
||||
|
||||
#include <threading/ie_executor_manager.hpp>
|
||||
#include <memory>
|
||||
#include <ie_plugin_config.hpp>
|
||||
#include <cpp_interfaces/interface/ie_internal_plugin_config.hpp>
|
||||
#include <ie_icore.hpp>
|
||||
#include <fstream>
|
||||
#include <vector>
|
||||
#include <tuple>
|
||||
#include <unordered_set>
|
||||
#include <ie_system_conf.h>
|
||||
#include <ie_ngraph_utils.hpp>
|
||||
|
||||
|
||||
#include <transformations/common_optimizations/add_fake_quantize_fusion.hpp>
|
||||
#include <transformations/common_optimizations/common_optimizations.hpp>
|
||||
#include <transformations/common_optimizations/fq_mul_fusion.hpp>
|
||||
#include <transformations/common_optimizations/mul_fake_quantize_fusion.hpp>
|
||||
#include <transformations/common_optimizations/weights_dequantize_to_fake_quantize.hpp>
|
||||
#include <transformations/common_optimizations/convert_quantize_dequantize.hpp>
|
||||
#include <transformations/common_optimizations/nop_elimination.hpp>
|
||||
#include <transformations/common_optimizations/wrap_interpolate_into_transposes.hpp>
|
||||
#include <transformations/common_optimizations/transpose_sinking.hpp>
|
||||
#include "transformations/common_optimizations/convert_compression_only_to_legacy.hpp"
|
||||
#include <transformations/common_optimizations/lin_op_sequence_fusion.hpp>
|
||||
|
||||
#include <transformations/opset_conversions/convert_opset3_to_opset2.hpp>
|
||||
#include <transformations/opset_conversions/convert_opset2_to_opset1.hpp>
|
||||
#include <transformations/op_conversions/convert_broadcast_to_tiles.hpp>
|
||||
#include <transformations/op_conversions/convert_depth_to_space.hpp>
|
||||
#include <transformations/op_conversions/convert_shuffle_channels3.hpp>
|
||||
#include <transformations/op_conversions/convert_slice_to_strided_slice.hpp>
|
||||
#include <transformations/op_conversions/convert_space_to_depth.hpp>
|
||||
#include <transformations/op_conversions/convert_gelu.hpp>
|
||||
#include <transformations/op_conversions/convert_gather_downgrade.hpp>
|
||||
#include <transformations/op_conversions/convert_gather_upgrade.hpp>
|
||||
#include <transformations/op_conversions/detection_output_downgrade.hpp>
|
||||
#include <transformations/op_conversions/detection_output_upgrade.hpp>
|
||||
#include <transformations/op_conversions/gelu7_downgrade.hpp>
|
||||
#include <transformations/op_conversions/hswish_decomposition.hpp>
|
||||
#include <transformations/op_conversions/hsigmoid_decomposition.hpp>
|
||||
#include <transformations/op_conversions/mvn6_decomposition.hpp>
|
||||
#include <transformations/op_conversions/normalize_l2_decomposition.hpp>
|
||||
#include <transformations/op_conversions/reduce_l1_decomposition.hpp>
|
||||
#include <transformations/op_conversions/reduce_l2_decomposition.hpp>
|
||||
#include <transformations/op_conversions/softplus_decomposition.hpp>
|
||||
#include <transformations/op_conversions/convert_space_to_batch.hpp>
|
||||
#include <transformations/op_conversions/convert_batch_to_space.hpp>
|
||||
#include <transformations/op_conversions/convert_sequences_to_tensor_iterator.hpp>
|
||||
#include <transformations/op_conversions/convert_subtract.hpp>
|
||||
#include <transformations/op_conversions/softmax_decomposition.hpp>
|
||||
#include <transformations/control_flow/unroll_tensor_iterator.hpp>
|
||||
#include <transformations/op_conversions/convert_mod.hpp>
|
||||
#include <transformations/op_conversions/convert_ti_to_sequences.hpp>
|
||||
#include <transformations/op_conversions/lstm_cell_decomposition.hpp>
|
||||
#include <transformations/op_conversions/rnn_cell_decomposition.hpp>
|
||||
#include <transformations/op_conversions/gru_cell_decomposition.hpp>
|
||||
#include <transformations/op_conversions/log_softmax_decomposition.hpp>
|
||||
#include <transformations/op_conversions/convert_interpolate1_to_interpolate4.hpp>
|
||||
#include <transformations/op_conversions/simplify_ctc_greedy_decoder_seq_len.hpp>
|
||||
#include <transformations/op_conversions/convert_previous_nms_to_nms_9.hpp>
|
||||
#include <transformations/op_conversions/convert_nms9_to_nms_ie_internal.hpp>
|
||||
#include <transformations/op_conversions/convert_multiclass_nms_to_multiclass_nms_ie.hpp>
|
||||
#include <transformations/op_conversions/convert_matrix_nms_to_matrix_nms_ie.hpp>
|
||||
#include <transformations/op_conversions/convert_deformable_conv_v8_to_v1.hpp>
|
||||
#include <transformations/smart_reshape/matmul_sr.hpp>
|
||||
#include <transformations/op_conversions/convert_minimum_to_power_and_max.hpp>
|
||||
#include <transformations/op_conversions/convert_reduce_to_pooling.hpp>
|
||||
#include <transformations/convert_precision.hpp>
|
||||
#include <transformations/init_node_info.hpp>
|
||||
#include <transformations/disable_decompression_convert_constant_folding.hpp>
|
||||
#include <transformations/rt_info/fused_names_attribute.hpp>
|
||||
#include <transformations/op_conversions/fq_decomposition.hpp>
|
||||
#include <transformations/utils/utils.hpp>
|
||||
#include <transformations/op_conversions/convert_roi_align_v9_to_v3.hpp>
|
||||
#include <transformations/op_conversions/convert_roi_align_v3_to_v9.hpp>
|
||||
#include <transformations/op_conversions/softsign_decomposition.hpp>
|
||||
#include "transformations/op_conversions/eye_decomposition.hpp"
|
||||
#include "transformations/op_conversions/unique_decomposition.hpp"
|
||||
|
||||
#include "ngraph_transformations/convert_to_cpu_specific_opset.hpp"
|
||||
#include "ngraph_transformations/snippets_mark_skipped.hpp"
|
||||
#include "ngraph_transformations/mha_fusion.hpp"
|
||||
#include "ngraph_transformations/convert_to_interaction.hpp"
|
||||
#include "ngraph_transformations/convert_fq_rnn_to_quantized_rnn.hpp"
|
||||
#include "ngraph_transformations/move_eltwise_up_data_movement.hpp"
|
||||
#include "ngraph_transformations/swap_convert_transpose.hpp"
|
||||
|
||||
#include <snippets/pass/collapse_subgraph.hpp>
|
||||
#include <snippets/pass/common_optimizations.hpp>
|
||||
#include <snippets/pass/convert_constants.hpp>
|
||||
|
||||
#include <ngraph/opsets/opset1.hpp>
|
||||
#include <ngraph/opsets/opset2.hpp>
|
||||
#include <ngraph/opsets/opset3.hpp>
|
||||
#include <ngraph/opsets/opset4.hpp>
|
||||
#include <ngraph/opsets/opset5.hpp>
|
||||
#include <ngraph/opsets/opset6.hpp>
|
||||
#include <openvino/opsets/opset10.hpp>
|
||||
#include <ngraph/op/util/op_types.hpp>
|
||||
#include <ngraph/pass/manager.hpp>
|
||||
#include <ngraph/graph_util.hpp>
|
||||
#include <ov_ops/augru_cell.hpp>
|
||||
#include <ov_ops/augru_sequence.hpp>
|
||||
|
||||
#include <transformations/low_precision/mark_dequantization_subgraph.hpp>
|
||||
#include <low_precision/common/quantization_granularity_restriction.hpp>
|
||||
#include <low_precision/common/precisions_restriction.hpp>
|
||||
#include <low_precision/convert_subtract_constant.hpp>
|
||||
#include <low_precision/convolution.hpp>
|
||||
#include <low_precision/convolution_backprop_data.hpp>
|
||||
#include <low_precision/layer_transformation.hpp>
|
||||
#include <low_precision/low_precision.hpp>
|
||||
#include <low_precision/multiply_to_group_convolution.hpp>
|
||||
#include <low_precision/network_helper.hpp>
|
||||
#include "openvino/runtime/core.hpp"
|
||||
#include "openvino/util/common_util.hpp"
|
||||
|
||||
#include <ie_algorithm.hpp>
|
||||
#include "performance_heuristics.hpp"
|
||||
|
||||
#include "nodes/mvn.h"
|
||||
#include "nodes/fake_quantize.h"
|
||||
#include "nodes/normalize.h"
|
||||
#include "nodes/mha.h"
|
||||
#include "weights_cache.hpp"
|
||||
#include "utils/denormals.hpp"
|
||||
#include "transformations/common_optimizations/augru_cell_fusion.hpp"
|
||||
|
||||
#if !defined(__arm__) && !defined(_M_ARM) && !defined(__aarch64__) && !defined(_M_ARM64)
|
||||
#ifndef __GNUC_PREREQ
|
||||
@ -262,452 +150,6 @@ Engine::~Engine() {
|
||||
executorManager()->clear("CPUCallbackExecutor");
|
||||
}
|
||||
|
||||
static bool fuse_type_to_convert(const std::shared_ptr<ngraph::Node>& node, ov::element::Type to, size_t idx) {
|
||||
if (auto convert = ov::as_type_ptr<ov::opset10::Convert>(node)) {
|
||||
// For Convert node, converting precision from floating point to boolean will lead to mathematical
|
||||
// error, because here the output precision boolean is replaced by u8. E.g. floating point value 0.01
|
||||
// is converted to be 1 for boolean, but 0 for u8. Thus an Abs and Ceil node should be added before the
|
||||
// Convert node for this scenario.
|
||||
if (convert->input(0).get_element_type().is_real() &&
|
||||
convert->get_convert_element_type() == ngraph::element::boolean && to.is_integral_number()) {
|
||||
auto abs = std::make_shared<ov::opset10::Abs>(convert->input_value(0).get_node_shared_ptr());
|
||||
auto ceil = std::make_shared<ov::opset10::Ceiling>(abs);
|
||||
auto new_convert = std::make_shared<ov::opset10::Convert>(ceil, to);
|
||||
new_convert->set_friendly_name(convert->get_friendly_name());
|
||||
ov::copy_runtime_info(convert, {abs, ceil, new_convert});
|
||||
ov::replace_node(convert, new_convert);
|
||||
return true;
|
||||
} else {
|
||||
convert->set_convert_element_type(to);
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
static void TransformationUpToCPUSpecificOpSet(std::shared_ptr<ngraph::Function> nGraphFunc, const bool _enableLPT, const bool _enableBF16,
|
||||
const bool _enableSnippets, const bool isLegacyApi) {
|
||||
ov::pass::Manager manager;
|
||||
manager.set_per_pass_validation(false);
|
||||
manager.register_pass<ov::pass::InitNodeInfo>();
|
||||
|
||||
const bool useLpt =
|
||||
_enableLPT &&
|
||||
ngraph::pass::low_precision::LowPrecision::isFunctionQuantized(nGraphFunc);
|
||||
auto defaultPrecisions = useLpt ? ngraph::pass::low_precision::precision_set::int8_support : std::vector<ov::element::Type>{};
|
||||
bool hasINT16orINT32Levels = false;
|
||||
if (useLpt) {
|
||||
CPU_LPT_SCOPE(LowPrecisionTransformations_Part1);
|
||||
hasINT16orINT32Levels = ngraph::pass::low_precision::LowPrecision::isFQLevelsPresent(
|
||||
nGraphFunc,
|
||||
{ngraph::pass::low_precision::levels::int16, ngraph::pass::low_precision::levels::int16_narrow_range,
|
||||
ngraph::pass::low_precision::levels::int32, ngraph::pass::low_precision::levels::int32_narrow_range});
|
||||
if (hasINT16orINT32Levels) {
|
||||
defaultPrecisions = ngraph::pass::low_precision::precision_set::int8_int16_int32_support;
|
||||
}
|
||||
manager.register_pass<ov::pass::MarkDequantizationSubgraph>(defaultPrecisions);
|
||||
}
|
||||
auto get_convert_precisions = []() {
|
||||
precisions_array array = {
|
||||
{ngraph::element::i64, ngraph::element::i32},
|
||||
{ngraph::element::u64, ngraph::element::i32},
|
||||
{ngraph::element::i16, ngraph::element::i32},
|
||||
{ngraph::element::u16, ngraph::element::i32},
|
||||
{ngraph::element::u32, ngraph::element::i32},
|
||||
{ngraph::element::f64, ngraph::element::f32},
|
||||
{ngraph::element::f16, ngraph::element::f32},
|
||||
{ngraph::element::boolean, ngraph::element::u8},
|
||||
{ngraph::element::i4, ngraph::element::i8},
|
||||
{ngraph::element::u4, ngraph::element::u8}
|
||||
};
|
||||
|
||||
if (!dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx512_core))
|
||||
array.push_back({ngraph::element::bf16, ngraph::element::f32});
|
||||
|
||||
return array;
|
||||
};
|
||||
|
||||
static const auto precisions = get_convert_precisions();
|
||||
type_to_fuse_map type_to_fuse = {{ov::opset10::Convert::get_type_info_static(), fuse_type_to_convert}};
|
||||
|
||||
manager.register_pass<ov::pass::AUGRUCellFusion>();
|
||||
manager.register_pass<ov::pass::CommonOptimizations>();
|
||||
manager.register_pass<ov::pass::WrapInterpolateIntoTransposes>();
|
||||
manager.register_pass<ov::pass::TransposeSinking>();
|
||||
manager.register_pass<ov::pass::ConvertSequenceToTensorIterator>();
|
||||
manager.register_pass<ov::pass::ConvertOpSet3ToOpSet2>();
|
||||
manager.register_pass<ov::pass::ConvertOpSet2ToOpSet1>();
|
||||
manager.register_pass<ov::pass::LSTMCellDecomposition>();
|
||||
manager.register_pass<ov::pass::GRUCellDecomposition>();
|
||||
manager.register_pass<ov::pass::RNNCellDecomposition>();
|
||||
manager.register_pass<ov::pass::ConvertNMS1ToNMS9>();
|
||||
manager.register_pass<ov::pass::ConvertNMS3ToNMS9>();
|
||||
manager.register_pass<ov::pass::ConvertNMS4ToNMS9>();
|
||||
manager.register_pass<ov::pass::ConvertNMS5ToNMS9>();
|
||||
manager.register_pass<ov::pass::ConvertNMS9ToNMSIEInternal>();
|
||||
manager.register_pass<ov::pass::ConvertMulticlassNmsToMulticlassNmsIE>();
|
||||
manager.register_pass<ov::pass::ConvertMatrixNmsToMatrixNmsIE>();
|
||||
manager.register_pass<ov::pass::TransposeMatMul>();
|
||||
manager.register_pass<ov::pass::ConstantFolding>();
|
||||
|
||||
if (useLpt) {
|
||||
CPU_LPT_SCOPE(LowPrecisionTransformations_Part2);
|
||||
manager.register_pass<ngraph::pass::low_precision::ConvertSubtractConstant>(defaultPrecisions);
|
||||
}
|
||||
manager.register_pass<ov::pass::Validate>();
|
||||
manager.register_pass<ov::pass::ConvertPrecision>(precisions, type_to_fuse);
|
||||
manager.register_pass<ov::pass::EliminateConvert>();
|
||||
manager.register_pass<SwapConvertTranspose>();
|
||||
manager.register_pass<ConvertToInteraction>();
|
||||
manager.register_pass<ConvertInteractionInt8>();
|
||||
|
||||
auto pass_config = manager.get_pass_config();
|
||||
|
||||
using const_node_ptr = const std::shared_ptr<const ngraph::Node>;
|
||||
|
||||
// SpaceToDepth/ DepthToSpace node implementation supports only equal input/output tensors with rank <= 5
|
||||
pass_config->set_callback<ov::pass::ConvertSpaceToDepth,
|
||||
ov::pass::ConvertDepthToSpace>(
|
||||
[](const_node_ptr &node) -> bool {
|
||||
return node->input_value(0).get_shape().size() <= 5lu &&
|
||||
node->input_value(0).get_shape().size() == node->get_output_shape(0).size();
|
||||
});
|
||||
|
||||
pass_config->set_callback<ov::pass::ConvertBatchToSpace,
|
||||
ov::pass::ConvertSpaceToBatch>(
|
||||
[](const_node_ptr &node) -> bool {
|
||||
const auto & rank = node->input(0).get_partial_shape().rank().get_length();
|
||||
return rank == 4lu || rank == 5lu;
|
||||
});
|
||||
|
||||
auto isCellPrimitiveSupported = [](const_node_ptr &node) -> bool {
|
||||
if (const auto &rnn_cell = std::dynamic_pointer_cast<const ngraph::opset4::RNNCell>(node)) {
|
||||
return rnn_cell->get_clip() == 0.0f;
|
||||
} else if (const auto &gru_cell = std::dynamic_pointer_cast<const ngraph::opset4::GRUCell>(
|
||||
node)) {
|
||||
return gru_cell->get_clip() == 0.0f
|
||||
&& gru_cell->get_activations() == std::vector<std::string>{"sigmoid", "tanh"};
|
||||
} else if (const auto &augru_cell = std::dynamic_pointer_cast<const ov::op::internal::AUGRUCell>(
|
||||
node)) {
|
||||
return augru_cell->get_clip() == 0.0f
|
||||
&& augru_cell->get_activations() == std::vector<std::string>{"sigmoid", "tanh"};
|
||||
} else if (const auto &lstm_cell = std::dynamic_pointer_cast<const ngraph::opset4::LSTMCell>(
|
||||
node)) {
|
||||
return lstm_cell->get_clip() == 0.0f &&
|
||||
lstm_cell->get_activations() == std::vector<std::string>{"sigmoid", "tanh", "tanh"};
|
||||
} else if (const auto &lstm_cell_v1 = std::dynamic_pointer_cast<const ngraph::opset1::LSTMCell>(
|
||||
node)) {
|
||||
return lstm_cell_v1->get_clip() == 0.0f &&
|
||||
lstm_cell_v1->get_activations() == std::vector<std::string>{"sigmoid", "tanh", "tanh"};
|
||||
}
|
||||
return false;
|
||||
};
|
||||
|
||||
// Sequences supported by the plugin shouldn't be converted to TensorIterator.
|
||||
// sequence_length input is not supported in all Sequences, so if is_seq_len_provided() == true, we
|
||||
// should always convert to TensorIterator.
|
||||
// RNN/GRU/LSTM Sequences are supported with clip == 0, and with default activations.
|
||||
auto isSequencePrimitiveSupported = [](const_node_ptr &node) -> bool {
|
||||
const auto& data = node->input(0);
|
||||
const auto& data_pshape = data.get_partial_shape();
|
||||
// WA: dynamic shapes make impossible to check seq_len due to shapeOf subgraphs
|
||||
// but the sequence is still supported in CPU and doesn't need to be decomposed
|
||||
if (data_pshape.is_dynamic())
|
||||
return true;
|
||||
if (data_pshape.rank().is_static() && data_pshape.rank().get_length() > 1 && !data_pshape[1].is_static())
|
||||
return false;
|
||||
auto max_seq_len = data.get_shape().at(1);
|
||||
if (const auto &rnn_seq = std::dynamic_pointer_cast<const ngraph::opset6::RNNSequence>(node)) {
|
||||
return rnn_seq->get_clip() == 0.0f &&
|
||||
!ngraph::op::util::is_seq_len_provided(rnn_seq->get_input_node_shared_ptr(2),
|
||||
max_seq_len);
|
||||
} else if (const auto &gru_seq = std::dynamic_pointer_cast<const ngraph::opset6::GRUSequence>(
|
||||
node)) {
|
||||
return gru_seq->get_clip() == 0.0f &&
|
||||
gru_seq->get_activations() == std::vector<std::string>{"sigmoid", "tanh"} &&
|
||||
!ngraph::op::util::is_seq_len_provided(gru_seq->get_input_node_shared_ptr(2),
|
||||
max_seq_len);
|
||||
} else if (const auto &augru_seq = std::dynamic_pointer_cast<const ov::op::internal::AUGRUSequence>(
|
||||
node)) {
|
||||
return augru_seq->get_clip() == 0.0f &&
|
||||
augru_seq->get_activations() == std::vector<std::string>{"sigmoid", "tanh"} &&
|
||||
!ngraph::op::util::is_seq_len_provided(augru_seq->get_input_node_shared_ptr(2),
|
||||
max_seq_len);
|
||||
} else if (const auto &lstm_seq = std::dynamic_pointer_cast<const ngraph::opset6::LSTMSequence>(
|
||||
node)) {
|
||||
return lstm_seq->get_clip() == 0.0f &&
|
||||
lstm_seq->get_activations() == std::vector<std::string>{"sigmoid", "tanh", "tanh"} &&
|
||||
!ngraph::op::util::is_seq_len_provided(lstm_seq->get_input_node_shared_ptr(3),
|
||||
max_seq_len);
|
||||
}
|
||||
return false;
|
||||
};
|
||||
|
||||
pass_config->set_callback<ov::pass::ConvertRNNSequenceToTensorIterator,
|
||||
ov::pass::ConvertGRUSequenceToTensorIterator,
|
||||
ov::pass::ConvertLSTMSequenceToTensorIterator>(
|
||||
[isSequencePrimitiveSupported](const_node_ptr &node) -> bool {
|
||||
return isSequencePrimitiveSupported(node);
|
||||
});
|
||||
|
||||
pass_config->set_callback<ov::pass::RNNCellDecomposition, ov::pass::GRUCellDecomposition,
|
||||
ov::pass::LSTMCellDecomposition>(
|
||||
[isCellPrimitiveSupported](const_node_ptr &node) -> bool {
|
||||
return isCellPrimitiveSupported(node);
|
||||
});
|
||||
|
||||
pass_config->set_callback<ov::pass::MVN6Decomposition>(
|
||||
[](const_node_ptr &node) -> bool {
|
||||
std::string errorMessage;
|
||||
return node::MVN::isSupportedOperation(node, errorMessage);
|
||||
});
|
||||
|
||||
pass_config->set_callback<ov::pass::NormalizeL2Decomposition>(
|
||||
[](const_node_ptr &node) -> bool {
|
||||
std::string errorMsg;
|
||||
return node::NormalizeL2::isSupportedOperation(node, errorMsg);
|
||||
});
|
||||
|
||||
pass_config->enable<ov::pass::SoftmaxDecomposition>();
|
||||
pass_config->set_callback<ov::pass::SoftmaxDecomposition>(
|
||||
[](const_node_ptr &node) -> bool {
|
||||
return node->input_value(0).get_partial_shape().rank().get_length() <= 5;
|
||||
});
|
||||
|
||||
if (!isLegacyApi) {
|
||||
auto nmsCallback = [](const_node_ptr &node) -> bool {
|
||||
for (size_t i = 0; i < node->get_output_size(); i++) {
|
||||
const auto outputs = node->get_output_target_inputs(i);
|
||||
for (const auto &out : outputs) {
|
||||
if (!ngraph::op::is_output(out.get_node())) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
return true;
|
||||
};
|
||||
|
||||
pass_config->set_callback<ov::pass::ConvertNMS9ToNMSIEInternal>(nmsCallback);
|
||||
pass_config->set_callback<ov::pass::ConvertMulticlassNmsToMulticlassNmsIE>(nmsCallback);
|
||||
pass_config->set_callback<ov::pass::ConvertMatrixNmsToMatrixNmsIE>(nmsCallback);
|
||||
}
|
||||
|
||||
// List of enabled/disabled transformations
|
||||
|
||||
// Allow FP16 Converts to be folded and FP16 constants to be upgraded to FP32 data type
|
||||
pass_config->disable<ov::pass::DisableDecompressionConvertConstantFolding>();
|
||||
pass_config->disable<ov::pass::ConvertCompressedOnlyToLegacy>();
|
||||
pass_config->disable<ov::pass::EyeDecomposition>();
|
||||
|
||||
pass_config->disable<ov::pass::ConvertGELU>();
|
||||
pass_config->disable<ov::pass::ConvertShuffleChannels3>();
|
||||
pass_config->disable<ov::pass::Gelu7Downgrade>();
|
||||
pass_config->disable<ov::pass::HSwishDecomposition>();
|
||||
pass_config->disable<ov::pass::ReduceL1Decomposition>();
|
||||
pass_config->disable<ov::pass::ReduceL2Decomposition>();
|
||||
pass_config->disable<ov::pass::SoftPlusDecomposition>();
|
||||
pass_config->disable<ov::pass::HSigmoidDecomposition>();
|
||||
pass_config->disable<ov::pass::ConvertMod>();
|
||||
pass_config->disable<ov::pass::ConvertShuffleChannels3>();
|
||||
pass_config->disable<ov::pass::WeightsDequantizeToFakeQuantize>();
|
||||
pass_config->disable<ov::pass::SimplifyCTCGreedyDecoderSeqLen>();
|
||||
pass_config->disable<ov::pass::ConvertGather7ToGather1>();
|
||||
pass_config->disable<ov::pass::ConvertGather8ToGather7>();
|
||||
pass_config->disable<ov::pass::ConvertMinimum>();
|
||||
pass_config->disable<ov::pass::ConvertBroadcastToTiles>();
|
||||
pass_config->disable<ov::pass::ConvertReduceMeanToPooling>();
|
||||
pass_config->disable<ov::pass::ConvertReduceMaxToPooling>();
|
||||
pass_config->disable<ov::pass::ConvertReduceSumToPooling>();
|
||||
pass_config->disable<ov::pass::SliceToStridedSlice>();
|
||||
pass_config->disable<ov::pass::ConvertDetectionOutput8ToDetectionOutput1>();
|
||||
pass_config->disable<ov::pass::ConvertROIAlign9To3>();
|
||||
pass_config->disable<ov::pass::SoftSignDecomposition>();
|
||||
pass_config->disable<ov::pass::UniqueDecomposition>();
|
||||
|
||||
pass_config->enable<ov::pass::NormalizeL2Decomposition>();
|
||||
pass_config->enable<ov::pass::ConvertInterpolate1ToInterpolate4>();
|
||||
pass_config->enable<ov::pass::ConvertGather1ToGather7>();
|
||||
pass_config->enable<ov::pass::ConvertDetectionOutput1ToDetectionOutput8>();
|
||||
pass_config->enable<ov::pass::ConvertROIAlign3To9>();
|
||||
|
||||
if (useLpt) {
|
||||
CPU_LPT_SCOPE(LowPrecisionTransformations_Part3);
|
||||
pass_config->set_callback<ov::pass::AddFakeQuantizeFusion,
|
||||
ov::pass::MulFakeQuantizeFusion,
|
||||
ov::pass::FakeQuantizeMulFusion>([](const_node_ptr &node) -> bool {
|
||||
std::string errMsg;
|
||||
return !node::FakeQuantize::isSupportedOperation(node, errMsg);
|
||||
});
|
||||
|
||||
pass_config->set_callback<ov::pass::ConvertQuantizeDequantize>([&defaultPrecisions](const_node_ptr &node) -> bool {
|
||||
return ngraph::pass::low_precision::NetworkHelper::areQuantizeAndDequantizeSupportedForMultiply(node, defaultPrecisions);
|
||||
});
|
||||
}
|
||||
|
||||
manager.run_passes(nGraphFunc);
|
||||
|
||||
using namespace ngraph::pass::low_precision;
|
||||
if (useLpt) {
|
||||
CPU_LPT_SCOPE(LowPrecisionTransformations_Part4);
|
||||
OV_ITT_SCOPE(FIRST_INFERENCE, itt::domains::intel_cpu_LT, "LowPrecisionTransformations");
|
||||
//Only enable conv/group conv signed input on AMX platform.
|
||||
std::vector<ngraph::element::Type> input0LowPrecisionList;
|
||||
if (dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx512_core_amx)) {
|
||||
input0LowPrecisionList = {ngraph::element::u8, ngraph::element::i8};
|
||||
} else {
|
||||
input0LowPrecisionList = {ngraph::element::u8};
|
||||
}
|
||||
auto supportedPrecisions = std::vector<PrecisionsRestriction>({
|
||||
PrecisionsRestriction::create<ngraph::opset1::Convolution>({
|
||||
{{0}, input0LowPrecisionList},
|
||||
{{1}, {ngraph::element::i8}},
|
||||
}),
|
||||
PrecisionsRestriction::create<ngraph::opset1::ConvolutionBackpropData>({
|
||||
{{0}, {ngraph::element::u8, ngraph::element::i8}},
|
||||
{{1}, {ngraph::element::i8}}
|
||||
}),
|
||||
PrecisionsRestriction::create<ngraph::opset1::GroupConvolution>({
|
||||
{{0}, input0LowPrecisionList},
|
||||
{{1}, {ngraph::element::i8}}
|
||||
}),
|
||||
PrecisionsRestriction::create<ngraph::opset1::Multiply>({
|
||||
{{0}, {ngraph::element::u8}},
|
||||
{{1}, {ngraph::element::i8}},
|
||||
}),
|
||||
PrecisionsRestriction::create<ngraph::opset1::MatMul>({
|
||||
{{0}, {ngraph::element::u8, ngraph::element::i8}},
|
||||
{{1}, {ngraph::element::i8}}
|
||||
}),
|
||||
PrecisionsRestriction::create<ngraph::opset5::LSTMSequence>({
|
||||
{{0, 1}, {ngraph::element::u8, ngraph::element::i8}},
|
||||
}),
|
||||
PrecisionsRestriction::create<ngraph::opset6::GRUSequence>({
|
||||
{{0, 1}, {ngraph::element::u8, ngraph::element::i8}},
|
||||
}),
|
||||
});
|
||||
|
||||
auto quantizationRestrictions = std::vector<QuantizationGranularityRestriction>({
|
||||
QuantizationGranularityRestriction::create<ngraph::opset1::Convolution>({0}),
|
||||
QuantizationGranularityRestriction::create<ngraph::opset1::ConvolutionBackpropData>({0})
|
||||
});
|
||||
|
||||
// for GNA networks reference execution
|
||||
bool updatePrecision = true;
|
||||
if (hasINT16orINT32Levels) {
|
||||
updatePrecision = false;
|
||||
supportedPrecisions = std::vector<PrecisionsRestriction>({});
|
||||
}
|
||||
|
||||
ov::pass::Manager lptManager;
|
||||
lptManager.register_pass<ngraph::pass::low_precision::LowPrecision>(
|
||||
supportedPrecisions,
|
||||
quantizationRestrictions,
|
||||
LayerTransformation::Params(updatePrecision, ngraph::element::f32, defaultPrecisions));
|
||||
lptManager.get_pass_config()->set_callback<ngraph::pass::low_precision::MarkupPrecisions>([](const_node_ptr& node) -> bool {
|
||||
if (const auto mulitply = std::dynamic_pointer_cast<const ngraph::opset1::Multiply>(node)) {
|
||||
return !MultiplyToGroupConvolutionTransformation::canBeTransformedToGroupConvolution(mulitply);
|
||||
}
|
||||
return false;
|
||||
});
|
||||
lptManager.get_pass_config()->set_callback<ngraph::pass::low_precision::ConvolutionBackpropDataTransformation>(
|
||||
[&defaultPrecisions](const_node_ptr& node) -> bool {
|
||||
return LayerTransformation::isAsymmetricQuantization(node, defaultPrecisions) ||
|
||||
WeightableLayerTransformation::isAsymmetricOnWeights(node, defaultPrecisions);
|
||||
});
|
||||
lptManager.get_pass_config()->set_callback<ngraph::pass::low_precision::MultiplyToGroupConvolutionTransformation>([](const_node_ptr& node) -> bool {
|
||||
return true;//MultiplyToGroupConvolutionTransformation::isDynamicOrScalar(node);
|
||||
});
|
||||
lptManager.run_passes(nGraphFunc);
|
||||
}
|
||||
|
||||
ov::pass::Manager postLPTPassManager;
|
||||
postLPTPassManager.register_pass<ov::pass::UnrollTensorIterator>();
|
||||
postLPTPassManager.register_pass<ov::pass::ReshapePRelu>();
|
||||
postLPTPassManager.get_pass_config()->set_callback<ov::pass::UnrollTensorIterator>([](const_node_ptr &node) -> bool {
|
||||
// UnrollTI transformation is disabled by default, is turned on by LowLatency transformation
|
||||
return node->get_rt_info().count("UNROLL_TI") == 0;
|
||||
});
|
||||
postLPTPassManager.register_pass<MoveEltwiseUpThroughDataMov>();
|
||||
postLPTPassManager.get_pass_config()->set_callback<MoveEltwiseUpThroughDataMov>([](const std::shared_ptr<const ngraph::Node>& node) -> bool {
|
||||
if (node->get_input_size() >= 2) {
|
||||
return node->get_input_element_type(1) == ngraph::element::i8 || node->get_input_element_type(1) == ngraph::element::u8;
|
||||
}
|
||||
return false;
|
||||
});
|
||||
|
||||
postLPTPassManager.register_pass<ov::pass::ConstantFolding>();
|
||||
|
||||
// Snippets may brake MHA patterns so the fusion has to performed before
|
||||
postLPTPassManager.register_pass<MHAFusion>();
|
||||
postLPTPassManager.register_pass<FuseFQtoInteraction>();
|
||||
postLPTPassManager.get_pass_config()->set_callback<MHAFloatFusion, MHAFloatFusion2,
|
||||
MHAQuantFusion, MHAQuantFusion2>([_enableBF16](const std::shared_ptr<const ov::Node>& n) -> bool {
|
||||
std::string errorMessage;
|
||||
|
||||
if (!node::MHA::isSupportedOperation(n, errorMessage))
|
||||
return true;
|
||||
|
||||
// Implementation calls AMX BF16 brgemm only for tensors with K and N aligned on 2, otherwise fallbacks on vector impl
|
||||
// Vector madd BF16 instruction on SPR has reduced performance on HW level, which results in overall perf degradation
|
||||
size_t bf16Factor = 2;
|
||||
if (dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx512_core_bf16_amx_bf16) &&
|
||||
(n->get_input_element_type(0) == element::bf16 || (n->get_input_element_type(0) == element::f32 && _enableBF16)) &&
|
||||
(n->get_input_shape(0)[3] % bf16Factor != 0 || n->get_input_shape(1)[1] % bf16Factor != 0 || n->get_input_shape(3)[3] % bf16Factor != 0)) {
|
||||
return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
});
|
||||
|
||||
// Execute before snippets. Otherwise FQ will be converted to Subgraph
|
||||
postLPTPassManager.register_pass<ConvertFqRnnToQuantizedRnn>();
|
||||
postLPTPassManager.run_passes(nGraphFunc);
|
||||
|
||||
if (_enableSnippets && dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx2)) {
|
||||
ov::pass::Manager snippetsManager;
|
||||
snippetsManager.register_pass<SnippetsMarkSkipped>();
|
||||
snippetsManager.register_pass<ngraph::snippets::pass::EnumerateNodes>();
|
||||
snippetsManager.register_pass<ngraph::snippets::pass::TokenizeSnippets>();
|
||||
snippetsManager.get_pass_config()->set_callback<ngraph::snippets::pass::TokenizeSnippets>(
|
||||
[](const std::shared_ptr<const ov::Node>& n) -> bool {
|
||||
// CPU Plugin support Swish in Subgraph via conversion to SwichCPU which assumes second input to be constant
|
||||
if (ov::is_type<const ov::op::v4::Swish>(n)) {
|
||||
if (n->inputs().size() > 1 && !ov::is_type<const ov::op::v0::Constant>(n->get_input_node_shared_ptr(1)))
|
||||
return true;
|
||||
}
|
||||
|
||||
const auto& inputs = n->inputs();
|
||||
// todo: clarify whether we can evaluate snippets on const paths
|
||||
const bool has_only_const_inputs = std::all_of(inputs.begin(), inputs.end(),
|
||||
[](const ov::Input<const ov::Node> &in) {
|
||||
return ov::is_type<ov::op::v0::Constant>(in.get_source_output().get_node_shared_ptr());
|
||||
});
|
||||
// todo: clarify whether we can evaluate snippets on inputs with larger ranks
|
||||
auto rank_is_too_large = [](const ov::descriptor::Tensor& t ) {
|
||||
// callback is called has_supported_in_out(), so it's safe to assume that the shapes are static
|
||||
return t.get_partial_shape().rank().get_length() > 6;
|
||||
};
|
||||
const bool bad_input_rank = std::any_of(inputs.begin(), inputs.end(),
|
||||
[&](const ov::Input<const ov::Node>& in) {return rank_is_too_large(in.get_tensor());});
|
||||
const auto& outputs = n->outputs();
|
||||
const bool bad_output_rank = std::any_of(outputs.begin(), outputs.end(),
|
||||
[&](const ov::Output<const ov::Node>& out) {return rank_is_too_large(out.get_tensor());});
|
||||
return has_only_const_inputs || bad_input_rank || bad_output_rank;
|
||||
});
|
||||
snippetsManager.register_pass<ngraph::snippets::pass::CommonOptimizations>();
|
||||
snippetsManager.run_passes(nGraphFunc);
|
||||
}
|
||||
|
||||
ov::pass::Manager postSnippetsManager;
|
||||
postSnippetsManager.register_pass<ov::pass::FakeQuantizeDecomposition>();
|
||||
postSnippetsManager.get_pass_config()->set_callback<ov::pass::FakeQuantizeDecomposition>([](const_node_ptr& node) -> bool {
|
||||
std::string errMsg;
|
||||
return node::FakeQuantize::isSupportedOperation(node, errMsg);
|
||||
});
|
||||
postSnippetsManager.register_pass<ov::pass::ConstantFolding>();
|
||||
postSnippetsManager.run_passes(nGraphFunc);
|
||||
}
|
||||
|
||||
static bool streamsSet(const std::map<std::string, std::string>& config) {
|
||||
return config.count(PluginConfigParams::KEY_CPU_THROUGHPUT_STREAMS) ||
|
||||
config.count(ov::num_streams.name());
|
||||
@ -883,7 +325,7 @@ Engine::LoadExeNetworkImpl(const InferenceEngine::CNNNetwork &network, const std
|
||||
const bool enableLPT = (lptProp != config.end() && lptProp->second == PluginConfigParams::YES) /* enabled in the orig_config*/
|
||||
|| Config::LPTransformsMode::On == engConfig.lpTransformsMode /* or already enabled for the plugin */;
|
||||
const auto& BF16Prop = config.find(InferenceEngine::PluginConfigParams::KEY_ENFORCE_BF16);
|
||||
bool enableBF16;
|
||||
bool enableBF16 = false;
|
||||
if (BF16Prop != config.end()) {
|
||||
if (BF16Prop->second == PluginConfigParams::YES) {
|
||||
enableBF16 = dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx512_core);
|
||||
@ -901,7 +343,8 @@ Engine::LoadExeNetworkImpl(const InferenceEngine::CNNNetwork &network, const std
|
||||
|
||||
DEBUG_LOG(PrintableModel(*nGraphFunc, "org_"));
|
||||
|
||||
TransformationUpToCPUSpecificOpSet(nGraphFunc, enableLPT, enableBF16, enableSnippets, isLegacyAPI());
|
||||
Transformations transformations(nGraphFunc, enableLPT, enableSnippets, enableBF16, isLegacyAPI(), engConfig);
|
||||
transformations.UpToCpuSpecificOpSet();
|
||||
|
||||
// need to check that all outputs have static shapes
|
||||
// checking that all inputs have static shapes is performed in the common part
|
||||
@ -914,8 +357,7 @@ Engine::LoadExeNetworkImpl(const InferenceEngine::CNNNetwork &network, const std
|
||||
}
|
||||
|
||||
ApplyPerformanceHints(config, nGraphFunc);
|
||||
|
||||
ConvertToCPUSpecificOpset(nGraphFunc);
|
||||
transformations.CpuSpecificOpSet();
|
||||
|
||||
DEBUG_LOG(PrintableModel(*nGraphFunc, "cpu_"));
|
||||
|
||||
@ -1154,8 +596,9 @@ QueryNetworkResult Engine::QueryNetwork(const CNNNetwork& network, const std::ma
|
||||
|
||||
auto supported = GetSupportedNodes(model,
|
||||
[&](std::shared_ptr<ov::Model>& model) {
|
||||
TransformationUpToCPUSpecificOpSet(model, enableLPT, conf.enforceBF16, enableSnippets, isLegacyAPI());
|
||||
ConvertToCPUSpecificOpset(model);
|
||||
Transformations transformation(model, enableLPT, enableSnippets, conf.enforceBF16, isLegacyAPI(), engConfig);
|
||||
transformation.UpToCpuSpecificOpSet();
|
||||
transformation.CpuSpecificOpSet();
|
||||
},
|
||||
[&](const std::shared_ptr<ngraph::Node>& op) {
|
||||
std::unique_ptr<Node> ptr;
|
||||
|
@ -4,16 +4,12 @@
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <cpp_interfaces/interface/ie_iplugin_internal.hpp>
|
||||
#include "exec_network.h"
|
||||
|
||||
#include <string>
|
||||
#include <map>
|
||||
#include <unordered_map>
|
||||
#include <memory>
|
||||
#include <functional>
|
||||
#include <vector>
|
||||
#include <cfloat>
|
||||
|
||||
namespace ov {
|
||||
namespace intel_cpu {
|
||||
|
609
src/plugins/intel_cpu/src/transformation_pipeline.cpp
Normal file
609
src/plugins/intel_cpu/src/transformation_pipeline.cpp
Normal file
@ -0,0 +1,609 @@
|
||||
// Copyright (C) 2022 Intel Corporation
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#include "transformation_pipeline.h"
|
||||
|
||||
// Operations
|
||||
#include "openvino/opsets/opset1.hpp"
|
||||
#include "openvino/opsets/opset2.hpp"
|
||||
#include "openvino/opsets/opset3.hpp"
|
||||
#include "openvino/opsets/opset4.hpp"
|
||||
#include "openvino/opsets/opset5.hpp"
|
||||
#include "openvino/opsets/opset6.hpp"
|
||||
#include "openvino/opsets/opset10.hpp"
|
||||
#include <ov_ops/augru_cell.hpp>
|
||||
#include <ov_ops/augru_sequence.hpp>
|
||||
|
||||
// Common transformations
|
||||
#include "transformations/common_optimizations/add_fake_quantize_fusion.hpp"
|
||||
#include "transformations/common_optimizations/convert_compression_only_to_legacy.hpp"
|
||||
#include "transformations/common_optimizations/convert_quantize_dequantize.hpp"
|
||||
#include "transformations/common_optimizations/fq_mul_fusion.hpp"
|
||||
#include "transformations/common_optimizations/mul_fake_quantize_fusion.hpp"
|
||||
#include "transformations/common_optimizations/nop_elimination.hpp"
|
||||
#include "transformations/common_optimizations/transpose_sinking.hpp"
|
||||
#include "transformations/common_optimizations/weights_dequantize_to_fake_quantize.hpp"
|
||||
#include "transformations/common_optimizations/augru_cell_fusion.hpp"
|
||||
#include "transformations/common_optimizations/common_optimizations.hpp"
|
||||
#include "transformations/common_optimizations/wrap_interpolate_into_transposes.hpp"
|
||||
#include "transformations/control_flow/unroll_tensor_iterator.hpp"
|
||||
#include "transformations/disable_decompression_convert_constant_folding.hpp"
|
||||
#include "transformations/op_conversions/convert_batch_to_space.hpp"
|
||||
#include "transformations/op_conversions/convert_broadcast_to_tiles.hpp"
|
||||
#include "transformations/op_conversions/convert_depth_to_space.hpp"
|
||||
#include "transformations/op_conversions/convert_gather_downgrade.hpp"
|
||||
#include "transformations/op_conversions/convert_gather_upgrade.hpp"
|
||||
#include "transformations/op_conversions/convert_gelu.hpp"
|
||||
#include "transformations/op_conversions/convert_interpolate1_to_interpolate4.hpp"
|
||||
#include "transformations/op_conversions/convert_matrix_nms_to_matrix_nms_ie.hpp"
|
||||
#include "transformations/op_conversions/convert_minimum_to_power_and_max.hpp"
|
||||
#include "transformations/op_conversions/convert_mod.hpp"
|
||||
#include "transformations/op_conversions/convert_multiclass_nms_to_multiclass_nms_ie.hpp"
|
||||
#include "transformations/op_conversions/convert_nms9_to_nms_ie_internal.hpp"
|
||||
#include "transformations/op_conversions/convert_previous_nms_to_nms_9.hpp"
|
||||
#include "transformations/op_conversions/convert_reduce_to_pooling.hpp"
|
||||
#include "transformations/op_conversions/convert_roi_align_v3_to_v9.hpp"
|
||||
#include "transformations/op_conversions/convert_roi_align_v9_to_v3.hpp"
|
||||
#include "transformations/op_conversions/convert_sequences_to_tensor_iterator.hpp"
|
||||
#include "transformations/op_conversions/convert_shuffle_channels3.hpp"
|
||||
#include "transformations/op_conversions/convert_slice_to_strided_slice.hpp"
|
||||
#include "transformations/op_conversions/convert_space_to_batch.hpp"
|
||||
#include "transformations/op_conversions/convert_space_to_depth.hpp"
|
||||
#include "transformations/op_conversions/convert_subtract.hpp"
|
||||
#include "transformations/op_conversions/convert_ti_to_sequences.hpp"
|
||||
#include "transformations/op_conversions/detection_output_downgrade.hpp"
|
||||
#include "transformations/op_conversions/detection_output_upgrade.hpp"
|
||||
#include "transformations/op_conversions/eye_decomposition.hpp"
|
||||
#include "transformations/op_conversions/fq_decomposition.hpp"
|
||||
#include "transformations/op_conversions/gelu7_downgrade.hpp"
|
||||
#include "transformations/op_conversions/hsigmoid_decomposition.hpp"
|
||||
#include "transformations/op_conversions/hswish_decomposition.hpp"
|
||||
#include "transformations/op_conversions/gru_cell_decomposition.hpp"
|
||||
#include "transformations/op_conversions/lstm_cell_decomposition.hpp"
|
||||
#include "transformations/op_conversions/mvn6_decomposition.hpp"
|
||||
#include "transformations/op_conversions/normalize_l2_decomposition.hpp"
|
||||
#include "transformations/op_conversions/reduce_l1_decomposition.hpp"
|
||||
#include "transformations/op_conversions/reduce_l2_decomposition.hpp"
|
||||
#include "transformations/op_conversions/rnn_cell_decomposition.hpp"
|
||||
#include "transformations/op_conversions/simplify_ctc_greedy_decoder_seq_len.hpp"
|
||||
#include "transformations/op_conversions/softplus_decomposition.hpp"
|
||||
#include "transformations/op_conversions/softsign_decomposition.hpp"
|
||||
#include "transformations/op_conversions/softmax_decomposition.hpp"
|
||||
#include "transformations/op_conversions/unique_decomposition.hpp"
|
||||
#include "transformations/opset_conversions/convert_opset2_to_opset1.hpp"
|
||||
#include "transformations/opset_conversions/convert_opset3_to_opset2.hpp"
|
||||
#include "transformations/smart_reshape/matmul_sr.hpp"
|
||||
#include "transformations/init_node_info.hpp"
|
||||
#include "utils/ngraph_transformation.hpp"
|
||||
|
||||
// LPT transformations
|
||||
#include "transformations/low_precision/mark_dequantization_subgraph.hpp"
|
||||
#include "low_precision/convolution_backprop_data.hpp"
|
||||
#include "low_precision/convert_subtract_constant.hpp"
|
||||
#include "low_precision/network_helper.hpp"
|
||||
#include "low_precision/multiply_to_group_convolution.hpp"
|
||||
#include "low_precision/group_convolution.hpp"
|
||||
|
||||
// CPU specific transformations
|
||||
#include "ngraph_transformations/convert_to_cpu_specific_opset.hpp"
|
||||
#include "ngraph_transformations/snippets_mark_skipped.hpp"
|
||||
#include "ngraph_transformations/mha_fusion.hpp"
|
||||
#include "ngraph_transformations/convert_to_interaction.hpp"
|
||||
#include "ngraph_transformations/convert_fq_rnn_to_quantized_rnn.hpp"
|
||||
#include "ngraph_transformations/move_eltwise_up_data_movement.hpp"
|
||||
#include "ngraph_transformations/swap_convert_transpose.hpp"
|
||||
|
||||
// Snippets
|
||||
#include "snippets/pass/collapse_subgraph.hpp"
|
||||
#include "snippets/pass/common_optimizations.hpp"
|
||||
|
||||
// Misc
|
||||
#include "nodes/mvn.h"
|
||||
#include "nodes/normalize.h"
|
||||
#include "nodes/fake_quantize.h"
|
||||
#include "nodes/mha.h"
|
||||
|
||||
#include "dnnl.hpp"
|
||||
#include <cpu/x64/cpu_isa_traits.hpp>
|
||||
|
||||
namespace ov {
|
||||
namespace intel_cpu {
|
||||
|
||||
using const_node_ptr = const std::shared_ptr<const ov::Node>;
|
||||
|
||||
bool Transformations::fuse_type_to_convert(const std::shared_ptr<ngraph::Node>& node, ov::element::Type to, size_t idx) {
|
||||
if (auto convert = ov::as_type_ptr<ov::opset10::Convert>(node)) {
|
||||
// For Convert node, converting precision from floating point to boolean will lead to mathematical
|
||||
// error, because here the output precision boolean is replaced by u8. E.g. floating point value 0.01
|
||||
// is converted to be 1 for boolean, but 0 for u8. Thus an Abs and Ceil node should be added before the
|
||||
// Convert node for this scenario.
|
||||
if (convert->input(0).get_element_type().is_real() &&
|
||||
convert->get_convert_element_type() == ngraph::element::boolean && to.is_integral_number()) {
|
||||
auto abs = std::make_shared<ov::opset10::Abs>(convert->input_value(0).get_node_shared_ptr());
|
||||
auto ceil = std::make_shared<ov::opset10::Ceiling>(abs);
|
||||
auto new_convert = std::make_shared<ov::opset10::Convert>(ceil, to);
|
||||
new_convert->set_friendly_name(convert->get_friendly_name());
|
||||
ov::copy_runtime_info(convert, {abs, ceil, new_convert});
|
||||
ov::replace_node(convert, new_convert);
|
||||
return true;
|
||||
} else {
|
||||
convert->set_convert_element_type(to);
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
void Transformations::UpToCpuSpecificOpSet() {
|
||||
const bool useLpt = enableLpt &&
|
||||
ngraph::pass::low_precision::LowPrecision::isFunctionQuantized(model) &&
|
||||
CPU_DEBUG_CAP_IS_TRANSFORMATION_ENABLED(config.debugCaps, Lpt);
|
||||
|
||||
const bool useSnippets = enableSnippets &&
|
||||
CPU_DEBUG_CAP_IS_TRANSFORMATION_ENABLED(config.debugCaps, Snippets);
|
||||
|
||||
auto defaultPrecisions = useLpt ? ngraph::pass::low_precision::precision_set::int8_support : std::vector<ov::element::Type>{};
|
||||
bool hasINT16orINT32Levels = false;
|
||||
|
||||
if (useLpt) {
|
||||
CPU_LPT_SCOPE(LowPrecisionTransformations_Part1);
|
||||
hasINT16orINT32Levels = ngraph::pass::low_precision::LowPrecision::isFQLevelsPresent(
|
||||
model,
|
||||
{ngraph::pass::low_precision::levels::int16, ngraph::pass::low_precision::levels::int16_narrow_range,
|
||||
ngraph::pass::low_precision::levels::int32, ngraph::pass::low_precision::levels::int32_narrow_range});
|
||||
if (hasINT16orINT32Levels) {
|
||||
defaultPrecisions = ngraph::pass::low_precision::precision_set::int8_int16_int32_support;
|
||||
}
|
||||
}
|
||||
|
||||
PreLpt(defaultPrecisions, isLegacyApi);
|
||||
|
||||
if (useLpt)
|
||||
Lpt(hasINT16orINT32Levels, defaultPrecisions);
|
||||
|
||||
PostLpt();
|
||||
|
||||
if (useSnippets)
|
||||
Snippets();
|
||||
}
|
||||
|
||||
void Transformations::CpuSpecificOpSet(void) {
|
||||
CPU_DEBUG_CAP_TRANSFORMATION_SCOPE(this, Specific);
|
||||
|
||||
ConvertToCPUSpecificOpset(model);
|
||||
}
|
||||
|
||||
void Transformations::PreLpt(const std::vector<ov::element::Type>& defaultPrecisions, const bool isLegacyApi) {
|
||||
CPU_DEBUG_CAP_TRANSFORMATION_SCOPE(this, PreLpt);
|
||||
|
||||
ov::pass::Manager manager;
|
||||
manager.set_per_pass_validation(false);
|
||||
manager.register_pass<ov::pass::InitNodeInfo>();
|
||||
|
||||
const bool useLpt = !defaultPrecisions.empty();
|
||||
if (useLpt) {
|
||||
manager.register_pass<ov::pass::MarkDequantizationSubgraph>(defaultPrecisions);
|
||||
}
|
||||
|
||||
auto get_convert_precisions = []() {
|
||||
precisions_array array = {
|
||||
{ov::element::i64, ov::element::i32},
|
||||
{ov::element::u64, ov::element::i32},
|
||||
{ov::element::i16, ov::element::i32},
|
||||
{ov::element::u16, ov::element::i32},
|
||||
{ov::element::u32, ov::element::i32},
|
||||
{ov::element::f64, ov::element::f32},
|
||||
{ov::element::f16, ov::element::f32},
|
||||
{ov::element::boolean, ov::element::u8},
|
||||
{ov::element::i4, ov::element::i8},
|
||||
{ov::element::u4, ov::element::u8}
|
||||
};
|
||||
|
||||
if (!dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx512_core))
|
||||
array.push_back({ov::element::bf16, ov::element::f32});
|
||||
|
||||
return array;
|
||||
};
|
||||
static const auto precisions = get_convert_precisions();
|
||||
type_to_fuse_map type_to_fuse = {{ov::opset10::Convert::get_type_info_static(), fuse_type_to_convert}};
|
||||
|
||||
manager.register_pass<ov::pass::AUGRUCellFusion>();
|
||||
manager.register_pass<ov::pass::CommonOptimizations>();
|
||||
manager.register_pass<ov::pass::WrapInterpolateIntoTransposes>();
|
||||
manager.register_pass<ov::pass::TransposeSinking>();
|
||||
manager.register_pass<ov::pass::ConvertSequenceToTensorIterator>();
|
||||
manager.register_pass<ov::pass::ConvertOpSet3ToOpSet2>();
|
||||
manager.register_pass<ov::pass::ConvertOpSet2ToOpSet1>();
|
||||
manager.register_pass<ov::pass::LSTMCellDecomposition>();
|
||||
manager.register_pass<ov::pass::GRUCellDecomposition>();
|
||||
manager.register_pass<ov::pass::RNNCellDecomposition>();
|
||||
manager.register_pass<ov::pass::ConvertNMS1ToNMS9>();
|
||||
manager.register_pass<ov::pass::ConvertNMS3ToNMS9>();
|
||||
manager.register_pass<ov::pass::ConvertNMS4ToNMS9>();
|
||||
manager.register_pass<ov::pass::ConvertNMS5ToNMS9>();
|
||||
manager.register_pass<ov::pass::ConvertNMS9ToNMSIEInternal>();
|
||||
manager.register_pass<ov::pass::ConvertMulticlassNmsToMulticlassNmsIE>();
|
||||
manager.register_pass<ov::pass::ConvertMatrixNmsToMatrixNmsIE>();
|
||||
manager.register_pass<ov::pass::TransposeMatMul>();
|
||||
manager.register_pass<ov::pass::ConstantFolding>();
|
||||
|
||||
if (useLpt) {
|
||||
CPU_LPT_SCOPE(LowPrecisionTransformations_Part2);
|
||||
manager.register_pass<ngraph::pass::low_precision::ConvertSubtractConstant>(defaultPrecisions);
|
||||
}
|
||||
manager.register_pass<ov::pass::Validate>();
|
||||
manager.register_pass<ov::pass::ConvertPrecision>(precisions, type_to_fuse);
|
||||
manager.register_pass<ov::pass::EliminateConvert>();
|
||||
manager.register_pass<SwapConvertTranspose>();
|
||||
manager.register_pass<ConvertToInteraction>();
|
||||
manager.register_pass<ConvertInteractionInt8>();
|
||||
|
||||
auto pass_config = manager.get_pass_config();
|
||||
|
||||
// SpaceToDepth/ DepthToSpace node implementation supports only equal input/output tensors with rank <= 5
|
||||
pass_config->set_callback<ov::pass::ConvertSpaceToDepth,
|
||||
ov::pass::ConvertDepthToSpace>(
|
||||
[](const_node_ptr &node) -> bool {
|
||||
return node->input_value(0).get_shape().size() <= 5lu &&
|
||||
node->input_value(0).get_shape().size() == node->get_output_shape(0).size();
|
||||
});
|
||||
|
||||
pass_config->set_callback<ov::pass::ConvertBatchToSpace,
|
||||
ov::pass::ConvertSpaceToBatch>(
|
||||
[](const_node_ptr &node) -> bool {
|
||||
const auto & rank = node->input(0).get_partial_shape().rank().get_length();
|
||||
return rank == 4lu || rank == 5lu;
|
||||
});
|
||||
|
||||
auto isCellPrimitiveSupported = [](const_node_ptr &node) -> bool {
|
||||
if (const auto &rnn_cell = std::dynamic_pointer_cast<const ov::opset4::RNNCell>(node)) {
|
||||
return rnn_cell->get_clip() == 0.0f;
|
||||
} else if (const auto &gru_cell = std::dynamic_pointer_cast<const ov::opset4::GRUCell>(
|
||||
node)) {
|
||||
return gru_cell->get_clip() == 0.0f
|
||||
&& gru_cell->get_activations() == std::vector<std::string>{"sigmoid", "tanh"};
|
||||
} else if (const auto &augru_cell = std::dynamic_pointer_cast<const ov::op::internal::AUGRUCell>(
|
||||
node)) {
|
||||
return augru_cell->get_clip() == 0.0f
|
||||
&& augru_cell->get_activations() == std::vector<std::string>{"sigmoid", "tanh"};
|
||||
} else if (const auto &lstm_cell = std::dynamic_pointer_cast<const ov::opset4::LSTMCell>(
|
||||
node)) {
|
||||
return lstm_cell->get_clip() == 0.0f &&
|
||||
lstm_cell->get_activations() == std::vector<std::string>{"sigmoid", "tanh", "tanh"};
|
||||
} else if (const auto &lstm_cell_v1 = std::dynamic_pointer_cast<const ov::opset1::LSTMCell>(
|
||||
node)) {
|
||||
return lstm_cell_v1->get_clip() == 0.0f &&
|
||||
lstm_cell_v1->get_activations() == std::vector<std::string>{"sigmoid", "tanh", "tanh"};
|
||||
}
|
||||
return false;
|
||||
};
|
||||
|
||||
// Sequences supported by the plugin shouldn't be converted to TensorIterator.
|
||||
// sequence_length input is not supported in all Sequences, so if is_seq_len_provided() == true, we
|
||||
// should always convert to TensorIterator.
|
||||
// RNN/GRU/LSTM Sequences are supported with clip == 0, and with default activations.
|
||||
auto isSequencePrimitiveSupported = [](const_node_ptr &node) -> bool {
|
||||
const auto& data = node->input(0);
|
||||
const auto& data_pshape = data.get_partial_shape();
|
||||
// WA: dynamic shapes make impossible to check seq_len due to shapeOf subgraphs
|
||||
// but the sequence is still supported in CPU and doesn't need to be decomposed
|
||||
if (data_pshape.is_dynamic())
|
||||
return true;
|
||||
if (data_pshape.rank().is_static() && data_pshape.rank().get_length() > 1 && !data_pshape[1].is_static())
|
||||
return false;
|
||||
auto max_seq_len = data.get_shape().at(1);
|
||||
if (const auto &rnn_seq = std::dynamic_pointer_cast<const ov::opset6::RNNSequence>(node)) {
|
||||
return rnn_seq->get_clip() == 0.0f &&
|
||||
!ov::op::util::is_seq_len_provided(rnn_seq->get_input_node_shared_ptr(2),
|
||||
max_seq_len);
|
||||
} else if (const auto &gru_seq = std::dynamic_pointer_cast<const ov::opset6::GRUSequence>(
|
||||
node)) {
|
||||
return gru_seq->get_clip() == 0.0f &&
|
||||
gru_seq->get_activations() == std::vector<std::string>{"sigmoid", "tanh"} &&
|
||||
!ov::op::util::is_seq_len_provided(gru_seq->get_input_node_shared_ptr(2),
|
||||
max_seq_len);
|
||||
} else if (const auto &augru_seq = std::dynamic_pointer_cast<const ov::op::internal::AUGRUSequence>(
|
||||
node)) {
|
||||
return augru_seq->get_clip() == 0.0f &&
|
||||
augru_seq->get_activations() == std::vector<std::string>{"sigmoid", "tanh"} &&
|
||||
!ov::op::util::is_seq_len_provided(augru_seq->get_input_node_shared_ptr(2),
|
||||
max_seq_len);
|
||||
} else if (const auto &lstm_seq = std::dynamic_pointer_cast<const ov::opset6::LSTMSequence>(
|
||||
node)) {
|
||||
return lstm_seq->get_clip() == 0.0f &&
|
||||
lstm_seq->get_activations() == std::vector<std::string>{"sigmoid", "tanh", "tanh"} &&
|
||||
!ov::op::util::is_seq_len_provided(lstm_seq->get_input_node_shared_ptr(3),
|
||||
max_seq_len);
|
||||
}
|
||||
return false;
|
||||
};
|
||||
|
||||
pass_config->set_callback<ov::pass::ConvertRNNSequenceToTensorIterator,
|
||||
ov::pass::ConvertGRUSequenceToTensorIterator,
|
||||
ov::pass::ConvertLSTMSequenceToTensorIterator>(
|
||||
[isSequencePrimitiveSupported](const_node_ptr &node) -> bool {
|
||||
return isSequencePrimitiveSupported(node);
|
||||
});
|
||||
|
||||
pass_config->set_callback<ov::pass::RNNCellDecomposition, ov::pass::GRUCellDecomposition,
|
||||
ov::pass::LSTMCellDecomposition>(
|
||||
[isCellPrimitiveSupported](const_node_ptr &node) -> bool {
|
||||
return isCellPrimitiveSupported(node);
|
||||
});
|
||||
|
||||
pass_config->set_callback<ov::pass::MVN6Decomposition>(
|
||||
[](const_node_ptr &node) -> bool {
|
||||
std::string errorMessage;
|
||||
return node::MVN::isSupportedOperation(node, errorMessage);
|
||||
});
|
||||
|
||||
pass_config->set_callback<ov::pass::NormalizeL2Decomposition>(
|
||||
[](const_node_ptr &node) -> bool {
|
||||
std::string errorMsg;
|
||||
return node::NormalizeL2::isSupportedOperation(node, errorMsg);
|
||||
});
|
||||
|
||||
pass_config->enable<ngraph::pass::SoftmaxDecomposition>();
|
||||
pass_config->set_callback<ngraph::pass::SoftmaxDecomposition>(
|
||||
[](const_node_ptr &node) -> bool {
|
||||
return node->input_value(0).get_partial_shape().rank().get_length() <= 5;
|
||||
});
|
||||
|
||||
if (!isLegacyApi) {
|
||||
auto nmsCallback = [](const_node_ptr &node) -> bool {
|
||||
for (size_t i = 0; i < node->get_output_size(); i++) {
|
||||
const auto outputs = node->get_output_target_inputs(i);
|
||||
for (const auto &out : outputs) {
|
||||
if (!ov::op::util::is_output(out.get_node())) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
return true;
|
||||
};
|
||||
|
||||
pass_config->set_callback<ov::pass::ConvertNMS9ToNMSIEInternal>(nmsCallback);
|
||||
pass_config->set_callback<ov::pass::ConvertMulticlassNmsToMulticlassNmsIE>(nmsCallback);
|
||||
pass_config->set_callback<ov::pass::ConvertMatrixNmsToMatrixNmsIE>(nmsCallback);
|
||||
}
|
||||
|
||||
// List of enabled/disabled transformations
|
||||
|
||||
// Allow FP16 Converts to be folded and FP16 constants to be upgraded to FP32 data type
|
||||
pass_config->disable<ov::pass::DisableDecompressionConvertConstantFolding>();
|
||||
pass_config->disable<ov::pass::ConvertCompressedOnlyToLegacy>();
|
||||
pass_config->disable<ov::pass::EyeDecomposition>();
|
||||
|
||||
pass_config->disable<ov::pass::ConvertGELU>();
|
||||
pass_config->disable<ov::pass::ConvertShuffleChannels3>();
|
||||
pass_config->disable<ov::pass::Gelu7Downgrade>();
|
||||
pass_config->disable<ov::pass::HSwishDecomposition>();
|
||||
pass_config->disable<ov::pass::ReduceL1Decomposition>();
|
||||
pass_config->disable<ov::pass::ReduceL2Decomposition>();
|
||||
pass_config->disable<ov::pass::SoftPlusDecomposition>();
|
||||
pass_config->disable<ov::pass::HSigmoidDecomposition>();
|
||||
pass_config->disable<ov::pass::ConvertMod>();
|
||||
pass_config->disable<ov::pass::ConvertShuffleChannels3>();
|
||||
pass_config->disable<ov::pass::WeightsDequantizeToFakeQuantize>();
|
||||
pass_config->disable<ov::pass::SimplifyCTCGreedyDecoderSeqLen>();
|
||||
pass_config->disable<ov::pass::ConvertGather7ToGather1>();
|
||||
pass_config->disable<ov::pass::ConvertGather8ToGather7>();
|
||||
pass_config->disable<ov::pass::ConvertMinimum>();
|
||||
pass_config->disable<ov::pass::ConvertBroadcastToTiles>();
|
||||
pass_config->disable<ov::pass::ConvertReduceMeanToPooling>();
|
||||
pass_config->disable<ov::pass::ConvertReduceMaxToPooling>();
|
||||
pass_config->disable<ov::pass::ConvertReduceSumToPooling>();
|
||||
pass_config->disable<ov::pass::SliceToStridedSlice>();
|
||||
pass_config->disable<ov::pass::ConvertDetectionOutput8ToDetectionOutput1>();
|
||||
pass_config->disable<ov::pass::ConvertROIAlign9To3>();
|
||||
pass_config->disable<ov::pass::SoftSignDecomposition>();
|
||||
pass_config->disable<ov::pass::UniqueDecomposition>();
|
||||
|
||||
pass_config->enable<ov::pass::NormalizeL2Decomposition>();
|
||||
pass_config->enable<ov::pass::ConvertInterpolate1ToInterpolate4>();
|
||||
pass_config->enable<ov::pass::ConvertGather1ToGather7>();
|
||||
pass_config->enable<ov::pass::ConvertDetectionOutput1ToDetectionOutput8>();
|
||||
pass_config->enable<ov::pass::ConvertROIAlign3To9>();
|
||||
|
||||
if (useLpt) {
|
||||
CPU_LPT_SCOPE(LowPrecisionTransformations_Part3);
|
||||
pass_config->set_callback<ov::pass::AddFakeQuantizeFusion,
|
||||
ov::pass::MulFakeQuantizeFusion,
|
||||
ov::pass::FakeQuantizeMulFusion>(
|
||||
[](const_node_ptr &node) -> bool {
|
||||
std::string errMsg;
|
||||
return !node::FakeQuantize::isSupportedOperation(node, errMsg);
|
||||
});
|
||||
|
||||
pass_config->set_callback<ov::pass::ConvertQuantizeDequantize>([&defaultPrecisions](const_node_ptr &node) -> bool {
|
||||
return ngraph::pass::low_precision::NetworkHelper::areQuantizeAndDequantizeSupportedForMultiply(node, defaultPrecisions);
|
||||
});
|
||||
}
|
||||
|
||||
manager.run_passes(model);
|
||||
}
|
||||
|
||||
/**
 * @brief Runs the Low Precision Transformations (LPT) pipeline on the model.
 *
 * Builds per-operation precision restrictions and per-channel quantization
 * restrictions, then registers and runs the single LowPrecision mega-pass.
 *
 * @param hasINT16orINT32Levels when true, precision updates are disabled and the
 *        per-op precision restrictions are dropped (used for GNA networks
 *        reference execution — see comment below)
 * @param defaultPrecisions precisions passed through to LayerTransformation::Params
 *        and to the asymmetric-quantization checks
 */
void Transformations::Lpt(const bool hasINT16orINT32Levels, const std::vector<ov::element::Type>& defaultPrecisions) {
    CPU_DEBUG_CAP_TRANSFORMATION_SCOPE(this, Lpt);

    using namespace ngraph::pass::low_precision;
    CPU_LPT_SCOPE(LowPrecisionTransformations_Part4);
    OV_ITT_SCOPE(FIRST_INFERENCE, itt::domains::intel_cpu_LT, "LowPrecisionTransformations");
    // Only enable conv/group conv signed input on AMX platform.
    std::vector<ov::element::Type> input0LowPrecisionList;
    if (dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx512_core_amx)) {
        input0LowPrecisionList = {ov::element::u8, ov::element::i8};
    } else {
        input0LowPrecisionList = {ov::element::u8};
    }
    // Per-operation map of which low precisions LPT may keep on which input ports.
    auto supportedPrecisions = std::vector<PrecisionsRestriction>({
        PrecisionsRestriction::create<ov::opset1::Convolution>({
            {{0}, input0LowPrecisionList},
            {{1}, {ov::element::i8}},
        }),
        PrecisionsRestriction::create<ov::opset1::ConvolutionBackpropData>({
            {{0}, {ov::element::u8, ov::element::i8}},
            {{1}, {ov::element::i8}}
        }),
        PrecisionsRestriction::create<ov::opset1::GroupConvolution>({
            {{0}, input0LowPrecisionList},
            {{1}, {ov::element::i8}}
        }),
        PrecisionsRestriction::create<ov::opset1::Multiply>({
            {{0}, {ov::element::u8}},
            {{1}, {ov::element::i8}},
        }),
        PrecisionsRestriction::create<ov::opset1::MatMul>({
            {{0}, {ov::element::u8, ov::element::i8}},
            {{1}, {ov::element::i8}}
        }),
        PrecisionsRestriction::create<ov::opset5::LSTMSequence>({
            {{0, 1}, {ov::element::u8, ov::element::i8}},
        }),
        PrecisionsRestriction::create<ov::opset6::GRUSequence>({
            {{0, 1}, {ov::element::u8, ov::element::i8}},
        }),
    });

    // Quantization granularity restrictions on input port 0 of (de)convolutions.
    auto quantizationRestrictions = std::vector<QuantizationGranularityRestriction>({
        QuantizationGranularityRestriction::create<ov::opset1::Convolution>({0}),
        QuantizationGranularityRestriction::create<ov::opset1::ConvolutionBackpropData>({0})
    });

    // for GNA networks reference execution
    bool updatePrecision = true;
    if (hasINT16orINT32Levels) {
        updatePrecision = false;
        supportedPrecisions = std::vector<PrecisionsRestriction>({});
    }

    ov::pass::Manager lptManager;
    lptManager.register_pass<ngraph::pass::low_precision::LowPrecision>(
        supportedPrecisions,
        quantizationRestrictions,
        LayerTransformation::Params(updatePrecision, ov::element::f32, defaultPrecisions));
    // Keep Multiply ops that cannot be turned into a GroupConvolution out of precision markup.
    lptManager.get_pass_config()->set_callback<ngraph::pass::low_precision::MarkupPrecisions>([](const_node_ptr& node) -> bool {
        if (const auto mulitply = std::dynamic_pointer_cast<const ov::opset1::Multiply>(node)) {
            return !MultiplyToGroupConvolutionTransformation::canBeTransformedToGroupConvolution(mulitply);
        }
        return false;
    });
    // Skip ConvolutionBackpropData transformation for asymmetric quantization
    // (on activations or on weights).
    lptManager.get_pass_config()->set_callback<ngraph::pass::low_precision::ConvolutionBackpropDataTransformation>(
        [&defaultPrecisions](const_node_ptr& node) -> bool {
            return LayerTransformation::isAsymmetricQuantization(node, defaultPrecisions) ||
                   WeightableLayerTransformation::isAsymmetricOnWeights(node, defaultPrecisions);
        });

    lptManager.get_pass_config()->disable<ngraph::pass::low_precision::MultiplyToGroupConvolutionTransformation>();

    lptManager.run_passes(model);
}
|
||||
|
||||
/**
 * @brief Transformations executed after the LPT pipeline and before snippets
 * tokenization: TensorIterator unrolling, eltwise movement, MHA fusion and
 * FakeQuantize-to-interaction fusion.
 */
void Transformations::PostLpt() {
    CPU_DEBUG_CAP_TRANSFORMATION_SCOPE(this, PostLpt);

    ov::pass::Manager postLPTPassManager;
    postLPTPassManager.register_pass<ov::pass::UnrollTensorIterator>();
    postLPTPassManager.register_pass<ov::pass::ReshapePRelu>();
    postLPTPassManager.get_pass_config()->set_callback<ov::pass::UnrollTensorIterator>([](const_node_ptr &node) -> bool {
        // UnrollTI transformation is disabled by default, is turned on by LowLatency transformation
        return node->get_rt_info().count("UNROLL_TI") == 0;
    });
    postLPTPassManager.register_pass<MoveEltwiseUpThroughDataMov>();
    // Do not move eltwise up when its second input is already quantized (i8/u8).
    postLPTPassManager.get_pass_config()->set_callback<MoveEltwiseUpThroughDataMov>([](const std::shared_ptr<const ov::Node>& node) -> bool {
        if (node->get_input_size() >= 2) {
            return node->get_input_element_type(1) == ov::element::i8 || node->get_input_element_type(1) == ov::element::u8;
        }
        return false;
    });

    postLPTPassManager.register_pass<ov::pass::ConstantFolding>();

    // Snippets may break MHA patterns so the fusion has to be performed before
    postLPTPassManager.register_pass<MHAFusion>();
    postLPTPassManager.register_pass<FuseFQtoInteraction>();
    // Callback returning true disables the matched MHA fusion for that node.
    postLPTPassManager.get_pass_config()->set_callback<MHAFloatFusion, MHAFloatFusion2,
                                                       MHAQuantFusion, MHAQuantFusion2>
        ([this](const std::shared_ptr<const ov::Node>& n) -> bool {
            std::string errorMessage;

            // Skip fusion if the CPU MHA node cannot execute this subgraph.
            if (!node::MHA::isSupportedOperation(n, errorMessage))
                return true;

            // Implementation calls AMX BF16 brgemm only for tensors with K and N aligned on 2, otherwise fallbacks on vector impl
            // Vector madd BF16 instruction on SPR has reduced performance on HW level, which results in overall perf degradation
            size_t bf16Factor = 2;
            if (dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx512_core_bf16_amx_bf16) &&
                (n->get_input_element_type(0) == element::bf16 || (n->get_input_element_type(0) == element::f32 && enableBF16)) &&
                (n->get_input_shape(0)[3] % bf16Factor != 0 || n->get_input_shape(1)[1] % bf16Factor != 0 || n->get_input_shape(3)[3] % bf16Factor != 0)) {
                return true;
            }

            return false;
        });

    // Execute before snippets. Otherwise FQ will be converted to Subgraph
    postLPTPassManager.register_pass<ConvertFqRnnToQuantizedRnn>();
    postLPTPassManager.run_passes(model);
}
|
||||
|
||||
/**
 * @brief Tokenizes suitable subgraphs into Snippets (Subgraph ops).
 *
 * No-op unless snippets are enabled and the CPU supports at least AVX2.
 * The TokenizeSnippets callback returns true to EXCLUDE a node from
 * tokenization (unsupported Swish form, constant-only inputs, or rank > 6).
 */
void Transformations::MainSnippets(void) {
    if (!enableSnippets ||
        !dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx2)) // snippets are implemented only for relevant platforms (avx2+ extensions)
        return;

    ov::pass::Manager snippetsManager;
    snippetsManager.register_pass<SnippetsMarkSkipped>();
    snippetsManager.register_pass<ngraph::snippets::pass::EnumerateNodes>();
    snippetsManager.register_pass<ngraph::snippets::pass::TokenizeSnippets>();
    snippetsManager.get_pass_config()->set_callback<ngraph::snippets::pass::TokenizeSnippets>(
        [](const std::shared_ptr<const ov::Node>& n) -> bool {
            // CPU Plugin supports Swish in Subgraph via conversion to SwishCPU which assumes second input to be constant
            if (ov::is_type<const ov::op::v4::Swish>(n)) {
                if (n->inputs().size() > 1 && !ov::is_type<const ov::op::v0::Constant>(n->get_input_node_shared_ptr(1)))
                    return true;
            }

            const auto& inputs = n->inputs();
            // todo: clarify whether we can evaluate snippets on const paths
            const bool has_only_const_inputs = std::all_of(inputs.begin(), inputs.end(),
                        [](const ov::Input<const ov::Node> &in) {
                            return ov::is_type<ov::op::v0::Constant>(in.get_source_output().get_node_shared_ptr());
                        });
            // todo: clarify whether we can evaluate snippets on inputs with larger ranks
            auto rank_is_too_large = [](const ov::descriptor::Tensor& t ) {
                // callback is called after has_supported_in_out(), so it's safe to assume that the shapes are static
                return t.get_partial_shape().rank().get_length() > 6;
            };
            const bool bad_input_rank = std::any_of(inputs.begin(), inputs.end(),
                                                    [&](const ov::Input<const ov::Node>& in) {return rank_is_too_large(in.get_tensor());});
            const auto& outputs = n->outputs();
            const bool bad_output_rank = std::any_of(outputs.begin(), outputs.end(),
                                                     [&](const ov::Output<const ov::Node>& out) {return rank_is_too_large(out.get_tensor());});
            return has_only_const_inputs || bad_input_rank || bad_output_rank;
        });
    snippetsManager.register_pass<ngraph::snippets::pass::CommonOptimizations>();
    snippetsManager.run_passes(model);
}
|
||||
|
||||
void Transformations::PostSnippets(void) {
|
||||
ov::pass::Manager postSnippetsManager;
|
||||
postSnippetsManager.register_pass<ov::pass::FakeQuantizeDecomposition>();
|
||||
postSnippetsManager.get_pass_config()->set_callback<ov::pass::FakeQuantizeDecomposition>([](const_node_ptr& node) -> bool {
|
||||
std::string errMsg;
|
||||
return node::FakeQuantize::isSupportedOperation(node, errMsg);
|
||||
});
|
||||
postSnippetsManager.register_pass<ov::pass::ConstantFolding>();
|
||||
postSnippetsManager.run_passes(model);
|
||||
}
|
||||
|
||||
/**
 * @brief Full snippets stage: tokenization followed by post-processing.
 *
 * MainSnippets() must run before PostSnippets(): FakeQuantize decomposition is
 * deferred until after tokenization (see the callbacks in both helpers).
 */
void Transformations::Snippets(void) {
    CPU_DEBUG_CAP_TRANSFORMATION_SCOPE(this, Snippets);

    MainSnippets();
    PostSnippets();
}
|
||||
|
||||
} // namespace intel_cpu
|
||||
} // namespace ov
|
65
src/plugins/intel_cpu/src/transformation_pipeline.h
Normal file
65
src/plugins/intel_cpu/src/transformation_pipeline.h
Normal file
@ -0,0 +1,65 @@
|
||||
// Copyright (C) 2022 Intel Corporation
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "openvino/core/model.hpp"
|
||||
#include "low_precision/low_precision.hpp"
|
||||
#include "config.h"
|
||||
|
||||
#include "itt.h"
|
||||
|
||||
#include <memory>
|
||||
#include <vector>
|
||||
|
||||
using namespace InferenceEngine;
|
||||
|
||||
#define IE_CPU_PLUGIN_THROW(...) IE_THROW(__VA_ARGS__) << "CPU plugin: "
|
||||
|
||||
namespace ov {
|
||||
namespace intel_cpu {
|
||||
|
||||
/**
 * @brief Owns and drives the CPU-plugin ngraph/OpenVINO transformation pipeline
 * (PreLpt -> Lpt -> PostLpt -> Snippets -> CPU-specific opset) over one model.
 *
 * The object only borrows the configuration (held by const reference), so the
 * Config passed to the constructor must outlive this instance.
 */
class Transformations {
public:
    /// All flags are captured once at construction; the model is shared, the config is borrowed.
    Transformations(const std::shared_ptr<ov::Model>& initialModel,
                    const bool                        enableLpt,
                    const bool                        enableSnippets,
                    const bool                        enableBF16,
                    const bool                        isLegacyApi,
                    const Config&                     config)
        : model(initialModel),
          enableLpt(enableLpt),
          enableSnippets(enableSnippets),
          enableBF16(enableBF16),
          isLegacyApi(isLegacyApi),
          config(config) {}

    /// Runs every stage up to (but not including) the CPU-specific opset conversion.
    void UpToCpuSpecificOpSet();
    /// Runs the CPU-specific opset conversion stage.
    void CpuSpecificOpSet(void);

private:
    std::shared_ptr<ov::Model> model;  // model being transformed in place
    const bool enableLpt;              // run low-precision transformations
    const bool enableSnippets;         // run snippets tokenization
    const bool enableBF16;             // allow BF16 execution paths
    const bool isLegacyApi;            // plugin created through the legacy (IE) API
    const Config& config;              // borrowed; must outlive this object

    // Pipeline stages, in execution order:
    void PreLpt(const std::vector<ov::element::Type>& defaultPrecisions, const bool isLegacyApi);

    void Lpt(const bool hasINT16orINT32Levels, const std::vector<ov::element::Type>& defaultPrecisions);

    void PostLpt();

    void MainSnippets(void);

    void PostSnippets(void);

    // Runs MainSnippets() then PostSnippets().
    void Snippets(void);

    // Custom Convert fusing rule used by the precision-conversion passes.
    static bool fuse_type_to_convert(const std::shared_ptr<ngraph::Node>& node, ov::element::Type to, size_t idx);
};
|
||||
|
||||
} // namespace intel_cpu
|
||||
} // namespace ov
|
@ -5,7 +5,7 @@
|
||||
|
||||
#ifdef CPU_DEBUG_CAPS
|
||||
|
||||
#define CPU_DEBUG_CAP_ENABLE(_x) _x;
|
||||
#define CPU_DEBUG_CAP_ENABLE(...) __VA_ARGS__
|
||||
#define CPU_DEBUG_CAPS_ALWAYS_TRUE(x) true
|
||||
|
||||
#include <string>
|
||||
@ -147,7 +147,7 @@ static inline std::ostream& write_all_to_stream(std::ostream& os, const T& arg,
|
||||
|
||||
#else // !CPU_DEBUG_CAPS
|
||||
|
||||
#define CPU_DEBUG_CAP_ENABLE(_x)
|
||||
#define CPU_DEBUG_CAP_ENABLE(...)
|
||||
#define CPU_DEBUG_CAPS_ALWAYS_TRUE(x) x
|
||||
|
||||
#define DEBUG_LOG(...)
|
||||
|
66
src/plugins/intel_cpu/src/utils/debug_caps_config.cpp
Normal file
66
src/plugins/intel_cpu/src/utils/debug_caps_config.cpp
Normal file
@ -0,0 +1,66 @@
|
||||
// Copyright (C) 2018-2022 Intel Corporation
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
#ifdef CPU_DEBUG_CAPS
|
||||
|
||||
#include "debug_caps_config.h"
|
||||
|
||||
#include <string>
|
||||
|
||||
namespace ov {
|
||||
namespace intel_cpu {
|
||||
|
||||
void DebugCapsConfig::readProperties() {
|
||||
auto readEnv = [](const char* envVar) {
|
||||
return std::getenv(envVar);
|
||||
};
|
||||
|
||||
auto parseDumpFormat = [](const std::string& format) {
|
||||
if (format == "BIN")
|
||||
return FORMAT::BIN;
|
||||
else if (format == "TEXT")
|
||||
return FORMAT::TEXT;
|
||||
else
|
||||
IE_THROW() << "readDebugCapsProperties: Unknown dump format";
|
||||
};
|
||||
|
||||
const char* envVarValue = nullptr;
|
||||
|
||||
if ((envVarValue = readEnv("OV_CPU_EXEC_GRAPH_PATH")))
|
||||
execGraphPath = envVarValue;
|
||||
|
||||
if ((envVarValue = readEnv("OV_CPU_VERBOSE")))
|
||||
verbose = envVarValue;
|
||||
|
||||
if ((envVarValue = readEnv("OV_CPU_BLOB_DUMP_DIR")))
|
||||
blobDumpDir = envVarValue;
|
||||
|
||||
if ((envVarValue = readEnv("OV_CPU_BLOB_DUMP_FORMAT")))
|
||||
blobDumpFormat = parseDumpFormat(envVarValue);
|
||||
|
||||
if ((envVarValue = readEnv("OV_CPU_BLOB_DUMP_NODE_EXEC_ID")))
|
||||
blobDumpFilters[BY_EXEC_ID] = envVarValue;
|
||||
|
||||
if ((envVarValue = readEnv("OV_CPU_BLOB_DUMP_NODE_PORTS")))
|
||||
blobDumpFilters[BY_PORTS] = envVarValue;
|
||||
|
||||
if ((envVarValue = readEnv("OV_CPU_BLOB_DUMP_NODE_TYPE")))
|
||||
blobDumpFilters[BY_TYPE] = envVarValue;
|
||||
|
||||
if ((envVarValue = readEnv("OV_CPU_BLOB_DUMP_NODE_NAME")))
|
||||
blobDumpFilters[BY_NAME] = envVarValue;
|
||||
|
||||
if ((envVarValue = readEnv("OV_CPU_SUMMARY_PERF"))) {
|
||||
summaryPerf = envVarValue;
|
||||
}
|
||||
|
||||
if ((envVarValue = readEnv("OV_CPU_DISABLE")))
|
||||
disable.parseAndSet(envVarValue);
|
||||
|
||||
if ((envVarValue = readEnv("OV_CPU_DUMP_IR")))
|
||||
dumpIR.parseAndSet(envVarValue);
|
||||
}
|
||||
|
||||
} // namespace intel_cpu
|
||||
} // namespace ov
|
||||
#endif // CPU_DEBUG_CAPS
|
213
src/plugins/intel_cpu/src/utils/debug_caps_config.h
Normal file
213
src/plugins/intel_cpu/src/utils/debug_caps_config.h
Normal file
@ -0,0 +1,213 @@
|
||||
// Copyright (C) 2018-2022 Intel Corporation
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
#pragma once
|
||||
#ifdef CPU_DEBUG_CAPS
|
||||
|
||||
#include "ie_common.h"
|
||||
#include "openvino/util/common_util.hpp"
|
||||
|
||||
#include <bitset>
|
||||
#include <unordered_map>
|
||||
|
||||
namespace ov {
|
||||
namespace intel_cpu {
|
||||
|
||||
class DebugCapsConfig {
|
||||
private:
|
||||
struct PropertySetter;
|
||||
using PropertySetterPtr = std::shared_ptr<PropertySetter>;
|
||||
|
||||
public:
|
||||
DebugCapsConfig() {
|
||||
readProperties();
|
||||
}
|
||||
|
||||
enum FILTER {
|
||||
BY_PORTS,
|
||||
BY_EXEC_ID,
|
||||
BY_TYPE,
|
||||
BY_NAME,
|
||||
};
|
||||
|
||||
enum class FORMAT {
|
||||
BIN,
|
||||
TEXT,
|
||||
};
|
||||
|
||||
std::string execGraphPath;
|
||||
std::string verbose;
|
||||
std::string blobDumpDir = "cpu_dump";
|
||||
FORMAT blobDumpFormat = FORMAT::TEXT;
|
||||
// std::hash<int> is necessary for Ubuntu-16.04 (gcc-5.4 and defect in C++11 standart)
|
||||
std::unordered_map<FILTER, std::string, std::hash<int>> blobDumpFilters;
|
||||
std::string summaryPerf = "";
|
||||
|
||||
struct TransformationFilter {
|
||||
enum Type : uint8_t {
|
||||
PreLpt = 0, Lpt, PostLpt, Snippets, Specific, NumOfTypes
|
||||
};
|
||||
std::bitset<NumOfTypes> filter;
|
||||
|
||||
PropertySetterPtr getPropertySetter() {
|
||||
return PropertySetterPtr(new BitsetFilterPropertySetter<NumOfTypes>("transformations", filter,
|
||||
{{"all", {PreLpt, Lpt, PostLpt, Snippets, Specific}},
|
||||
{"common", {PreLpt, PostLpt}},
|
||||
{"prelpt", {PreLpt}},
|
||||
{"lpt", {Lpt}},
|
||||
{"postlpt", {PostLpt}},
|
||||
{"snippets", {Snippets}},
|
||||
{"specific", {Specific}}
|
||||
}));
|
||||
}
|
||||
};
|
||||
struct IrFormatFilter {
|
||||
enum Type : uint8_t {
|
||||
Xml = 0, XmlBin, Dot, Svg, NumOfTypes
|
||||
};
|
||||
std::bitset<NumOfTypes> filter;
|
||||
|
||||
PropertySetterPtr getPropertySetter() {
|
||||
return PropertySetterPtr(new BitsetFilterPropertySetter<NumOfTypes>("formats", filter,
|
||||
{{"all", {XmlBin, Dot, Svg}},
|
||||
{"xml", {Xml}},
|
||||
{"xmlbin", {XmlBin}},
|
||||
{"dot", {Dot}},
|
||||
{"svg", {Svg}},
|
||||
}));
|
||||
}
|
||||
};
|
||||
|
||||
struct PropertyGroup {
|
||||
virtual std::vector<PropertySetterPtr> getPropertySetters(void) = 0;
|
||||
|
||||
void parseAndSet(const std::string& str) {
|
||||
const auto& options = ov::util::split(str, ' ');
|
||||
const auto& propertySetters = getPropertySetters();
|
||||
bool failed = false;
|
||||
auto getHelp = [propertySetters] (void) {
|
||||
std::string help;
|
||||
for (const auto& property : propertySetters)
|
||||
help.append('\t' + property->getPropertyName() + "=<" + property->getPropertyValueDescription() + ">\n");
|
||||
return help;
|
||||
};
|
||||
|
||||
for (const auto& option : options) {
|
||||
const auto& parts = ov::util::split(option, '=');
|
||||
if (parts.size() > 2) {
|
||||
failed = true;
|
||||
break;
|
||||
}
|
||||
const auto& propertyName = ov::util::to_lower(parts.front());
|
||||
const auto& foundSetter = std::find_if(propertySetters.begin(), propertySetters.end(),
|
||||
[propertyName] (const PropertySetterPtr& setter) { return setter->getPropertyName() == propertyName; });
|
||||
if (foundSetter == propertySetters.end() ||
|
||||
!(*foundSetter)->parseAndSet(parts.size() == 1 ? "" : parts.back())) {
|
||||
failed = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (failed)
|
||||
IE_THROW() << "Wrong syntax: " << str << std::endl
|
||||
<< "The following space separated options are supported (option names are case insensitive):" << std::endl
|
||||
<< getHelp();
|
||||
}
|
||||
};
|
||||
|
||||
struct : PropertyGroup {
|
||||
TransformationFilter transformations;
|
||||
|
||||
std::vector<PropertySetterPtr> getPropertySetters(void) override {
|
||||
return { transformations.getPropertySetter() };
|
||||
}
|
||||
} disable;
|
||||
|
||||
struct : PropertyGroup {
|
||||
std::string dir = "intel_cpu_dump";
|
||||
IrFormatFilter format = { 1 << IrFormatFilter::Xml };
|
||||
TransformationFilter transformations;
|
||||
|
||||
std::vector<PropertySetterPtr> getPropertySetters(void) override {
|
||||
return { PropertySetterPtr(new StringPropertySetter("dir", dir, "path to dumped IRs")),
|
||||
format.getPropertySetter(),
|
||||
transformations.getPropertySetter() };
|
||||
}
|
||||
} dumpIR;
|
||||
|
||||
private:
|
||||
struct PropertySetter {
|
||||
virtual bool parseAndSet(const std::string& str) = 0;
|
||||
virtual std::string getPropertyValueDescription(void) const = 0;
|
||||
|
||||
PropertySetter(const std::string&& name) : propertyName(name) {}
|
||||
const std::string& getPropertyName(void) const { return propertyName; }
|
||||
|
||||
private:
|
||||
const std::string propertyName;
|
||||
};
|
||||
|
||||
struct StringPropertySetter : PropertySetter {
|
||||
StringPropertySetter(const std::string&& name, std::string& ref, const std::string&& valueDescription)
|
||||
: property(ref), propertyValueDescription(valueDescription), PropertySetter(std::move(name)) {}
|
||||
bool parseAndSet(const std::string& str) override {
|
||||
property = str;
|
||||
return true;
|
||||
}
|
||||
std::string getPropertyValueDescription(void) const override { return propertyValueDescription; }
|
||||
|
||||
private:
|
||||
std::string& property;
|
||||
const std::string propertyValueDescription;
|
||||
};
|
||||
template<std::size_t NumOfBits>
|
||||
|
||||
struct BitsetFilterPropertySetter : PropertySetter {
|
||||
struct Token {
|
||||
std::string name;
|
||||
std::vector<size_t> bits;
|
||||
};
|
||||
|
||||
BitsetFilterPropertySetter(const std::string&& name, std::bitset<NumOfBits>& ref, const std::vector<Token>&& tokens)
|
||||
: property(ref), propertyTokens(tokens), PropertySetter(std::move(name)) {}
|
||||
bool parseAndSet(const std::string& str) override {
|
||||
const auto& tokens = str.empty() ?
|
||||
std::vector<std::string>{"all"} : ov::util::split(ov::util::to_lower(str), ',');
|
||||
property.reset();
|
||||
for (const auto& token : tokens) {
|
||||
const bool tokenVal = (token.front() != '-');
|
||||
const auto& tokenName = tokenVal ? token : token.substr(1);
|
||||
const auto& foundToken = std::find_if(propertyTokens.begin(), propertyTokens.end(),
|
||||
[tokenName] (const Token& token) { return token.name == tokenName; });
|
||||
if (foundToken == propertyTokens.end())
|
||||
return false;
|
||||
|
||||
for (const auto& bit : foundToken->bits) {
|
||||
property.set(bit, tokenVal);
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
std::string getPropertyValueDescription(void) const override {
|
||||
std::string supportedTokens = "comma separated filter tokens: ";
|
||||
for (auto i = 0; i < propertyTokens.size(); i++) {
|
||||
if (i)
|
||||
supportedTokens.push_back(',');
|
||||
supportedTokens.append(propertyTokens[i].name);
|
||||
}
|
||||
supportedTokens.append("; -'token' is used for exclusion, case does not matter, no tokens is treated as 'all'");
|
||||
return supportedTokens;
|
||||
}
|
||||
|
||||
private:
|
||||
std::bitset<NumOfBits>& property;
|
||||
const std::vector<Token> propertyTokens;
|
||||
};
|
||||
|
||||
void readProperties();
|
||||
};
|
||||
|
||||
} // namespace intel_cpu
|
||||
} // namespace ov
|
||||
|
||||
#endif // CPU_DEBUG_CAPS
|
113
src/plugins/intel_cpu/src/utils/ngraph_transformation.hpp
Normal file
113
src/plugins/intel_cpu/src/utils/ngraph_transformation.hpp
Normal file
@ -0,0 +1,113 @@
|
||||
// Copyright (C) 2022 Intel Corporation
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
#pragma once
|
||||
#ifdef CPU_DEBUG_CAPS
|
||||
|
||||
#include "debug_caps_config.h"
|
||||
#include "openvino/util/file_util.hpp"
|
||||
#include <openvino/pass/manager.hpp>
|
||||
#include <openvino/pass/serialize.hpp>
|
||||
#include <openvino/pass/visualize_tree.hpp>
|
||||
|
||||
namespace ov {
|
||||
namespace intel_cpu {
|
||||
|
||||
class TransformationDumper {
|
||||
public:
|
||||
explicit TransformationDumper(const DebugCapsConfig& config, const DebugCapsConfig::TransformationFilter::Type type,
|
||||
const std::shared_ptr<ov::Model>& model)
|
||||
: config(config), type(type), model(model) {
|
||||
for (auto prev = infoMap.at(type).prev; prev != TransformationType::NumOfTypes;
|
||||
prev = infoMap.at(prev).prev) {
|
||||
// no need to serialize input graph if there was no transformations from previous dump
|
||||
if (config.disable.transformations.filter[prev])
|
||||
continue;
|
||||
if (!config.dumpIR.transformations.filter[prev])
|
||||
break;
|
||||
if (wasDumped()[prev])
|
||||
return;
|
||||
}
|
||||
dump("_in");
|
||||
}
|
||||
~TransformationDumper() {
|
||||
dump("_out");
|
||||
wasDumped().set(type);
|
||||
}
|
||||
|
||||
private:
|
||||
const DebugCapsConfig& config;
|
||||
const std::shared_ptr<ov::Model>& model;
|
||||
using TransformationType = DebugCapsConfig::TransformationFilter::Type;
|
||||
const TransformationType type;
|
||||
|
||||
struct TransformationInfo {
|
||||
std::string name;
|
||||
TransformationType prev;
|
||||
};
|
||||
// std::hash<std::underlying_type<FILTER>::type> is necessary for Ubuntu-16.04 (gcc-5.4 and defect in C++11 standart)
|
||||
const std::unordered_map<TransformationType, TransformationInfo,
|
||||
std::hash<std::underlying_type<TransformationType>::type>> infoMap =
|
||||
{{TransformationType::PreLpt, {"preLpt", TransformationType::NumOfTypes}},
|
||||
{TransformationType::Lpt, {"lpt", TransformationType::PreLpt}},
|
||||
{TransformationType::PostLpt, {"postLpt", TransformationType::Lpt}},
|
||||
{TransformationType::Snippets, {"snippets", TransformationType::PostLpt}},
|
||||
{TransformationType::Specific, {"cpuSpecific", TransformationType::Snippets}}};
|
||||
std::bitset<TransformationType::NumOfTypes>& wasDumped(void) {
|
||||
static std::bitset<TransformationType::NumOfTypes> wasDumped;
|
||||
return wasDumped;
|
||||
}
|
||||
void dump(const std::string&& postfix) {
|
||||
static int num = 0; // just to keep dumped IRs ordered in filesystem
|
||||
const auto pathAndName = config.dumpIR.dir + "/ir_" + std::to_string(num) + '_' +
|
||||
infoMap.at(type).name + postfix;
|
||||
|
||||
ov::util::create_directory_recursive(config.dumpIR.dir);
|
||||
|
||||
ov::pass::Manager serializer;
|
||||
|
||||
if (config.dumpIR.format.filter[DebugCapsConfig::IrFormatFilter::XmlBin])
|
||||
serializer.register_pass<ov::pass::Serialize>(pathAndName + ".xml", "");
|
||||
|
||||
if (config.dumpIR.format.filter[DebugCapsConfig::IrFormatFilter::Xml]) {
|
||||
std::string xmlFile(pathAndName + ".xml");
|
||||
std::string binFile("/dev/null"); // @todo make it crossplatform using dummy implementation of std::ostream
|
||||
|
||||
serializer.register_pass<ov::pass::Serialize>(xmlFile, binFile);
|
||||
}
|
||||
|
||||
if (config.dumpIR.format.filter[DebugCapsConfig::IrFormatFilter::Svg]) {
|
||||
serializer.register_pass<ov::pass::VisualizeTree>(pathAndName + ".svg");
|
||||
}
|
||||
|
||||
if (config.dumpIR.format.filter[DebugCapsConfig::IrFormatFilter::Dot]) {
|
||||
serializer.register_pass<ov::pass::VisualizeTree>(pathAndName + ".dot");
|
||||
}
|
||||
|
||||
serializer.run_passes(model);
|
||||
num++;
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace intel_cpu
|
||||
} // namespace ov
|
||||
|
||||
// Convenience macros for instrumenting a transformation stage. '_this' is expected to
// expose 'config.debugCaps' and 'model' members; '_type' is an unqualified enumerator
// of DebugCapsConfig::TransformationFilter::Type (it is pasted into the qualified name).

// Evaluates to true when the given transformation stage is disabled in the debug config.
# define CPU_DEBUG_CAP_IS_TRANSFORMATION_DISABLED(_config, _type) \
    _config.disable.transformations.filter[DebugCapsConfig::TransformationFilter::Type::_type]
// Logical negation of the above; variadic so any argument list is forwarded unchanged.
# define CPU_DEBUG_CAP_IS_TRANSFORMATION_ENABLED(...) !CPU_DEBUG_CAP_IS_TRANSFORMATION_DISABLED(__VA_ARGS__)
// Declares a local scoped dumper ('dumperPtr') that serializes the IR on scope
// entry/exit when dumping is requested for the stage; asserts the stage is enabled.
# define CPU_DEBUG_CAP_TRANSFORMATION_DUMP(_this, _type) \
    IE_ASSERT(CPU_DEBUG_CAP_IS_TRANSFORMATION_ENABLED(_this->config.debugCaps, _type)); \
    auto dumperPtr = _this->config.debugCaps.dumpIR.transformations.filter[DebugCapsConfig::TransformationFilter::Type::_type] ? \
        std::unique_ptr<TransformationDumper>(new TransformationDumper(_this->config.debugCaps, \
            DebugCapsConfig::TransformationFilter::Type::_type, _this->model)) : \
        nullptr
// Returns early from the enclosing void function when the stage is disabled;
// otherwise sets up the scoped IR dumper for it.
# define CPU_DEBUG_CAP_TRANSFORMATION_SCOPE(_this, _type) \
    if (CPU_DEBUG_CAP_IS_TRANSFORMATION_DISABLED(_this->config.debugCaps, _type)) \
        return; \
    CPU_DEBUG_CAP_TRANSFORMATION_DUMP(_this, _type)
#else
// Debug capabilities compiled out: every stage is enabled and no dumping happens.
# define CPU_DEBUG_CAP_IS_TRANSFORMATION_DISABLED(_config, _type) false
# define CPU_DEBUG_CAP_IS_TRANSFORMATION_ENABLED(...) true
# define CPU_DEBUG_CAP_TRANSFORMATION_DUMP(_this, _type)
# define CPU_DEBUG_CAP_TRANSFORMATION_SCOPE(_this, _type)
#endif // CPU_DEBUG_CAPS
|
@ -5,6 +5,7 @@
|
||||
|
||||
#include "node_dumper.h"
|
||||
|
||||
#include "utils/debug_caps_config.h"
|
||||
#include <node.h>
|
||||
#include "ie_common.h"
|
||||
#include "utils/blob_dump.h"
|
||||
@ -26,20 +27,20 @@ static void formatNodeName(std::string& name) {
|
||||
std::replace(name.begin(), name.end(), ':', '-');
|
||||
}
|
||||
|
||||
static bool shouldBeDumped(const NodePtr& node, const Config& config, const std::string& portsKind) {
|
||||
static bool shouldBeDumped(const NodePtr& node, const DebugCapsConfig& config, const std::string& portsKind) {
|
||||
const auto& dumpFilters = config.blobDumpFilters;
|
||||
|
||||
if (dumpFilters.empty())
|
||||
return false;
|
||||
|
||||
if (dumpFilters.count(Config::FILTER::BY_PORTS)) { // filter by ports configured
|
||||
if (dumpFilters.at(Config::FILTER::BY_PORTS) != "ALL" &&
|
||||
portsKind != dumpFilters.at(Config::FILTER::BY_PORTS))
|
||||
if (dumpFilters.count(DebugCapsConfig::FILTER::BY_PORTS)) { // filter by ports configured
|
||||
if (dumpFilters.at(DebugCapsConfig::FILTER::BY_PORTS) != "ALL" &&
|
||||
portsKind != dumpFilters.at(DebugCapsConfig::FILTER::BY_PORTS))
|
||||
return false;
|
||||
}
|
||||
|
||||
if (dumpFilters.count(Config::FILTER::BY_EXEC_ID)) { // filter by exec id configured
|
||||
std::stringstream ss(dumpFilters.at(Config::FILTER::BY_EXEC_ID));
|
||||
if (dumpFilters.count(DebugCapsConfig::FILTER::BY_EXEC_ID)) { // filter by exec id configured
|
||||
std::stringstream ss(dumpFilters.at(DebugCapsConfig::FILTER::BY_EXEC_ID));
|
||||
int id;
|
||||
bool matched = false;
|
||||
|
||||
@ -54,8 +55,8 @@ static bool shouldBeDumped(const NodePtr& node, const Config& config, const std:
|
||||
return false;
|
||||
}
|
||||
|
||||
if (dumpFilters.count(Config::FILTER::BY_TYPE)) { // filter by type configured
|
||||
std::stringstream ss(dumpFilters.at(Config::FILTER::BY_TYPE));
|
||||
if (dumpFilters.count(DebugCapsConfig::FILTER::BY_TYPE)) { // filter by type configured
|
||||
std::stringstream ss(dumpFilters.at(DebugCapsConfig::FILTER::BY_TYPE));
|
||||
std::string type;
|
||||
bool matched = false;
|
||||
|
||||
@ -70,22 +71,22 @@ static bool shouldBeDumped(const NodePtr& node, const Config& config, const std:
|
||||
return false;
|
||||
}
|
||||
|
||||
if (dumpFilters.count(Config::FILTER::BY_NAME)) { // filter by name configured
|
||||
if (dumpFilters.at(Config::FILTER::BY_NAME) != "*" && // to have 'single char' option for matching all the names
|
||||
!std::regex_match(node->getName(), std::regex(dumpFilters.at(Config::FILTER::BY_NAME)))) // name does not match
|
||||
if (dumpFilters.count(DebugCapsConfig::FILTER::BY_NAME)) { // filter by name configured
|
||||
if (dumpFilters.at(DebugCapsConfig::FILTER::BY_NAME) != "*" && // to have 'single char' option for matching all the names
|
||||
!std::regex_match(node->getName(), std::regex(dumpFilters.at(DebugCapsConfig::FILTER::BY_NAME)))) // name does not match
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
static void dump(const BlobDumper& bd, const std::string& file, const Config& config) {
|
||||
static void dump(const BlobDumper& bd, const std::string& file, const DebugCapsConfig& config) {
|
||||
switch (config.blobDumpFormat) {
|
||||
case Config::FORMAT::BIN: {
|
||||
case DebugCapsConfig::FORMAT::BIN: {
|
||||
bd.dump(file);
|
||||
break;
|
||||
}
|
||||
case Config::FORMAT::TEXT: {
|
||||
case DebugCapsConfig::FORMAT::TEXT: {
|
||||
bd.dumpAsTxt(file);
|
||||
break;
|
||||
}
|
||||
@ -94,7 +95,7 @@ static void dump(const BlobDumper& bd, const std::string& file, const Config& co
|
||||
}
|
||||
}
|
||||
|
||||
static void dumpInternalBlobs(const NodePtr& node, const Config& config) {
|
||||
static void dumpInternalBlobs(const NodePtr& node, const DebugCapsConfig& config) {
|
||||
std::string nodeName = node->getName();
|
||||
formatNodeName(nodeName);
|
||||
|
||||
@ -116,7 +117,7 @@ static void dumpInternalBlobs(const NodePtr& node, const Config& config) {
|
||||
}
|
||||
}
|
||||
|
||||
void dumpInputBlobs(const NodePtr& node, const Config& config, int count) {
|
||||
void dumpInputBlobs(const NodePtr& node, const DebugCapsConfig& config, int count) {
|
||||
if (!shouldBeDumped(node, config, "IN"))
|
||||
return;
|
||||
|
||||
@ -150,7 +151,7 @@ void dumpInputBlobs(const NodePtr& node, const Config& config, int count) {
|
||||
dumpInternalBlobs(node, config);
|
||||
}
|
||||
|
||||
void dumpOutputBlobs(const NodePtr& node, const Config& config, int count) {
|
||||
void dumpOutputBlobs(const NodePtr& node, const DebugCapsConfig& config, int count) {
|
||||
if (!shouldBeDumped(node, config, "OUT"))
|
||||
return;
|
||||
|
||||
|
@ -1,25 +1,26 @@
|
||||
// Copyright (C) 2018-2022 Intel Corporation
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
#ifdef CPU_DEBUG_CAPS
|
||||
#pragma once
|
||||
|
||||
#ifdef CPU_DEBUG_CAPS
|
||||
#include "utils/debug_caps_config.h"
|
||||
#include <node.h>
|
||||
#include "config.h"
|
||||
|
||||
namespace ov {
|
||||
namespace intel_cpu {
|
||||
|
||||
void dumpInputBlobs(const NodePtr &node, const Config& config, int count = -1);
|
||||
void dumpOutputBlobs(const NodePtr &node, const Config& config, int count = -1);
|
||||
void dumpInputBlobs(const NodePtr &node, const DebugCapsConfig& config, int count = -1);
|
||||
void dumpOutputBlobs(const NodePtr &node, const DebugCapsConfig& config, int count = -1);
|
||||
|
||||
class DumpHelper {
|
||||
const NodePtr& node;
|
||||
const int count;
|
||||
const Config& config;
|
||||
const DebugCapsConfig& config;
|
||||
|
||||
public:
|
||||
explicit DumpHelper(const NodePtr& _node, const Config& _config, int _count = -1): node(_node), config(_config), count(_count) {
|
||||
explicit DumpHelper(const NodePtr& _node, const DebugCapsConfig& _config, int _count = -1):
|
||||
node(_node), config(_config), count(_count) {
|
||||
dumpInputBlobs(node, config, count);
|
||||
}
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user