[POT] Update CPU_SPR transformer quantization (#15241)

* [POT] Update CPU_SPR transformer quantization

* Add comments
Liubov Talamanova, 2023-01-24 11:56:22 +00:00 (committed by GitHub)
parent 54ea2612ae
commit 5fca707ebd
14 changed files with 58 additions and 38 deletions


@@ -33,7 +33,7 @@ def load_hardware_config(config):
         raise ValueError('Unsupported target_device : {}'.format(config['target_device']))
     hardware_config_path = __HARDWARE_CONFIG_DIR / __HARDWARE_CONFIGS_MAP.get(config['target_device'], "cpu.json")
-    return HardwareConfig.from_json(hardware_config_path.as_posix())
+    return HardwareConfig.from_json(hardware_config_path.as_posix(), config['target_device'])
 
 
 def append_estimator_configs(quantization_configs, is_weights, config, opt_conf=None):


@@ -110,7 +110,7 @@ def get_common_argument_parser():
         '--keep-uncompressed-weights',
         action='store_true',
         default=False,
-        help='Keep Convolution, Deconvolution and FullyConnected weights uncompressed')
+        help='Keep Convolution, ConvolutionBackpropData and MatMul weights uncompressed')
     parser.add_argument(
         '--data-source',


@@ -33,11 +33,13 @@ class HardwareConfig(list):
         return config
 
     @classmethod
-    def from_json(cls, path):
+    def from_json(cls, path, target_device=None):
         with open(path) as f:
             json_config = json.load(f, object_pairs_hook=OrderedDict)
+            if target_device is None:
+                target_device = json_config['target_device']
 
             hw_config = cls()
-            hw_config.append(Dict(('target_device', json_config['target_device'])))
+            hw_config.append(Dict(('target_device', target_device)))
             hw_config.append(Dict(('primary_bitwidth', json_config.get('primary_bitwidth', 8)),
                              ('input_priority_types', json_config.get('input_priority_types', []))))
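The optional target_device argument lets a device without a dedicated JSON description (here CPU_SPR, which falls back to cpu.json in the loader above) keep its requested name in the resulting config. A minimal sketch of the two call shapes, assuming cpu.json declares "target_device": "CPU":

    # Override: load the CPU description but record CPU_SPR as the device.
    hw_config = HardwareConfig.from_json('cpu.json', target_device='CPU_SPR')

    # No override: behaves as before, the device comes from the JSON file.
    hw_config = HardwareConfig.from_json('cpu.json')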


@@ -2,10 +2,24 @@
 # SPDX-License-Identifier: Apache-2.0
 
 from .pattern_utils import check_fused_scale_shift_patterns, get_fused_scale_shift_patterns, \
-    check_fused_op_const_patterns, get_fused_op_const_pattern, get_clamp_mult_const_pattern
+    check_fused_op_const_patterns, get_fused_op_const_pattern, get_clamp_mult_const_pattern, \
+    get_softmax_reshape_transpose_gather_matmul_pattern
 
 
 def get_cpu_ignored_patterns():
+    return {
+        'blocks': [(pattern, check_fused_scale_shift_patterns) for pattern in get_fused_scale_shift_patterns()] +
+                  [(pattern, check_fused_op_const_patterns) for pattern in get_fused_op_const_pattern()] +
+                  [get_softmax_reshape_transpose_gather_matmul_pattern()],
+        'activations': [get_clamp_mult_const_pattern()],
+        'inputs': []
+    }
+
+
+# For CPU_SPR we should quantize the self-attention block, with the
+# FakeQuantize propagated back before the Reshape, to remove quantization
+# overhead. For details see ticket 97884.
+def get_cpu_spr_ignored_patterns():
     return {
         'blocks': [(pattern, check_fused_scale_shift_patterns) for pattern in get_fused_scale_shift_patterns()] +
                   [(pattern, check_fused_op_const_patterns) for pattern in get_fused_op_const_pattern()],
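The intent of the split, as the function names suggest: patterns listed under 'blocks' are skipped during quantization, so the self-attention pattern stays unquantized on CPU but gets quantized on CPU_SPR. An illustration (not part of the commit) of the one-pattern difference between the two sets:

    cpu_blocks = get_cpu_ignored_patterns()['blocks']
    spr_blocks = get_cpu_spr_ignored_patterns()['blocks']
    assert len(cpu_blocks) == len(spr_blocks) + 1  # only the self-attention pattern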


@@ -1,12 +1,13 @@
 # Copyright (C) 2020-2022 Intel Corporation
 # SPDX-License-Identifier: Apache-2.0
 
-from openvino.tools.pot.graph.pattern_utils import get_assign_result_pattern
+from openvino.tools.pot.graph.pattern_utils import get_assign_result_pattern, \
+    get_softmax_reshape_transpose_gather_matmul_pattern
 
 
 def get_gna_ignored_patterns():
     return {
-        'blocks': [get_assign_result_pattern()],
+        'blocks': [get_assign_result_pattern(), get_softmax_reshape_transpose_gather_matmul_pattern()],
         'activations': [],
         'inputs': []
     }
@@ -14,7 +15,7 @@ def get_gna_ignored_patterns():
 
 def get_gna3_ignored_patterns():
     return {
-        'blocks': [get_assign_result_pattern()],
+        'blocks': [get_assign_result_pattern(), get_softmax_reshape_transpose_gather_matmul_pattern()],
         'activations': [],
         'inputs': []
     }


@@ -2,13 +2,15 @@
 # SPDX-License-Identifier: Apache-2.0
 
 from .pattern_utils import check_fused_scale_shift_patterns, get_fused_scale_shift_patterns, \
-    check_fused_op_const_patterns, get_fused_op_const_pattern, get_clamp_mult_const_pattern
+    check_fused_op_const_patterns, get_fused_op_const_pattern, get_clamp_mult_const_pattern, \
+    get_softmax_reshape_transpose_gather_matmul_pattern
 
 
 def get_gpu_ignored_patterns():
     return {
         'blocks': [(pattern, check_fused_scale_shift_patterns) for pattern in get_fused_scale_shift_patterns()] +
-                  [(pattern, check_fused_op_const_patterns) for pattern in get_fused_op_const_pattern()],
+                  [(pattern, check_fused_op_const_patterns) for pattern in get_fused_op_const_pattern()] +
+                  [get_softmax_reshape_transpose_gather_matmul_pattern()],
         'activations': [get_clamp_mult_const_pattern()],
         'inputs': []
     }


@@ -102,7 +102,7 @@ def get_all_operation_nodes(model: CompressedModel, recursively: bool = True):
 def build_model_for_node(nx_model, input_name, input_shape, node, remove_bias=False,
                          remove_fake_quantize=False, target_device='ANY'):
     """ Build Model containing Subgraph of CompressedModel (input - node - output).
-    The Convolution, FullyConnected node types are supported.
+    The Convolution, MatMul node types are supported.
     :param nx_model: CompressedModel model
     :param input_name: name of the input node in the generated graph
     :param input_shape: shape of the input node in the generated graph


@@ -576,10 +576,10 @@ class RemoveFakeQuantize:
         check_is_inputs_fq = lambda node: all([op.type == 'FakeQuantize' for op in node])
         for op in get_nodes_by_type(graph, ['Add']):
             if not nu.check_const_input(op):
-                inputs_node = np.array(get_node_inputs(op))
+                inputs_node = get_node_inputs(op)
                 count_outputs_node = np.array([len(get_all_node_outputs(node)) for node in inputs_node])
                 indices = count_outputs_node.argsort()[::-1]
-                inputs_node = inputs_node[indices]
+                inputs_node = [inputs_node[idx] for idx in indices]
                 if check_is_inputs_fq(inputs_node):
                     delete_one_fq(inputs_node)
@@ -831,7 +831,7 @@ def create_fake_quantize_node(graph: Graph, name, data_type=np.float32, **kwargs):
 def insert_fake_quantize(graph, node, ports=None, names=None, fq_types=None, hw_config=None, input_priority_types=[]):
-    blobs_as_inputs_nodes_type = ['Convolution', 'Deconvolution', 'MatMul']
+    blobs_as_inputs_nodes_type = ['Convolution', 'ConvolutionBackpropData', 'MatMul']
     gru_node_types = ['GRUCell', 'GRUSequence']
     port_name = None
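In the first hunk, inputs_node becomes a plain Python list instead of a numpy object array, while the fan-out sort is kept by indexing the list with the argsort result. A self-contained sketch of that idiom, with toy stand-ins for graph nodes:

    import numpy as np

    nodes = ['a', 'b', 'c']                 # stand-ins for the Add inputs
    fan_out = np.array([3, 1, 2])           # number of outputs per node
    order = fan_out.argsort()[::-1]         # indices by descending fan-out
    nodes = [nodes[idx] for idx in order]   # ['a', 'c', 'b']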


@@ -82,3 +82,18 @@ def get_fq_result_pattern():
     pattern.append_single_op('FakeQuantize', 'fq')
     pattern.append_single_op('Result', 'result')
     return pattern.set_name('fq_result').pattern
+
+
+# Self-attention block in vision transformers (Swin, Twins, ViTPose)
+def get_softmax_reshape_transpose_gather_matmul_pattern():
+    pattern = PatternBuilder()
+    pattern_2 = PatternBuilder()
+    softmax_out = pattern.append_single_op('SoftMax', 'softmax').get_last_node()
+    pattern_2.append_single_op('Add', 'add').get_last_node()
+    pattern_2.append_op_const('Reshape', 'reshape')
+    pattern_2.append_single_op('Transpose', 'transpose').get_last_node()
+    gather_out = pattern_2.append_single_op('Gather', 'gather').get_last_node()
+    pattern.pattern['nodes'] += pattern_2.pattern['nodes']
+    pattern.pattern['edges'] += pattern_2.pattern['edges']
+    pattern.insert_single_op([softmax_out, gather_out], None, 'MatMul', 'matmul')
+    return pattern.set_name('softmax_reshape_transpose_gather_matmul').pattern
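As I read the builder calls above, the matched subgraph feeds the SoftMax output and the Gather branch into a single MatMul (a reconstruction from the code, not a diagram from the PR):

    SoftMax ----------------------------------.
                                              +--> MatMul
    Add --> Reshape --> Transpose --> Gather -'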


@@ -308,21 +308,6 @@ def create_stable_diffusion_pattern():
     return pattern.set_name('stable_diffusion').pattern
 
 
-@registry_ignore_patterns('blocks')
-def create_softmax_reshape_transpose_gather_matmul_pattern():
-    pattern = PatternBuilder()
-    pattern_2 = PatternBuilder()
-    softmax_out = pattern.append_single_op('SoftMax', 'softmax').get_last_node()
-    pattern_2.append_single_op('Add', 'add').get_last_node()
-    pattern_2.append_op_const('Reshape', 'reshape')
-    pattern_2.append_single_op('Transpose', 'transpose').get_last_node()
-    gather_out = pattern_2.append_single_op('Gather', 'gather').get_last_node()
-    pattern.pattern['nodes'] += pattern_2.pattern['nodes']
-    pattern.pattern['edges'] += pattern_2.pattern['edges']
-    pattern.insert_single_op([softmax_out, gather_out], None, 'MatMul', 'matmul')
-    return pattern.set_name('softmax_reshape_transpose_gather_matmul').pattern
-
-
 @registry_ignore_patterns('blocks')
 def create_hswish_without_denominator_pattern():
     pattern = PatternBuilder()


@@ -8,7 +8,7 @@ import json
 import numpy as np
 
 from openvino.tools.pot.version import get_version
-from .cpu_patterns import get_cpu_ignored_patterns
+from .cpu_patterns import get_cpu_ignored_patterns, get_cpu_spr_ignored_patterns
 from .gpu_patterns import get_gpu_ignored_patterns
 from .vpu_patterns import get_vpu_ignored_patterns
 from .gna_patterns import get_gna_ignored_patterns, get_gna3_ignored_patterns
@@ -16,13 +16,14 @@ from .special_operations import QUANTIZE_AGNOSTIC_OPERATIONS
 from .node_utils import get_all_node_outputs, get_input_shape
 
 HARDWARE_AWARE_IGNORED_PATTERNS = {
+    'ANY': get_cpu_ignored_patterns(),
     'CPU': get_cpu_ignored_patterns(),
     'GPU': get_gpu_ignored_patterns(),
     'VPU': get_vpu_ignored_patterns(),
     'GNA': get_gna_ignored_patterns(),
     'GNA3': get_gna3_ignored_patterns(),
     'GNA3.5': get_gna3_ignored_patterns(),
-    'CPU_SPR': get_cpu_ignored_patterns()
+    'CPU_SPR': get_cpu_spr_ignored_patterns()
 }
 
 DEFAULT_PATH = 'PATH'


@@ -1,11 +1,12 @@
 # Copyright (C) 2020-2022 Intel Corporation
 # SPDX-License-Identifier: Apache-2.0
 
-from openvino.tools.pot.graph.pattern_utils import get_clamp_mult_const_pattern
+from openvino.tools.pot.graph.pattern_utils import get_clamp_mult_const_pattern, \
+    get_softmax_reshape_transpose_gather_matmul_pattern
 
 
 def get_vpu_ignored_patterns():
     return {
-        'blocks': [],
+        'blocks': [get_softmax_reshape_transpose_gather_matmul_pattern()],
         'activations': [get_clamp_mult_const_pattern()],
         'inputs': []
     }


@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:ef7c4f5fdfc04ec0b5d0091310682cc9ff1b9f3ebabdfb9f5c33f056bb7adcec
-size 121728
+oid sha256:e314f836c4d7a148e25835b3cda0ffa7c69417ccdf9cf418e200cff3a5fed84f
+size 107188


@@ -19,7 +19,6 @@ GNA_CONFIG_PATH = HARDWARE_CONFIG_PATH / 'gna.json'
 
 TEST_MODELS = [
     ('mobilenetv2_example', 'pytorch', 'ANY'),
-    ('resnet_example', 'pytorch', 'ANY'),
     ('googlenet_example', 'pytorch', 'ANY'),
     ('mobilenetv2_ssd_example', 'pytorch', 'ANY'),
     ('densenet121_example', 'pytorch', 'ANY'),
@@ -53,9 +52,9 @@ def test_build_quantization_graph(tmp_path, models, model_name, model_framework,
     model = load_model(model.model_params, target_device=target_device)
 
     if target_device == 'GNA':
-        hardware_config = HardwareConfig.from_json(GNA_CONFIG_PATH.as_posix())
+        hardware_config = HardwareConfig.from_json(GNA_CONFIG_PATH.as_posix(), target_device)
     else:
-        hardware_config = HardwareConfig.from_json(CPU_CONFIG_PATH.as_posix())
+        hardware_config = HardwareConfig.from_json(CPU_CONFIG_PATH.as_posix(), target_device)
 
     quantization_model = GraphTransformer(hardware_config).insert_fake_quantize(model)