[MO] compress_to_fp16=False by default (#16854)
* compress_to_fp16=False by default
* Apply suggestions from code review
  Co-authored-by: Karol Blaszczak <karol.blaszczak@intel.com>
* note about RAM consumption for FP16 compressed models
* detailed note about RAM usage
* update 'get_compression_message()'
* corrected get_compression_message: remove info about RAM
* fix pytorch convert layer tests
---------
Co-authored-by: Karol Blaszczak <karol.blaszczak@intel.com>
parent de8f34c8f0
commit 68f46ff9a1
@@ -2,18 +2,16 @@

 @sphinxdirective

-Model Optimizer by default converts all floating-point weights to ``FP16`` data type.
-The resulting IR is called compressed ``FP16`` model. The resulting model will occupy
-about twice as less space in the file system, but it may have some accuracy drop.
-For most models, the accuracy drop is negligible. But in case if accuracy drop is
-significant user can disable compression explicitly.
+Model Optimizer can convert all floating-point weights to the ``FP16`` data type.
+It results in creating a "compressed ``FP16`` model", which occupies about half of
+the original space in the file system. The compression may introduce a drop in accuracy,
+but it is negligible for most models.

-By default, models are compressed to ``FP16``, but you can disable compression by
-specifying ``--compress_to_fp16=False``:
+To compress the model, use the ``--compress_to_fp16`` or ``--compress_to_fp16=True`` option:

 .. code-block:: sh

-   mo --input_model INPUT_MODEL --compress_to_fp16=False
+   mo --input_model INPUT_MODEL --compress_to_fp16


 For details on how plugins handle compressed ``FP16`` models, see
@@ -26,4 +24,11 @@ For details on how plugins handle compressed ``FP16`` models, see
 information about that.


+.. note::
+
+   Some large models (larger than a few GB), when compressed to ``FP16``, may consume an enormous amount of RAM
+   during the loading phase of inference. If you face such problems, try converting the model without compression:
+   ``mo --input_model INPUT_MODEL --compress_to_fp16=False``
+
+
 @endsphinxdirective
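The CLI option shown in this documentation hunk maps onto the compress_to_fp16 argument of convert_model in the Model Optimizer Python API, whose default is flipped later in this commit. A minimal sketch of the equivalent Python calls, assuming the OpenVINO 2023.x API; the model path and output file names are illustrative:

    from openvino.runtime import serialize
    from openvino.tools.mo import convert_model

    # Convert without weight compression (the new default behavior).
    fp32_model = convert_model("model.onnx")
    serialize(fp32_model, "model_fp32.xml", "model_fp32.bin")

    # Opt in to FP16 weight compression explicitly.
    fp16_model = convert_model("model.onnx", compress_to_fp16=True)
    serialize(fp16_model, "model_fp16.xml", "model_fp16.bin")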
@@ -458,23 +458,18 @@ def create_pytorch_nn_module_mean_list(tmp_dir):
         "input": [InputCutInfo("x", None, "f32", None), InputCutInfo("y", None, "f32", None)]}


-def create_pytorch_nn_module_mean_list_default_compression(tmp_dir):
-    # by default compression should be enabled (same as setting 'compress_to_fp16': True)
-    # therefore decompression Converts will be present
+def create_pytorch_nn_module_mean_list_default_no_compression(tmp_dir):
+    # by default compression is disabled (same as setting 'compress_to_fp16': False)
     pt_model = make_pt_model_two_inputs()
     shape = [1, 10, 10, 3]

     shape = PartialShape(shape)
     param1 = ov.opset8.parameter(shape)
     param2 = ov.opset8.parameter(shape)
-    const1 = ov.opset8.constant([[[[0, 0, 0]]]], dtype=np.float16)
-    const1_decompressed = ov.opset8.convert(
-        const1, destination_type=np.float32)
-    const2 = ov.opset8.constant([[[[0, 0, 0]]]], dtype=np.float16)
-    const2_decompressed = ov.opset8.convert(
-        const2, destination_type=np.float32)
-    sub1 = ov.opset8.subtract(param1, const1_decompressed)
-    sub2 = ov.opset8.subtract(param2, const2_decompressed)
+    const1 = ov.opset8.constant([[[[0, 0, 0]]]], dtype=np.float32)
+    const2 = ov.opset8.constant([[[[0, 0, 0]]]], dtype=np.float32)
+    sub1 = ov.opset8.subtract(param1, const1)
+    sub2 = ov.opset8.subtract(param2, const2)
     add = ov.opset8.add(sub1, sub2)
     relu = ov.opset8.relu(add)
     sigm = ov.opset8.sigmoid(relu)
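The two reference graphs in this test differ only in whether the constants are stored as FP16 followed by decompression Convert nodes or directly as FP32. A quick way to see which variant a converted model contains is to count its Convert operations; a minimal sketch, where TwoInputs is a hypothetical stand-in for make_pt_model_two_inputs, and the conversion is assumed to work from input_shape alone, as in the tests above:

    import torch
    from openvino.tools.mo import convert_model


    class TwoInputs(torch.nn.Module):
        # Rough stand-in for make_pt_model_two_inputs from these tests.
        def __init__(self):
            super().__init__()
            self.linear = torch.nn.Linear(3, 3)

        def forward(self, x, y):
            return torch.sigmoid(torch.relu(self.linear(x) + self.linear(y)))


    def count_convert_ops(ov_model):
        # FP16-compressed weights show up as Constant(f16) -> Convert(f32) pairs in the graph.
        return sum(1 for op in ov_model.get_ops() if op.get_type_name() == "Convert")


    shapes = [[1, 10, 10, 3], [1, 10, 10, 3]]
    plain = convert_model(TwoInputs(), input_shape=shapes)  # new default: no compression
    compressed = convert_model(TwoInputs(), input_shape=shapes, compress_to_fp16=True)
    print(count_convert_ops(plain), count_convert_ops(compressed))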
@@ -529,23 +524,18 @@ def create_pytorch_nn_module_scale_list(tmp_dir):
     return pt_model, ref_model, {'input_shape': [shape, shape], 'scale_values': [[1, 1, 1], [1, 1, 1]], 'compress_to_fp16': False, "input": [InputCutInfo("x", None, "f32", None), InputCutInfo("y", None, "f32", None)]}


-def create_pytorch_nn_module_scale_list_default_compression(tmp_dir):
-    # by default compression should be enabled (same as setting 'compress_to_fp16': True)
-    # therefore decompression Converts will be present
+def create_pytorch_nn_module_scale_list_default_no_compression(tmp_dir):
+    # by default compression is disabled (same as setting 'compress_to_fp16': False)
     pt_model = make_pt_model_two_inputs()
     shape = [1, 10, 10, 3]

     shape = PartialShape(shape)
     param1 = ov.opset8.parameter(shape)
     param2 = ov.opset8.parameter(shape)
-    const1 = ov.opset8.constant([[[[1, 1, 1]]]], dtype=np.float16)
-    const1_decompressed = ov.opset8.convert(
-        const1, destination_type=np.float32)
-    const2 = ov.opset8.constant([[[[1, 1, 1]]]], dtype=np.float16)
-    const2_decompressed = ov.opset8.convert(
-        const2, destination_type=np.float32)
-    sub1 = ov.opset8.multiply(param1, const1_decompressed)
-    sub2 = ov.opset8.multiply(param2, const2_decompressed)
+    const1 = ov.opset8.constant([[[[1, 1, 1]]]], dtype=np.float32)
+    const2 = ov.opset8.constant([[[[1, 1, 1]]]], dtype=np.float32)
+    sub1 = ov.opset8.multiply(param1, const1)
+    sub2 = ov.opset8.multiply(param2, const2)
     add = ov.opset8.add(sub1, sub2)
     relu = ov.opset8.relu(add)
     sigm = ov.opset8.sigmoid(relu)
@@ -754,10 +744,10 @@ class TestMoConvertPyTorch(CommonMOConvertTest):
         create_pytorch_nn_module_layout_list,
         create_pytorch_nn_module_layout_list_case2,
         create_pytorch_nn_module_mean_list,
-        create_pytorch_nn_module_mean_list_default_compression,
+        create_pytorch_nn_module_mean_list_default_no_compression,
         create_pytorch_nn_module_mean_list_compressin_enabled,
         create_pytorch_nn_module_scale_list,
-        create_pytorch_nn_module_scale_list_default_compression,
+        create_pytorch_nn_module_scale_list_default_no_compression,
         create_pytorch_nn_module_scale_list_compression_enabled,
         create_pytorch_nn_module_shapes_list_static,
         create_pytorch_nn_module_shapes_list_dynamic,
@@ -34,7 +34,7 @@ def convert_model(
     source_layout: [str, Layout, dict] = (),
     target_layout: [str, Layout, dict] = (),
     layout: [str, Layout, LayoutMap, list, dict] = (),
-    compress_to_fp16: bool = True,
+    compress_to_fp16: bool = False,
     extensions: [str, pathlib.Path, list, Any] = None,
     transform: [str, list, tuple] = "",
     transformations_config: [str, pathlib.Path] = None,
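Because the default flips from True to False, scripts that relied on implicit FP16 compression now have to request it explicitly; a minimal sketch (the model file name is illustrative):

    from openvino.tools.mo import convert_model

    # Before this change, weights were compressed to FP16 unless compress_to_fp16=False was passed.
    # After this change, weights stay FP32 unless compression is requested explicitly.
    model = convert_model("model.onnx", compress_to_fp16=True)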
@@ -37,7 +37,7 @@ def get_tf_fe_message():
 def get_compression_message():
     link = "https://docs.openvino.ai/latest/openvino_docs_MO_DG_FP16_Compression.html"
     message = '[ INFO ] Generated IR will be compressed to FP16. ' \
-              'If you get lower accuracy, please consider disabling compression explicitly ' \
-              'by adding argument --compress_to_fp16=False.\n' \
+              'If you get lower accuracy, please consider disabling compression ' \
+              'by removing the --compress_to_fp16 argument or setting it to false: --compress_to_fp16=False.\n' \
               'Find more information about compression to FP16 at {}'.format(link)
     return message