Efficient FP32 -> FP16 conversion for convert_precision, save_model, ovc and mo (#18988)

* WIP Postpone fp16 in CompressFloatConstantsImpl

* Apply suggestions from code review

Co-authored-by: Ilya Lavrenov <ilya.lavrenov@intel.com>

* WIP: Compression to FP16 in Serialize

* Prepared for efficient fp32 to fp16 conversion

* Update src/core/reference/src/runtime/reference/convert.cpp

* Called real slow reference implementations in the place where the optimized versions are supposed to be implemented

* Code style

* Fixed 0 values in the fast f64 to f16 compression

* Optimized convert_from_f32_to_f16_with_clamp

* Added optimized f32->f16 instance of change_constant_precision

* compression transformation Python test

* use tmp dir, minor corrections

* Update src/bindings/python/tests/test_transformations/test_compression.py

* Update src/bindings/python/tests/test_transformations/test_compression.py

* style fix

* define rt_info for postponed_fp16_compression

* remove redundant class

* fix temp dir for Win in test_compression.py

* update definitions in convert.hpp

* Update implementation in convert.cpp

* Update serialize.cpp

* Update compress_float_constants.cpp

* added macros for ARM/non_x86 in convert.cpp

* fix macros in convert.cpp

* change fixme placement in serialize.cpp

* style_fix

* Update src/core/reference/src/runtime/reference/convert.cpp

* style_fix

* Optimized count_out_of_f16_range

* Code style

* Revert unused

* Update src/core/src/pass/serialize.cpp

Co-authored-by: Ilya Lavrenov <ilya.lavrenov@intel.com>

* Update src/core/reference/src/runtime/reference/convert.cpp

Co-authored-by: Ilya Lavrenov <ilya.lavrenov@intel.com>

* use optimized convert_from_f32_to_f16_with_clamp for non postponed

* minor corrections

* Update src/common/transformations/src/transformations/common_optimizations/compress_float_constants.cpp

* Update compress_float_constants.cpp

* Switched mo and ovc to save_model instead of serialize to leverage performance improvements in fp32->fp16

* Applied minor code improvements to address review feedback

* Minor changes in code

* Update tools/ovc/openvino/tools/ovc/main.py

* Apply suggestions from code review

* Fixed failed test in case when both usual xml compression and fp16 compression are applied simultaneously (disabled for now)

* Added description for CompressFloatConstantImpl postponed parameter

* Description of postponed parameter for CompressFloatConstants

* Reverted switching to save_model in mo as the compression can be applied not only via CLI and old code should be kept for Python path (not applicable for ovc)

* Removed remaining committed test artefacts and reverted remaining changes in mo

---------

Co-authored-by: Ilya Lavrenov <ilya.lavrenov@intel.com>
Co-authored-by: dmitrygo <dmitry.gorokhov@intel.com>
Co-authored-by: Vladimir Paramuzov <vladimir.paramuzov@intel.com>
Co-authored-by: Pavel Esir <pavel.esir@intel.com>
Co-authored-by: Pavel Esir <pavel.esir@gmail.com>
This commit is contained in:
Sergey Lyalin
2023-08-17 15:08:33 +04:00
committed by GitHub
parent 2394732055
commit f0300a36eb
11 changed files with 666 additions and 44 deletions

View File

@@ -0,0 +1,116 @@
# -*- coding: utf-8 -*-
# Copyright (C) 2018-2023 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
import tempfile
from typing import List
import numpy as np
from openvino.runtime.op import Parameter, Constant
from openvino.runtime.opset12 import add, multiply
import openvino as ov
def make_constant(values, transposed):
    """Create an f32 Constant holding *values* as a single row (transposed=True)
    or a single column (transposed=False)."""
    if transposed:
        shape = [1, len(values)]
    else:
        shape = [len(values), 1]
    return Constant(ov.Type.f32, ov.Shape(shape), values)
# keep fp16 denormals, flush fp32 denormals to zero
# Values chosen around fp16 boundaries: +-65504 is the fp16 finite max,
# 0.000000059604645 is the smallest positive fp16 denormal.
in_range = [-65504.0, -2.0, 1.00097656, -1.0, -0.99951172, -0.00006103515625, -0.000000059604645, 0.0,
            0.000000059604645, 0.99951172, 0.00006103515625, 1.0, 1.00097656, 2.0, 65504]
# Values that do not fit into the fp16 finite range (too large or denormal-flushed).
out_of_range = [float("-inf"), -65505.0, -1e-10, -1e-39, 1e-39, 1e-10, 65505.0, float("inf")]
# Expected result of clamping out_of_range to fp16: overflow clamps to +-65504,
# tiny values flush to zero.
converted_out_of_range = [-65504.0, -65504.0, 0, 0, 0, 0, 65504.0, 65504.0]
# test inputs
# Mostly-representable input (< 25% out of range) -> should be compressed to fp16.
more_in_range = out_of_range + 10 * in_range
# Mostly-unrepresentable input (>= 75% out of range) -> should stay in fp32.
more_out_of_range = in_range + 10 * out_of_range
# reference after conversion more_in_range to fp16
converted_more_in_range = converted_out_of_range + 10 * in_range
def make_model(add_consts, mul_consts):
    """Build a small model: Parameter -> Add(const) -> Multiply(const)."""
    param = Parameter(ov.Type.f32, ov.PartialShape([-1]))
    add_node = add(param, make_constant(add_consts, False))
    mul_node = multiply(add_node, make_constant(mul_consts, True))
    return ov.Model([mul_node], [param])
def get_constants(model) -> List[Constant]:
    """Round-trip *model* through save_model (fp16 compression enabled) and
    return the constants feeding input 1 of the Add and Multiply ops, in
    that order.

    A constant reached through a Convert op means the branch was compressed
    to fp16; a constant attached directly means it kept its original type.
    A slot is None when the branch has neither pattern.
    """
    from pathlib import Path
    model_name = Path(tempfile.gettempdir()) / "f32_partially_compressed.xml"
    ov.save_model(model, model_name)
    core = ov.Core()
    restored_model = core.read_model(model_name)
    op_ind_map = {"Add": 0, "Multiply": 1}
    # list sentinels flag ops that were never visited at all
    constants_list = [[] for _ in op_ind_map]
    for op in restored_model.get_ordered_ops():
        op_type = op.get_type_info().name
        if op_type not in op_ind_map:
            continue
        # Reset for every matched op.  Without this, the first unmatched
        # branch raises NameError and later ones silently reuse the value
        # found for a previous op.
        const_node = None
        in_node = op.input_value(1).get_node()
        if in_node.get_type_info().name == "Convert":
            producer = in_node.input_value(0).get_node()
            if producer.get_type_info().name == "Constant":
                const_node = producer
        elif in_node.get_type_info().name == "Constant":
            const_node = in_node
        constants_list[op_ind_map[op_type]] = const_node
    for node in constants_list:
        # a leftover list sentinel means an expected op was missing entirely
        assert not isinstance(node, list)
    # sanity check that model is compilable
    ov.compile_model(restored_model)
    return constants_list
def test_compression_1():
    """Mixed model: the Add branch compresses to fp16, the Multiply branch stays fp32."""
    const_fp16, const_fp32 = get_constants(make_model(more_in_range, more_out_of_range))
    assert const_fp32 is not None, "There is no Constant op on FP32 branch"
    assert const_fp16 is not None, "There is no compressed Constant + Convert op on FP16 branch"
    # fp32 branch: untouched values, original element type
    assert const_fp32.get_output_element_type(0) == ov.Type.f32
    expected_fp32 = np.array(more_out_of_range, dtype=np.float32)
    assert np.all(expected_fp32 == const_fp32.get_vector())
    # fp16 branch: clamped/flushed values, f16 element type
    assert const_fp16.get_output_element_type(0) == ov.Type.f16
    expected_fp16 = np.array(converted_more_in_range, dtype=np.float32)
    msg = f"Difference: {expected_fp16 - const_fp16.get_vector()}"
    assert np.all(expected_fp16 == const_fp16.get_vector()), msg
def test_compression_2():
    """Both branches fit into fp16: both constants must be compressed and clipped."""
    model = make_model(more_in_range, more_in_range)
    first_const, second_const = get_constants(model)
    for const in (first_const, second_const):
        assert const is not None, "There is no Constant op on FP16 branch"
        assert const.get_output_element_type(0) == ov.Type.f16, "Const element type is not f16"
    # reference: clip to the fp16 finite range, then cast
    f16_info = np.finfo(np.float16)
    expected = np.clip(more_in_range, f16_info.min, f16_info.max).astype(np.float16)
    assert np.all(expected == first_const.get_vector())
    assert np.all(expected == second_const.get_vector())
def test_no_compression():
    """Both branches are mostly out of fp16 range: no compression must happen."""
    model = make_model(more_out_of_range, more_out_of_range)
    first_const, second_const = get_constants(model)
    for const in (first_const, second_const):
        assert const is not None, "There is no Constant op on FP32 branch"
        assert const.get_output_element_type(0) == ov.Type.f32, "Const element type is not f32"
    expected = np.array(more_out_of_range, dtype=np.float32)
    assert np.all(expected == first_const.get_vector())
    assert np.all(expected == second_const.get_vector())

View File

@@ -24,7 +24,13 @@ class TRANSFORMATIONS_API CompressFloatConstants;
class ov::pass::CompressFloatConstantsImpl : public ov::pass::MatcherPass {
public:
OPENVINO_RTTI("CompressFloatConstantsImpl", "0");
CompressFloatConstantsImpl();
/// @brief Transformation constructor
/// @param postponed If true then the transformation won't compress the constants
/// keeping them in the original type but still will insert Converts. This is
/// a special mode of operation that requires another transformation to
/// apply a real compression on constants. Constants eligible for
/// postponed compression are marked with a special rt_info tag.
CompressFloatConstantsImpl(bool postponed = false);
};
/**
@@ -44,8 +50,10 @@ public:
class ov::pass::CompressFloatConstants : public ov::pass::GraphRewrite {
public:
OPENVINO_RTTI("CompressFloatConstants", "0");
CompressFloatConstants() {
add_matcher<ov::pass::CompressFloatConstantsImpl>();
/// @brief Transformation constructor
/// @param postponed Postponed compression, see ov::pass::CompressFloatConstantsImpl for details.
CompressFloatConstants(bool postponed = false) {
add_matcher<ov::pass::CompressFloatConstantsImpl>(postponed);
add_matcher<ov::pass::AddOldApiMapToParameters>();
}
};

View File

@@ -16,6 +16,12 @@ TRANSFORMATIONS_API void enable_fp16_compression(const std::shared_ptr<Node>& no
TRANSFORMATIONS_API bool fp16_compression_is_disabled(const std::shared_ptr<const Node>& node);
TRANSFORMATIONS_API void postpone_fp16_compression(RTMap& rt_info);
TRANSFORMATIONS_API bool is_fp16_compression_postponed(const RTMap& rt_info);
TRANSFORMATIONS_API void do_not_postpone_fp16_compression(RTMap& rt_info);
/**
* @ingroup ie_runtime_attr_api
* @brief DisableFP16Compression class represents runtime info attribute that marks operation

View File

@@ -5,6 +5,7 @@
#include "transformations/common_optimizations/compress_float_constants.hpp"
#include "itt.hpp"
#include "ngraph/runtime/reference/convert.hpp"
#include "openvino/core/rt_info.hpp"
#include "openvino/op/constant.hpp"
#include "openvino/op/convert.hpp"
@@ -16,7 +17,8 @@
namespace {
template <ov::element::Type_t PREC_FROM>
std::shared_ptr<ov::Node> change_constant_precision_to_fp16(std::shared_ptr<ov::op::v0::Constant>& constant) {
std::shared_ptr<ov::Node> change_constant_precision_to_fp16(std::shared_ptr<ov::op::v0::Constant>& constant,
bool postponed = false) {
using src_type = typename ov::element_type_traits<PREC_FROM>::value_type;
const auto* src_data = constant->get_data_ptr<src_type>();
@@ -24,9 +26,10 @@ std::shared_ptr<ov::Node> change_constant_precision_to_fp16(std::shared_ptr<ov::
auto new_constant = std::make_shared<ov::op::v0::Constant>(ov::element::f16, constant->get_shape());
auto* dst_data = const_cast<ov::float16*>(reinterpret_cast<const ov::float16*>(new_constant->get_data_ptr()));
if (dst_data == nullptr)
if (!dst_data || !size)
return nullptr;
// slow implementation: is used when optimized ones are not available: f64 or for ARM (both for f64 and f32)
int num_out_of_range = 0;
for (size_t i = 0; i < size; ++i) {
// if abs value is smaller than the smallest positive fp16, but not zero
@@ -44,18 +47,24 @@ std::shared_ptr<ov::Node> change_constant_precision_to_fp16(std::shared_ptr<ov::
}
// if more than 75% of a FP32 constant do not fit into FP16 keep in FP32
float keep_threshold = 0.75f;
float out_of_range_proportion = static_cast<float>(num_out_of_range) / static_cast<float>(size);
const float keep_threshold = 0.75f;
const float out_of_range_proportion = static_cast<float>(num_out_of_range) / static_cast<float>(size);
if (out_of_range_proportion >= keep_threshold) {
return nullptr;
}
return new_constant;
if (postponed) {
// dispose just converted constant to avoid allocation too much memory
// it will be converted again while serialization
return constant;
} else {
return new_constant;
}
}
} // namespace
ov::pass::CompressFloatConstantsImpl::CompressFloatConstantsImpl() {
ov::pass::CompressFloatConstantsImpl::CompressFloatConstantsImpl(bool postponed) {
MATCHER_SCOPE(CompressFloatConstantsImpl);
auto const_pattern = pattern::wrap_type<ov::op::v0::Constant>();
@@ -72,26 +81,68 @@ ov::pass::CompressFloatConstantsImpl::CompressFloatConstantsImpl() {
auto c_type = const_node->get_element_type();
std::shared_ptr<ov::Node> new_const;
#if !defined(OPENVINO_ARCH_X86) && !defined(OPENVINO_ARCH_X86_64)
if (c_type == ov::element::f32) {
new_const = change_constant_precision_to_fp16<ov::element::Type_t::f32>(const_node);
new_const = change_constant_precision_to_fp16<ov::element::Type_t::f32>(const_node, postponed);
} else if (c_type == ov::element::f64) {
new_const = change_constant_precision_to_fp16<ov::element::Type_t::f64>(const_node);
new_const = change_constant_precision_to_fp16<ov::element::Type_t::f64>(const_node, postponed);
}
if (!new_const) // if out of range > threshold -> then new_const == nullptr
return false;
#else
if (c_type == ov::element::f32) {
auto size = shape_size(const_node->get_output_shape(0));
if (size == 0)
return false;
auto num_out_of_range =
ngraph::runtime::reference::count_out_of_f16_range(const_node->get_data_ptr<ov::element::f32>(), size);
// if more than 75% of a FP32 constant do not fit into FP16 keep in FP32
const float keep_threshold = 0.75f;
const float out_of_range_proportion = static_cast<float>(num_out_of_range) / static_cast<float>(size);
if (out_of_range_proportion >= keep_threshold)
return false;
if (postponed) {
new_const = const_node;
} else {
const auto* src_data = const_node->get_data_ptr<float>();
auto compressed_const =
std::make_shared<ov::op::v0::Constant>(ov::element::f16, const_node->get_shape());
auto* dst_data =
const_cast<ov::float16*>(reinterpret_cast<const ov::float16*>(compressed_const->get_data_ptr()));
OPENVINO_ASSERT(dst_data);
ngraph::runtime::reference::convert_from_f32_to_f16_with_clamp(src_data, dst_data, size);
new_const = compressed_const;
}
} else if (c_type == ov::element::f64) {
new_const = change_constant_precision_to_fp16<ov::element::Type_t::f64>(const_node, postponed);
} else {
return false;
}
#endif // !defined(OPENVINO_ARCH_X86) && !defined(OPENVINO_ARCH_X86_64)
if (!new_const) {
return false;
}
auto constant_target_inputs = const_node->get_output_target_inputs(0);
auto convert = std::make_shared<ov::op::v0::Convert>(new_const, const_node->get_element_type());
new_const->set_friendly_name(const_node->get_friendly_name() + "_compressed");
convert->set_friendly_name(const_node->get_friendly_name());
new_const->set_friendly_name(const_node->get_friendly_name() + "_compressed");
ov::copy_runtime_info(const_node, convert);
ov::mark_as_decompression(convert);
if (postponed) {
postpone_fp16_compression(new_const->get_rt_info());
postpone_fp16_compression(new_const->get_output_tensor(0).get_rt_info());
ov::replace_node(const_node, convert);
for (const auto& target_input : constant_target_inputs) {
target_input.replace_source_output(convert);
}
} else {
ov::replace_node(const_node, convert);
}
return true;
};

View File

@@ -895,6 +895,26 @@ std::shared_ptr<ngraph::Node> change_constant_precision(std::shared_ptr<opset4::
return new_constant;
}
// Fast-path specialization for f32 -> f16: delegates the element-wise
// conversion (with clamping to the f16 finite range) to the optimized
// reference implementation instead of the generic per-element loop.
template <>
std::shared_ptr<Node> change_constant_precision<ov::element::Type_t::f32, ov::element::Type_t::f16>(
    std::shared_ptr<opset4::Constant>& constant) {
    using src_type = typename element_type_traits<ov::element::Type_t::f32>::value_type;
    using dst_type = typename element_type_traits<ov::element::Type_t::f16>::value_type;
    const auto* src_data = constant->get_data_ptr<src_type>();
    const auto size = shape_size(constant->get_shape());
    auto new_constant = std::make_shared<opset4::Constant>(ov::element::Type_t::f16, constant->get_shape());
    // preserve tensor names on the replacement output
    new_constant->output(0).set_names(constant->output(0).get_names());
    // cast away const to fill the freshly allocated constant's buffer in place
    auto* dst_data = const_cast<dst_type*>(reinterpret_cast<const dst_type*>(new_constant->get_data_ptr()));
    if (dst_data == nullptr)
        OPENVINO_THROW("Can't get destination data pointer");
    ngraph::runtime::reference::convert_from_f32_to_f16_with_clamp(src_data, dst_data, size);
    return new_constant;
}
template <>
std::shared_ptr<Node> change_constant_precision<ov::element::Type_t::f16, ov::element::Type_t::f32>(
std::shared_ptr<opset4::Constant>& constant) {

View File

@@ -4,6 +4,10 @@
#include "transformations/rt_info/disable_fp16_compression.hpp"
namespace {
// RTMap key marking constants whose fp16 compression is postponed until
// serialization.  Stored by value: the original bound a namespace-scope
// const reference to a temporary std::string, which is legal (lifetime
// extension) but needlessly subtle.
const std::string postponed_fp16_compression_tag = "postponed_fp16_compression";
}  // namespace
void ov::disable_fp16_compression(const std::shared_ptr<Node>& node) {
auto& rt_info = node->get_rt_info();
rt_info[DisableFP16Compression::get_type_info_static()] = DisableFP16Compression{};
@@ -18,3 +22,15 @@ bool ov::fp16_compression_is_disabled(const std::shared_ptr<const Node>& node) {
const auto& rt_info = node->get_rt_info();
return rt_info.count(DisableFP16Compression::get_type_info_static());
}
// Marks the given rt_info so that fp16 compression of the owning constant is
// deferred (it will be performed later, e.g. during serialization).
void ov::postpone_fp16_compression(ov::RTMap& rt_info) {
    rt_info[postponed_fp16_compression_tag] = true;
}
// Returns true when the postponed-compression mark is present in the rt_info.
bool ov::is_fp16_compression_postponed(const ov::RTMap& rt_info) {
    return rt_info.find(postponed_fp16_compression_tag) != rt_info.end();
}
// Removes the postponed-compression mark; a no-op if the mark is absent.
void ov::do_not_postpone_fp16_compression(ov::RTMap& rt_info) {
    rt_info.erase(postponed_fp16_compression_tag);
}

View File

@@ -115,6 +115,12 @@ void convert<float16, int8_t>(const float16* arg, int8_t* out, size_t count);
#endif // OPENVINO_ARCH_X86 || OPENVINO_ARCH_X86_64
// Count how many f32 values are out of the normal finite numbers range when converted to f16
size_t count_out_of_f16_range(const float* arg, size_t count);
// Convert values from f32 to f16 with clamping to f16 min/max when the value is out of the normal finite numbers range
void convert_from_f32_to_f16_with_clamp(const float* arg, float16* out, size_t count);
// overload to handle ngraph::boolean (it is stored as char)
template <typename TI, typename TO>
typename std::enable_if<std::is_same<TO, char>::value>::type convert(const TI* arg, TO* out, size_t count) {

View File

@@ -5,17 +5,18 @@
#include "ngraph/runtime/reference/convert.hpp"
#if defined(OPENVINO_ARCH_X86) || defined(OPENVINO_ARCH_X86_64)
# include "jit_generator.hpp"
#endif
namespace ngraph {
namespace runtime {
namespace reference {
#if defined(OPENVINO_ARCH_X86) || defined(OPENVINO_ARCH_X86_64)
namespace {
template <typename src_t, typename dst_t>
template <typename src_t, typename dst_t, bool clamp = false>
void jit_convert_vec(jit::Generator&, const Xbyak::RegExp&, const Xbyak::RegExp&);
template <typename src_t, typename dst_t>
template <typename src_t, typename dst_t, bool clamp = false>
void jit_convert_vec_prepare(jit::Generator&) {}
template <>
@@ -53,6 +54,37 @@ void jit_convert_vec<float, float16>(jit::Generator& gen, const Xbyak::RegExp& s
gen.vmovdqu(gen.xword[dst], f16vec);
}
// Preparation for the clamping f32 -> f16 convert kernel: broadcast the f16
// finite max/lowest (as f32) into ymm5/ymm6 so the per-vector step can clamp
// with vminps/vmaxps.  Uses r15 as a scratch address register.
template <>
void jit_convert_vec_prepare<float, float16, true>(jit::Generator& gen) {
    auto upper_bound = gen.ymm5;
    auto lower_bound = gen.ymm6;
    auto addr = gen.r15;
    // statics give the bound arrays a stable address to embed in the JIT code
    static const float f16_max = std::numeric_limits<ov::float16>::max();
    static const float f16_min = std::numeric_limits<ov::float16>::lowest();
    static const float upper_bounds[8] = {f16_max, f16_max, f16_max, f16_max, f16_max, f16_max, f16_max, f16_max};
    static const float lower_bounds[8] = {f16_min, f16_min, f16_min, f16_min, f16_min, f16_min, f16_min, f16_min};
    gen.mov(addr, (size_t)upper_bounds);
    gen.vmovdqu(upper_bound, gen.yword[addr]);
    gen.mov(addr, (size_t)lower_bounds);
    gen.vmovdqu(lower_bound, gen.yword[addr]);
}
// Per-vector step of the clamping f32 -> f16 convert: load 8 floats, clamp to
// the f16 finite range (bounds prepared above in ymm5/ymm6), convert with
// vcvtps2ph and store 8 half-precision values.
template <>
void jit_convert_vec<float, float16, true>(jit::Generator& gen, const Xbyak::RegExp& src, const Xbyak::RegExp& dst) {
    auto f16vec = gen.xmm3;
    auto f32vec = gen.ymm4;
    auto upper_bound = gen.ymm5;
    auto lower_bound = gen.ymm6;
    gen.vmovups(f32vec, gen.yword[src]);
    gen.vminps(f32vec, f32vec, upper_bound);
    gen.vmaxps(f32vec, f32vec, lower_bound);
    gen.vcvtps2ph(f16vec, f32vec, 0);
    gen.vmovdqu(gen.xword[dst], f16vec);
}
template <>
void jit_convert_vec_prepare<float, int8_t>(jit::Generator& gen) {
auto order = gen.ymm1;
@@ -175,13 +207,13 @@ public:
typedef void (*fn_t)(const args_t*);
template <typename src_t, typename dst_t>
template <typename src_t, typename dst_t, bool clamp = false>
static fn_t get() {
if (is_x64() && mayiuse(avx) && mayiuse(avx2) && mayiuse(fp16)) {
static const jit_convert_array::context_t context{{sizeof(src_t), &jit::Generator::copy<src_t>},
{sizeof(dst_t), &jit::Generator::copy<dst_t>},
jit_convert_vec<src_t, dst_t>,
jit_convert_vec_prepare<src_t, dst_t>};
jit_convert_vec<src_t, dst_t, clamp>,
jit_convert_vec_prepare<src_t, dst_t, clamp>};
static jit_convert_array generator(context);
@@ -191,9 +223,9 @@ public:
}
};
template <typename TI, typename TO>
template <typename TI, typename TO, bool clamp = false>
void convert_impl(const TI* arg, TO* out, size_t count) {
auto converter = jit_convert_array::get<TI, TO>();
auto converter = jit_convert_array::get<TI, TO, clamp>();
if (converter) {
jit_convert_array::args_t args = {arg, out, count};
@@ -204,6 +236,232 @@ void convert_impl(const TI* arg, TO* out, size_t count) {
}
}
}
// Clamping f32 -> f16 conversion.  Uses the JIT kernel when available
// (see jit_convert_array::get); otherwise falls back to a scalar loop that
// clamps to the f16 finite range.  NaN fails both comparisons and is
// converted directly by the cast.
template <>
void convert_impl<float, float16, true>(const float* arg, float16* out, size_t count) {
    auto converter = jit_convert_array::get<float, float16, true>();
    if (converter) {
        jit_convert_array::args_t args = {arg, out, count};
        converter(&args);
    } else {
        for (size_t i = 0; i < count; ++i) {
            if (arg[i] > std::numeric_limits<ov::float16>::max()) {
                out[i] = std::numeric_limits<ov::float16>::max();
            } else if (arg[i] < std::numeric_limits<ov::float16>::lowest()) {
                out[i] = std::numeric_limits<ov::float16>::lowest();
            } else {
                out[i] = static_cast<ov::float16>(arg[i]);
            }
        }
    }
}
// Generic hooks for the out-of-range counting JIT kernel: prepare and
// finalize default to no-ops; the per-vector counting step has no generic
// implementation and must be specialized per type pair.
template <typename data_t, typename range_t>
void jit_count_out_of_range_vec_prepare(jit::Generator&) {}
template <typename data_t, typename range_t>
void jit_count_out_of_range_vec(jit::Generator&, const Xbyak::RegExp&);
template <typename data_t, typename range_t>
void jit_count_out_of_range_vec_finalize(jit::Generator&, const Xbyak::RegExp&) {}
// Preparation for counting f32 values that would not survive f32 -> f16
// conversion: broadcast the f16 finite bounds, the smallest +-f16 denormal,
// zero, and an all-ones i32 vector into fixed ymm registers, and clear the
// i64 accumulator (ymm4).  Uses r15 as a scratch address register.
template <>
void jit_count_out_of_range_vec_prepare<float, float16>(jit::Generator& gen) {
    auto accum_vec = gen.ymm4;
    auto f16_max_pos_vec = gen.ymm5;
    auto f16_max_neg_vec = gen.ymm6;
    auto f16_min_pos_vec = gen.ymm7;
    auto f16_min_neg_vec = gen.ymm8;
    auto f16_zero_vec = gen.ymm9;
    auto i32_ones_vec = gen.ymm10;
    auto addr = gen.r15;
    static const float f16_max_pos = std::numeric_limits<ov::float16>::max();
    static const float f16_max_neg = std::numeric_limits<ov::float16>::lowest();
    // 0x0001 is the smallest positive f16 denormal
    static const float f16_min_pos = ov::float16::from_bits(0x0001);
    static const float f16_min_neg = -ov::float16::from_bits(0x0001);
    static const int32_t i32_one = 1;
    // statics give the broadcast arrays a stable address to embed in JIT code
    static const float max_pos_bounds[8] =
        {f16_max_pos, f16_max_pos, f16_max_pos, f16_max_pos, f16_max_pos, f16_max_pos, f16_max_pos, f16_max_pos};
    static const float max_neg_bounds[8] =
        {f16_max_neg, f16_max_neg, f16_max_neg, f16_max_neg, f16_max_neg, f16_max_neg, f16_max_neg, f16_max_neg};
    static const float min_pos_bounds[8] =
        {f16_min_pos, f16_min_pos, f16_min_pos, f16_min_pos, f16_min_pos, f16_min_pos, f16_min_pos, f16_min_pos};
    static const float min_neg_bounds[8] =
        {f16_min_neg, f16_min_neg, f16_min_neg, f16_min_neg, f16_min_neg, f16_min_neg, f16_min_neg, f16_min_neg};
    static const int32_t i32_ones[8] = {i32_one, i32_one, i32_one, i32_one, i32_one, i32_one, i32_one, i32_one};
    auto load_vec = [&gen, &addr](Xbyak::Ymm vec, size_t ptr) {
        gen.mov(addr, ptr);
        gen.vmovdqu(vec, gen.yword[addr]);
    };
    load_vec(f16_max_pos_vec, (size_t)max_pos_bounds);
    load_vec(f16_max_neg_vec, (size_t)max_neg_bounds);
    load_vec(f16_min_pos_vec, (size_t)min_pos_bounds);
    load_vec(f16_min_neg_vec, (size_t)min_neg_bounds);
    load_vec(i32_ones_vec, (size_t)i32_ones);
    gen.vxorps(f16_zero_vec, f16_zero_vec, f16_zero_vec);
    gen.vxorps(accum_vec, accum_vec, accum_vec);
}
// Per-vector step: for 8 floats, build a mask of lanes that are out of the
// f16 representable range (non-zero denormals, or above max / below lowest),
// turn each masked lane into 1, and accumulate into the i64 counters in ymm4.
template <>
void jit_count_out_of_range_vec<float, float16>(jit::Generator& gen, const Xbyak::RegExp& data) {
    auto data_vec = gen.ymm1;
    auto mask_vec = gen.ymm2;
    auto mask_vec_xmm = gen.xmm2;
    auto tmp_vec = gen.ymm3;
    auto accum_vec = gen.ymm4;
    auto f16_max_pos_vec = gen.ymm5;
    auto f16_max_neg_vec = gen.ymm6;
    auto f16_min_pos_vec = gen.ymm7;
    auto f16_min_neg_vec = gen.ymm8;
    auto f16_zero_vec = gen.ymm9;
    auto i32_ones_vec = gen.ymm10;
    // vcmpps predicate immediates
    const unsigned char _cmp_lt_os = 1;
    const unsigned char _cmp_neq_uq = 4;
    const unsigned char _cmp_gt_os = 6;
    // std::abs(data) < ov::float16::from_bits(0x0001)
    gen.vmovups(data_vec, gen.yword[data]);
    gen.vcmpps(tmp_vec, data_vec, f16_min_pos_vec, _cmp_lt_os);
    gen.vcmpps(mask_vec, data_vec, f16_min_neg_vec, _cmp_gt_os);
    gen.vandps(mask_vec, mask_vec, tmp_vec);
    // data != 0.0f
    gen.vcmpps(tmp_vec, data_vec, f16_zero_vec, _cmp_neq_uq);
    gen.vandps(mask_vec, mask_vec, tmp_vec);
    // data > std::numeric_limits<ov::float16>::max()
    gen.vcmpps(tmp_vec, data_vec, f16_max_pos_vec, _cmp_gt_os);
    gen.vorps(mask_vec, mask_vec, tmp_vec);
    // data < std::numeric_limits<ov::float16>::lowest()
    gen.vcmpps(tmp_vec, data_vec, f16_max_neg_vec, _cmp_lt_os);
    gen.vorps(mask_vec, mask_vec, tmp_vec);
    // addition to i64 accumulator
    gen.vandps(mask_vec, mask_vec, i32_ones_vec);
    gen.vphaddd(mask_vec, mask_vec, mask_vec);
    gen.vpermq(mask_vec, mask_vec, 0x08);
    gen.vpmovsxdq(mask_vec, mask_vec_xmm);
    gen.vpaddq(accum_vec, accum_vec, mask_vec);
}
// Finalize: horizontally sum the four i64 partial counters in ymm4 and store
// the single i64 result at *dst.
template <>
void jit_count_out_of_range_vec_finalize<float, float16>(jit::Generator& gen, const Xbyak::RegExp& dst) {
    auto tmp_vec_xmm0 = gen.xmm2;  // reuse mask_vec
    auto tmp_vec_xmm1 = gen.xmm3;  // reuse tmp_vec
    auto accum_vec_ymm = gen.ymm4;
    auto accum_vec_xmm = gen.xmm4;
    // horizontal sum of four i64 values
    gen.vextractf128(tmp_vec_xmm0, accum_vec_ymm, 0);
    gen.vextractf128(tmp_vec_xmm1, accum_vec_ymm, 1);
    gen.vpaddq(accum_vec_xmm, tmp_vec_xmm0, tmp_vec_xmm1);
    gen.vpermilpd(tmp_vec_xmm0, accum_vec_xmm, 0x01);
    gen.vpaddq(accum_vec_xmm, accum_vec_xmm, tmp_vec_xmm0);
    gen.vmovq(gen.qword[dst], accum_vec_xmm);
}
// JIT generator that counts how many elements of a float array fall out of
// the f16 representable range.  The main loop consumes 8 floats (vlen) per
// iteration via the context's count_out_of_range hook; the remainder is
// copied into a zero-filled stack buffer so the same vector step can handle
// the tail (padded zeros are counted as in-range).  The final i64 count is
// written to args_t::dst.
class jit_count_out_of_range : public jit::Generator {
    typedef struct context {
        struct {
            size_t type_size;
            void (jit::Generator::*copy)(const Xbyak::Reg64& dst, const Xbyak::Reg64& src, const Xbyak::Reg64& size);
        } data;
        void (*prepare)(jit::Generator&);
        void (*count_out_of_range)(jit::Generator&, const Xbyak::RegExp&);
        void (*finalize)(jit::Generator&, const Xbyak::RegExp& dst);
    } context_t;

    jit_count_out_of_range(const context_t& ctx) {
        using namespace Xbyak;

        const uint32_t vlen = 8u;  // floats per ymm vector

        auto reg_src = rax;
        auto reg_dst = rbx;
        auto reg_sz = rdx;

        Label tail, exit;

        preamble();
        ctx.prepare(*this);

        mov(reg_src, ptr[param + offsetof(args_t, src)]);
        mov(reg_dst, ptr[param + offsetof(args_t, dst)]);
        mov(reg_sz, ptr[param + offsetof(args_t, count)]);
        xor_(rsi, rsi);
        mov(r8, reg_sz);
        shr(r8, 3);  // r8 = number of full 8-element vectors

        // main loop over full vectors; rsi counts iterations
        foreach (rsi, 1, r8, [&, this](const Xbyak::Reg64& idx) {
            ctx.count_out_of_range(*this, reg_src);
            add(reg_src, static_cast<uint32_t>(ctx.data.type_size * vlen));
        })
            ;

        L(tail);
        shl(rsi, 3);         // elements already processed
        sub(reg_sz, rsi);    // reg_sz = tail length
        test(reg_sz, reg_sz);
        jz(exit);

        // allocate array for 8 floats on stack
        sub(rsp, vlen * sizeof(float));
        mov(r8, rsp);

        // zero-fill the buffer so padded lanes read as 0.0f (in-range)
        auto tmp_vec = ymm2;  // reuse mask_vec
        vpxor(tmp_vec, tmp_vec, tmp_vec);
        vmovups(yword[r8], tmp_vec);

        // Tail conversion
        (this->*ctx.data.copy)(r8, reg_src, reg_sz);
        ctx.count_out_of_range(*this, r8);

        // Free the array on stack
        add(rsp, vlen * sizeof(float));

        L(exit);

        ctx.finalize(*this, reg_dst);
        postamble();
    }

public:
    typedef struct {
        const void* src;
        void* dst;
        const size_t count;
    } args_t;

    typedef void (*fn_t)(const args_t*);

    // Returns the generated kernel, or nullptr when AVX2 is unavailable
    // (callers must then use the scalar fallback).
    template <typename data_t, typename range_t>
    static fn_t get() {
        if (is_x64() && mayiuse(avx2)) {
            static const jit_count_out_of_range::context_t context{
                {sizeof(data_t), &jit::Generator::copy<data_t>},
                jit_count_out_of_range_vec_prepare<data_t, range_t>,
                jit_count_out_of_range_vec<data_t, range_t>,
                jit_count_out_of_range_vec_finalize<data_t, range_t>};

            static jit_count_out_of_range generator(context);

            return (fn_t)generator.getCode();
        }
        return nullptr;
    }
};
} // namespace
template <>
@@ -231,8 +489,49 @@ void convert<float16, int8_t>(const float16* arg, int8_t* out, size_t count) {
convert_impl(arg, out, count);
}
#endif // OPENVINO_ARCH_X86 || OPENVINO_ARCH_X86_64
// Convert `count` f32 values to f16, clamping values outside the f16 finite
// range to f16 max/lowest.  NaN fails both comparisons and is converted
// directly by the cast.
void convert_from_f32_to_f16_with_clamp(const float* arg, float16* out, size_t count) {
#if defined(OPENVINO_ARCH_X86) || defined(OPENVINO_ARCH_X86_64)
    // JIT path with a scalar fallback inside convert_impl
    convert_impl<float, float16, true>(arg, out, count);
#else
    // FIXME: duplicate and stub for ARM, provide more optimized solution
    for (size_t i = 0; i < count; ++i) {
        if (arg[i] > std::numeric_limits<ov::float16>::max()) {
            out[i] = std::numeric_limits<ov::float16>::max();
        } else if (arg[i] < std::numeric_limits<ov::float16>::lowest()) {
            out[i] = std::numeric_limits<ov::float16>::lowest();
        } else {
            out[i] = static_cast<ov::float16>(arg[i]);
        }
    }
#endif  // defined(OPENVINO_ARCH_X86) || defined(OPENVINO_ARCH_X86_64)
}
// Count f32 values that would not survive conversion to f16: non-zero values
// smaller in magnitude than the smallest f16 denormal, or values above f16
// max / below f16 lowest.  Uses the AVX2 JIT kernel when available, with a
// scalar fallback otherwise.
size_t count_out_of_f16_range(const float* arg, size_t count) {
    size_t num_out_of_range = 0;

#if defined(OPENVINO_ARCH_X86) || defined(OPENVINO_ARCH_X86_64)
    auto converter = jit_count_out_of_range::get<float, float16>();
    if (converter) {
        jit_count_out_of_range::args_t args = {arg, &num_out_of_range, count};
        converter(&args);
        return num_out_of_range;
    }
#endif
    for (size_t i = 0; i < count; ++i) {
        // if abs value is smaller than the smallest positive fp16, but not zero
        if (std::abs(arg[i]) < ov::float16::from_bits(0x0001) && arg[i] != 0.0f) {
            num_out_of_range++;
        } else if (arg[i] > std::numeric_limits<ov::float16>::max()) {
            num_out_of_range++;
        } else if (arg[i] < std::numeric_limits<ov::float16>::lowest()) {
            num_out_of_range++;
        }
    }
    return num_out_of_range;
}
} // namespace reference
} // namespace runtime
} // namespace ngraph
#endif // OPENVINO_ARCH_X86 || OPENVINO_ARCH_X86_64

View File

@@ -356,7 +356,7 @@ void save_model(const std::shared_ptr<const ov::Model>& m, const std::string& ou
ov::pass::Manager manager;
if (compress_to_fp16) {
manager.register_pass<ov::pass::MarkPrecisionSensitiveConstants>();
manager.register_pass<ov::pass::CompressFloatConstants>();
manager.register_pass<ov::pass::CompressFloatConstants>(/*postponed=*/true);
}
manager.register_pass<ov::pass::FusedNamesCleanup>();
manager.register_pass<ov::pass::Serialize>(output_model, "");

View File

@@ -12,16 +12,19 @@
#include <unordered_map>
#include <unordered_set>
#include "ngraph/runtime/reference/convert.hpp"
#include "openvino/core/coordinate_diff.hpp"
#include "openvino/core/except.hpp"
#include "openvino/core/meta_data.hpp"
#include "openvino/core/model.hpp"
#include "openvino/core/type/float16.hpp"
#include "openvino/op/util/framework_node.hpp"
#include "openvino/opsets/opset1.hpp"
#include "openvino/pass/constant_folding.hpp"
#include "openvino/util/file_util.hpp"
#include "pugixml.hpp"
#include "transformations/hash.hpp"
#include "transformations/rt_info/disable_fp16_compression.hpp"
#include "transformations/rt_info/primitives_priority_attribute.hpp"
OPENVINO_SUPPRESS_DEPRECATED_START
@@ -89,13 +92,28 @@ public:
m_enable_compression(enable_compression),
m_blob_offset(bin_data.tellp()) {}
FilePosition write(const char* ptr, size_t size) {
FilePosition write(const char* ptr,
size_t size,
size_t* new_size,
bool compress_to_fp16 = false,
ov::element::Type src_type = ov::element::dynamic) {
const FilePosition write_pos = m_binary_output.tellp();
const auto offset = write_pos - m_blob_offset;
if (!m_enable_compression) {
m_binary_output.write(ptr, size);
*new_size = size;
if (!m_enable_compression || compress_to_fp16) {
write_with_optional_fp16_compression(ptr, size, new_size, compress_to_fp16, src_type);
return offset;
}
// TODO: Find a way to keep both types of compression (m_enable_compression and compress_to_fp16)
// simultaneously. Disabled usual compression by m_enable_compression for those constants that are requested to
// be compressed by compress_to_fp16 for now. To implement both compression types applied simultaneously
// we need to save element_type for each constant in the cache together with the compression status
// that implies a wider impact and requires a more accurate implementation of cache handling.
// When FP16 compression is turned on together with the usual compression enabled by m_enable_compression, we
// can avoid comparing FP32 weights, but it would require comparing with data from a file, because on-the-fly
// converted FP16 constants are not kept in memory.
// This hash is weak (but efficient) and must be replace with some other
// more stable hash algorithm. For example current hash algorithms gives
// the same hash for {2, 2} and {0, 128} arrays. So we have to compare
@@ -107,13 +125,64 @@ public:
return found->second.first;
}
m_binary_output.write(ptr, size);
write_with_optional_fp16_compression(ptr, size, new_size, compress_to_fp16, src_type);
m_hash_to_file_positions.insert({hash, {offset, static_cast<void const*>(ptr)}});
return offset;
}
private:
// Writes `size` bytes from `ptr` to the binary output stream, optionally
// converting the payload to FP16 on the fly.
//
// When `compress_to_fp16` is false the buffer is written verbatim; otherwise
// the buffer is interpreted as elements of `src_type` (f32 or f64), converted
// to f16 with clamping, and the converted bytes are written instead.
// `*new_size` receives the number of bytes actually written to the stream.
void write_with_optional_fp16_compression(const char* ptr,
                                          size_t size,
                                          size_t* new_size,
                                          bool compress_to_fp16 = false,
                                          ov::element::Type src_type = ov::element::dynamic) {
    if (!compress_to_fp16) {
        m_binary_output.write(ptr, size);
        // Report the written size here as well, so this helper is correct even
        // if a caller did not pre-initialize *new_size before the call.
        *new_size = size;
    } else {
        // The byte count must describe a whole number of src_type elements.
        OPENVINO_ASSERT(size % src_type.size() == 0);
        auto fp16_buffer = compress_data_to_fp16(ptr, size, src_type, new_size);
        m_binary_output.write(fp16_buffer.get(), *new_size);
        // The temporary FP16 buffer is disposed here; the converted data
        // exists only in the output file from this point on.
    }
}
// Converts a raw buffer of `src_type` elements (f32 or f64) into a newly
// allocated FP16 buffer.
//
// `size` is the source buffer size in bytes; on return `*compressed_size`
// holds the size in bytes of the returned FP16 buffer. Values outside the
// representable f16 range are clamped to f16 max/lowest, and nonzero values
// with magnitude below the smallest positive f16 are flushed to zero.
// Throws for any other `src_type`.
std::unique_ptr<char[]> compress_data_to_fp16(const char* ptr,
                                              size_t size,
                                              ov::element::Type src_type,
                                              size_t* compressed_size) {
    auto num_src_elements = size / src_type.size();
    *compressed_size = num_src_elements * ov::element::f16.size();
    if (src_type == ov::element::f32) {
        auto new_ptr = std::unique_ptr<char[]>(new char[*compressed_size]);
        auto dst_data = reinterpret_cast<ov::float16*>(new_ptr.get());
        auto src_data = reinterpret_cast<const float*>(ptr);
        // Optimized clamping conversion implemented in the reference runtime.
        ngraph::runtime::reference::convert_from_f32_to_f16_with_clamp(src_data, dst_data, num_src_elements);
        return new_ptr;
    } else if (src_type == ov::element::f64) {
        auto new_ptr = std::unique_ptr<char[]>(new char[*compressed_size]);
        auto dst_data = reinterpret_cast<ov::float16*>(new_ptr.get());
        auto src_data = reinterpret_cast<const double*>(ptr);
        // Reference implementation for fp64 to fp16 conversion.
        for (size_t i = 0; i < num_src_elements; ++i) {
            const double val = src_data[i];
            // If the abs value is smaller than the smallest positive fp16
            // (subnormal bit pattern 0x0001), but not zero, flush to zero.
            if (std::abs(val) < ov::float16::from_bits(0x0001) && val != 0.0) {
                dst_data[i] = 0;
            } else if (val > std::numeric_limits<ov::float16>::max()) {
                dst_data[i] = std::numeric_limits<ov::float16>::max();
            } else if (val < std::numeric_limits<ov::float16>::lowest()) {
                dst_data[i] = std::numeric_limits<ov::float16>::lowest();
            } else {
                dst_data[i] = static_cast<ov::float16>(val);
            }
        }
        return new_ptr;
    } else {
        OPENVINO_THROW("[ INTERNAL ERROR ] Not supported source type for weights compression: ", src_type);
    }
}
ConstWritePositions m_hash_to_file_positions;
std::ostream& m_binary_output;
bool m_enable_compression;
@@ -237,6 +306,8 @@ class XmlSerializer : public ov::AttributeVisitor {
ConstantWriter& m_constant_write_handler;
int64_t m_version;
bool m_deterministic;
bool m_compress_to_fp16;
ov::element::Type m_output_element_type;
template <typename T>
std::string create_atribute_list(ov::ValueAccessor<std::vector<T>>& adapter) {
@@ -354,13 +425,17 @@ public:
const std::map<std::string, ngraph::OpSet>& custom_opsets,
ConstantWriter& constant_write_handler,
int64_t version,
bool deterministic = false)
bool deterministic = false,
bool compress_to_fp16 = false,
ov::element::Type output_element_type = ov::element::dynamic)
: m_xml_node(data),
m_node_type_name(node_type_name),
m_custom_opsets(custom_opsets),
m_constant_write_handler(constant_write_handler),
m_version(version),
m_deterministic(deterministic) {}
m_deterministic(deterministic),
m_compress_to_fp16(compress_to_fp16),
m_output_element_type(output_element_type) {}
void on_adapter(const std::string& name, ov::ValueAccessor<void>& adapter) override {
using BodyTargetNames = std::tuple<std::string, std::string, std::vector<std::string>>;
@@ -444,10 +519,15 @@ public:
ov::as_type<ov::AttributeAdapter<std::shared_ptr<ngraph::runtime::AlignedBuffer>>>(&adapter)) {
if (name == "value" && translate_type_name(m_node_type_name) == "Const") {
const int64_t size = a->get()->size();
int64_t offset = m_constant_write_handler.write(static_cast<const char*>(a->get()->get_ptr()), size);
size_t new_size;
int64_t offset = m_constant_write_handler.write(static_cast<const char*>(a->get()->get_ptr()),
size,
&new_size,
m_compress_to_fp16,
m_output_element_type);
m_xml_node.append_attribute("offset").set_value(static_cast<unsigned long long>(offset));
m_xml_node.append_attribute("size").set_value(static_cast<unsigned long long>(size));
m_xml_node.append_attribute("size").set_value(static_cast<unsigned long long>(new_size));
}
} else if (const auto& a = ov::as_type<ov::AttributeAdapter<ov::op::util::FrameworkNodeAttrs>>(&adapter)) {
const auto& attrs = a->get();
@@ -496,7 +576,13 @@ public:
m_xml_node.append_attribute(name.c_str()).set_value(adapter.get());
}
void on_adapter(const std::string& name, ov::ValueAccessor<std::string>& adapter) override {
m_xml_node.append_attribute(name.c_str()).set_value(adapter.get().c_str());
std::string value;
if (m_compress_to_fp16 && name == "element_type") {
value = ov::as_string(static_cast<ov::element::Type_t>(ov::element::f16));
} else {
value = adapter.get();
}
m_xml_node.append_attribute(name.c_str()).set_value(value.c_str());
}
void on_adapter(const std::string& name, ov::ValueAccessor<int64_t>& adapter) override {
m_xml_node.append_attribute(name.c_str()).set_value(static_cast<long long>(adapter.get()));
@@ -913,7 +999,12 @@ void ngfunction_2_ir(pugi::xml_node& netXml,
pugi::xml_node port = input.append_child("port");
port.append_attribute("id").set_value(port_id++);
port.append_attribute("precision").set_value(get_precision_name(i.get_element_type()).c_str());
auto rt_info = i.get_tensor().get_rt_info();
auto port_element_type =
is_fp16_compression_postponed(rt_info) ? ov::element::f16 : i.get_element_type();
port.append_attribute("precision").set_value(get_precision_name(port_element_type).c_str());
for (auto d : i.get_partial_shape()) {
pugi::xml_node dim = port.append_child("dim");
if (d.is_dynamic()) {
@@ -937,7 +1028,12 @@ void ngfunction_2_ir(pugi::xml_node& netXml,
for (auto& o : node->outputs()) {
pugi::xml_node port = output.append_child("port");
port.append_attribute("id").set_value(port_id++);
port.append_attribute("precision").set_value(get_precision_name(o.get_element_type()).c_str());
auto rt_info = o.get_tensor().get_rt_info();
auto port_element_type =
is_fp16_compression_postponed(rt_info) ? ov::element::f16 : o.get_element_type();
port.append_attribute("precision").set_value(get_precision_name(port_element_type).c_str());
// Sort tensor names
const auto& tensor_names = o.get_tensor().get_names();
@@ -973,6 +1069,12 @@ void ngfunction_2_ir(pugi::xml_node& netXml,
// fill <data> general attributes
{
bool compress_to_fp16 = false;
ov::element::Type output_element_type = ov::element::dynamic;
if (is_fp16_compression_postponed(node->get_rt_info())) {
compress_to_fp16 = true;
output_element_type = node->get_output_element_type(0);
}
// Backward compatibility: clear padding values for nodes with auto_pad
PaddingsFixer fixed_node(node);
XmlSerializer visitor(data,
@@ -980,7 +1082,9 @@ void ngfunction_2_ir(pugi::xml_node& netXml,
custom_opsets,
constant_node_write_handler,
version,
deterministic);
deterministic,
compress_to_fp16,
output_element_type);
OPENVINO_ASSERT(fixed_node.get_node()->visit_attributes(visitor), "Visitor API is not supported in ", node);
}
rt_info::XmlSerializer{data}.serialize(node->get_rt_info());

View File

@@ -13,7 +13,7 @@ from openvino.tools.ovc.convert_impl import _convert
from openvino.tools.ovc.cli_parser import get_model_name_from_args
# pylint: disable=no-name-in-module,import-error
from openvino.runtime import serialize
from openvino.runtime import save_model
def main():
@@ -24,12 +24,8 @@ def main():
model_path = get_model_name_from_args(argv)
# TODO: replace compress_model + serialize with save_model
if argv.compress_to_fp16:
from openvino.tools.ovc.moc_frontend.offline_transformations import compress_model
compress_model(ngraph_function)
serialize(ngraph_function, model_path.encode('utf-8'), model_path.replace('.xml', '.bin').encode('utf-8'))
compress_to_fp16 = 'compress_to_fp16' in argv and argv.compress_to_fp16
save_model(ngraph_function, model_path.encode('utf-8'), compress_to_fp16)
print('[ SUCCESS ] XML file: {}'.format(model_path))
print('[ SUCCESS ] BIN file: {}'.format(model_path.replace('.xml', '.bin')))