diff --git a/src/bindings/python/tests/test_transformations/test_compression.py b/src/bindings/python/tests/test_transformations/test_compression.py new file mode 100644 index 00000000000..c1ac76c04e5 --- /dev/null +++ b/src/bindings/python/tests/test_transformations/test_compression.py @@ -0,0 +1,116 @@ +# -*- coding: utf-8 -*- +# Copyright (C) 2018-2023 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import tempfile +from typing import List + +import numpy as np +from openvino.runtime.op import Parameter, Constant +from openvino.runtime.opset12 import add, multiply + +import openvino as ov + + +def make_constant(values, transposed): + return Constant(ov.Type.f32, ov.Shape([1, len(values)] if transposed else [len(values), 1]), values) + + +# keep fp16 denormals, flush fp32 denormals to zero +in_range = [-65504.0, -2.0, 1.00097656, -1.0, -0.99951172, -0.00006103515625, -0.000000059604645, 0.0, + 0.000000059604645, 0.99951172, 0.00006103515625, 1.0, 1.00097656, 2.0, 65504] +out_of_range = [float("-inf"), -65505.0, -1e-10, -1e-39, 1e-39, 1e-10, 65505.0, float("inf")] +converted_out_of_range = [-65504.0, -65504.0, 0, 0, 0, 0, 65504.0, 65504.0] + +# test inputs +more_in_range = out_of_range + 10 * in_range +more_out_of_range = in_range + 10 * out_of_range + +# reference after conversion more_in_range to fp16 +converted_more_in_range = converted_out_of_range + 10 * in_range + + +def make_model(add_consts, mul_consts): + parameter1 = Parameter(ov.Type.f32, ov.PartialShape([-1])) + add1 = add(parameter1, make_constant(add_consts, False)) + mul1 = multiply(add1, make_constant(mul_consts, True)) + return ov.Model([mul1], [parameter1]) + + +def get_constants(model) -> List[Constant]: + from pathlib import Path + model_name = Path(tempfile.gettempdir()) / "f32_partially_compressed.xml" + ov.save_model(model, model_name) + core = ov.Core() + restored_model = core.read_model(model_name) + + op_ind_map = {"Add": 0, "Multiply": 1} + constants_list = [[]] * 
len(op_ind_map) + + for op in restored_model.get_ordered_ops(): + op_type = op.get_type_info().name + if op_type not in op_ind_map.keys(): + continue + + in_node = op.input_value(1).get_node() + if in_node.get_type_info().name == "Convert": + const_node = in_node.input_value(0).get_node() + if const_node.get_type_info().name != "Constant": + const_node = None + elif in_node.get_type_info().name == "Constant": + const_node = in_node + + constants_list[op_ind_map[op_type]] = const_node + + for node in constants_list: + assert not isinstance(node, list) + + # sanity check that model is compilable + ov.compile_model(restored_model) + return constants_list + + +def test_compression_1(): + model = make_model(more_in_range, more_out_of_range) + const_fp16, const_fp32 = get_constants(model) + assert const_fp32 is not None, "There is no Constant op on FP32 branch" + assert const_fp16 is not None, "There is no compressed Constant + Convert op on FP16 branch" + + assert const_fp32.get_output_element_type(0) == ov.Type.f32 + assert np.all(np.array(more_out_of_range, dtype=np.float32) == const_fp32.get_vector()) + + assert const_fp16.get_output_element_type(0) == ov.Type.f16 + + msg = f"Difference: {np.array(converted_more_in_range, dtype=np.float32) - const_fp16.get_vector()}" + assert np.all(np.array(converted_more_in_range, dtype=np.float32) == const_fp16.get_vector()), msg + + +def test_compression_2(): + model = make_model(more_in_range, more_in_range) + const_fp16_1, const_fp16_2 = get_constants(model) + + assert const_fp16_1 is not None, "There is no Constant op on FP16 branch" + assert const_fp16_2 is not None, "There is no Constant op on FP16 branch" + + assert const_fp16_1.get_output_element_type(0) == ov.Type.f16, "Const element type is not f16" + assert const_fp16_2.get_output_element_type(0) == ov.Type.f16, "Const element type is not f16" + f16_min, f16_max = np.finfo(np.float16).min, np.finfo(np.float16).max + in_range_clipped = np.clip(more_in_range, f16_min, 
f16_max).astype(np.float16) + + assert np.all(in_range_clipped == const_fp16_1.get_vector()) + assert np.all(in_range_clipped == const_fp16_2.get_vector()) + + +def test_no_compression(): + model = make_model(more_out_of_range, more_out_of_range) + const_fp32_1, const_fp32_2 = get_constants(model) + + assert const_fp32_1 is not None, "There is no Constant op on FP32 branch" + assert const_fp32_2 is not None, "There is no Constant op on FP32 branch" + + assert const_fp32_1.get_output_element_type(0) == ov.Type.f32, "Const element type is not f32" + + assert const_fp32_2.get_output_element_type(0) == ov.Type.f32, "Const element type is not f32" + + assert np.all(np.array(more_out_of_range, dtype=np.float32) == const_fp32_1.get_vector()) + assert np.all(np.array(more_out_of_range, dtype=np.float32) == const_fp32_2.get_vector()) diff --git a/src/common/transformations/include/transformations/common_optimizations/compress_float_constants.hpp b/src/common/transformations/include/transformations/common_optimizations/compress_float_constants.hpp index 7d4db8f62d7..dc6479c0ce6 100644 --- a/src/common/transformations/include/transformations/common_optimizations/compress_float_constants.hpp +++ b/src/common/transformations/include/transformations/common_optimizations/compress_float_constants.hpp @@ -24,7 +24,13 @@ class TRANSFORMATIONS_API CompressFloatConstants; class ov::pass::CompressFloatConstantsImpl : public ov::pass::MatcherPass { public: OPENVINO_RTTI("CompressFloatConstantsImpl", "0"); - CompressFloatConstantsImpl(); + /// @brief Transformation constructor + /// @param postponed If true then the transformation won't compress the constants + /// keeping them in the original type but still will insert Converts. This is + /// a special mode of operation that requires another transformation to + /// apply a real compression on constants. Constants eligible for + /// postponed compression are marked with a special rt_info tag. 
+ CompressFloatConstantsImpl(bool postponed = false); }; /** @@ -44,8 +50,10 @@ public: class ov::pass::CompressFloatConstants : public ov::pass::GraphRewrite { public: OPENVINO_RTTI("CompressFloatConstants", "0"); - CompressFloatConstants() { - add_matcher(); + /// @brief Transformation constructor + /// @param postponed Postponed compression, see ov::pass::CompressFloatConstantsImpl for details. + CompressFloatConstants(bool postponed = false) { + add_matcher(postponed); add_matcher(); } }; diff --git a/src/common/transformations/include/transformations/rt_info/disable_fp16_compression.hpp b/src/common/transformations/include/transformations/rt_info/disable_fp16_compression.hpp index 364b5c74d59..0ecbd3641ad 100644 --- a/src/common/transformations/include/transformations/rt_info/disable_fp16_compression.hpp +++ b/src/common/transformations/include/transformations/rt_info/disable_fp16_compression.hpp @@ -16,6 +16,12 @@ TRANSFORMATIONS_API void enable_fp16_compression(const std::shared_ptr& no TRANSFORMATIONS_API bool fp16_compression_is_disabled(const std::shared_ptr& node); +TRANSFORMATIONS_API void postpone_fp16_compression(RTMap& rt_info); + +TRANSFORMATIONS_API bool is_fp16_compression_postponed(const RTMap& rt_info); + +TRANSFORMATIONS_API void do_not_postpone_fp16_compression(RTMap& rt_info); + /** * @ingroup ie_runtime_attr_api * @brief DisableFP16Compression class represents runtime info attribute that marks operation diff --git a/src/common/transformations/src/transformations/common_optimizations/compress_float_constants.cpp b/src/common/transformations/src/transformations/common_optimizations/compress_float_constants.cpp index 358acc548dd..8cf7988e98e 100644 --- a/src/common/transformations/src/transformations/common_optimizations/compress_float_constants.cpp +++ b/src/common/transformations/src/transformations/common_optimizations/compress_float_constants.cpp @@ -5,6 +5,7 @@ #include "transformations/common_optimizations/compress_float_constants.hpp" 
#include "itt.hpp" +#include "ngraph/runtime/reference/convert.hpp" #include "openvino/core/rt_info.hpp" #include "openvino/op/constant.hpp" #include "openvino/op/convert.hpp" @@ -16,7 +17,8 @@ namespace { template -std::shared_ptr change_constant_precision_to_fp16(std::shared_ptr& constant) { +std::shared_ptr change_constant_precision_to_fp16(std::shared_ptr& constant, + bool postponed = false) { using src_type = typename ov::element_type_traits::value_type; const auto* src_data = constant->get_data_ptr(); @@ -24,9 +26,10 @@ std::shared_ptr change_constant_precision_to_fp16(std::shared_ptr(ov::element::f16, constant->get_shape()); auto* dst_data = const_cast(reinterpret_cast(new_constant->get_data_ptr())); - if (dst_data == nullptr) + if (!dst_data || !size) return nullptr; + // slow implementation: is used when optimized ones are not available: f64 or for ARM (both for f64 and f32) int num_out_of_range = 0; for (size_t i = 0; i < size; ++i) { // if abs value is smaller than the smallest positive fp16, but not zero @@ -44,18 +47,24 @@ std::shared_ptr change_constant_precision_to_fp16(std::shared_ptr(num_out_of_range) / static_cast(size); + const float keep_threshold = 0.75f; + const float out_of_range_proportion = static_cast(num_out_of_range) / static_cast(size); if (out_of_range_proportion >= keep_threshold) { return nullptr; } - return new_constant; + if (postponed) { + // dispose just converted constant to avoid allocation too much memory + // it will be converted again while serialization + return constant; + } else { + return new_constant; + } } } // namespace -ov::pass::CompressFloatConstantsImpl::CompressFloatConstantsImpl() { +ov::pass::CompressFloatConstantsImpl::CompressFloatConstantsImpl(bool postponed) { MATCHER_SCOPE(CompressFloatConstantsImpl); auto const_pattern = pattern::wrap_type(); @@ -72,26 +81,68 @@ ov::pass::CompressFloatConstantsImpl::CompressFloatConstantsImpl() { auto c_type = const_node->get_element_type(); std::shared_ptr new_const; + 
+#if !defined(OPENVINO_ARCH_X86) && !defined(OPENVINO_ARCH_X86_64) if (c_type == ov::element::f32) { - new_const = change_constant_precision_to_fp16(const_node); + new_const = change_constant_precision_to_fp16(const_node, postponed); } else if (c_type == ov::element::f64) { - new_const = change_constant_precision_to_fp16(const_node); + new_const = change_constant_precision_to_fp16(const_node, postponed); + } + if (!new_const) // if out of range > threshold -> then new_const == nullptr + return false; +#else + if (c_type == ov::element::f32) { + auto size = shape_size(const_node->get_output_shape(0)); + if (size == 0) + return false; + auto num_out_of_range = + ngraph::runtime::reference::count_out_of_f16_range(const_node->get_data_ptr(), size); + + // if more than 75% of a FP32 constant do not fit into FP16 keep in FP32 + const float keep_threshold = 0.75f; + const float out_of_range_proportion = static_cast(num_out_of_range) / static_cast(size); + if (out_of_range_proportion >= keep_threshold) + return false; + + if (postponed) { + new_const = const_node; + } else { + const auto* src_data = const_node->get_data_ptr(); + auto compressed_const = + std::make_shared(ov::element::f16, const_node->get_shape()); + auto* dst_data = + const_cast(reinterpret_cast(compressed_const->get_data_ptr())); + OPENVINO_ASSERT(dst_data); + ngraph::runtime::reference::convert_from_f32_to_f16_with_clamp(src_data, dst_data, size); + new_const = compressed_const; + } + } else if (c_type == ov::element::f64) { + new_const = change_constant_precision_to_fp16(const_node, postponed); } else { return false; } +#endif // !defined(OPENVINO_ARCH_X86) && !defined(OPENVINO_ARCH_X86_64) if (!new_const) { return false; } + auto constant_target_inputs = const_node->get_output_target_inputs(0); auto convert = std::make_shared(new_const, const_node->get_element_type()); - new_const->set_friendly_name(const_node->get_friendly_name() + "_compressed"); 
convert->set_friendly_name(const_node->get_friendly_name()); + new_const->set_friendly_name(const_node->get_friendly_name() + "_compressed"); ov::copy_runtime_info(const_node, convert); ov::mark_as_decompression(convert); + if (postponed) { + postpone_fp16_compression(new_const->get_rt_info()); + postpone_fp16_compression(new_const->get_output_tensor(0).get_rt_info()); - ov::replace_node(const_node, convert); - + for (const auto& target_input : constant_target_inputs) { + target_input.replace_source_output(convert); + } + } else { + ov::replace_node(const_node, convert); + } return true; }; diff --git a/src/common/transformations/src/transformations/convert_precision.cpp b/src/common/transformations/src/transformations/convert_precision.cpp index 50f484ae0ae..e1cc8127b69 100644 --- a/src/common/transformations/src/transformations/convert_precision.cpp +++ b/src/common/transformations/src/transformations/convert_precision.cpp @@ -895,6 +895,26 @@ std::shared_ptr change_constant_precision(std::shared_ptr +std::shared_ptr change_constant_precision( + std::shared_ptr& constant) { + using src_type = typename element_type_traits::value_type; + using dst_type = typename element_type_traits::value_type; + + const auto* src_data = constant->get_data_ptr(); + const auto size = shape_size(constant->get_shape()); + + auto new_constant = std::make_shared(ov::element::Type_t::f16, constant->get_shape()); + new_constant->output(0).set_names(constant->output(0).get_names()); + auto* dst_data = const_cast(reinterpret_cast(new_constant->get_data_ptr())); + if (dst_data == nullptr) + OPENVINO_THROW("Can't get destination data pointer"); + + ngraph::runtime::reference::convert_from_f32_to_f16_with_clamp(src_data, dst_data, size); + + return new_constant; +} + template <> std::shared_ptr change_constant_precision( std::shared_ptr& constant) { diff --git a/src/common/transformations/src/transformations/rt_info/disable_fp16_compression.cpp 
b/src/common/transformations/src/transformations/rt_info/disable_fp16_compression.cpp index 95af78e9487..91384e0b39d 100644 --- a/src/common/transformations/src/transformations/rt_info/disable_fp16_compression.cpp +++ b/src/common/transformations/src/transformations/rt_info/disable_fp16_compression.cpp @@ -4,6 +4,10 @@ #include "transformations/rt_info/disable_fp16_compression.hpp" +namespace { +const std::string& postponed_fp16_compression_tag = "postponed_fp16_compression"; +} + void ov::disable_fp16_compression(const std::shared_ptr& node) { auto& rt_info = node->get_rt_info(); rt_info[DisableFP16Compression::get_type_info_static()] = DisableFP16Compression{}; @@ -18,3 +22,15 @@ bool ov::fp16_compression_is_disabled(const std::shared_ptr& node) { const auto& rt_info = node->get_rt_info(); return rt_info.count(DisableFP16Compression::get_type_info_static()); } + +void ov::postpone_fp16_compression(ov::RTMap& rt_info) { + rt_info[postponed_fp16_compression_tag] = true; +} + +bool ov::is_fp16_compression_postponed(const ov::RTMap& rt_info) { + return rt_info.count(postponed_fp16_compression_tag); +} + +void ov::do_not_postpone_fp16_compression(ov::RTMap& rt_info) { + rt_info.erase(postponed_fp16_compression_tag); +} diff --git a/src/core/reference/include/ngraph/runtime/reference/convert.hpp b/src/core/reference/include/ngraph/runtime/reference/convert.hpp index 0bab3471d8a..a05b34d50a0 100644 --- a/src/core/reference/include/ngraph/runtime/reference/convert.hpp +++ b/src/core/reference/include/ngraph/runtime/reference/convert.hpp @@ -115,6 +115,12 @@ void convert(const float16* arg, int8_t* out, size_t count); #endif // OPENVINO_ARCH_X86 || OPENVINO_ARCH_X86_64 +// Count how many f32 values are out of normal finite numbers range when converted to f16 +size_t count_out_of_f16_range(const float* arg, size_t count); + +// Convert values from f32 to f16 with clamping to f16 min/max when value is out of normal finite numbers range +void 
convert_from_f32_to_f16_with_clamp(const float* arg, float16* out, size_t count); + // overload to handle ngraph::boolean (it is stored as char) template typename std::enable_if::value>::type convert(const TI* arg, TO* out, size_t count) { diff --git a/src/core/reference/src/runtime/reference/convert.cpp b/src/core/reference/src/runtime/reference/convert.cpp index 3e43e779a36..f30753ed13f 100644 --- a/src/core/reference/src/runtime/reference/convert.cpp +++ b/src/core/reference/src/runtime/reference/convert.cpp @@ -5,17 +5,18 @@ #include "ngraph/runtime/reference/convert.hpp" #if defined(OPENVINO_ARCH_X86) || defined(OPENVINO_ARCH_X86_64) - # include "jit_generator.hpp" +#endif namespace ngraph { namespace runtime { namespace reference { +#if defined(OPENVINO_ARCH_X86) || defined(OPENVINO_ARCH_X86_64) namespace { -template +template void jit_convert_vec(jit::Generator&, const Xbyak::RegExp&, const Xbyak::RegExp&); -template +template void jit_convert_vec_prepare(jit::Generator&) {} template <> @@ -53,6 +54,37 @@ void jit_convert_vec(jit::Generator& gen, const Xbyak::RegExp& s gen.vmovdqu(gen.xword[dst], f16vec); } +template <> +void jit_convert_vec_prepare(jit::Generator& gen) { + auto upper_bound = gen.ymm5; + auto lower_bound = gen.ymm6; + auto addr = gen.r15; + + static const float f16_max = std::numeric_limits::max(); + static const float f16_min = std::numeric_limits::lowest(); + static const float upper_bounds[8] = {f16_max, f16_max, f16_max, f16_max, f16_max, f16_max, f16_max, f16_max}; + static const float lower_bounds[8] = {f16_min, f16_min, f16_min, f16_min, f16_min, f16_min, f16_min, f16_min}; + + gen.mov(addr, (size_t)upper_bounds); + gen.vmovdqu(upper_bound, gen.yword[addr]); + gen.mov(addr, (size_t)lower_bounds); + gen.vmovdqu(lower_bound, gen.yword[addr]); +} + +template <> +void jit_convert_vec(jit::Generator& gen, const Xbyak::RegExp& src, const Xbyak::RegExp& dst) { + auto f16vec = gen.xmm3; + auto f32vec = gen.ymm4; + auto upper_bound = gen.ymm5; 
+ auto lower_bound = gen.ymm6; + + gen.vmovups(f32vec, gen.yword[src]); + gen.vminps(f32vec, f32vec, upper_bound); + gen.vmaxps(f32vec, f32vec, lower_bound); + gen.vcvtps2ph(f16vec, f32vec, 0); + gen.vmovdqu(gen.xword[dst], f16vec); +} + template <> void jit_convert_vec_prepare(jit::Generator& gen) { auto order = gen.ymm1; @@ -175,13 +207,13 @@ public: typedef void (*fn_t)(const args_t*); - template + template static fn_t get() { if (is_x64() && mayiuse(avx) && mayiuse(avx2) && mayiuse(fp16)) { static const jit_convert_array::context_t context{{sizeof(src_t), &jit::Generator::copy}, {sizeof(dst_t), &jit::Generator::copy}, - jit_convert_vec, - jit_convert_vec_prepare}; + jit_convert_vec, + jit_convert_vec_prepare}; static jit_convert_array generator(context); @@ -191,9 +223,9 @@ public: } }; -template +template void convert_impl(const TI* arg, TO* out, size_t count) { - auto converter = jit_convert_array::get(); + auto converter = jit_convert_array::get(); if (converter) { jit_convert_array::args_t args = {arg, out, count}; @@ -204,6 +236,232 @@ void convert_impl(const TI* arg, TO* out, size_t count) { } } } + +template <> +void convert_impl(const float* arg, float16* out, size_t count) { + auto converter = jit_convert_array::get(); + + if (converter) { + jit_convert_array::args_t args = {arg, out, count}; + converter(&args); + } else { + for (size_t i = 0; i < count; ++i) { + if (arg[i] > std::numeric_limits::max()) { + out[i] = std::numeric_limits::max(); + } else if (arg[i] < std::numeric_limits::lowest()) { + out[i] = std::numeric_limits::lowest(); + } else { + out[i] = static_cast(arg[i]); + } + } + } +} + +template +void jit_count_out_of_range_vec_prepare(jit::Generator&) {} + +template +void jit_count_out_of_range_vec(jit::Generator&, const Xbyak::RegExp&); + +template +void jit_count_out_of_range_vec_finalize(jit::Generator&, const Xbyak::RegExp&) {} + +template <> +void jit_count_out_of_range_vec_prepare(jit::Generator& gen) { + auto accum_vec = gen.ymm4; + 
auto f16_max_pos_vec = gen.ymm5; + auto f16_max_neg_vec = gen.ymm6; + auto f16_min_pos_vec = gen.ymm7; + auto f16_min_neg_vec = gen.ymm8; + auto f16_zero_vec = gen.ymm9; + auto i32_ones_vec = gen.ymm10; + auto addr = gen.r15; + + static const float f16_max_pos = std::numeric_limits::max(); + static const float f16_max_neg = std::numeric_limits::lowest(); + static const float f16_min_pos = ov::float16::from_bits(0x0001); + static const float f16_min_neg = -ov::float16::from_bits(0x0001); + static const int32_t i32_one = 1; + + static const float max_pos_bounds[8] = + {f16_max_pos, f16_max_pos, f16_max_pos, f16_max_pos, f16_max_pos, f16_max_pos, f16_max_pos, f16_max_pos}; + static const float max_neg_bounds[8] = + {f16_max_neg, f16_max_neg, f16_max_neg, f16_max_neg, f16_max_neg, f16_max_neg, f16_max_neg, f16_max_neg}; + static const float min_pos_bounds[8] = + {f16_min_pos, f16_min_pos, f16_min_pos, f16_min_pos, f16_min_pos, f16_min_pos, f16_min_pos, f16_min_pos}; + static const float min_neg_bounds[8] = + {f16_min_neg, f16_min_neg, f16_min_neg, f16_min_neg, f16_min_neg, f16_min_neg, f16_min_neg, f16_min_neg}; + static const int32_t i32_ones[8] = {i32_one, i32_one, i32_one, i32_one, i32_one, i32_one, i32_one, i32_one}; + + auto load_vec = [&gen, &addr](Xbyak::Ymm vec, size_t ptr) { + gen.mov(addr, ptr); + gen.vmovdqu(vec, gen.yword[addr]); + }; + + load_vec(f16_max_pos_vec, (size_t)max_pos_bounds); + load_vec(f16_max_neg_vec, (size_t)max_neg_bounds); + load_vec(f16_min_pos_vec, (size_t)min_pos_bounds); + load_vec(f16_min_neg_vec, (size_t)min_neg_bounds); + load_vec(i32_ones_vec, (size_t)i32_ones); + gen.vxorps(f16_zero_vec, f16_zero_vec, f16_zero_vec); + gen.vxorps(accum_vec, accum_vec, accum_vec); +} + +template <> +void jit_count_out_of_range_vec(jit::Generator& gen, const Xbyak::RegExp& data) { + auto data_vec = gen.ymm1; + auto mask_vec = gen.ymm2; + auto mask_vec_xmm = gen.xmm2; + auto tmp_vec = gen.ymm3; + auto accum_vec = gen.ymm4; + auto f16_max_pos_vec = 
gen.ymm5; + auto f16_max_neg_vec = gen.ymm6; + auto f16_min_pos_vec = gen.ymm7; + auto f16_min_neg_vec = gen.ymm8; + auto f16_zero_vec = gen.ymm9; + auto i32_ones_vec = gen.ymm10; + + const unsigned char _cmp_lt_os = 1; + const unsigned char _cmp_neq_uq = 4; + const unsigned char _cmp_gt_os = 6; + + // std::abs(data) < ov::float16::from_bits(0x0001) + gen.vmovups(data_vec, gen.yword[data]); + gen.vcmpps(tmp_vec, data_vec, f16_min_pos_vec, _cmp_lt_os); + gen.vcmpps(mask_vec, data_vec, f16_min_neg_vec, _cmp_gt_os); + gen.vandps(mask_vec, mask_vec, tmp_vec); + + // data != 0.0f + gen.vcmpps(tmp_vec, data_vec, f16_zero_vec, _cmp_neq_uq); + gen.vandps(mask_vec, mask_vec, tmp_vec); + + // data > std::numeric_limits::max() + gen.vcmpps(tmp_vec, data_vec, f16_max_pos_vec, _cmp_gt_os); + gen.vorps(mask_vec, mask_vec, tmp_vec); + + // data < std::numeric_limits::lowest() + gen.vcmpps(tmp_vec, data_vec, f16_max_neg_vec, _cmp_lt_os); + gen.vorps(mask_vec, mask_vec, tmp_vec); + + // addition to i64 accumulator + gen.vandps(mask_vec, mask_vec, i32_ones_vec); + gen.vphaddd(mask_vec, mask_vec, mask_vec); + gen.vpermq(mask_vec, mask_vec, 0x08); + gen.vpmovsxdq(mask_vec, mask_vec_xmm); + gen.vpaddq(accum_vec, accum_vec, mask_vec); +} + +template <> +void jit_count_out_of_range_vec_finalize(jit::Generator& gen, const Xbyak::RegExp& dst) { + auto tmp_vec_xmm0 = gen.xmm2; // reuse mask_vec + auto tmp_vec_xmm1 = gen.xmm3; // reuse tmp_vec + auto accum_vec_ymm = gen.ymm4; + auto accum_vec_xmm = gen.xmm4; + + // horizontal sum of four i64 values + gen.vextractf128(tmp_vec_xmm0, accum_vec_ymm, 0); + gen.vextractf128(tmp_vec_xmm1, accum_vec_ymm, 1); + gen.vpaddq(accum_vec_xmm, tmp_vec_xmm0, tmp_vec_xmm1); + gen.vpermilpd(tmp_vec_xmm0, accum_vec_xmm, 0x01); + gen.vpaddq(accum_vec_xmm, accum_vec_xmm, tmp_vec_xmm0); + gen.vmovq(gen.qword[dst], accum_vec_xmm); +} + +class jit_count_out_of_range : public jit::Generator { + typedef struct context { + struct { + size_t type_size; + void 
(jit::Generator::*copy)(const Xbyak::Reg64& dst, const Xbyak::Reg64& src, const Xbyak::Reg64& size); + } data; + void (*prepare)(jit::Generator&); + void (*count_out_of_range)(jit::Generator&, const Xbyak::RegExp&); + void (*finalize)(jit::Generator&, const Xbyak::RegExp& dst); + } context_t; + + jit_count_out_of_range(const context_t& ctx) { + using namespace Xbyak; + + const uint32_t vlen = 8u; + + auto reg_src = rax; + auto reg_dst = rbx; + auto reg_sz = rdx; + + Label tail, exit; + + preamble(); + + ctx.prepare(*this); + + mov(reg_src, ptr[param + offsetof(args_t, src)]); + mov(reg_dst, ptr[param + offsetof(args_t, dst)]); + mov(reg_sz, ptr[param + offsetof(args_t, count)]); + + xor_(rsi, rsi); + mov(r8, reg_sz); + shr(r8, 3); + + foreach (rsi, 1, r8, [&, this](const Xbyak::Reg64& idx) { + ctx.count_out_of_range(*this, reg_src); + add(reg_src, static_cast(ctx.data.type_size * vlen)); + }) + ; + + L(tail); + + shl(rsi, 3); + sub(reg_sz, rsi); + test(reg_sz, reg_sz); + jz(exit); + + // allocate array for 8 floats on stack + sub(rsp, vlen * sizeof(float)); + mov(r8, rsp); + + auto tmp_vec = ymm2; // reuse mask_vec + vpxor(tmp_vec, tmp_vec, tmp_vec); + vmovups(yword[r8], tmp_vec); + + // Tail conversion + (this->*ctx.data.copy)(r8, reg_src, reg_sz); + ctx.count_out_of_range(*this, r8); + + // Free the array on stack + add(rsp, vlen * sizeof(float)); + + L(exit); + + ctx.finalize(*this, reg_dst); + + postamble(); + } + +public: + typedef struct { + const void* src; + void* dst; + const size_t count; + } args_t; + + typedef void (*fn_t)(const args_t*); + + template + static fn_t get() { + if (is_x64() && mayiuse(avx2)) { + static const jit_count_out_of_range::context_t context{ + {sizeof(data_t), &jit::Generator::copy}, + jit_count_out_of_range_vec_prepare, + jit_count_out_of_range_vec, + jit_count_out_of_range_vec_finalize}; + + static jit_count_out_of_range generator(context); + + return (fn_t)generator.getCode(); + } + return nullptr; + } +}; + } // namespace 
template <> @@ -231,8 +489,49 @@ void convert(const float16* arg, int8_t* out, size_t count) { convert_impl(arg, out, count); } +#endif // OPENVINO_ARCH_X86 || OPENVINO_ARCH_X86_64 + +void convert_from_f32_to_f16_with_clamp(const float* arg, float16* out, size_t count) { +#if defined(OPENVINO_ARCH_X86) || defined(OPENVINO_ARCH_X86_64) + convert_impl(arg, out, count); +#else + // FIXME: duplicate and stub for ARM, provide more optimized solution + for (size_t i = 0; i < count; ++i) { + if (arg[i] > std::numeric_limits::max()) { + out[i] = std::numeric_limits::max(); + } else if (arg[i] < std::numeric_limits::lowest()) { + out[i] = std::numeric_limits::lowest(); + } else { + out[i] = static_cast(arg[i]); + } + } +#endif // defined(OPENVINO_ARCH_X86) || defined(OPENVINO_ARCH_X86_64) +} + +size_t count_out_of_f16_range(const float* arg, size_t count) { + size_t num_out_of_range = 0; + +#if defined(OPENVINO_ARCH_X86) || defined(OPENVINO_ARCH_X86_64) + auto converter = jit_count_out_of_range::get(); + if (converter) { + jit_count_out_of_range::args_t args = {arg, &num_out_of_range, count}; + converter(&args); + return num_out_of_range; + } +#endif + for (size_t i = 0; i < count; ++i) { + // if abs value is smaller than the smallest positive fp16, but not zero + if (std::abs(arg[i]) < ov::float16::from_bits(0x0001) && arg[i] != 0.0f) { + num_out_of_range++; + } else if (arg[i] > std::numeric_limits::max()) { + num_out_of_range++; + } else if (arg[i] < std::numeric_limits::lowest()) { + num_out_of_range++; + } + } + return num_out_of_range; +} + } // namespace reference } // namespace runtime } // namespace ngraph - -#endif // OPENVINO_ARCH_X86 || OPENVINO_ARCH_X86_64 diff --git a/src/core/src/graph_util.cpp b/src/core/src/graph_util.cpp index af5138ff6c7..93457bd1708 100644 --- a/src/core/src/graph_util.cpp +++ b/src/core/src/graph_util.cpp @@ -356,7 +356,7 @@ void save_model(const std::shared_ptr& m, const std::string& ou ov::pass::Manager manager; if (compress_to_fp16) 
{ manager.register_pass(); - manager.register_pass(); + manager.register_pass(/*postponed=*/true); } manager.register_pass(); manager.register_pass(output_model, ""); diff --git a/src/core/src/pass/serialize.cpp b/src/core/src/pass/serialize.cpp index d806f771f3f..eb5b3ca277d 100644 --- a/src/core/src/pass/serialize.cpp +++ b/src/core/src/pass/serialize.cpp @@ -12,16 +12,19 @@ #include #include +#include "ngraph/runtime/reference/convert.hpp" #include "openvino/core/coordinate_diff.hpp" #include "openvino/core/except.hpp" #include "openvino/core/meta_data.hpp" #include "openvino/core/model.hpp" +#include "openvino/core/type/float16.hpp" #include "openvino/op/util/framework_node.hpp" #include "openvino/opsets/opset1.hpp" #include "openvino/pass/constant_folding.hpp" #include "openvino/util/file_util.hpp" #include "pugixml.hpp" #include "transformations/hash.hpp" +#include "transformations/rt_info/disable_fp16_compression.hpp" #include "transformations/rt_info/primitives_priority_attribute.hpp" OPENVINO_SUPPRESS_DEPRECATED_START @@ -89,13 +92,28 @@ public: m_enable_compression(enable_compression), m_blob_offset(bin_data.tellp()) {} - FilePosition write(const char* ptr, size_t size) { + FilePosition write(const char* ptr, + size_t size, + size_t* new_size, + bool compress_to_fp16 = false, + ov::element::Type src_type = ov::element::dynamic) { const FilePosition write_pos = m_binary_output.tellp(); const auto offset = write_pos - m_blob_offset; - if (!m_enable_compression) { - m_binary_output.write(ptr, size); + *new_size = size; + + if (!m_enable_compression || compress_to_fp16) { + write_with_optional_fp16_compression(ptr, size, new_size, compress_to_fp16, src_type); return offset; } + // TODO: Find a way to keep both types of compression (m_enable_compression and compress_to_fp16) + // simultaneously. Disabled usual compression by m_enable_compression for those constants that are requested to + // be compressed by compress_to_fp16 for now. 
To implement both compression types applied simultaneously + // we need to save element_type for each constant in the cache together with the compression status + // that implies a wider impact and requires a more accurate implementation of cache handling. + // When FP16 compression is turned on together with the usual compression enabled by m_enable_compression, we + // can avoid comparing FP32 weights, but it would require comparing with data from a file, because on-the-fly + // converted FP16 constants are not kept in memory. + // This hash is weak (but efficient) and must be replace with some other // more stable hash algorithm. For example current hash algorithms gives // the same hash for {2, 2} and {0, 128} arrays. So we have to compare @@ -107,13 +125,64 @@ public: return found->second.first; } - m_binary_output.write(ptr, size); + write_with_optional_fp16_compression(ptr, size, new_size, compress_to_fp16, src_type); m_hash_to_file_positions.insert({hash, {offset, static_cast(ptr)}}); return offset; } private: + void write_with_optional_fp16_compression(const char* ptr, + size_t size, + size_t* new_size, + bool compress_to_fp16 = false, + ov::element::Type src_type = ov::element::dynamic) { + if (!compress_to_fp16) { + m_binary_output.write(ptr, size); + } else { + OPENVINO_ASSERT(size % src_type.size() == 0); + auto fp16_buffer = compress_data_to_fp16(ptr, size, src_type, new_size); + m_binary_output.write(fp16_buffer.get(), *new_size); + // Compressed data is disposed + } + } + + std::unique_ptr compress_data_to_fp16(const char* ptr, + size_t size, + ov::element::Type src_type, + size_t* compressed_size) { + auto num_src_elements = size / src_type.size(); + *compressed_size = num_src_elements * ov::element::f16.size(); + if (src_type == ov::element::f32) { + auto new_ptr = std::unique_ptr(new char[*compressed_size]); + auto dst_data = reinterpret_cast(new_ptr.get()); + auto src_data = reinterpret_cast(ptr); + 
ngraph::runtime::reference::convert_from_f32_to_f16_with_clamp(src_data, dst_data, num_src_elements); + return new_ptr; + } else if (src_type == ov::element::f64) { + auto new_ptr = std::unique_ptr(new char[*compressed_size]); + auto dst_data = reinterpret_cast(new_ptr.get()); + auto src_data = reinterpret_cast(ptr); + + // Reference implementation for fp64 to fp16 conversion + for (size_t i = 0; i < num_src_elements; ++i) { + // if abs value is smaller than the smallest positive fp16, but not zero + if (std::abs(src_data[i]) < ov::float16::from_bits(0x0001) && src_data[i] != 0.0f) { + dst_data[i] = 0; + } else if (src_data[i] > std::numeric_limits::max()) { + dst_data[i] = std::numeric_limits::max(); + } else if (src_data[i] < std::numeric_limits::lowest()) { + dst_data[i] = std::numeric_limits::lowest(); + } else { + dst_data[i] = static_cast(src_data[i]); + } + } + return new_ptr; + } else { + OPENVINO_THROW("[ INTERNAL ERROR ] Not supported source type for weights compression: ", src_type); + } + } + ConstWritePositions m_hash_to_file_positions; std::ostream& m_binary_output; bool m_enable_compression; @@ -237,6 +306,8 @@ class XmlSerializer : public ov::AttributeVisitor { ConstantWriter& m_constant_write_handler; int64_t m_version; bool m_deterministic; + bool m_compress_to_fp16; + ov::element::Type m_output_element_type; template std::string create_atribute_list(ov::ValueAccessor>& adapter) { @@ -354,13 +425,17 @@ public: const std::map& custom_opsets, ConstantWriter& constant_write_handler, int64_t version, - bool deterministic = false) + bool deterministic = false, + bool compress_to_fp16 = false, + ov::element::Type output_element_type = ov::element::dynamic) : m_xml_node(data), m_node_type_name(node_type_name), m_custom_opsets(custom_opsets), m_constant_write_handler(constant_write_handler), m_version(version), - m_deterministic(deterministic) {} + m_deterministic(deterministic), + m_compress_to_fp16(compress_to_fp16), + 
m_output_element_type(output_element_type) {} void on_adapter(const std::string& name, ov::ValueAccessor& adapter) override { using BodyTargetNames = std::tuple>; @@ -444,10 +519,15 @@ public: ov::as_type>>(&adapter)) { if (name == "value" && translate_type_name(m_node_type_name) == "Const") { const int64_t size = a->get()->size(); - int64_t offset = m_constant_write_handler.write(static_cast(a->get()->get_ptr()), size); + size_t new_size; + int64_t offset = m_constant_write_handler.write(static_cast(a->get()->get_ptr()), + size, + &new_size, + m_compress_to_fp16, + m_output_element_type); m_xml_node.append_attribute("offset").set_value(static_cast(offset)); - m_xml_node.append_attribute("size").set_value(static_cast(size)); + m_xml_node.append_attribute("size").set_value(static_cast(new_size)); } } else if (const auto& a = ov::as_type>(&adapter)) { const auto& attrs = a->get(); @@ -496,7 +576,13 @@ public: m_xml_node.append_attribute(name.c_str()).set_value(adapter.get()); } void on_adapter(const std::string& name, ov::ValueAccessor& adapter) override { - m_xml_node.append_attribute(name.c_str()).set_value(adapter.get().c_str()); + std::string value; + if (m_compress_to_fp16 && name == "element_type") { + value = ov::as_string(static_cast(ov::element::f16)); + } else { + value = adapter.get(); + } + m_xml_node.append_attribute(name.c_str()).set_value(value.c_str()); } void on_adapter(const std::string& name, ov::ValueAccessor& adapter) override { m_xml_node.append_attribute(name.c_str()).set_value(static_cast(adapter.get())); @@ -913,7 +999,12 @@ void ngfunction_2_ir(pugi::xml_node& netXml, pugi::xml_node port = input.append_child("port"); port.append_attribute("id").set_value(port_id++); - port.append_attribute("precision").set_value(get_precision_name(i.get_element_type()).c_str()); + + auto rt_info = i.get_tensor().get_rt_info(); + auto port_element_type = + is_fp16_compression_postponed(rt_info) ? 
ov::element::f16 : i.get_element_type(); + + port.append_attribute("precision").set_value(get_precision_name(port_element_type).c_str()); for (auto d : i.get_partial_shape()) { pugi::xml_node dim = port.append_child("dim"); if (d.is_dynamic()) { @@ -937,7 +1028,12 @@ void ngfunction_2_ir(pugi::xml_node& netXml, for (auto& o : node->outputs()) { pugi::xml_node port = output.append_child("port"); port.append_attribute("id").set_value(port_id++); - port.append_attribute("precision").set_value(get_precision_name(o.get_element_type()).c_str()); + + auto rt_info = o.get_tensor().get_rt_info(); + auto port_element_type = + is_fp16_compression_postponed(rt_info) ? ov::element::f16 : o.get_element_type(); + + port.append_attribute("precision").set_value(get_precision_name(port_element_type).c_str()); // Sort tensor names const auto& tensor_names = o.get_tensor().get_names(); @@ -973,6 +1069,12 @@ void ngfunction_2_ir(pugi::xml_node& netXml, // fill general attributes { + bool compress_to_fp16 = false; + ov::element::Type output_element_type = ov::element::dynamic; + if (is_fp16_compression_postponed(node->get_rt_info())) { + compress_to_fp16 = true; + output_element_type = node->get_output_element_type(0); + } // Backward compatibility: clear padding values for nodes with auto_pad PaddingsFixer fixed_node(node); XmlSerializer visitor(data, @@ -980,7 +1082,9 @@ void ngfunction_2_ir(pugi::xml_node& netXml, custom_opsets, constant_node_write_handler, version, - deterministic); + deterministic, + compress_to_fp16, + output_element_type); OPENVINO_ASSERT(fixed_node.get_node()->visit_attributes(visitor), "Visitor API is not supported in ", node); } rt_info::XmlSerializer{data}.serialize(node->get_rt_info()); diff --git a/tools/ovc/openvino/tools/ovc/main.py b/tools/ovc/openvino/tools/ovc/main.py index 45f26c210c0..6cdf0cc9f6a 100644 --- a/tools/ovc/openvino/tools/ovc/main.py +++ b/tools/ovc/openvino/tools/ovc/main.py @@ -13,7 +13,7 @@ from openvino.tools.ovc.convert_impl import 
_convert from openvino.tools.ovc.cli_parser import get_model_name_from_args # pylint: disable=no-name-in-module,import-error -from openvino.runtime import serialize +from openvino.runtime import save_model def main(): @@ -24,12 +24,8 @@ def main(): model_path = get_model_name_from_args(argv) - # TODO: replace compress_model + serialize with save_model - if argv.compress_to_fp16: - from openvino.tools.ovc.moc_frontend.offline_transformations import compress_model - compress_model(ngraph_function) - - serialize(ngraph_function, model_path.encode('utf-8'), model_path.replace('.xml', '.bin').encode('utf-8')) + compress_to_fp16 = 'compress_to_fp16' in argv and argv.compress_to_fp16 + save_model(ngraph_function, model_path.encode('utf-8'), compress_to_fp16) print('[ SUCCESS ] XML file: {}'.format(model_path)) print('[ SUCCESS ] BIN file: {}'.format(model_path.replace('.xml', '.bin')))