Efficient FP32 -> FP16 conversion for convert_precision, save_model, ovc and mo (#18988)

* WIP Postpone fp16 in CompressFloatConstantsImpl

* Apply suggestions from code review

Co-authored-by: Ilya Lavrenov <ilya.lavrenov@intel.com>

* WIP: Compression to FP16 in Serialize

* Prepared for efficient fp32 to fp16 conversion

* Update src/core/reference/src/runtime/reference/convert.cpp

* Called real slow reference implementations in the place where the optimized versions are supposed to be implemented

* Code style

* Fixed 0 values in the fast f64 to f16 compression

* Optimized convert_from_f32_to_f16_with_clamp

* Added optimized f32->f16 instance of change_constant_precision

* compression transformation Python test

* use tmp dir, minor corrections

* Update src/bindings/python/tests/test_transformations/test_compression.py

* Update src/bindings/python/tests/test_transformations/test_compression.py

* style fix

* define rt_info for postponed_fp16_compression

* remove redundant class

* fix temp dir for Win in test_compression.py

* update definitions in convert.hpp

* Update implementation in convert.cpp

* Update serialize.cpp

* Update compress_float_constants.cpp

* added macros for ARM/non_x86 in convert.cpp

* fix macros in convert.cpp

* change fixme placement in serialize.cpp

* style_fix

* Update src/core/reference/src/runtime/reference/convert.cpp

* style_fix

* Optimized count_out_of_f16_range

* Code style

* Revert unused

* Update src/core/src/pass/serialize.cpp

Co-authored-by: Ilya Lavrenov <ilya.lavrenov@intel.com>

* Update src/core/reference/src/runtime/reference/convert.cpp

Co-authored-by: Ilya Lavrenov <ilya.lavrenov@intel.com>

* use optimized convert_from_f32_to_f16_with_clamp for non postponed

* minor corrections

* Update src/common/transformations/src/transformations/common_optimizations/compress_float_constants.cpp

* Update compress_float_constants.cpp

* Switched mo and ovc to save_model instead of serialize to leverage performance improvements in fp32->fp16

* Applied minor code improvements to address review feedback

* Minor changes in code

* Update tools/ovc/openvino/tools/ovc/main.py

* Apply suggestions from code review

* Fixed failed test in case when both usual xml compression and fp16 compression are applied simultaneously (disabled for now)

* Added description for CompressFloatConstantImpl postponed parameter

* Description of postponed parameter for CompressFloatConstants

* Reverted switching to save_model in mo as the compression can be applied not only via CLI and old code should be kept for Python path (not applicable for ovc)

* Removed remaining committed test artefacts and reverted remaining changes in mo

---------

Co-authored-by: Ilya Lavrenov <ilya.lavrenov@intel.com>
Co-authored-by: dmitrygo <dmitry.gorokhov@intel.com>
Co-authored-by: Vladimir Paramuzov <vladimir.paramuzov@intel.com>
Co-authored-by: Pavel Esir <pavel.esir@intel.com>
Co-authored-by: Pavel Esir <pavel.esir@gmail.com>
This commit is contained in:
Sergey Lyalin
2023-08-17 15:08:33 +04:00
committed by GitHub
parent 2394732055
commit f0300a36eb
11 changed files with 666 additions and 44 deletions

View File

@@ -0,0 +1,116 @@
# -*- coding: utf-8 -*-
# Copyright (C) 2018-2023 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
import tempfile
from typing import List
import numpy as np
from openvino.runtime.op import Parameter, Constant
from openvino.runtime.opset12 import add, multiply
import openvino as ov
def make_constant(values, transposed):
    """Create an f32 Constant holding *values* as a single row (transposed=True)
    or a single column (transposed=False)."""
    if transposed:
        shape = [1, len(values)]
    else:
        shape = [len(values), 1]
    return Constant(ov.Type.f32, ov.Shape(shape), values)
# keep fp16 denormals, flush fp32 denormals to zero
# Values chosen around fp16 boundaries: +-65504 is the fp16 finite max,
# 0.000000059604645 is the smallest positive fp16 denormal.
in_range = [-65504.0, -2.0, 1.00097656, -1.0, -0.99951172, -0.00006103515625, -0.000000059604645, 0.0,
            0.000000059604645, 0.99951172, 0.00006103515625, 1.0, 1.00097656, 2.0, 65504]
# Values that do not fit into the fp16 finite range (too large or denormal-flushed).
out_of_range = [float("-inf"), -65505.0, -1e-10, -1e-39, 1e-39, 1e-10, 65505.0, float("inf")]
# Expected result of clamping out_of_range to fp16: overflow clamps to +-65504,
# tiny values flush to zero.
converted_out_of_range = [-65504.0, -65504.0, 0, 0, 0, 0, 65504.0, 65504.0]
# test inputs
# Mostly-representable input (< 25% out of range) -> should be compressed to fp16.
more_in_range = out_of_range + 10 * in_range
# Mostly-unrepresentable input (>= 75% out of range) -> should stay in fp32.
more_out_of_range = in_range + 10 * out_of_range
# reference after conversion more_in_range to fp16
converted_more_in_range = converted_out_of_range + 10 * in_range
def make_model(add_consts, mul_consts):
    """Build a small model: Parameter -> Add(const) -> Multiply(const)."""
    param = Parameter(ov.Type.f32, ov.PartialShape([-1]))
    add_node = add(param, make_constant(add_consts, False))
    mul_node = multiply(add_node, make_constant(mul_consts, True))
    return ov.Model([mul_node], [param])
def get_constants(model) -> List[Constant]:
    """Round-trip *model* through save_model (fp16 compression enabled) and
    return the constants feeding input 1 of the Add and Multiply ops, in
    that order.

    A constant reached through a Convert op means the branch was compressed
    to fp16; a constant attached directly means it kept its original type.
    A slot is None when the branch has neither pattern.
    """
    from pathlib import Path
    model_name = Path(tempfile.gettempdir()) / "f32_partially_compressed.xml"
    ov.save_model(model, model_name)
    core = ov.Core()
    restored_model = core.read_model(model_name)
    op_ind_map = {"Add": 0, "Multiply": 1}
    # list sentinels flag ops that were never visited at all
    constants_list = [[] for _ in op_ind_map]
    for op in restored_model.get_ordered_ops():
        op_type = op.get_type_info().name
        if op_type not in op_ind_map:
            continue
        # Reset for every matched op.  Without this, the first unmatched
        # branch raises NameError and later ones silently reuse the value
        # found for a previous op.
        const_node = None
        in_node = op.input_value(1).get_node()
        if in_node.get_type_info().name == "Convert":
            producer = in_node.input_value(0).get_node()
            if producer.get_type_info().name == "Constant":
                const_node = producer
        elif in_node.get_type_info().name == "Constant":
            const_node = in_node
        constants_list[op_ind_map[op_type]] = const_node
    for node in constants_list:
        # a leftover list sentinel means an expected op was missing entirely
        assert not isinstance(node, list)
    # sanity check that model is compilable
    ov.compile_model(restored_model)
    return constants_list
def test_compression_1():
    """Mixed model: the Add branch compresses to fp16, the Multiply branch stays fp32."""
    const_fp16, const_fp32 = get_constants(make_model(more_in_range, more_out_of_range))
    assert const_fp32 is not None, "There is no Constant op on FP32 branch"
    assert const_fp16 is not None, "There is no compressed Constant + Convert op on FP16 branch"
    # fp32 branch: untouched values, original element type
    assert const_fp32.get_output_element_type(0) == ov.Type.f32
    expected_fp32 = np.array(more_out_of_range, dtype=np.float32)
    assert np.all(expected_fp32 == const_fp32.get_vector())
    # fp16 branch: clamped/flushed values, f16 element type
    assert const_fp16.get_output_element_type(0) == ov.Type.f16
    expected_fp16 = np.array(converted_more_in_range, dtype=np.float32)
    msg = f"Difference: {expected_fp16 - const_fp16.get_vector()}"
    assert np.all(expected_fp16 == const_fp16.get_vector()), msg
def test_compression_2():
    """Both branches fit into fp16: both constants must be compressed and clipped."""
    model = make_model(more_in_range, more_in_range)
    first_const, second_const = get_constants(model)
    for const in (first_const, second_const):
        assert const is not None, "There is no Constant op on FP16 branch"
        assert const.get_output_element_type(0) == ov.Type.f16, "Const element type is not f16"
    # reference: clip to the fp16 finite range, then cast
    f16_info = np.finfo(np.float16)
    expected = np.clip(more_in_range, f16_info.min, f16_info.max).astype(np.float16)
    assert np.all(expected == first_const.get_vector())
    assert np.all(expected == second_const.get_vector())
def test_no_compression():
    """Both branches are mostly out of fp16 range: no compression must happen."""
    model = make_model(more_out_of_range, more_out_of_range)
    first_const, second_const = get_constants(model)
    for const in (first_const, second_const):
        assert const is not None, "There is no Constant op on FP32 branch"
        assert const.get_output_element_type(0) == ov.Type.f32, "Const element type is not f32"
    expected = np.array(more_out_of_range, dtype=np.float32)
    assert np.all(expected == first_const.get_vector())
    assert np.all(expected == second_const.get_vector())

View File

@@ -24,7 +24,13 @@ class TRANSFORMATIONS_API CompressFloatConstants;
class ov::pass::CompressFloatConstantsImpl : public ov::pass::MatcherPass {
public:
OPENVINO_RTTI("CompressFloatConstantsImpl", "0");
CompressFloatConstantsImpl();
/// @brief Transformation constructor
/// @param postponed If true then the transformation won't compress the constants
/// keeping them in the original type but still will insert Converts. This is
/// a special mode of operation that requires another transformation to
/// apply a real compression on constants. Constants eligible for
/// postponed compression are marked with a special rt_info tag.
CompressFloatConstantsImpl(bool postponed = false);
};
/**
@@ -44,8 +50,10 @@ public:
class ov::pass::CompressFloatConstants : public ov::pass::GraphRewrite {
public:
OPENVINO_RTTI("CompressFloatConstants", "0");
CompressFloatConstants() {
add_matcher<ov::pass::CompressFloatConstantsImpl>();
/// @brief Transformation constructor
/// @param postponed Postponed compression, see ov::pass::CompressFloatConstantsImpl for details.
CompressFloatConstants(bool postponed = false) {
add_matcher<ov::pass::CompressFloatConstantsImpl>(postponed);
add_matcher<ov::pass::AddOldApiMapToParameters>();
}
};

View File

@@ -16,6 +16,12 @@ TRANSFORMATIONS_API void enable_fp16_compression(const std::shared_ptr<Node>& no
TRANSFORMATIONS_API bool fp16_compression_is_disabled(const std::shared_ptr<const Node>& node);
TRANSFORMATIONS_API void postpone_fp16_compression(RTMap& rt_info);
TRANSFORMATIONS_API bool is_fp16_compression_postponed(const RTMap& rt_info);
TRANSFORMATIONS_API void do_not_postpone_fp16_compression(RTMap& rt_info);
/**
* @ingroup ie_runtime_attr_api
* @brief DisableFP16Compression class represents runtime info attribute that marks operation

View File

@@ -5,6 +5,7 @@
#include "transformations/common_optimizations/compress_float_constants.hpp"
#include "itt.hpp"
#include "ngraph/runtime/reference/convert.hpp"
#include "openvino/core/rt_info.hpp"
#include "openvino/op/constant.hpp"
#include "openvino/op/convert.hpp"
@@ -16,7 +17,8 @@
namespace {
template <ov::element::Type_t PREC_FROM>
std::shared_ptr<ov::Node> change_constant_precision_to_fp16(std::shared_ptr<ov::op::v0::Constant>& constant) {
std::shared_ptr<ov::Node> change_constant_precision_to_fp16(std::shared_ptr<ov::op::v0::Constant>& constant,
bool postponed = false) {
using src_type = typename ov::element_type_traits<PREC_FROM>::value_type;
const auto* src_data = constant->get_data_ptr<src_type>();
@@ -24,9 +26,10 @@ std::shared_ptr<ov::Node> change_constant_precision_to_fp16(std::shared_ptr<ov::
auto new_constant = std::make_shared<ov::op::v0::Constant>(ov::element::f16, constant->get_shape());
auto* dst_data = const_cast<ov::float16*>(reinterpret_cast<const ov::float16*>(new_constant->get_data_ptr()));
if (dst_data == nullptr)
if (!dst_data || !size)
return nullptr;
// slow implementation: is used when optimized ones are not available: f64 or for ARM (both for f64 and f32)
int num_out_of_range = 0;
for (size_t i = 0; i < size; ++i) {
// if abs value is smaller than the smallest positive fp16, but not zero
@@ -44,18 +47,24 @@ std::shared_ptr<ov::Node> change_constant_precision_to_fp16(std::shared_ptr<ov::
}
// if more than 75% of a FP32 constant do not fit into FP16 keep in FP32
float keep_threshold = 0.75f;
float out_of_range_proportion = static_cast<float>(num_out_of_range) / static_cast<float>(size);
const float keep_threshold = 0.75f;
const float out_of_range_proportion = static_cast<float>(num_out_of_range) / static_cast<float>(size);
if (out_of_range_proportion >= keep_threshold) {
return nullptr;
}
return new_constant;
if (postponed) {
// dispose just converted constant to avoid allocation too much memory
// it will be converted again while serialization
return constant;
} else {
return new_constant;
}
}
} // namespace
ov::pass::CompressFloatConstantsImpl::CompressFloatConstantsImpl() {
ov::pass::CompressFloatConstantsImpl::CompressFloatConstantsImpl(bool postponed) {
MATCHER_SCOPE(CompressFloatConstantsImpl);
auto const_pattern = pattern::wrap_type<ov::op::v0::Constant>();
@@ -72,26 +81,68 @@ ov::pass::CompressFloatConstantsImpl::CompressFloatConstantsImpl() {
auto c_type = const_node->get_element_type();
std::shared_ptr<ov::Node> new_const;
#if !defined(OPENVINO_ARCH_X86) && !defined(OPENVINO_ARCH_X86_64)
if (c_type == ov::element::f32) {
new_const = change_constant_precision_to_fp16<ov::element::Type_t::f32>(const_node);
new_const = change_constant_precision_to_fp16<ov::element::Type_t::f32>(const_node, postponed);
} else if (c_type == ov::element::f64) {
new_const = change_constant_precision_to_fp16<ov::element::Type_t::f64>(const_node);
new_const = change_constant_precision_to_fp16<ov::element::Type_t::f64>(const_node, postponed);
}
if (!new_const) // if out of range > threshold -> then new_const == nullptr
return false;
#else
if (c_type == ov::element::f32) {
auto size = shape_size(const_node->get_output_shape(0));
if (size == 0)
return false;
auto num_out_of_range =
ngraph::runtime::reference::count_out_of_f16_range(const_node->get_data_ptr<ov::element::f32>(), size);
// if more than 75% of a FP32 constant do not fit into FP16 keep in FP32
const float keep_threshold = 0.75f;
const float out_of_range_proportion = static_cast<float>(num_out_of_range) / static_cast<float>(size);
if (out_of_range_proportion >= keep_threshold)
return false;
if (postponed) {
new_const = const_node;
} else {
const auto* src_data = const_node->get_data_ptr<float>();
auto compressed_const =
std::make_shared<ov::op::v0::Constant>(ov::element::f16, const_node->get_shape());
auto* dst_data =
const_cast<ov::float16*>(reinterpret_cast<const ov::float16*>(compressed_const->get_data_ptr()));
OPENVINO_ASSERT(dst_data);
ngraph::runtime::reference::convert_from_f32_to_f16_with_clamp(src_data, dst_data, size);
new_const = compressed_const;
}
} else if (c_type == ov::element::f64) {
new_const = change_constant_precision_to_fp16<ov::element::Type_t::f64>(const_node, postponed);
} else {
return false;
}
#endif // !defined(OPENVINO_ARCH_X86) && !defined(OPENVINO_ARCH_X86_64)
if (!new_const) {
return false;
}
auto constant_target_inputs = const_node->get_output_target_inputs(0);
auto convert = std::make_shared<ov::op::v0::Convert>(new_const, const_node->get_element_type());
new_const->set_friendly_name(const_node->get_friendly_name() + "_compressed");
convert->set_friendly_name(const_node->get_friendly_name());
new_const->set_friendly_name(const_node->get_friendly_name() + "_compressed");
ov::copy_runtime_info(const_node, convert);
ov::mark_as_decompression(convert);
if (postponed) {
postpone_fp16_compression(new_const->get_rt_info());
postpone_fp16_compression(new_const->get_output_tensor(0).get_rt_info());
ov::replace_node(const_node, convert);
for (const auto& target_input : constant_target_inputs) {
target_input.replace_source_output(convert);
}
} else {
ov::replace_node(const_node, convert);
}
return true;
};

View File

@@ -895,6 +895,26 @@ std::shared_ptr<ngraph::Node> change_constant_precision(std::shared_ptr<opset4::
return new_constant;
}
// Fast-path specialization for f32 -> f16: delegates the element-wise
// conversion (with clamping to the f16 finite range) to the optimized
// reference implementation instead of the generic per-element loop.
template <>
std::shared_ptr<Node> change_constant_precision<ov::element::Type_t::f32, ov::element::Type_t::f16>(
    std::shared_ptr<opset4::Constant>& constant) {
    using src_type = typename element_type_traits<ov::element::Type_t::f32>::value_type;
    using dst_type = typename element_type_traits<ov::element::Type_t::f16>::value_type;
    const auto* src_data = constant->get_data_ptr<src_type>();
    const auto size = shape_size(constant->get_shape());
    auto new_constant = std::make_shared<opset4::Constant>(ov::element::Type_t::f16, constant->get_shape());
    // preserve tensor names on the replacement output
    new_constant->output(0).set_names(constant->output(0).get_names());
    // cast away const to fill the freshly allocated constant's buffer in place
    auto* dst_data = const_cast<dst_type*>(reinterpret_cast<const dst_type*>(new_constant->get_data_ptr()));
    if (dst_data == nullptr)
        OPENVINO_THROW("Can't get destination data pointer");
    ngraph::runtime::reference::convert_from_f32_to_f16_with_clamp(src_data, dst_data, size);
    return new_constant;
}
template <>
std::shared_ptr<Node> change_constant_precision<ov::element::Type_t::f16, ov::element::Type_t::f32>(
std::shared_ptr<opset4::Constant>& constant) {

View File

@@ -4,6 +4,10 @@
#include "transformations/rt_info/disable_fp16_compression.hpp"
namespace {
// RTMap key marking constants whose fp16 compression is postponed until
// serialization.  Stored by value: the original bound a namespace-scope
// const reference to a temporary std::string, which is legal (lifetime
// extension) but needlessly subtle.
const std::string postponed_fp16_compression_tag = "postponed_fp16_compression";
}  // namespace
void ov::disable_fp16_compression(const std::shared_ptr<Node>& node) {
auto& rt_info = node->get_rt_info();
rt_info[DisableFP16Compression::get_type_info_static()] = DisableFP16Compression{};
@@ -18,3 +22,15 @@ bool ov::fp16_compression_is_disabled(const std::shared_ptr<const Node>& node) {
const auto& rt_info = node->get_rt_info();
return rt_info.count(DisableFP16Compression::get_type_info_static());
}
// Marks the given rt_info so that fp16 compression of the owning constant is
// deferred (it will be performed later, e.g. during serialization).
void ov::postpone_fp16_compression(ov::RTMap& rt_info) {
    rt_info[postponed_fp16_compression_tag] = true;
}
// Returns true when the postponed-compression mark is present in the rt_info.
bool ov::is_fp16_compression_postponed(const ov::RTMap& rt_info) {
    return rt_info.find(postponed_fp16_compression_tag) != rt_info.end();
}
// Removes the postponed-compression mark; a no-op if the mark is absent.
void ov::do_not_postpone_fp16_compression(ov::RTMap& rt_info) {
    rt_info.erase(postponed_fp16_compression_tag);
}

View File

@@ -115,6 +115,12 @@ void convert<float16, int8_t>(const float16* arg, int8_t* out, size_t count);
#endif // OPENVINO_ARCH_X86 || OPENVINO_ARCH_X86_64
// Count how many f32 values are out of the normal finite numbers range when converted to f16
size_t count_out_of_f16_range(const float* arg, size_t count);
// Convert values from f32 to f16 with clamping to f16 min/max when the value is out of the normal finite numbers range
void convert_from_f32_to_f16_with_clamp(const float* arg, float16* out, size_t count);
// overload to handle ngraph::boolean (it is stored as char)
template <typename TI, typename TO>
typename std::enable_if<std::is_same<TO, char>::value>::type convert(const TI* arg, TO* out, size_t count) {

View File

@@ -5,17 +5,18 @@
#include "ngraph/runtime/reference/convert.hpp"
#if defined(OPENVINO_ARCH_X86) || defined(OPENVINO_ARCH_X86_64)
# include "jit_generator.hpp"
#endif
namespace ngraph {
namespace runtime {
namespace reference {
#if defined(OPENVINO_ARCH_X86) || defined(OPENVINO_ARCH_X86_64)
namespace {
template <typename src_t, typename dst_t>
template <typename src_t, typename dst_t, bool clamp = false>
void jit_convert_vec(jit::Generator&, const Xbyak::RegExp&, const Xbyak::RegExp&);
template <typename src_t, typename dst_t>
template <typename src_t, typename dst_t, bool clamp = false>
void jit_convert_vec_prepare(jit::Generator&) {}
template <>
@@ -53,6 +54,37 @@ void jit_convert_vec<float, float16>(jit::Generator& gen, const Xbyak::RegExp& s
gen.vmovdqu(gen.xword[dst], f16vec);
}
// Preparation for the clamping f32 -> f16 convert kernel: broadcast the f16
// finite max/lowest (as f32) into ymm5/ymm6 so the per-vector step can clamp
// with vminps/vmaxps.  Uses r15 as a scratch address register.
template <>
void jit_convert_vec_prepare<float, float16, true>(jit::Generator& gen) {
    auto upper_bound = gen.ymm5;
    auto lower_bound = gen.ymm6;
    auto addr = gen.r15;
    // statics give the bound arrays a stable address to embed in the JIT code
    static const float f16_max = std::numeric_limits<ov::float16>::max();
    static const float f16_min = std::numeric_limits<ov::float16>::lowest();
    static const float upper_bounds[8] = {f16_max, f16_max, f16_max, f16_max, f16_max, f16_max, f16_max, f16_max};
    static const float lower_bounds[8] = {f16_min, f16_min, f16_min, f16_min, f16_min, f16_min, f16_min, f16_min};
    gen.mov(addr, (size_t)upper_bounds);
    gen.vmovdqu(upper_bound, gen.yword[addr]);
    gen.mov(addr, (size_t)lower_bounds);
    gen.vmovdqu(lower_bound, gen.yword[addr]);
}
// Per-vector step of the clamping f32 -> f16 convert: load 8 floats, clamp to
// the f16 finite range (bounds prepared above in ymm5/ymm6), convert with
// vcvtps2ph and store 8 half-precision values.
template <>
void jit_convert_vec<float, float16, true>(jit::Generator& gen, const Xbyak::RegExp& src, const Xbyak::RegExp& dst) {
    auto f16vec = gen.xmm3;
    auto f32vec = gen.ymm4;
    auto upper_bound = gen.ymm5;
    auto lower_bound = gen.ymm6;
    gen.vmovups(f32vec, gen.yword[src]);
    gen.vminps(f32vec, f32vec, upper_bound);
    gen.vmaxps(f32vec, f32vec, lower_bound);
    gen.vcvtps2ph(f16vec, f32vec, 0);
    gen.vmovdqu(gen.xword[dst], f16vec);
}
template <>
void jit_convert_vec_prepare<float, int8_t>(jit::Generator& gen) {
auto order = gen.ymm1;
@@ -175,13 +207,13 @@ public:
typedef void (*fn_t)(const args_t*);
template <typename src_t, typename dst_t>
template <typename src_t, typename dst_t, bool clamp = false>
static fn_t get() {
if (is_x64() && mayiuse(avx) && mayiuse(avx2) && mayiuse(fp16)) {
static const jit_convert_array::context_t context{{sizeof(src_t), &jit::Generator::copy<src_t>},
{sizeof(dst_t), &jit::Generator::copy<dst_t>},
jit_convert_vec<src_t, dst_t>,
jit_convert_vec_prepare<src_t, dst_t>};
jit_convert_vec<src_t, dst_t, clamp>,
jit_convert_vec_prepare<src_t, dst_t, clamp>};
static jit_convert_array generator(context);
@@ -191,9 +223,9 @@ public:
}
};
template <typename TI, typename TO>
template <typename TI, typename TO, bool clamp = false>
void convert_impl(const TI* arg, TO* out, size_t count) {
auto converter = jit_convert_array::get<TI, TO>();
auto converter = jit_convert_array::get<TI, TO, clamp>();
if (converter) {
jit_convert_array::args_t args = {arg, out, count};
@@ -204,6 +236,232 @@ void convert_impl(const TI* arg, TO* out, size_t count) {
}
}
}
// Clamping f32 -> f16 conversion.  Uses the JIT kernel when available
// (see jit_convert_array::get); otherwise falls back to a scalar loop that
// clamps to the f16 finite range.  NaN fails both comparisons and is
// converted directly by the cast.
template <>
void convert_impl<float, float16, true>(const float* arg, float16* out, size_t count) {
    auto converter = jit_convert_array::get<float, float16, true>();
    if (converter) {
        jit_convert_array::args_t args = {arg, out, count};
        converter(&args);
    } else {
        for (size_t i = 0; i < count; ++i) {
            if (arg[i] > std::numeric_limits<ov::float16>::max()) {
                out[i] = std::numeric_limits<ov::float16>::max();
            } else if (arg[i] < std::numeric_limits<ov::float16>::lowest()) {
                out[i] = std::numeric_limits<ov::float16>::lowest();
            } else {
                out[i] = static_cast<ov::float16>(arg[i]);
            }
        }
    }
}
// Generic hooks for the out-of-range counting JIT kernel: prepare and
// finalize default to no-ops; the per-vector counting step has no generic
// implementation and must be specialized per type pair.
template <typename data_t, typename range_t>
void jit_count_out_of_range_vec_prepare(jit::Generator&) {}
template <typename data_t, typename range_t>
void jit_count_out_of_range_vec(jit::Generator&, const Xbyak::RegExp&);
template <typename data_t, typename range_t>
void jit_count_out_of_range_vec_finalize(jit::Generator&, const Xbyak::RegExp&) {}
// Preparation for counting f32 values that would not survive f32 -> f16
// conversion: broadcast the f16 finite bounds, the smallest +-f16 denormal,
// zero, and an all-ones i32 vector into fixed ymm registers, and clear the
// i64 accumulator (ymm4).  Uses r15 as a scratch address register.
template <>
void jit_count_out_of_range_vec_prepare<float, float16>(jit::Generator& gen) {
    auto accum_vec = gen.ymm4;
    auto f16_max_pos_vec = gen.ymm5;
    auto f16_max_neg_vec = gen.ymm6;
    auto f16_min_pos_vec = gen.ymm7;
    auto f16_min_neg_vec = gen.ymm8;
    auto f16_zero_vec = gen.ymm9;
    auto i32_ones_vec = gen.ymm10;
    auto addr = gen.r15;
    static const float f16_max_pos = std::numeric_limits<ov::float16>::max();
    static const float f16_max_neg = std::numeric_limits<ov::float16>::lowest();
    // 0x0001 is the smallest positive f16 denormal
    static const float f16_min_pos = ov::float16::from_bits(0x0001);
    static const float f16_min_neg = -ov::float16::from_bits(0x0001);
    static const int32_t i32_one = 1;
    // statics give the broadcast arrays a stable address to embed in JIT code
    static const float max_pos_bounds[8] =
        {f16_max_pos, f16_max_pos, f16_max_pos, f16_max_pos, f16_max_pos, f16_max_pos, f16_max_pos, f16_max_pos};
    static const float max_neg_bounds[8] =
        {f16_max_neg, f16_max_neg, f16_max_neg, f16_max_neg, f16_max_neg, f16_max_neg, f16_max_neg, f16_max_neg};
    static const float min_pos_bounds[8] =
        {f16_min_pos, f16_min_pos, f16_min_pos, f16_min_pos, f16_min_pos, f16_min_pos, f16_min_pos, f16_min_pos};
    static const float min_neg_bounds[8] =
        {f16_min_neg, f16_min_neg, f16_min_neg, f16_min_neg, f16_min_neg, f16_min_neg, f16_min_neg, f16_min_neg};
    static const int32_t i32_ones[8] = {i32_one, i32_one, i32_one, i32_one, i32_one, i32_one, i32_one, i32_one};
    auto load_vec = [&gen, &addr](Xbyak::Ymm vec, size_t ptr) {
        gen.mov(addr, ptr);
        gen.vmovdqu(vec, gen.yword[addr]);
    };
    load_vec(f16_max_pos_vec, (size_t)max_pos_bounds);
    load_vec(f16_max_neg_vec, (size_t)max_neg_bounds);
    load_vec(f16_min_pos_vec, (size_t)min_pos_bounds);
    load_vec(f16_min_neg_vec, (size_t)min_neg_bounds);
    load_vec(i32_ones_vec, (size_t)i32_ones);
    gen.vxorps(f16_zero_vec, f16_zero_vec, f16_zero_vec);
    gen.vxorps(accum_vec, accum_vec, accum_vec);
}
// Per-vector step: for 8 floats, build a mask of lanes that are out of the
// f16 representable range (non-zero denormals, or above max / below lowest),
// turn each masked lane into 1, and accumulate into the i64 counters in ymm4.
template <>
void jit_count_out_of_range_vec<float, float16>(jit::Generator& gen, const Xbyak::RegExp& data) {
    auto data_vec = gen.ymm1;
    auto mask_vec = gen.ymm2;
    auto mask_vec_xmm = gen.xmm2;
    auto tmp_vec = gen.ymm3;
    auto accum_vec = gen.ymm4;
    auto f16_max_pos_vec = gen.ymm5;
    auto f16_max_neg_vec = gen.ymm6;
    auto f16_min_pos_vec = gen.ymm7;
    auto f16_min_neg_vec = gen.ymm8;
    auto f16_zero_vec = gen.ymm9;
    auto i32_ones_vec = gen.ymm10;
    // vcmpps predicate immediates
    const unsigned char _cmp_lt_os = 1;
    const unsigned char _cmp_neq_uq = 4;
    const unsigned char _cmp_gt_os = 6;
    // std::abs(data) < ov::float16::from_bits(0x0001)
    gen.vmovups(data_vec, gen.yword[data]);
    gen.vcmpps(tmp_vec, data_vec, f16_min_pos_vec, _cmp_lt_os);
    gen.vcmpps(mask_vec, data_vec, f16_min_neg_vec, _cmp_gt_os);
    gen.vandps(mask_vec, mask_vec, tmp_vec);
    // data != 0.0f
    gen.vcmpps(tmp_vec, data_vec, f16_zero_vec, _cmp_neq_uq);
    gen.vandps(mask_vec, mask_vec, tmp_vec);
    // data > std::numeric_limits<ov::float16>::max()
    gen.vcmpps(tmp_vec, data_vec, f16_max_pos_vec, _cmp_gt_os);
    gen.vorps(mask_vec, mask_vec, tmp_vec);
    // data < std::numeric_limits<ov::float16>::lowest()
    gen.vcmpps(tmp_vec, data_vec, f16_max_neg_vec, _cmp_lt_os);
    gen.vorps(mask_vec, mask_vec, tmp_vec);
    // addition to i64 accumulator
    gen.vandps(mask_vec, mask_vec, i32_ones_vec);
    gen.vphaddd(mask_vec, mask_vec, mask_vec);
    gen.vpermq(mask_vec, mask_vec, 0x08);
    gen.vpmovsxdq(mask_vec, mask_vec_xmm);
    gen.vpaddq(accum_vec, accum_vec, mask_vec);
}
// Finalize: horizontally sum the four i64 partial counters in ymm4 and store
// the single i64 result at *dst.
template <>
void jit_count_out_of_range_vec_finalize<float, float16>(jit::Generator& gen, const Xbyak::RegExp& dst) {
    auto tmp_vec_xmm0 = gen.xmm2;  // reuse mask_vec
    auto tmp_vec_xmm1 = gen.xmm3;  // reuse tmp_vec
    auto accum_vec_ymm = gen.ymm4;
    auto accum_vec_xmm = gen.xmm4;
    // horizontal sum of four i64 values
    gen.vextractf128(tmp_vec_xmm0, accum_vec_ymm, 0);
    gen.vextractf128(tmp_vec_xmm1, accum_vec_ymm, 1);
    gen.vpaddq(accum_vec_xmm, tmp_vec_xmm0, tmp_vec_xmm1);
    gen.vpermilpd(tmp_vec_xmm0, accum_vec_xmm, 0x01);
    gen.vpaddq(accum_vec_xmm, accum_vec_xmm, tmp_vec_xmm0);
    gen.vmovq(gen.qword[dst], accum_vec_xmm);
}
// JIT generator that counts how many elements of a float array fall out of
// the f16 representable range.  The main loop consumes 8 floats (vlen) per
// iteration via the context's count_out_of_range hook; the remainder is
// copied into a zero-filled stack buffer so the same vector step can handle
// the tail (padded zeros are counted as in-range).  The final i64 count is
// written to args_t::dst.
class jit_count_out_of_range : public jit::Generator {
    typedef struct context {
        struct {
            size_t type_size;
            void (jit::Generator::*copy)(const Xbyak::Reg64& dst, const Xbyak::Reg64& src, const Xbyak::Reg64& size);
        } data;
        void (*prepare)(jit::Generator&);
        void (*count_out_of_range)(jit::Generator&, const Xbyak::RegExp&);
        void (*finalize)(jit::Generator&, const Xbyak::RegExp& dst);
    } context_t;

    jit_count_out_of_range(const context_t& ctx) {
        using namespace Xbyak;

        const uint32_t vlen = 8u;  // floats per ymm vector

        auto reg_src = rax;
        auto reg_dst = rbx;
        auto reg_sz = rdx;

        Label tail, exit;

        preamble();
        ctx.prepare(*this);

        mov(reg_src, ptr[param + offsetof(args_t, src)]);
        mov(reg_dst, ptr[param + offsetof(args_t, dst)]);
        mov(reg_sz, ptr[param + offsetof(args_t, count)]);
        xor_(rsi, rsi);
        mov(r8, reg_sz);
        shr(r8, 3);  // r8 = number of full 8-element vectors

        // main loop over full vectors; rsi counts iterations
        foreach (rsi, 1, r8, [&, this](const Xbyak::Reg64& idx) {
            ctx.count_out_of_range(*this, reg_src);
            add(reg_src, static_cast<uint32_t>(ctx.data.type_size * vlen));
        })
            ;

        L(tail);
        shl(rsi, 3);         // elements already processed
        sub(reg_sz, rsi);    // reg_sz = tail length
        test(reg_sz, reg_sz);
        jz(exit);

        // allocate array for 8 floats on stack
        sub(rsp, vlen * sizeof(float));
        mov(r8, rsp);

        // zero-fill the buffer so padded lanes read as 0.0f (in-range)
        auto tmp_vec = ymm2;  // reuse mask_vec
        vpxor(tmp_vec, tmp_vec, tmp_vec);
        vmovups(yword[r8], tmp_vec);

        // Tail conversion
        (this->*ctx.data.copy)(r8, reg_src, reg_sz);
        ctx.count_out_of_range(*this, r8);

        // Free the array on stack
        add(rsp, vlen * sizeof(float));

        L(exit);

        ctx.finalize(*this, reg_dst);
        postamble();
    }

public:
    typedef struct {
        const void* src;
        void* dst;
        const size_t count;
    } args_t;

    typedef void (*fn_t)(const args_t*);

    // Returns the generated kernel, or nullptr when AVX2 is unavailable
    // (callers must then use the scalar fallback).
    template <typename data_t, typename range_t>
    static fn_t get() {
        if (is_x64() && mayiuse(avx2)) {
            static const jit_count_out_of_range::context_t context{
                {sizeof(data_t), &jit::Generator::copy<data_t>},
                jit_count_out_of_range_vec_prepare<data_t, range_t>,
                jit_count_out_of_range_vec<data_t, range_t>,
                jit_count_out_of_range_vec_finalize<data_t, range_t>};

            static jit_count_out_of_range generator(context);

            return (fn_t)generator.getCode();
        }
        return nullptr;
    }
};
} // namespace
template <>
@@ -231,8 +489,49 @@ void convert<float16, int8_t>(const float16* arg, int8_t* out, size_t count) {
convert_impl(arg, out, count);
}
#endif // OPENVINO_ARCH_X86 || OPENVINO_ARCH_X86_64
// Convert `count` f32 values to f16, clamping values outside the f16 finite
// range to f16 max/lowest.  NaN fails both comparisons and is converted
// directly by the cast.
void convert_from_f32_to_f16_with_clamp(const float* arg, float16* out, size_t count) {
#if defined(OPENVINO_ARCH_X86) || defined(OPENVINO_ARCH_X86_64)
    // JIT path with a scalar fallback inside convert_impl
    convert_impl<float, float16, true>(arg, out, count);
#else
    // FIXME: duplicate and stub for ARM, provide more optimized solution
    for (size_t i = 0; i < count; ++i) {
        if (arg[i] > std::numeric_limits<ov::float16>::max()) {
            out[i] = std::numeric_limits<ov::float16>::max();
        } else if (arg[i] < std::numeric_limits<ov::float16>::lowest()) {
            out[i] = std::numeric_limits<ov::float16>::lowest();
        } else {
            out[i] = static_cast<ov::float16>(arg[i]);
        }
    }
#endif  // defined(OPENVINO_ARCH_X86) || defined(OPENVINO_ARCH_X86_64)
}
// Count f32 values that would not survive conversion to f16: non-zero values
// smaller in magnitude than the smallest f16 denormal, or values above f16
// max / below f16 lowest.  Uses the AVX2 JIT kernel when available, with a
// scalar fallback otherwise.
size_t count_out_of_f16_range(const float* arg, size_t count) {
    size_t num_out_of_range = 0;

#if defined(OPENVINO_ARCH_X86) || defined(OPENVINO_ARCH_X86_64)
    auto converter = jit_count_out_of_range::get<float, float16>();
    if (converter) {
        jit_count_out_of_range::args_t args = {arg, &num_out_of_range, count};
        converter(&args);
        return num_out_of_range;
    }
#endif
    for (size_t i = 0; i < count; ++i) {
        // if abs value is smaller than the smallest positive fp16, but not zero
        if (std::abs(arg[i]) < ov::float16::from_bits(0x0001) && arg[i] != 0.0f) {
            num_out_of_range++;
        } else if (arg[i] > std::numeric_limits<ov::float16>::max()) {
            num_out_of_range++;
        } else if (arg[i] < std::numeric_limits<ov::float16>::lowest()) {
            num_out_of_range++;
        }
    }
    return num_out_of_range;
}
} // namespace reference
} // namespace runtime
} // namespace ngraph
#endif // OPENVINO_ARCH_X86 || OPENVINO_ARCH_X86_64

View File

@@ -356,7 +356,7 @@ void save_model(const std::shared_ptr<const ov::Model>& m, const std::string& ou
ov::pass::Manager manager;
if (compress_to_fp16) {
manager.register_pass<ov::pass::MarkPrecisionSensitiveConstants>();
manager.register_pass<ov::pass::CompressFloatConstants>();
manager.register_pass<ov::pass::CompressFloatConstants>(/*postponed=*/true);
}
manager.register_pass<ov::pass::FusedNamesCleanup>();
manager.register_pass<ov::pass::Serialize>(output_model, "");

View File

@@ -12,16 +12,19 @@
#include <unordered_map>
#include <unordered_set>
#include "ngraph/runtime/reference/convert.hpp"
#include "openvino/core/coordinate_diff.hpp"
#include "openvino/core/except.hpp"
#include "openvino/core/meta_data.hpp"
#include "openvino/core/model.hpp"
#include "openvino/core/type/float16.hpp"
#include "openvino/op/util/framework_node.hpp"
#include "openvino/opsets/opset1.hpp"
#include "openvino/pass/constant_folding.hpp"
#include "openvino/util/file_util.hpp"
#include "pugixml.hpp"
#include "transformations/hash.hpp"
#include "transformations/rt_info/disable_fp16_compression.hpp"
#include "transformations/rt_info/primitives_priority_attribute.hpp"
OPENVINO_SUPPRESS_DEPRECATED_START
@@ -89,13 +92,28 @@ public:
m_enable_compression(enable_compression),
m_blob_offset(bin_data.tellp()) {}
FilePosition write(const char* ptr, size_t size) {
FilePosition write(const char* ptr,
size_t size,
size_t* new_size,
bool compress_to_fp16 = false,
ov::element::Type src_type = ov::element::dynamic) {
const FilePosition write_pos = m_binary_output.tellp();
const auto offset = write_pos - m_blob_offset;
if (!m_enable_compression) {
m_binary_output.write(ptr, size);
*new_size = size;
if (!m_enable_compression || compress_to_fp16) {
write_with_optional_fp16_compression(ptr, size, new_size, compress_to_fp16, src_type);
return offset;
}
// TODO: Find a way to keep both types of compression (m_enable_compression and compress_to_fp16)
// simultaneously. Disabled usual compression by m_enable_compression for those constants that are requested to
// be compressed by compress_to_fp16 for now. To implement both compression types applied simultaneously
// we need to save element_type for each constant in the cache together with the compression status
// that implies a wider impact and requires a more accurate implementation of cache handling.
// When FP16 compression is turned on together with the usual compression enabled by m_enable_compression, we
// can avoid comparing FP32 weights, but it would require comparing with data from a file, because on-the-fly
// converted FP16 constants are not kept in memory.
// This hash is weak (but efficient) and must be replace with some other
// more stable hash algorithm. For example current hash algorithms gives
// the same hash for {2, 2} and {0, 128} arrays. So we have to compare
@@ -107,13 +125,64 @@ public:
return found->second.first;
}
m_binary_output.write(ptr, size);
write_with_optional_fp16_compression(ptr, size, new_size, compress_to_fp16, src_type);
m_hash_to_file_positions.insert({hash, {offset, static_cast<void const*>(ptr)}});
return offset;
}
private:
// Writes `size` bytes from `ptr` to the binary output stream, optionally
// converting the payload to FP16 on the fly.
//
// When `compress_to_fp16` is false the buffer is written verbatim; otherwise
// the buffer is interpreted as elements of `src_type` (f32 or f64), converted
// to f16 with clamping, and the converted bytes are written instead.
// `*new_size` receives the number of bytes actually written to the stream.
void write_with_optional_fp16_compression(const char* ptr,
                                          size_t size,
                                          size_t* new_size,
                                          bool compress_to_fp16 = false,
                                          ov::element::Type src_type = ov::element::dynamic) {
    if (!compress_to_fp16) {
        m_binary_output.write(ptr, size);
        // Report the written size here as well, so this helper is correct even
        // if a caller did not pre-initialize *new_size before the call.
        *new_size = size;
    } else {
        // The byte count must describe a whole number of src_type elements.
        OPENVINO_ASSERT(size % src_type.size() == 0);
        auto fp16_buffer = compress_data_to_fp16(ptr, size, src_type, new_size);
        m_binary_output.write(fp16_buffer.get(), *new_size);
        // The temporary FP16 buffer is disposed here; the converted data
        // exists only in the output file from this point on.
    }
}
// Converts a raw buffer of `src_type` elements (f32 or f64) into a newly
// allocated FP16 buffer.
//
// `size` is the source buffer size in bytes; on return `*compressed_size`
// holds the size in bytes of the returned FP16 buffer. Values outside the
// representable f16 range are clamped to f16 max/lowest, and nonzero values
// with magnitude below the smallest positive f16 are flushed to zero.
// Throws for any other `src_type`.
std::unique_ptr<char[]> compress_data_to_fp16(const char* ptr,
                                              size_t size,
                                              ov::element::Type src_type,
                                              size_t* compressed_size) {
    auto num_src_elements = size / src_type.size();
    *compressed_size = num_src_elements * ov::element::f16.size();
    if (src_type == ov::element::f32) {
        auto new_ptr = std::unique_ptr<char[]>(new char[*compressed_size]);
        auto dst_data = reinterpret_cast<ov::float16*>(new_ptr.get());
        auto src_data = reinterpret_cast<const float*>(ptr);
        // Optimized clamping conversion implemented in the reference runtime.
        ngraph::runtime::reference::convert_from_f32_to_f16_with_clamp(src_data, dst_data, num_src_elements);
        return new_ptr;
    } else if (src_type == ov::element::f64) {
        auto new_ptr = std::unique_ptr<char[]>(new char[*compressed_size]);
        auto dst_data = reinterpret_cast<ov::float16*>(new_ptr.get());
        auto src_data = reinterpret_cast<const double*>(ptr);
        // Reference implementation for fp64 to fp16 conversion.
        for (size_t i = 0; i < num_src_elements; ++i) {
            const double val = src_data[i];
            // If the abs value is smaller than the smallest positive fp16
            // (subnormal bit pattern 0x0001), but not zero, flush to zero.
            if (std::abs(val) < ov::float16::from_bits(0x0001) && val != 0.0) {
                dst_data[i] = 0;
            } else if (val > std::numeric_limits<ov::float16>::max()) {
                dst_data[i] = std::numeric_limits<ov::float16>::max();
            } else if (val < std::numeric_limits<ov::float16>::lowest()) {
                dst_data[i] = std::numeric_limits<ov::float16>::lowest();
            } else {
                dst_data[i] = static_cast<ov::float16>(val);
            }
        }
        return new_ptr;
    } else {
        OPENVINO_THROW("[ INTERNAL ERROR ] Not supported source type for weights compression: ", src_type);
    }
}
ConstWritePositions m_hash_to_file_positions;
std::ostream& m_binary_output;
bool m_enable_compression;
@@ -237,6 +306,8 @@ class XmlSerializer : public ov::AttributeVisitor {
ConstantWriter& m_constant_write_handler;
int64_t m_version;
bool m_deterministic;
bool m_compress_to_fp16;
ov::element::Type m_output_element_type;
template <typename T>
std::string create_atribute_list(ov::ValueAccessor<std::vector<T>>& adapter) {
@@ -354,13 +425,17 @@ public:
const std::map<std::string, ngraph::OpSet>& custom_opsets,
ConstantWriter& constant_write_handler,
int64_t version,
bool deterministic = false)
bool deterministic = false,
bool compress_to_fp16 = false,
ov::element::Type output_element_type = ov::element::dynamic)
: m_xml_node(data),
m_node_type_name(node_type_name),
m_custom_opsets(custom_opsets),
m_constant_write_handler(constant_write_handler),
m_version(version),
m_deterministic(deterministic) {}
m_deterministic(deterministic),
m_compress_to_fp16(compress_to_fp16),
m_output_element_type(output_element_type) {}
void on_adapter(const std::string& name, ov::ValueAccessor<void>& adapter) override {
using BodyTargetNames = std::tuple<std::string, std::string, std::vector<std::string>>;
@@ -444,10 +519,15 @@ public:
ov::as_type<ov::AttributeAdapter<std::shared_ptr<ngraph::runtime::AlignedBuffer>>>(&adapter)) {
if (name == "value" && translate_type_name(m_node_type_name) == "Const") {
const int64_t size = a->get()->size();
int64_t offset = m_constant_write_handler.write(static_cast<const char*>(a->get()->get_ptr()), size);
size_t new_size;
int64_t offset = m_constant_write_handler.write(static_cast<const char*>(a->get()->get_ptr()),
size,
&new_size,
m_compress_to_fp16,
m_output_element_type);
m_xml_node.append_attribute("offset").set_value(static_cast<unsigned long long>(offset));
m_xml_node.append_attribute("size").set_value(static_cast<unsigned long long>(size));
m_xml_node.append_attribute("size").set_value(static_cast<unsigned long long>(new_size));
}
} else if (const auto& a = ov::as_type<ov::AttributeAdapter<ov::op::util::FrameworkNodeAttrs>>(&adapter)) {
const auto& attrs = a->get();
@@ -496,7 +576,13 @@ public:
m_xml_node.append_attribute(name.c_str()).set_value(adapter.get());
}
void on_adapter(const std::string& name, ov::ValueAccessor<std::string>& adapter) override {
m_xml_node.append_attribute(name.c_str()).set_value(adapter.get().c_str());
std::string value;
if (m_compress_to_fp16 && name == "element_type") {
value = ov::as_string(static_cast<ov::element::Type_t>(ov::element::f16));
} else {
value = adapter.get();
}
m_xml_node.append_attribute(name.c_str()).set_value(value.c_str());
}
void on_adapter(const std::string& name, ov::ValueAccessor<int64_t>& adapter) override {
m_xml_node.append_attribute(name.c_str()).set_value(static_cast<long long>(adapter.get()));
@@ -913,7 +999,12 @@ void ngfunction_2_ir(pugi::xml_node& netXml,
pugi::xml_node port = input.append_child("port");
port.append_attribute("id").set_value(port_id++);
port.append_attribute("precision").set_value(get_precision_name(i.get_element_type()).c_str());
auto rt_info = i.get_tensor().get_rt_info();
auto port_element_type =
is_fp16_compression_postponed(rt_info) ? ov::element::f16 : i.get_element_type();
port.append_attribute("precision").set_value(get_precision_name(port_element_type).c_str());
for (auto d : i.get_partial_shape()) {
pugi::xml_node dim = port.append_child("dim");
if (d.is_dynamic()) {
@@ -937,7 +1028,12 @@ void ngfunction_2_ir(pugi::xml_node& netXml,
for (auto& o : node->outputs()) {
pugi::xml_node port = output.append_child("port");
port.append_attribute("id").set_value(port_id++);
port.append_attribute("precision").set_value(get_precision_name(o.get_element_type()).c_str());
auto rt_info = o.get_tensor().get_rt_info();
auto port_element_type =
is_fp16_compression_postponed(rt_info) ? ov::element::f16 : o.get_element_type();
port.append_attribute("precision").set_value(get_precision_name(port_element_type).c_str());
// Sort tensor names
const auto& tensor_names = o.get_tensor().get_names();
@@ -973,6 +1069,12 @@ void ngfunction_2_ir(pugi::xml_node& netXml,
// fill <data> general attributes
{
bool compress_to_fp16 = false;
ov::element::Type output_element_type = ov::element::dynamic;
if (is_fp16_compression_postponed(node->get_rt_info())) {
compress_to_fp16 = true;
output_element_type = node->get_output_element_type(0);
}
// Backward compatibility: clear padding values for nodes with auto_pad
PaddingsFixer fixed_node(node);
XmlSerializer visitor(data,
@@ -980,7 +1082,9 @@ void ngfunction_2_ir(pugi::xml_node& netXml,
custom_opsets,
constant_node_write_handler,
version,
deterministic);
deterministic,
compress_to_fp16,
output_element_type);
OPENVINO_ASSERT(fixed_node.get_node()->visit_attributes(visitor), "Visitor API is not supported in ", node);
}
rt_info::XmlSerializer{data}.serialize(node->get_rt_info());

View File

@@ -13,7 +13,7 @@ from openvino.tools.ovc.convert_impl import _convert
from openvino.tools.ovc.cli_parser import get_model_name_from_args
# pylint: disable=no-name-in-module,import-error
from openvino.runtime import serialize
from openvino.runtime import save_model
def main():
@@ -24,12 +24,8 @@ def main():
model_path = get_model_name_from_args(argv)
# TODO: replace compress_model + serialize with save_model
if argv.compress_to_fp16:
from openvino.tools.ovc.moc_frontend.offline_transformations import compress_model
compress_model(ngraph_function)
serialize(ngraph_function, model_path.encode('utf-8'), model_path.replace('.xml', '.bin').encode('utf-8'))
compress_to_fp16 = 'compress_to_fp16' in argv and argv.compress_to_fp16
save_model(ngraph_function, model_path.encode('utf-8'), compress_to_fp16)
print('[ SUCCESS ] XML file: {}'.format(model_path))
print('[ SUCCESS ] BIN file: {}'.format(model_path.replace('.xml', '.bin')))