Efficient FP32 -> FP16 conversion for convert_precision, save_model, ovc and mo (#18988)
* WIP Postpone fp16 in CompressFloatConstantsImpl * Apply suggestions from code review Co-authored-by: Ilya Lavrenov <ilya.lavrenov@intel.com> * WIP: Compression to FP16 in Serialize * Prepared for efficient fp32 to fp16 conversion * Update src/core/reference/src/runtime/reference/convert.cpp * Called real slow reference implementations in the place where the optimized versions are supposed to be implemented * Code style * Fixed 0 values in the fast f64 to f16 compression * Optimized convert_from_f32_to_f16_with_clamp * Added optimized f32->f16 instance of change_constant_precision * compression transformation Python test * use tmp dir, minor corrections * Update src/bindings/python/tests/test_transformations/test_compression.py * Update src/bindings/python/tests/test_transformations/test_compression.py * style fix * define rt_info for postponed_fp16_compression * remove redundant class * fix temp dir for Win in test_compression.py * update definitions in convert.hpp * Update implementation in convert.cpp * Update serialize.cpp * Update compress_float_constants.cpp * added macros for ARM/non_x86 in convert.cpp * fix macros in convert.cpp * change fixme placement in serialize.cpp * style_fix * Update src/core/reference/src/runtime/reference/convert.cpp * style_fix * Optimized count_out_of_f16_range * Code style * Revert unused * Update src/core/src/pass/serialize.cpp Co-authored-by: Ilya Lavrenov <ilya.lavrenov@intel.com> * Update src/core/reference/src/runtime/reference/convert.cpp Co-authored-by: Ilya Lavrenov <ilya.lavrenov@intel.com> * use optimized convert_from_f32_to_f16_with_clamp for non postponed * minor corrections * Update src/common/transformations/src/transformations/common_optimizations/compress_float_constants.cpp * Update compress_float_constants.cpp * Switched mo and ovc to save_model instead of serialize to leverage performance improvements in fp32->fp16 * Applied minor code imporvements to address review feedback * Minor changes in code * Update 
tools/ovc/openvino/tools/ovc/main.py * Apply suggestions from code review * Fixed failed test in case when both usual xml compression and fp16 compression are applied simultaneously (disabled for now) * Added description for CompressFloatConstantImpl postponed parameter * Description of postponed parameter for CompressFloatConstants * Reverted switching to save_model in mo as the compression can be applied not only via CLI and old code should be kept for Python path (not applicable for ovc) * Removed remaining committed test artefacts and reverted remaining changes in mo --------- Co-authored-by: Ilya Lavrenov <ilya.lavrenov@intel.com> Co-authored-by: dmitrygo <dmitry.gorokhov@intel.com> Co-authored-by: Vladimir Paramuzov <vladimir.paramuzov@intel.com> Co-authored-by: Pavel Esir <pavel.esir@intel.com> Co-authored-by: Pavel Esir <pavel.esir@gmail.com>
This commit is contained in:
@@ -0,0 +1,116 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
# Copyright (C) 2018-2023 Intel Corporation
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
import tempfile
|
||||
from typing import List
|
||||
|
||||
import numpy as np
|
||||
from openvino.runtime.op import Parameter, Constant
|
||||
from openvino.runtime.opset12 import add, multiply
|
||||
|
||||
import openvino as ov
|
||||
|
||||
|
||||
def make_constant(values, transposed):
    """Build an f32 Constant holding *values* as a row (transposed) or column vector."""
    shape = [1, len(values)] if transposed else [len(values), 1]
    return Constant(ov.Type.f32, ov.Shape(shape), values)
|
||||
|
||||
|
||||
# Values representable in fp16 after conversion from fp32
# (keep fp16 denormals, flush fp32 denormals to zero).
in_range = [-65504.0, -2.0, 1.00097656, -1.0, -0.99951172, -0.00006103515625, -0.000000059604645, 0.0,
            0.000000059604645, 0.99951172, 0.00006103515625, 1.0, 1.00097656, 2.0, 65504]
# Values outside the finite fp16 range, and their expected values after clamping conversion.
out_of_range = [float("-inf"), -65505.0, -1e-10, -1e-39, 1e-39, 1e-10, 65505.0, float("inf")]
converted_out_of_range = [-65504.0, -65504.0, 0, 0, 0, 0, 65504.0, 65504.0]

# Test inputs: mostly-representable and mostly-unrepresentable mixes.
more_in_range = out_of_range + in_range * 10
more_out_of_range = in_range + out_of_range * 10

# Reference result of converting more_in_range to fp16.
converted_more_in_range = converted_out_of_range + in_range * 10
|
||||
|
||||
|
||||
def make_model(add_consts, mul_consts):
    """Build parameter -> Add(const) -> Multiply(const) model with the given constant values."""
    param = Parameter(ov.Type.f32, ov.PartialShape([-1]))
    add_node = add(param, make_constant(add_consts, False))
    mul_node = multiply(add_node, make_constant(mul_consts, True))
    return ov.Model([mul_node], [param])
|
||||
|
||||
|
||||
def get_constants(model) -> List[Constant]:
    """Round-trip *model* through save_model/read_model and return the producers of
    input 1 of the Add and Multiply ops (in that order).

    For each op the returned entry is:
      * the Constant behind a Convert (compressed fp16 branch),
      * the Constant itself (uncompressed fp32 branch),
      * None if the producer is a Convert whose input is not a Constant.
    """
    from pathlib import Path
    model_name = Path(tempfile.gettempdir()) / "f32_partially_compressed.xml"
    ov.save_model(model, model_name)
    core = ov.Core()
    restored_model = core.read_model(model_name)

    op_ind_map = {"Add": 0, "Multiply": 1}
    # The [] placeholder is distinct from a legitimate None result, so the
    # assert below can verify that every expected op type was actually visited.
    # Use a comprehension (not [[]] * n) to avoid aliasing a single list object.
    constants_list = [[] for _ in op_ind_map]

    for op in restored_model.get_ordered_ops():
        op_type = op.get_type_info().name
        if op_type not in op_ind_map:
            continue

        # Reset per matched op: previously a producer that was neither Convert
        # nor Constant raised NameError (or silently reused the node found for
        # the previously matched op).
        const_node = None
        in_node = op.input_value(1).get_node()
        if in_node.get_type_info().name == "Convert":
            candidate = in_node.input_value(0).get_node()
            if candidate.get_type_info().name == "Constant":
                const_node = candidate
        elif in_node.get_type_info().name == "Constant":
            const_node = in_node

        constants_list[op_ind_map[op_type]] = const_node

    for node in constants_list:
        assert not isinstance(node, list)

    # sanity check that model is compilable
    ov.compile_model(restored_model)
    return constants_list
|
||||
|
||||
|
||||
def test_compression_1():
    """Mixed model: the Add constant compresses to fp16, the Multiply constant stays fp32."""
    const_fp16, const_fp32 = get_constants(make_model(more_in_range, more_out_of_range))
    assert const_fp32 is not None, "There is no Constant op on FP32 branch"
    assert const_fp16 is not None, "There is no compressed Constant + Convert op on FP16 branch"

    assert const_fp32.get_output_element_type(0) == ov.Type.f32
    expected_fp32 = np.array(more_out_of_range, dtype=np.float32)
    assert np.all(expected_fp32 == const_fp32.get_vector())

    assert const_fp16.get_output_element_type(0) == ov.Type.f16

    expected_fp16 = np.array(converted_more_in_range, dtype=np.float32)
    actual_fp16 = const_fp16.get_vector()
    msg = f"Difference: {expected_fp16 - actual_fp16}"
    assert np.all(expected_fp16 == actual_fp16), msg
|
||||
|
||||
|
||||
def test_compression_2():
    """Both branches fit into fp16: both constants must be compressed and clamped."""
    constants = get_constants(make_model(more_in_range, more_in_range))

    for const in constants:
        assert const is not None, "There is no Constant op on FP16 branch"
    for const in constants:
        assert const.get_output_element_type(0) == ov.Type.f16, "Const element type is not f16"

    finfo = np.finfo(np.float16)
    expected = np.clip(more_in_range, finfo.min, finfo.max).astype(np.float16)

    for const in constants:
        assert np.all(expected == const.get_vector())
|
||||
|
||||
|
||||
def test_no_compression():
    """Both branches dominated by out-of-fp16-range values: constants must stay fp32."""
    constants = get_constants(make_model(more_out_of_range, more_out_of_range))

    for const in constants:
        assert const is not None, "There is no Constant op on FP32 branch"
    for const in constants:
        assert const.get_output_element_type(0) == ov.Type.f32, "Const element type is not f32"

    expected = np.array(more_out_of_range, dtype=np.float32)
    for const in constants:
        assert np.all(expected == const.get_vector())
|
||||
@@ -24,7 +24,13 @@ class TRANSFORMATIONS_API CompressFloatConstants;
|
||||
class ov::pass::CompressFloatConstantsImpl : public ov::pass::MatcherPass {
|
||||
public:
|
||||
OPENVINO_RTTI("CompressFloatConstantsImpl", "0");
|
||||
CompressFloatConstantsImpl();
|
||||
/// @brief Transformation constructor
|
||||
/// @param postponed If true then the transformation won't compress the constants
|
||||
/// keeping them in the original type but still will insert Converts. This is
|
||||
/// a special mode of operation that requires another transformation to
|
||||
/// apply a real compression on constants. Constants eligible for
|
||||
/// postponed compression are marked with a special rt_info tag.
|
||||
CompressFloatConstantsImpl(bool postponed = false);
|
||||
};
|
||||
|
||||
/**
|
||||
@@ -44,8 +50,10 @@ public:
|
||||
class ov::pass::CompressFloatConstants : public ov::pass::GraphRewrite {
|
||||
public:
|
||||
OPENVINO_RTTI("CompressFloatConstants", "0");
|
||||
CompressFloatConstants() {
|
||||
add_matcher<ov::pass::CompressFloatConstantsImpl>();
|
||||
/// @brief Transformation constructor
|
||||
/// @param postponed Postponed compression, see ov::pass::CompressFloatConstantsImpl for details.
|
||||
CompressFloatConstants(bool postponed = false) {
|
||||
add_matcher<ov::pass::CompressFloatConstantsImpl>(postponed);
|
||||
add_matcher<ov::pass::AddOldApiMapToParameters>();
|
||||
}
|
||||
};
|
||||
|
||||
@@ -16,6 +16,12 @@ TRANSFORMATIONS_API void enable_fp16_compression(const std::shared_ptr<Node>& no
|
||||
|
||||
TRANSFORMATIONS_API bool fp16_compression_is_disabled(const std::shared_ptr<const Node>& node);
|
||||
|
||||
TRANSFORMATIONS_API void postpone_fp16_compression(RTMap& rt_info);
|
||||
|
||||
TRANSFORMATIONS_API bool is_fp16_compression_postponed(const RTMap& rt_info);
|
||||
|
||||
TRANSFORMATIONS_API void do_not_postpone_fp16_compression(RTMap& rt_info);
|
||||
|
||||
/**
|
||||
* @ingroup ie_runtime_attr_api
|
||||
* @brief DisableFP16Compression class represents runtime info attribute that marks operation
|
||||
|
||||
@@ -5,6 +5,7 @@
|
||||
#include "transformations/common_optimizations/compress_float_constants.hpp"
|
||||
|
||||
#include "itt.hpp"
|
||||
#include "ngraph/runtime/reference/convert.hpp"
|
||||
#include "openvino/core/rt_info.hpp"
|
||||
#include "openvino/op/constant.hpp"
|
||||
#include "openvino/op/convert.hpp"
|
||||
@@ -16,7 +17,8 @@
|
||||
|
||||
namespace {
|
||||
template <ov::element::Type_t PREC_FROM>
|
||||
std::shared_ptr<ov::Node> change_constant_precision_to_fp16(std::shared_ptr<ov::op::v0::Constant>& constant) {
|
||||
std::shared_ptr<ov::Node> change_constant_precision_to_fp16(std::shared_ptr<ov::op::v0::Constant>& constant,
|
||||
bool postponed = false) {
|
||||
using src_type = typename ov::element_type_traits<PREC_FROM>::value_type;
|
||||
|
||||
const auto* src_data = constant->get_data_ptr<src_type>();
|
||||
@@ -24,9 +26,10 @@ std::shared_ptr<ov::Node> change_constant_precision_to_fp16(std::shared_ptr<ov::
|
||||
|
||||
auto new_constant = std::make_shared<ov::op::v0::Constant>(ov::element::f16, constant->get_shape());
|
||||
auto* dst_data = const_cast<ov::float16*>(reinterpret_cast<const ov::float16*>(new_constant->get_data_ptr()));
|
||||
if (dst_data == nullptr)
|
||||
if (!dst_data || !size)
|
||||
return nullptr;
|
||||
|
||||
// slow implementation: is used when optimized ones are not available: f64 or for ARM (both for f64 and f32)
|
||||
int num_out_of_range = 0;
|
||||
for (size_t i = 0; i < size; ++i) {
|
||||
// if abs value is smaller than the smallest positive fp16, but not zero
|
||||
@@ -44,18 +47,24 @@ std::shared_ptr<ov::Node> change_constant_precision_to_fp16(std::shared_ptr<ov::
|
||||
}
|
||||
|
||||
// if more than 75% of a FP32 constant do not fit into FP16 keep in FP32
|
||||
float keep_threshold = 0.75f;
|
||||
float out_of_range_proportion = static_cast<float>(num_out_of_range) / static_cast<float>(size);
|
||||
const float keep_threshold = 0.75f;
|
||||
const float out_of_range_proportion = static_cast<float>(num_out_of_range) / static_cast<float>(size);
|
||||
|
||||
if (out_of_range_proportion >= keep_threshold) {
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
return new_constant;
|
||||
if (postponed) {
|
||||
// dispose just converted constant to avoid allocation too much memory
|
||||
// it will be converted again while serialization
|
||||
return constant;
|
||||
} else {
|
||||
return new_constant;
|
||||
}
|
||||
}
|
||||
} // namespace
|
||||
|
||||
ov::pass::CompressFloatConstantsImpl::CompressFloatConstantsImpl() {
|
||||
ov::pass::CompressFloatConstantsImpl::CompressFloatConstantsImpl(bool postponed) {
|
||||
MATCHER_SCOPE(CompressFloatConstantsImpl);
|
||||
auto const_pattern = pattern::wrap_type<ov::op::v0::Constant>();
|
||||
|
||||
@@ -72,26 +81,68 @@ ov::pass::CompressFloatConstantsImpl::CompressFloatConstantsImpl() {
|
||||
|
||||
auto c_type = const_node->get_element_type();
|
||||
std::shared_ptr<ov::Node> new_const;
|
||||
|
||||
#if !defined(OPENVINO_ARCH_X86) && !defined(OPENVINO_ARCH_X86_64)
|
||||
if (c_type == ov::element::f32) {
|
||||
new_const = change_constant_precision_to_fp16<ov::element::Type_t::f32>(const_node);
|
||||
new_const = change_constant_precision_to_fp16<ov::element::Type_t::f32>(const_node, postponed);
|
||||
} else if (c_type == ov::element::f64) {
|
||||
new_const = change_constant_precision_to_fp16<ov::element::Type_t::f64>(const_node);
|
||||
new_const = change_constant_precision_to_fp16<ov::element::Type_t::f64>(const_node, postponed);
|
||||
}
|
||||
if (!new_const) // if out of range > threshold -> then new_const == nullptr
|
||||
return false;
|
||||
#else
|
||||
if (c_type == ov::element::f32) {
|
||||
auto size = shape_size(const_node->get_output_shape(0));
|
||||
if (size == 0)
|
||||
return false;
|
||||
auto num_out_of_range =
|
||||
ngraph::runtime::reference::count_out_of_f16_range(const_node->get_data_ptr<ov::element::f32>(), size);
|
||||
|
||||
// if more than 75% of a FP32 constant do not fit into FP16 keep in FP32
|
||||
const float keep_threshold = 0.75f;
|
||||
const float out_of_range_proportion = static_cast<float>(num_out_of_range) / static_cast<float>(size);
|
||||
if (out_of_range_proportion >= keep_threshold)
|
||||
return false;
|
||||
|
||||
if (postponed) {
|
||||
new_const = const_node;
|
||||
} else {
|
||||
const auto* src_data = const_node->get_data_ptr<float>();
|
||||
auto compressed_const =
|
||||
std::make_shared<ov::op::v0::Constant>(ov::element::f16, const_node->get_shape());
|
||||
auto* dst_data =
|
||||
const_cast<ov::float16*>(reinterpret_cast<const ov::float16*>(compressed_const->get_data_ptr()));
|
||||
OPENVINO_ASSERT(dst_data);
|
||||
ngraph::runtime::reference::convert_from_f32_to_f16_with_clamp(src_data, dst_data, size);
|
||||
new_const = compressed_const;
|
||||
}
|
||||
} else if (c_type == ov::element::f64) {
|
||||
new_const = change_constant_precision_to_fp16<ov::element::Type_t::f64>(const_node, postponed);
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
#endif // !defined(OPENVINO_ARCH_X86) && !defined(OPENVINO_ARCH_X86_64)
|
||||
|
||||
if (!new_const) {
|
||||
return false;
|
||||
}
|
||||
auto constant_target_inputs = const_node->get_output_target_inputs(0);
|
||||
auto convert = std::make_shared<ov::op::v0::Convert>(new_const, const_node->get_element_type());
|
||||
|
||||
new_const->set_friendly_name(const_node->get_friendly_name() + "_compressed");
|
||||
convert->set_friendly_name(const_node->get_friendly_name());
|
||||
new_const->set_friendly_name(const_node->get_friendly_name() + "_compressed");
|
||||
ov::copy_runtime_info(const_node, convert);
|
||||
ov::mark_as_decompression(convert);
|
||||
if (postponed) {
|
||||
postpone_fp16_compression(new_const->get_rt_info());
|
||||
postpone_fp16_compression(new_const->get_output_tensor(0).get_rt_info());
|
||||
|
||||
ov::replace_node(const_node, convert);
|
||||
|
||||
for (const auto& target_input : constant_target_inputs) {
|
||||
target_input.replace_source_output(convert);
|
||||
}
|
||||
} else {
|
||||
ov::replace_node(const_node, convert);
|
||||
}
|
||||
return true;
|
||||
};
|
||||
|
||||
|
||||
@@ -895,6 +895,26 @@ std::shared_ptr<ngraph::Node> change_constant_precision(std::shared_ptr<opset4::
|
||||
return new_constant;
|
||||
}
|
||||
|
||||
template <>
|
||||
std::shared_ptr<Node> change_constant_precision<ov::element::Type_t::f32, ov::element::Type_t::f16>(
|
||||
std::shared_ptr<opset4::Constant>& constant) {
|
||||
using src_type = typename element_type_traits<ov::element::Type_t::f32>::value_type;
|
||||
using dst_type = typename element_type_traits<ov::element::Type_t::f16>::value_type;
|
||||
|
||||
const auto* src_data = constant->get_data_ptr<src_type>();
|
||||
const auto size = shape_size(constant->get_shape());
|
||||
|
||||
auto new_constant = std::make_shared<opset4::Constant>(ov::element::Type_t::f16, constant->get_shape());
|
||||
new_constant->output(0).set_names(constant->output(0).get_names());
|
||||
auto* dst_data = const_cast<dst_type*>(reinterpret_cast<const dst_type*>(new_constant->get_data_ptr()));
|
||||
if (dst_data == nullptr)
|
||||
OPENVINO_THROW("Can't get destination data pointer");
|
||||
|
||||
ngraph::runtime::reference::convert_from_f32_to_f16_with_clamp(src_data, dst_data, size);
|
||||
|
||||
return new_constant;
|
||||
}
|
||||
|
||||
template <>
|
||||
std::shared_ptr<Node> change_constant_precision<ov::element::Type_t::f16, ov::element::Type_t::f32>(
|
||||
std::shared_ptr<opset4::Constant>& constant) {
|
||||
|
||||
@@ -4,6 +4,10 @@
|
||||
|
||||
#include "transformations/rt_info/disable_fp16_compression.hpp"
|
||||
|
||||
namespace {
|
||||
const std::string& postponed_fp16_compression_tag = "postponed_fp16_compression";
|
||||
}
|
||||
|
||||
void ov::disable_fp16_compression(const std::shared_ptr<Node>& node) {
|
||||
auto& rt_info = node->get_rt_info();
|
||||
rt_info[DisableFP16Compression::get_type_info_static()] = DisableFP16Compression{};
|
||||
@@ -18,3 +22,15 @@ bool ov::fp16_compression_is_disabled(const std::shared_ptr<const Node>& node) {
|
||||
const auto& rt_info = node->get_rt_info();
|
||||
return rt_info.count(DisableFP16Compression::get_type_info_static());
|
||||
}
|
||||
|
||||
void ov::postpone_fp16_compression(ov::RTMap& rt_info) {
|
||||
rt_info[postponed_fp16_compression_tag] = true;
|
||||
}
|
||||
|
||||
bool ov::is_fp16_compression_postponed(const ov::RTMap& rt_info) {
|
||||
return rt_info.count(postponed_fp16_compression_tag);
|
||||
}
|
||||
|
||||
void ov::do_not_postpone_fp16_compression(ov::RTMap& rt_info) {
|
||||
rt_info.erase(postponed_fp16_compression_tag);
|
||||
}
|
||||
|
||||
@@ -115,6 +115,12 @@ void convert<float16, int8_t>(const float16* arg, int8_t* out, size_t count);
|
||||
|
||||
#endif // OPENVINO_ARCH_X86 || OPENVINO_ARCH_X86_64
|
||||
|
||||
// Count how many f32 values is out of normal finite numbers range when converted to f16
|
||||
size_t count_out_of_f16_range(const float* arg, size_t count);
|
||||
|
||||
// Convert values from f32 to f16 with claming to f16 min/max when value is out of normal finite numbers range
|
||||
void convert_from_f32_to_f16_with_clamp(const float* arg, float16* out, size_t count);
|
||||
|
||||
// overload to handle ngraph::boolean (it is stored as char)
|
||||
template <typename TI, typename TO>
|
||||
typename std::enable_if<std::is_same<TO, char>::value>::type convert(const TI* arg, TO* out, size_t count) {
|
||||
|
||||
@@ -5,17 +5,18 @@
|
||||
#include "ngraph/runtime/reference/convert.hpp"
|
||||
|
||||
#if defined(OPENVINO_ARCH_X86) || defined(OPENVINO_ARCH_X86_64)
|
||||
|
||||
# include "jit_generator.hpp"
|
||||
#endif
|
||||
|
||||
namespace ngraph {
|
||||
namespace runtime {
|
||||
namespace reference {
|
||||
#if defined(OPENVINO_ARCH_X86) || defined(OPENVINO_ARCH_X86_64)
|
||||
namespace {
|
||||
template <typename src_t, typename dst_t>
|
||||
template <typename src_t, typename dst_t, bool clamp = false>
|
||||
void jit_convert_vec(jit::Generator&, const Xbyak::RegExp&, const Xbyak::RegExp&);
|
||||
|
||||
template <typename src_t, typename dst_t>
|
||||
template <typename src_t, typename dst_t, bool clamp = false>
|
||||
void jit_convert_vec_prepare(jit::Generator&) {}
|
||||
|
||||
template <>
|
||||
@@ -53,6 +54,37 @@ void jit_convert_vec<float, float16>(jit::Generator& gen, const Xbyak::RegExp& s
|
||||
gen.vmovdqu(gen.xword[dst], f16vec);
|
||||
}
|
||||
|
||||
template <>
|
||||
void jit_convert_vec_prepare<float, float16, true>(jit::Generator& gen) {
|
||||
auto upper_bound = gen.ymm5;
|
||||
auto lower_bound = gen.ymm6;
|
||||
auto addr = gen.r15;
|
||||
|
||||
static const float f16_max = std::numeric_limits<ov::float16>::max();
|
||||
static const float f16_min = std::numeric_limits<ov::float16>::lowest();
|
||||
static const float upper_bounds[8] = {f16_max, f16_max, f16_max, f16_max, f16_max, f16_max, f16_max, f16_max};
|
||||
static const float lower_bounds[8] = {f16_min, f16_min, f16_min, f16_min, f16_min, f16_min, f16_min, f16_min};
|
||||
|
||||
gen.mov(addr, (size_t)upper_bounds);
|
||||
gen.vmovdqu(upper_bound, gen.yword[addr]);
|
||||
gen.mov(addr, (size_t)lower_bounds);
|
||||
gen.vmovdqu(lower_bound, gen.yword[addr]);
|
||||
}
|
||||
|
||||
template <>
|
||||
void jit_convert_vec<float, float16, true>(jit::Generator& gen, const Xbyak::RegExp& src, const Xbyak::RegExp& dst) {
|
||||
auto f16vec = gen.xmm3;
|
||||
auto f32vec = gen.ymm4;
|
||||
auto upper_bound = gen.ymm5;
|
||||
auto lower_bound = gen.ymm6;
|
||||
|
||||
gen.vmovups(f32vec, gen.yword[src]);
|
||||
gen.vminps(f32vec, f32vec, upper_bound);
|
||||
gen.vmaxps(f32vec, f32vec, lower_bound);
|
||||
gen.vcvtps2ph(f16vec, f32vec, 0);
|
||||
gen.vmovdqu(gen.xword[dst], f16vec);
|
||||
}
|
||||
|
||||
template <>
|
||||
void jit_convert_vec_prepare<float, int8_t>(jit::Generator& gen) {
|
||||
auto order = gen.ymm1;
|
||||
@@ -175,13 +207,13 @@ public:
|
||||
|
||||
typedef void (*fn_t)(const args_t*);
|
||||
|
||||
template <typename src_t, typename dst_t>
|
||||
template <typename src_t, typename dst_t, bool clamp = false>
|
||||
static fn_t get() {
|
||||
if (is_x64() && mayiuse(avx) && mayiuse(avx2) && mayiuse(fp16)) {
|
||||
static const jit_convert_array::context_t context{{sizeof(src_t), &jit::Generator::copy<src_t>},
|
||||
{sizeof(dst_t), &jit::Generator::copy<dst_t>},
|
||||
jit_convert_vec<src_t, dst_t>,
|
||||
jit_convert_vec_prepare<src_t, dst_t>};
|
||||
jit_convert_vec<src_t, dst_t, clamp>,
|
||||
jit_convert_vec_prepare<src_t, dst_t, clamp>};
|
||||
|
||||
static jit_convert_array generator(context);
|
||||
|
||||
@@ -191,9 +223,9 @@ public:
|
||||
}
|
||||
};
|
||||
|
||||
template <typename TI, typename TO>
|
||||
template <typename TI, typename TO, bool clamp = false>
|
||||
void convert_impl(const TI* arg, TO* out, size_t count) {
|
||||
auto converter = jit_convert_array::get<TI, TO>();
|
||||
auto converter = jit_convert_array::get<TI, TO, clamp>();
|
||||
|
||||
if (converter) {
|
||||
jit_convert_array::args_t args = {arg, out, count};
|
||||
@@ -204,6 +236,232 @@ void convert_impl(const TI* arg, TO* out, size_t count) {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <>
|
||||
void convert_impl<float, float16, true>(const float* arg, float16* out, size_t count) {
|
||||
auto converter = jit_convert_array::get<float, float16, true>();
|
||||
|
||||
if (converter) {
|
||||
jit_convert_array::args_t args = {arg, out, count};
|
||||
converter(&args);
|
||||
} else {
|
||||
for (size_t i = 0; i < count; ++i) {
|
||||
if (arg[i] > std::numeric_limits<ov::float16>::max()) {
|
||||
out[i] = std::numeric_limits<ov::float16>::max();
|
||||
} else if (arg[i] < std::numeric_limits<ov::float16>::lowest()) {
|
||||
out[i] = std::numeric_limits<ov::float16>::lowest();
|
||||
} else {
|
||||
out[i] = static_cast<ov::float16>(arg[i]);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <typename data_t, typename range_t>
|
||||
void jit_count_out_of_range_vec_prepare(jit::Generator&) {}
|
||||
|
||||
template <typename data_t, typename range_t>
|
||||
void jit_count_out_of_range_vec(jit::Generator&, const Xbyak::RegExp&);
|
||||
|
||||
template <typename data_t, typename range_t>
|
||||
void jit_count_out_of_range_vec_finalize(jit::Generator&, const Xbyak::RegExp&) {}
|
||||
|
||||
template <>
|
||||
void jit_count_out_of_range_vec_prepare<float, float16>(jit::Generator& gen) {
|
||||
auto accum_vec = gen.ymm4;
|
||||
auto f16_max_pos_vec = gen.ymm5;
|
||||
auto f16_max_neg_vec = gen.ymm6;
|
||||
auto f16_min_pos_vec = gen.ymm7;
|
||||
auto f16_min_neg_vec = gen.ymm8;
|
||||
auto f16_zero_vec = gen.ymm9;
|
||||
auto i32_ones_vec = gen.ymm10;
|
||||
auto addr = gen.r15;
|
||||
|
||||
static const float f16_max_pos = std::numeric_limits<ov::float16>::max();
|
||||
static const float f16_max_neg = std::numeric_limits<ov::float16>::lowest();
|
||||
static const float f16_min_pos = ov::float16::from_bits(0x0001);
|
||||
static const float f16_min_neg = -ov::float16::from_bits(0x0001);
|
||||
static const int32_t i32_one = 1;
|
||||
|
||||
static const float max_pos_bounds[8] =
|
||||
{f16_max_pos, f16_max_pos, f16_max_pos, f16_max_pos, f16_max_pos, f16_max_pos, f16_max_pos, f16_max_pos};
|
||||
static const float max_neg_bounds[8] =
|
||||
{f16_max_neg, f16_max_neg, f16_max_neg, f16_max_neg, f16_max_neg, f16_max_neg, f16_max_neg, f16_max_neg};
|
||||
static const float min_pos_bounds[8] =
|
||||
{f16_min_pos, f16_min_pos, f16_min_pos, f16_min_pos, f16_min_pos, f16_min_pos, f16_min_pos, f16_min_pos};
|
||||
static const float min_neg_bounds[8] =
|
||||
{f16_min_neg, f16_min_neg, f16_min_neg, f16_min_neg, f16_min_neg, f16_min_neg, f16_min_neg, f16_min_neg};
|
||||
static const int32_t i32_ones[8] = {i32_one, i32_one, i32_one, i32_one, i32_one, i32_one, i32_one, i32_one};
|
||||
|
||||
auto load_vec = [&gen, &addr](Xbyak::Ymm vec, size_t ptr) {
|
||||
gen.mov(addr, ptr);
|
||||
gen.vmovdqu(vec, gen.yword[addr]);
|
||||
};
|
||||
|
||||
load_vec(f16_max_pos_vec, (size_t)max_pos_bounds);
|
||||
load_vec(f16_max_neg_vec, (size_t)max_neg_bounds);
|
||||
load_vec(f16_min_pos_vec, (size_t)min_pos_bounds);
|
||||
load_vec(f16_min_neg_vec, (size_t)min_neg_bounds);
|
||||
load_vec(i32_ones_vec, (size_t)i32_ones);
|
||||
gen.vxorps(f16_zero_vec, f16_zero_vec, f16_zero_vec);
|
||||
gen.vxorps(accum_vec, accum_vec, accum_vec);
|
||||
}
|
||||
|
||||
template <>
|
||||
void jit_count_out_of_range_vec<float, float16>(jit::Generator& gen, const Xbyak::RegExp& data) {
|
||||
auto data_vec = gen.ymm1;
|
||||
auto mask_vec = gen.ymm2;
|
||||
auto mask_vec_xmm = gen.xmm2;
|
||||
auto tmp_vec = gen.ymm3;
|
||||
auto accum_vec = gen.ymm4;
|
||||
auto f16_max_pos_vec = gen.ymm5;
|
||||
auto f16_max_neg_vec = gen.ymm6;
|
||||
auto f16_min_pos_vec = gen.ymm7;
|
||||
auto f16_min_neg_vec = gen.ymm8;
|
||||
auto f16_zero_vec = gen.ymm9;
|
||||
auto i32_ones_vec = gen.ymm10;
|
||||
|
||||
const unsigned char _cmp_lt_os = 1;
|
||||
const unsigned char _cmp_neq_uq = 4;
|
||||
const unsigned char _cmp_gt_os = 6;
|
||||
|
||||
// std::abs(data) < ov::float16::from_bits(0x0001)
|
||||
gen.vmovups(data_vec, gen.yword[data]);
|
||||
gen.vcmpps(tmp_vec, data_vec, f16_min_pos_vec, _cmp_lt_os);
|
||||
gen.vcmpps(mask_vec, data_vec, f16_min_neg_vec, _cmp_gt_os);
|
||||
gen.vandps(mask_vec, mask_vec, tmp_vec);
|
||||
|
||||
// data != 0.0f
|
||||
gen.vcmpps(tmp_vec, data_vec, f16_zero_vec, _cmp_neq_uq);
|
||||
gen.vandps(mask_vec, mask_vec, tmp_vec);
|
||||
|
||||
// data > std::numeric_limits<ov::float16>::max()
|
||||
gen.vcmpps(tmp_vec, data_vec, f16_max_pos_vec, _cmp_gt_os);
|
||||
gen.vorps(mask_vec, mask_vec, tmp_vec);
|
||||
|
||||
// data < std::numeric_limits<ov::float16>::lowest()
|
||||
gen.vcmpps(tmp_vec, data_vec, f16_max_neg_vec, _cmp_lt_os);
|
||||
gen.vorps(mask_vec, mask_vec, tmp_vec);
|
||||
|
||||
// addition to i64 accumulator
|
||||
gen.vandps(mask_vec, mask_vec, i32_ones_vec);
|
||||
gen.vphaddd(mask_vec, mask_vec, mask_vec);
|
||||
gen.vpermq(mask_vec, mask_vec, 0x08);
|
||||
gen.vpmovsxdq(mask_vec, mask_vec_xmm);
|
||||
gen.vpaddq(accum_vec, accum_vec, mask_vec);
|
||||
}
|
||||
|
||||
template <>
|
||||
void jit_count_out_of_range_vec_finalize<float, float16>(jit::Generator& gen, const Xbyak::RegExp& dst) {
|
||||
auto tmp_vec_xmm0 = gen.xmm2; // reuse mask_vec
|
||||
auto tmp_vec_xmm1 = gen.xmm3; // reuse tmp_vec
|
||||
auto accum_vec_ymm = gen.ymm4;
|
||||
auto accum_vec_xmm = gen.xmm4;
|
||||
|
||||
// horizontal sum of four i64 values
|
||||
gen.vextractf128(tmp_vec_xmm0, accum_vec_ymm, 0);
|
||||
gen.vextractf128(tmp_vec_xmm1, accum_vec_ymm, 1);
|
||||
gen.vpaddq(accum_vec_xmm, tmp_vec_xmm0, tmp_vec_xmm1);
|
||||
gen.vpermilpd(tmp_vec_xmm0, accum_vec_xmm, 0x01);
|
||||
gen.vpaddq(accum_vec_xmm, accum_vec_xmm, tmp_vec_xmm0);
|
||||
gen.vmovq(gen.qword[dst], accum_vec_xmm);
|
||||
}
|
||||
|
||||
class jit_count_out_of_range : public jit::Generator {
|
||||
typedef struct context {
|
||||
struct {
|
||||
size_t type_size;
|
||||
void (jit::Generator::*copy)(const Xbyak::Reg64& dst, const Xbyak::Reg64& src, const Xbyak::Reg64& size);
|
||||
} data;
|
||||
void (*prepare)(jit::Generator&);
|
||||
void (*count_out_of_range)(jit::Generator&, const Xbyak::RegExp&);
|
||||
void (*finalize)(jit::Generator&, const Xbyak::RegExp& dst);
|
||||
} context_t;
|
||||
|
||||
jit_count_out_of_range(const context_t& ctx) {
|
||||
using namespace Xbyak;
|
||||
|
||||
const uint32_t vlen = 8u;
|
||||
|
||||
auto reg_src = rax;
|
||||
auto reg_dst = rbx;
|
||||
auto reg_sz = rdx;
|
||||
|
||||
Label tail, exit;
|
||||
|
||||
preamble();
|
||||
|
||||
ctx.prepare(*this);
|
||||
|
||||
mov(reg_src, ptr[param + offsetof(args_t, src)]);
|
||||
mov(reg_dst, ptr[param + offsetof(args_t, dst)]);
|
||||
mov(reg_sz, ptr[param + offsetof(args_t, count)]);
|
||||
|
||||
xor_(rsi, rsi);
|
||||
mov(r8, reg_sz);
|
||||
shr(r8, 3);
|
||||
|
||||
foreach (rsi, 1, r8, [&, this](const Xbyak::Reg64& idx) {
|
||||
ctx.count_out_of_range(*this, reg_src);
|
||||
add(reg_src, static_cast<uint32_t>(ctx.data.type_size * vlen));
|
||||
})
|
||||
;
|
||||
|
||||
L(tail);
|
||||
|
||||
shl(rsi, 3);
|
||||
sub(reg_sz, rsi);
|
||||
test(reg_sz, reg_sz);
|
||||
jz(exit);
|
||||
|
||||
// allocate array for 8 floats on stack
|
||||
sub(rsp, vlen * sizeof(float));
|
||||
mov(r8, rsp);
|
||||
|
||||
auto tmp_vec = ymm2; // reuse mask_vec
|
||||
vpxor(tmp_vec, tmp_vec, tmp_vec);
|
||||
vmovups(yword[r8], tmp_vec);
|
||||
|
||||
// Tail conversion
|
||||
(this->*ctx.data.copy)(r8, reg_src, reg_sz);
|
||||
ctx.count_out_of_range(*this, r8);
|
||||
|
||||
// Free the array on stack
|
||||
add(rsp, vlen * sizeof(float));
|
||||
|
||||
L(exit);
|
||||
|
||||
ctx.finalize(*this, reg_dst);
|
||||
|
||||
postamble();
|
||||
}
|
||||
|
||||
public:
|
||||
typedef struct {
|
||||
const void* src;
|
||||
void* dst;
|
||||
const size_t count;
|
||||
} args_t;
|
||||
|
||||
typedef void (*fn_t)(const args_t*);
|
||||
|
||||
template <typename data_t, typename range_t>
|
||||
static fn_t get() {
|
||||
if (is_x64() && mayiuse(avx2)) {
|
||||
static const jit_count_out_of_range::context_t context{
|
||||
{sizeof(data_t), &jit::Generator::copy<data_t>},
|
||||
jit_count_out_of_range_vec_prepare<data_t, range_t>,
|
||||
jit_count_out_of_range_vec<data_t, range_t>,
|
||||
jit_count_out_of_range_vec_finalize<data_t, range_t>};
|
||||
|
||||
static jit_count_out_of_range generator(context);
|
||||
|
||||
return (fn_t)generator.getCode();
|
||||
}
|
||||
return nullptr;
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace
|
||||
|
||||
template <>
|
||||
@@ -231,8 +489,49 @@ void convert<float16, int8_t>(const float16* arg, int8_t* out, size_t count) {
|
||||
convert_impl(arg, out, count);
|
||||
}
|
||||
|
||||
#endif // OPENVINO_ARCH_X86 || OPENVINO_ARCH_X86_64
|
||||
|
||||
void convert_from_f32_to_f16_with_clamp(const float* arg, float16* out, size_t count) {
|
||||
#if defined(OPENVINO_ARCH_X86) || defined(OPENVINO_ARCH_X86_64)
|
||||
convert_impl<float, float16, true>(arg, out, count);
|
||||
#else
|
||||
// FIXME: duplicate and stub for ARM, provide more optimized solution
|
||||
for (size_t i = 0; i < count; ++i) {
|
||||
if (arg[i] > std::numeric_limits<ov::float16>::max()) {
|
||||
out[i] = std::numeric_limits<ov::float16>::max();
|
||||
} else if (arg[i] < std::numeric_limits<ov::float16>::lowest()) {
|
||||
out[i] = std::numeric_limits<ov::float16>::lowest();
|
||||
} else {
|
||||
out[i] = static_cast<ov::float16>(arg[i]);
|
||||
}
|
||||
}
|
||||
#endif // defined(OPENVINO_ARCH_X86) || defined(OPENVINO_ARCH_X86_64)
|
||||
}
|
||||
|
||||
size_t count_out_of_f16_range(const float* arg, size_t count) {
|
||||
size_t num_out_of_range = 0;
|
||||
|
||||
#if defined(OPENVINO_ARCH_X86) || defined(OPENVINO_ARCH_X86_64)
|
||||
auto converter = jit_count_out_of_range::get<float, float16>();
|
||||
if (converter) {
|
||||
jit_count_out_of_range::args_t args = {arg, &num_out_of_range, count};
|
||||
converter(&args);
|
||||
return num_out_of_range;
|
||||
}
|
||||
#endif
|
||||
for (size_t i = 0; i < count; ++i) {
|
||||
// if abs value is smaller than the smallest positive fp16, but not zero
|
||||
if (std::abs(arg[i]) < ov::float16::from_bits(0x0001) && arg[i] != 0.0f) {
|
||||
num_out_of_range++;
|
||||
} else if (arg[i] > std::numeric_limits<ov::float16>::max()) {
|
||||
num_out_of_range++;
|
||||
} else if (arg[i] < std::numeric_limits<ov::float16>::lowest()) {
|
||||
num_out_of_range++;
|
||||
}
|
||||
}
|
||||
return num_out_of_range;
|
||||
}
|
||||
|
||||
} // namespace reference
|
||||
} // namespace runtime
|
||||
} // namespace ngraph
|
||||
|
||||
#endif // OPENVINO_ARCH_X86 || OPENVINO_ARCH_X86_64
|
||||
|
||||
@@ -356,7 +356,7 @@ void save_model(const std::shared_ptr<const ov::Model>& m, const std::string& ou
|
||||
ov::pass::Manager manager;
|
||||
if (compress_to_fp16) {
|
||||
manager.register_pass<ov::pass::MarkPrecisionSensitiveConstants>();
|
||||
manager.register_pass<ov::pass::CompressFloatConstants>();
|
||||
manager.register_pass<ov::pass::CompressFloatConstants>(/*postponed=*/true);
|
||||
}
|
||||
manager.register_pass<ov::pass::FusedNamesCleanup>();
|
||||
manager.register_pass<ov::pass::Serialize>(output_model, "");
|
||||
|
||||
@@ -12,16 +12,19 @@
|
||||
#include <unordered_map>
|
||||
#include <unordered_set>
|
||||
|
||||
#include "ngraph/runtime/reference/convert.hpp"
|
||||
#include "openvino/core/coordinate_diff.hpp"
|
||||
#include "openvino/core/except.hpp"
|
||||
#include "openvino/core/meta_data.hpp"
|
||||
#include "openvino/core/model.hpp"
|
||||
#include "openvino/core/type/float16.hpp"
|
||||
#include "openvino/op/util/framework_node.hpp"
|
||||
#include "openvino/opsets/opset1.hpp"
|
||||
#include "openvino/pass/constant_folding.hpp"
|
||||
#include "openvino/util/file_util.hpp"
|
||||
#include "pugixml.hpp"
|
||||
#include "transformations/hash.hpp"
|
||||
#include "transformations/rt_info/disable_fp16_compression.hpp"
|
||||
#include "transformations/rt_info/primitives_priority_attribute.hpp"
|
||||
|
||||
OPENVINO_SUPPRESS_DEPRECATED_START
|
||||
@@ -89,13 +92,28 @@ public:
|
||||
m_enable_compression(enable_compression),
|
||||
m_blob_offset(bin_data.tellp()) {}
|
||||
|
||||
FilePosition write(const char* ptr, size_t size) {
|
||||
FilePosition write(const char* ptr,
|
||||
size_t size,
|
||||
size_t* new_size,
|
||||
bool compress_to_fp16 = false,
|
||||
ov::element::Type src_type = ov::element::dynamic) {
|
||||
const FilePosition write_pos = m_binary_output.tellp();
|
||||
const auto offset = write_pos - m_blob_offset;
|
||||
if (!m_enable_compression) {
|
||||
m_binary_output.write(ptr, size);
|
||||
*new_size = size;
|
||||
|
||||
if (!m_enable_compression || compress_to_fp16) {
|
||||
write_with_optional_fp16_compression(ptr, size, new_size, compress_to_fp16, src_type);
|
||||
return offset;
|
||||
}
|
||||
// TODO: Find a way to keep both types of compression (m_enable_compression and compress_to_fp16)
|
||||
// simultaneously. Disabled usual compression by m_enable_compression for those constants that are requested to
|
||||
// be compressed by compress_to_fp16 for now. To implement both compression types applied simultaneously
|
||||
// we need to save element_type for each constant in the cache together with the compression status
|
||||
// that implies a wider impact and requires a more accurate implementation of cache handling.
|
||||
// When FP16 compression is turned on together with the usual compression enabled by m_enable_compression, we
|
||||
// can avoid comparing FP32 weights, but it would require comparing with data from a file, because on-the-fly
|
||||
// converted FP16 constants are not kept in memory.
|
||||
|
||||
// This hash is weak (but efficient) and must be replace with some other
|
||||
// more stable hash algorithm. For example current hash algorithms gives
|
||||
// the same hash for {2, 2} and {0, 128} arrays. So we have to compare
|
||||
@@ -107,13 +125,64 @@ public:
|
||||
return found->second.first;
|
||||
}
|
||||
|
||||
m_binary_output.write(ptr, size);
|
||||
write_with_optional_fp16_compression(ptr, size, new_size, compress_to_fp16, src_type);
|
||||
m_hash_to_file_positions.insert({hash, {offset, static_cast<void const*>(ptr)}});
|
||||
|
||||
return offset;
|
||||
}
|
||||
|
||||
private:
|
||||
void write_with_optional_fp16_compression(const char* ptr,
|
||||
size_t size,
|
||||
size_t* new_size,
|
||||
bool compress_to_fp16 = false,
|
||||
ov::element::Type src_type = ov::element::dynamic) {
|
||||
if (!compress_to_fp16) {
|
||||
m_binary_output.write(ptr, size);
|
||||
} else {
|
||||
OPENVINO_ASSERT(size % src_type.size() == 0);
|
||||
auto fp16_buffer = compress_data_to_fp16(ptr, size, src_type, new_size);
|
||||
m_binary_output.write(fp16_buffer.get(), *new_size);
|
||||
// Compressed data is disposed
|
||||
}
|
||||
}
|
||||
|
||||
std::unique_ptr<char[]> compress_data_to_fp16(const char* ptr,
|
||||
size_t size,
|
||||
ov::element::Type src_type,
|
||||
size_t* compressed_size) {
|
||||
auto num_src_elements = size / src_type.size();
|
||||
*compressed_size = num_src_elements * ov::element::f16.size();
|
||||
if (src_type == ov::element::f32) {
|
||||
auto new_ptr = std::unique_ptr<char[]>(new char[*compressed_size]);
|
||||
auto dst_data = reinterpret_cast<ov::float16*>(new_ptr.get());
|
||||
auto src_data = reinterpret_cast<const float*>(ptr);
|
||||
ngraph::runtime::reference::convert_from_f32_to_f16_with_clamp(src_data, dst_data, num_src_elements);
|
||||
return new_ptr;
|
||||
} else if (src_type == ov::element::f64) {
|
||||
auto new_ptr = std::unique_ptr<char[]>(new char[*compressed_size]);
|
||||
auto dst_data = reinterpret_cast<ov::float16*>(new_ptr.get());
|
||||
auto src_data = reinterpret_cast<const double*>(ptr);
|
||||
|
||||
// Reference implementation for fp64 to fp16 conversoin
|
||||
for (size_t i = 0; i < num_src_elements; ++i) {
|
||||
// if abs value is smaller than the smallest positive fp16, but not zero
|
||||
if (std::abs(src_data[i]) < ov::float16::from_bits(0x0001) && src_data[i] != 0.0f) {
|
||||
dst_data[i] = 0;
|
||||
} else if (src_data[i] > std::numeric_limits<ov::float16>::max()) {
|
||||
dst_data[i] = std::numeric_limits<ov::float16>::max();
|
||||
} else if (src_data[i] < std::numeric_limits<ov::float16>::lowest()) {
|
||||
dst_data[i] = std::numeric_limits<ov::float16>::lowest();
|
||||
} else {
|
||||
dst_data[i] = static_cast<ov::float16>(src_data[i]);
|
||||
}
|
||||
}
|
||||
return new_ptr;
|
||||
} else {
|
||||
OPENVINO_THROW("[ INTERNAL ERROR ] Not supported source type for weights compression: ", src_type);
|
||||
}
|
||||
}
|
||||
|
||||
ConstWritePositions m_hash_to_file_positions;
|
||||
std::ostream& m_binary_output;
|
||||
bool m_enable_compression;
|
||||
@@ -237,6 +306,8 @@ class XmlSerializer : public ov::AttributeVisitor {
|
||||
ConstantWriter& m_constant_write_handler;
|
||||
int64_t m_version;
|
||||
bool m_deterministic;
|
||||
bool m_compress_to_fp16;
|
||||
ov::element::Type m_output_element_type;
|
||||
|
||||
template <typename T>
|
||||
std::string create_atribute_list(ov::ValueAccessor<std::vector<T>>& adapter) {
|
||||
@@ -354,13 +425,17 @@ public:
|
||||
const std::map<std::string, ngraph::OpSet>& custom_opsets,
|
||||
ConstantWriter& constant_write_handler,
|
||||
int64_t version,
|
||||
bool deterministic = false)
|
||||
bool deterministic = false,
|
||||
bool compress_to_fp16 = false,
|
||||
ov::element::Type output_element_type = ov::element::dynamic)
|
||||
: m_xml_node(data),
|
||||
m_node_type_name(node_type_name),
|
||||
m_custom_opsets(custom_opsets),
|
||||
m_constant_write_handler(constant_write_handler),
|
||||
m_version(version),
|
||||
m_deterministic(deterministic) {}
|
||||
m_deterministic(deterministic),
|
||||
m_compress_to_fp16(compress_to_fp16),
|
||||
m_output_element_type(output_element_type) {}
|
||||
|
||||
void on_adapter(const std::string& name, ov::ValueAccessor<void>& adapter) override {
|
||||
using BodyTargetNames = std::tuple<std::string, std::string, std::vector<std::string>>;
|
||||
@@ -444,10 +519,15 @@ public:
|
||||
ov::as_type<ov::AttributeAdapter<std::shared_ptr<ngraph::runtime::AlignedBuffer>>>(&adapter)) {
|
||||
if (name == "value" && translate_type_name(m_node_type_name) == "Const") {
|
||||
const int64_t size = a->get()->size();
|
||||
int64_t offset = m_constant_write_handler.write(static_cast<const char*>(a->get()->get_ptr()), size);
|
||||
size_t new_size;
|
||||
int64_t offset = m_constant_write_handler.write(static_cast<const char*>(a->get()->get_ptr()),
|
||||
size,
|
||||
&new_size,
|
||||
m_compress_to_fp16,
|
||||
m_output_element_type);
|
||||
|
||||
m_xml_node.append_attribute("offset").set_value(static_cast<unsigned long long>(offset));
|
||||
m_xml_node.append_attribute("size").set_value(static_cast<unsigned long long>(size));
|
||||
m_xml_node.append_attribute("size").set_value(static_cast<unsigned long long>(new_size));
|
||||
}
|
||||
} else if (const auto& a = ov::as_type<ov::AttributeAdapter<ov::op::util::FrameworkNodeAttrs>>(&adapter)) {
|
||||
const auto& attrs = a->get();
|
||||
@@ -496,7 +576,13 @@ public:
|
||||
m_xml_node.append_attribute(name.c_str()).set_value(adapter.get());
|
||||
}
|
||||
void on_adapter(const std::string& name, ov::ValueAccessor<std::string>& adapter) override {
|
||||
m_xml_node.append_attribute(name.c_str()).set_value(adapter.get().c_str());
|
||||
std::string value;
|
||||
if (m_compress_to_fp16 && name == "element_type") {
|
||||
value = ov::as_string(static_cast<ov::element::Type_t>(ov::element::f16));
|
||||
} else {
|
||||
value = adapter.get();
|
||||
}
|
||||
m_xml_node.append_attribute(name.c_str()).set_value(value.c_str());
|
||||
}
|
||||
void on_adapter(const std::string& name, ov::ValueAccessor<int64_t>& adapter) override {
|
||||
m_xml_node.append_attribute(name.c_str()).set_value(static_cast<long long>(adapter.get()));
|
||||
@@ -913,7 +999,12 @@ void ngfunction_2_ir(pugi::xml_node& netXml,
|
||||
|
||||
pugi::xml_node port = input.append_child("port");
|
||||
port.append_attribute("id").set_value(port_id++);
|
||||
port.append_attribute("precision").set_value(get_precision_name(i.get_element_type()).c_str());
|
||||
|
||||
auto rt_info = i.get_tensor().get_rt_info();
|
||||
auto port_element_type =
|
||||
is_fp16_compression_postponed(rt_info) ? ov::element::f16 : i.get_element_type();
|
||||
|
||||
port.append_attribute("precision").set_value(get_precision_name(port_element_type).c_str());
|
||||
for (auto d : i.get_partial_shape()) {
|
||||
pugi::xml_node dim = port.append_child("dim");
|
||||
if (d.is_dynamic()) {
|
||||
@@ -937,7 +1028,12 @@ void ngfunction_2_ir(pugi::xml_node& netXml,
|
||||
for (auto& o : node->outputs()) {
|
||||
pugi::xml_node port = output.append_child("port");
|
||||
port.append_attribute("id").set_value(port_id++);
|
||||
port.append_attribute("precision").set_value(get_precision_name(o.get_element_type()).c_str());
|
||||
|
||||
auto rt_info = o.get_tensor().get_rt_info();
|
||||
auto port_element_type =
|
||||
is_fp16_compression_postponed(rt_info) ? ov::element::f16 : o.get_element_type();
|
||||
|
||||
port.append_attribute("precision").set_value(get_precision_name(port_element_type).c_str());
|
||||
|
||||
// Sort tensor names
|
||||
const auto& tensor_names = o.get_tensor().get_names();
|
||||
@@ -973,6 +1069,12 @@ void ngfunction_2_ir(pugi::xml_node& netXml,
|
||||
|
||||
// fill <data> general attributes
|
||||
{
|
||||
bool compress_to_fp16 = false;
|
||||
ov::element::Type output_element_type = ov::element::dynamic;
|
||||
if (is_fp16_compression_postponed(node->get_rt_info())) {
|
||||
compress_to_fp16 = true;
|
||||
output_element_type = node->get_output_element_type(0);
|
||||
}
|
||||
// Backward compatibility: clear padding values for nodes with auto_pad
|
||||
PaddingsFixer fixed_node(node);
|
||||
XmlSerializer visitor(data,
|
||||
@@ -980,7 +1082,9 @@ void ngfunction_2_ir(pugi::xml_node& netXml,
|
||||
custom_opsets,
|
||||
constant_node_write_handler,
|
||||
version,
|
||||
deterministic);
|
||||
deterministic,
|
||||
compress_to_fp16,
|
||||
output_element_type);
|
||||
OPENVINO_ASSERT(fixed_node.get_node()->visit_attributes(visitor), "Visitor API is not supported in ", node);
|
||||
}
|
||||
rt_info::XmlSerializer{data}.serialize(node->get_rt_info());
|
||||
|
||||
@@ -13,7 +13,7 @@ from openvino.tools.ovc.convert_impl import _convert
|
||||
from openvino.tools.ovc.cli_parser import get_model_name_from_args
|
||||
|
||||
# pylint: disable=no-name-in-module,import-error
|
||||
from openvino.runtime import serialize
|
||||
from openvino.runtime import save_model
|
||||
|
||||
|
||||
def main():
|
||||
@@ -24,12 +24,8 @@ def main():
|
||||
|
||||
model_path = get_model_name_from_args(argv)
|
||||
|
||||
# TODO: replace compress_model + serialize with save_model
|
||||
if argv.compress_to_fp16:
|
||||
from openvino.tools.ovc.moc_frontend.offline_transformations import compress_model
|
||||
compress_model(ngraph_function)
|
||||
|
||||
serialize(ngraph_function, model_path.encode('utf-8'), model_path.replace('.xml', '.bin').encode('utf-8'))
|
||||
compress_to_fp16 = 'compress_to_fp16' in argv and argv.compress_to_fp16
|
||||
save_model(ngraph_function, model_path.encode('utf-8'), compress_to_fp16)
|
||||
|
||||
print('[ SUCCESS ] XML file: {}'.format(model_path))
|
||||
print('[ SUCCESS ] BIN file: {}'.format(model_path.replace('.xml', '.bin')))
|
||||
|
||||
Reference in New Issue
Block a user