resolve TODO in efficient compression: return hashing compression back (#21141)

* turn on compression by searching duplicate blobs (by hash) for fp16

* style-fix

* apply comments

* added multimap

* style fix
This commit is contained in:
Pavel Esir 2023-11-25 09:35:35 +01:00 committed by GitHub
parent 8231d57c38
commit 493a338ad2
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 152 additions and 56 deletions

View File

@ -496,7 +496,7 @@ void convert_from_f32_to_f16_with_clamp(const float* arg, float16* out, size_t c
#if defined(OPENVINO_ARCH_X86) || defined(OPENVINO_ARCH_X86_64)
convert_impl<float, float16, true>(arg, out, count);
#else
// FIXME: duplicate and stub for ARM, provide more optimized solution
// FIXME CVS-125496: duplicate and stub for ARM, provide optimized solution
for (size_t i = 0; i < count; ++i) {
if (arg[i] > std::numeric_limits<ov::float16>::max()) {
out[i] = std::numeric_limits<ov::float16>::max();

View File

@ -89,7 +89,7 @@ class ConstantWriter {
public:
using FilePosition = int64_t;
using HashValue = size_t;
using ConstWritePositions = std::unordered_map<HashValue, std::pair<FilePosition, void const*>>;
using ConstWritePositions = std::multimap<HashValue, std::pair<FilePosition, void const*>>;
ConstantWriter(std::ostream& bin_data, bool enable_compression = true)
: m_binary_output(bin_data),
@ -105,56 +105,53 @@ public:
const auto offset = write_pos - m_blob_offset;
*new_size = size;
if (!m_enable_compression || compress_to_fp16) {
write_with_optional_fp16_compression(ptr, size, new_size, compress_to_fp16, src_type);
if (!m_enable_compression) {
if (!compress_to_fp16) {
m_binary_output.write(ptr, size);
} else {
OPENVINO_ASSERT(size % src_type.size() == 0);
auto fp16_buffer = compress_data_to_fp16(ptr, size, src_type, new_size);
m_binary_output.write(fp16_buffer.get(), *new_size);
}
return offset;
} else {
std::unique_ptr<char[]> fp16_buffer = nullptr;
if (compress_to_fp16) {
OPENVINO_ASSERT(size % src_type.size() == 0);
fp16_buffer = compress_data_to_fp16(ptr, size, src_type, new_size);
}
const char* ptr_to_write;
if (fp16_buffer) {
ptr_to_write = fp16_buffer.get();
} else {
ptr_to_write = ptr;
}
// This hash is weak (but efficient). For example, the current hash algorithm gives
// the same hash for the {2, 2} and {0, 128} arrays.
// But even strong hashing algorithms sometimes produce collisions.
// Therefore we always have to compare the values when a match is found in the hash multimap.
const HashValue hash = hash_combine(ptr_to_write, *new_size);
auto found = m_hash_to_file_positions.find(hash);
// iterate over all matches of the key in the multimap
while (found != m_hash_to_file_positions.end()) {
if (memcmp(ptr, found->second.second, size) == 0)
return found->second.first;
found++;
}
// Since fp16_compressed data will be disposed at exit point and since we cannot reread it from the ostream,
// we store pointer to the original uncompressed blob.
m_hash_to_file_positions.insert({hash, {offset, static_cast<void const*>(ptr)}});
m_binary_output.write(ptr_to_write, *new_size);
}
// TODO: Find a way to keep both types of compression (m_enable_compression and compress_to_fp16)
// simultaneously. Disabled usual compression by m_enable_compression for those constants that are requested to
// be compressed by compress_to_fp16 for now. To implement both compression types applied simultaneously
// we need to save element_type for each constant in the cache together with the compression status
// that implies a wider impact and requires a more accurate implementation of cache handling.
// When FP16 compression is turned on together with the usual compression enabled by m_enable_compression, we
// can avoid comparing FP32 weights, but it would require comparing with data from a file, because on-the-fly
// converted FP16 constants are not kept in memory.
// This hash is weak (but efficient) and must be replace with some other
// more stable hash algorithm. For example current hash algorithms gives
// the same hash for {2, 2} and {0, 128} arrays. So we have to compare
// values when finding a match in hash map.
const HashValue hash = hash_combine(ptr, size);
const auto found = m_hash_to_file_positions.find(hash);
if (found != end(m_hash_to_file_positions) &&
memcmp(static_cast<void const*>(ptr), found->second.second, size) == 0) {
return found->second.first;
}
write_with_optional_fp16_compression(ptr, size, new_size, compress_to_fp16, src_type);
m_hash_to_file_positions.insert({hash, {offset, static_cast<void const*>(ptr)}});
return offset;
}
private:
// Writes a constant blob to the binary output stream, optionally converting it to fp16 first.
//   ptr / size        - source bytes and their length
//   new_size          - receives the converted byte count; only written on the fp16 path
//   compress_to_fp16  - when true, the data is converted via compress_data_to_fp16 before writing
//   src_type          - element type of the source data; its size must evenly divide `size`
void write_with_optional_fp16_compression(const char* ptr,
                                          size_t size,
                                          size_t* new_size,
                                          bool compress_to_fp16 = false,
                                          ov::element::Type src_type = ov::element::dynamic) {
    if (compress_to_fp16) {
        OPENVINO_ASSERT(size % src_type.size() == 0);
        // The converted buffer is owned locally and released when this scope ends.
        const auto converted = compress_data_to_fp16(ptr, size, src_type, new_size);
        m_binary_output.write(converted.get(), *new_size);
        return;
    }
    // No conversion requested: the source bytes go to the stream unchanged.
    m_binary_output.write(ptr, size);
}
std::unique_ptr<char[]> compress_data_to_fp16(const char* ptr,
size_t size,
ov::element::Type src_type,
size_t* compressed_size) {
static std::unique_ptr<char[]> compress_data_to_fp16(const char* ptr,
size_t size,
ov::element::Type src_type,
size_t* compressed_size) {
auto num_src_elements = size / src_type.size();
*compressed_size = num_src_elements * ov::element::f16.size();
if (src_type == ov::element::f32) {

View File

@ -10,6 +10,7 @@
#include "common_test_utils/test_common.hpp"
#include "openvino/opsets/opset8.hpp"
#include "openvino/pass/serialize.hpp"
#include "transformations/common_optimizations/compress_float_constants.hpp"
class SerializationConstantCompressionTest : public ov::test::TestsCommon {
protected:
@ -51,7 +52,7 @@ TEST_F(SerializationConstantCompressionTest, IdenticalConstantsI32) {
std::ifstream xml_1(m_out_xml_path_1, std::ios::binary);
std::ifstream bin_1(m_out_bin_path_1, std::ios::binary);
ASSERT_TRUE(file_size(bin_1) == unique_const_count * ov::shape_size(shape) * sizeof(int32_t));
ASSERT_EQ(file_size(bin_1), unique_const_count * ov::shape_size(shape) * sizeof(int32_t));
}
TEST_F(SerializationConstantCompressionTest, IdenticalConstantsI64) {
@ -68,7 +69,24 @@ TEST_F(SerializationConstantCompressionTest, IdenticalConstantsI64) {
std::ifstream xml_1(m_out_xml_path_1, std::ios::binary);
std::ifstream bin_1(m_out_bin_path_1, std::ios::binary);
ASSERT_TRUE(file_size(bin_1) == unique_const_count * ov::shape_size(shape) * sizeof(int64_t));
ASSERT_EQ(file_size(bin_1), unique_const_count * ov::shape_size(shape) * sizeof(int64_t));
}
// Two f32 constants with identical contents are serialized after CompressFloatConstants (postponed mode);
// the .bin file is expected to have the size of a single f16 copy of the data.
TEST_F(SerializationConstantCompressionTest, IdenticalConstantsFP32_COMPRESSED_TO_F16) {
    constexpr int unique_const_count = 1;
    const ov::Shape shape{2, 2, 2};

    // Each call creates a separate Constant node holding the same eight values.
    const auto make_constant = [&shape] {
        return ov::opset8::Constant::create(ov::element::f32, shape, {1, 2, 3, 4, 5, 6, 7, 8});
    };
    const ov::NodeVector nodes{make_constant(), make_constant()};
    const auto model = std::make_shared<ov::Model>(nodes, ov::ParameterVector{});

    ov::pass::CompressFloatConstants compress_pass(/*postponed=*/true);
    compress_pass.run_on_model(model);
    ov::pass::Serialize serialize_pass(m_out_xml_path_1, m_out_bin_path_1);
    serialize_pass.run_on_model(model);

    std::ifstream xml_1(m_out_xml_path_1, std::ios::binary);
    std::ifstream bin_1(m_out_bin_path_1, std::ios::binary);

    ASSERT_EQ(file_size(bin_1), unique_const_count * ov::shape_size(shape) * sizeof(ov::float16));
}
TEST_F(SerializationConstantCompressionTest, IdenticalConstantsFP16) {
@ -85,7 +103,7 @@ TEST_F(SerializationConstantCompressionTest, IdenticalConstantsFP16) {
std::ifstream xml_1(m_out_xml_path_1, std::ios::binary);
std::ifstream bin_1(m_out_bin_path_1, std::ios::binary);
ASSERT_TRUE(file_size(bin_1) == unique_const_count * ov::shape_size(shape) * sizeof(ov::float16));
ASSERT_EQ(file_size(bin_1), unique_const_count * ov::shape_size(shape) * sizeof(ov::float16));
}
TEST_F(SerializationConstantCompressionTest, IdenticalConstantsFP32) {
@ -102,7 +120,7 @@ TEST_F(SerializationConstantCompressionTest, IdenticalConstantsFP32) {
std::ifstream xml_1(m_out_xml_path_1, std::ios::binary);
std::ifstream bin_1(m_out_bin_path_1, std::ios::binary);
ASSERT_TRUE(file_size(bin_1) == unique_const_count * ov::shape_size(shape) * sizeof(float));
ASSERT_EQ(file_size(bin_1), unique_const_count * ov::shape_size(shape) * sizeof(float));
}
TEST_F(SerializationConstantCompressionTest, NonIdenticalConstantsI64) {
@ -120,7 +138,27 @@ TEST_F(SerializationConstantCompressionTest, NonIdenticalConstantsI64) {
std::ifstream xml_1(m_out_xml_path_1, std::ios::binary);
std::ifstream bin_1(m_out_bin_path_1, std::ios::binary);
ASSERT_TRUE(file_size(bin_1) == unique_const_count * ov::shape_size(shape) * sizeof(int64_t));
ASSERT_EQ(file_size(bin_1), unique_const_count * ov::shape_size(shape) * sizeof(int64_t));
}
// Four i64 constants made of two distinct value sets, each occurring twice; the .bin file is expected
// to have the size of exactly two blobs.
TEST_F(SerializationConstantCompressionTest, NonIdenticalConstantsI64_CHECK_MULTIMAP) {
    constexpr int unique_const_count = 2;
    const ov::Shape shape{2};

    // hash_combine returns the same hash for these two constants, so the content of the arrays is checked too
    const auto make_twos = [&shape] {
        return ov::opset8::Constant::create(ov::element::i64, shape, {2, 2});
    };
    const auto make_zero_and_128 = [&shape] {
        return ov::opset8::Constant::create(ov::element::i64, shape, {0, 128});
    };
    const ov::NodeVector nodes{make_twos(), make_zero_and_128(), make_twos(), make_zero_and_128()};
    const auto model = std::make_shared<ov::Model>(nodes, ov::ParameterVector{});

    ov::pass::Serialize serialize_pass(m_out_xml_path_1, m_out_bin_path_1);
    serialize_pass.run_on_model(model);

    std::ifstream xml_1(m_out_xml_path_1, std::ios::binary);
    std::ifstream bin_1(m_out_bin_path_1, std::ios::binary);

    ASSERT_EQ(file_size(bin_1), unique_const_count * ov::shape_size(shape) * sizeof(int64_t));
}
TEST_F(SerializationConstantCompressionTest, IdenticalConstantsTimesTwo) {
@ -139,7 +177,26 @@ TEST_F(SerializationConstantCompressionTest, IdenticalConstantsTimesTwo) {
std::ifstream xml_1(m_out_xml_path_1, std::ios::binary);
std::ifstream bin_1(m_out_bin_path_1, std::ios::binary);
ASSERT_TRUE(file_size(bin_1) == unique_const_count * ov::shape_size(shape) * sizeof(int32_t));
ASSERT_EQ(file_size(bin_1), unique_const_count * ov::shape_size(shape) * sizeof(int32_t));
}
// Two pairs of identical f32 constants (pair after pair) are serialized after CompressFloatConstants
// (postponed mode); the .bin file is expected to have the size of two f16 blobs.
TEST_F(SerializationConstantCompressionTest, IdenticalConstantsTimesTwo_FP32_COMPRESSED_TO_FP16) {
    constexpr int unique_const_count = 2;
    const ov::Shape shape{2, 2, 2};

    // Each factory call creates a new Constant node with the same values as its siblings.
    const auto make_first = [&shape] {
        return ov::opset8::Constant::create(ov::element::f32, shape, {1, 2, 3, 4, 5, 6, 7, 8});
    };
    const auto make_second = [&shape] {
        return ov::opset8::Constant::create(ov::element::f32, shape, {0, 3, 1, 2, 5, 6, 25, 3});
    };
    const ov::NodeVector nodes{make_first(), make_first(), make_second(), make_second()};
    const auto model = std::make_shared<ov::Model>(nodes, ov::ParameterVector{});

    ov::pass::CompressFloatConstants compress_pass(/*postponed=*/true);
    compress_pass.run_on_model(model);
    ov::pass::Serialize serialize_pass(m_out_xml_path_1, m_out_bin_path_1);
    serialize_pass.run_on_model(model);

    std::ifstream xml_1(m_out_xml_path_1, std::ios::binary);
    std::ifstream bin_1(m_out_bin_path_1, std::ios::binary);

    ASSERT_EQ(file_size(bin_1), unique_const_count * ov::shape_size(shape) * sizeof(ov::float16));
}
TEST_F(SerializationConstantCompressionTest, IdenticalConstantsTimesTwoMultipleOccurrences) {
@ -160,7 +217,49 @@ TEST_F(SerializationConstantCompressionTest, IdenticalConstantsTimesTwoMultipleO
std::ifstream xml_1(m_out_xml_path_1, std::ios::binary);
std::ifstream bin_1(m_out_bin_path_1, std::ios::binary);
ASSERT_TRUE(file_size(bin_1) == unique_const_count * ov::shape_size(shape) * sizeof(int32_t));
ASSERT_EQ(file_size(bin_1), unique_const_count * ov::shape_size(shape) * sizeof(int32_t));
}
// Six f32 constants alternating between two value sets are serialized after CompressFloatConstants
// (postponed mode); the .bin file is expected to have the size of two f16 blobs.
TEST_F(SerializationConstantCompressionTest, IdenticalConstantsTimesTwoMultipleOccurrences_FP32_COMPRESSED_TO_FP16) {
    constexpr int unique_const_count = 2;
    const ov::Shape shape{2, 2, 2};

    // Each factory call creates a new Constant node with the same values as its siblings.
    const auto make_first = [&shape] {
        return ov::opset8::Constant::create(ov::element::f32, shape, {1, 2, 3, 4, 5, 6, 7, 8});
    };
    const auto make_second = [&shape] {
        return ov::opset8::Constant::create(ov::element::f32, shape, {0, 3, 1, 2, 5, 6, 25, 3});
    };
    // The two value sets are interleaved: first, second, first, second, first, second.
    const ov::NodeVector nodes{make_first(),
                               make_second(),
                               make_first(),
                               make_second(),
                               make_first(),
                               make_second()};
    const auto model = std::make_shared<ov::Model>(nodes, ov::ParameterVector{});

    ov::pass::CompressFloatConstants compress_pass(/*postponed=*/true);
    compress_pass.run_on_model(model);
    ov::pass::Serialize serialize_pass(m_out_xml_path_1, m_out_bin_path_1);
    serialize_pass.run_on_model(model);

    std::ifstream xml_1(m_out_xml_path_1, std::ios::binary);
    std::ifstream bin_1(m_out_bin_path_1, std::ios::binary);

    ASSERT_EQ(file_size(bin_1), unique_const_count * ov::shape_size(shape) * sizeof(ov::float16));
}
// Six f64 constants alternating between two value sets are serialized after CompressFloatConstants
// (postponed mode); the .bin file is expected to have the size of two f16 blobs.
TEST_F(SerializationConstantCompressionTest, IdenticalConstantsTimesTwoMultipleOccurrences_FP64_COMPRESSED_TO_FP16) {
    constexpr int unique_const_count = 2;
    const ov::Shape shape{2, 2, 2};

    // Each factory call creates a new Constant node with the same values as its siblings.
    const auto make_first = [&shape] {
        return ov::opset8::Constant::create(ov::element::f64, shape, {1, 2, 3, 4, 5, 6, 7, 8});
    };
    const auto make_second = [&shape] {
        return ov::opset8::Constant::create(ov::element::f64, shape, {0, 3, 1, 2, 5, 6, 25, 3});
    };
    // The two value sets are interleaved: first, second, first, second, first, second.
    const ov::NodeVector nodes{make_first(),
                               make_second(),
                               make_first(),
                               make_second(),
                               make_first(),
                               make_second()};
    const auto model = std::make_shared<ov::Model>(nodes, ov::ParameterVector{});

    ov::pass::CompressFloatConstants compress_pass(/*postponed=*/true);
    compress_pass.run_on_model(model);
    ov::pass::Serialize serialize_pass(m_out_xml_path_1, m_out_bin_path_1);
    serialize_pass.run_on_model(model);

    std::ifstream xml_1(m_out_xml_path_1, std::ios::binary);
    std::ifstream bin_1(m_out_bin_path_1, std::ios::binary);

    ASSERT_EQ(file_size(bin_1), unique_const_count * ov::shape_size(shape) * sizeof(ov::float16));
}
TEST_F(SerializationConstantCompressionTest, NonIdenticalConstants) {
@ -177,7 +276,7 @@ TEST_F(SerializationConstantCompressionTest, NonIdenticalConstants) {
std::ifstream xml_1(m_out_xml_path_1, std::ios::binary);
std::ifstream bin_1(m_out_bin_path_1, std::ios::binary);
ASSERT_TRUE(file_size(bin_1) == unique_const_count * ov::shape_size(shape) * sizeof(int32_t));
ASSERT_EQ(file_size(bin_1), unique_const_count * ov::shape_size(shape) * sizeof(int32_t));
}
TEST_F(SerializationConstantCompressionTest, IdenticalConstantsDifferentTypesI32I64) {
@ -194,7 +293,7 @@ TEST_F(SerializationConstantCompressionTest, IdenticalConstantsDifferentTypesI32
std::ifstream xml_1(m_out_xml_path_1, std::ios::binary);
std::ifstream bin_1(m_out_bin_path_1, std::ios::binary);
ASSERT_TRUE(file_size(bin_1) == unique_const_count * ov::shape_size(shape) * sizeof(int32_t));
ASSERT_EQ(file_size(bin_1), unique_const_count * ov::shape_size(shape) * sizeof(int32_t));
}
TEST_F(SerializationConstantCompressionTest, IdenticalConstantsDifferentTypesI32I8) {
@ -211,5 +310,5 @@ TEST_F(SerializationConstantCompressionTest, IdenticalConstantsDifferentTypesI32
std::ifstream xml_1(m_out_xml_path_1, std::ios::binary);
std::ifstream bin_1(m_out_bin_path_1, std::ios::binary);
ASSERT_TRUE(file_size(bin_1) == unique_const_count * ov::shape_size(shape) * sizeof(int32_t));
ASSERT_EQ(file_size(bin_1), unique_const_count * ov::shape_size(shape) * sizeof(int32_t));
}