CompressQuantizeWeights optimizations (#20025)
* Optimize CompressQuantizeWeights transformation - remove CoordinateTransform usage from FakeQuantize reference implementation - move ZeroPointOptimizer functionality inside CompressQuantizeWeights - compute scale and zero point in the same loop Ticket: CVS-119273 * review comments * clang format * fix comments
This commit is contained in:
parent
ed45a92e30
commit
4a3ce48f7a
@ -109,7 +109,6 @@ void regmodule_offline_transformations(py::module m) {
|
||||
[](std::shared_ptr<ov::Model> model) {
|
||||
ov::pass::Manager manager;
|
||||
manager.register_pass<ov::pass::CompressQuantizeWeights>();
|
||||
manager.register_pass<ov::pass::ZeroPointOptimizer>();
|
||||
manager.run_passes(model);
|
||||
},
|
||||
py::arg("model"));
|
||||
|
@ -10,7 +10,6 @@ namespace ov {
|
||||
namespace pass {
|
||||
|
||||
class CompressQuantizeWeights;
|
||||
class ZeroPointOptimizer;
|
||||
|
||||
} // namespace pass
|
||||
} // namespace ov
|
||||
@ -57,36 +56,10 @@ class ZeroPointOptimizer;
|
||||
Transformation prepares quantized constant data for Low Precision pipeline.
|
||||
Such constant data packing reduces IR size (.bin file size) in offline transformations.
|
||||
With that we can skip same calculations in the runtime and make loading of such sub-graphs to the plugin faster.
|
||||
Additionally zero point can be fused to weights if it doesn't affect accuracy.
|
||||
*/
|
||||
class ov::pass::CompressQuantizeWeights : public ov::pass::MatcherPass {
|
||||
public:
|
||||
OPENVINO_RTTI("CompressQuantizeWeights", "0");
|
||||
CompressQuantizeWeights();
|
||||
};
|
||||
|
||||
/*
|
||||
if zero_point == 0 we can eliminate Subtract from following dequantization subgraph:
|
||||
|
||||
+-----------------+
|
||||
| Constant |
|
||||
| (low precision) |
|
||||
+-----------------+
|
||||
|
|
||||
v
|
||||
+------------------+
|
||||
| Convert |
|
||||
| (to high prec) |
|
||||
+------------------+
|
||||
|
|
||||
v
|
||||
+----------+ +------------+
|
||||
|zero point|--->| Subtract |
|
||||
+----------+ +-----+------+
|
||||
|
|
||||
v
|
||||
*/
|
||||
class ov::pass::ZeroPointOptimizer : public ov::pass::MatcherPass {
|
||||
public:
|
||||
OPENVINO_RTTI("ZeroPointOptimizer");
|
||||
ZeroPointOptimizer();
|
||||
};
|
||||
|
File diff suppressed because it is too large
Load Diff
@ -31,6 +31,7 @@ struct CompressQuantizeWeightsParams {
|
||||
std::vector<float> expected_weights;
|
||||
float scale_val;
|
||||
float zero_point_val;
|
||||
bool fuse_zero_point;
|
||||
};
|
||||
|
||||
class CompressQuantizeWeightsTests
|
||||
@ -66,9 +67,14 @@ class CompressQuantizeWeightsTests
|
||||
auto data = opset8::Constant::create(param.expected_type, param.shape, param.expected_weights);
|
||||
auto convert = std::make_shared<opset8::Convert>(data, element::f32);
|
||||
auto scale = opset8::Constant::create(element::f32, Shape{}, {param.scale_val});
|
||||
auto zero_point = opset8::Constant::create(element::f32, Shape{}, {param.zero_point_val});
|
||||
auto sub = std::make_shared<opset8::Subtract>(convert, zero_point);
|
||||
auto mul = std::make_shared<opset8::Multiply>(sub, scale);
|
||||
std::shared_ptr<opset8::Multiply> mul;
|
||||
if (!param.fuse_zero_point) {
|
||||
auto zero_point = opset8::Constant::create(element::f32, Shape{}, {param.zero_point_val});
|
||||
auto sub = std::make_shared<opset8::Subtract>(convert, zero_point);
|
||||
mul = std::make_shared<opset8::Multiply>(sub, scale);
|
||||
} else {
|
||||
mul = std::make_shared<opset8::Multiply>(convert, scale);
|
||||
}
|
||||
model_ref = std::make_shared<Model>(mul, ParameterVector{});
|
||||
}
|
||||
comparator.enable(FunctionsComparator::CmpValues::CONST_VALUES);
|
||||
@ -89,7 +95,8 @@ static std::vector<CompressQuantizeWeightsParams> params = {
|
||||
element::i4,
|
||||
{-1.0f, -1.0f, 0.0f, 0.0f, 0.0f, 1.0f},
|
||||
3.0f,
|
||||
-0.666667f},
|
||||
-0.666667f,
|
||||
false},
|
||||
{Shape{2, 3, 1, 1},
|
||||
{-1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 11.0f},
|
||||
0.0f,
|
||||
@ -100,7 +107,8 @@ static std::vector<CompressQuantizeWeightsParams> params = {
|
||||
element::i4,
|
||||
{-8.0f, -5.0f, -4.0f, -2.0f, 0.0f, 7.0f},
|
||||
0.333333f,
|
||||
-5.0f},
|
||||
-5.0f,
|
||||
false},
|
||||
{Shape{2, 4, 1, 1},
|
||||
{-1.0f, 0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 11.0f},
|
||||
1.0f,
|
||||
@ -109,9 +117,10 @@ static std::vector<CompressQuantizeWeightsParams> params = {
|
||||
6.0f,
|
||||
17,
|
||||
element::i8,
|
||||
{-8.0f, -8.0f, -8.0f, -6.0f, -4.0f, -2.0f, 0.0f, 8.0f},
|
||||
{-4.0f, -4.0f, -4.0f, -2.0f, 0.0f, 2.0f, 4.0f, 12.0f},
|
||||
0.5f,
|
||||
-4.0f},
|
||||
-4.0f,
|
||||
true},
|
||||
{Shape{2, 4, 1, 1},
|
||||
{-1.0f, 0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 11.0f},
|
||||
1.0f,
|
||||
@ -122,7 +131,8 @@ static std::vector<CompressQuantizeWeightsParams> params = {
|
||||
element::i8,
|
||||
{-128.0f, -128.0f, -128.0f, -96.0f, -64.0f, -32.0f, 0.0f, 127.0f},
|
||||
0.0313725f,
|
||||
-64.25f},
|
||||
-64.25f,
|
||||
false},
|
||||
};
|
||||
|
||||
static element::TypeVector data_precisions = {element::f32, element::f16};
|
||||
@ -198,7 +208,7 @@ TEST_F(TransformationTestsF, CompressQuantizeWeightsWithDequantizationSubgraphFP
|
||||
comparator.enable(FunctionsComparator::CmpValues::ACCURACY);
|
||||
}
|
||||
|
||||
TEST_F(TransformationTestsF, CompressQuantizeWeightsWithZeroPointOptimizer) {
|
||||
TEST_F(TransformationTestsF, CompressQuantizeWeightsWithZeroPointEliminated) {
|
||||
{
|
||||
auto data = opset8::Constant::create(element::f32, Shape{3, 1, 1, 1}, {-0.144816, 0.0858578, 0.110928});
|
||||
auto input_low = opset8::Constant::create(element::f32, Shape{3, 1, 1, 1}, {-0.402659, -0.383148, -0.34054});
|
||||
@ -209,7 +219,6 @@ TEST_F(TransformationTestsF, CompressQuantizeWeightsWithZeroPointOptimizer) {
|
||||
model = std::make_shared<Model>(NodeVector{fq}, ParameterVector{});
|
||||
|
||||
manager.register_pass<ov::pass::CompressQuantizeWeights>();
|
||||
manager.register_pass<ov::pass::ZeroPointOptimizer>();
|
||||
}
|
||||
|
||||
{
|
||||
@ -223,7 +232,7 @@ TEST_F(TransformationTestsF, CompressQuantizeWeightsWithZeroPointOptimizer) {
|
||||
comparator.enable(FunctionsComparator::CmpValues::ACCURACY);
|
||||
}
|
||||
|
||||
TEST_F(TransformationTestsF, CompressQuantizeWeightsWithZeroPointOptimizerFP16) {
|
||||
TEST_F(TransformationTestsF, CompressQuantizeWeightsWithZeroPointEliminatedFP16) {
|
||||
{
|
||||
auto data = opset8::Constant::create(element::f16, Shape{3, 1, 1, 1}, {0.2, 1.2, 1.2});
|
||||
auto input_low =
|
||||
@ -239,7 +248,6 @@ TEST_F(TransformationTestsF, CompressQuantizeWeightsWithZeroPointOptimizerFP16)
|
||||
model = std::make_shared<Model>(NodeVector{fq}, ParameterVector{});
|
||||
|
||||
manager.register_pass<ov::pass::CompressQuantizeWeights>();
|
||||
manager.register_pass<ov::pass::ZeroPointOptimizer>();
|
||||
}
|
||||
|
||||
{
|
||||
@ -253,7 +261,7 @@ TEST_F(TransformationTestsF, CompressQuantizeWeightsWithZeroPointOptimizerFP16)
|
||||
comparator.enable(FunctionsComparator::CmpValues::ACCURACY);
|
||||
}
|
||||
|
||||
TEST_F(TransformationTestsF, NegativeCompressQuantizeWeightsWithZeroPointOptimizer) {
|
||||
TEST_F(TransformationTestsF, NegativeCompressQuantizeWeights) {
|
||||
{
|
||||
auto data = opset8::Constant::create(element::f32, Shape{2, 4, 1, 1}, {-1, 0, 1, 2, 3, 4, 5, 11});
|
||||
auto input_low = opset8::Constant::create(element::f32, Shape{}, {1});
|
||||
@ -264,7 +272,6 @@ TEST_F(TransformationTestsF, NegativeCompressQuantizeWeightsWithZeroPointOptimiz
|
||||
model = std::make_shared<Model>(NodeVector{fq}, ParameterVector{});
|
||||
|
||||
manager.register_pass<ov::pass::CompressQuantizeWeights>();
|
||||
manager.register_pass<ov::pass::ZeroPointOptimizer>();
|
||||
}
|
||||
{
|
||||
auto data = opset8::Constant::create(element::i8, Shape{2, 4, 1, 1}, {-128, -128, -128, -96, -64, -32, 0, 127});
|
||||
@ -289,7 +296,6 @@ TEST_F(TransformationTestsF, NegativeCompressQuantizeWeightsNonConstantInput) {
|
||||
model = std::make_shared<Model>(NodeVector{fq}, ParameterVector{data});
|
||||
|
||||
manager.register_pass<ov::pass::CompressQuantizeWeights>();
|
||||
manager.register_pass<ov::pass::ZeroPointOptimizer>();
|
||||
|
||||
comparator.enable(FunctionsComparator::CmpValues::CONST_VALUES);
|
||||
comparator.enable(FunctionsComparator::CmpValues::ACCURACY);
|
||||
|
@ -21,31 +21,86 @@ namespace ov {
|
||||
namespace reference {
|
||||
namespace fake_quantize_details {
|
||||
template <typename T>
|
||||
inline T quantize(const T& arg,
|
||||
const T& in_low,
|
||||
const T& in_high,
|
||||
const T& out_low,
|
||||
const T& out_high,
|
||||
const size_t& levels) {
|
||||
static inline T quantize(const T arg,
|
||||
const T in_low,
|
||||
const T in_high,
|
||||
const T out_low,
|
||||
const T out_high,
|
||||
const T levels_minus_one) {
|
||||
if (arg <= std::min(in_low, in_high)) {
|
||||
return out_low;
|
||||
} else if (arg > std::max(in_low, in_high)) {
|
||||
return out_high;
|
||||
}
|
||||
return static_cast<T>(std::nearbyint((arg - in_low) / (in_high - in_low) * (levels - 1)) / (levels - 1) *
|
||||
return static_cast<T>(std::nearbyint((arg - in_low) / (in_high - in_low) * levels_minus_one) / levels_minus_one *
|
||||
(out_high - out_low) +
|
||||
out_low);
|
||||
}
|
||||
|
||||
static std::vector<size_t> compute_strides(const ov::Shape& out_shape, const ov::Shape& shape);
|
||||
|
||||
static std::tuple<size_t, size_t> get_inner_stride(size_t num_output_elements,
|
||||
const ov::Shape& output_shape,
|
||||
const ov::Shape& shape,
|
||||
size_t current_output_inner_stride);
|
||||
|
||||
template <typename T, typename F>
|
||||
static void fake_quantize_non_unit_inner_stride(const T* arg,
|
||||
const T* in_low,
|
||||
const T* in_high,
|
||||
const T* out_low,
|
||||
const T* out_high,
|
||||
T* out,
|
||||
const Shape& arg_shape,
|
||||
T levels_minus_one,
|
||||
size_t input_inner_stride,
|
||||
const F& get_outer_strides);
|
||||
|
||||
template <typename T, typename F>
|
||||
static void fake_quantize_unit_inner_stride(const T* arg,
|
||||
const T* in_low,
|
||||
const T* in_high,
|
||||
const T* out_low,
|
||||
const T* out_high,
|
||||
T* out,
|
||||
const Shape& arg_shape,
|
||||
T levels_minus_one,
|
||||
size_t input_inner_stride,
|
||||
const F& get_outer_strides);
|
||||
|
||||
template <typename T, typename F>
|
||||
static void fake_quantize_unit_output_intervals_inner_stride(const T* arg,
|
||||
const T* in_low,
|
||||
const T* in_high,
|
||||
const T* out_low,
|
||||
const T* out_high,
|
||||
T* out,
|
||||
const Shape& arg_shape,
|
||||
T levels_minus_one,
|
||||
size_t input_inner_stride,
|
||||
const F& get_outer_strides);
|
||||
|
||||
template <typename T, typename F>
|
||||
static void fake_quantize_unit_input_intervals_inner_stride(const T* arg,
|
||||
const T* in_low,
|
||||
const T* in_high,
|
||||
const T* out_low,
|
||||
const T* out_high,
|
||||
T* out,
|
||||
const Shape& arg_shape,
|
||||
T levels_minus_one,
|
||||
size_t input_inner_stride,
|
||||
const F& get_outer_strides);
|
||||
|
||||
} // namespace fake_quantize_details
|
||||
|
||||
template <typename T>
|
||||
void fake_quantize(const T* const arg,
|
||||
const T* const in_low,
|
||||
const T* const in_high,
|
||||
const T* const out_low,
|
||||
const T* const out_high,
|
||||
T* const out,
|
||||
void fake_quantize(const T* arg,
|
||||
const T* in_low,
|
||||
const T* in_high,
|
||||
const T* out_low,
|
||||
const T* out_high,
|
||||
T* out,
|
||||
const Shape& arg_shape,
|
||||
const Shape& in_low_shape,
|
||||
const Shape& in_high_shape,
|
||||
@ -55,133 +110,452 @@ void fake_quantize(const T* const arg,
|
||||
const op::AutoBroadcastSpec& broadcast) {
|
||||
using namespace fake_quantize_details;
|
||||
|
||||
T levels_minus_one = static_cast<T>(levels - 1);
|
||||
const size_t arg_size = shape_size(arg_shape);
|
||||
|
||||
if (shape_size(in_low_shape) == 1 && shape_size(in_high_shape) == 1 && shape_size(out_low_shape) == 1 &&
|
||||
shape_size(out_high_shape) == 1) {
|
||||
const size_t arg_size = shape_size(arg_shape);
|
||||
const auto q = [=](const T& a) {
|
||||
return quantize(a, *in_low, *in_high, *out_low, *out_high, levels);
|
||||
};
|
||||
for (size_t i = 0; i < arg_size; ++i) {
|
||||
out[i] = q(arg[i]);
|
||||
out[i] = quantize(arg[i], *in_low, *in_high, *out_low, *out_high, levels_minus_one);
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
// clang-format off
|
||||
/*
|
||||
* ---------------------------------------------------
|
||||
* Overview:
|
||||
* Numpy broadcasted input tensors can be partitioned into two: outer and inner part (which also defines inner
|
||||
* stride as a product of inner part), so N-dimensional tensors can be processed using two loops.
|
||||
*
|
||||
* For example with two inputs [2, 2, 3, 4] and [1, 1, 3, 4] we can have:
|
||||
* input 1 with shape [2, 2, 3, 4] can be divided into outer part [2, 2] and inner part [3, 4]
|
||||
* with inner stride = 12 (3 * 4).
|
||||
* input 2 with shape [1, 1, 3, 4] can be divided into outer part [1, 1]
|
||||
* and inner part [3, 4] with inner stride = 12 (3 * 4)
|
||||
*
|
||||
* Having that, those inputs can be processed by the following:
|
||||
*
|
||||
* output_shape = {2, 2, 3, 4};
|
||||
* output_inner_stride = 12;
|
||||
* for (i = 0; i < shape_size(shape); i += output_inner_stride) {
|
||||
* first_input_stride = i;
|
||||
* second_input_stride = 0;
|
||||
* for (j = 0; j < 12; j++) {
|
||||
* *out++ = f(first_input[first_input_stride + j], second_input[second_input_stride + j]);
|
||||
* }
|
||||
* }
|
||||
*
|
||||
* ---------------------------------------------------
|
||||
* How the partitioning is done:
|
||||
* Partitioning process starts with the last dimension of input tensor shape and it stops when either one of below
|
||||
* occurs:
|
||||
* - if the last dimension is equal to 1, partitioning stops at the dimension that is greater than 1 (this
|
||||
* dimension is not included in the inner part),
|
||||
* - if the last dimension is greater than 1, partitioning stops at the dimension that is equal to 1 (this
|
||||
* dimension is not included in the inner part).
|
||||
*
|
||||
* Examples:
|
||||
* tensor_shape=[2, 3, 4, 5], inner_part = [2, 3, 4, 5], inner_stride = 120
|
||||
* tensor_shape=[1, 1, 4, 5], inner_part = [4, 5], inner_stride = 20
|
||||
* tensor_shape=[2, 3, 1, 1], inner_part = [1, 1], inner_stride = 1
|
||||
*
|
||||
*
|
||||
* ---------------------------------------------------
|
||||
* How the output inner stride is calculated:
|
||||
* Inner part (and inner stride) for every input tensor is determined. Then the size of output inner part is the
|
||||
* size of inner part with the fewest number of dimensions.
|
||||
*
|
||||
* Example with 5 inputs:
|
||||
* input 1 shape [2, 3, 4, 5], inner_part = [2, 3, 4, 5], inner_stride = 120
|
||||
* input 2 shape [1, 3, 4, 5], inner_part = [3, 4, 5], inner_stride = 60
|
||||
* input 3 shape [2, 3, 1, 1], inner_part = [1, 1], inner_stride = 1
|
||||
* input 4 shape [2, 1, 1, 1], inner_part = [1, 1, 1], inner_stride = 1
|
||||
* input 5 shape [1, 1, 1, 1], inner_part = [1, 1, 1, 1], inner_stride = 1
|
||||
*
|
||||
* output shape [2, 3, 4, 5], inner_part = [4, 5], inner_stride = 20
|
||||
*
|
||||
* Inner part with fewest number of elements is [1, 1] for input 3. So the inner part for output shape is [4, 5]
|
||||
* and output inner stride is 20.
|
||||
*/
|
||||
// clang-format on
|
||||
|
||||
std::vector<size_t> output_strides = compute_strides(arg_shape, arg_shape);
|
||||
std::vector<size_t> in_low_strides = compute_strides(arg_shape, in_low_shape);
|
||||
std::vector<size_t> in_high_strides = compute_strides(arg_shape, in_high_shape);
|
||||
std::vector<size_t> out_low_strides = compute_strides(arg_shape, out_low_shape);
|
||||
std::vector<size_t> out_high_strides = compute_strides(arg_shape, out_high_shape);
|
||||
|
||||
size_t input_inner_stride = arg_size;
|
||||
size_t in_low_inner_stride = 0;
|
||||
size_t in_high_inner_stride = 0;
|
||||
size_t out_low_inner_stride = 0;
|
||||
size_t out_high_inner_stride = 0;
|
||||
|
||||
std::tie(in_low_inner_stride, input_inner_stride) =
|
||||
get_inner_stride(arg_size, arg_shape, in_low_shape, input_inner_stride);
|
||||
std::tie(in_high_inner_stride, input_inner_stride) =
|
||||
get_inner_stride(arg_size, arg_shape, in_high_shape, input_inner_stride);
|
||||
std::tie(out_low_inner_stride, input_inner_stride) =
|
||||
get_inner_stride(arg_size, arg_shape, out_low_shape, input_inner_stride);
|
||||
std::tie(out_high_inner_stride, input_inner_stride) =
|
||||
get_inner_stride(arg_size, arg_shape, out_high_shape, input_inner_stride);
|
||||
|
||||
auto get_outer_strides =
|
||||
[&output_strides, &in_low_strides, &in_high_strides, &out_low_strides, &out_high_strides](size_t flat_index) {
|
||||
size_t in_low_stride = 0;
|
||||
size_t in_high_stride = 0;
|
||||
size_t out_low_stride = 0;
|
||||
size_t out_high_stride = 0;
|
||||
|
||||
for (size_t i = 0; i < output_strides.size(); i++) {
|
||||
size_t div = flat_index / output_strides[i];
|
||||
flat_index = flat_index % output_strides[i];
|
||||
in_low_stride += div * in_low_strides[i];
|
||||
in_high_stride += div * in_high_strides[i];
|
||||
out_low_stride += div * out_low_strides[i];
|
||||
out_high_stride += div * out_high_strides[i];
|
||||
}
|
||||
|
||||
return std::tuple<size_t, size_t, size_t, size_t>{in_low_stride,
|
||||
in_high_stride,
|
||||
out_low_stride,
|
||||
out_high_stride};
|
||||
};
|
||||
|
||||
if (in_low_inner_stride > 1 && in_high_inner_stride > 1 && out_low_inner_stride > 1 && out_high_inner_stride > 1) {
|
||||
fake_quantize_non_unit_inner_stride(arg,
|
||||
in_low,
|
||||
in_high,
|
||||
out_low,
|
||||
out_high,
|
||||
out,
|
||||
arg_shape,
|
||||
levels_minus_one,
|
||||
input_inner_stride,
|
||||
get_outer_strides);
|
||||
} else if (in_low_inner_stride == 1 && in_high_inner_stride == 1 && out_low_inner_stride == 1 &&
|
||||
out_high_inner_stride == 1) {
|
||||
fake_quantize_unit_inner_stride(arg,
|
||||
in_low,
|
||||
in_high,
|
||||
out_low,
|
||||
out_high,
|
||||
out,
|
||||
arg_shape,
|
||||
levels_minus_one,
|
||||
input_inner_stride,
|
||||
get_outer_strides);
|
||||
|
||||
} else if (in_low_inner_stride > 1 && in_high_inner_stride > 1 && out_low_inner_stride == 1 &&
|
||||
out_high_inner_stride == 1) {
|
||||
fake_quantize_unit_output_intervals_inner_stride(arg,
|
||||
in_low,
|
||||
in_high,
|
||||
out_low,
|
||||
out_high,
|
||||
out,
|
||||
arg_shape,
|
||||
levels_minus_one,
|
||||
input_inner_stride,
|
||||
get_outer_strides);
|
||||
|
||||
} else if (in_low_inner_stride == 1 && in_high_inner_stride == 1 && out_low_inner_stride > 1 &&
|
||||
out_high_inner_stride > 1) {
|
||||
fake_quantize_unit_input_intervals_inner_stride(arg,
|
||||
in_low,
|
||||
in_high,
|
||||
out_low,
|
||||
out_high,
|
||||
out,
|
||||
arg_shape,
|
||||
levels_minus_one,
|
||||
input_inner_stride,
|
||||
get_outer_strides);
|
||||
} else {
|
||||
OPENVINO_ASSERT(in_low_shape.size() <= arg_shape.size() && in_high_shape.size() <= arg_shape.size() &&
|
||||
out_low_shape.size() <= arg_shape.size() && out_high_shape.size() <= arg_shape.size(),
|
||||
"Tensors with input\\output ranges should have rank less or "
|
||||
"equal to data tensor rank equal to ",
|
||||
arg_shape.size());
|
||||
size_t in_low_stride = 0;
|
||||
size_t in_high_stride = 0;
|
||||
size_t out_low_stride = 0;
|
||||
size_t out_high_stride = 0;
|
||||
|
||||
Shape arg0_padded_shape = arg_shape;
|
||||
Shape arg1_padded_shape = in_low_shape;
|
||||
Shape arg2_padded_shape = in_high_shape;
|
||||
Shape arg3_padded_shape = out_low_shape;
|
||||
Shape arg4_padded_shape = out_high_shape;
|
||||
|
||||
size_t max_shape_size = arg_shape.size();
|
||||
|
||||
while (arg0_padded_shape.size() < max_shape_size) {
|
||||
arg0_padded_shape.insert(arg0_padded_shape.begin(), 1);
|
||||
}
|
||||
|
||||
while (arg1_padded_shape.size() < max_shape_size) {
|
||||
arg1_padded_shape.insert(arg1_padded_shape.begin(), 1);
|
||||
}
|
||||
|
||||
while (arg2_padded_shape.size() < max_shape_size) {
|
||||
arg2_padded_shape.insert(arg2_padded_shape.begin(), 1);
|
||||
}
|
||||
|
||||
while (arg3_padded_shape.size() < max_shape_size) {
|
||||
arg3_padded_shape.insert(arg3_padded_shape.begin(), 1);
|
||||
}
|
||||
|
||||
while (arg4_padded_shape.size() < max_shape_size) {
|
||||
arg4_padded_shape.insert(arg4_padded_shape.begin(), 1);
|
||||
}
|
||||
|
||||
Shape arg0_squeezed_shape, arg1_squeezed_shape, arg2_squeezed_shape, arg3_squeezed_shape, arg4_squeezed_shape;
|
||||
AxisSet arg0_squeezed_axes, arg1_squeezed_axes, arg2_squeezed_axes, arg3_squeezed_axes, arg4_squeezed_axes;
|
||||
Shape output_shape;
|
||||
|
||||
for (size_t i = 0; i < max_shape_size; i++) {
|
||||
if (arg1_padded_shape[i] == 1) {
|
||||
arg1_squeezed_axes.insert(i);
|
||||
} else {
|
||||
arg1_squeezed_shape.push_back(arg1_padded_shape[i]);
|
||||
}
|
||||
|
||||
if (arg2_padded_shape[i] == 1) {
|
||||
arg2_squeezed_axes.insert(i);
|
||||
} else {
|
||||
arg2_squeezed_shape.push_back(arg2_padded_shape[i]);
|
||||
}
|
||||
|
||||
if (arg0_padded_shape[i] == 1) {
|
||||
arg0_squeezed_axes.insert(i);
|
||||
} else {
|
||||
arg0_squeezed_shape.push_back(arg0_padded_shape[i]);
|
||||
}
|
||||
|
||||
if (arg3_padded_shape[i] == 1) {
|
||||
arg3_squeezed_axes.insert(i);
|
||||
} else {
|
||||
arg3_squeezed_shape.push_back(arg3_padded_shape[i]);
|
||||
}
|
||||
|
||||
if (arg4_padded_shape[i] == 1) {
|
||||
arg4_squeezed_axes.insert(i);
|
||||
} else {
|
||||
arg4_squeezed_shape.push_back(arg4_padded_shape[i]);
|
||||
}
|
||||
|
||||
output_shape.push_back(std::max({arg0_padded_shape[i],
|
||||
arg2_padded_shape[i],
|
||||
arg1_padded_shape[i],
|
||||
arg3_padded_shape[i],
|
||||
arg4_padded_shape[i]}));
|
||||
}
|
||||
|
||||
CoordinateTransformBasic arg0_transform(arg0_squeezed_shape);
|
||||
CoordinateTransformBasic arg1_transform(arg1_squeezed_shape);
|
||||
CoordinateTransformBasic arg2_transform(arg2_squeezed_shape);
|
||||
CoordinateTransformBasic arg3_transform(arg3_squeezed_shape);
|
||||
CoordinateTransformBasic arg4_transform(arg4_squeezed_shape);
|
||||
CoordinateTransformBasic output_transform(output_shape);
|
||||
|
||||
const auto arg0_strides = row_major_strides(arg0_squeezed_shape);
|
||||
const auto arg1_strides = row_major_strides(arg1_squeezed_shape);
|
||||
const auto arg2_strides = row_major_strides(arg2_squeezed_shape);
|
||||
const auto arg3_strides = row_major_strides(arg3_squeezed_shape);
|
||||
const auto arg4_strides = row_major_strides(arg4_squeezed_shape);
|
||||
const auto output_strides = row_major_strides(output_shape);
|
||||
|
||||
for (const Coordinate& output_coord : output_transform) {
|
||||
const auto arg0_coord = util::reduce(output_coord, arg0_squeezed_axes);
|
||||
const auto arg1_coord = util::reduce(output_coord, arg1_squeezed_axes);
|
||||
const auto arg2_coord = util::reduce(output_coord, arg2_squeezed_axes);
|
||||
const auto arg3_coord = util::reduce(output_coord, arg3_squeezed_axes);
|
||||
const auto arg4_coord = util::reduce(output_coord, arg4_squeezed_axes);
|
||||
|
||||
const size_t arg0_idx =
|
||||
std::inner_product(arg0_coord.begin(), arg0_coord.end(), arg0_strides.begin(), uint64_t(0));
|
||||
const size_t arg1_idx =
|
||||
std::inner_product(arg1_coord.begin(), arg1_coord.end(), arg1_strides.begin(), uint64_t(0));
|
||||
const size_t arg2_idx =
|
||||
std::inner_product(arg2_coord.begin(), arg2_coord.end(), arg2_strides.begin(), uint64_t(0));
|
||||
const size_t arg3_idx =
|
||||
std::inner_product(arg3_coord.begin(), arg3_coord.end(), arg3_strides.begin(), uint64_t(0));
|
||||
const size_t arg4_idx =
|
||||
std::inner_product(arg4_coord.begin(), arg4_coord.end(), arg4_strides.begin(), uint64_t(0));
|
||||
const size_t output_idx =
|
||||
std::inner_product(output_coord.begin(), output_coord.end(), output_strides.begin(), uint64_t(0));
|
||||
out[output_idx] = quantize(arg[arg0_idx],
|
||||
in_low[arg1_idx],
|
||||
in_high[arg2_idx],
|
||||
out_low[arg3_idx],
|
||||
out_high[arg4_idx],
|
||||
levels);
|
||||
for (size_t i = 0; i < arg_size; i++) {
|
||||
std::tie(in_low_stride, in_high_stride, out_low_stride, out_high_stride) = get_outer_strides(i);
|
||||
*out++ = quantize(*arg++,
|
||||
*(in_low + in_low_stride),
|
||||
*(in_high + in_high_stride),
|
||||
*(out_low + out_low_stride),
|
||||
*(out_high + out_low_stride),
|
||||
levels_minus_one);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
namespace fake_quantize_details {
|
||||
std::vector<size_t> compute_strides(const ov::Shape& out_shape, const ov::Shape& shape) {
|
||||
size_t stride = 1;
|
||||
size_t out_rank = out_shape.size();
|
||||
size_t shape_rank = shape.size();
|
||||
std::vector<size_t> strides(out_rank);
|
||||
for (size_t i = 0; i < out_rank; i++) {
|
||||
if (i < shape_rank && shape[shape_rank - i - 1] == out_shape[out_rank - i - 1]) {
|
||||
strides[out_rank - i - 1] = stride;
|
||||
stride *= shape[shape_rank - i - 1];
|
||||
} else {
|
||||
strides[out_rank - i - 1] = 0;
|
||||
}
|
||||
}
|
||||
return strides;
|
||||
}
|
||||
|
||||
std::tuple<size_t, size_t> get_inner_stride(size_t num_output_elements,
|
||||
const ov::Shape& output_shape,
|
||||
const ov::Shape& shape,
|
||||
size_t current_output_inner_stride) {
|
||||
if (shape.size() == 0)
|
||||
return std::tuple<size_t, size_t>{1, std::min(current_output_inner_stride, num_output_elements)};
|
||||
const size_t last = shape.back();
|
||||
auto it = std::find_if(shape.rbegin(), shape.rend(), [last](size_t dim) {
|
||||
return (last == 1 && dim > 1) || (last > 1 && dim == 1);
|
||||
});
|
||||
if (it == shape.rend()) {
|
||||
const size_t num_elements = shape_size(shape);
|
||||
return std::tuple<size_t, size_t>{
|
||||
num_elements,
|
||||
last == 1 ? current_output_inner_stride : std::min(current_output_inner_stride, num_elements)};
|
||||
}
|
||||
const size_t idx = std::distance(it, shape.rbegin()) + static_cast<int64_t>(shape.size());
|
||||
const size_t inner_stride =
|
||||
std::accumulate(shape.begin() + idx, shape.end(), static_cast<size_t>(1), std::multiplies<size_t>());
|
||||
const size_t output_inner_stride = std::accumulate(output_shape.begin() + output_shape.size() - shape.size() + idx,
|
||||
output_shape.end(),
|
||||
static_cast<size_t>(1),
|
||||
std::multiplies<size_t>());
|
||||
return std::tuple<size_t, size_t>{inner_stride, std::min(current_output_inner_stride, output_inner_stride)};
|
||||
}
|
||||
|
||||
template <typename T, typename F>
|
||||
static void transform(const T* first1, const T* const last1, const T* first2, const T* first3, T* out, const F& f) {
|
||||
while (first1 < last1) {
|
||||
*out++ = f(*first1++, *first2++, *first3++);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T, typename F>
|
||||
static void transform(const T* first1,
|
||||
const T* const last1,
|
||||
const T* first2,
|
||||
const T* first3,
|
||||
const T* first4,
|
||||
const T* first5,
|
||||
T* out,
|
||||
const F& f) {
|
||||
while (first1 < last1) {
|
||||
*out++ = f(*first1++, *first2++, *first3++, *first4++, *first5++);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T, typename F1, typename F2>
|
||||
static void fake_quantize_loop(const Shape& arg_shape,
|
||||
const T* arg,
|
||||
const T* in_low,
|
||||
const T* in_high,
|
||||
const T* out_low,
|
||||
const T* out_high,
|
||||
T* out,
|
||||
size_t input_inner_stride,
|
||||
const F1& get_outer_strides,
|
||||
const F2& quantize_loop) {
|
||||
size_t in_low_stride = 0;
|
||||
size_t in_high_stride = 0;
|
||||
size_t out_low_stride = 0;
|
||||
size_t out_high_stride = 0;
|
||||
|
||||
for (size_t i = 0; i < shape_size(arg_shape); i += input_inner_stride) {
|
||||
std::tie(in_low_stride, in_high_stride, out_low_stride, out_high_stride) = get_outer_strides(i);
|
||||
quantize_loop(arg,
|
||||
arg + input_inner_stride,
|
||||
in_low + in_low_stride,
|
||||
in_high + in_high_stride,
|
||||
out_low + out_low_stride,
|
||||
out_high + out_high_stride,
|
||||
out);
|
||||
arg += input_inner_stride;
|
||||
out += input_inner_stride;
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T, typename F>
|
||||
void fake_quantize_non_unit_inner_stride(const T* arg,
|
||||
const T* in_low,
|
||||
const T* in_high,
|
||||
const T* out_low,
|
||||
const T* out_high,
|
||||
T* out,
|
||||
const Shape& arg_shape,
|
||||
T levels_minus_one,
|
||||
size_t input_inner_stride,
|
||||
const F& get_outer_strides) {
|
||||
fake_quantize_loop(arg_shape,
|
||||
arg,
|
||||
in_low,
|
||||
in_high,
|
||||
out_low,
|
||||
out_high,
|
||||
out,
|
||||
input_inner_stride,
|
||||
get_outer_strides,
|
||||
[levels_minus_one](const T* input,
|
||||
const T* const input_end,
|
||||
const T* in_low,
|
||||
const T* in_high,
|
||||
const T* out_low,
|
||||
const T* out_high,
|
||||
T* out) {
|
||||
transform(input,
|
||||
input_end,
|
||||
in_low,
|
||||
in_high,
|
||||
out_low,
|
||||
out_high,
|
||||
out,
|
||||
[levels_minus_one](T input, T in_low, T in_high, T out_low, T out_high) {
|
||||
return quantize(input, in_low, in_high, out_low, out_high, levels_minus_one);
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
template <typename T, typename F>
|
||||
void fake_quantize_unit_inner_stride(const T* arg,
|
||||
const T* in_low,
|
||||
const T* in_high,
|
||||
const T* out_low,
|
||||
const T* out_high,
|
||||
T* out,
|
||||
const Shape& arg_shape,
|
||||
T levels_minus_one,
|
||||
size_t input_inner_stride,
|
||||
const F& get_outer_strides) {
|
||||
auto quantize_with_scalar_intervals = [levels_minus_one](const T* input,
|
||||
const T* const input_end,
|
||||
const T* in_low,
|
||||
const T* in_high,
|
||||
const T* out_low,
|
||||
const T* out_high,
|
||||
T* out) {
|
||||
const auto in_low_scalar = *in_low;
|
||||
const auto in_high_scalar = *in_high;
|
||||
const auto out_low_scalar = *out_low;
|
||||
const auto out_high_scalar = *out_high;
|
||||
std::transform(input,
|
||||
input_end,
|
||||
out,
|
||||
[levels_minus_one, in_low_scalar, in_high_scalar, out_low_scalar, out_high_scalar](T input) {
|
||||
return quantize(input,
|
||||
in_low_scalar,
|
||||
in_high_scalar,
|
||||
out_low_scalar,
|
||||
out_high_scalar,
|
||||
levels_minus_one);
|
||||
});
|
||||
};
|
||||
|
||||
fake_quantize_loop(arg_shape,
|
||||
arg,
|
||||
in_low,
|
||||
in_high,
|
||||
out_low,
|
||||
out_high,
|
||||
out,
|
||||
input_inner_stride,
|
||||
get_outer_strides,
|
||||
quantize_with_scalar_intervals);
|
||||
}
|
||||
|
||||
template <typename T, typename F>
|
||||
void fake_quantize_unit_output_intervals_inner_stride(const T* arg,
|
||||
const T* in_low,
|
||||
const T* in_high,
|
||||
const T* out_low,
|
||||
const T* out_high,
|
||||
T* out,
|
||||
const Shape& arg_shape,
|
||||
T levels_minus_one,
|
||||
size_t input_inner_stride,
|
||||
const F& get_outer_strides) {
|
||||
auto quantize_with_scalar_output_intervals = [levels_minus_one](const T* input,
|
||||
const T* const input_end,
|
||||
const T* in_low,
|
||||
const T* in_high,
|
||||
const T* out_low,
|
||||
const T* out_high,
|
||||
T* out) {
|
||||
const auto out_low_scalar = *out_low;
|
||||
const auto out_high_scalar = *out_high;
|
||||
transform(input,
|
||||
input_end,
|
||||
in_low,
|
||||
in_high,
|
||||
out,
|
||||
[levels_minus_one, out_low_scalar, out_high_scalar](T input, T in_low, T in_high) {
|
||||
return quantize(input, in_low, in_high, out_low_scalar, out_high_scalar, levels_minus_one);
|
||||
});
|
||||
};
|
||||
|
||||
fake_quantize_loop(arg_shape,
|
||||
arg,
|
||||
in_low,
|
||||
in_high,
|
||||
out_low,
|
||||
out_high,
|
||||
out,
|
||||
input_inner_stride,
|
||||
get_outer_strides,
|
||||
quantize_with_scalar_output_intervals);
|
||||
}
|
||||
|
||||
template <typename T, typename F>
|
||||
void fake_quantize_unit_input_intervals_inner_stride(const T* arg,
|
||||
const T* in_low,
|
||||
const T* in_high,
|
||||
const T* out_low,
|
||||
const T* out_high,
|
||||
T* out,
|
||||
const Shape& arg_shape,
|
||||
T levels_minus_one,
|
||||
size_t input_inner_stride,
|
||||
const F& get_outer_strides) {
|
||||
auto quantize_with_scalar_input_intervals = [levels_minus_one](const T* input,
|
||||
const T* const input_end,
|
||||
const T* in_low,
|
||||
const T* in_high,
|
||||
const T* out_low,
|
||||
const T* out_high,
|
||||
T* out) {
|
||||
const auto in_low_scalar = *in_low;
|
||||
const auto in_high_scalar = *in_high;
|
||||
transform(input,
|
||||
input_end,
|
||||
out_low,
|
||||
out_high,
|
||||
out,
|
||||
[levels_minus_one, in_low_scalar, in_high_scalar](T input, T out_low, T out_high) {
|
||||
return quantize(input, in_low_scalar, in_high_scalar, out_low, out_high, levels_minus_one);
|
||||
});
|
||||
};
|
||||
|
||||
fake_quantize_loop(arg_shape,
|
||||
arg,
|
||||
in_low,
|
||||
in_high,
|
||||
out_low,
|
||||
out_high,
|
||||
out,
|
||||
input_inner_stride,
|
||||
get_outer_strides,
|
||||
quantize_with_scalar_input_intervals);
|
||||
}
|
||||
|
||||
} // namespace fake_quantize_details
|
||||
|
||||
} // namespace reference
|
||||
} // namespace ov
|
||||
|
@ -253,6 +253,22 @@ std::vector<FakeQuantizeParams> generateParamsForFakeQuantize() {
|
||||
}),
|
||||
16,
|
||||
op::AutoBroadcastSpec(op::AutoBroadcastType::NUMPY)),
|
||||
FakeQuantizeParams(
|
||||
ov::Shape{1, 2, 4, 4},
|
||||
ov::Shape{1, 2, 4, 4},
|
||||
IN_ET,
|
||||
IN_ET,
|
||||
iota_vector<T>(shape_size(Shape{1, 2, 4, 4})),
|
||||
std::vector<T>{
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 8.75, 8.75, 8.75, 8.75, 8.75, 8.75, 17.5,
|
||||
23.75, 23.75, 27.5, 27.5, 27.5, 27.5, 27.5, 31.25, 31.25, 31.25, 31.25, 31.25, 35, 35, 35, 35,
|
||||
},
|
||||
op::v0::Constant::create(IN_ET, Shape{1, 2, 1, 1}, {5.f, 10.f}),
|
||||
op::v0::Constant::create(IN_ET, Shape{1, 1}, {30.f}),
|
||||
op::v0::Constant::create(IN_ET, Shape{2, 1, 1}, {0.f, 20.f}),
|
||||
op::v0::Constant::create(IN_ET, Shape{1}, {35.f}),
|
||||
5),
|
||||
|
||||
};
|
||||
return params;
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user