[Snippets] [CPU] FMA Fusion (#8066)

This commit is contained in:
Vladislav Golubev 2023-02-15 09:58:37 +01:00 committed by GitHub
parent d73ee9e59b
commit fb3d785c06
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
15 changed files with 374 additions and 17 deletions

View File

@ -23,3 +23,8 @@ ie_faster_build(${TARGET_NAME}
UNITY
PCH PRIVATE "src/precomp.hpp"
)
add_library(snippets_test_utils STATIC ${CMAKE_CURRENT_SOURCE_DIR}/include/lowering_utils.hpp
${CMAKE_CURRENT_SOURCE_DIR}/src/lowering_utils.cpp)
target_include_directories(snippets_test_utils PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/include)
target_link_libraries(snippets_test_utils PRIVATE commonTestUtils snippetsNgraphFunctions)

View File

@ -26,7 +26,7 @@ public:
class DummyTargetMachine : public ngraph::snippets::TargetMachine {
public:
DummyTargetMachine();
DummyTargetMachine(const std::vector<ov::Node::type_info_t>& custom_opset = {});
bool is_supported() const override { return true; }
ngraph::snippets::code get_snippet() const override { return nullptr; }
size_t get_lanes() const override { return 10; }
@ -35,6 +35,7 @@ public:
class DummyGenerator : public ngraph::snippets::Generator {
public:
DummyGenerator() : ngraph::snippets::Generator(std::make_shared<DummyTargetMachine>()) {}
DummyGenerator(const std::shared_ptr<ngraph::snippets::TargetMachine>& t) : ngraph::snippets::Generator(t) {}
};
class LoweringTests : public TransformationTestsF {
@ -47,7 +48,9 @@ public:
protected:
static std::shared_ptr<ngraph::snippets::op::Subgraph> getSubgraph(const std::shared_ptr<Model>& f);
static std::shared_ptr<ngraph::snippets::op::Subgraph> getLoweredSubgraph(const std::shared_ptr<Model>& f,
const ov::PartialShape& master_shape);
const ov::PartialShape& master_shape,
ov::pass::Manager target_optimizations = {},
const std::shared_ptr<ngraph::snippets::Generator> generator = nullptr);
static std::shared_ptr<ngraph::snippets::op::Subgraph> getTokenizedSubgraph(const std::shared_ptr<Model>& f);
ov::PartialShape master_shape{};
};

View File

@ -11,7 +11,7 @@ namespace ov {
namespace test {
namespace snippets {
DummyTargetMachine::DummyTargetMachine() {
DummyTargetMachine::DummyTargetMachine(const std::vector<ov::Node::type_info_t>& custom_opset) {
auto dummy_functor = [](const std::shared_ptr<ngraph::Node>& n) {
return std::make_shared<DummyEmitter>();
};
@ -41,6 +41,10 @@ DummyTargetMachine::DummyTargetMachine() {
jitters[ngraph::snippets::op::Buffer::get_type_info_static()] = dummy_functor;
jitters[ngraph::snippets::op::VectorBuffer::get_type_info_static()] = dummy_functor;
jitters[ngraph::snippets::op::Fill::get_type_info_static()] = dummy_functor;
for (const auto& elem : custom_opset) {
jitters[elem] = dummy_functor;
}
}
LoweringTests::LoweringTests() : TransformationTestsF() {
@ -92,9 +96,11 @@ std::shared_ptr<ngraph::snippets::op::Subgraph> LoweringTests::getSubgraph(const
}
std::shared_ptr<ngraph::snippets::op::Subgraph> LoweringTests::getLoweredSubgraph(const std::shared_ptr<Model> &f,
const ov::PartialShape& master_shape) {
const ov::PartialShape& master_shape,
ov::pass::Manager target_optimizations,
const std::shared_ptr<ngraph::snippets::Generator> generator) {
auto subgraph = getTokenizedSubgraph(f);
subgraph->set_generator(std::make_shared<DummyGenerator>());
subgraph->set_generator(generator == nullptr ? std::make_shared<DummyGenerator>() : generator);
subgraph->set_master_shape(master_shape);
const auto& body = subgraph->body_ptr();
auto& body_rt_info = body->get_rt_info();
@ -103,19 +109,17 @@ std::shared_ptr<ngraph::snippets::op::Subgraph> LoweringTests::getLoweredSubgrap
std::vector<std::vector<size_t>> new_shapes;
for (const auto& p : body->get_parameters()) {
const auto pshape = p->get_output_partial_shape(0);
if (pshape.is_dynamic())
IE_THROW() << "getLoweredSubgraph supports only static shapes";
OPENVINO_ASSERT(pshape.is_static(), "getLoweredSubgraph supports only static shapes");
new_shapes.push_back(pshape.get_shape());
}
for (const auto& r : body->get_results()) {
const auto pshape = r->get_input_partial_shape(0);
if (pshape.is_dynamic())
IE_THROW() << "getLoweredSubgraph supports only static shapes";
OPENVINO_ASSERT(pshape.is_static(), "getLoweredSubgraph supports only static shapes");
new_shapes.push_back(pshape.get_shape());
}
body_rt_info["PluginShapesOverride"] = new_shapes;
subgraph->set_tile_rank(2);
subgraph->generate();
subgraph->generate(target_optimizations);
return subgraph;
}

View File

@ -17,6 +17,7 @@
#include "snippets_transformations/op/load_convert.hpp"
#include "snippets_transformations/op/store_convert.hpp"
#include "snippets_transformations/op/fused_mul_add.hpp"
#include "snippets/op/brgemm.hpp"
#include "ngraph_transformations/op/swish_cpu.hpp"
@ -71,6 +72,7 @@ ov::intel_cpu::CPUTargetMachine::CPUTargetMachine(dnnl::impl::cpu::x64::cpu_isa_
// ternary
jitters[ngraph::opset1::Select::get_type_info_static()] = CREATE_EMITTER(ov::intel_cpu::jit_select_emitter);
jitters[ov::intel_cpu::FusedMulAdd::get_type_info_static()] = CREATE_EMITTER(ov::intel_cpu::jit_mul_add_emitter);
// binary
jitters[ngraph::opset1::Add::get_type_info_static()] = CREATE_EMITTER(ov::intel_cpu::jit_add_emitter);

View File

@ -24,6 +24,7 @@
#include "emitters/cpu_generator.hpp"
#include "utils/cpu_utils.hpp"
#include "snippets_transformations/fuse_load_store_and_convert.hpp"
#include "snippets_transformations/mul_add_to_fma.hpp"
#include "ngraph_transformations/convert_to_swish_cpu.hpp"
using namespace InferenceEngine;
@ -503,6 +504,7 @@ void Snippet::generate(const jit_snippets_compile_args* jcp) {
optManager.register_pass<ov::intel_cpu::pass::FuseLoadConvert>();
optManager.register_pass<ov::intel_cpu::pass::FuseStoreConvert>();
optManager.register_pass<ConvertToSwishCPU>();
optManager.register_pass<ov::intel_cpu::pass::MulAddToFMA>();
// LoadConvert uses Load emitter that support conversion from any type to only f32
optManager.get_pass_config()->set_callback<ov::intel_cpu::pass::FuseLoadConvert>(

View File

@ -0,0 +1,49 @@
// Copyright (C) 2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include <snippets/itt.hpp>
#include "mul_add_to_fma.hpp"
#include "snippets/snippets_isa.hpp"
#include "op/fused_mul_add.hpp"
#include <ngraph/opsets/opset1.hpp>
#include <ngraph/rt_info.hpp>
#include <ngraph/pattern/op/wrap_type.hpp>
using namespace ngraph;
// Matcher pass fusing (a * b) + c into a single FusedMulAdd(a, b, c) node.
// The Multiply must have exactly one consumer (the Add), otherwise the fusion
// would require recomputing the product for the other consumers.
ov::intel_cpu::pass::MulAddToFMA::MulAddToFMA() {
    MATCHER_SCOPE(MulAddToFMA);
    // Pattern: Multiply(any, any) with a single consumer, feeding an Add.
    // Add is commutative, so the matcher also covers c + (a * b).
    auto mul_input_1 = pattern::any_input();
    auto mul_input_2 = pattern::any_input();
    auto mul_m = pattern::wrap_type<opset1::Multiply>({ mul_input_1, mul_input_2 }, pattern::consumers_count(1));
    auto add_input_2 = pattern::any_input();
    auto add_m = pattern::wrap_type<opset1::Add>({ mul_m, add_input_2 });
    ngraph::matcher_pass_callback callback = [=](pattern::Matcher& m) {
        OV_ITT_SCOPED_TASK(ngraph::pass::itt::domains::SnippetsTransform, "Snippets::op::MulAddToFMA_callback")
        const auto& pattern_map = m.get_pattern_value_map();
        const auto multiply = pattern_map.at(mul_m).get_node_shared_ptr();
        const auto add = pattern_map.at(add_m).get_node_shared_ptr();
        // Honor externally registered callbacks (e.g. plugin-level opt-outs).
        if (transformation_callback(add)) {
            return false;
        }
        const auto& a = multiply->input_value(0);
        const auto& b = multiply->input_value(1);
        const auto& c = pattern_map.at(add_input_2);
        const auto fma = std::make_shared<ov::intel_cpu::FusedMulAdd>(a, b, c);
        // Copy runtime info from the nodes being fused (Multiply and Add), not
        // from the producers of their inputs, so attributes attached to the
        // replaced operations are preserved on the new FusedMulAdd node.
        ngraph::copy_runtime_info({ multiply, add }, fma);
        fma->set_friendly_name(add->get_friendly_name());
        ngraph::replace_node(add, fma);
        return true;
    };
    auto m = std::make_shared<ngraph::pattern::Matcher>(add_m, "MulAddToFMA");
    register_matcher(m, callback);
}

View File

@ -0,0 +1,25 @@
// Copyright (C) 2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#pragma once
#include "ngraph/pass/graph_rewrite.hpp"
namespace ov {
namespace intel_cpu {
namespace pass {
/**
 * @interface MulAddToFMA
 * @brief Fuses a Multiply followed by an Add into a single FusedMulAdd node:
 *        (a * b) + c -> FusedMulAdd(a, b, c). The Multiply is fused only when
 *        the Add is its sole consumer.
 * @ingroup snippets
 */
class MulAddToFMA : public ngraph::pass::MatcherPass {
public:
    MulAddToFMA();
};
} // namespace pass
} // namespace intel_cpu
} // namespace ov

View File

@ -0,0 +1,47 @@
// Copyright (C) 2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include "fused_mul_add.hpp"
#include "snippets/itt.hpp"
#include "ngraph/op/util/elementwise_args.hpp"
#include <ngraph/op/multiply.hpp>
#include <ngraph/op/add.hpp>
#include <ngraph/runtime/host_tensor.hpp>
using namespace ov;
using namespace ov::intel_cpu;
// Ternary fused multiply-add op taking the three operands (a, b, c) directly;
// element types and shapes are checked in validate_and_infer_types().
FusedMulAdd::FusedMulAdd(const Output<Node>& a, const Output<Node>& b, const Output<Node>& c) : Op({ a, b, c }) {
    constructor_validate_and_infer_types();
}
bool FusedMulAdd::visit_attributes(AttributeVisitor& visitor) {
    // FusedMulAdd carries no attributes, so there is nothing to visit.
    return true;
}
// Creates a copy of this node re-wired to the given inputs (a, b, c order).
std::shared_ptr<Node> FusedMulAdd::clone_with_new_inputs(const OutputVector& new_args) const {
    INTERNAL_OP_SCOPE(FusedMulAdd);
    check_new_args_count(this, new_args);
    const auto& a = new_args.at(0);
    const auto& b = new_args.at(1);
    const auto& c = new_args.at(2);
    return std::make_shared<FusedMulAdd>(a, b, c);
}
void FusedMulAdd::validate_and_infer_types() {
const auto input_size = get_input_size();
NGRAPH_CHECK(input_size == 3, "FusedMulAdd must have 3 inputs");
NGRAPH_CHECK(get_output_size() == 1, "FusedMulAdd must have only 1 output");
const auto element_type = get_input_element_type(0);
auto pshape = get_input_partial_shape(0);
for (size_t i = 1; i < input_size; ++i) {
NODE_VALIDATION_CHECK(this,
element_type == get_input_element_type(i),
"Argument element types are inconsistent.");
NODE_VALIDATION_CHECK(this,
PartialShape::broadcast_merge_into(pshape, get_input_partial_shape(i), ov::op::AutoBroadcastType::NUMPY),
"Argument shapes are inconsistent.");
}
set_output_type(0, element_type, pshape);
}

View File

@ -0,0 +1,30 @@
// Copyright (C) 2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#pragma once
#include "ngraph/op/op.hpp"
namespace ov {
namespace intel_cpu {
/**
 * @interface FusedMulAdd
 * @brief Fused Multiply-Add operation over three inputs (a, b, c).
 *        Input shapes are broadcast-merged (NUMPY rules) into the output shape;
 *        all inputs must share one element type (see validate_and_infer_types).
 * @ingroup snippets
 */
class FusedMulAdd : public ngraph::op::Op {
public:
    OPENVINO_OP("FusedMulAdd", "SnippetsOpset");

    FusedMulAdd() = default;
    FusedMulAdd(const Output<Node>& a, const Output<Node>& b, const Output<Node>& c);

    // No attributes to serialize; always returns true.
    bool visit_attributes(AttributeVisitor& visitor) override;
    std::shared_ptr<Node> clone_with_new_inputs(const OutputVector& new_args) const override;
    void validate_and_infer_types() override;
};
} // namespace intel_cpu
} // namespace ov

View File

@ -38,6 +38,7 @@ addIeTargetTest(
unitTestUtils
ngraphFunctions
snippetsNgraphFunctions
snippets_test_utils
ADD_CPPLINT
LABELS
CPU

View File

@ -0,0 +1,193 @@
// Copyright (C) 2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include <gtest/gtest.h>
#include <subgraph_simple.hpp>
#include <snippets_transformations/mul_add_to_fma.hpp>
#include <snippets_transformations/op/fused_mul_add.hpp>
#include "snippets/pass/loop_helpers.hpp"
#include "lowering_utils.hpp"
namespace ov {
namespace test {
namespace snippets {
/// Simple Eltwise graph fully convertible to Subgraph.
/// Tokenized simply by attaching eltwises.
//   in1   in2                    in1   in2
//      Multiply   in3    or   in3   Multiply
//          Add                    Add
//         Result                 Result
// Test-helper model builder: the "original" graph is Multiply -> Add; the
// "lowered" reference graph is the expected snippets body after MulAddToFMA
// (Loads + FusedMulAdd + Store inside two nested loops).
class EltwiseWithMulAddFunction : public SnippetsFunctionBase {
public:
    // add_input_idx selects which Add input receives the Multiply result
    // (0 -> first, 1 -> second); scalar_input replaces the third Parameter
    // with a scalar constant of value 2.f.
    explicit EltwiseWithMulAddFunction(const std::vector<PartialShape>& inputShapes,
                                       const size_t add_input_idx = 0,
                                       const bool scalar_input = false)
        : SnippetsFunctionBase(inputShapes),
          add_input_idx(add_input_idx),
          scalar_input(scalar_input) {
        NGRAPH_CHECK(input_shapes.size() == 3, "Got invalid number of input shapes");
        NGRAPH_CHECK(add_input_idx < 2, "Got invalid input idx for add operation");
    }

protected:
    // Builds the unlowered model: Add(Multiply(data0, data1), data2), with the
    // Add operand order controlled by add_input_idx.
    std::shared_ptr<ov::Model> initOriginal() const override {
        auto data0 = std::make_shared<op::v0::Parameter>(precision, input_shapes[0]);
        auto data1 = std::make_shared<op::v0::Parameter>(precision, input_shapes[1]);
        ParameterVector parameters{data0, data1};
        std::shared_ptr<Node> data2;
        if (scalar_input) {
            // Scalar case: third input is a Constant, not a Parameter.
            data2 = op::v0::Constant::create(precision, {}, {2.f});
        } else {
            auto parameter = std::make_shared<op::v0::Parameter>(precision, input_shapes[2]);
            parameters.push_back(parameter);
            data2 = parameter;
        }
        auto mul = std::make_shared<op::v1::Multiply>(data0, data1);
        const auto& fst_input = add_input_idx == 0 ? mul->output(0) : data2->output(0);
        const auto& sec_input = add_input_idx == 0 ? data2->output(0) : mul->output(0);
        auto add = std::make_shared<op::v1::Add>(fst_input, sec_input);
        return std::make_shared<Model>(NodeVector{add}, parameters);
    }

    // Builds the expected lowered body: each Parameter goes through a Load,
    // the scalar (if any) becomes a snippets Scalar op, and the fused result
    // is stored via Store; the whole body is wrapped in inner and outer loops.
    std::shared_ptr<ov::Model> initLowered() const override {
        auto data0 = std::make_shared<op::v0::Parameter>(precision, input_shapes[0]);
        auto data1 = std::make_shared<op::v0::Parameter>(precision, input_shapes[1]);
        ParameterVector parameters{data0, data1};
        std::shared_ptr<Node> data2;
        if (scalar_input) {
            data2 = std::make_shared<ngraph::snippets::op::Scalar>(precision, Shape{}, 2.f);
        } else {
            auto parameter = std::make_shared<op::v0::Parameter>(precision, input_shapes[2]);
            parameters.push_back(parameter);
            data2 = parameter;
        }
        auto load0 = std::make_shared<ngraph::snippets::op::Load>(data0);
        auto load1 = std::make_shared<ngraph::snippets::op::Load>(data1);
        // A Scalar feeds FusedMulAdd directly; a Parameter needs a Load.
        auto load2 = scalar_input ? data2 : std::make_shared<ngraph::snippets::op::Load>(data2);
        // Operand order of FusedMulAdd mirrors the Add operand order chosen by
        // add_input_idx in the original graph.
        auto a = scalar_input || add_input_idx == 0 ? load0 : load1;
        auto b = scalar_input || add_input_idx == 0 ? load1 : load2;
        auto c = scalar_input || add_input_idx == 0 ? load2 : load0;
        auto fma = std::make_shared<ov::intel_cpu::FusedMulAdd>(a, b, c);
        auto store = std::make_shared<ngraph::snippets::op::Store>(fma);
        auto model = std::make_shared<ov::Model>(NodeVector{store}, parameters);
        ResultVector results({model->get_results()[0]});
        const auto& inner_loop_begin = ngraph::snippets::op::insertLoopBegin(parameters);
        // The control dependency is added only when the Scalar is located
        // before LoopBegin in topological order.
        if (scalar_input && add_input_idx == 1) {
            data2->add_control_dependency(inner_loop_begin);
        }
        std::vector<bool> apply_increments(parameters.size() + results.size(), true);
        ngraph::snippets::op::insertLoopEnd(results, inner_loop_begin, 1, 1, apply_increments);
        const auto& outer_loop_begin = ngraph::snippets::op::insertLoopBegin(parameters);
        ngraph::snippets::op::insertLoopEnd(results, outer_loop_begin, 1, 1, apply_increments);
        return model;
    }

    // Checks that the produced model's Parameters match the configured input
    // shapes (the scalar case has one fewer Parameter than shapes).
    void validate_function(const std::shared_ptr<Model> &m) const override {
        NGRAPH_CHECK(m != nullptr, "The test requires Model to be defined");
        const auto &params = m->get_parameters();
        NGRAPH_CHECK(params.size() == (scalar_input ? input_shapes.size() - 1 : input_shapes.size()),
                     "Passed input shapes and produced function are inconsistent.");
        for (size_t i = 0; i < params.size(); i++)
            NGRAPH_CHECK(std::equal(input_shapes[i].begin(), input_shapes[i].end(), params[i]->get_shape().begin()),
                         "Passed input shapes and produced function are inconsistent.");
    }

private:
    // Which Add input the Multiply result is wired to (0 or 1).
    size_t add_input_idx;
    // True when the third input is a scalar constant instead of a Parameter.
    bool scalar_input;
};
typedef std::tuple<
PartialShape, // Input shape 0
PartialShape, // Input shape 1
PartialShape, // Input shape 2
PartialShape, // Master shape
size_t // Add input index
> MulAddToFMAParams;
// Parameterized fixture: lowers EltwiseWithMulAddFunction with the CPU
// MulAddToFMA pass registered as a target optimization and a DummyGenerator
// that knows how to emit FusedMulAdd.
class MulAddToFMATests : public LoweringTests, public testing::WithParamInterface<MulAddToFMAParams> {
public:
    static std::string getTestCaseName(testing::TestParamInfo<MulAddToFMAParams> obj) {
        std::vector<PartialShape> inputShapes(3);
        PartialShape master_shape;
        size_t add_input_idx;
        std::tie(inputShapes[0], inputShapes[1], inputShapes[2], master_shape, add_input_idx) = obj.param;
        std::ostringstream result;
        for (size_t i = 0; i < inputShapes.size(); i++)
            result << "IS[" << i << "]=" << inputShapes[i] << "_";
        result << "MS=" << master_shape << "_";
        result << "add_input_idx=" << add_input_idx;
        return result.str();
    }

protected:
    void SetUp() override {
        LoweringTests::SetUp();
        std::vector<PartialShape> inputShapes(3);
        size_t add_input_idx;
        std::tie(inputShapes[0], inputShapes[1], inputShapes[2], master_shape, add_input_idx) = this->GetParam();
        // A third shape with a single element (e.g. the empty shape {}) selects
        // the scalar-input variant of the test model.
        const bool scalar_input = ov::shape_size(inputShapes[2].to_shape()) == 1;
        snippets_function = std::make_shared<EltwiseWithMulAddFunction>(inputShapes, add_input_idx, scalar_input);
        cpu_manager.register_pass<ov::intel_cpu::pass::MulAddToFMA>();
        // The dummy target machine must be taught the FusedMulAdd type so the
        // generator accepts the fused op during lowering.
        std::vector<ov::Node::type_info_t> custom_opset{ov::intel_cpu::FusedMulAdd::get_type_info_static()};
        auto target_machine = std::make_shared<DummyTargetMachine>(custom_opset);
        generator = std::make_shared<DummyGenerator>(target_machine);
    }

    std::shared_ptr<SnippetsFunctionBase> snippets_function;
    std::shared_ptr<ngraph::snippets::Generator> generator;
    ov::pass::Manager cpu_manager;
};
// Lowers the tokenized subgraph with the CPU-specific pass manager and dummy
// generator, then compares the resulting body against the hand-built lowered
// reference model.
TEST_P(MulAddToFMATests, MulAddToFMATests) {
    auto subgraph = getLoweredSubgraph(snippets_function->getOriginal(), master_shape, cpu_manager, generator);
    model = subgraph->body_ptr();
    model_ref = snippets_function->getLowered();
}
namespace MulAddToFMATestsInstantiation {
std::vector<PartialShape> in_shapes_0 = {{1, 3, 16, 16}};
std::vector<PartialShape> in_shapes_1 = {{1, 3, 16, 16}};
// The empty shape {} exercises the scalar-input variant
// (shape_size == 1 check in MulAddToFMATests::SetUp).
std::vector<PartialShape> in_shapes_2 = {{1, 3, 16, 16}, {}};
// Both Add operand orders are covered: Multiply on input 0 and on input 1.
std::vector<size_t> in_idxes_for_add = {0, 1};

INSTANTIATE_TEST_SUITE_P(smoke_Snippets, MulAddToFMATests,
                         ::testing::Combine(
                                 ::testing::ValuesIn(in_shapes_0),
                                 ::testing::ValuesIn(in_shapes_1),
                                 ::testing::ValuesIn(in_shapes_2),
                                 ::testing::Values(ov::PartialShape{1, 3, 16, 16}),
                                 ::testing::ValuesIn(in_idxes_for_add)),
                         MulAddToFMATests::getTestCaseName);
} // namespace MulAddToFMATestsInstantiation
// Negative case: the Multiply has a second consumer (Relu), so the pattern's
// consumers_count(1) constraint must reject the match and the graph must stay
// unchanged. NOTE(review): model_ref is intentionally left unset — this relies
// on TransformationTestsF comparing against the untouched original in that
// case; confirm against the test framework.
TEST_F(TransformationTestsF, smoke_Snippets_MulAddToFMATestsNegative) {
    auto data0 = std::make_shared<op::v0::Parameter>(ov::element::f32, ov::PartialShape{1, 3, 16, 16});
    auto data1 = std::make_shared<op::v0::Parameter>(ov::element::f32, ov::PartialShape{1, 3, 16, 16});
    auto data2 = std::make_shared<op::v0::Parameter>(ov::element::f32, ov::PartialShape{1, 3, 16, 16});
    auto mul = std::make_shared<op::v1::Multiply>(data0, data1);
    auto additional_consumer = std::make_shared<op::v0::Relu>(mul);
    auto add = std::make_shared<op::v1::Add>(mul, data2);
    model = std::make_shared<Model>(ov::NodeVector{add, additional_consumer}, ov::ParameterVector{data0, data1, data2});
    manager.register_pass<ov::intel_cpu::pass::MulAddToFMA>();
}
} // namespace snippets
} // namespace test
} // namespace ov

View File

@ -18,8 +18,6 @@
#include <transformations/init_node_info.hpp>
#include <openvino/core/model.hpp>
#include "ie_common.h"
#include "test_common.hpp"
#include "graph_comparator.hpp"

View File

@ -12,7 +12,6 @@
#include <ie_data.h>
#include <ie_input_info.hpp>
#include <ie_blob.h>
#include <ie_common.h>
#include <ie_preprocess.hpp>
#include "openvino/util/pp.hpp"

View File

@ -1,4 +1,4 @@
// Copyright (C) 2022 Intel Corporation
// Copyright (C) 2022-2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

View File

@ -1,4 +1,4 @@
// Copyright (C) 2022 Intel Corporation
// Copyright (C) 2022-2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
@ -307,7 +307,6 @@ std::shared_ptr<ov::Model> BroadcastSelectFunction::initOriginal() const {
return std::make_shared<Model>(NodeVector{select}, ParameterVector{data0, data1, data2});
}
} // namespace snippets
} // namespace test
} // namespace ov
} // namespace ov